X-Git-Url: https://osm.etsi.org/gitweb/?a=blobdiff_plain;f=osm_mon%2Fevaluator%2Fevaluator.py;h=6ca0dc576b01ac8f82e744958aac90be869a07c6;hb=8d12a63175dcfbb7d5468f55c93a05797a912ddd;hp=24e8e43f0adbda6793cdbae672ee2cfd684800fe;hpb=649e6b264fb9e92103267dabd16ed9a3dbadb302;p=osm%2FMON.git

diff --git a/osm_mon/evaluator/evaluator.py b/osm_mon/evaluator/evaluator.py
index 24e8e43..6ca0dc5 100644
--- a/osm_mon/evaluator/evaluator.py
+++ b/osm_mon/evaluator/evaluator.py
@@ -22,30 +22,18 @@
 ##
 import asyncio
 import logging
-import multiprocessing
 import time
-from enum import Enum
+import concurrent.futures
 
-import peewee
-import requests
-from osm_common.dbbase import DbException
-
-from osm_mon.collector.backends.prometheus import OSM_METRIC_PREFIX
-from osm_mon.core.common_db import CommonDbClient
 from osm_mon.core.config import Config
-from osm_mon.core.database import DatabaseManager, Alarm
 from osm_mon.core.message_bus_client import MessageBusClient
+from osm_mon.core.models import Alarm
 from osm_mon.core.response import ResponseBuilder
+from osm_mon.evaluator.service import EvaluatorService, AlarmStatus
 
 log = logging.getLogger(__name__)
 
 
-class AlarmStatus(Enum):
-    ALARM = 'alarm'
-    OK = 'ok'
-    INSUFFICIENT = 'insufficient-data'
-
-
 class Evaluator:
 
     def __init__(self, config: Config, loop=None):
@@ -53,148 +41,91 @@ class Evaluator:
         if not loop:
             loop = asyncio.get_event_loop()
         self.loop = loop
-        self.common_db = CommonDbClient(self.conf)
-        self.plugins = []
-        self.database_manager = DatabaseManager(self.conf)
-        self.database_manager.create_tables()
-        self.queue = multiprocessing.Queue()
+        self.service = EvaluatorService(config)
         self.msg_bus = MessageBusClient(config)
 
-    def _evaluate_metric(self,
-                         nsr_id: str,
-                         vnf_member_index: int,
-                         vdur_name: str,
-                         metric_name: str,
-                         alarm: Alarm):
-        log.debug("_evaluate_metric")
-        # TODO: Refactor to fit backend plugin model
-        query_section = "query={0}{{ns_id=\"{1}\",vdu_name=\"{2}\",vnf_member_index=\"{3}\"}}".format(
-            OSM_METRIC_PREFIX + metric_name, nsr_id, vdur_name, vnf_member_index)
-        request_url = self.conf.get('prometheus', 'url') + "/api/v1/query?" + query_section
-        log.info("Querying Prometheus: %s", request_url)
-        r = requests.get(request_url, timeout=int(self.conf.get('global', 'request_timeout')))
-        if r.status_code == 200:
-            json_response = r.json()
-            if json_response['status'] == 'success':
-                result = json_response['data']['result']
-                if result:
-                    metric_value = float(result[0]['value'][1])
-                    log.info("Metric value: %s", metric_value)
-                    if alarm.operation.upper() == 'GT':
-                        if metric_value > alarm.threshold:
-                            self.queue.put((alarm, AlarmStatus.ALARM))
-                        else:
-                            self.queue.put((alarm, AlarmStatus.OK))
-                    elif alarm.operation.upper() == 'LT':
-                        if metric_value < alarm.threshold:
-                            self.queue.put((alarm, AlarmStatus.ALARM))
-                        else:
-                            self.queue.put((alarm, AlarmStatus.OK))
-                else:
-                    log.warning("No metric result for alarm %s", alarm.id)
-                    self.queue.put((alarm, AlarmStatus.INSUFFICIENT))
-
-            else:
-                log.warning("Prometheus response is not success. Got status %s", json_response['status'])
-        else:
-            log.warning("Error contacting Prometheus. Got status code %s: %s", r.status_code, r.text)
-
     def evaluate_forever(self):
         log.debug('evaluate_forever')
         while True:
             try:
                 self.evaluate()
                 time.sleep(int(self.conf.get('evaluator', 'interval')))
-            except peewee.PeeweeException:
-                log.exception("Database error evaluating alarms: ")
-                raise
             except Exception:
                 log.exception("Error evaluating alarms")
 
     def evaluate(self):
         log.debug('evaluate')
-        processes = []
-        for alarm in Alarm.select():
-            try:
-                vnfr = self.common_db.get_vnfr(alarm.nsr_id, alarm.vnf_member_index)
-            except DbException:
-                log.exception("Error getting vnfr: ")
-                continue
-            vnfd = self.common_db.get_vnfd(vnfr['vnfd-id'])
+        alarms_tuples = self.service.evaluate_alarms()
+        # Starting evaluate executor pool with pool size process_pool_size. Default process_pool_size is 20
+        with concurrent.futures.ProcessPoolExecutor(self.conf.get('evaluator', 'process_pool_size')) as executor:
+            log.info('Started evaluate process pool with pool size %s' % (self.conf.get('evaluator',
+                                                                                        'process_pool_size')))
+            evaluate_futures = []
+            for alarm, status in alarms_tuples:
+                evaluate_futures.append(executor.submit(Evaluator._notify_alarm, self.conf, alarm, status))
+
             try:
-                vdur = next(filter(lambda vdur: vdur['name'] == alarm.vdur_name, vnfr['vdur']))
-            except StopIteration:
-                log.warning("No vdur found with name %s for alarm %s", alarm.vdur_name, alarm.id)
-                continue
-            vdu = next(filter(lambda vdu: vdu['id'] == vdur['vdu-id-ref'], vnfd['vdu']))
-            vnf_monitoring_param = next(
-                filter(lambda param: param['id'] == alarm.monitoring_param, vnfd['monitoring-param']))
-            nsr_id = vnfr['nsr-id-ref']
-            vnf_member_index = vnfr['member-vnf-index-ref']
-            vdur_name = vdur['name']
-            if 'vdu-monitoring-param' in vnf_monitoring_param:
-                vdu_monitoring_param = next(filter(
-                    lambda param: param['id'] == vnf_monitoring_param['vdu-monitoring-param'][
-                        'vdu-monitoring-param-ref'], vdu['monitoring-param']))
-                nfvi_metric = vdu_monitoring_param['nfvi-metric']
-
-                p = multiprocessing.Process(target=self._evaluate_metric,
-                                            args=(nsr_id,
-                                                  vnf_member_index,
-                                                  vdur_name,
-                                                  nfvi_metric,
-                                                  alarm))
-                processes.append(p)
-                p.start()
-            if 'vdu-metric' in vnf_monitoring_param:
-                vnf_metric_name = vnf_monitoring_param['vdu-metric']['vdu-metric-name-ref']
-                p = multiprocessing.Process(target=self._evaluate_metric,
-                                            args=(nsr_id,
-                                                  vnf_member_index,
-                                                  vdur_name,
-                                                  vnf_metric_name,
-                                                  alarm))
-                processes.append(p)
-                p.start()
-            if 'vnf-metric' in vnf_monitoring_param:
-                vnf_metric_name = vnf_monitoring_param['vnf-metric']['vnf-metric-name-ref']
-                p = multiprocessing.Process(target=self._evaluate_metric,
-                                            args=(nsr_id,
-                                                  vnf_member_index,
-                                                  '',
-                                                  vnf_metric_name,
-                                                  alarm))
-                processes.append(p)
-                p.start()
-
-        for process in processes:
-            process.join(timeout=10)
-        alarms_tuples = []
-        while not self.queue.empty():
-            alarms_tuples.append(self.queue.get())
-        for alarm, status in alarms_tuples:
-            p = multiprocessing.Process(target=self.notify_alarm,
-                                        args=(alarm, status))
-            p.start()
-
-    def notify_alarm(self, alarm: Alarm, status: AlarmStatus):
-        log.debug("notify_alarm")
-        resp_message = self._build_alarm_response(alarm, status)
+                # Wait for future calls to complete till process_timeout. Default is 50 seconds
+                for evaluate_future in concurrent.futures.as_completed(evaluate_futures,
+                                                                       self.conf.get('evaluator', 'process_timeout')):
+                    result = evaluate_future.result(timeout=int(self.conf.get('evaluator',
+                                                                              'process_timeout')))
+                    log.debug('result = %s' % (result))
+            except concurrent.futures.TimeoutError as e:
+                # Some processes have not completed due to timeout error
+                log.info('Some processes have not finished due to TimeoutError exception')
+                log.debug('concurrent.futures.TimeoutError exception %s' % (e))
+
+            # Shutting down process pool executor
+            Evaluator._stop_process_pool(executor)
+
+    @staticmethod
+    def _stop_process_pool(executor):
+        log.debug("_stop_process_pool")
+        log.info('Shutting down process pool')
+        try:
+            log.debug('Stopping residual processes in the process pool')
+            for pid, process in executor._processes.items():
+                if process.is_alive():
+                    process.terminate()
+        except Exception as e:
+            log.info("Exception during process termination")
+            log.debug("Exception %s" % (e))
+
+        try:
+            # Shutting down executor
+            log.debug('Shutting down process pool executor')
+            executor.shutdown()
+        except RuntimeError as e:
+            log.info('RuntimeError in shutting down executer')
+            log.debug('RuntimeError %s' % (e))
+        return
+
+    @staticmethod
+    def _notify_alarm(conf: Config, alarm: Alarm, status: AlarmStatus):
+        log.debug("_notify_alarm")
+        resp_message = Evaluator._build_alarm_response(alarm, status)
+        msg_bus = MessageBusClient(conf)
+        loop = asyncio.get_event_loop()
         log.info("Sent alarm notification: %s", resp_message)
-        self.loop.run_until_complete(self.msg_bus.aiowrite('alarm_response', 'notify_alarm', resp_message))
+        loop.run_until_complete(msg_bus.aiowrite('alarm_response', 'notify_alarm', resp_message))
+        return
 
-    def _build_alarm_response(self, alarm: Alarm, status: AlarmStatus):
+    @staticmethod
+    def _build_alarm_response(alarm: Alarm, status: AlarmStatus):
+        log.debug("_build_alarm_response")
         response = ResponseBuilder()
+        tags = {}
+        for name, value in alarm.tags.items():
+            tags[name] = value
         now = time.strftime("%d-%m-%Y") + " " + time.strftime("%X")
         return response.generate_response(
             'notify_alarm',
             alarm_id=alarm.uuid,
-            vdu_name=alarm.vdur_name,
-            vnf_member_index=alarm.vnf_member_index,
-            ns_id=alarm.nsr_id,
-            metric_name=alarm.monitoring_param,
+            metric_name=alarm.metric,
             operation=alarm.operation,
             threshold_value=alarm.threshold,
             sev=alarm.severity,
             status=status.value,
-            date=now)
+            date=now,
+            tags=tags)
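
Editor's note: the diff above replaces the per-alarm multiprocessing.Process fan-out with a concurrent.futures.ProcessPoolExecutor: evaluate() now takes (alarm, status) tuples from EvaluatorService.evaluate_alarms(), submits _notify_alarm() for each tuple, waits on the futures with as_completed() under a timeout, and finally terminates any worker that is still alive. The snippet below is a minimal, self-contained sketch of that executor pattern only, not code from the commit; check_alarm(), the hard-coded pool size and timeout, and the sample alarm tuples are illustrative stand-ins for the module's real service calls and configuration values.

# Illustrative sketch of the executor pattern adopted above (assumptions noted inline).
import concurrent.futures
import logging
import time

log = logging.getLogger(__name__)

POOL_SIZE = 20      # stands in for the 'process_pool_size' setting (default 20 per the diff comment)
POOL_TIMEOUT = 50   # stands in for the 'process_timeout' setting (default 50 seconds per the diff comment)


def check_alarm(alarm_id, status):
    # Hypothetical worker; the real code submits Evaluator._notify_alarm(conf, alarm, status).
    time.sleep(1)  # simulate the notification round-trip
    return '%s -> %s' % (alarm_id, status)


def stop_process_pool(executor):
    # Terminate workers that are still alive, then shut the executor down,
    # relying on the same private _processes mapping the commit uses.
    for process in executor._processes.values():
        if process.is_alive():
            process.terminate()
    executor.shutdown()


def evaluate(alarm_tuples):
    with concurrent.futures.ProcessPoolExecutor(POOL_SIZE) as executor:
        futures = [executor.submit(check_alarm, alarm_id, status)
                   for alarm_id, status in alarm_tuples]
        try:
            # as_completed() raises TimeoutError if any future is still pending
            # once POOL_TIMEOUT seconds have elapsed.
            for future in concurrent.futures.as_completed(futures, timeout=POOL_TIMEOUT):
                log.debug('result = %s', future.result())
        except concurrent.futures.TimeoutError:
            log.info('Some notifications did not finish within %s seconds', POOL_TIMEOUT)
        stop_process_pool(executor)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    evaluate([('alarm-1', 'alarm'), ('alarm-2', 'ok')])

The explicit terminate() pass matters because ProcessPoolExecutor.shutdown() (and the with-block exit) waits for submitted work, so a notification stuck past the timeout would otherwise block evaluate() indefinitely; that appears to be why the commit stops residual processes before shutting the pool down.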