osm_mon/plugins/OpenStack/Aodh/alarming.py

   1 # Copyright 2017 Intel Research and Development Ireland Limited
   2 # *************************************************************
   3
   4 # This file is part of OSM Monitoring module
   5 # All Rights Reserved to Intel Corporation
   6
   7 # Licensed under the Apache License, Version 2.0 (the "License"); you may
   8 # not use this file except in compliance with the License. You may obtain
   9 # a copy of the License at
  10
  11 #         http://www.apache.org/licenses/LICENSE-2.0
  12
  13 # Unless required by applicable law or agreed to in writing, software
  14 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16 # License for the specific language governing permissions and limitations
  17 # under the License.
  18
  19 # For those usages not covered by the Apache License, Version 2.0 please
  20 # contact: helena.mcgough@intel.com or adrian.hoban@intel.com
  21 ##
  22 """Carry out alarming requests via Aodh API."""
  23
  24 import json
  25 import logging
  26
  27 import six
  28 import yaml
  29
  30 from osm_mon.core.database import DatabaseManager
  31 from osm_mon.core.message_bus.producer import KafkaProducer
  32 from osm_mon.plugins.OpenStack.common import Common
  33 from osm_mon.plugins.OpenStack.response import OpenStack_Response
  34 from osm_mon.plugins.OpenStack.settings import Config
  35
  36 log = logging.getLogger(__name__)
  37
  38 ALARM_NAMES = {
  39     "average_memory_usage_above_threshold": "average_memory_utilization",
  40     "disk_read_ops": "disk_read_ops",
  41     "disk_write_ops": "disk_write_ops",
  42     "disk_read_bytes": "disk_read_bytes",
  43     "disk_write_bytes": "disk_write_bytes",
  44     "net_packets_dropped": "packets_dropped",
  45     "packets_in_above_threshold": "packets_received",
  46     "packets_out_above_threshold": "packets_sent",
  47     "cpu_utilization_above_threshold": "cpu_utilization"}
  48
  49 METRIC_MAPPINGS = {
  50     "average_memory_utilization": "memory.percent",
  51     "disk_read_ops": "disk.read.requests",
  52     "disk_write_ops": "disk.write.requests",
  53     "disk_read_bytes": "disk.read.bytes",
  54     "disk_write_bytes": "disk.write.bytes",
  55     "packets_dropped": "interface.if_dropped",
  56     "packets_received": "interface.if_packets",
  57     "packets_sent": "interface.if_packets",
  58     "cpu_utilization": "cpu_util",
  59 }
  60
  61 SEVERITIES = {
  62     "warning": "low",
  63     "minor": "low",
  64     "major": "moderate",
  65     "critical": "critical",
  66     "indeterminate": "critical"}
  67
  68 STATISTICS = {
  69     "average": "mean",
  70     "minimum": "min",
  71     "maximum": "max",
  72     "count": "count",
  73     "sum": "sum"}
  74
  75
  76 class Alarming(object):
  77     """Carries out alarming requests and responses via Aodh API."""
  78
  79     def __init__(self):
  80         """Create the OpenStack alarming instance."""
  81         # Initialize configuration and notifications
  82         config = Config.instance()
  83         config.read_environ()
  84
  85         self._database_manager = DatabaseManager()
  86
  87         # Use the Response class to generate valid json response messages
  88         self._response = OpenStack_Response()
  89
  90         # Initializer a producer to send responses back to SO
  91         self._producer = KafkaProducer("alarm_response")
  92
  93     def alarming(self, message):
  94         """Consume info from the message bus to manage alarms."""
  95         try:
  96             values = json.loads(message.value)
  97         except ValueError:
  98             values = yaml.safe_load(message.value)
  99
 100         log.info("OpenStack alarm action required.")
 101         vim_uuid = values['vim_uuid']
 102
 103         auth_token = Common.get_auth_token(vim_uuid)
 104
 105         alarm_endpoint = Common.get_endpoint("alarming", vim_uuid)
 106         metric_endpoint = Common.get_endpoint("metric", vim_uuid)
 107
 108         if message.key == "create_alarm_request":
 109             # Configure/Update an alarm
 110             alarm_details = values['alarm_create_request']
 111
 112             alarm_id, alarm_status = self.configure_alarm(
 113                 alarm_endpoint, metric_endpoint, auth_token, alarm_details)
 114
 115             # Generate a valid response message, send via producer
 116             try:
 117                 if alarm_status is True:
 118                     log.info("Alarm successfully created")
 119                     self._database_manager.save_alarm(alarm_id, vim_uuid)
 120
 121                 resp_message = self._response.generate_response(
 122                     'create_alarm_response', status=alarm_status,
 123                     alarm_id=alarm_id,
 124                     cor_id=alarm_details['correlation_id'])
 125                 log.info("Response Message: %s", resp_message)
 126                 self._producer.create_alarm_response(
 127                     'create_alarm_response', resp_message,
 128                     'alarm_response')
 129             except Exception as exc:
 130                 log.exception("Response creation failed:")
 131
 132         elif message.key == "list_alarm_request":
 133             # Check for a specified: alarm_name, resource_uuid, severity
 134             # and generate the appropriate list
 135             list_details = values['alarm_list_request']
 136
 137             alarm_list = self.list_alarms(
 138                 alarm_endpoint, auth_token, list_details)
 139
 140             try:
 141                 # Generate and send a list response back
 142                 resp_message = self._response.generate_response(
 143                     'list_alarm_response', alarm_list=alarm_list,
 144                     cor_id=list_details['correlation_id'])
 145                 log.info("Response Message: %s", resp_message)
 146                 self._producer.list_alarm_response(
 147                     'list_alarm_response', resp_message,
 148                     'alarm_response')
 149             except Exception as exc:
 150                 log.exception("Failed to send a valid response back.")
 151
 152         elif message.key == "delete_alarm_request":
 153             request_details = values['alarm_delete_request']
 154             alarm_id = request_details['alarm_uuid']
 155
 156             resp_status = self.delete_alarm(
 157                 alarm_endpoint, auth_token, alarm_id)
 158
 159             # Generate and send a response message
 160             try:
 161                 resp_message = self._response.generate_response(
 162                     'delete_alarm_response', alarm_id=alarm_id,
 163                     status=resp_status,
 164                     cor_id=request_details['correlation_id'])
 165                 log.info("Response message: %s", resp_message)
 166                 self._producer.delete_alarm_response(
 167                     'delete_alarm_response', resp_message,
 168                     'alarm_response')
 169             except Exception as exc:
 170                 log.warn("Failed to create delete response:%s", exc)
 171
 172         elif message.key == "acknowledge_alarm":
 173             # Acknowledge that an alarm has been dealt with by the SO
 174             alarm_id = values['ack_details']['alarm_uuid']
 175
 176             response = self.update_alarm_state(
 177                 alarm_endpoint, auth_token, alarm_id)
 178
 179             # Log if an alarm was reset
 180             if response is True:
 181                 log.info("Acknowledged the alarm and cleared it.")
 182             else:
 183                 log.warn("Failed to acknowledge/clear the alarm.")
 184
 185         elif message.key == "update_alarm_request":
 186             # Update alarm configurations
 187             alarm_details = values['alarm_update_request']
 188
 189             alarm_id, status = self.update_alarm(
 190                 alarm_endpoint, auth_token, alarm_details)
 191
 192             # Generate a response for an update request
 193             try:
 194                 resp_message = self._response.generate_response(
 195                     'update_alarm_response', alarm_id=alarm_id,
 196                     cor_id=alarm_details['correlation_id'],
 197                     status=status)
 198                 log.info("Response message: %s", resp_message)
 199                 self._producer.update_alarm_response(
 200                     'update_alarm_response', resp_message,
 201                     'alarm_response')
 202             except Exception as exc:
 203                 log.warn("Failed to send an update response:%s", exc)
 204
 205         else:
 206             log.debug("Unknown key, no action will be performed")
 207
 208         return
 209
 210     def configure_alarm(self, alarm_endpoint, metric_endpoint, auth_token, values):
 211         """Create requested alarm in Aodh."""
 212         url = "{}/v2/alarms/".format(alarm_endpoint)
 213
 214         # Check if the desired alarm is supported
 215         alarm_name = values['alarm_name'].lower()
 216         metric_name = values['metric_name'].lower()
 217         resource_id = values['resource_uuid']
 218
 219         if metric_name not in METRIC_MAPPINGS.keys():
 220             log.warn("This metric is not supported.")
 221             return None, False
 222
 223         # Check for the required metric
 224         metric_id = self.check_for_metric(auth_token, metric_endpoint, metric_name, resource_id)
 225
 226         try:
 227             if metric_id is not None:
 228                 # Create the alarm if metric is available
 229                 payload = self.check_payload(values, metric_name, resource_id,
 230                                              alarm_name)
 231                 new_alarm = Common.perform_request(
 232                     url, auth_token, req_type="post", payload=payload)
 233                 return json.loads(new_alarm.text)['alarm_id'], True
 234             else:
 235                 log.warn("The required Gnocchi metric does not exist.")
 236                 return None, False
 237
 238         except Exception as exc:
 239             log.warn("Failed to create the alarm: %s", exc)
 240         return None, False
 241
 242     def delete_alarm(self, endpoint, auth_token, alarm_id):
 243         """Delete alarm function."""
 244         url = "{}/v2/alarms/%s".format(endpoint) % alarm_id
 245
 246         try:
 247             result = Common.perform_request(
 248                 url, auth_token, req_type="delete")
 249             if str(result.status_code) == "404":
 250                 log.info("Alarm doesn't exist: %s", result.status_code)
 251                 # If status code is 404 alarm did not exist
 252                 return False
 253             else:
 254                 return True
 255
 256         except Exception as exc:
 257             log.warn("Failed to delete alarm: %s because %s.", alarm_id, exc)
 258         return False
 259
 260     def list_alarms(self, endpoint, auth_token, list_details):
 261         """Generate the requested list of alarms."""
 262         url = "{}/v2/alarms/".format(endpoint)
 263         a_list, name_list, sev_list, res_list = [], [], [], []
 264
 265         # TODO(mcgoughh): for now resource_id is a mandatory field
 266         # Check for a resource id
 267         try:
 268             resource = list_details['resource_uuid']
 269         except KeyError as exc:
 270             log.warn("Resource id not specified for list request: %s", exc)
 271             return None
 272
 273         # Checking what fields are specified for a list request
 274         try:
 275             name = list_details['alarm_name'].lower()
 276             if name not in ALARM_NAMES.keys():
 277                 log.warn("This alarm is not supported, won't be used!")
 278                 name = None
 279         except KeyError as exc:
 280             log.info("Alarm name isn't specified.")
 281             name = None
 282
 283         try:
 284             severity = list_details['severity'].lower()
 285             sev = SEVERITIES[severity]
 286         except KeyError as exc:
 287             log.info("Severity is unspecified/incorrectly configured")
 288             sev = None
 289
 290         # Perform the request to get the desired list
 291         try:
 292             result = Common.perform_request(
 293                 url, auth_token, req_type="get")
 294
 295             if result is not None:
 296                 # Get list based on resource id
 297                 for alarm in json.loads(result.text):
 298                     rule = alarm['gnocchi_resources_threshold_rule']
 299                     if resource == rule['resource_id']:
 300                         res_list.append(alarm)
 301                     if not res_list:
 302                         log.info("No alarms for this resource")
 303                         return a_list
 304
 305                 # Generate specified listed if requested
 306                 if name is not None and sev is not None:
 307                     log.info("Return a list of %s alarms with %s severity.",
 308                              name, sev)
 309                     for alarm in json.loads(result.text):
 310                         if name == alarm['name']:
 311                             name_list.append(alarm)
 312                     for alarm in json.loads(result.text):
 313                         if sev == alarm['severity']:
 314                             sev_list.append(alarm)
 315                     name_sev_list = list(set(name_list).intersection(sev_list))
 316                     a_list = list(set(name_sev_list).intersection(res_list))
 317                 elif name is not None:
 318                     log.info("Returning a %s list of alarms.", name)
 319                     for alarm in json.loads(result.text):
 320                         if name == alarm['name']:
 321                             name_list.append(alarm)
 322                     a_list = list(set(name_list).intersection(res_list))
 323                 elif sev is not None:
 324                     log.info("Returning %s severity alarm list.", sev)
 325                     for alarm in json.loads(result.text):
 326                         if sev == alarm['severity']:
 327                             sev_list.append(alarm)
 328                     a_list = list(set(sev_list).intersection(res_list))
 329                 else:
 330                     log.info("Returning an entire list of alarms.")
 331                     a_list = res_list
 332             else:
 333                 log.info("There are no alarms!")
 334
 335         except Exception as exc:
 336             log.info("Failed to generate required list: %s", exc)
 337             return None
 338
 339         return a_list
 340
 341     def update_alarm_state(self, endpoint, auth_token, alarm_id):
 342         """Set the state of an alarm to ok when ack message is received."""
 343         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 344         payload = json.dumps("ok")
 345
 346         try:
 347             Common.perform_request(
 348                 url, auth_token, req_type="put", payload=payload)
 349             return True
 350         except Exception as exc:
 351             log.warn("Unable to update alarm state: %s", exc)
 352         return False
 353
 354     def update_alarm(self, endpoint, auth_token, values):
 355         """Get alarm name for an alarm configuration update."""
 356         # Get already existing alarm details
 357         url = "{}/v2/alarms/%s".format(endpoint) % values['alarm_uuid']
 358
 359         # Gets current configurations about the alarm
 360         try:
 361             result = Common.perform_request(
 362                 url, auth_token, req_type="get")
 363             alarm_name = json.loads(result.text)['name']
 364             rule = json.loads(result.text)['gnocchi_resources_threshold_rule']
 365             alarm_state = json.loads(result.text)['state']
 366             resource_id = rule['resource_id']
 367             metric_name = [key for key, value in six.iteritems(METRIC_MAPPINGS) if value == rule['metric']][0]
 368         except Exception as exc:
 369             log.warn("Failed to retrieve existing alarm info: %s.\
 370                      Can only update OSM alarms.", exc)
 371             return None, False
 372
 373         # Generates and check payload configuration for alarm update
 374         payload = self.check_payload(values, metric_name, resource_id,
 375                                      alarm_name, alarm_state=alarm_state)
 376
 377         # Updates the alarm configurations with the valid payload
 378         if payload is not None:
 379             try:
 380                 update_alarm = Common.perform_request(
 381                     url, auth_token, req_type="put", payload=payload)
 382
 383                 return json.loads(update_alarm.text)['alarm_id'], True
 384             except Exception as exc:
 385                 log.warn("Alarm update could not be performed: %s", exc)
 386                 return None, False
 387         return None, False
 388
 389     def check_payload(self, values, metric_name, resource_id,
 390                       alarm_name, alarm_state=None):
 391         """Check that the payload is configuration for update/create alarm."""
 392         try:
 393             cfg = Config.instance()
 394             # Check state and severity
 395
 396             severity = 'critical'
 397             if 'severity' in values:
 398                 severity = values['severity'].lower()
 399
 400             if severity == "indeterminate":
 401                 alarm_state = "insufficient data"
 402             if alarm_state is None:
 403                 alarm_state = "ok"
 404
 405             statistic = values['statistic'].lower()
 406
 407             granularity = '300'
 408             if 'granularity' in values:
 409                 granularity = values['granularity']
 410
 411             resource_type = 'generic'
 412             if 'resource_type' in values:
 413                 resource_type = values['resource_type'].lower()
 414
 415             # Try to configure the payload for the update/create request
 416             # Can only update: threshold, operation, statistic and
 417             # the severity of the alarm
 418             rule = {'threshold': values['threshold_value'],
 419                     'comparison_operator': values['operation'].lower(),
 420                     'metric': METRIC_MAPPINGS[metric_name],
 421                     'resource_id': resource_id,
 422                     'resource_type': resource_type,
 423                     'aggregation_method': STATISTICS[statistic],
 424                     'granularity': granularity, }
 425             payload = json.dumps({'state': alarm_state,
 426                                   'name': alarm_name,
 427                                   'severity': SEVERITIES[severity],
 428                                   'type': 'gnocchi_resources_threshold',
 429                                   'gnocchi_resources_threshold_rule': rule,
 430                                   'alarm_actions': [cfg.OS_NOTIFIER_URI], })
 431             return payload
 432         except KeyError as exc:
 433             log.warn("Alarm is not configured correctly: %s", exc)
 434         return None
 435
 436     def get_alarm_state(self, endpoint, auth_token, alarm_id):
 437         """Get the state of the alarm."""
 438         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 439
 440         try:
 441             alarm_state = Common.perform_request(
 442                 url, auth_token, req_type="get")
 443             return json.loads(alarm_state.text)
 444         except Exception as exc:
 445             log.warn("Failed to get the state of the alarm:%s", exc)
 446         return None
 447
 448     def check_for_metric(self, auth_token, metric_endpoint, m_name, r_id):
 449         """Check for the alarm metric."""
 450         try:
 451             url = "{}/v1/metric?sort=name:asc".format(metric_endpoint)
 452             result = Common.perform_request(
 453                 url, auth_token, req_type="get")
 454             metric_list = []
 455             metrics_partial = json.loads(result.text)
 456             for metric in metrics_partial:
 457                 metric_list.append(metric)
 458
 459             while len(json.loads(result.text)) > 0:
 460                 last_metric_id = metrics_partial[-1]['id']
 461                 url = "{}/v1/metric?sort=name:asc&marker={}".format(metric_endpoint, last_metric_id)
 462                 result = Common.perform_request(
 463                     url, auth_token, req_type="get")
 464                 if len(json.loads(result.text)) > 0:
 465                     metrics_partial = json.loads(result.text)
 466                     for metric in metrics_partial:
 467                         metric_list.append(metric)
 468             metric_id = None
 469             for metric in metric_list:
 470                 name = metric['name']
 471                 resource = metric['resource_id']
 472                 if name == METRIC_MAPPINGS[m_name] and resource == r_id:
 473                     metric_id = metric['id']
 474             log.info("The required metric exists, an alarm will be created.")
 475             return metric_id
 476         except Exception as exc:
 477             log.info("Desired Gnocchi metric not found:%s", exc)
 478         return None