osm_mon/plugins/OpenStack/Aodh/alarming.py

   1 # Copyright 2017 Intel Research and Development Ireland Limited
   2 # *************************************************************
   3
   4 # This file is part of OSM Monitoring module
   5 # All Rights Reserved to Intel Corporation
   6
   7 # Licensed under the Apache License, Version 2.0 (the "License"); you may
   8 # not use this file except in compliance with the License. You may obtain
   9 # a copy of the License at
  10
  11 #         http://www.apache.org/licenses/LICENSE-2.0
  12
  13 # Unless required by applicable law or agreed to in writing, software
  14 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16 # License for the specific language governing permissions and limitations
  17 # under the License.
  18
  19 # For those usages not covered by the Apache License, Version 2.0 please
  20 # contact: helena.mcgough@intel.com or adrian.hoban@intel.com
  21 ##
  22 """Carry out alarming requests via Aodh API."""
  23
  24 import json
  25 import logging
  26
  27 import six
  28 import yaml
  29
  30 from osm_mon.core.auth import AuthManager
  31 from osm_mon.core.database import DatabaseManager
  32 from osm_mon.core.message_bus.producer import KafkaProducer
  33 from osm_mon.core.settings import Config
  34 from osm_mon.plugins.OpenStack.common import Common
  35 from osm_mon.plugins.OpenStack.response import OpenStack_Response
  36
  37 log = logging.getLogger(__name__)
  38
  39 ALARM_NAMES = {
  40     "average_memory_usage_above_threshold": "average_memory_utilization",
  41     "disk_read_ops": "disk_read_ops",
  42     "disk_write_ops": "disk_write_ops",
  43     "disk_read_bytes": "disk_read_bytes",
  44     "disk_write_bytes": "disk_write_bytes",
  45     "net_packets_dropped": "packets_dropped",
  46     "packets_in_above_threshold": "packets_received",
  47     "packets_out_above_threshold": "packets_sent",
  48     "cpu_utilization_above_threshold": "cpu_utilization"}
  49
  50 METRIC_MAPPINGS = {
  51     "average_memory_utilization": "memory.percent",
  52     "disk_read_ops": "disk.read.requests",
  53     "disk_write_ops": "disk.write.requests",
  54     "disk_read_bytes": "disk.read.bytes",
  55     "disk_write_bytes": "disk.write.bytes",
  56     "packets_dropped": "interface.if_dropped",
  57     "packets_received": "interface.if_packets",
  58     "packets_sent": "interface.if_packets",
  59     "cpu_utilization": "cpu_util",
  60 }
  61
  62 SEVERITIES = {
  63     "warning": "low",
  64     "minor": "low",
  65     "major": "moderate",
  66     "critical": "critical",
  67     "indeterminate": "critical"}
  68
  69 STATISTICS = {
  70     "average": "mean",
  71     "minimum": "min",
  72     "maximum": "max",
  73     "count": "count",
  74     "sum": "sum"}
  75
  76
  77 class Alarming(object):
  78     """Carries out alarming requests and responses via Aodh API."""
  79
  80     def __init__(self):
  81         """Create the OpenStack alarming instance."""
  82         # Initialize configuration and notifications
  83         config = Config.instance()
  84         config.read_environ()
  85
  86         self._database_manager = DatabaseManager()
  87         self._auth_manager = AuthManager()
  88
  89         # Use the Response class to generate valid json response messages
  90         self._response = OpenStack_Response()
  91
  92         # Initializer a producer to send responses back to SO
  93         self._producer = KafkaProducer("alarm_response")
  94
  95     def configure_alarm(self, alarm_endpoint, metric_endpoint, auth_token, values, vim_config):
  96         """Create requested alarm in Aodh."""
  97         url = "{}/v2/alarms/".format(alarm_endpoint)
  98
  99         # Check if the desired alarm is supported
 100         alarm_name = values['alarm_name'].lower()
 101         metric_name = values['metric_name'].lower()
 102         resource_id = values['resource_uuid']
 103
 104         if metric_name not in METRIC_MAPPINGS.keys():
 105             log.warn("This metric is not supported.")
 106             return None, False
 107
 108         # Check for the required metric
 109         metric_id = self.check_for_metric(auth_token, metric_endpoint, metric_name, resource_id)
 110
 111         try:
 112             if metric_id is not None:
 113                 # Create the alarm if metric is available
 114                 if 'granularity' in vim_config and 'granularity' not in values:
 115                     values['granularity'] = vim_config['granularity']
 116                 payload = self.check_payload(values, metric_name, resource_id,
 117                                              alarm_name)
 118                 new_alarm = Common.perform_request(
 119                     url, auth_token, req_type="post", payload=payload)
 120                 return json.loads(new_alarm.text)['alarm_id'], True
 121             else:
 122                 log.warn("The required Gnocchi metric does not exist.")
 123                 return None, False
 124
 125         except Exception as exc:
 126             log.warn("Failed to create the alarm: %s", exc)
 127         return None, False
 128
 129     def alarming(self, message):
 130         """Consume info from the message bus to manage alarms."""
 131         try:
 132             values = json.loads(message.value)
 133         except ValueError:
 134             values = yaml.safe_load(message.value)
 135
 136         log.info("OpenStack alarm action required.")
 137         vim_uuid = values['vim_uuid']
 138
 139         auth_token = Common.get_auth_token(vim_uuid)
 140
 141         alarm_endpoint = Common.get_endpoint("alarming", vim_uuid)
 142         metric_endpoint = Common.get_endpoint("metric", vim_uuid)
 143
 144         vim_account = self._auth_manager.get_credentials(vim_uuid)
 145         vim_config = json.loads(vim_account.config)
 146
 147         if message.key == "create_alarm_request":
 148             # Configure/Update an alarm
 149             alarm_details = values['alarm_create_request']
 150
 151             alarm_id, alarm_status = self.configure_alarm(
 152                 alarm_endpoint, metric_endpoint, auth_token, alarm_details, vim_config)
 153
 154             # Generate a valid response message, send via producer
 155             try:
 156                 if alarm_status is True:
 157                     log.info("Alarm successfully created")
 158                     self._database_manager.save_alarm(alarm_id, vim_uuid)
 159
 160                 resp_message = self._response.generate_response(
 161                     'create_alarm_response', status=alarm_status,
 162                     alarm_id=alarm_id,
 163                     cor_id=alarm_details['correlation_id'])
 164                 log.info("Response Message: %s", resp_message)
 165                 self._producer.create_alarm_response(
 166                     'create_alarm_response', resp_message,
 167                     'alarm_response')
 168             except Exception:
 169                 log.exception("Response creation failed:")
 170
 171         elif message.key == "list_alarm_request":
 172             # Check for a specified: alarm_name, resource_uuid, severity
 173             # and generate the appropriate list
 174             list_details = values['alarm_list_request']
 175
 176             alarm_list = self.list_alarms(
 177                 alarm_endpoint, auth_token, list_details)
 178
 179             try:
 180                 # Generate and send a list response back
 181                 resp_message = self._response.generate_response(
 182                     'list_alarm_response', alarm_list=alarm_list,
 183                     cor_id=list_details['correlation_id'])
 184                 log.info("Response Message: %s", resp_message)
 185                 self._producer.list_alarm_response(
 186                     'list_alarm_response', resp_message,
 187                     'alarm_response')
 188             except Exception:
 189                 log.exception("Failed to send a valid response back.")
 190
 191         elif message.key == "delete_alarm_request":
 192             request_details = values['alarm_delete_request']
 193             alarm_id = request_details['alarm_uuid']
 194
 195             resp_status = self.delete_alarm(
 196                 alarm_endpoint, auth_token, alarm_id)
 197
 198             # Generate and send a response message
 199             try:
 200                 resp_message = self._response.generate_response(
 201                     'delete_alarm_response', alarm_id=alarm_id,
 202                     status=resp_status,
 203                     cor_id=request_details['correlation_id'])
 204                 log.info("Response message: %s", resp_message)
 205                 self._producer.delete_alarm_response(
 206                     'delete_alarm_response', resp_message,
 207                     'alarm_response')
 208             except Exception:
 209                 log.exception("Failed to create delete response: ")
 210
 211         elif message.key == "acknowledge_alarm":
 212             # Acknowledge that an alarm has been dealt with by the SO
 213             alarm_id = values['ack_details']['alarm_uuid']
 214
 215             response = self.update_alarm_state(
 216                 alarm_endpoint, auth_token, alarm_id)
 217
 218             # Log if an alarm was reset
 219             if response is True:
 220                 log.info("Acknowledged the alarm and cleared it.")
 221             else:
 222                 log.warn("Failed to acknowledge/clear the alarm.")
 223
 224         elif message.key == "update_alarm_request":
 225             # Update alarm configurations
 226             alarm_details = values['alarm_update_request']
 227
 228             alarm_id, status = self.update_alarm(
 229                 alarm_endpoint, auth_token, alarm_details, vim_config)
 230
 231             # Generate a response for an update request
 232             try:
 233                 resp_message = self._response.generate_response(
 234                     'update_alarm_response', alarm_id=alarm_id,
 235                     cor_id=alarm_details['correlation_id'],
 236                     status=status)
 237                 log.info("Response message: %s", resp_message)
 238                 self._producer.update_alarm_response(
 239                     'update_alarm_response', resp_message,
 240                     'alarm_response')
 241             except Exception:
 242                 log.exception("Failed to send an update response: ")
 243
 244         else:
 245             log.debug("Unknown key, no action will be performed")
 246
 247         return
 248
 249     def delete_alarm(self, endpoint, auth_token, alarm_id):
 250         """Delete alarm function."""
 251         url = "{}/v2/alarms/%s".format(endpoint) % alarm_id
 252
 253         try:
 254             result = Common.perform_request(
 255                 url, auth_token, req_type="delete")
 256             if str(result.status_code) == "404":
 257                 log.info("Alarm doesn't exist: %s", result.status_code)
 258                 # If status code is 404 alarm did not exist
 259                 return False
 260             else:
 261                 return True
 262
 263         except Exception:
 264             log.exception("Failed to delete alarm %s :", alarm_id)
 265         return False
 266
 267     def list_alarms(self, endpoint, auth_token, list_details):
 268         """Generate the requested list of alarms."""
 269         url = "{}/v2/alarms/".format(endpoint)
 270         a_list, name_list, sev_list, res_list = [], [], [], []
 271
 272         # TODO(mcgoughh): for now resource_id is a mandatory field
 273         # Check for a resource id
 274         try:
 275             resource = list_details['resource_uuid']
 276         except KeyError as exc:
 277             log.warn("Resource id not specified for list request: %s", exc)
 278             return None
 279
 280         # Checking what fields are specified for a list request
 281         try:
 282             name = list_details['alarm_name'].lower()
 283             if name not in ALARM_NAMES.keys():
 284                 log.warn("This alarm is not supported, won't be used!")
 285                 name = None
 286         except KeyError as exc:
 287             log.info("Alarm name isn't specified.")
 288             name = None
 289
 290         try:
 291             severity = list_details['severity'].lower()
 292             sev = SEVERITIES[severity]
 293         except KeyError as exc:
 294             log.info("Severity is unspecified/incorrectly configured")
 295             sev = None
 296
 297         # Perform the request to get the desired list
 298         try:
 299             result = Common.perform_request(
 300                 url, auth_token, req_type="get")
 301
 302             if result is not None:
 303                 # Get list based on resource id
 304                 for alarm in json.loads(result.text):
 305                     rule = alarm['gnocchi_resources_threshold_rule']
 306                     if resource == rule['resource_id']:
 307                         res_list.append(alarm)
 308                     if not res_list:
 309                         log.info("No alarms for this resource")
 310                         return a_list
 311
 312                 # Generate specified listed if requested
 313                 if name is not None and sev is not None:
 314                     log.info("Return a list of %s alarms with %s severity.",
 315                              name, sev)
 316                     for alarm in json.loads(result.text):
 317                         if name == alarm['name']:
 318                             name_list.append(alarm)
 319                     for alarm in json.loads(result.text):
 320                         if sev == alarm['severity']:
 321                             sev_list.append(alarm)
 322                     name_sev_list = list(set(name_list).intersection(sev_list))
 323                     a_list = list(set(name_sev_list).intersection(res_list))
 324                 elif name is not None:
 325                     log.info("Returning a %s list of alarms.", name)
 326                     for alarm in json.loads(result.text):
 327                         if name == alarm['name']:
 328                             name_list.append(alarm)
 329                     a_list = list(set(name_list).intersection(res_list))
 330                 elif sev is not None:
 331                     log.info("Returning %s severity alarm list.", sev)
 332                     for alarm in json.loads(result.text):
 333                         if sev == alarm['severity']:
 334                             sev_list.append(alarm)
 335                     a_list = list(set(sev_list).intersection(res_list))
 336                 else:
 337                     log.info("Returning an entire list of alarms.")
 338                     a_list = res_list
 339             else:
 340                 log.info("There are no alarms!")
 341
 342         except Exception as exc:
 343             log.info("Failed to generate required list: %s", exc)
 344             return None
 345
 346         return a_list
 347
 348     def update_alarm_state(self, endpoint, auth_token, alarm_id):
 349         """Set the state of an alarm to ok when ack message is received."""
 350         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 351         payload = json.dumps("ok")
 352
 353         try:
 354             Common.perform_request(
 355                 url, auth_token, req_type="put", payload=payload)
 356             return True
 357         except Exception:
 358             log.exception("Unable to update alarm state: ")
 359         return False
 360
 361     def update_alarm(self, endpoint, auth_token, values, vim_config):
 362         """Get alarm name for an alarm configuration update."""
 363         # Get already existing alarm details
 364         url = "{}/v2/alarms/%s".format(endpoint) % values['alarm_uuid']
 365
 366         # Gets current configurations about the alarm
 367         try:
 368             result = Common.perform_request(
 369                 url, auth_token, req_type="get")
 370             alarm_name = json.loads(result.text)['name']
 371             rule = json.loads(result.text)['gnocchi_resources_threshold_rule']
 372             alarm_state = json.loads(result.text)['state']
 373             resource_id = rule['resource_id']
 374             metric_name = [key for key, value in six.iteritems(METRIC_MAPPINGS) if value == rule['metric']][0]
 375         except Exception as exc:
 376             log.warn("Failed to retrieve existing alarm info: %s.\
 377                      Can only update OSM alarms.", exc)
 378             return None, False
 379
 380         # Generates and check payload configuration for alarm update
 381         if 'granularity' in vim_config and 'granularity' not in values:
 382             values['granularity'] = vim_config['granularity']
 383         payload = self.check_payload(values, metric_name, resource_id,
 384                                      alarm_name, alarm_state=alarm_state)
 385
 386         # Updates the alarm configurations with the valid payload
 387         if payload is not None:
 388             try:
 389                 update_alarm = Common.perform_request(
 390                     url, auth_token, req_type="put", payload=payload)
 391
 392                 return json.loads(update_alarm.text)['alarm_id'], True
 393             except Exception as exc:
 394                 log.warn("Alarm update could not be performed: %s", exc)
 395                 return None, False
 396         return None, False
 397
 398     def check_payload(self, values, metric_name, resource_id,
 399                       alarm_name, alarm_state=None):
 400         """Check that the payload is configuration for update/create alarm."""
 401         try:
 402             cfg = Config.instance()
 403             # Check state and severity
 404
 405             severity = 'critical'
 406             if 'severity' in values:
 407                 severity = values['severity'].lower()
 408
 409             if severity == "indeterminate":
 410                 alarm_state = "insufficient data"
 411             if alarm_state is None:
 412                 alarm_state = "ok"
 413
 414             statistic = values['statistic'].lower()
 415
 416             granularity = cfg.OS_DEFAULT_GRANULARITY
 417             if 'granularity' in values:
 418                 granularity = values['granularity']
 419
 420             resource_type = 'generic'
 421             if 'resource_type' in values:
 422                 resource_type = values['resource_type'].lower()
 423
 424             # Try to configure the payload for the update/create request
 425             # Can only update: threshold, operation, statistic and
 426             # the severity of the alarm
 427             rule = {'threshold': values['threshold_value'],
 428                     'comparison_operator': values['operation'].lower(),
 429                     'metric': METRIC_MAPPINGS[metric_name],
 430                     'resource_id': resource_id,
 431                     'resource_type': resource_type,
 432                     'aggregation_method': STATISTICS[statistic],
 433                     'granularity': granularity, }
 434             payload = json.dumps({'state': alarm_state,
 435                                   'name': alarm_name,
 436                                   'severity': SEVERITIES[severity],
 437                                   'type': 'gnocchi_resources_threshold',
 438                                   'gnocchi_resources_threshold_rule': rule,
 439                                   'alarm_actions': [cfg.OS_NOTIFIER_URI], })
 440             return payload
 441         except KeyError as exc:
 442             log.warn("Alarm is not configured correctly: %s", exc)
 443         return None
 444
 445     def get_alarm_state(self, endpoint, auth_token, alarm_id):
 446         """Get the state of the alarm."""
 447         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 448
 449         try:
 450             alarm_state = Common.perform_request(
 451                 url, auth_token, req_type="get")
 452             return json.loads(alarm_state.text)
 453         except Exception as exc:
 454             log.warn("Failed to get the state of the alarm:%s", exc)
 455         return None
 456
 457     def check_for_metric(self, auth_token, metric_endpoint, m_name, r_id):
 458         """Check for the alarm metric."""
 459         try:
 460             url = "{}/v1/metric?sort=name:asc".format(metric_endpoint)
 461             result = Common.perform_request(
 462                 url, auth_token, req_type="get")
 463             metric_list = []
 464             metrics_partial = json.loads(result.text)
 465             for metric in metrics_partial:
 466                 metric_list.append(metric)
 467
 468             while len(json.loads(result.text)) > 0:
 469                 last_metric_id = metrics_partial[-1]['id']
 470                 url = "{}/v1/metric?sort=name:asc&marker={}".format(metric_endpoint, last_metric_id)
 471                 result = Common.perform_request(
 472                     url, auth_token, req_type="get")
 473                 if len(json.loads(result.text)) > 0:
 474                     metrics_partial = json.loads(result.text)
 475                     for metric in metrics_partial:
 476                         metric_list.append(metric)
 477             metric_id = None
 478             for metric in metric_list:
 479                 name = metric['name']
 480                 resource = metric['resource_id']
 481                 if name == METRIC_MAPPINGS[m_name] and resource == r_id:
 482                     metric_id = metric['id']
 483             log.info("The required metric exists, an alarm will be created.")
 484             return metric_id
 485         except Exception as exc:
 486             log.info("Desired Gnocchi metric not found:%s", exc)
 487         return None