OpenStack/Aodh/alarming.py

   1 # Copyright 2017 Intel Research and Development Ireland Limited
   2 # *************************************************************
   3
   4 # This file is part of OSM Monitoring module
   5 # All Rights Reserved to Intel Corporation
   6
   7 # Licensed under the Apache License, Version 2.0 (the "License"); you may
   8 # not use this file except in compliance with the License. You may obtain
   9 # a copy of the License at
  10
  11 #         http://www.apache.org/licenses/LICENSE-2.0
  12
  13 # Unless required by applicable law or agreed to in writing, software
  14 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16 # License for the specific language governing permissions and limitations
  17 # under the License.
  18
  19 # For those usages not covered by the Apache License, Version 2.0 please
  20 # contact: helena.mcgough@intel.com or adrian.hoban@intel.com
  21 ##
  22 """Carry out alarming requests via Aodh API."""
  23
  24 import json
  25
  26 import logging
  27
  28 from osm_mon.core.message_bus.producer import KafkaProducer
  29
  30 from osm_mon.plugins.OpenStack.response import OpenStack_Response
  31 from osm_mon.plugins.OpenStack.settings import Config
  32
  33 log = logging.getLogger(__name__)
  34
  35 ALARM_NAMES = {
  36     "average_memory_usage_above_threshold": "average_memory_utilization",
  37     "disk_read_ops": "disk_read_ops",
  38     "disk_write_ops": "disk_write_ops",
  39     "disk_read_bytes": "disk_read_bytes",
  40     "disk_write_bytes": "disk_write_bytes",
  41     "net_packets_dropped": "packets_dropped",
  42     "packets_in_above_threshold": "packets_received",
  43     "packets_out_above_threshold": "packets_sent",
  44     "cpu_utilization_above_threshold": "cpu_utilization"}
  45
  46 SEVERITIES = {
  47     "warning": "low",
  48     "minor": "low",
  49     "major": "moderate",
  50     "critical": "critical",
  51     "indeterminate": "critical"}
  52
  53 STATISTICS = {
  54     "average": "avg",
  55     "minimum": "min",
  56     "maximum": "max",
  57     "count": "count",
  58     "sum": "sum"}
  59
  60
  61 class Alarming(object):
  62     """Carries out alarming requests and responses via Aodh API."""
  63
  64     def __init__(self):
  65         """Create the OpenStack alarming instance."""
  66         # Initialize configuration and notifications
  67         config = Config.instance()
  68         config.read_environ("aodh")
  69
  70         # Initialise authentication for API requests
  71         self.auth_token = None
  72         self.endpoint = None
  73         self.common = None
  74
  75         # Use the Response class to generate valid json response messages
  76         self._response = OpenStack_Response()
  77
  78         # Initializer a producer to send responses back to SO
  79         self._producer = KafkaProducer("alarm_response")
  80
  81     def alarming(self, message, common, auth_token):
  82         """Consume info from the message bus to manage alarms."""
  83         values = json.loads(message.value)
  84         self.common = common
  85
  86         log.info("OpenStack alarm action required.")
  87
  88         # Generate and auth_token and endpoint for request
  89         if auth_token is not None:
  90             if self.auth_token != auth_token:
  91                 log.info("Auth_token for alarming set by access_credentials.")
  92                 self.auth_token = auth_token
  93             else:
  94                 log.info("Auth_token has not been updated.")
  95         else:
  96             log.info("Using environment variables to set auth_token for Aodh.")
  97             self.auth_token = self.common._authenticate()
  98
  99         if self.endpoint is None:
 100             log.info("Generating a new endpoint for Aodh.")
 101             self.endpoint = self.common.get_endpoint("alarming")
 102
 103         if message.key == "create_alarm_request":
 104             # Configure/Update an alarm
 105             alarm_details = values['alarm_create_request']
 106
 107             alarm_id, alarm_status = self.configure_alarm(
 108                 self.endpoint, self.auth_token, alarm_details)
 109
 110             # Generate a valid response message, send via producer
 111             try:
 112                 if alarm_status is True:
 113                     log.info("Alarm successfully created")
 114
 115                 resp_message = self._response.generate_response(
 116                     'create_alarm_response', status=alarm_status,
 117                     alarm_id=alarm_id,
 118                     cor_id=alarm_details['correlation_id'])
 119                 log.info("Response Message: %s", resp_message)
 120                 self._producer.create_alarm_response(
 121                     'create_alarm_response', resp_message,
 122                     'alarm_response')
 123             except Exception as exc:
 124                 log.warn("Response creation failed: %s", exc)
 125
 126         elif message.key == "list_alarm_request":
 127             # Check for a specifed: alarm_name, resource_uuid, severity
 128             # and generate the appropriate list
 129             list_details = values['alarm_list_request']
 130
 131             alarm_list = self.list_alarms(
 132                 self.endpoint, self.auth_token, list_details)
 133
 134             try:
 135                 # Generate and send a list response back
 136                 resp_message = self._response.generate_response(
 137                     'list_alarm_response', alarm_list=alarm_list,
 138                     cor_id=list_details['correlation_id'])
 139                 log.info("Response Message: %s", resp_message)
 140                 self._producer.list_alarm_response(
 141                     'list_alarm_response', resp_message,
 142                     'alarm_response')
 143             except Exception as exc:
 144                 log.warn("Failed to send a valid response back.")
 145
 146         elif message.key == "delete_alarm_request":
 147             request_details = values['alarm_delete_request']
 148             alarm_id = request_details['alarm_uuid']
 149
 150             resp_status = self.delete_alarm(
 151                 self.endpoint, self.auth_token, alarm_id)
 152
 153             # Generate and send a response message
 154             try:
 155                 resp_message = self._response.generate_response(
 156                     'delete_alarm_response', alarm_id=alarm_id,
 157                     status=resp_status,
 158                     cor_id=request_details['correlation_id'])
 159                 log.info("Response message: %s", resp_message)
 160                 self._producer.delete_alarm_response(
 161                     'delete_alarm_response', resp_message,
 162                     'alarm_response')
 163             except Exception as exc:
 164                 log.warn("Failed to create delete reponse:%s", exc)
 165
 166         elif message.key == "acknowledge_alarm":
 167             # Acknowledge that an alarm has been dealt with by the SO
 168             alarm_id = values['ack_details']['alarm_uuid']
 169
 170             response = self.update_alarm_state(
 171                 self.endpoint, self.auth_token, alarm_id)
 172
 173             # Log if an alarm was reset
 174             if response is True:
 175                 log.info("Acknowledged the alarm and cleared it.")
 176             else:
 177                 log.warn("Failed to acknowledge/clear the alarm.")
 178
 179         elif message.key == "update_alarm_request":
 180             # Update alarm configurations
 181             alarm_details = values['alarm_update_request']
 182
 183             alarm_id, status = self.update_alarm(
 184                 self.endpoint, self.auth_token, alarm_details)
 185
 186             # Generate a response for an update request
 187             try:
 188                 resp_message = self._response.generate_response(
 189                     'update_alarm_response', alarm_id=alarm_id,
 190                     cor_id=alarm_details['correlation_id'],
 191                     status=status)
 192                 log.info("Response message: %s", resp_message)
 193                 self._producer.update_alarm_response(
 194                     'update_alarm_response', resp_message,
 195                     'alarm_response')
 196             except Exception as exc:
 197                 log.warn("Failed to send an update response:%s", exc)
 198
 199         else:
 200             log.debug("Unknown key, no action will be performed")
 201
 202         return
 203
 204     def configure_alarm(self, endpoint, auth_token, values):
 205         """Create requested alarm in Aodh."""
 206         url = "{}/v2/alarms/".format(endpoint)
 207
 208         # Check if the desired alarm is supported
 209         alarm_name = values['alarm_name'].lower()
 210         metric_name = values['metric_name'].lower()
 211         resource_id = values['resource_uuid']
 212
 213         if alarm_name not in ALARM_NAMES.keys():
 214             log.warn("This alarm is not supported, by a valid metric.")
 215             return None, False
 216         if ALARM_NAMES[alarm_name] != metric_name:
 217             log.warn("This is not the correct metric for this alarm.")
 218             return None, False
 219
 220         # Check for the required metric
 221         metric_id = self.check_for_metric(auth_token, metric_name, resource_id)
 222
 223         try:
 224             if metric_id is not None:
 225                 # Create the alarm if metric is available
 226                 payload = self.check_payload(values, metric_name, resource_id,
 227                                              alarm_name)
 228                 new_alarm = self.common._perform_request(
 229                     url, auth_token, req_type="post", payload=payload)
 230                 return json.loads(new_alarm.text)['alarm_id'], True
 231             else:
 232                 log.warn("The required Gnocchi metric does not exist.")
 233                 return None, False
 234
 235         except Exception as exc:
 236             log.warn("Failed to create the alarm: %s", exc)
 237         return None, False
 238
 239     def delete_alarm(self, endpoint, auth_token, alarm_id):
 240         """Delete alarm function."""
 241         url = "{}/v2/alarms/%s".format(endpoint) % (alarm_id)
 242
 243         try:
 244             result = self.common._perform_request(
 245                 url, auth_token, req_type="delete")
 246             if str(result.status_code) == "404":
 247                 log.info("Alarm doesn't exist: %s", result.status_code)
 248                 # If status code is 404 alarm did not exist
 249                 return False
 250             else:
 251                 return True
 252
 253         except Exception as exc:
 254             log.warn("Failed to delete alarm: %s because %s.", alarm_id, exc)
 255         return False
 256
 257     def list_alarms(self, endpoint, auth_token, list_details):
 258         """Generate the requested list of alarms."""
 259         url = "{}/v2/alarms/".format(endpoint)
 260         a_list, name_list, sev_list, res_list = [], [], [], []
 261
 262         # TODO(mcgoughh): for now resource_id is a mandatory field
 263         # Check for a reqource is
 264         try:
 265             resource = list_details['resource_uuid']
 266         except KeyError as exc:
 267             log.warn("Resource id not specified for list request: %s", exc)
 268             return None
 269
 270         # Checking what fields are specified for a list request
 271         try:
 272             name = list_details['alarm_name'].lower()
 273             if name not in ALARM_NAMES.keys():
 274                 log.warn("This alarm is not supported, won't be used!")
 275                 name = None
 276         except KeyError as exc:
 277             log.info("Alarm name isn't specified.")
 278             name = None
 279
 280         try:
 281             severity = list_details['severity'].lower()
 282             sev = SEVERITIES[severity]
 283         except KeyError as exc:
 284             log.info("Severity is unspecified/incorrectly configured")
 285             sev = None
 286
 287         # Perform the request to get the desired list
 288         try:
 289             result = self.common._perform_request(
 290                 url, auth_token, req_type="get")
 291
 292             if result is not None:
 293                 # Get list based on resource id
 294                 for alarm in json.loads(result.text):
 295                     rule = alarm['gnocchi_resources_threshold_rule']
 296                     if resource == rule['resource_id']:
 297                         res_list.append(str(alarm))
 298                     if not res_list:
 299                         log.info("No alarms for this resource")
 300                         return a_list
 301
 302                 # Generate specified listed if requested
 303                 if name is not None and sev is not None:
 304                     log.info("Return a list of %s alarms with %s severity.",
 305                              name, sev)
 306                     for alarm in json.loads(result.text):
 307                         if name == alarm['name']:
 308                             name_list.append(str(alarm))
 309                     for alarm in json.loads(result.text):
 310                         if sev == alarm['severity']:
 311                             sev_list.append(str(alarm))
 312                     name_sev_list = list(set(name_list).intersection(sev_list))
 313                     a_list = list(set(name_sev_list).intersection(res_list))
 314                 elif name is not None:
 315                     log.info("Returning a %s list of alarms.", name)
 316                     for alarm in json.loads(result.text):
 317                         if name == alarm['name']:
 318                             name_list.append(str(alarm))
 319                     a_list = list(set(name_list).intersection(res_list))
 320                 elif sev is not None:
 321                     log.info("Returning %s severity alarm list.", sev)
 322                     for alarm in json.loads(result.text):
 323                         if sev == alarm['severity']:
 324                             sev_list.append(str(alarm))
 325                     a_list = list(set(sev_list).intersection(res_list))
 326                 else:
 327                     log.info("Returning an entire list of alarms.")
 328                     a_list = res_list
 329             else:
 330                 log.info("There are no alarms!")
 331
 332         except Exception as exc:
 333             log.info("Failed to generate required list: %s", exc)
 334             return None
 335
 336         return a_list
 337
 338     def update_alarm_state(self, endpoint, auth_token, alarm_id):
 339         """Set the state of an alarm to ok when ack message is received."""
 340         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 341         payload = json.dumps("ok")
 342
 343         try:
 344             self.common._perform_request(
 345                 url, auth_token, req_type="put", payload=payload)
 346             return True
 347         except Exception as exc:
 348             log.warn("Unable to update alarm state: %s", exc)
 349         return False
 350
 351     def update_alarm(self, endpoint, auth_token, values):
 352         """Get alarm name for an alarm configuration update."""
 353         # Get already existing alarm details
 354         url = "{}/v2/alarms/%s".format(endpoint) % values['alarm_uuid']
 355
 356         # Gets current configurations about the alarm
 357         try:
 358             result = self.common._perform_request(
 359                 url, auth_token, req_type="get")
 360             alarm_name = json.loads(result.text)['name']
 361             rule = json.loads(result.text)['gnocchi_resources_threshold_rule']
 362             alarm_state = json.loads(result.text)['state']
 363             resource_id = rule['resource_id']
 364             metric_name = rule['metric']
 365         except Exception as exc:
 366             log.warn("Failed to retreive existing alarm info: %s.\
 367                      Can only update OSM alarms.", exc)
 368             return None, False
 369
 370         # Generates and check payload configuration for alarm update
 371         payload = self.check_payload(values, metric_name, resource_id,
 372                                      alarm_name, alarm_state=alarm_state)
 373
 374         # Updates the alarm configurations with the valid payload
 375         if payload is not None:
 376             try:
 377                 update_alarm = self.common._perform_request(
 378                     url, auth_token, req_type="put", payload=payload)
 379
 380                 return json.loads(update_alarm.text)['alarm_id'], True
 381             except Exception as exc:
 382                 log.warn("Alarm update could not be performed: %s", exc)
 383                 return None, False
 384         return None, False
 385
 386     def check_payload(self, values, metric_name, resource_id,
 387                       alarm_name, alarm_state=None):
 388         """Check that the payload is configuration for update/create alarm."""
 389         try:
 390             # Check state and severity
 391             severity = values['severity'].lower()
 392             if severity == "indeterminate":
 393                 alarm_state = "insufficient data"
 394             if alarm_state is None:
 395                 alarm_state = "ok"
 396
 397             statistic = values['statistic'].lower()
 398             # Try to configure the payload for the update/create request
 399             # Can only update: threshold, operation, statistic and
 400             # the severity of the alarm
 401             rule = {'threshold': values['threshold_value'],
 402                     'comparison_operator': values['operation'].lower(),
 403                     'metric': metric_name,
 404                     'resource_id': resource_id,
 405                     'resource_type': 'generic',
 406                     'aggregation_method': STATISTICS[statistic], }
 407             payload = json.dumps({'state': alarm_state,
 408                                   'name': alarm_name,
 409                                   'severity': SEVERITIES[severity],
 410                                   'type': 'gnocchi_resources_threshold',
 411                                   'gnocchi_resources_threshold_rule': rule,
 412                                   'alarm_actions': ['http://localhost:8662'], })
 413             return payload
 414         except KeyError as exc:
 415             log.warn("Alarm is not configured correctly: %s", exc)
 416         return None
 417
 418     def get_alarm_state(self, endpoint, auth_token, alarm_id):
 419         """Get the state of the alarm."""
 420         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 421
 422         try:
 423             alarm_state = self.common._perform_request(
 424                 url, auth_token, req_type="get")
 425             return json.loads(alarm_state.text)
 426         except Exception as exc:
 427             log.warn("Failed to get the state of the alarm:%s", exc)
 428         return None
 429
 430     def check_for_metric(self, auth_token, m_name, r_id):
 431         """Check for the alarm metric."""
 432         try:
 433             endpoint = self.common.get_endpoint("metric")
 434
 435             url = "{}/v1/metric/".format(endpoint)
 436             metric_list = self.common._perform_request(
 437                 url, auth_token, req_type="get")
 438
 439             for metric in json.loads(metric_list.text):
 440                 name = metric['name']
 441                 resource = metric['resource_id']
 442                 if (name == m_name and resource == r_id):
 443                     metric_id = metric['id']
 444             log.info("The required metric exists, an alarm will be created.")
 445             return metric_id
 446         except Exception as exc:
 447             log.info("Desired Gnocchi metric not found:%s", exc)
 448         return None