alarming.py

   1 # Copyright 2017 Intel Research and Development Ireland Limited
   2 # *************************************************************
   3
   4 # This file is part of OSM Monitoring module
   5 # All Rights Reserved to Intel Corporation
   6
   7 # Licensed under the Apache License, Version 2.0 (the "License"); you may
   8 # not use this file except in compliance with the License. You may obtain
   9 # a copy of the License at
  10
  11 #         http://www.apache.org/licenses/LICENSE-2.0
  12
  13 # Unless required by applicable law or agreed to in writing, software
  14 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16 # License for the specific language governing permissions and limitations
  17 # under the License.
  18
  19 # For those usages not covered by the Apache License, Version 2.0 please
  20 # contact: helena.mcgough@intel.com or adrian.hoban@intel.com
  21 ##
  22 """Carry out alarming requests via Aodh API."""
  23
  24 import json
  25
  26 import logging
  27
  28 from core.message_bus.producer import KafkaProducer
  29
  30 from plugins.OpenStack.response import OpenStack_Response
  31 from plugins.OpenStack.settings import Config
  32
  33 __author__ = "Helena McGough"
  34
  35 log = logging.getLogger(__name__)
  36
  37 ALARM_NAMES = {
  38     "average_memory_usage_above_threshold": "average_memory_utilization",
  39     "disk_read_ops": "disk_read_ops",
  40     "disk_write_ops": "disk_write_ops",
  41     "disk_read_bytes": "disk_read_bytes",
  42     "disk_write_bytes": "disk_write_bytes",
  43     "net_packets_dropped": "packets_dropped",
  44     "packets_in_above_threshold": "packets_received",
  45     "packets_out_above_threshold": "packets_sent",
  46     "cpu_utilization_above_threshold": "cpu_utilization"}
  47
  48 SEVERITIES = {
  49     "warning": "low",
  50     "minor": "low",
  51     "major": "moderate",
  52     "critical": "critical",
  53     "indeterminate": "critical"}
  54
  55 STATISTICS = {
  56     "average": "avg",
  57     "minimum": "min",
  58     "maximum": "max",
  59     "count": "count",
  60     "sum": "sum"}
  61
  62
  63 class Alarming(object):
  64     """Carries out alarming requests and responses via Aodh API."""
  65
  66     def __init__(self):
  67         """Create the OpenStack alarming instance."""
  68         # Initialize configuration and notifications
  69         config = Config.instance()
  70         config.read_environ("aodh")
  71
  72         # Initialise authentication for API requests
  73         self.auth_token = None
  74         self.endpoint = None
  75         self.common = None
  76
  77         # Use the Response class to generate valid json response messages
  78         self._response = OpenStack_Response()
  79
  80         # Initializer a producer to send responses back to SO
  81         self._producer = KafkaProducer("alarm_response")
  82
  83     def alarming(self, message, common, auth_token):
  84         """Consume info from the message bus to manage alarms."""
  85         values = json.loads(message.value)
  86         self.common = common
  87
  88         log.info("OpenStack alarm action required.")
  89
  90         # Generate and auth_token and endpoint for request
  91         if auth_token is not None:
  92             if self.auth_token != auth_token:
  93                 log.info("Auth_token for alarming set by access_credentials.")
  94                 self.auth_token = auth_token
  95             else:
  96                 log.info("Auth_token has not been updated.")
  97         else:
  98             log.info("Using environment variables to set auth_token for Aodh.")
  99             self.auth_token = self.common._authenticate()
 100
 101         if self.endpoint is None:
 102             log.info("Generating a new endpoint for Aodh.")
 103             self.endpoint = self.common.get_endpoint("alarming")
 104
 105         if message.key == "create_alarm_request":
 106             # Configure/Update an alarm
 107             alarm_details = values['alarm_create_request']
 108
 109             alarm_id, alarm_status = self.configure_alarm(
 110                 self.endpoint, self.auth_token, alarm_details)
 111
 112             # Generate a valid response message, send via producer
 113             try:
 114                 if alarm_status is True:
 115                     log.info("Alarm successfully created")
 116
 117                 resp_message = self._response.generate_response(
 118                     'create_alarm_response', status=alarm_status,
 119                     alarm_id=alarm_id,
 120                     cor_id=alarm_details['correlation_id'])
 121                 log.info("Response Message: %s", resp_message)
 122                 self._producer.create_alarm_response(
 123                     'create_alarm_resonse', resp_message,
 124                     'alarm_response')
 125             except Exception as exc:
 126                 log.warn("Response creation failed: %s", exc)
 127
 128         elif message.key == "list_alarm_request":
 129             # Check for a specifed: alarm_name, resource_uuid, severity
 130             # and generate the appropriate list
 131             list_details = values['alarm_list_request']
 132
 133             alarm_list = self.list_alarms(
 134                 self.endpoint, self.auth_token, list_details)
 135
 136             try:
 137                 # Generate and send a list response back
 138                 resp_message = self._response.generate_response(
 139                     'list_alarm_response', alarm_list=alarm_list,
 140                     cor_id=list_details['correlation_id'])
 141                 log.info("Response Message: %s", resp_message)
 142                 self._producer.list_alarm_response(
 143                     'list_alarm_response', resp_message,
 144                     'alarm_response')
 145             except Exception as exc:
 146                 log.warn("Failed to send a valid response back.")
 147
 148         elif message.key == "delete_alarm_request":
 149             request_details = values['alarm_delete_request']
 150             alarm_id = request_details['alarm_uuid']
 151
 152             resp_status = self.delete_alarm(
 153                 self.endpoint, self.auth_token, alarm_id)
 154
 155             # Generate and send a response message
 156             try:
 157                 resp_message = self._response.generate_response(
 158                     'delete_alarm_response', alarm_id=alarm_id,
 159                     status=resp_status,
 160                     cor_id=request_details['correlation_id'])
 161                 log.info("Response message: %s", resp_message)
 162                 self._producer.delete_alarm_response(
 163                     'delete_alarm_response', resp_message,
 164                     'alarm_response')
 165             except Exception as exc:
 166                 log.warn("Failed to create delete reponse:%s", exc)
 167
 168         elif message.key == "acknowledge_alarm":
 169             # Acknowledge that an alarm has been dealt with by the SO
 170             alarm_id = values['ack_details']['alarm_uuid']
 171
 172             response = self.update_alarm_state(
 173                 self.endpoint, self.auth_token, alarm_id)
 174
 175             # Log if an alarm was reset
 176             if response is True:
 177                 log.info("Acknowledged the alarm and cleared it.")
 178             else:
 179                 log.warn("Failed to acknowledge/clear the alarm.")
 180
 181         elif message.key == "update_alarm_request":
 182             # Update alarm configurations
 183             alarm_details = values['alarm_update_request']
 184
 185             alarm_id, status = self.update_alarm(
 186                 self.endpoint, self.auth_token, alarm_details)
 187
 188             # Generate a response for an update request
 189             try:
 190                 resp_message = self._response.generate_response(
 191                     'update_alarm_response', alarm_id=alarm_id,
 192                     cor_id=alarm_details['correlation_id'],
 193                     status=status)
 194                 log.info("Response message: %s", resp_message)
 195                 self._producer.update_alarm_response(
 196                     'update_alarm_response', resp_message,
 197                     'alarm_response')
 198             except Exception as exc:
 199                 log.warn("Failed to send an update response:%s", exc)
 200
 201         else:
 202             log.debug("Unknown key, no action will be performed")
 203
 204         return
 205
 206     def configure_alarm(self, endpoint, auth_token, values):
 207         """Create requested alarm in Aodh."""
 208         url = "{}/v2/alarms/".format(endpoint)
 209
 210         # Check if the desired alarm is supported
 211         alarm_name = values['alarm_name'].lower()
 212         metric_name = values['metric_name'].lower()
 213         resource_id = values['resource_uuid']
 214
 215         if alarm_name not in ALARM_NAMES.keys():
 216             log.warn("This alarm is not supported, by a valid metric.")
 217             return None, False
 218         if ALARM_NAMES[alarm_name] != metric_name:
 219             log.warn("This is not the correct metric for this alarm.")
 220             return None, False
 221
 222         # Check for the required metric
 223         metric_id = self.check_for_metric(auth_token, metric_name, resource_id)
 224
 225         try:
 226             if metric_id is not None:
 227                 # Create the alarm if metric is available
 228                 payload = self.check_payload(values, metric_name, resource_id,
 229                                              alarm_name)
 230                 new_alarm = self.common._perform_request(
 231                     url, auth_token, req_type="post", payload=payload)
 232                 return json.loads(new_alarm.text)['alarm_id'], True
 233             else:
 234                 log.warn("The required Gnocchi metric does not exist.")
 235                 return None, False
 236
 237         except Exception as exc:
 238             log.warn("Failed to create the alarm: %s", exc)
 239         return None, False
 240
 241     def delete_alarm(self, endpoint, auth_token, alarm_id):
 242         """Delete alarm function."""
 243         url = "{}/v2/alarms/%s".format(endpoint) % (alarm_id)
 244
 245         try:
 246             result = self.common._perform_request(
 247                 url, auth_token, req_type="delete")
 248             if str(result.status_code) == "404":
 249                 log.info("Alarm doesn't exist: %s", result.status_code)
 250                 # If status code is 404 alarm did not exist
 251                 return False
 252             else:
 253                 return True
 254
 255         except Exception as exc:
 256             log.warn("Failed to delete alarm: %s because %s.", alarm_id, exc)
 257         return False
 258
 259     def list_alarms(self, endpoint, auth_token, list_details):
 260         """Generate the requested list of alarms."""
 261         url = "{}/v2/alarms/".format(endpoint)
 262         a_list, name_list, sev_list, res_list = [], [], [], []
 263
 264         # TODO(mcgoughh): for now resource_id is a mandatory field
 265         # Check for a reqource is
 266         try:
 267             resource = list_details['resource_uuid']
 268         except KeyError as exc:
 269             log.warn("Resource id not specified for list request: %s", exc)
 270             return None
 271
 272         # Checking what fields are specified for a list request
 273         try:
 274             name = list_details['alarm_name'].lower()
 275             if name not in ALARM_NAMES.keys():
 276                 log.warn("This alarm is not supported, won't be used!")
 277                 name = None
 278         except KeyError as exc:
 279             log.info("Alarm name isn't specified.")
 280             name = None
 281
 282         try:
 283             severity = list_details['severity'].lower()
 284             sev = SEVERITIES[severity]
 285         except KeyError as exc:
 286             log.info("Severity is unspecified/incorrectly configured")
 287             sev = None
 288
 289         # Perform the request to get the desired list
 290         try:
 291             result = self.common._perform_request(
 292                 url, auth_token, req_type="get")
 293
 294             if result is not None:
 295                 # Get list based on resource id
 296                 for alarm in json.loads(result.text):
 297                     rule = alarm['gnocchi_resources_threshold_rule']
 298                     if resource == rule['resource_id']:
 299                         res_list.append(str(alarm))
 300                     if not res_list:
 301                         log.info("No alarms for this resource")
 302                         return a_list
 303
 304                 # Generate specified listed if requested
 305                 if name is not None and sev is not None:
 306                     log.info("Return a list of %s alarms with %s severity.",
 307                              name, sev)
 308                     for alarm in json.loads(result.text):
 309                         if name == alarm['name']:
 310                             name_list.append(str(alarm))
 311                     for alarm in json.loads(result.text):
 312                         if sev == alarm['severity']:
 313                             sev_list.append(str(alarm))
 314                     name_sev_list = list(set(name_list).intersection(sev_list))
 315                     a_list = list(set(name_sev_list).intersection(res_list))
 316                 elif name is not None:
 317                     log.info("Returning a %s list of alarms.", name)
 318                     for alarm in json.loads(result.text):
 319                         if name == alarm['name']:
 320                             name_list.append(str(alarm))
 321                     a_list = list(set(name_list).intersection(res_list))
 322                 elif sev is not None:
 323                     log.info("Returning %s severity alarm list.", sev)
 324                     for alarm in json.loads(result.text):
 325                         if sev == alarm['severity']:
 326                             sev_list.append(str(alarm))
 327                     a_list = list(set(sev_list).intersection(res_list))
 328                 else:
 329                     log.info("Returning an entire list of alarms.")
 330                     a_list = res_list
 331             else:
 332                 log.info("There are no alarms!")
 333
 334         except Exception as exc:
 335             log.info("Failed to generate required list: %s", exc)
 336             return None
 337
 338         return a_list
 339
 340     def update_alarm_state(self, endpoint, auth_token, alarm_id):
 341         """Set the state of an alarm to ok when ack message is received."""
 342         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 343         payload = json.dumps("ok")
 344
 345         try:
 346             self.common._perform_request(
 347                 url, auth_token, req_type="put", payload=payload)
 348             return True
 349         except Exception as exc:
 350             log.warn("Unable to update alarm state: %s", exc)
 351         return False
 352
 353     def update_alarm(self, endpoint, auth_token, values):
 354         """Get alarm name for an alarm configuration update."""
 355         # Get already existing alarm details
 356         url = "{}/v2/alarms/%s".format(endpoint) % values['alarm_uuid']
 357
 358         # Gets current configurations about the alarm
 359         try:
 360             result = self.common._perform_request(
 361                 url, auth_token, req_type="get")
 362             alarm_name = json.loads(result.text)['name']
 363             rule = json.loads(result.text)['gnocchi_resources_threshold_rule']
 364             alarm_state = json.loads(result.text)['state']
 365             resource_id = rule['resource_id']
 366             metric_name = rule['metric']
 367         except Exception as exc:
 368             log.warn("Failed to retreive existing alarm info: %s.\
 369                      Can only update OSM alarms.", exc)
 370             return None, False
 371
 372         # Generates and check payload configuration for alarm update
 373         payload = self.check_payload(values, metric_name, resource_id,
 374                                      alarm_name, alarm_state=alarm_state)
 375
 376         # Updates the alarm configurations with the valid payload
 377         if payload is not None:
 378             try:
 379                 update_alarm = self.common._perform_request(
 380                     url, auth_token, req_type="put", payload=payload)
 381
 382                 return json.loads(update_alarm.text)['alarm_id'], True
 383             except Exception as exc:
 384                 log.warn("Alarm update could not be performed: %s", exc)
 385                 return None, False
 386         return None, False
 387
 388     def check_payload(self, values, metric_name, resource_id,
 389                       alarm_name, alarm_state=None):
 390         """Check that the payload is configuration for update/create alarm."""
 391         try:
 392             # Check state and severity
 393             severity = values['severity'].lower()
 394             if severity == "indeterminate":
 395                 alarm_state = "insufficient data"
 396             if alarm_state is None:
 397                 alarm_state = "ok"
 398
 399             statistic = values['statistic'].lower()
 400             # Try to configure the payload for the update/create request
 401             # Can only update: threshold, operation, statistic and
 402             # the severity of the alarm
 403             rule = {'threshold': values['threshold_value'],
 404                     'comparison_operator': values['operation'].lower(),
 405                     'metric': metric_name,
 406                     'resource_id': resource_id,
 407                     'resource_type': 'generic',
 408                     'aggregation_method': STATISTICS[statistic]}
 409             payload = json.dumps({'state': alarm_state,
 410                                   'name': alarm_name,
 411                                   'severity': SEVERITIES[severity],
 412                                   'type': 'gnocchi_resources_threshold',
 413                                   'gnocchi_resources_threshold_rule': rule, })
 414             return payload
 415         except KeyError as exc:
 416             log.warn("Alarm is not configured correctly: %s", exc)
 417         return None
 418
 419     def get_alarm_state(self, endpoint, auth_token, alarm_id):
 420         """Get the state of the alarm."""
 421         url = "{}/v2/alarms/%s/state".format(endpoint) % alarm_id
 422
 423         try:
 424             alarm_state = self.common._perform_request(
 425                 url, auth_token, req_type="get")
 426             return json.loads(alarm_state.text)
 427         except Exception as exc:
 428             log.warn("Failed to get the state of the alarm:%s", exc)
 429         return None
 430
 431     def check_for_metric(self, auth_token, m_name, r_id):
 432         """Check for the alarm metric."""
 433         try:
 434             endpoint = self.common.get_endpoint("metric")
 435
 436             url = "{}/v1/metric/".format(endpoint)
 437             metric_list = self.common._perform_request(
 438                 url, auth_token, req_type="get")
 439
 440             for metric in json.loads(metric_list.text):
 441                 name = metric['name']
 442                 resource = metric['resource_id']
 443                 if (name == m_name and resource == r_id):
 444                     metric_id = metric['id']
 445             log.info("The required metric exists, an alarm will be created.")
 446             return metric_id
 447         except Exception as exc:
 448             log.info("Desired Gnocchi metric not found:%s", exc)
 449         return None