# -*- coding: utf-8 -*-

# Copyright 2018 Whitestack, LLC
# *************************************************************

# This file is part of OSM Policy module
# All Rights Reserved to Whitestack, LLC

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at

#         http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# For those usages not covered by the Apache License, Version 2.0 please
# contact: bdiaz@whitestack.com or glavado@whitestack.com
##
import asyncio
import datetime
import json
import logging
from json import JSONDecodeError

import yaml
from aiokafka import AIOKafkaConsumer

from osm_policy_module.common.common_db_client import CommonDbClient
from osm_policy_module.common.lcm_client import LcmClient
from osm_policy_module.common.mon_client import MonClient
from osm_policy_module.core import database
from osm_policy_module.core.config import Config
from osm_policy_module.core.database import ScalingGroup, ScalingAlarm, ScalingPolicy, ScalingCriteria, DatabaseManager
from osm_policy_module.core.exceptions import VdurNotFound
from osm_policy_module.utils.vnfd import VnfdUtils

log = logging.getLogger(__name__)

ALLOWED_KAFKA_KEYS = ['instantiated', 'scaled', 'terminated', 'notify_alarm']


class PolicyModuleAgent:
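    """Kafka-driven agent that configures and enforces VNF auto-scaling policies.

    It consumes NS lifecycle events and alarm notifications from the "ns" and
    "alarm_response" Kafka topics, mirrors scaling groups, policies, criteria
    and alarms in the local database, and requests scaling actions from LCM
    when an alarm notification arrives outside the policy cooldown window.
    """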
    def __init__(self, loop=None):
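        """Set up the event loop, the common DB, MON and LCM clients,
        the Kafka bootstrap server address and the database manager."""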
        cfg = Config.instance()
        if not loop:
            loop = asyncio.get_event_loop()
        self.loop = loop
        self.db_client = CommonDbClient()
        self.mon_client = MonClient(loop=self.loop)
        self.lcm_client = LcmClient(loop=self.loop)
        self.kafka_server = '{}:{}'.format(cfg.OSMPOL_MESSAGE_HOST,
                                           cfg.OSMPOL_MESSAGE_PORT)
        self.database_manager = DatabaseManager()

    def run(self):
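        """Run the agent by blocking the event loop on start()."""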
        self.loop.run_until_complete(self.start())

    async def start(self):
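        """Consume the "ns" and "alarm_response" Kafka topics and dispatch
        every received message to _process_msg."""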
        consumer = AIOKafkaConsumer(
            "ns",
            "alarm_response",
            loop=self.loop,
            bootstrap_servers=self.kafka_server,
            group_id="pol-consumer",
            key_deserializer=bytes.decode,
            value_deserializer=bytes.decode,
        )
        await consumer.start()
        try:
            async for msg in consumer:
                log.info("Message arrived: %s", msg)
                await self._process_msg(msg.topic, msg.key, msg.value)
        finally:
            await consumer.stop()

    async def _process_msg(self, topic, key, msg):
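        """Decode a Kafka message as JSON (falling back to YAML) and route it by key.

        Only keys listed in ALLOWED_KAFKA_KEYS are handled; exceptions are
        logged so that a single bad message does not stop the consumer loop.
        """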
        log.debug("_process_msg topic=%s key=%s msg=%s", topic, key, msg)
        try:
            if key in ALLOWED_KAFKA_KEYS:
                try:
                    content = json.loads(msg)
                except JSONDecodeError:
                    content = yaml.safe_load(msg)

                if key == 'instantiated' or key == 'scaled':
                    await self._handle_instantiated_or_scaled(content)
                elif key == 'terminated':
                    await self._handle_terminated(content)
                elif key == 'notify_alarm':
                    await self._handle_alarm_notification(content)
            else:
                log.debug("Key %s is not in ALLOWED_KAFKA_KEYS", key)
        except Exception:
            log.exception("Error consuming message: ")

    async def _handle_alarm_notification(self, content):
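        """Handle a MON alarm notification.

        Looks up the alarm in the local database, skips the action if the
        scaling policy is still within its cooldown time, and otherwise asks
        LCM to scale and updates the policy's last_scale timestamp.
        """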
        log.debug("_handle_alarm_notification: %s", content)
        alarm_uuid = content['notify_details']['alarm_uuid']
        metric_name = content['notify_details']['metric_name']
        operation = content['notify_details']['operation']
        threshold = content['notify_details']['threshold_value']
        vdu_name = content['notify_details']['vdu_name']
        vnf_member_index = content['notify_details']['vnf_member_index']
        nsr_id = content['notify_details']['ns_id']
        log.info(
            "Received alarm notification for alarm %s, metric %s, operation %s, "
            "threshold %s, vdu_name %s, vnf_member_index %s, ns_id %s",
            alarm_uuid, metric_name, operation, threshold, vdu_name, vnf_member_index, nsr_id)
        try:
            alarm = self.database_manager.get_alarm(alarm_uuid)
            delta = datetime.datetime.now() - alarm.scaling_criteria.scaling_policy.last_scale
            log.debug("last_scale: %s", alarm.scaling_criteria.scaling_policy.last_scale)
            log.debug("now: %s", datetime.datetime.now())
            log.debug("delta: %s", delta)
            if delta.total_seconds() < alarm.scaling_criteria.scaling_policy.cooldown_time:
                log.info("Time between last scale and now is less than cooldown time. Skipping.")
                return
            log.info("Sending scaling action message for ns: %s", nsr_id)
            await self.lcm_client.scale(nsr_id,
                                        alarm.scaling_criteria.scaling_policy.scaling_group.name,
                                        alarm.vnf_member_index,
                                        alarm.action)
            alarm.scaling_criteria.scaling_policy.last_scale = datetime.datetime.now()
            alarm.scaling_criteria.scaling_policy.save()
        except ScalingAlarm.DoesNotExist:
            log.info("There is no action configured for alarm %s.", alarm_uuid)

    async def _handle_instantiated_or_scaled(self, content):
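        """Reconfigure scaling groups after a completed instantiation or scaling operation.

        Also triggers the cleanup of alarms whose VDU records no longer exist.
        """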
        log.debug("_handle_instantiated_or_scaled: %s", content)
        nslcmop_id = content['nslcmop_id']
        nslcmop = self.db_client.get_nslcmop(nslcmop_id)
        if nslcmop['operationState'] == 'COMPLETED' or nslcmop['operationState'] == 'PARTIALLY_COMPLETED':
            nsr_id = nslcmop['nsInstanceId']
            log.info("Configuring scaling groups for network service with nsr_id: %s", nsr_id)
            await self._configure_scaling_groups(nsr_id)
            log.info("Checking for orphaned alarms to be deleted for network service with nsr_id: %s", nsr_id)
            await self._delete_orphaned_alarms(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                nslcmop['operationState'])

    async def _handle_terminated(self, content):
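        """Delete the scaling groups and alarms of a network service once
        its termination operation has completed."""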
        log.debug("_handle_terminated: %s", content)
        nsr_id = content['nsr_id']
        if content['operationState'] == 'COMPLETED' or content['operationState'] == 'PARTIALLY_COMPLETED':
            log.info("Deleting scaling groups and alarms for network service with nsr_id: %s", nsr_id)
            await self._delete_scaling_groups(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                content['operationState'])

    async def _configure_scaling_groups(self, nsr_id: str):
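        """Create the scaling group, policy, criteria and alarm records for a network service.

        Walks every VNFR of the NS, reads the scaling-group-descriptor of its
        VNFD and, for each automatic scaling policy and criteria, creates a
        scale-in and a scale-out alarm in MON for every matching VDU record.
        Runs in a database transaction; on error the transaction is rolled
        back and any alarms already created in MON are deleted again.
        """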
        log.debug("_configure_scaling_groups: %s", nsr_id)
        alarms_created = []
        with database.db.atomic() as tx:
            try:
                vnfrs = self.db_client.get_vnfrs(nsr_id)
                for vnfr in vnfrs:
                    log.info("Processing vnfr: %s", vnfr)
                    vnfd = self.db_client.get_vnfd(vnfr['vnfd-id'])
                    log.info("Looking for vnfd %s", vnfr['vnfd-id'])
                    if 'scaling-group-descriptor' not in vnfd:
                        continue
                    scaling_groups = vnfd['scaling-group-descriptor']
                    vnf_monitoring_params = vnfd['monitoring-param']
                    for scaling_group in scaling_groups:
                        try:
                            scaling_group_record = ScalingGroup.select().where(
                                ScalingGroup.nsr_id == nsr_id,
                                ScalingGroup.vnf_member_index == int(vnfr['member-vnf-index-ref']),
                                ScalingGroup.name == scaling_group['name']
                            ).get()
                            log.info("Found existing scaling group record in DB...")
                        except ScalingGroup.DoesNotExist:
                            log.info("Creating scaling group record in DB...")
                            scaling_group_record = ScalingGroup.create(
                                nsr_id=nsr_id,
                                vnf_member_index=vnfr['member-vnf-index-ref'],
                                name=scaling_group['name'],
                                content=json.dumps(scaling_group)
                            )
                            log.info(
                                "Created scaling group record in DB : nsr_id=%s, vnf_member_index=%s, name=%s",
                                scaling_group_record.nsr_id,
                                scaling_group_record.vnf_member_index,
                                scaling_group_record.name)
                        for scaling_policy in scaling_group['scaling-policy']:
                            if scaling_policy['scaling-type'] != 'automatic':
                                continue
                            try:
                                scaling_policy_record = ScalingPolicy.select().join(ScalingGroup).where(
                                    ScalingPolicy.name == scaling_policy['name'],
                                    ScalingGroup.id == scaling_group_record.id
                                ).get()
                                log.info("Found existing scaling policy record in DB...")
                            except ScalingPolicy.DoesNotExist:
                                log.info("Creating scaling policy record in DB...")
                                scaling_policy_record = ScalingPolicy.create(
                                    nsr_id=nsr_id,
                                    name=scaling_policy['name'],
                                    cooldown_time=scaling_policy['cooldown-time'],
                                    scaling_group=scaling_group_record
                                )
                                log.info("Created scaling policy record in DB : name=%s, scaling_group.name=%s",
                                         scaling_policy_record.name,
                                         scaling_policy_record.scaling_group.name)

                            for scaling_criteria in scaling_policy['scaling-criteria']:
                                try:
                                    scaling_criteria_record = ScalingCriteria.select().join(ScalingPolicy).where(
                                        ScalingPolicy.id == scaling_policy_record.id,
                                        ScalingCriteria.name == scaling_criteria['name']
                                    ).get()
                                    log.info("Found existing scaling criteria record in DB...")
                                except ScalingCriteria.DoesNotExist:
                                    log.info("Creating scaling criteria record in DB...")
                                    scaling_criteria_record = ScalingCriteria.create(
                                        nsr_id=nsr_id,
                                        name=scaling_criteria['name'],
                                        scaling_policy=scaling_policy_record
                                    )
                                    log.info(
                                        "Created scaling criteria record in DB : name=%s, scaling_policy.name=%s",
                                        scaling_criteria_record.name,
                                        scaling_criteria_record.scaling_policy.name)

                                # Resolve the monitoring param referenced by the criteria and the
                                # VDU records it applies to.
                                vnf_monitoring_param = next(
                                    filter(
                                        lambda param: param['id'] == scaling_criteria['vnf-monitoring-param-ref'],
                                        vnf_monitoring_params)
                                )
                                if 'vdu-monitoring-param' in vnf_monitoring_param:
                                    vdurs = list(
                                        filter(
                                            lambda vdur: vdur['vdu-id-ref'] ==
                                            vnf_monitoring_param['vdu-monitoring-param']['vdu-ref'],
                                            vnfr['vdur']
                                        )
                                    )
                                elif 'vdu-metric' in vnf_monitoring_param:
                                    vdurs = list(
                                        filter(
                                            lambda vdur: vdur['vdu-id-ref'] ==
                                            vnf_monitoring_param['vdu-metric']['vdu-ref'],
                                            vnfr['vdur']
                                        )
                                    )
                                elif 'vnf-metric' in vnf_monitoring_param:
                                    vdu = VnfdUtils.get_mgmt_vdu(vnfd)
                                    vdurs = list(
                                        filter(
                                            lambda vdur: vdur['vdu-id-ref'] == vdu['id'],
                                            vnfr['vdur']
                                        )
                                    )
                                else:
                                    log.warning(
                                        "Scaling criteria is referring to a vnf-monitoring-param that does not "
                                        "contain a reference to a vdu or vnf metric.")
                                    continue
                                for vdur in vdurs:
                                    log.info("Creating alarm for vdur %s ", vdur)
                                    try:
                                        (ScalingAlarm.select()
                                         .join(ScalingCriteria)
                                         .join(ScalingPolicy)
                                         .join(ScalingGroup)
                                         .where(
                                             ScalingAlarm.vdu_name == vdur['name'],
                                             ScalingCriteria.name == scaling_criteria['name'],
                                             ScalingPolicy.name == scaling_policy['name'],
                                             ScalingGroup.nsr_id == nsr_id
                                         ).get())
                                        log.debug("vdu %s already has an alarm configured", vdur['name'])
                                        continue
                                    except ScalingAlarm.DoesNotExist:
                                        pass
                                    # Scale-in alarm in MON plus its local record
                                    alarm_uuid = await self.mon_client.create_alarm(
                                        metric_name=vnf_monitoring_param['id'],
                                        ns_id=nsr_id,
                                        vdu_name=vdur['name'],
                                        vnf_member_index=vnfr['member-vnf-index-ref'],
                                        threshold=scaling_criteria['scale-in-threshold'],
                                        operation=scaling_criteria['scale-in-relational-operation'],
                                        statistic=vnf_monitoring_param['aggregation-type']
                                    )
                                    alarm = ScalingAlarm.create(
                                        alarm_uuid=alarm_uuid,
                                        action='scale_in',
                                        vnf_member_index=int(vnfr['member-vnf-index-ref']),
                                        vdu_name=vdur['name'],
                                        scaling_criteria=scaling_criteria_record
                                    )
                                    alarms_created.append(alarm)
                                    # Scale-out alarm in MON plus its local record
                                    alarm_uuid = await self.mon_client.create_alarm(
                                        metric_name=vnf_monitoring_param['id'],
                                        ns_id=nsr_id,
                                        vdu_name=vdur['name'],
                                        vnf_member_index=vnfr['member-vnf-index-ref'],
                                        threshold=scaling_criteria['scale-out-threshold'],
                                        operation=scaling_criteria['scale-out-relational-operation'],
                                        statistic=vnf_monitoring_param['aggregation-type']
                                    )
                                    alarm = ScalingAlarm.create(
                                        alarm_uuid=alarm_uuid,
                                        action='scale_out',
                                        vnf_member_index=int(vnfr['member-vnf-index-ref']),
                                        vdu_name=vdur['name'],
                                        scaling_criteria=scaling_criteria_record
                                    )
                                    alarms_created.append(alarm)

            except Exception as e:
                log.exception("Error configuring scaling groups:")
                tx.rollback()
                if len(alarms_created) > 0:
                    log.info("Cleaning alarm resources in MON")
                    for alarm in alarms_created:
                        await self.mon_client.delete_alarm(alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                                           alarm.vnf_member_index,
                                                           alarm.vdu_name,
                                                           alarm.alarm_uuid)
                raise e

    async def _delete_scaling_groups(self, nsr_id: str):
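        """Delete all scaling group, policy, criteria and alarm records of a
        network service, removing the corresponding alarms from MON as well."""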
        with database.db.atomic() as tx:
            try:
                for scaling_group in ScalingGroup.select().where(ScalingGroup.nsr_id == nsr_id):
                    for scaling_policy in scaling_group.scaling_policies:
                        for scaling_criteria in scaling_policy.scaling_criterias:
                            for alarm in scaling_criteria.scaling_alarms:
                                try:
                                    await self.mon_client.delete_alarm(
                                        alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                        alarm.vnf_member_index,
                                        alarm.vdu_name,
                                        alarm.alarm_uuid)
                                except ValueError:
                                    log.exception("Error deleting alarm in MON %s", alarm.alarm_uuid)
                                alarm.delete_instance()
                            scaling_criteria.delete_instance()
                        scaling_policy.delete_instance()
                    scaling_group.delete_instance()

            except Exception as e:
                log.exception("Error deleting scaling groups and alarms:")
                tx.rollback()
                raise e

    async def _delete_orphaned_alarms(self, nsr_id):
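        """Delete alarms that refer to VDU records which no longer exist in
        the VNFR, both from MON and from the local database."""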
        with database.db.atomic() as tx:
            try:
                for scaling_group in ScalingGroup.select().where(ScalingGroup.nsr_id == nsr_id):
                    for scaling_policy in scaling_group.scaling_policies:
                        for scaling_criteria in scaling_policy.scaling_criterias:
                            for alarm in scaling_criteria.scaling_alarms:
                                try:
                                    self.db_client.get_vdur(nsr_id, alarm.vnf_member_index, alarm.vdu_name)
                                except VdurNotFound:
                                    log.info("Deleting orphaned alarm %s", alarm.alarm_uuid)
                                    try:
                                        await self.mon_client.delete_alarm(
                                            alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                            alarm.vnf_member_index,
                                            alarm.vdu_name,
                                            alarm.alarm_uuid)
                                    except ValueError:
                                        log.exception("Error deleting alarm in MON %s", alarm.alarm_uuid)
                                    alarm.delete_instance()

            except Exception as e:
                log.exception("Error deleting orphaned alarms:")
                tx.rollback()
                raise e