ab8f857c9e38c32d723a5181d640c065b8d2189d
[osm/POL.git] / osm_policy_module / core / agent.py
1 # -*- coding: utf-8 -*-
2
3 # Copyright 2018 Whitestack, LLC
4 # *************************************************************
5
6 # This file is part of OSM Monitoring module
7 # All Rights Reserved to Whitestack, LLC
8
9 # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 # not use this file except in compliance with the License. You may obtain
11 # a copy of the License at
12
13 # http://www.apache.org/licenses/LICENSE-2.0
14
15 # Unless required by applicable law or agreed to in writing, software
16 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
17 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
18 # License for the specific language governing permissions and limitations
19 # under the License.
20
21 # For those usages not covered by the Apache License, Version 2.0 please
22 # contact: bdiaz@whitestack.com or glavado@whitestack.com
23 ##
24 import asyncio
25 import datetime
26 import json
27 import logging
28 from json import JSONDecodeError
29
30 import yaml
31 from aiokafka import AIOKafkaConsumer
32
33 from osm_policy_module.common.common_db_client import CommonDbClient
34 from osm_policy_module.common.lcm_client import LcmClient
35 from osm_policy_module.common.mon_client import MonClient
36 from osm_policy_module.core import database
37 from osm_policy_module.core.config import Config
38 from osm_policy_module.core.database import ScalingGroup, ScalingAlarm, ScalingPolicy, ScalingCriteria, DatabaseManager
39
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Kafka message keys the agent reacts to; messages carrying any other key
# are ignored by PolicyModuleAgent._process_msg.
ALLOWED_KAFKA_KEYS = [
    'instantiated',
    'scaled',
    'terminated',
    'notify_alarm',
]
43
44
class PolicyModuleAgent:
    """Kafka-driven agent that maintains scaling groups, policies and alarms.

    The agent consumes the ``ns`` and ``alarm_response`` Kafka topics and
    reacts to the message keys listed in ``ALLOWED_KAFKA_KEYS``:

    * ``instantiated`` / ``scaled``: (re)configure scaling groups, policies,
      criteria and MON alarms for the network service.
    * ``terminated``: delete the scaling records and their MON alarms.
    * ``notify_alarm``: request a scaling action from LCM, honouring the
      cooldown time of the scaling policy.
    """

    # Operation states for which a network service operation is acted upon.
    _FINISHED_STATES = ('COMPLETED', 'PARTIALLY_COMPLETED')

    def __init__(self, loop=None):
        """Create the agent and its DB, MON and LCM clients.

        :param loop: asyncio event loop to use; when omitted the current
            event loop is used.
        """
        cfg = Config.instance()
        if loop is None:
            loop = asyncio.get_event_loop()
        self.loop = loop
        self.db_client = CommonDbClient()
        self.mon_client = MonClient(loop=self.loop)
        self.lcm_client = LcmClient(loop=self.loop)
        self.kafka_server = '{}:{}'.format(cfg.OSMPOL_MESSAGE_HOST,
                                           cfg.OSMPOL_MESSAGE_PORT)
        self.database_manager = DatabaseManager()

    def run(self):
        """Run the consumer loop on ``self.loop`` until it finishes."""
        self.loop.run_until_complete(self.start())

    async def start(self):
        """Consume the ``ns`` and ``alarm_response`` topics and process each message."""
        consumer = AIOKafkaConsumer(
            "ns",
            "alarm_response",
            loop=self.loop,
            bootstrap_servers=self.kafka_server,
            group_id="pol-consumer",
            key_deserializer=bytes.decode,
            value_deserializer=bytes.decode,
        )
        await consumer.start()
        try:
            async for msg in consumer:
                log.info("Message arrived: %s", msg)
                await self._process_msg(msg.topic, msg.key, msg.value)
        finally:
            # Always stop the consumer, even if iteration fails.
            await consumer.stop()

    async def _process_msg(self, topic, key, msg):
        """Decode one Kafka message and dispatch it according to its key.

        The payload is parsed as JSON first and as YAML if JSON decoding
        fails. Any exception raised while decoding or handling the message is
        logged and swallowed so that one bad message does not stop the
        consumer loop.

        :param topic: Kafka topic the message arrived on (only logged).
        :param key: message key; keys outside ``ALLOWED_KAFKA_KEYS`` are ignored.
        :param msg: raw message payload (JSON or YAML text).
        """
        log.debug("_process_msg topic=%s key=%s msg=%s", topic, key, msg)
        if key not in ALLOWED_KAFKA_KEYS:
            log.debug("Key %s is not in ALLOWED_KAFKA_KEYS", key)
            return
        try:
            try:
                content = json.loads(msg)
            except JSONDecodeError:
                content = yaml.safe_load(msg)

            if key in ('instantiated', 'scaled'):
                await self._handle_instantiated_or_scaled(content)
            elif key == 'terminated':
                await self._handle_terminated(content)
            elif key == 'notify_alarm':
                await self._handle_alarm_notification(content)
        except Exception:
            # Top-level boundary of message handling: log and keep consuming.
            log.exception("Error consuming message: ")

    async def _handle_alarm_notification(self, content):
        """Trigger a scaling action in LCM for a fired alarm.

        Nothing is done when no alarm record exists for the notified uuid or
        when the time elapsed since the policy's last scaling action is below
        its cooldown time. After a scaling request the policy's ``last_scale``
        is updated and saved.

        :param content: decoded ``notify_alarm`` message; must contain a
            ``notify_details`` mapping.
        :raises KeyError: if an expected field is missing in ``notify_details``.
        """
        log.debug("_handle_alarm_notification: %s", content)
        details = content['notify_details']
        alarm_uuid = details['alarm_uuid']
        metric_name = details['metric_name']
        operation = details['operation']
        threshold = details['threshold_value']
        vdu_name = details['vdu_name']
        vnf_member_index = details['vnf_member_index']
        nsr_id = details['ns_id']
        # Implicit string concatenation keeps the source indentation out of
        # the emitted log message.
        log.info(
            "Received alarm notification for alarm %s, metric %s, operation %s, "
            "threshold %s, vdu_name %s, vnf_member_index %s, ns_id %s",
            alarm_uuid, metric_name, operation, threshold, vdu_name, vnf_member_index, nsr_id)
        try:
            alarm = self.database_manager.get_alarm(alarm_uuid)
        except ScalingAlarm.DoesNotExist:
            log.info("There is no action configured for alarm %s.", alarm_uuid)
            return

        policy = alarm.scaling_criteria.scaling_policy
        now = datetime.datetime.now()
        delta = now - policy.last_scale
        log.debug("last_scale: %s", policy.last_scale)
        log.debug("now: %s", now)
        log.debug("delta: %s", delta)
        if delta.total_seconds() < policy.cooldown_time:
            log.info("Time between last scale and now is less than cooldown time. Skipping.")
            return
        log.info("Sending scaling action message for ns: %s", nsr_id)
        await self.lcm_client.scale(nsr_id,
                                    policy.scaling_group.name,
                                    alarm.vnf_member_index,
                                    alarm.action)
        # Timestamp taken after the scale request returns, so the cooldown
        # starts counting from the end of the request.
        policy.last_scale = datetime.datetime.now()
        policy.save()

    async def _handle_instantiated_or_scaled(self, content):
        """Configure scaling groups once an instantiate/scale operation has finished.

        :param content: decoded message; must contain ``nslcmop_id``.
        """
        log.debug("_handle_instantiated_or_scaled: %s", content)
        nslcmop_id = content['nslcmop_id']
        nslcmop = self.db_client.get_nslcmop(nslcmop_id)
        operation_state = nslcmop['operationState']
        if operation_state in self._FINISHED_STATES:
            nsr_id = nslcmop['nsInstanceId']
            log.info("Configuring scaling groups for network service with nsr_id: %s", nsr_id)
            await self._configure_scaling_groups(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                operation_state)

    async def _handle_terminated(self, content):
        """Delete scaling groups and alarms once a terminate operation has finished.

        :param content: decoded message; must contain ``nsr_id`` and
            ``operationState``.
        """
        log.debug("_handle_terminated: %s", content)
        nsr_id = content['nsr_id']
        operation_state = content['operationState']
        if operation_state in self._FINISHED_STATES:
            log.info("Deleting scaling groups and alarms for network service with nsr_id: %s", nsr_id)
            await self._delete_scaling_groups(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                operation_state)

    async def _configure_scaling_groups(self, nsr_id: str):
        """Create the scaling records and MON alarms for a network service.

        For every VNF record of the network service whose descriptor has a
        ``scaling-group-descriptor``, the scaling group, its automatic scaling
        policies and their criteria are looked up or created in the database,
        and a scale-in and a scale-out alarm are created in MON for every
        matching VDU that has no alarm yet.

        All database work happens in one transaction. On any error the
        transaction is rolled back, the alarms already created in MON are
        deleted on a best-effort basis, and the original exception is
        re-raised.

        :param nsr_id: id of the network service record.
        """
        log.debug("_configure_scaling_groups: %s", nsr_id)
        # (vnf_member_index, vdu_name, alarm_uuid) of every alarm created in
        # MON during this call; used to clean MON up if anything fails.
        mon_alarms_created = []
        with database.db.atomic() as tx:
            try:
                vnfrs = self.db_client.get_vnfrs(nsr_id)
                for vnfr in vnfrs:
                    log.info("Processing vnfr: %s", vnfr)
                    vnfd = self.db_client.get_vnfd(vnfr['vnfd-id'])
                    log.info("Looking for vnfd %s", vnfr['vnfd-id'])
                    if 'scaling-group-descriptor' not in vnfd:
                        continue
                    # A descriptor without monitoring params is only an error
                    # if some scaling criteria actually references one.
                    vnf_monitoring_params = vnfd.get('monitoring-param', [])
                    for scaling_group in vnfd['scaling-group-descriptor']:
                        scaling_group_record = self._get_or_create_scaling_group(
                            nsr_id, vnfr, scaling_group)
                        for scaling_policy in scaling_group['scaling-policy']:
                            if scaling_policy['scaling-type'] != 'automatic':
                                continue
                            scaling_policy_record = self._get_or_create_scaling_policy(
                                nsr_id, scaling_policy, scaling_group_record)
                            for scaling_criteria in scaling_policy['scaling-criteria']:
                                await self._configure_scaling_criteria(
                                    nsr_id, vnfr, vnf_monitoring_params,
                                    scaling_policy, scaling_policy_record,
                                    scaling_criteria, mon_alarms_created)
            except Exception:
                log.exception("Error configuring scaling groups:")
                tx.rollback()
                if mon_alarms_created:
                    log.info("Cleaning alarm resources in MON")
                    for vnf_member_index, vdu_name, alarm_uuid in mon_alarms_created:
                        # Best effort: one failed deletion must neither stop
                        # the cleanup nor hide the original error.
                        try:
                            await self.mon_client.delete_alarm(nsr_id,
                                                               vnf_member_index,
                                                               vdu_name,
                                                               alarm_uuid)
                        except Exception:
                            log.exception("Error deleting alarm in MON %s", alarm_uuid)
                raise

    @staticmethod
    def _get_or_create_scaling_group(nsr_id, vnfr, scaling_group):
        """Return the ScalingGroup record for a descriptor, creating it if missing."""
        try:
            scaling_group_record = ScalingGroup.select().where(
                ScalingGroup.nsr_id == nsr_id,
                ScalingGroup.vnf_member_index == int(vnfr['member-vnf-index-ref']),
                ScalingGroup.name == scaling_group['name']
            ).get()
            log.info("Found existing scaling group record in DB...")
        except ScalingGroup.DoesNotExist:
            log.info("Creating scaling group record in DB...")
            # NOTE(review): the lookup above casts the member index to int
            # while the create passes it through unchanged; presumably the
            # model field coerces it -- confirm against the model definition.
            scaling_group_record = ScalingGroup.create(
                nsr_id=nsr_id,
                vnf_member_index=vnfr['member-vnf-index-ref'],
                name=scaling_group['name'],
                content=json.dumps(scaling_group)
            )
            log.info(
                "Created scaling group record in DB : nsr_id=%s, vnf_member_index=%s, name=%s",
                scaling_group_record.nsr_id,
                scaling_group_record.vnf_member_index,
                scaling_group_record.name)
        return scaling_group_record

    @staticmethod
    def _get_or_create_scaling_policy(nsr_id, scaling_policy, scaling_group_record):
        """Return the ScalingPolicy record of a group, creating it if missing."""
        try:
            scaling_policy_record = ScalingPolicy.select().join(ScalingGroup).where(
                ScalingPolicy.name == scaling_policy['name'],
                ScalingGroup.id == scaling_group_record.id
            ).get()
            log.info("Found existing scaling policy record in DB...")
        except ScalingPolicy.DoesNotExist:
            log.info("Creating scaling policy record in DB...")
            scaling_policy_record = ScalingPolicy.create(
                nsr_id=nsr_id,
                name=scaling_policy['name'],
                cooldown_time=scaling_policy['cooldown-time'],
                scaling_group=scaling_group_record
            )
            log.info("Created scaling policy record in DB : name=%s, scaling_group.name=%s",
                     scaling_policy_record.name,
                     scaling_policy_record.scaling_group.name)
        return scaling_policy_record

    @staticmethod
    def _get_or_create_scaling_criteria(nsr_id, scaling_criteria, scaling_policy_record):
        """Return the ScalingCriteria record of a policy, creating it if missing."""
        try:
            scaling_criteria_record = ScalingCriteria.select().join(ScalingPolicy).where(
                ScalingPolicy.id == scaling_policy_record.id,
                ScalingCriteria.name == scaling_criteria['name']
            ).get()
            log.info("Found existing scaling criteria record in DB...")
        except ScalingCriteria.DoesNotExist:
            log.info("Creating scaling criteria record in DB...")
            scaling_criteria_record = ScalingCriteria.create(
                nsr_id=nsr_id,
                name=scaling_criteria['name'],
                scaling_policy=scaling_policy_record
            )
            log.info(
                "Created scaling criteria record in DB : name=%s, scaling_policy.name=%s",
                scaling_criteria_record.name,
                scaling_criteria_record.scaling_policy.name)
        return scaling_criteria_record

    @staticmethod
    def _find_monitoring_param(vnf_monitoring_params, param_id):
        """Return the monitoring param whose ``id`` equals ``param_id``, or None."""
        return next(
            (param for param in vnf_monitoring_params if param['id'] == param_id),
            None)

    @staticmethod
    def _select_vdurs(vnfr, vnf_monitoring_param):
        """Return the VDU records of ``vnfr`` referenced by a monitoring param.

        The VDU reference is read from ``vdu-monitoring-param`` or, failing
        that, from ``vdu-metric``. Returns None when the monitoring param
        contains neither of them.
        """
        for ref_key in ('vdu-monitoring-param', 'vdu-metric'):
            if ref_key in vnf_monitoring_param:
                vdu_ref = vnf_monitoring_param[ref_key]['vdu-ref']
                return [vdur for vdur in vnfr['vdur'] if vdur['vdu-id-ref'] == vdu_ref]
        return None

    async def _configure_scaling_criteria(self, nsr_id, vnfr, vnf_monitoring_params,
                                          scaling_policy, scaling_policy_record,
                                          scaling_criteria, mon_alarms_created):
        """Ensure the record and the MON alarms of one scaling criteria exist.

        Every alarm created in MON is appended to ``mon_alarms_created`` as a
        ``(vnf_member_index, vdu_name, alarm_uuid)`` tuple before its database
        record is written, so the caller can remove it from MON if a later
        step fails.

        :raises ValueError: if the criteria references a monitoring param that
            the VNF descriptor does not define.
        """
        scaling_criteria_record = self._get_or_create_scaling_criteria(
            nsr_id, scaling_criteria, scaling_policy_record)

        param_ref = scaling_criteria['vnf-monitoring-param-ref']
        vnf_monitoring_param = self._find_monitoring_param(vnf_monitoring_params, param_ref)
        if vnf_monitoring_param is None:
            # Raise an explicit error naming the missing reference instead of
            # letting a bare StopIteration escape from an exhausted iterator.
            raise ValueError(
                "Scaling criteria '{}' refers to unknown vnf-monitoring-param '{}'".format(
                    scaling_criteria['name'], param_ref))

        vdurs = self._select_vdurs(vnfr, vnf_monitoring_param)
        if vdurs is None:
            if 'vnf-metric' in vnf_monitoring_param:
                log.warning("vnf-metric is not currently supported.")
            else:
                log.warning(
                    "Scaling criteria is referring to a vnf-monitoring-param that does not "
                    "contain a reference to a vdu or vnf metric.")
            return

        for vdur in vdurs:
            log.info("Creating alarm for vdur %s ", vdur)
            try:
                (ScalingAlarm.select()
                 .join(ScalingCriteria)
                 .join(ScalingPolicy)
                 .join(ScalingGroup)
                 .where(
                    ScalingAlarm.vdu_name == vdur['name'],
                    ScalingCriteria.name == scaling_criteria['name'],
                    ScalingPolicy.name == scaling_policy['name'],
                    ScalingGroup.nsr_id == nsr_id
                ).get())
            except ScalingAlarm.DoesNotExist:
                pass
            else:
                log.debug("vdu %s already has an alarm configured", vdur['name'])
                continue

            # One alarm per scaling direction; scale-in is created first.
            for action, prefix in (('scale_in', 'scale-in'), ('scale_out', 'scale-out')):
                alarm_uuid = await self.mon_client.create_alarm(
                    metric_name=vnf_monitoring_param['id'],
                    ns_id=nsr_id,
                    vdu_name=vdur['name'],
                    vnf_member_index=vnfr['member-vnf-index-ref'],
                    threshold=scaling_criteria[prefix + '-threshold'],
                    operation=scaling_criteria[prefix + '-relational-operation'],
                    statistic=vnf_monitoring_param['aggregation-type']
                )
                vnf_member_index = int(vnfr['member-vnf-index-ref'])
                # Track the alarm before writing the DB record so it is still
                # cleaned up in MON if the DB insert fails.
                mon_alarms_created.append((vnf_member_index, vdur['name'], alarm_uuid))
                ScalingAlarm.create(
                    alarm_uuid=alarm_uuid,
                    action=action,
                    vnf_member_index=vnf_member_index,
                    vdu_name=vdur['name'],
                    scaling_criteria=scaling_criteria_record
                )

    async def _delete_scaling_groups(self, nsr_id: str):
        """Delete all scaling records of a network service and their MON alarms.

        Alarms are deleted in MON first; a ``ValueError`` from MON is logged
        and the database record is removed anyway. Any other error rolls the
        transaction back and is re-raised.

        :param nsr_id: id of the network service record.
        """
        with database.db.atomic() as tx:
            try:
                for scaling_group in ScalingGroup.select().where(ScalingGroup.nsr_id == nsr_id):
                    for scaling_policy in scaling_group.scaling_policies:
                        for scaling_criteria in scaling_policy.scaling_criterias:
                            for alarm in scaling_criteria.scaling_alarms:
                                try:
                                    await self.mon_client.delete_alarm(
                                        alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                        alarm.vnf_member_index,
                                        alarm.vdu_name,
                                        alarm.alarm_uuid)
                                except ValueError:
                                    log.exception("Error deleting alarm in MON %s", alarm.alarm_uuid)
                                alarm.delete_instance()
                            # Children are removed before their parent record.
                            scaling_criteria.delete_instance()
                        scaling_policy.delete_instance()
                    scaling_group.delete_instance()

            except Exception:
                log.exception("Error deleting scaling groups and alarms:")
                tx.rollback()
                raise