# Extracted from gitweb: commit "added env vars for mongo ha"
# [osm/POL.git] / osm_policy_module / core / agent.py
# -*- coding: utf-8 -*-

# Copyright 2018 Whitestack, LLC
# *************************************************************

# This file is part of OSM Monitoring module
# All Rights Reserved to Whitestack, LLC

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at

#         http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# For those usages not covered by the Apache License, Version 2.0 please
# contact: bdiaz@whitestack.com or glavado@whitestack.com
##
import asyncio
import datetime
import json
import logging
from json import JSONDecodeError

import yaml
from aiokafka import AIOKafkaConsumer

from osm_policy_module.common.common_db_client import CommonDbClient
from osm_policy_module.common.lcm_client import LcmClient
from osm_policy_module.common.mon_client import MonClient
from osm_policy_module.core import database
from osm_policy_module.core.config import Config
from osm_policy_module.core.database import ScalingGroup, ScalingAlarm, ScalingPolicy, ScalingCriteria, DatabaseManager
from osm_policy_module.utils.vnfd import VnfdUtils

log = logging.getLogger(__name__)

# Kafka message keys this agent reacts to; any other key is logged and ignored.
ALLOWED_KAFKA_KEYS = ['instantiated', 'scaled', 'terminated', 'notify_alarm']
44
45
class PolicyModuleAgent:
    """Kafka-driven agent implementing OSM auto-scaling policies.

    Consumes the ``ns`` and ``alarm_response`` topics: when a network
    service finishes instantiating or scaling, the scaling groups,
    policies, criteria and MON alarms are (re)created in the local
    database; when MON notifies an alarm, the matching scale action is
    requested from LCM (honouring the policy cooldown); when a network
    service is terminated, all its scaling records and MON alarms are
    deleted.
    """

    def __init__(self, loop=None):
        """Wire up the message-bus, common-DB, MON and LCM clients.

        :param loop: optional asyncio event loop; defaults to the
            current event loop.
        """
        cfg = Config.instance()
        if not loop:
            loop = asyncio.get_event_loop()
        self.loop = loop
        self.db_client = CommonDbClient()
        self.mon_client = MonClient(loop=self.loop)
        self.lcm_client = LcmClient(loop=self.loop)
        self.kafka_server = '{}:{}'.format(cfg.OSMPOL_MESSAGE_HOST,
                                           cfg.OSMPOL_MESSAGE_PORT)
        self.database_manager = DatabaseManager()

    def run(self):
        """Synchronous entry point: consume messages until stopped."""
        self.loop.run_until_complete(self.start())

    async def start(self):
        """Consume the ``ns`` and ``alarm_response`` topics forever."""
        consumer = AIOKafkaConsumer(
            "ns",
            "alarm_response",
            loop=self.loop,
            bootstrap_servers=self.kafka_server,
            group_id="pol-consumer",
            key_deserializer=bytes.decode,
            value_deserializer=bytes.decode,
        )
        await consumer.start()
        try:
            async for msg in consumer:
                log.info("Message arrived: %s", msg)
                await self._process_msg(msg.topic, msg.key, msg.value)
        finally:
            # Leave the consumer group cleanly even on cancellation/error.
            await consumer.stop()

    async def _process_msg(self, topic, key, msg):
        """Decode one Kafka message and dispatch it to its handler.

        Payloads may be JSON or YAML. Any handler failure is logged and
        swallowed so a single bad message cannot kill the consumer loop.
        """
        log.debug("_process_msg topic=%s key=%s msg=%s", topic, key, msg)
        try:
            if key in ALLOWED_KAFKA_KEYS:
                try:
                    content = json.loads(msg)
                except JSONDecodeError:
                    # Some producers publish YAML instead of JSON.
                    content = yaml.safe_load(msg)

                if key in ('instantiated', 'scaled'):
                    await self._handle_instantiated_or_scaled(content)

                if key == 'terminated':
                    await self._handle_terminated(content)

                if key == 'notify_alarm':
                    await self._handle_alarm_notification(content)
            else:
                log.debug("Key %s is not in ALLOWED_KAFKA_KEYS", key)
        except Exception:
            log.exception("Error consuming message: ")

    async def _handle_alarm_notification(self, content):
        """React to a MON alarm: ask LCM to scale if an action is configured.

        The action is skipped while the scaling policy is still inside its
        cooldown window; alarm uuids with no configured action are ignored.
        """
        log.debug("_handle_alarm_notification: %s", content)
        alarm_uuid = content['notify_details']['alarm_uuid']
        metric_name = content['notify_details']['metric_name']
        operation = content['notify_details']['operation']
        threshold = content['notify_details']['threshold_value']
        vdu_name = content['notify_details']['vdu_name']
        vnf_member_index = content['notify_details']['vnf_member_index']
        nsr_id = content['notify_details']['ns_id']
        log.info(
            "Received alarm notification for alarm %s, metric %s, "
            "operation %s, threshold %s, vdu_name %s, "
            "vnf_member_index %s, ns_id %s",
            alarm_uuid, metric_name, operation, threshold, vdu_name,
            vnf_member_index, nsr_id)
        try:
            alarm = self.database_manager.get_alarm(alarm_uuid)
            delta = datetime.datetime.now() - alarm.scaling_criteria.scaling_policy.last_scale
            log.debug("last_scale: %s", alarm.scaling_criteria.scaling_policy.last_scale)
            log.debug("now: %s", datetime.datetime.now())
            log.debug("delta: %s", delta)
            if delta.total_seconds() < alarm.scaling_criteria.scaling_policy.cooldown_time:
                log.info("Time between last scale and now is less than cooldown time. Skipping.")
                return
            log.info("Sending scaling action message for ns: %s", nsr_id)
            await self.lcm_client.scale(nsr_id,
                                        alarm.scaling_criteria.scaling_policy.scaling_group.name,
                                        alarm.vnf_member_index,
                                        alarm.action)
            # Restart the cooldown window only after a scale was requested.
            alarm.scaling_criteria.scaling_policy.last_scale = datetime.datetime.now()
            alarm.scaling_criteria.scaling_policy.save()
        except ScalingAlarm.DoesNotExist:
            log.info("There is no action configured for alarm %s.", alarm_uuid)

    async def _handle_instantiated_or_scaled(self, content):
        """(Re)configure scaling groups once an instantiate/scale op finishes."""
        log.debug("_handle_instantiated_or_scaled: %s", content)
        nslcmop_id = content['nslcmop_id']
        nslcmop = self.db_client.get_nslcmop(nslcmop_id)
        if nslcmop['operationState'] in ('COMPLETED', 'PARTIALLY_COMPLETED'):
            nsr_id = nslcmop['nsInstanceId']
            log.info("Configuring scaling groups for network service with nsr_id: %s", nsr_id)
            await self._configure_scaling_groups(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                nslcmop['operationState'])

    async def _handle_terminated(self, content):
        """Tear down scaling groups and alarms once a terminate op finishes."""
        log.debug("_handle_deleted: %s", content)
        nsr_id = content['nsr_id']
        if content['operationState'] in ('COMPLETED', 'PARTIALLY_COMPLETED'):
            log.info("Deleting scaling groups and alarms for network service with nsr_id: %s", nsr_id)
            await self._delete_scaling_groups(nsr_id)
        else:
            log.info(
                "Network service is not in COMPLETED or PARTIALLY_COMPLETED state. "
                "Current state is %s. Skipping...",
                content['operationState'])

    async def _configure_scaling_groups(self, nsr_id: str):
        """Create DB records and MON alarms for every automatic scaling policy.

        For each VNFR of the NS, walks the VNFD's scaling-group
        descriptors and (idempotently) creates the scaling group, policy,
        criteria and per-vdu scale-in/scale-out alarm records. On any
        failure the DB transaction is rolled back, the MON alarms created
        so far are deleted, and the exception is re-raised.

        :param nsr_id: network service record id to configure.
        """
        log.debug("_configure_scaling_groups: %s", nsr_id)
        alarms_created = []
        with database.db.atomic() as tx:
            try:
                vnfrs = self.db_client.get_vnfrs(nsr_id)
                for vnfr in vnfrs:
                    log.info("Processing vnfr: %s", vnfr)
                    vnfd = self.db_client.get_vnfd(vnfr['vnfd-id'])
                    log.info("Looking for vnfd %s", vnfr['vnfd-id'])
                    if 'scaling-group-descriptor' not in vnfd:
                        continue
                    scaling_groups = vnfd['scaling-group-descriptor']
                    vnf_monitoring_params = vnfd['monitoring-param']
                    for scaling_group in scaling_groups:
                        # Reuse the scaling group record if it already exists.
                        try:
                            scaling_group_record = ScalingGroup.select().where(
                                ScalingGroup.nsr_id == nsr_id,
                                ScalingGroup.vnf_member_index == int(vnfr['member-vnf-index-ref']),
                                ScalingGroup.name == scaling_group['name']
                            ).get()
                            log.info("Found existing scaling group record in DB...")
                        except ScalingGroup.DoesNotExist:
                            log.info("Creating scaling group record in DB...")
                            scaling_group_record = ScalingGroup.create(
                                nsr_id=nsr_id,
                                vnf_member_index=vnfr['member-vnf-index-ref'],
                                name=scaling_group['name'],
                                content=json.dumps(scaling_group)
                            )
                            log.info(
                                "Created scaling group record in DB : nsr_id=%s, vnf_member_index=%s, name=%s",
                                scaling_group_record.nsr_id,
                                scaling_group_record.vnf_member_index,
                                scaling_group_record.name)
                        for scaling_policy in scaling_group['scaling-policy']:
                            # Only automatic policies are driven by alarms.
                            if scaling_policy['scaling-type'] != 'automatic':
                                continue
                            try:
                                scaling_policy_record = ScalingPolicy.select().join(ScalingGroup).where(
                                    ScalingPolicy.name == scaling_policy['name'],
                                    ScalingGroup.id == scaling_group_record.id
                                ).get()
                                log.info("Found existing scaling policy record in DB...")
                            except ScalingPolicy.DoesNotExist:
                                log.info("Creating scaling policy record in DB...")
                                scaling_policy_record = ScalingPolicy.create(
                                    nsr_id=nsr_id,
                                    name=scaling_policy['name'],
                                    cooldown_time=scaling_policy['cooldown-time'],
                                    scaling_group=scaling_group_record
                                )
                                log.info("Created scaling policy record in DB : name=%s, scaling_group.name=%s",
                                         scaling_policy_record.name,
                                         scaling_policy_record.scaling_group.name)

                            for scaling_criteria in scaling_policy['scaling-criteria']:
                                try:
                                    scaling_criteria_record = ScalingCriteria.select().join(ScalingPolicy).where(
                                        ScalingPolicy.id == scaling_policy_record.id,
                                        ScalingCriteria.name == scaling_criteria['name']
                                    ).get()
                                    log.info("Found existing scaling criteria record in DB...")
                                except ScalingCriteria.DoesNotExist:
                                    log.info("Creating scaling criteria record in DB...")
                                    scaling_criteria_record = ScalingCriteria.create(
                                        nsr_id=nsr_id,
                                        name=scaling_criteria['name'],
                                        scaling_policy=scaling_policy_record
                                    )
                                    log.info(
                                        "Created scaling criteria record in DB : name=%s, scaling_policy.name=%s",
                                        scaling_criteria_record.name,
                                        scaling_criteria_record.scaling_policy.name)

                                vnf_monitoring_param = next(
                                    filter(
                                        lambda param: param['id'] == scaling_criteria[
                                            'vnf-monitoring-param-ref'
                                        ],
                                        vnf_monitoring_params)
                                )
                                # Resolve which vdu records the monitoring param
                                # refers to (vdu param, vdu metric or vnf metric).
                                if 'vdu-monitoring-param' in vnf_monitoring_param:
                                    vdu_ref = vnf_monitoring_param['vdu-monitoring-param']['vdu-ref']
                                    vdurs = [vdur for vdur in vnfr['vdur']
                                             if vdur['vdu-id-ref'] == vdu_ref]
                                elif 'vdu-metric' in vnf_monitoring_param:
                                    vdu_ref = vnf_monitoring_param['vdu-metric']['vdu-ref']
                                    vdurs = [vdur for vdur in vnfr['vdur']
                                             if vdur['vdu-id-ref'] == vdu_ref]
                                elif 'vnf-metric' in vnf_monitoring_param:
                                    # vnf metrics are attached to the mgmt vdu.
                                    vdu = VnfdUtils.get_mgmt_vdu(vnfd)
                                    vdurs = [vdur for vdur in vnfr['vdur']
                                             if vdur['vdu-id-ref'] == vdu['id']]
                                else:
                                    log.warning(
                                        "Scaling criteria is referring to a vnf-monitoring-param that does not "
                                        "contain a reference to a vdu or vnf metric.")
                                    continue
                                for vdur in vdurs:
                                    log.info("Creating alarm for vdur %s ", vdur)
                                    try:
                                        (ScalingAlarm.select()
                                         .join(ScalingCriteria)
                                         .join(ScalingPolicy)
                                         .join(ScalingGroup)
                                         .where(
                                             ScalingAlarm.vdu_name == vdur['name'],
                                             ScalingCriteria.name == scaling_criteria['name'],
                                             ScalingPolicy.name == scaling_policy['name'],
                                             ScalingGroup.nsr_id == nsr_id
                                         ).get())
                                        log.debug("vdu %s already has an alarm configured", vdur['name'])
                                        continue
                                    except ScalingAlarm.DoesNotExist:
                                        pass
                                    # One scale-in and one scale-out alarm per vdu.
                                    for action, prefix in (('scale_in', 'scale-in'),
                                                           ('scale_out', 'scale-out')):
                                        alarm_uuid = await self.mon_client.create_alarm(
                                            metric_name=vnf_monitoring_param['id'],
                                            ns_id=nsr_id,
                                            vdu_name=vdur['name'],
                                            vnf_member_index=vnfr['member-vnf-index-ref'],
                                            threshold=scaling_criteria[prefix + '-threshold'],
                                            operation=scaling_criteria[prefix + '-relational-operation'],
                                            statistic=vnf_monitoring_param['aggregation-type']
                                        )
                                        alarm = ScalingAlarm.create(
                                            alarm_uuid=alarm_uuid,
                                            action=action,
                                            vnf_member_index=int(vnfr['member-vnf-index-ref']),
                                            vdu_name=vdur['name'],
                                            scaling_criteria=scaling_criteria_record
                                        )
                                        alarms_created.append(alarm)

            except Exception as e:
                log.exception("Error configuring scaling groups:")
                tx.rollback()
                if len(alarms_created) > 0:
                    # Best-effort cleanup of MON alarms created before the failure.
                    log.info("Cleaning alarm resources in MON")
                    for alarm in alarms_created:
                        await self.mon_client.delete_alarm(alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                                           alarm.vnf_member_index,
                                                           alarm.vdu_name,
                                                           alarm.alarm_uuid)
                raise e

    async def _delete_scaling_groups(self, nsr_id: str):
        """Delete all scaling records of an NS together with their MON alarms.

        MON deletion failures (ValueError) are logged and ignored so the DB
        cleanup still completes; any other failure rolls back the whole
        transaction and is re-raised.

        :param nsr_id: network service record id to clean up.
        """
        with database.db.atomic() as tx:
            try:
                for scaling_group in ScalingGroup.select().where(ScalingGroup.nsr_id == nsr_id):
                    for scaling_policy in scaling_group.scaling_policies:
                        for scaling_criteria in scaling_policy.scaling_criterias:
                            for alarm in scaling_criteria.scaling_alarms:
                                try:
                                    await self.mon_client.delete_alarm(
                                        alarm.scaling_criteria.scaling_policy.scaling_group.nsr_id,
                                        alarm.vnf_member_index,
                                        alarm.vdu_name,
                                        alarm.alarm_uuid)
                                except ValueError:
                                    log.exception("Error deleting alarm in MON %s", alarm.alarm_uuid)
                                alarm.delete_instance()
                            scaling_criteria.delete_instance()
                        scaling_policy.delete_instance()
                    scaling_group.delete_instance()

            except Exception as e:
                log.exception("Error deleting scaling groups and alarms:")
                tx.rollback()
                raise e