Making healthcheck without Kafka reading
[osm/LCM.git] / osm_lcm / lcm.py
1 #!/usr/bin/python3
2 # -*- coding: utf-8 -*-
3
4 ##
5 # Copyright 2018 Telefonica S.A.
6 #
7 # Licensed under the Apache License, Version 2.0 (the "License"); you may
8 # not use this file except in compliance with the License. You may obtain
9 # a copy of the License at
10 #
11 # http://www.apache.org/licenses/LICENSE-2.0
12 #
13 # Unless required by applicable law or agreed to in writing, software
14 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
15 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
16 # License for the specific language governing permissions and limitations
17 # under the License.
18 ##
19
20 import asyncio
21 import yaml
22 import logging
23 import logging.handlers
24 import getopt
25 import sys
26 import ROclient
27 import ns
28 import vim_sdn
29 import netslice
30 from time import time, sleep
31 from lcm_utils import versiontuple, LcmException, TaskRegistry, LcmExceptionExit
32
33 # from osm_lcm import version as lcm_version, version_date as lcm_version_date, ROclient
34 from osm_common import dbmemory, dbmongo, fslocal, msglocal, msgkafka
35 from osm_common import version as common_version
36 from osm_common.dbbase import DbException
37 from osm_common.fsbase import FsException
38 from osm_common.msgbase import MsgException
39 from os import environ, path
40 from n2vc import version as n2vc_version
41
42
__author__ = "Alfonso Tierno"
# Minimum versions of the companion OSM components accepted by this module.
# min_RO_version is a list of integers because it is compared against the version returned by the RO client;
# the other two are dotted strings that are converted with versiontuple() before being compared.
min_RO_version = [0, 6, 3]
min_n2vc_version = "0.0.2"
min_common_version = "0.1.11"
# Version of this module, hard-coded here.
# NOTE(review): the original remark suggests that, when LCM is installed as a library, these two values should
# instead come from the package __init__.py (see the commented-out "from osm_lcm import version ..." import).
lcm_version = '0.1.35'
lcm_version_date = '2019-01-31'
# File where the timestamp of the last self-ping received is stored; it is read back by health_check()
health_check_file = path.expanduser("~") + "/time_last_ping"  # TODO find better location for this file
51
52
class Lcm:
    """
    Life Cycle Manager entry class.

    It loads the configuration, connects to the database, the file storage and the message bus, and then runs two
    cooperating asyncio tasks: 'kafka_read', which dispatches every message received to the proper handler
    (ns, nsi, vim_account, wim_account, sdn), and 'kafka_ping', which periodically writes a self-addressed ping to
    the bus in order to verify that messages are really being received.
    """

    ping_interval_pace = 120  # seconds between pings once it is confirmed that pings are being received
    ping_interval_boot = 5    # seconds between pings while booting, until the first ping is received back

    def __init__(self, config_file, loop=None):
        """
        Init, Connect to database, filesystem storage, and messaging
        :param config_file: path of the configuration file. Its content must be a two level dictionary whose top
            level contains, at least, 'global', 'RO', 'VCA', 'database', 'storage' and 'message'
        :param loop: asyncio event loop to use. If not provided, asyncio.get_event_loop() is used
        :return: None
        :raises LcmException: on incompatible N2VC/common versions, on an invalid 'driver' value, or when the
            connection to database, storage or message bus fails
        """

        self.db = None
        self.msg = None
        self.fs = None
        self.pings_not_received = 1
        self.consecutive_errors = 0
        self.first_start = False

        # contains created tasks/futures to be able to cancel
        self.lcm_tasks = TaskRegistry()
        # logging
        self.logger = logging.getLogger('lcm')
        # load configuration
        config = self.read_config_file(config_file)
        self.config = config
        self.ro_config = {
            "endpoint_url": "http://{}:{}/openmano".format(config["RO"]["host"], config["RO"]["port"]),
            "tenant": config.get("tenant", "osm"),
            "logger_name": "lcm.ROclient",
            "loglevel": "ERROR",
        }

        self.vca_config = config["VCA"]

        self.loop = loop or asyncio.get_event_loop()

        # logging
        log_format_simple = "%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)s %(message)s"
        log_formatter_simple = logging.Formatter(log_format_simple, datefmt='%Y-%m-%dT%H:%M:%S')
        config["database"]["logger_name"] = "lcm.db"
        config["storage"]["logger_name"] = "lcm.fs"
        config["message"]["logger_name"] = "lcm.msg"
        if config["global"].get("logfile"):
            file_handler = logging.handlers.RotatingFileHandler(config["global"]["logfile"],
                                                                maxBytes=100e6, backupCount=9, delay=0)
            file_handler.setFormatter(log_formatter_simple)
            self.logger.addHandler(file_handler)
        if not config["global"].get("nologging"):
            str_handler = logging.StreamHandler()
            str_handler.setFormatter(log_formatter_simple)
            self.logger.addHandler(str_handler)

        if config["global"].get("loglevel"):
            self.logger.setLevel(config["global"]["loglevel"])

        # logging other modules
        for k1, logname in {"message": "lcm.msg", "database": "lcm.db", "storage": "lcm.fs"}.items():
            config[k1]["logger_name"] = logname
            logger_module = logging.getLogger(logname)
            if config[k1].get("logfile"):
                file_handler = logging.handlers.RotatingFileHandler(config[k1]["logfile"],
                                                                    maxBytes=100e6, backupCount=9, delay=0)
                file_handler.setFormatter(log_formatter_simple)
                logger_module.addHandler(file_handler)
            if config[k1].get("loglevel"):
                logger_module.setLevel(config[k1]["loglevel"])
        self.logger.critical("starting osm/lcm version {} {}".format(lcm_version, lcm_version_date))

        # check version of N2VC
        # TODO enhance with int conversion or from distutils.version import LooseVersion
        # or with list(map(int, version.split(".")))
        if versiontuple(n2vc_version) < versiontuple(min_n2vc_version):
            raise LcmException("Not compatible osm/N2VC version '{}'. Needed '{}' or higher".format(
                n2vc_version, min_n2vc_version))
        # check version of common
        if versiontuple(common_version) < versiontuple(min_common_version):
            raise LcmException("Not compatible osm/common version '{}'. Needed '{}' or higher".format(
                common_version, min_common_version))

        try:
            # TODO check database version
            if config["database"]["driver"] == "mongo":
                self.db = dbmongo.DbMongo()
                self.db.db_connect(config["database"])
            elif config["database"]["driver"] == "memory":
                self.db = dbmemory.DbMemory()
                self.db.db_connect(config["database"])
            else:
                raise LcmException("Invalid configuration param '{}' at '[database]':'driver'".format(
                    config["database"]["driver"]))

            if config["storage"]["driver"] == "local":
                self.fs = fslocal.FsLocal()
                self.fs.fs_connect(config["storage"])
            else:
                raise LcmException("Invalid configuration param '{}' at '[storage]':'driver'".format(
                    config["storage"]["driver"]))

            # work over a copy, because the event loop is added to the parameters passed to the message driver
            config_message = config["message"].copy()
            config_message["loop"] = self.loop
            if config_message["driver"] == "local":
                self.msg = msglocal.MsgLocal()
                self.msg.connect(config_message)
            elif config_message["driver"] == "kafka":
                self.msg = msgkafka.MsgKafka()
                self.msg.connect(config_message)
            else:
                raise LcmException("Invalid configuration param '{}' at '[message]':'driver'".format(
                    config["message"]["driver"]))
        except (DbException, FsException, MsgException) as e:
            self.logger.critical(str(e), exc_info=True)
            raise LcmException(str(e))

        self.ns = ns.NsLcm(self.db, self.msg, self.fs, self.lcm_tasks, self.ro_config, self.vca_config, self.loop)
        self.netslice = netslice.NetsliceLcm(self.db, self.msg, self.fs, self.lcm_tasks, self.ro_config,
                                             self.vca_config, self.loop)
        self.vim = vim_sdn.VimLcm(self.db, self.msg, self.fs, self.lcm_tasks, self.ro_config, self.loop)
        self.wim = vim_sdn.WimLcm(self.db, self.msg, self.fs, self.lcm_tasks, self.ro_config, self.loop)
        self.sdn = vim_sdn.SdnLcm(self.db, self.msg, self.fs, self.lcm_tasks, self.ro_config, self.loop)

    @staticmethod
    def _too_many_errors(consecutive_errors, first_start):
        """
        Tells if the retry budget of a bus task is exhausted
        :param consecutive_errors: number of consecutive failures got so far
        :param first_start: True while the task has never succeeded since starting. In this case more failures
            are tolerated (30 instead of 8), in order to give time to kafka to start
        :return: True if the task must give up, False if it can retry
        """
        # The original inline expression 'errors == 8 if not first_start else 30' was parsed as
        # '(errors == 8) if not first_start else 30', which is always true at first start.
        return consecutive_errors >= (30 if first_start else 8)

    async def check_RO_version(self):
        """
        Asks osm/RO for its version and checks that it is at least min_RO_version
        :return: None
        :raises LcmException: if the RO version is lower than the required one or RO cannot be contacted
        """
        try:
            RO = ROclient.ROClient(self.loop, **self.ro_config)
            RO_version = await RO.get_version()
            if RO_version < min_RO_version:
                raise LcmException("Not compatible osm/RO version '{}.{}.{}'. Needed '{}.{}.{}' or higher".format(
                    *RO_version, *min_RO_version
                ))
        except ROclient.ROClientException as e:
            error_text = "Error while conneting to osm/RO " + str(e)
            self.logger.critical(error_text, exc_info=True)
            raise LcmException(error_text)

    async def test(self, param=None):
        """
        Dummy task launched by the 'test' command; it only logs the parameter received
        :param param: anything, it is just logged
        :return: None
        """
        self.logger.debug("Starting/Ending test task: {}".format(param))

    async def kafka_ping(self):
        """
        Task that periodically writes a ping from 'lcm' to 'lcm' at topic 'admin'. The callback of the reading task
        sets self.pings_not_received to 0 when such a ping is received back.
        It never returns normally.
        :return: None
        :raises LcmException: when more than 10 pings have been sent without receiving any of them back
        :raises Exception: the last exception got, when writing fails too many consecutive times
        """
        self.logger.debug("Task kafka_ping Enter")
        consecutive_errors = 0
        first_start = True
        kafka_has_received = False
        self.pings_not_received = 1
        while True:
            try:
                await self.msg.aiowrite("admin", "ping", {"from": "lcm", "to": "lcm"}, self.loop)
                # time between pings are low when it is not received and at starting
                wait_time = self.ping_interval_boot if not kafka_has_received else self.ping_interval_pace
                if not self.pings_not_received:
                    kafka_has_received = True
                self.pings_not_received += 1
                # no 'loop' argument: it was removed from asyncio.sleep in python 3.10; the running loop is used
                await asyncio.sleep(wait_time)
                if self.pings_not_received > 10:
                    raise LcmException("It is not receiving pings from Kafka bus")
                consecutive_errors = 0
                first_start = False
            except LcmException:
                raise
            except Exception as e:
                # if first_start, it has never worked since starting. So leave more time and wait
                # to allow kafka starts
                if self._too_many_errors(consecutive_errors, first_start):
                    self.logger.error("Task kafka_ping task exit error too many errors. Exception: {}".format(e))
                    raise
                consecutive_errors += 1
                self.logger.error("Task kafka_ping retrying after Exception {}".format(e))
                wait_time = 1 if not first_start else 5
                await asyncio.sleep(wait_time)

    def kafka_read_callback(self, topic, command, params):
        """
        Callback invoked for every message read from the bus. It launches the asyncio task that serves the
        command and registers it at self.lcm_tasks
        :param topic: topic of the message: 'admin', 'ns', 'nsi', 'vim_account', 'wim_account' or 'sdn'
        :param command: command of the message, e.g. 'instantiate', 'terminate', 'create', 'ping', ...
        :param params: content of the message. Its type depends on topic and command
        :return: None
        :raises LcmExceptionExit: when command is 'exit', in order to stop the reading task
        """
        order_id = 1

        if topic != "admin" and command != "ping":
            self.logger.debug("Task kafka_read receives {} {}: {}".format(topic, command, params))
        # NOTE(review): indentation was lost in the reviewed copy; these statements are assumed to run for
        # every message received (not only for the logged ones) - confirm
        # any message received proves that reading works, so the error accounting of kafka_read is reset
        self.consecutive_errors = 0
        self.first_start = False
        order_id += 1
        if command == "exit":
            raise LcmExceptionExit
        elif command.startswith("#"):
            return
        elif command == "echo":
            # just for test
            print(params)
            sys.stdout.flush()
            return
        elif command == "test":
            asyncio.Task(self.test(params), loop=self.loop)
            return

        if topic == "admin":
            if command == "ping" and params["to"] == "lcm" and params["from"] == "lcm":
                self.pings_not_received = 0
                # store when the last own ping was received, to be inspected by health_check()
                try:
                    with open(health_check_file, "w") as f:
                        f.write(str(time()))
                except Exception as e:
                    self.logger.error("Cannot write into '{}' for healthcheck: {}".format(health_check_file, e))
            # NOTE(review): this return is assumed to apply to the whole 'admin' topic - confirm
            return
        elif topic == "ns":
            if command == "instantiate":
                # self.logger.debug("Deploying NS {}".format(nsr_id))
                nslcmop = params
                nslcmop_id = nslcmop["_id"]
                nsr_id = nslcmop["nsInstanceId"]
                task = asyncio.ensure_future(self.ns.instantiate(nsr_id, nslcmop_id))
                self.lcm_tasks.register("ns", nsr_id, nslcmop_id, "ns_instantiate", task)
                return
            elif command == "terminate":
                # self.logger.debug("Deleting NS {}".format(nsr_id))
                nslcmop = params
                nslcmop_id = nslcmop["_id"]
                nsr_id = nslcmop["nsInstanceId"]
                self.lcm_tasks.cancel(topic, nsr_id)
                task = asyncio.ensure_future(self.ns.terminate(nsr_id, nslcmop_id))
                self.lcm_tasks.register("ns", nsr_id, nslcmop_id, "ns_terminate", task)
                return
            elif command == "action":
                # self.logger.debug("Update NS {}".format(nsr_id))
                nslcmop = params
                nslcmop_id = nslcmop["_id"]
                nsr_id = nslcmop["nsInstanceId"]
                task = asyncio.ensure_future(self.ns.action(nsr_id, nslcmop_id))
                self.lcm_tasks.register("ns", nsr_id, nslcmop_id, "ns_action", task)
                return
            elif command == "scale":
                # self.logger.debug("Update NS {}".format(nsr_id))
                nslcmop = params
                nslcmop_id = nslcmop["_id"]
                nsr_id = nslcmop["nsInstanceId"]
                task = asyncio.ensure_future(self.ns.scale(nsr_id, nslcmop_id))
                self.lcm_tasks.register("ns", nsr_id, nslcmop_id, "ns_scale", task)
                return
            elif command == "show":
                nsr_id = params
                try:
                    db_nsr = self.db.get_one("nsrs", {"_id": nsr_id})
                    # NOTE(review): 'self.lcm_ns_tasks' is never defined at this class (only 'self.lcm_tasks'),
                    # so this raises AttributeError and the 'not found' text below is what gets printed.
                    # It must be replaced by the proper TaskRegistry query
                    print("nsr:\n _id={}\n operational-status: {}\n config-status: {}"
                          "\n detailed-status: {}\n deploy: {}\n tasks: {}"
                          "".format(nsr_id, db_nsr["operational-status"], db_nsr["config-status"],
                                    db_nsr["detailed-status"],
                                    db_nsr["_admin"]["deployed"], self.lcm_ns_tasks.get(nsr_id)))
                except Exception as e:
                    print("nsr {} not found: {}".format(nsr_id, e))
                sys.stdout.flush()
                return
            elif command == "deleted":
                return  # TODO cleaning of task just in case should be done
            elif command in ("terminated", "instantiated", "scaled", "actioned"):  # "scaled-cooldown-time"
                return
        elif topic == "nsi":  # netslice LCM processes (instantiate, terminate, etc)
            if command == "instantiate":
                # self.logger.debug("Instantiating Network Slice {}".format(nsilcmop["netsliceInstanceId"]))
                nsilcmop = params
                nsilcmop_id = nsilcmop["_id"]  # slice operation id
                nsir_id = nsilcmop["netsliceInstanceId"]  # slice record id
                task = asyncio.ensure_future(self.netslice.instantiate(nsir_id, nsilcmop_id))
                self.lcm_tasks.register("nsi", nsir_id, nsilcmop_id, "nsi_instantiate", task)
                return
            elif command == "terminate":
                # self.logger.debug("Terminating Network Slice NS {}".format(nsilcmop["netsliceInstanceId"]))
                nsilcmop = params
                nsilcmop_id = nsilcmop["_id"]  # slice operation id
                nsir_id = nsilcmop["netsliceInstanceId"]  # slice record id
                self.lcm_tasks.cancel(topic, nsir_id)
                task = asyncio.ensure_future(self.netslice.terminate(nsir_id, nsilcmop_id))
                self.lcm_tasks.register("nsi", nsir_id, nsilcmop_id, "nsi_terminate", task)
                return
            elif command == "show":
                nsir_id = params
                try:
                    db_nsir = self.db.get_one("nsirs", {"_id": nsir_id})
                    # NOTE(review): 'self.lcm_netslice_tasks' is never defined at this class either; see the
                    # equivalent remark at the 'ns' 'show' command
                    print("nsir:\n _id={}\n operational-status: {}\n config-status: {}"
                          "\n detailed-status: {}\n deploy: {}\n tasks: {}"
                          "".format(nsir_id, db_nsir["operational-status"], db_nsir["config-status"],
                                    db_nsir["detailed-status"],
                                    db_nsir["_admin"]["deployed"], self.lcm_netslice_tasks.get(nsir_id)))
                except Exception as e:
                    print("nsir {} not found: {}".format(nsir_id, e))
                sys.stdout.flush()
                return
            elif command == "deleted":
                return  # TODO cleaning of task just in case should be done
            elif command in ("terminated", "instantiated", "scaled", "actioned"):  # "scaled-cooldown-time"
                return
        elif topic == "vim_account":
            vim_id = params["_id"]
            if command == "create":
                task = asyncio.ensure_future(self.vim.create(params, order_id))
                self.lcm_tasks.register("vim_account", vim_id, order_id, "vim_create", task)
                return
            elif command == "delete":
                self.lcm_tasks.cancel(topic, vim_id)
                task = asyncio.ensure_future(self.vim.delete(vim_id, order_id))
                self.lcm_tasks.register("vim_account", vim_id, order_id, "vim_delete", task)
                return
            elif command == "show":
                print("not implemented show with vim_account")
                sys.stdout.flush()
                return
            elif command == "edit":
                task = asyncio.ensure_future(self.vim.edit(params, order_id))
                self.lcm_tasks.register("vim_account", vim_id, order_id, "vim_edit", task)
                return
        elif topic == "wim_account":
            wim_id = params["_id"]
            if command == "create":
                task = asyncio.ensure_future(self.wim.create(params, order_id))
                self.lcm_tasks.register("wim_account", wim_id, order_id, "wim_create", task)
                return
            elif command == "delete":
                self.lcm_tasks.cancel(topic, wim_id)
                task = asyncio.ensure_future(self.wim.delete(wim_id, order_id))
                self.lcm_tasks.register("wim_account", wim_id, order_id, "wim_delete", task)
                return
            elif command == "show":
                print("not implemented show with wim_account")
                sys.stdout.flush()
                return
            elif command == "edit":
                task = asyncio.ensure_future(self.wim.edit(params, order_id))
                self.lcm_tasks.register("wim_account", wim_id, order_id, "wim_edit", task)
                return
        elif topic == "sdn":
            _sdn_id = params["_id"]
            if command == "create":
                task = asyncio.ensure_future(self.sdn.create(params, order_id))
                self.lcm_tasks.register("sdn", _sdn_id, order_id, "sdn_create", task)
                return
            elif command == "delete":
                self.lcm_tasks.cancel(topic, _sdn_id)
                task = asyncio.ensure_future(self.sdn.delete(_sdn_id, order_id))
                self.lcm_tasks.register("sdn", _sdn_id, order_id, "sdn_delete", task)
                return
            elif command == "edit":
                task = asyncio.ensure_future(self.sdn.edit(params, order_id))
                self.lcm_tasks.register("sdn", _sdn_id, order_id, "sdn_edit", task)
                return
        # reached with an unknown topic, and also with a known topic but a command not served above
        self.logger.critical("unknown topic {} and command '{}'".format(topic, command))

    async def kafka_read(self):
        """
        Task that reads the messages of the bus and serves each one with kafka_read_callback. On failure it
        retries after a pause.
        :return: None. It returns when the 'exit' command is received or after 10 consecutive errors
        :raises Exception: the last exception got, when the retry budget (see _too_many_errors) is exhausted
        """
        self.logger.debug("Task kafka_read Enter")
        # future = asyncio.Future()
        self.consecutive_errors = 0
        self.first_start = True
        # NOTE(review): because of this limit of 10, the budget of 30 errors at first start is never reached
        # here; the task just ends silently. Confirm that this is the intended behavior
        while self.consecutive_errors < 10:
            try:
                topics = ("admin", "ns", "vim_account", "wim_account", "sdn", "nsi")
                await self.msg.aioread(topics, self.loop, self.kafka_read_callback)

            except LcmExceptionExit:
                self.logger.debug("Bye!")
                break
            except Exception as e:
                # if first_start, nothing has been received since starting. So leave more time and wait
                # to allow kafka starts
                if self._too_many_errors(self.consecutive_errors, self.first_start):
                    self.logger.error("Task kafka_read task exit error too many errors. Exception: {}".format(e))
                    raise
                self.consecutive_errors += 1
                self.logger.error("Task kafka_read retrying after Exception {}".format(e))
                wait_time = 2 if not self.first_start else 5
                # no 'loop' argument: it was removed from asyncio.sleep in python 3.10; the running loop is used
                await asyncio.sleep(wait_time)

        # self.logger.debug("Task kafka_read terminating")
        self.logger.debug("Task kafka_read exit")

    def start(self):
        """
        Checks the RO version, runs the reading and the ping tasks until both finish (or one of them fails), and
        finally closes the event loop and disconnects from database, message bus and storage.
        After this method returns the object cannot be started again, because its loop is closed
        :return: None
        """

        # check RO version
        self.loop.run_until_complete(self.check_RO_version())

        self.loop.run_until_complete(asyncio.gather(
            self.kafka_read(),
            self.kafka_ping()
        ))
        # TODO
        # self.logger.debug("Terminating cancelling creation tasks")
        # self.lcm_tasks.cancel("ALL", "create")
        # timeout = 200
        # while self.is_pending_tasks():
        #     self.logger.debug("Task kafka_read terminating. Waiting for tasks termination")
        #     await asyncio.sleep(2, loop=self.loop)
        #     timeout -= 2
        #     if not timeout:
        #         self.lcm_tasks.cancel("ALL", "ALL")
        self.loop.close()
        self.loop = None
        if self.db:
            self.db.db_disconnect()
        if self.msg:
            self.msg.disconnect()
        if self.fs:
            self.fs.fs_disconnect()

    def read_config_file(self, config_file):
        """
        Loads the yaml configuration file and overrides its values with the environment variables
        OSMLCM_<section>_..._<key>. E.g. OSMLCM_RO_PORT overrides config["RO"]["port"].
        Sections 'ro' and 'vca' are converted to capital letters, and a key named 'port' is converted to integer.
        A variable that does not match the structure of the file is skipped with a warning.
        The process exits with status 1 if the file cannot be read or parsed
        :param config_file: path of the configuration file
        :return: the content of the configuration, normally a two level dictionary
        """
        # TODO make a [ini] + yaml inside parser
        # the configparser library is not suitable, because it does not admit comments at the end of line,
        # and not parse integer or boolean
        try:
            with open(config_file) as f:
                # safe_load: the file only contains plain data, and yaml.load without Loader is unsafe and it is
                # not admitted by recent versions of PyYAML
                conf = yaml.safe_load(f)
            for k, v in environ.items():
                if not k.startswith("OSMLCM_"):
                    continue
                k_items = k.lower().split("_")
                if len(k_items) < 3:
                    continue
                if k_items[1] in ("ro", "vca"):
                    # put in capital letter
                    k_items[1] = k_items[1].upper()
                c = conf
                try:
                    # go down until the dictionary that contains the last key
                    for k_item in k_items[1:-1]:
                        c = c[k_item]
                    if k_items[-1] == "port":
                        c[k_items[-1]] = int(v)
                    else:
                        c[k_items[-1]] = v
                except Exception as e:
                    self.logger.warning("skipping environ '{}' on exception '{}'".format(k, e))

            return conf
        except Exception as e:
            self.logger.critical("At config file '{}': {}".format(config_file, e))
            sys.exit(1)
479
480
def usage():
    """
    Prints the command line help at standard output. The name of the program is taken from sys.argv[0]
    :return: None
    """
    # The text mentioned './nbi.cfg' as default and a health check that inspects the kafka bus. Both were wrong:
    # this program looks for 'lcm.cfg', and the health check only reads the file with the time of the last ping
    print("""Usage: {} [options]
    -c|--config [configuration_file]: loads the configuration file
        (default: first found of <this script>.cfg, ./lcm.cfg, /etc/osm/lcm.cfg)
    --health-check: do not run lcm, but check the time of the last ping received to determine if lcm is healthy
    -h|--help: shows this help
    """.format(sys.argv[0]))
    # --log-socket-host HOST: send logs to this host")
    # --log-socket-port PORT: send logs using this port (default: 9022)")
489
490
def health_check():
    """
    Determines if lcm is healthy by inspecting the time stored at health_check_file.
    Exits with status 0 if that time is less than Lcm.ping_interval_pace + 10 seconds old. Otherwise it waits
    6 seconds and tries once more, and finally exits with status 1.
    This function never returns
    """
    # two attempts; the value tells how many attempts are left after the current one
    for attempts_left in (1, 0):
        try:
            with open(health_check_file, "r") as stamp_file:
                raw_stamp = stamp_file.read()

            elapsed = time() - float(raw_stamp)
            if elapsed < Lcm.ping_interval_pace + 10:
                exit(0)
        except Exception:
            # best effort: a missing or malformed file just counts as a failed attempt
            pass
        if attempts_left:
            sleep(6)
    exit(1)
506
507
# Command line entry point: parses the options, locates the configuration file and runs the Lcm until it ends.
# Exits with status 1 on a wrong option, on a missing configuration file or on an LcmException
if __name__ == '__main__':
    try:
        # load parameters and configuration
        opts, args = getopt.getopt(sys.argv[1:], "hc:", ["config=", "help", "health-check"])
        # TODO add "log-socket-host=", "log-socket-port=", "log-file="
        config_file = None
        for o, a in opts:
            if o in ("-h", "--help"):
                usage()
                sys.exit()
            elif o in ("-c", "--config"):
                config_file = a
            elif o == "--health-check":
                # it does not return: it exits with the status of the check
                health_check()
            # elif o == "--log-socket-port":
            #     log_socket_port = a
            # elif o == "--log-socket-host":
            #     log_socket_host = a
            # elif o == "--log-file":
            #     log_file = a
            else:
                # an option declared at getopt above but not served here. An exception is raised instead of
                # using 'assert', because asserts are removed when python runs with -O
                raise getopt.GetoptError("Unhandled option '{}'".format(o), o)
        if config_file:
            if not path.isfile(config_file):
                print("configuration file '{}' not exist".format(config_file), file=sys.stderr)
                sys.exit(1)
        else:
            # path.splitext only removes a real extension of the file name; cutting at the last "." of the
            # whole path fails when the script has no extension
            for config_file in (path.splitext(__file__)[0] + ".cfg", "./lcm.cfg", "/etc/osm/lcm.cfg"):
                if path.isfile(config_file):
                    break
            else:
                print("No configuration file 'lcm.cfg' found neither at local folder nor at /etc/osm/", file=sys.stderr)
                sys.exit(1)
        lcm = Lcm(config_file)
        lcm.start()
    except (LcmException, getopt.GetoptError) as e:
        print(str(e), file=sys.stderr)
        # usage()
        sys.exit(1)