osm_lcm/lcm_helm_conn.py

   1 ##
   2 # Copyright 2020 Telefonica Investigacion y Desarrollo, S.A.U.
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License");
   5 # you may not use this file except in compliance with the License.
   6 # You may obtain a copy of the License at
   7 #
   8 #    http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  13 # implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16 #
  17 ##
  18 import functools
  19 import yaml
  20 import asyncio
  21 import socket
  22 import uuid
  23 import os
  24
  25 from grpclib.client import Channel
  26
  27 from osm_lcm.frontend_pb2 import PrimitiveRequest
  28 from osm_lcm.frontend_pb2 import SshKeyRequest, SshKeyReply
  29 from osm_lcm.frontend_grpc import FrontendExecutorStub
  30 from osm_lcm.lcm_utils import LcmBase
  31
  32 from osm_lcm.data_utils.database.database import Database
  33 from osm_lcm.data_utils.filesystem.filesystem import Filesystem
  34
  35 from n2vc.n2vc_conn import N2VCConnector
  36 from n2vc.k8s_helm_conn import K8sHelmConnector
  37 from n2vc.k8s_helm3_conn import K8sHelm3Connector
  38 from n2vc.exceptions import N2VCBadArgumentsException, N2VCException, N2VCExecutionException
  39
  40 from osm_lcm.lcm_utils import deep_get
  41
  42
  43 def retryer(max_wait_time_var="_initial_retry_time", delay_time_var="_retry_delay"):
  44     def wrapper(func):
  45         retry_exceptions = (
  46             ConnectionRefusedError
  47         )
  48
  49         @functools.wraps(func)
  50         async def wrapped(*args, **kwargs):
  51             # default values for wait time and delay_time
  52             delay_time = 10
  53             max_wait_time = 300
  54
  55             # obtain arguments from variable names
  56             self = args[0]
  57             if self.__dict__.get(max_wait_time_var):
  58                 max_wait_time = self.__dict__.get(max_wait_time_var)
  59             if self.__dict__.get(delay_time_var):
  60                 delay_time = self.__dict__.get(delay_time_var)
  61
  62             wait_time = max_wait_time
  63             while wait_time > 0:
  64                 try:
  65                     return await func(*args, **kwargs)
  66                 except retry_exceptions:
  67                     wait_time = wait_time - delay_time
  68                     await asyncio.sleep(delay_time)
  69                     continue
  70             else:
  71                 return ConnectionRefusedError
  72         return wrapped
  73     return wrapper
  74
  75
  76 class LCMHelmConn(N2VCConnector, LcmBase):
  77     _KUBECTL_OSM_NAMESPACE = "osm"
  78     _KUBECTL_OSM_CLUSTER_NAME = "_system-osm-k8s"
  79     _EE_SERVICE_PORT = 50050
  80
  81     # Initial max retry time
  82     _MAX_INITIAL_RETRY_TIME = 600
  83     # Max retry time for normal operations
  84     _MAX_RETRY_TIME = 30
    # Time between retries, retry time after a connection error is raised
  86     _EE_RETRY_DELAY = 10
  87
  88     def __init__(self,
  89                  log: object = None,
  90                  loop: object = None,
  91                  url: str = None,
  92                  username: str = None,
  93                  vca_config: dict = None,
  94                  on_update_db=None, ):
  95         """
  96         Initialize EE helm connector.
  97         """
  98
  99         self.db = Database().instance.db
 100         self.fs = Filesystem().instance.fs
 101
 102         # parent class constructor
 103         N2VCConnector.__init__(
 104             self,
 105             log=log,
 106             loop=loop,
 107             url=url,
 108             username=username,
 109             vca_config=vca_config,
 110             on_update_db=on_update_db,
 111             db=self.db,
 112             fs=self.fs
 113         )
 114
 115         self.log.debug("Initialize helm N2VC connector")
 116         self.log.debug("initial vca_config: {}".format(vca_config))
 117
 118         # TODO - Obtain data from configuration
 119         self._ee_service_port = self._EE_SERVICE_PORT
 120
 121         self._retry_delay = self._EE_RETRY_DELAY
 122
 123         if self.vca_config and self.vca_config.get("eegrpcinittimeout"):
 124             self._initial_retry_time = self.vca_config.get("eegrpcinittimeout")
 125             self.log.debug("Initial retry time: {}".format(self._initial_retry_time))
 126         else:
 127             self._initial_retry_time = self._MAX_INITIAL_RETRY_TIME
 128             self.log.debug("Applied default retry time: {}".format(self._initial_retry_time))
 129
 130         if self.vca_config and self.vca_config.get("eegrpctimeout"):
 131             self._max_retry_time = self.vca_config.get("eegrpctimeout")
 132             self.log.debug("Retry time: {}".format(self._max_retry_time))
 133         else:
 134             self._max_retry_time = self._MAX_RETRY_TIME
 135             self.log.debug("Applied default retry time: {}".format(self._max_retry_time))
 136
 137         # initialize helm connector for helmv2 and helmv3
 138         self._k8sclusterhelm2 = K8sHelmConnector(
 139             kubectl_command=self.vca_config.get("kubectlpath"),
 140             helm_command=self.vca_config.get("helmpath"),
 141             fs=self.fs,
 142             db=self.db,
 143             log=self.log,
 144             on_update_db=None,
 145         )
 146
 147         self._k8sclusterhelm3 = K8sHelm3Connector(
 148             kubectl_command=self.vca_config.get("kubectlpath"),
 149             helm_command=self.vca_config.get("helm3path"),
 150             fs=self.fs,
 151             log=self.log,
 152             db=self.db,
 153             on_update_db=None,
 154         )
 155
 156         self._system_cluster_id = None
 157         self.log.info("Helm N2VC connector initialized")
 158
 159     # TODO - ¿reuse_ee_id?
 160     async def create_execution_environment(self,
 161                                            namespace: str,
 162                                            db_dict: dict,
 163                                            reuse_ee_id: str = None,
 164                                            progress_timeout: float = None,
 165                                            total_timeout: float = None,
 166                                            config: dict = None,
 167                                            artifact_path: str = None,
 168                                            vca_type: str = None,
 169                                            *kargs, **kwargs) -> (str, dict):
 170         """
 171         Creates a new helm execution environment deploying the helm-chat indicated in the
 172         attifact_path
 173         :param str namespace: This param is not used, all helm charts are deployed in the osm
 174         system namespace
 175         :param dict db_dict: where to write to database when the status changes.
 176             It contains a dictionary with {collection: str, filter: {},  path: str},
 177                 e.g. {collection: "nsrs", filter: {_id: <nsd-id>, path:
 178                 "_admin.deployed.VCA.3"}
 179         :param str reuse_ee_id: ee id from an older execution. TODO - right now this params is not used
 180         :param float progress_timeout:
 181         :param float total_timeout:
 182         :param dict config:  General variables to instantiate KDU
 183         :param str artifact_path:  path of package content
 184         :param str vca_type:  Type of vca, must be type helm or helm-v3
 185         :returns str, dict: id of the new execution environment including namespace.helm_id
 186         and credentials object set to None as all credentials should be osm kubernetes .kubeconfig
 187         """
 188
 189         self.log.info(
 190             "create_execution_environment: namespace: {}, artifact_path: {}, db_dict: {}, "
 191             "reuse_ee_id: {}".format(
 192                 namespace, artifact_path, db_dict, reuse_ee_id)
 193         )
 194
 195         # Validate artifact-path is provided
 196         if artifact_path is None or len(artifact_path) == 0:
 197             raise N2VCBadArgumentsException(
 198                 message="artifact_path is mandatory", bad_args=["artifact_path"]
 199             )
 200
 201         # Validate artifact-path exists and sync path
 202         from_path = os.path.split(artifact_path)[0]
 203         self.fs.sync(from_path)
 204
 205         # remove / in charm path
 206         while artifact_path.find("//") >= 0:
 207             artifact_path = artifact_path.replace("//", "/")
 208
 209         # check charm path
 210         if self.fs.file_exists(artifact_path):
 211             helm_chart_path = artifact_path
 212         else:
 213             msg = "artifact path does not exist: {}".format(artifact_path)
 214             raise N2VCBadArgumentsException(message=msg, bad_args=["artifact_path"])
 215
 216         if artifact_path.startswith("/"):
 217             full_path = self.fs.path + helm_chart_path
 218         else:
 219             full_path = self.fs.path + "/" + helm_chart_path
 220
 221         while full_path.find("//") >= 0:
 222             full_path = full_path.replace("//", "/")
 223
 224         try:
 225             # Call helm conn install
 226             # Obtain system cluster id from database
 227             system_cluster_uuid = await self._get_system_cluster_id()
 228             # Add parameter osm if exist to global
 229             if config and config.get("osm"):
 230                 if not config.get("global"):
 231                     config["global"] = {}
 232                 config["global"]["osm"] = config.get("osm")
 233
 234             self.log.debug("install helm chart: {}".format(full_path))
 235             if vca_type == "helm":
 236                 helm_id = self._k8sclusterhelm2.generate_kdu_instance_name(
 237                     db_dict=db_dict,
 238                     kdu_model=full_path,
 239                 )
 240                 await self._k8sclusterhelm2.install(system_cluster_uuid, kdu_model=full_path,
 241                                                     kdu_instance=helm_id,
 242                                                     namespace=self._KUBECTL_OSM_NAMESPACE,
 243                                                     params=config,
 244                                                     db_dict=db_dict,
 245                                                     timeout=progress_timeout)
 246             else:
 247                 helm_id = self._k8sclusterhelm2.generate_kdu_instance_name(
 248                     db_dict=db_dict,
 249                     kdu_model=full_path,
 250                 )
 251                 await self._k8sclusterhelm3.install(system_cluster_uuid, kdu_model=full_path,
 252                                                     kdu_instance=helm_id,
 253                                                     namespace=self._KUBECTL_OSM_NAMESPACE,
 254                                                     params=config,
 255                                                     db_dict=db_dict,
 256                                                     timeout=progress_timeout)
 257
 258             ee_id = "{}:{}.{}".format(vca_type, self._KUBECTL_OSM_NAMESPACE, helm_id)
 259             return ee_id, None
 260         except N2VCException:
 261             raise
 262         except Exception as e:
 263             self.log.error("Error deploying chart ee: {}".format(e), exc_info=True)
 264             raise N2VCException("Error deploying chart ee: {}".format(e))
 265
 266     async def register_execution_environment(self, namespace: str, credentials: dict, db_dict: dict,
 267                                              progress_timeout: float = None, total_timeout: float = None,
 268                                              *kargs, **kwargs) -> str:
 269         # nothing to do
 270         pass
 271
 272     async def install_configuration_sw(self,
 273                                        ee_id: str,
 274                                        artifact_path: str,
 275                                        db_dict: dict,
 276                                        progress_timeout: float = None,
 277                                        total_timeout: float = None,
 278                                        config: dict = None,
 279                                        num_units: int = 1,
 280                                        vca_type: str = None
 281                                        ):
 282         # nothing to do
 283         pass
 284
 285     async def add_relation(self, ee_id_1: str, ee_id_2: str, endpoint_1: str, endpoint_2: str):
 286         # nothing to do
 287         pass
 288
 289     async def remove_relation(self):
 290         # nothing to to
 291         pass
 292
 293     async def get_status(self, namespace: str, yaml_format: bool = True):
 294         # not used for this connector
 295         pass
 296
 297     async def get_ee_ssh_public__key(self, ee_id: str, db_dict: dict, progress_timeout: float = None,
 298                                      total_timeout: float = None) -> str:
 299         """
 300         Obtains ssh-public key from ee executing GetSShKey method from the ee.
 301
 302         :param str ee_id: the id of the execution environment returned by
 303             create_execution_environment or register_execution_environment
 304         :param dict db_dict:
 305         :param float progress_timeout:
 306         :param float total_timeout:
 307         :returns: public key of the execution environment
 308         """
 309
 310         self.log.info(
 311             "get_ee_ssh_public_key: ee_id: {}, db_dict: {}".format(
 312                 ee_id, db_dict)
 313         )
 314
 315         # check arguments
 316         if ee_id is None or len(ee_id) == 0:
 317             raise N2VCBadArgumentsException(
 318                 message="ee_id is mandatory", bad_args=["ee_id"]
 319             )
 320
 321         try:
 322             # Obtain ip_addr for the ee service, it is resolved by dns from the ee name by kubernetes
 323             version, namespace, helm_id = self._get_ee_id_parts(ee_id)
 324             ip_addr = socket.gethostbyname(helm_id)
 325
 326             # Obtain ssh_key from the ee, this method will implement retries to allow the ee
 327             # install libraries and start successfully
 328             ssh_key = await self._get_ssh_key(ip_addr)
 329             return ssh_key
 330         except Exception as e:
 331             self.log.error("Error obtaining ee ssh_key: {}".format(e), exc_info=True)
 332             raise N2VCException("Error obtaining ee ssh_ke: {}".format(e))
 333
 334     async def exec_primitive(self, ee_id: str, primitive_name: str, params_dict: dict, db_dict: dict = None,
 335                              progress_timeout: float = None, total_timeout: float = None) -> str:
 336         """
 337         Execute a primitive in the execution environment
 338
 339         :param str ee_id: the one returned by create_execution_environment or
 340             register_execution_environment with the format namespace.helm_id
 341         :param str primitive_name: must be one defined in the software. There is one
 342             called 'config', where, for the proxy case, the 'credentials' of VM are
 343             provided
 344         :param dict params_dict: parameters of the action
 345         :param dict db_dict: where to write into database when the status changes.
 346                         It contains a dict with
 347                             {collection: <str>, filter: {},  path: <str>},
 348                             e.g. {collection: "nslcmops", filter:
 349                                 {_id: <nslcmop_id>, path: "_admin.VCA"}
 350                         It will be used to store information about intermediate notifications
 351         :param float progress_timeout:
 352         :param float total_timeout:
 353         :returns str: primitive result, if ok. It raises exceptions in case of fail
 354         """
 355
 356         self.log.info("exec primitive for ee_id : {}, primitive_name: {}, params_dict: {}, db_dict: {}".format(
 357             ee_id, primitive_name, params_dict, db_dict
 358         ))
 359
 360         # check arguments
 361         if ee_id is None or len(ee_id) == 0:
 362             raise N2VCBadArgumentsException(
 363                 message="ee_id is mandatory", bad_args=["ee_id"]
 364             )
 365         if primitive_name is None or len(primitive_name) == 0:
 366             raise N2VCBadArgumentsException(
 367                 message="action_name is mandatory", bad_args=["action_name"]
 368             )
 369         if params_dict is None:
 370             params_dict = dict()
 371
 372         try:
 373             version, namespace, helm_id = self._get_ee_id_parts(ee_id)
 374             ip_addr = socket.gethostbyname(helm_id)
 375         except Exception as e:
 376             self.log.error("Error getting ee ip ee: {}".format(e))
 377             raise N2VCException("Error getting ee ip ee: {}".format(e))
 378
 379         if primitive_name == "config":
 380             try:
 381                 # Execute config primitive, higher timeout to check the case ee is starting
 382                 status, detailed_message = await self._execute_config_primitive(ip_addr, params_dict, db_dict=db_dict)
 383                 self.log.debug("Executed config primitive ee_id_ {}, status: {}, message: {}".format(
 384                     ee_id, status, detailed_message))
 385                 if status != "OK":
 386                     self.log.error("Error configuring helm ee, status: {}, message: {}".format(
 387                         status, detailed_message))
 388                     raise N2VCExecutionException(
 389                         message="Error configuring helm ee_id: {}, status: {}, message: {}: ".format(
 390                             ee_id, status, detailed_message
 391                         ),
 392                         primitive_name=primitive_name,
 393                     )
 394             except Exception as e:
 395                 self.log.error("Error configuring helm ee: {}".format(e))
 396                 raise N2VCExecutionException(
 397                     message="Error configuring helm ee_id: {}, {}".format(
 398                         ee_id, e
 399                     ),
 400                     primitive_name=primitive_name,
 401                 )
 402             return "CONFIG OK"
 403         else:
 404             try:
 405                 # Execute primitive
 406                 status, detailed_message = await self._execute_primitive(ip_addr, primitive_name,
 407                                                                          params_dict, db_dict=db_dict)
 408                 self.log.debug("Executed primitive {} ee_id_ {}, status: {}, message: {}".format(
 409                     primitive_name, ee_id, status, detailed_message))
 410                 if status != "OK" and status != "PROCESSING":
 411                     self.log.error(
 412                         "Execute primitive {} returned not ok status: {}, message: {}".format(
 413                             primitive_name, status, detailed_message)
 414                     )
 415                     raise N2VCExecutionException(
 416                         message="Execute primitive {} returned not ok status: {}, message: {}".format(
 417                             primitive_name, status, detailed_message
 418                         ),
 419                         primitive_name=primitive_name,
 420                     )
 421             except Exception as e:
 422                 self.log.error(
 423                     "Error executing primitive {}: {}".format(primitive_name, e)
 424                 )
 425                 raise N2VCExecutionException(
 426                     message="Error executing primitive {} into ee={} : {}".format(
 427                         primitive_name, ee_id, e
 428                     ),
 429                     primitive_name=primitive_name,
 430                 )
 431             return detailed_message
 432
 433     async def deregister_execution_environments(self):
 434         # nothing to be done
 435         pass
 436
 437     async def delete_execution_environment(self, ee_id: str, db_dict: dict = None, total_timeout: float = None):
 438         """
 439         Delete an execution environment
 440         :param str ee_id: id of the execution environment to delete, included namespace.helm_id
 441         :param dict db_dict: where to write into database when the status changes.
 442                         It contains a dict with
 443                             {collection: <str>, filter: {},  path: <str>},
 444                             e.g. {collection: "nsrs", filter:
 445                                 {_id: <nsd-id>, path: "_admin.deployed.VCA.3"}
 446         :param float total_timeout:
 447         """
 448
 449         self.log.info("ee_id: {}".format(ee_id))
 450
 451         # check arguments
 452         if ee_id is None:
 453             raise N2VCBadArgumentsException(
 454                 message="ee_id is mandatory", bad_args=["ee_id"]
 455             )
 456
 457         try:
 458
 459             # Obtain cluster_uuid
 460             system_cluster_uuid = await self._get_system_cluster_id()
 461
 462             # Get helm_id
 463             version, namespace, helm_id = self._get_ee_id_parts(ee_id)
 464
 465             # Uninstall chart, for backward compatibility we must assume that if there is no
 466             # version it is helm-v2
 467             if version == "helm-v3":
 468                 await self._k8sclusterhelm3.uninstall(system_cluster_uuid, helm_id)
 469             else:
 470                 await self._k8sclusterhelm2.uninstall(system_cluster_uuid, helm_id)
 471             self.log.info("ee_id: {} deleted".format(ee_id))
 472         except N2VCException:
 473             raise
 474         except Exception as e:
 475             self.log.error("Error deleting ee id: {}: {}".format(ee_id, e), exc_info=True)
 476             raise N2VCException("Error deleting ee id {}: {}".format(ee_id, e))
 477
 478     async def delete_namespace(self, namespace: str, db_dict: dict = None, total_timeout: float = None):
 479         # method not implemented for this connector, execution environments must be deleted individually
 480         pass
 481
 482     async def install_k8s_proxy_charm(
 483         self,
 484         charm_name: str,
 485         namespace: str,
 486         artifact_path: str,
 487         db_dict: dict,
 488         progress_timeout: float = None,
 489         total_timeout: float = None,
 490         config: dict = None,
 491         *kargs, **kwargs
 492     ) -> str:
 493         pass
 494
 495     @retryer(max_wait_time_var="_initial_retry_time", delay_time_var="_retry_delay")
 496     async def _get_ssh_key(self, ip_addr):
 497         channel = Channel(ip_addr, self._ee_service_port)
 498         try:
 499             stub = FrontendExecutorStub(channel)
 500             self.log.debug("get ssh key, ip_addr: {}".format(ip_addr))
 501             reply: SshKeyReply = await stub.GetSshKey(SshKeyRequest())
 502             return reply.message
 503         finally:
 504             channel.close()
 505
 506     @retryer(max_wait_time_var="_initial_retry_time", delay_time_var="_retry_delay")
 507     async def _execute_config_primitive(self, ip_addr, params, db_dict=None):
 508         return await self._execute_primitive_internal(ip_addr, "config", params, db_dict=db_dict)
 509
 510     @retryer(max_wait_time_var="_max_retry_time", delay_time_var="_retry_delay")
 511     async def _execute_primitive(self, ip_addr, primitive_name, params, db_dict=None):
 512         return await self._execute_primitive_internal(ip_addr, primitive_name, params, db_dict=db_dict)
 513
 514     async def _execute_primitive_internal(self, ip_addr, primitive_name, params, db_dict=None):
 515
 516         channel = Channel(ip_addr, self._ee_service_port)
 517         try:
 518             stub = FrontendExecutorStub(channel)
 519             async with stub.RunPrimitive.open() as stream:
 520                 primitive_id = str(uuid.uuid1())
 521                 result = None
 522                 self.log.debug("Execute primitive internal: id:{}, name:{}, params: {}".
 523                                format(primitive_id, primitive_name, params))
 524                 await stream.send_message(
 525                     PrimitiveRequest(id=primitive_id, name=primitive_name, params=yaml.dump(params)), end=True)
 526                 async for reply in stream:
 527                     self.log.debug("Received reply: {}".format(reply))
 528                     result = reply
 529                     # If db_dict provided write notifs in database
 530                     if db_dict:
 531                         self._write_op_detailed_status(db_dict, reply.status, reply.detailed_message)
 532                 if result:
 533                     return reply.status, reply.detailed_message
 534                 else:
 535                     return "ERROR", "No result received"
 536         finally:
 537             channel.close()
 538
 539     def _write_op_detailed_status(self, db_dict, status, detailed_message):
 540
 541         # write ee_id to database: _admin.deployed.VCA.x
 542         try:
 543             the_table = db_dict["collection"]
 544             the_filter = db_dict["filter"]
 545             update_dict = {"detailed-status": "{}: {}".format(status, detailed_message)}
 546             # self.log.debug('Writing ee_id to database: {}'.format(the_path))
 547             self.db.set_one(
 548                 table=the_table,
 549                 q_filter=the_filter,
 550                 update_dict=update_dict,
 551                 fail_on_empty=True,
 552             )
 553         except asyncio.CancelledError:
 554             raise
 555         except Exception as e:
 556             self.log.error("Error writing detailedStatus to database: {}".format(e))
 557
 558     async def _get_system_cluster_id(self):
 559         if not self._system_cluster_id:
 560             db_k8cluster = self.db.get_one("k8sclusters", {"name": self._KUBECTL_OSM_CLUSTER_NAME})
 561             k8s_hc_id = deep_get(db_k8cluster, ("_admin", "helm-chart-v3", "id"))
 562             if not k8s_hc_id:
 563                 try:
 564                     # backward compatibility for existing clusters that have not been initialized for helm v3
 565                     cluster_id = db_k8cluster.get("_id")
 566                     k8s_credentials = yaml.safe_dump(db_k8cluster.get("credentials"))
 567                     k8s_hc_id, uninstall_sw = await self._k8sclusterhelm3.init_env(k8s_credentials,
 568                                                                                    reuse_cluster_uuid=cluster_id)
 569                     db_k8scluster_update = {"_admin.helm-chart-v3.error_msg": None,
 570                                             "_admin.helm-chart-v3.id": k8s_hc_id,
 571                                             "_admin.helm-chart-v3}.created": uninstall_sw,
 572                                             "_admin.helm-chart-v3.operationalState": "ENABLED"}
 573                     self.update_db_2("k8sclusters", cluster_id, db_k8scluster_update)
 574                 except Exception as e:
 575                     self.log.error("error initializing helm-v3 cluster: {}".format(str(e)))
 576                     raise N2VCException("K8s system cluster '{}' has not been initialized for helm-chart-v3".format(
 577                         cluster_id))
 578             self._system_cluster_id = k8s_hc_id
 579         return self._system_cluster_id
 580
 581     def _get_ee_id_parts(self, ee_id):
 582         """
 583         Parses ee_id stored at database that can be either 'version:namespace.helm_id' or only
 584         namespace.helm_id for backward compatibility
 585         If exists helm version can be helm-v3 or helm (helm-v2 old version)
 586         """
 587         version, _, part_id = ee_id.rpartition(':')
 588         namespace, _, helm_id = part_id.rpartition('.')
 589         return version, namespace, helm_id