osm_lcm/lcm_helm_conn.py

   1 ##
   2 # Copyright 2020 Telefonica Investigacion y Desarrollo, S.A.U.
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License");
   5 # you may not use this file except in compliance with the License.
   6 # You may obtain a copy of the License at
   7 #
   8 #    http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  13 # implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16 #
  17 ##
  18 import functools
  19 import yaml
  20 import asyncio
  21 import socket
  22 import uuid
  23 import os
  24
  25 from grpclib.client import Channel
  26
  27 from osm_lcm.frontend_pb2 import PrimitiveRequest
  28 from osm_lcm.frontend_pb2 import SshKeyRequest, SshKeyReply
  29 from osm_lcm.frontend_grpc import FrontendExecutorStub
  30
  31 from n2vc.n2vc_conn import N2VCConnector
  32 from n2vc.k8s_helm_conn import K8sHelmConnector
  33 from n2vc.exceptions import N2VCBadArgumentsException, N2VCException, N2VCExecutionException
  34
  35 from osm_lcm.lcm_utils import deep_get
  36
  37
  38 def retryer(max_wait_time=60, delay_time=10):
  39     def wrapper(func):
  40         retry_exceptions = (
  41             ConnectionRefusedError
  42         )
  43
  44         @functools.wraps(func)
  45         async def wrapped(*args, **kwargs):
  46             wait_time = max_wait_time
  47             while wait_time > 0:
  48                 try:
  49                     return await func(*args, **kwargs)
  50                 except retry_exceptions:
  51                     wait_time = wait_time - delay_time
  52                     await asyncio.sleep(delay_time)
  53                     continue
  54             else:
  55                 return ConnectionRefusedError
  56         return wrapped
  57     return wrapper
  58
  59
class LCMHelmConn(N2VCConnector):
    """
    N2VC connector whose execution environments (EE) are deployed as helm charts.

    Charts are installed through a K8sHelmConnector in the osm system cluster, and the
    deployed EE is then reached through the grpc FrontendExecutor service.
    """
    # Kubernetes namespace where the EE helm charts are installed
    _KUBECTL_OSM_NAMESPACE = "osm"
    # Name used to look up the osm system cluster record in the "k8sclusters" collection
    _KUBECTL_OSM_CLUSTER_NAME = "_system-osm-k8s"
    # Port used to open the grpc channel towards the EE
    _EE_SERVICE_PORT = 50050

    # Time between retries (seconds, handed to the retryer decorator)
    _EE_RETRY_DELAY = 10
    # Initial max retry time (seconds): used for the ssh key request and the config primitive
    _MAX_INITIAL_RETRY_TIME = 300
    # Other retry time (seconds): used for the rest of the primitives
    _MAX_RETRY_TIME = 30
  71
  72     def __init__(self,
  73                  db: object,
  74                  fs: object,
  75                  log: object = None,
  76                  loop: object = None,
  77                  url: str = None,
  78                  username: str = None,
  79                  vca_config: dict = None,
  80                  on_update_db=None, ):
  81         """
  82         Initialize EE helm connector.
  83         """
  84
  85         # parent class constructor
  86         N2VCConnector.__init__(
  87             self,
  88             db=db,
  89             fs=fs,
  90             log=log,
  91             loop=loop,
  92             url=url,
  93             username=username,
  94             vca_config=vca_config,
  95             on_update_db=on_update_db,
  96         )
  97
  98         self.log.debug("Initialize helm N2VC connector")
  99
 100         # TODO - Obtain data from configuration
 101         self._ee_service_port = self._EE_SERVICE_PORT
 102
 103         self._retry_delay = self._EE_RETRY_DELAY
 104         self._max_retry_time = self._MAX_RETRY_TIME
 105         self._initial_retry_time = self._MAX_INITIAL_RETRY_TIME
 106
 107         # initialize helm connector
 108         self._k8sclusterhelm = K8sHelmConnector(
 109             kubectl_command=self.vca_config.get("kubectlpath"),
 110             helm_command=self.vca_config.get("helmpath"),
 111             fs=self.fs,
 112             log=self.log,
 113             db=self.db,
 114             on_update_db=None,
 115         )
 116
 117         self._system_cluster_id = None
 118         self.log.info("Helm N2VC connector initialized")
 119
 120     # TODO - ¿reuse_ee_id?
 121     async def create_execution_environment(self,
 122                                            namespace: str,
 123                                            db_dict: dict,
 124                                            reuse_ee_id: str = None,
 125                                            progress_timeout: float = None,
 126                                            total_timeout: float = None,
 127                                            config: dict = None,
 128                                            artifact_path: str = None,
 129                                            vca_type: str = None) -> (str, dict):
 130         """
 131         Creates a new helm execution environment deploying the helm-chat indicated in the
 132         attifact_path
 133         :param str namespace: This param is not used, all helm charts are deployed in the osm
 134         system namespace
 135         :param dict db_dict: where to write to database when the status changes.
 136             It contains a dictionary with {collection: str, filter: {},  path: str},
 137                 e.g. {collection: "nsrs", filter: {_id: <nsd-id>, path:
 138                 "_admin.deployed.VCA.3"}
 139         :param str reuse_ee_id: ee id from an older execution. TODO - right now this params is not used
 140         :param float progress_timeout:
 141         :param float total_timeout:
 142         :param dict config:  General variables to instantiate KDU
 143         :param str artifact_path:  path of package content
 144         :param str vca_type:  Type of vca, not used as assumed of type helm
 145         :returns str, dict: id of the new execution environment including namespace.helm_id
 146         and credentials object set to None as all credentials should be osm kubernetes .kubeconfig
 147         """
 148
 149         self.log.info(
 150             "create_execution_environment: namespace: {}, artifact_path: {}, db_dict: {}, "
 151             "reuse_ee_id: {}".format(
 152                 namespace, artifact_path, db_dict, reuse_ee_id)
 153         )
 154
 155         # Validate artifact-path is provided
 156         if artifact_path is None or len(artifact_path) == 0:
 157             raise N2VCBadArgumentsException(
 158                 message="artifact_path is mandatory", bad_args=["artifact_path"]
 159             )
 160
 161         # Validate artifact-path exists and sync path
 162         from_path = os.path.split(artifact_path)[0]
 163         self.fs.sync(from_path)
 164
 165         # remove / in charm path
 166         while artifact_path.find("//") >= 0:
 167             artifact_path = artifact_path.replace("//", "/")
 168
 169         # check charm path
 170         if self.fs.file_exists(artifact_path):
 171             helm_chart_path = artifact_path
 172         else:
 173             msg = "artifact path does not exist: {}".format(artifact_path)
 174             raise N2VCBadArgumentsException(message=msg, bad_args=["artifact_path"])
 175
 176         if artifact_path.startswith("/"):
 177             full_path = self.fs.path + helm_chart_path
 178         else:
 179             full_path = self.fs.path + "/" + helm_chart_path
 180
 181         try:
 182             # Call helm conn install
 183             # Obtain system cluster id from database
 184             system_cluster_uuid = self._get_system_cluster_id()
 185             # Add parameter osm if exist to global
 186             if config and config.get("osm"):
 187                 if not config.get("global"):
 188                     config["global"] = {}
 189                 config["global"]["osm"] = config.get("osm")
 190
 191             self.log.debug("install helm chart: {}".format(full_path))
 192             helm_id = await self._k8sclusterhelm.install(system_cluster_uuid, kdu_model=full_path,
 193                                                          namespace=self._KUBECTL_OSM_NAMESPACE,
 194                                                          params=config,
 195                                                          db_dict=db_dict,
 196                                                          timeout=progress_timeout)
 197
 198             ee_id = "{}.{}".format(self._KUBECTL_OSM_NAMESPACE, helm_id)
 199             return ee_id, None
 200         except N2VCException:
 201             raise
 202         except Exception as e:
 203             self.log.error("Error deploying chart ee: {}".format(e), exc_info=True)
 204             raise N2VCException("Error deploying chart ee: {}".format(e))
 205
 206     async def register_execution_environment(self, namespace: str, credentials: dict, db_dict: dict,
 207                                              progress_timeout: float = None, total_timeout: float = None) -> str:
 208         # nothing to do
 209         pass
 210
 211     async def install_configuration_sw(self,
 212                                        ee_id: str,
 213                                        artifact_path: str,
 214                                        db_dict: dict,
 215                                        progress_timeout: float = None,
 216                                        total_timeout: float = None,
 217                                        config: dict = None,
 218                                        num_units: int = 1,
 219                                        vca_type: str = None
 220                                        ):
 221         # nothing to do
 222         pass
 223
 224     async def add_relation(self, ee_id_1: str, ee_id_2: str, endpoint_1: str, endpoint_2: str):
 225         # nothing to do
 226         pass
 227
 228     async def remove_relation(self):
 229         # nothing to to
 230         pass
 231
 232     async def get_status(self, namespace: str, yaml_format: bool = True):
 233         # not used for this connector
 234         pass
 235
 236     async def get_ee_ssh_public__key(self, ee_id: str, db_dict: dict, progress_timeout: float = None,
 237                                      total_timeout: float = None) -> str:
 238         """
 239         Obtains ssh-public key from ee executing GetSShKey method from the ee.
 240
 241         :param str ee_id: the id of the execution environment returned by
 242             create_execution_environment or register_execution_environment
 243         :param dict db_dict:
 244         :param float progress_timeout:
 245         :param float total_timeout:
 246         :returns: public key of the execution environment
 247         """
 248
 249         self.log.info(
 250             "get_ee_ssh_public_key: ee_id: {}, db_dict: {}".format(
 251                 ee_id, db_dict)
 252         )
 253
 254         # check arguments
 255         if ee_id is None or len(ee_id) == 0:
 256             raise N2VCBadArgumentsException(
 257                 message="ee_id is mandatory", bad_args=["ee_id"]
 258             )
 259
 260         try:
 261             # Obtain ip_addr for the ee service, it is resolved by dns from the ee name by kubernetes
 262             namespace, helm_id = self._get_ee_id_parts(ee_id)
 263             ip_addr = socket.gethostbyname(helm_id)
 264
 265             # Obtain ssh_key from the ee, this method will implement retries to allow the ee
 266             # install libraries and start successfully
 267             ssh_key = await self._get_ssh_key(ip_addr)
 268             return ssh_key
 269         except Exception as e:
 270             self.log.error("Error obtaining ee ssh_key: {}".format(e), exc_info=True)
 271             raise N2VCException("Error obtaining ee ssh_ke: {}".format(e))
 272
 273     async def exec_primitive(self, ee_id: str, primitive_name: str, params_dict: dict, db_dict: dict = None,
 274                              progress_timeout: float = None, total_timeout: float = None) -> str:
 275         """
 276         Execute a primitive in the execution environment
 277
 278         :param str ee_id: the one returned by create_execution_environment or
 279             register_execution_environment with the format namespace.helm_id
 280         :param str primitive_name: must be one defined in the software. There is one
 281             called 'config', where, for the proxy case, the 'credentials' of VM are
 282             provided
 283         :param dict params_dict: parameters of the action
 284         :param dict db_dict: where to write into database when the status changes.
 285                         It contains a dict with
 286                             {collection: <str>, filter: {},  path: <str>},
 287                             e.g. {collection: "nslcmops", filter:
 288                                 {_id: <nslcmop_id>, path: "_admin.VCA"}
 289                         It will be used to store information about intermediate notifications
 290         :param float progress_timeout:
 291         :param float total_timeout:
 292         :returns str: primitive result, if ok. It raises exceptions in case of fail
 293         """
 294
 295         self.log.info("exec primitive for ee_id : {}, primitive_name: {}, params_dict: {}, db_dict: {}".format(
 296             ee_id, primitive_name, params_dict, db_dict
 297         ))
 298
 299         # check arguments
 300         if ee_id is None or len(ee_id) == 0:
 301             raise N2VCBadArgumentsException(
 302                 message="ee_id is mandatory", bad_args=["ee_id"]
 303             )
 304         if primitive_name is None or len(primitive_name) == 0:
 305             raise N2VCBadArgumentsException(
 306                 message="action_name is mandatory", bad_args=["action_name"]
 307             )
 308         if params_dict is None:
 309             params_dict = dict()
 310
 311         try:
 312             namespace, helm_id = self._get_ee_id_parts(ee_id)
 313             ip_addr = socket.gethostbyname(helm_id)
 314         except Exception as e:
 315             self.log.error("Error getting ee ip ee: {}".format(e))
 316             raise N2VCException("Error getting ee ip ee: {}".format(e))
 317
 318         if primitive_name == "config":
 319             try:
 320                 # Execute config primitive, higher timeout to check the case ee is starting
 321                 status, detailed_message = await self._execute_config_primitive(ip_addr, params_dict, db_dict=db_dict)
 322                 self.log.debug("Executed config primitive ee_id_ {}, status: {}, message: {}".format(
 323                     ee_id, status, detailed_message))
 324                 if status != "OK":
 325                     self.log.error("Error configuring helm ee, status: {}, message: {}".format(
 326                         status, detailed_message))
 327                     raise N2VCExecutionException(
 328                         message="Error configuring helm ee_id: {}, status: {}, message: {}: ".format(
 329                             ee_id, status, detailed_message
 330                         ),
 331                         primitive_name=primitive_name,
 332                     )
 333             except Exception as e:
 334                 self.log.error("Error configuring helm ee: {}".format(e))
 335                 raise N2VCExecutionException(
 336                     message="Error configuring helm ee_id: {}, {}".format(
 337                         ee_id, e
 338                     ),
 339                     primitive_name=primitive_name,
 340                 )
 341             return "CONFIG OK"
 342         else:
 343             try:
 344                 # Execute primitive
 345                 status, detailed_message = await self._execute_primitive(ip_addr, primitive_name,
 346                                                                          params_dict, db_dict=db_dict)
 347                 self.log.debug("Executed primitive {} ee_id_ {}, status: {}, message: {}".format(
 348                     primitive_name, ee_id, status, detailed_message))
 349                 if status != "OK" and status != "PROCESSING":
 350                     self.log.error(
 351                         "Execute primitive {} returned not ok status: {}, message: {}".format(
 352                             primitive_name, status, detailed_message)
 353                     )
 354                     raise N2VCExecutionException(
 355                         message="Execute primitive {} returned not ok status: {}, message: {}".format(
 356                             primitive_name, status, detailed_message
 357                         ),
 358                         primitive_name=primitive_name,
 359                     )
 360             except Exception as e:
 361                 self.log.error(
 362                     "Error executing primitive {}: {}".format(primitive_name, e)
 363                 )
 364                 raise N2VCExecutionException(
 365                     message="Error executing primitive {} into ee={} : {}".format(
 366                         primitive_name, ee_id, e
 367                     ),
 368                     primitive_name=primitive_name,
 369                 )
 370             return detailed_message
 371
 372     async def deregister_execution_environments(self):
 373         # nothing to be done
 374         pass
 375
 376     async def delete_execution_environment(self, ee_id: str, db_dict: dict = None, total_timeout: float = None):
 377         """
 378         Delete an execution environment
 379         :param str ee_id: id of the execution environment to delete, included namespace.helm_id
 380         :param dict db_dict: where to write into database when the status changes.
 381                         It contains a dict with
 382                             {collection: <str>, filter: {},  path: <str>},
 383                             e.g. {collection: "nsrs", filter:
 384                                 {_id: <nsd-id>, path: "_admin.deployed.VCA.3"}
 385         :param float total_timeout:
 386         """
 387
 388         self.log.info("ee_id: {}".format(ee_id))
 389
 390         # check arguments
 391         if ee_id is None:
 392             raise N2VCBadArgumentsException(
 393                 message="ee_id is mandatory", bad_args=["ee_id"]
 394             )
 395
 396         try:
 397
 398             # Obtain cluster_uuid
 399             system_cluster_uuid = self._get_system_cluster_id()
 400
 401             # Get helm_id
 402             namespace, helm_id = self._get_ee_id_parts(ee_id)
 403
 404             # Uninstall chart
 405             await self._k8sclusterhelm.uninstall(system_cluster_uuid, helm_id)
 406             self.log.info("ee_id: {} deleted".format(ee_id))
 407         except N2VCException:
 408             raise
 409         except Exception as e:
 410             self.log.error("Error deleting ee id: {}: {}".format(ee_id, e), exc_info=True)
 411             raise N2VCException("Error deleting ee id {}: {}".format(ee_id, e))
 412
 413     async def delete_namespace(self, namespace: str, db_dict: dict = None, total_timeout: float = None):
 414         # method not implemented for this connector, execution environments must be deleted individually
 415         pass
 416
 417     async def install_k8s_proxy_charm(
 418         self,
 419         charm_name: str,
 420         namespace: str,
 421         artifact_path: str,
 422         db_dict: dict,
 423         progress_timeout: float = None,
 424         total_timeout: float = None,
 425         config: dict = None,
 426     ) -> str:
 427         pass
 428
 429     @retryer(max_wait_time=_MAX_INITIAL_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
 430     async def _get_ssh_key(self, ip_addr):
 431         channel = Channel(ip_addr, self._ee_service_port)
 432         try:
 433             stub = FrontendExecutorStub(channel)
 434             self.log.debug("get ssh key, ip_addr: {}".format(ip_addr))
 435             reply: SshKeyReply = await stub.GetSshKey(SshKeyRequest())
 436             return reply.message
 437         finally:
 438             channel.close()
 439
 440     @retryer(max_wait_time=_MAX_INITIAL_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
 441     async def _execute_config_primitive(self, ip_addr, params, db_dict=None):
 442         return await self._execute_primitive_internal(ip_addr, "config", params, db_dict=db_dict)
 443
 444     @retryer(max_wait_time=_MAX_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
 445     async def _execute_primitive(self, ip_addr, primitive_name, params, db_dict=None):
 446         return await  self._execute_primitive_internal(ip_addr, primitive_name, params, db_dict=db_dict)
 447
 448     async def _execute_primitive_internal(self, ip_addr, primitive_name, params, db_dict=None):
 449
 450         channel = Channel(ip_addr, self._ee_service_port)
 451         try:
 452             stub = FrontendExecutorStub(channel)
 453             async with stub.RunPrimitive.open() as stream:
 454                 primitive_id = str(uuid.uuid1())
 455                 result = None
 456                 self.log.debug("Execute primitive internal: id:{}, name:{}, params: {}".
 457                                format(primitive_id, primitive_name, params))
 458                 await stream.send_message(
 459                     PrimitiveRequest(id=primitive_id, name=primitive_name, params=yaml.dump(params)), end=True)
 460                 async for reply in stream:
 461                     self.log.debug("Received reply: {}".format(reply))
 462                     result = reply
 463                     # If db_dict provided write notifs in database
 464                     if db_dict:
 465                         self._write_op_detailed_status(db_dict, reply.status, reply.detailed_message)
 466                 if result:
 467                     return reply.status, reply.detailed_message
 468                 else:
 469                     return "ERROR", "No result received"
 470         finally:
 471             channel.close()
 472
 473     def _write_op_detailed_status(self, db_dict, status, detailed_message):
 474
 475         # write ee_id to database: _admin.deployed.VCA.x
 476         try:
 477             the_table = db_dict["collection"]
 478             the_filter = db_dict["filter"]
 479             update_dict = {"detailed-status": "{}: {}".format(status, detailed_message)}
 480             # self.log.debug('Writing ee_id to database: {}'.format(the_path))
 481             self.db.set_one(
 482                 table=the_table,
 483                 q_filter=the_filter,
 484                 update_dict=update_dict,
 485                 fail_on_empty=True,
 486             )
 487         except asyncio.CancelledError:
 488             raise
 489         except Exception as e:
 490             self.log.error("Error writing detailedStatus to database: {}".format(e))
 491
 492     def _get_system_cluster_id(self):
 493         if not self._system_cluster_id:
 494             db_k8cluster = self.db.get_one("k8sclusters", {"name": self._KUBECTL_OSM_CLUSTER_NAME})
 495             k8s_hc_id = deep_get(db_k8cluster, ("_admin", "helm-chart", "id"))
 496             if not k8s_hc_id:
 497                 self.log.error("osm system cluster has not been properly initialized for helm connector, "
 498                                "helm-chart id is not defined")
 499                 raise N2VCException("osm system cluster has not been properly initialized for helm connector")
 500             self._system_cluster_id = k8s_hc_id
 501         return self._system_cluster_id
 502
 503     def _get_ee_id_parts(self, ee_id):
 504         namespace, _, helm_id = ee_id.partition('.')
 505         return namespace, helm_id