X-Git-Url: https://osm.etsi.org/gitweb/?a=blobdiff_plain;f=osm_lcm%2Fns.py;h=e00f0d274b53dbdc70ec2012805e667f394eab90;hb=refs%2Fchanges%2F77%2F14077%2F2;hp=9ceb60984657cf02bd1e52c182062bdd5304cfe4;hpb=6316219125d503815baf8a869fbacc8c252876ad;p=osm%2FLCM.git diff --git a/osm_lcm/ns.py b/osm_lcm/ns.py index 9ceb609..e00f0d2 100644 --- a/osm_lcm/ns.py +++ b/osm_lcm/ns.py @@ -98,7 +98,6 @@ from osm_lcm.data_utils.vnfr import ( from osm_lcm.data_utils.dict_utils import parse_yaml_strings from osm_lcm.data_utils.database.vim_account import VimAccountDB from n2vc.definitions import RelationEndpoint -from n2vc.k8s_helm_conn import K8sHelmConnector from n2vc.k8s_helm3_conn import K8sHelm3Connector from n2vc.k8s_juju_conn import K8sJujuConnector @@ -133,7 +132,16 @@ class NsLcm(LcmBase): SUBOPERATION_STATUS_NOT_FOUND = -1 SUBOPERATION_STATUS_NEW = -2 SUBOPERATION_STATUS_SKIP = -3 + EE_TLS_NAME = "ee-tls" task_name_deploy_vca = "Deploying VCA" + rel_operation_types = { + "GE": ">=", + "LE": "<=", + "GT": ">", + "LT": "<", + "EQ": "==", + "NE": "!=", + } def __init__(self, msg, lcm_tasks, config: LcmCfg): """ @@ -164,15 +172,6 @@ class NsLcm(LcmBase): on_update_db=self._on_update_n2vc_db, ) - self.k8sclusterhelm2 = K8sHelmConnector( - kubectl_command=self.vca_config.kubectlpath, - helm_command=self.vca_config.helmpath, - log=self.logger, - on_update_db=None, - fs=self.fs, - db=self.db, - ) - self.k8sclusterhelm3 = K8sHelm3Connector( kubectl_command=self.vca_config.kubectlpath, helm_command=self.vca_config.helm3path, @@ -192,7 +191,6 @@ class NsLcm(LcmBase): ) self.k8scluster_map = { - "helm-chart": self.k8sclusterhelm2, "helm-chart-v3": self.k8sclusterhelm3, "chart": self.k8sclusterhelm3, "juju-bundle": self.k8sclusterjuju, @@ -895,7 +893,7 @@ class NsLcm(LcmBase): ) vdur = next((vdur for vdur in target_vnf.get("vdur", ())), None) if not vdur: - return + continue for a_index, a_vld in enumerate(target["ns"]["vld"]): target_vld = find_in_list( get_iterable(vdur, "interfaces"), @@ -1804,7 +1802,7 @@ class NsLcm(LcmBase): vca_id = self.get_vca_id(db_vnfr, db_nsr) # create or register execution environment in VCA - if vca_type in ("lxc_proxy_charm", "k8s_proxy_charm", "helm", "helm-v3"): + if vca_type in ("lxc_proxy_charm", "k8s_proxy_charm", "helm-v3"): self._write_configuration_status( nsr_id=nsr_id, vca_index=vca_index, @@ -1826,11 +1824,11 @@ class NsLcm(LcmBase): db_dict=db_dict, vca_id=vca_id, ) - elif vca_type == "helm" or vca_type == "helm-v3": + elif vca_type == "helm-v3": ee_id, credentials = await self.vca_map[ vca_type ].create_execution_environment( - namespace=namespace, + namespace=nsr_id, reuse_ee_id=ee_id, db_dict=db_dict, config=osm_config, @@ -1972,7 +1970,7 @@ class NsLcm(LcmBase): # if SSH access is required, then get execution environment SSH public # if native charm we have waited already to VM be UP - if vca_type in ("k8s_proxy_charm", "lxc_proxy_charm", "helm", "helm-v3"): + if vca_type in ("k8s_proxy_charm", "lxc_proxy_charm", "helm-v3"): pub_key = None user = None # self.logger.debug("get ssh key block") @@ -2114,7 +2112,7 @@ class NsLcm(LcmBase): # TODO register in database that primitive is done # STEP 7 Configure metrics - if vca_type == "helm" or vca_type == "helm-v3": + if vca_type == "helm-v3": # TODO: review for those cases where the helm chart is a reference and # is not part of the NF package prometheus_jobs = await self.extract_prometheus_scrape_jobs( @@ -2381,14 +2379,6 @@ class NsLcm(LcmBase): df = vnfd.get("df", [{}])[0] # Checking for auto-scaling configuration if "scaling-aspect" in df: - rel_operation_types = { - "GE": ">=", - "LE": "<=", - "GT": ">", - "LT": "<", - "EQ": "==", - "NE": "!=", - } scaling_aspects = df["scaling-aspect"] all_vnfd_monitoring_params = {} for ivld in vnfd.get("int-virtual-link-desc", ()): @@ -2464,7 +2454,9 @@ class NsLcm(LcmBase): operation = scaling_criteria[ "scale-in-relational-operation" ] - rel_operator = rel_operation_types.get(operation, "<=") + rel_operator = self.rel_operation_types.get( + operation, "<=" + ) metric_selector = f'{metric_name}{{ns_id="{nsr_id}", vnf_member_index="{vnf_member_index}", vdu_id="{vdu_id}"}}' expression = f"(count ({metric_selector}) > {instances_min_number}) and (avg({metric_selector}) {rel_operator} {scalein_threshold})" labels = { @@ -2505,7 +2497,9 @@ class NsLcm(LcmBase): operation = scaling_criteria[ "scale-out-relational-operation" ] - rel_operator = rel_operation_types.get(operation, "<=") + rel_operator = self.rel_operation_types.get( + operation, "<=" + ) metric_selector = f'{metric_name}{{ns_id="{nsr_id}", vnf_member_index="{vnf_member_index}", vdu_id="{vdu_id}"}}' expression = f"(count ({metric_selector}) < {instances_max_number}) and (avg({metric_selector}) {rel_operator} {scaleout_threshold})" labels = { @@ -2541,6 +2535,96 @@ class NsLcm(LcmBase): alerts.append(alert) return alerts + def _gather_vnfr_alarm_alerts(self, vnfr, vnfd): + alerts = [] + nsr_id = vnfr["nsr-id-ref"] + vnf_member_index = vnfr["member-vnf-index-ref"] + + # Checking for VNF alarm configuration + for vdur in vnfr["vdur"]: + vdu_id = vdur["vdu-id-ref"] + vdu = next(filter(lambda vdu: vdu["id"] == vdu_id, vnfd["vdu"])) + if "alarm" in vdu: + # Get VDU monitoring params, since alerts are based on them + vdu_monitoring_params = {} + for mp in vdu.get("monitoring-parameter", []): + vdu_monitoring_params[mp.get("id")] = mp + if not vdu_monitoring_params: + self.logger.error( + "VDU alarm refers to a VDU monitoring param, but there are no VDU monitoring params in the VDU" + ) + continue + # Get alarms in the VDU + alarm_descriptors = vdu["alarm"] + # Create VDU alarms for each alarm in the VDU + for alarm_descriptor in alarm_descriptors: + # Check that the VDU alarm refers to a proper monitoring param + alarm_monitoring_param = alarm_descriptor.get( + "vnf-monitoring-param-ref", "" + ) + vdu_specific_monitoring_param = vdu_monitoring_params.get( + alarm_monitoring_param, {} + ) + if not vdu_specific_monitoring_param: + self.logger.error( + "VDU alarm refers to a VDU monitoring param not present in the VDU" + ) + continue + metric_name = vdu_specific_monitoring_param.get( + "performance-metric" + ) + if not metric_name: + self.logger.error( + "VDU alarm refers to a VDU monitoring param that has no associated performance-metric" + ) + continue + # Set params of the alarm to be created in Prometheus + metric_name = f"osm_{metric_name}" + metric_threshold = alarm_descriptor.get("value") + uuid = str(uuid4()) + alert_name = f"vdu_alarm_{uuid}" + operation = alarm_descriptor["operation"] + rel_operator = self.rel_operation_types.get(operation, "<=") + metric_selector = f'{metric_name}{{ns_id="{nsr_id}", vnf_member_index="{vnf_member_index}", vdu_id="{vdu_id}"}}' + expression = f"{metric_selector} {rel_operator} {metric_threshold}" + labels = { + "ns_id": nsr_id, + "vnf_member_index": vnf_member_index, + "vdu_id": vdu_id, + "vdu_name": "{{ $labels.vdu_name }}", + } + prom_cfg = { + "alert": alert_name, + "expr": expression, + "for": "1m", # default value. Ideally, this should be related to an IM param, but there is not such param + "labels": labels, + } + alarm_action = dict() + for action_type in ["ok", "insufficient-data", "alarm"]: + if ( + "actions" in alarm_descriptor + and action_type in alarm_descriptor["actions"] + ): + alarm_action[action_type] = alarm_descriptor["actions"][ + action_type + ] + alert = { + "uuid": uuid, + "name": alert_name, + "metric": metric_name, + "tags": { + "ns_id": nsr_id, + "vnf_member_index": vnf_member_index, + "vdu_id": vdu_id, + }, + "alarm_status": "ok", + "action_type": "vdu_alarm", + "action": alarm_action, + "prometheus_config": prom_cfg, + } + alerts.append(alert) + return alerts + def update_nsrs_with_pla_result(self, params): try: nslcmop_id = deep_get(params, ("placement", "nslcmopId")) @@ -2750,13 +2834,16 @@ class NsLcm(LcmBase): # create namespace and certificate if any helm based EE is present in the NS if check_helm_ee_in_ns(db_vnfds): - # TODO: create EE namespace + await self.vca_map["helm-v3"].setup_ns_namespace( + name=nsr_id, + ) # create TLS certificates await self.vca_map["helm-v3"].create_tls_certificate( - secret_name="ee-tls-{}".format(nsr_id), + secret_name=self.EE_TLS_NAME, dns_prefix="*", nsr_id=nsr_id, usage="server auth", + namespace=nsr_id, ) nsi_id = None # TODO put nsi_id when this nsr belongs to a NSI @@ -3033,7 +3120,15 @@ class NsLcm(LcmBase): stage[1] = stage[2] = "" except asyncio.CancelledError: error_list.append("Cancelled") - # TODO cancel all tasks + await self._cancel_pending_tasks(logging_text, tasks_dict_info) + await self._wait_for_tasks( + logging_text, + tasks_dict_info, + timeout_ns_deploy, + stage, + nslcmop_id, + nsr_id=nsr_id, + ) except Exception as exc: error_list.append(str(exc)) @@ -3091,6 +3186,10 @@ class NsLcm(LcmBase): self.logger.info(f"Storing scaling alert in MongoDB: {alert}") self.db.create("alerts", alert) + alarm_alerts = self._gather_vnfr_alarm_alerts(vnfr, vnfd) + for alert in alarm_alerts: + self.logger.info(f"Storing VNF alarm alert in MongoDB: {alert}") + self.db.create("alerts", alert) if db_nsr: self._write_ns_status( nsr_id=nsr_id, @@ -3118,6 +3217,12 @@ class NsLcm(LcmBase): "nsr_id": nsr_id, "nslcmop_id": nslcmop_id, "operationState": nslcmop_operation_state, + "startTime": db_nslcmop["startTime"], + "links": db_nslcmop["links"], + "operationParams": { + "nsInstanceId": nsr_id, + "nsdId": db_nsr["nsd-id"], + }, }, ) except Exception as e: @@ -3686,9 +3791,11 @@ class NsLcm(LcmBase): vnfr_data.get("_id"), {"kdur.{}.status".format(kdu_index): "ERROR"}, ) - except Exception: + except Exception as error: # ignore to keep original exception - pass + self.logger.warning( + f"An exception occurred while updating DB: {str(error)}" + ) # reraise original error raise @@ -3707,7 +3814,6 @@ class NsLcm(LcmBase): k8scluster_id_2_uuic = { "helm-chart-v3": {}, - "helm-chart": {}, "juju-bundle": {}, } @@ -3806,11 +3912,6 @@ class NsLcm(LcmBase): # Default version: helm3, if helm-version is v2 assign v2 k8sclustertype = "helm-chart-v3" self.logger.debug("kdur: {}".format(kdur)) - if ( - kdur.get("helm-version") - and kdur.get("helm-version") == "v2" - ): - k8sclustertype = "helm-chart" elif kdur.get("juju-bundle"): kdumodel = kdur["juju-bundle"] k8sclustertype = "juju-bundle" @@ -3848,8 +3949,8 @@ class NsLcm(LcmBase): kdumodel = self.fs.path + filename except (asyncio.TimeoutError, asyncio.CancelledError): raise - except Exception: # it is not a file - pass + except Exception as e: # it is not a file + self.logger.warning(f"An exception occurred: {str(e)}") k8s_cluster_id = kdur["k8s-cluster"]["id"] step = "Synchronize repos for k8s cluster '{}'".format( @@ -4031,10 +4132,7 @@ class NsLcm(LcmBase): vca_type = "native_charm" elif ee_item.get("helm-chart"): vca_name = ee_item["helm-chart"] - if ee_item.get("helm-version") and ee_item.get("helm-version") == "v2": - vca_type = "helm" - else: - vca_type = "helm-v3" + vca_type = "helm-v3" else: self.logger.debug( logging_text + "skipping non juju neither charm configuration" @@ -4591,9 +4689,7 @@ class NsLcm(LcmBase): ) and vca.get("needed_terminate") # For helm we must destroy_ee. Also for native_charm, as juju_model cannot be deleted if there are # pending native charms - destroy_ee = ( - True if vca_type in ("helm", "helm-v3", "native_charm") else False - ) + destroy_ee = True if vca_type in ("helm-v3", "native_charm") else False # self.logger.debug(logging_text + "vca_index: {}, ee_id: {}, vca_type: {} destroy_ee: {}".format( # vca_index, vca.get("ee_id"), vca_type, destroy_ee)) task = asyncio.ensure_future( @@ -4646,9 +4742,12 @@ class NsLcm(LcmBase): # Delete Namespace and Certificates if necessary if check_helm_ee_in_ns(list(db_vnfds_from_member_index.values())): await self.vca_map["helm-v3"].delete_tls_certificate( - certificate_name=db_nslcmop["nsInstanceId"], + namespace=db_nslcmop["nsInstanceId"], + certificate_name=self.EE_TLS_NAME, + ) + await self.vca_map["helm-v3"].delete_namespace( + namespace=db_nslcmop["nsInstanceId"], ) - # TODO: Delete namespace # Delete from k8scluster stage[1] = "Deleting KDUs." @@ -4730,7 +4829,14 @@ class NsLcm(LcmBase): stage[1] = stage[2] = "" except asyncio.CancelledError: error_list.append("Cancelled") - # TODO cancell all tasks + await self._cancel_pending_tasks(logging_text, tasks_dict_info) + await self._wait_for_tasks( + logging_text, + tasks_dict_info, + timeout_ns_terminate, + stage, + nslcmop_id, + ) except Exception as exc: error_list.append(str(exc)) # update status at database @@ -4893,6 +4999,11 @@ class NsLcm(LcmBase): self._write_op_status(nslcmop_id, stage) return error_detail_list + async def _cancel_pending_tasks(self, logging_text, created_tasks_info): + for task, name in created_tasks_info.items(): + self.logger.debug(logging_text + "Cancelling task: " + name) + task.cancel() + @staticmethod def _map_primitive_params(primitive_desc, params, instantiation_params): """ @@ -5257,7 +5368,7 @@ class NsLcm(LcmBase): kdu_action = ( True if primitive_name in actions - and kdu["k8scluster-type"] not in ("helm-chart", "helm-chart-v3") + and kdu["k8scluster-type"] != "helm-chart-v3" else False ) @@ -5947,13 +6058,7 @@ class NsLcm(LcmBase): # add chart to list and all parameters step = "Getting helm chart name" chart_name = ee_item.get("helm-chart") - if ( - ee_item.get("helm-version") - and ee_item.get("helm-version") == "v2" - ): - vca_type = "helm" - else: - vca_type = "helm-v3" + vca_type = "helm-v3" step = "Setting Helm chart artifact paths" helm_artifacts.append( @@ -6520,11 +6625,6 @@ class NsLcm(LcmBase): if kdur.get("helm-chart"): k8s_cluster_type = "helm-chart-v3" self.logger.debug("kdur: {}".format(kdur)) - if ( - kdur.get("helm-version") - and kdur.get("helm-version") == "v2" - ): - k8s_cluster_type = "helm-chart" elif kdur.get("juju-bundle"): k8s_cluster_type = "juju-bundle" else: @@ -6653,11 +6753,6 @@ class NsLcm(LcmBase): if kdur.get("helm-chart"): k8s_cluster_type = "helm-chart-v3" self.logger.debug("kdur: {}".format(kdur)) - if ( - kdur.get("helm-version") - and kdur.get("helm-version") == "v2" - ): - k8s_cluster_type = "helm-chart" elif kdur.get("juju-bundle"): k8s_cluster_type = "juju-bundle" else: @@ -7273,16 +7368,31 @@ class NsLcm(LcmBase): exc_info=True, ) finally: + error_list = list() + if exc: + error_list.append(str(exc)) self._write_ns_status( nsr_id=nsr_id, ns_state=None, current_operation="IDLE", current_operation_id=None, ) - if tasks_dict_info: - stage[1] = "Waiting for instantiate pending tasks." - self.logger.debug(logging_text + stage[1]) - exc = await self._wait_for_tasks( + try: + if tasks_dict_info: + stage[1] = "Waiting for instantiate pending tasks." + self.logger.debug(logging_text + stage[1]) + exc = await self._wait_for_tasks( + logging_text, + tasks_dict_info, + self.timeout.ns_deploy, + stage, + nslcmop_id, + nsr_id=nsr_id, + ) + except asyncio.CancelledError: + error_list.append("Cancelled") + await self._cancel_pending_tasks(logging_text, tasks_dict_info) + await self._wait_for_tasks( logging_text, tasks_dict_info, self.timeout.ns_deploy, @@ -7290,10 +7400,13 @@ class NsLcm(LcmBase): nslcmop_id, nsr_id=nsr_id, ) - if exc: + if error_list: + error_detail = "; ".join(error_list) db_nslcmop_update[ "detailed-status" - ] = error_description_nslcmop = "FAILED {}: {}".format(step, exc) + ] = error_description_nslcmop = "FAILED {}: {}".format( + step, error_detail + ) nslcmop_operation_state = "FAILED" if db_nsr: db_nsr_update["operational-status"] = old_operational_status @@ -7307,7 +7420,7 @@ class NsLcm(LcmBase): db_nsr_update[ "detailed-status" ] = "FAILED scaling nslcmop={} {}: {}".format( - nslcmop_id, step, exc + nslcmop_id, step, error_detail ) else: error_description_nslcmop = None @@ -7608,10 +7721,11 @@ class NsLcm(LcmBase): f"Timeout waiting KDU with name={kdu_name} and index={kdu_index} to be intantiated" ) - # TODO get_service if ee_id is not None: - _, _, service = ee_id.partition(".") # remove prefix "namespace." - host_name = "{}-{}".format(service, ee_config_descriptor["metric-service"]) + _, namespace, helm_id = get_ee_id_parts( + ee_id + ) # get namespace and EE gRPC service name + host_name = f'{helm_id}-{ee_config_descriptor["metric-service"]}.{namespace}.svc' # svc_name.namespace.svc host_port = "80" vnfr_id = vnfr_id.replace("-", "") variables = { @@ -8077,10 +8191,25 @@ class NsLcm(LcmBase): exc_info=True, ) finally: - if tasks_dict_info: - stage[1] = "Waiting for healing pending tasks." - self.logger.debug(logging_text + stage[1]) - exc = await self._wait_for_tasks( + error_list = list() + if exc: + error_list.append(str(exc)) + try: + if tasks_dict_info: + stage[1] = "Waiting for healing pending tasks." + self.logger.debug(logging_text + stage[1]) + exc = await self._wait_for_tasks( + logging_text, + tasks_dict_info, + self.timeout.ns_deploy, + stage, + nslcmop_id, + nsr_id=nsr_id, + ) + except asyncio.CancelledError: + error_list.append("Cancelled") + await self._cancel_pending_tasks(logging_text, tasks_dict_info) + await self._wait_for_tasks( logging_text, tasks_dict_info, self.timeout.ns_deploy, @@ -8088,17 +8217,22 @@ class NsLcm(LcmBase): nslcmop_id, nsr_id=nsr_id, ) - if exc: + if error_list: + error_detail = "; ".join(error_list) db_nslcmop_update[ "detailed-status" - ] = error_description_nslcmop = "FAILED {}: {}".format(step, exc) + ] = error_description_nslcmop = "FAILED {}: {}".format( + step, error_detail + ) nslcmop_operation_state = "FAILED" if db_nsr: db_nsr_update["operational-status"] = old_operational_status db_nsr_update["config-status"] = old_config_status db_nsr_update[ "detailed-status" - ] = "FAILED healing nslcmop={} {}: {}".format(nslcmop_id, step, exc) + ] = "FAILED healing nslcmop={} {}: {}".format( + nslcmop_id, step, error_detail + ) for task, task_name in tasks_dict_info.items(): if not task.done() or task.cancelled() or task.exception(): if task_name.startswith(self.task_name_deploy_vca): @@ -8298,10 +8432,7 @@ class NsLcm(LcmBase): vca_type = "native_charm" elif ee_item.get("helm-chart"): vca_name = ee_item["helm-chart"] - if ee_item.get("helm-version") and ee_item.get("helm-version") == "v2": - vca_type = "helm" - else: - vca_type = "helm-v3" + vca_type = "helm-v3" else: self.logger.debug( logging_text + "skipping non juju neither charm configuration" @@ -8633,7 +8764,7 @@ class NsLcm(LcmBase): # if SSH access is required, then get execution environment SSH public # if native charm we have waited already to VM be UP - if vca_type in ("k8s_proxy_charm", "lxc_proxy_charm", "helm", "helm-v3"): + if vca_type in ("k8s_proxy_charm", "lxc_proxy_charm", "helm-v3"): pub_key = None user = None # self.logger.debug("get ssh key block")