Improve logging in cluster operations in case of issues in ODU workflow 08/14908/1
author garciadeblas <gerardo.garciadeblas@telefonica.com>
Tue, 4 Feb 2025 15:08:51 +0000 (16:08 +0100)
committer garciadeblas <gerardo.garciadeblas@telefonica.com>
Tue, 4 Feb 2025 15:09:06 +0000 (16:09 +0100)
Change-Id: I09fc87cf33889fde4b4a94b8f530291888218813
Signed-off-by: garciadeblas <gerardo.garciadeblas@telefonica.com>
osm_lcm/k8s.py
osm_lcm/odu_libs/cluster_mgmt.py
osm_lcm/odu_workflows.py

index fd3667d..87b7160 100644 (file)
@@ -332,11 +332,27 @@ class ClusterLcm(GitOpsLcm):
         db_vim = self.db.get_one("vim_accounts", {"name": db_cluster["vim_account"]})
         workflow_content["vim_account"] = db_vim
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["state"] = "FAILED_CREATION"
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            # Clean items used in the workflow, no matter if the workflow succeeded
+            clean_status, clean_msg = await self.odu.clean_items_workflow(
+                "create_cluster", op_id, op_params, workflow_content
+            )
+            self.logger.info(
+                f"clean_status is :{clean_status} and clean_msg is :{clean_msg}"
+            )
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -564,11 +580,27 @@ class ClusterLcm(GitOpsLcm):
         if db_cluster["created"] == "false":
             return await self.deregister(params, order_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["state"] = "FAILED_DELETION"
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            # Clean items used in the workflow, no matter if the workflow succeeded
+            clean_status, clean_msg = await self.odu.clean_items_workflow(
+                "delete_cluster", op_id, op_params, workflow_content
+            )
+            self.logger.info(
+                f"clean_status is :{clean_status} and clean_msg is :{clean_msg}"
+            )
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -742,11 +774,19 @@ class ClusterLcm(GitOpsLcm):
         # content["profile"] = db_profile
         workflow_content["profile"] = db_profile
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "attach_profile_to_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["resourceState"] = "ERROR"
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -822,11 +862,19 @@ class ClusterLcm(GitOpsLcm):
         # content["profile"] = db_profile
         workflow_content["profile"] = db_profile
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "detach_profile_from_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -894,11 +942,27 @@ class ClusterLcm(GitOpsLcm):
             "cluster": self.decrypted_copy(db_cluster),
         }
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "register_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["state"] = "FAILED_CREATION"
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            # Clean items used in the workflow, no matter if the workflow succeeded
+            clean_status, clean_msg = await self.odu.clean_items_workflow(
+                "register_cluster", op_id, op_params, workflow_content
+            )
+            self.logger.info(
+                f"clean_status is :{clean_status} and clean_msg is :{clean_msg}"
+            )
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -1010,11 +1074,27 @@ class ClusterLcm(GitOpsLcm):
             "cluster": self.decrypted_copy(db_cluster),
         }
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "deregister_cluster", op_id, op_params, workflow_content
         )
-        self.logger.info("workflow_name is :{}".format(workflow_name))
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["state"] = "FAILED_DELETION"
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            # Clean items used in the workflow, no matter if the workflow succeeded
+            clean_status, clean_msg = await self.odu.clean_items_workflow(
+                "deregister_cluster", op_id, op_params, workflow_content
+            )
+            self.logger.info(
+                f"clean_status is :{clean_status} and clean_msg is :{clean_msg}"
+            )
+            return
 
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -1104,9 +1184,25 @@ class ClusterLcm(GitOpsLcm):
         db_vim = self.db.get_one("vim_accounts", {"name": db_cluster["vim_account"]})
         workflow_content["vim_account"] = db_vim
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "update_cluster", op_id, op_params, workflow_content
         )
+        if not workflow_res:
+            self.logger.error(f"Failed to launch workflow: {workflow_name}")
+            db_cluster["resourceState"] = "ERROR"
+            db_cluster = self.update_operation_history(
+                db_cluster, op_id, workflow_status=False, resource_status=None
+            )
+            self.db.set_one("clusters", {"_id": db_cluster["_id"]}, db_cluster)
+            # Clean items used in the workflow, no matter if the workflow succeeded
+            clean_status, clean_msg = await self.odu.clean_items_workflow(
+                "update_cluster", op_id, op_params, workflow_content
+            )
+            self.logger.info(
+                f"clean_status is :{clean_status} and clean_msg is :{clean_msg}"
+            )
+            return
+        self.logger.info("workflow_name is :{}".format(workflow_name))
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
             workflow_name
         )
@@ -1263,7 +1359,7 @@ class CloudCredentialsLcm(GitOpsLcm):
             salt=vim_id,
         )
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_cloud_credentials", op_id, op_params, db_content
         )
 
@@ -1316,7 +1412,7 @@ class CloudCredentialsLcm(GitOpsLcm):
             salt=vim_id,
         )
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "update_cloud_credentials", op_id, op_params, db_content
         )
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
@@ -1352,7 +1448,7 @@ class CloudCredentialsLcm(GitOpsLcm):
         op_params = params
         db_content = self.db.get_one("vim_accounts", {"_id": vim_id})
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_cloud_credentials", op_id, op_params, db_content
         )
         workflow_status, workflow_msg = await self.odu.check_workflow_status(
@@ -1400,7 +1496,7 @@ class K8sAppLcm(GitOpsLcm):
         op_params = self.get_operation_params(content, op_id)
         self.db.set_one("k8sapp", {"_id": content["_id"]}, content)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1429,7 +1525,7 @@ class K8sAppLcm(GitOpsLcm):
         content = self.db.get_one("k8sapp", {"_id": profile_id})
         op_params = self.get_operation_params(content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1476,7 +1572,7 @@ class K8sResourceLcm(GitOpsLcm):
         op_params = self.get_operation_params(content, op_id)
         self.db.set_one("k8sresource", {"_id": content["_id"]}, content)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1507,7 +1603,7 @@ class K8sResourceLcm(GitOpsLcm):
         content = self.db.get_one("k8sresource", {"_id": profile_id})
         op_params = self.get_operation_params(content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1556,7 +1652,7 @@ class K8sInfraControllerLcm(GitOpsLcm):
         op_params = self.get_operation_params(content, op_id)
         self.db.set_one("k8sinfra_controller", {"_id": content["_id"]}, content)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1587,7 +1683,7 @@ class K8sInfraControllerLcm(GitOpsLcm):
         content = self.db.get_one("k8sinfra_controller", {"_id": profile_id})
         op_params = self.get_operation_params(content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1636,7 +1732,7 @@ class K8sInfraConfigLcm(GitOpsLcm):
         op_params = self.get_operation_params(content, op_id)
         self.db.set_one("k8sinfra_config", {"_id": content["_id"]}, content)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1667,7 +1763,7 @@ class K8sInfraConfigLcm(GitOpsLcm):
         content = self.db.get_one("k8sinfra_config", {"_id": profile_id})
         op_params = self.get_operation_params(content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_profile", op_id, op_params, content
         )
         self.logger.info("workflow_name is :{}".format(workflow_name))
@@ -1711,7 +1807,7 @@ class OkaLcm(GitOpsLcm):
         db_content = self.db.get_one(self.db_collection, {"_id": oka_id})
         op_params = self.get_operation_params(db_content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_oka", op_id, op_params, db_content
         )
 
@@ -1735,7 +1831,7 @@ class OkaLcm(GitOpsLcm):
         db_content = self.db.get_one(self.db_collection, {"_id": oka_id})
         op_params = self.get_operation_params(db_content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "update_oka", op_id, op_params, db_content
         )
         workflow_status = await self.check_workflow_and_update_db(
@@ -1758,7 +1854,7 @@ class OkaLcm(GitOpsLcm):
         db_content = self.db.get_one(self.db_collection, {"_id": oka_id})
         op_params = self.get_operation_params(db_content, op_id)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_oka", op_id, op_params, db_content
         )
         workflow_status = await self.check_workflow_and_update_db(
@@ -1845,7 +1941,7 @@ class KsuLcm(GitOpsLcm):
             op_params.append(ksu_params)
 
         # A single workflow is launched for all KSUs
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "create_ksus", op_id, op_params, db_content
         )
         # Update workflow status in all KSUs
@@ -1910,7 +2006,7 @@ class KsuLcm(GitOpsLcm):
                     ] = f"{oka_type}/{db_oka['git_name']}/templates"
             op_params.append(ksu_params)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "update_ksus", op_id, op_params, db_content
         )
 
@@ -1964,7 +2060,7 @@ class KsuLcm(GitOpsLcm):
             ksu_params["profile"]["age_pubkey"] = db_profile.get("age_pubkey", "")
             op_params.append(ksu_params)
 
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "delete_ksus", op_id, op_params, db_content
         )
 
@@ -1993,7 +2089,7 @@ class KsuLcm(GitOpsLcm):
         self.initialize_operation(ksus_id, op_id)
         db_content = self.db.get_one(self.db_collection, {"_id": ksus_id})
         op_params = self.get_operation_params(db_content, op_id)
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "clone_ksus", op_id, op_params, db_content
         )
 
@@ -2017,7 +2113,7 @@ class KsuLcm(GitOpsLcm):
         self.initialize_operation(ksus_id, op_id)
         db_content = self.db.get_one(self.db_collection, {"_id": ksus_id})
         op_params = self.get_operation_params(db_content, op_id)
-        _, workflow_name = await self.odu.launch_workflow(
+        workflow_res, workflow_name = await self.odu.launch_workflow(
             "move_ksus", op_id, op_params, db_content
         )
 
index bed2781..efca1e2 100644 (file)
@@ -51,8 +51,8 @@ async def create_cluster(self, op_id, op_params, content):
 
     # Get age key
     public_key_new_cluster, private_key_new_cluster = gather_age_key(db_cluster)
-    self.logger.debug(f"public_key_new_cluster={public_key_new_cluster}")
-    self.logger.debug(f"private_key_new_cluster={private_key_new_cluster}")
+    self.logger.debug(f"public_key_new_cluster={public_key_new_cluster}")
+    self.logger.debug(f"private_key_new_cluster={private_key_new_cluster}")
 
     # Test kubectl connection
     self.logger.debug(self._kubectl._get_kubectl_version())
@@ -71,7 +71,7 @@ async def create_cluster(self, op_id, op_params, content):
         )
     except Exception as e:
         self.logger.info(f"Cannot create secret {secret_name}: {e}")
-        return False, ""
+        return False, f"Cannot create secret {secret_name}: {e}"
 
     # Additional params for the workflow
     cluster_kustomization_name = cluster_name
@@ -170,7 +170,7 @@ async def update_cluster(self, op_id, op_params, content):
         )
     except Exception as e:
         self.logger.info(f"Cannot create secret {secret_name}: {e}")
-        return False, ""
+        return False, f"Cannot create secret {secret_name}: {e}"
 
     # Additional params for the workflow
     cluster_kustomization_name = cluster_name
@@ -299,7 +299,10 @@ async def register_cluster(self, op_id, op_params, content):
         self.logger.info(
             f"Cannot create secret {secret_name} in namespace {secret_namespace}: {e}"
         )
-        return False, ""
+        return (
+            False,
+            f"Cannot create secret {secret_name} in namespace {secret_namespace}: {e}",
+        )
 
     # Create secret with kubeconfig
     secret_name2 = f"kubeconfig-{cluster_name}"
@@ -319,7 +322,10 @@ async def register_cluster(self, op_id, op_params, content):
         self.logger.info(
             f"Cannot create secret {secret_name} in namespace {secret_namespace}: {e}"
         )
-        return False, ""
+        return (
+            False,
+            f"Cannot create secret {secret_name} in namespace {secret_namespace}: {e}",
+        )
 
     # Additional params for the workflow
     cluster_kustomization_name = cluster_name
index 9c8e6e0..e812d86 100644 (file)
@@ -181,7 +181,12 @@ class OduWorkflow(LcmBase):
         )
         workflow_function = self._workflows[key]["workflow_function"]
         self.logger.info("workflow function : {}".format(workflow_function))
-        return await workflow_function(op_id, op_params, content)
+        try:
+            result, workflow_name = await workflow_function(op_id, op_params, content)
+            return result, workflow_name
+        except Exception as e:
+            self.logger.error(f"Error launching workflow: {e}")
+            return False, str(e)
 
     async def dummy_clean_items(self, op_id, op_params, content):
         self.logger.info(