Bug 1152, error if system cluster not properly initialized
[osm/LCM.git] / osm_lcm / lcm_helm_conn.py
1 ##
2 # Copyright 2020 Telefonica Investigacion y Desarrollo, S.A.U.
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 # implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 ##
18 import functools
19 import yaml
20 import asyncio
21 import socket
22 import uuid
23
24 from grpclib.client import Channel
25
26 from osm_lcm.frontend_pb2 import PrimitiveRequest
27 from osm_lcm.frontend_pb2 import SshKeyRequest, SshKeyReply
28 from osm_lcm.frontend_grpc import FrontendExecutorStub
29
30 from n2vc.n2vc_conn import N2VCConnector
31 from n2vc.k8s_helm_conn import K8sHelmConnector
32 from n2vc.exceptions import N2VCBadArgumentsException, N2VCException, N2VCExecutionException
33
34 from osm_lcm.lcm_utils import deep_get
35
36
def retryer(max_wait_time=60, delay_time=10):
    """
    Build a decorator that retries a coroutine function while the connection is refused.

    The decorated coroutine is awaited again each time it raises ConnectionRefusedError.
    After every refused attempt the remaining budget is decreased by ``delay_time`` and the
    wrapper sleeps ``delay_time`` seconds; retries stop once the budget is no longer positive.

    :param max_wait_time: total retry budget in seconds; if it is not positive the decorated
        coroutine is never awaited
    :param delay_time: seconds slept after each refused attempt; it must be positive,
        otherwise the budget never decreases
    :returns: a decorator for ``async def`` functions; the decorated function returns
        whatever the original coroutine returns
    :raises ConnectionRefusedError: raised by the decorated function when the retry budget
        is exhausted without a successful call
    """
    def wrapper(func):
        # real tuple, so more exception types can be appended safely later on
        retry_exceptions = (ConnectionRefusedError,)

        @functools.wraps(func)
        async def wrapped(*args, **kwargs):
            wait_time = max_wait_time
            last_error = None
            while wait_time > 0:
                try:
                    return await func(*args, **kwargs)
                except retry_exceptions as e:
                    # keep the error so it can be reported if no attempt succeeds
                    last_error = e
                    wait_time -= delay_time
                    await asyncio.sleep(delay_time)

            # Budget exhausted: raise instead of returning the exception class, otherwise the
            # caller would receive ConnectionRefusedError as if it were a valid result
            if last_error is not None:
                raise last_error
            raise ConnectionRefusedError(
                "no attempt performed, max_wait_time: {}".format(max_wait_time)
            )
        return wrapped
    return wrapper
57
58
class LCMHelmConn(N2VCConnector):
    """
    N2VC connector whose execution environments (ee) are helm charts.

    Charts are installed through a K8sHelmConnector in the "osm" namespace of the cluster
    registered in the database as "_system-osm-k8s". Primitives and ssh key requests are sent
    to the ee over gRPC (FrontendExecutorStub) on port _EE_SERVICE_PORT.
    """
    _KUBECTL_OSM_NAMESPACE = "osm"
    _KUBECTL_OSM_CLUSTER_NAME = "_system-osm-k8s"
    _EE_SERVICE_PORT = 50050

    # Time between retries
    _EE_RETRY_DELAY = 10
    # Initial max retry time
    _MAX_INITIAL_RETRY_TIME = 300
    # Other retry time
    _MAX_RETRY_TIME = 30

    def __init__(self,
                 db: object,
                 fs: object,
                 log: object = None,
                 loop: object = None,
                 url: str = None,
                 username: str = None,
                 vca_config: dict = None,
                 on_update_db=None, ):
        """
        Initialize EE helm connector.

        All arguments are forwarded unchanged to the N2VCConnector constructor.
        vca_config is also read here for the "kubectlpath" and "helmpath" entries used to
        build the internal K8sHelmConnector.
        """

        # parent class constructor
        N2VCConnector.__init__(
            self,
            db=db,
            fs=fs,
            log=log,
            loop=loop,
            url=url,
            username=username,
            vca_config=vca_config,
            on_update_db=on_update_db,
        )

        self.log.debug("Initialize helm N2VC connector")

        # TODO - Obtain data from configuration
        self._ee_service_port = self._EE_SERVICE_PORT

        self._retry_delay = self._EE_RETRY_DELAY
        self._max_retry_time = self._MAX_RETRY_TIME
        self._initial_retry_time = self._MAX_INITIAL_RETRY_TIME

        # initialize helm connector
        self._k8sclusterhelm = K8sHelmConnector(
            kubectl_command=self.vca_config.get("kubectlpath"),
            helm_command=self.vca_config.get("helmpath"),
            fs=self.fs,
            log=self.log,
            db=self.db,
            on_update_db=None,
        )

        # resolved lazily from the database by _get_system_cluster_id
        self._system_cluster_id = None
        self.log.info("Helm N2VC connector initialized")

    # TODO - reuse_ee_id?
    async def create_execution_environment(self,
                                           namespace: str,
                                           db_dict: dict,
                                           reuse_ee_id: str = None,
                                           progress_timeout: float = None,
                                           total_timeout: float = None,
                                           config: dict = None,
                                           artifact_path: str = None,
                                           vca_type: str = None) -> (str, dict):
        """
        Creates a new helm execution environment deploying the helm-chart indicated in the
        artifact_path
        :param str namespace: This param is not used, all helm charts are deployed in the osm
        system namespace
        :param dict db_dict: where to write to database when the status changes.
            It contains a dictionary with {collection: str, filter: {}, path: str},
                e.g. {collection: "nsrs", filter: {_id: <nsd-id>, path:
                "_admin.deployed.VCA.3"}
        :param str reuse_ee_id: ee id from an older execution. TODO - right now this param is not used
        :param float progress_timeout: passed as timeout to the helm install
        :param float total_timeout: not used
        :param dict config: General variables to instantiate KDU. If it has an "osm" entry it is
            also passed under "global.osm"; the dictionary received is not modified
        :param str artifact_path: path of package content
        :param str vca_type: Type of vca, not used as assumed of type helm
        :returns str, dict: id of the new execution environment including namespace.helm_id
        and credentials object set to None as all credentials should be osm kubernetes .kubeconfig
        :raises N2VCBadArgumentsException: artifact_path is missing or does not exist
        :raises N2VCException: any error while installing the chart
        """

        self.log.info(
            "create_execution_environment: namespace: {}, artifact_path: {}, db_dict: {}, "
            "reuse_ee_id: {}".format(
                namespace, artifact_path, db_dict, reuse_ee_id)
        )

        # Validate artifact-path is provided
        if not artifact_path:
            raise N2VCBadArgumentsException(
                message="artifact_path is mandatory", bad_args=["artifact_path"]
            )

        # collapse duplicated slashes in the chart path
        while "//" in artifact_path:
            artifact_path = artifact_path.replace("//", "/")

        # Validate artifact-path exists
        if not self.fs.file_exists(artifact_path):
            msg = "artifact path does not exist: {}".format(artifact_path)
            raise N2VCBadArgumentsException(message=msg, bad_args=["artifact_path"])

        if artifact_path.startswith("/"):
            full_path = self.fs.path + artifact_path
        else:
            full_path = self.fs.path + "/" + artifact_path

        try:
            # Call helm conn install
            # Obtain system cluster id from database
            system_cluster_uuid = self._get_system_cluster_id()

            # Add parameter osm if exist to global. Work on copies so that the dictionary
            # owned by the caller is not modified as a side effect
            params = config
            if config and config.get("osm"):
                params = dict(config)
                params["global"] = dict(config.get("global") or {})
                params["global"]["osm"] = config.get("osm")

            self.log.debug("install helm chart: {}".format(full_path))
            helm_id = await self._k8sclusterhelm.install(
                system_cluster_uuid,
                kdu_model=full_path,
                namespace=self._KUBECTL_OSM_NAMESPACE,
                params=params,
                db_dict=db_dict,
                timeout=progress_timeout,
            )

            ee_id = "{}.{}".format(self._KUBECTL_OSM_NAMESPACE, helm_id)
            return ee_id, None
        except N2VCException:
            raise
        except Exception as e:
            self.log.error("Error deploying chart ee: {}".format(e), exc_info=True)
            raise N2VCException("Error deploying chart ee: {}".format(e))

    async def register_execution_environment(self, namespace: str, credentials: dict, db_dict: dict,
                                             progress_timeout: float = None, total_timeout: float = None) -> str:
        """Nothing to do for this connector, returns None."""
        # nothing to do
        pass

    async def install_configuration_sw(self,
                                       ee_id: str,
                                       artifact_path: str,
                                       db_dict: dict,
                                       progress_timeout: float = None,
                                       total_timeout: float = None,
                                       config: dict = None,
                                       num_units: int = 1,
                                       vca_type: str = None
                                       ):
        """Nothing to do for this connector."""
        # nothing to do
        pass

    async def add_relation(self, ee_id_1: str, ee_id_2: str, endpoint_1: str, endpoint_2: str):
        """Nothing to do for this connector."""
        # nothing to do
        pass

    async def remove_relation(self):
        """Nothing to do for this connector."""
        # nothing to do
        pass

    async def get_status(self, namespace: str, yaml_format: bool = True):
        """Not used for this connector."""
        # not used for this connector
        pass

    async def get_ee_ssh_public__key(self, ee_id: str, db_dict: dict, progress_timeout: float = None,
                                     total_timeout: float = None) -> str:
        """
        Obtains ssh-public key from ee executing GetSshKey method from the ee.

        :param str ee_id: the id of the execution environment returned by
            create_execution_environment or register_execution_environment
        :param dict db_dict: only logged
        :param float progress_timeout: not used
        :param float total_timeout: not used
        :returns: public key of the execution environment
        :raises N2VCBadArgumentsException: ee_id is missing
        :raises N2VCException: the ee address cannot be resolved or the key cannot be obtained
        """

        self.log.info(
            "get_ee_ssh_public_key: ee_id: {}, db_dict: {}".format(
                ee_id, db_dict)
        )

        # check arguments
        if not ee_id:
            raise N2VCBadArgumentsException(
                message="ee_id is mandatory", bad_args=["ee_id"]
            )

        try:
            # Obtain ip_addr for the ee service, it is resolved by dns from the ee name by kubernetes
            # NOTE: socket.gethostbyname is a blocking call
            namespace, helm_id = self._get_ee_id_parts(ee_id)
            ip_addr = socket.gethostbyname(helm_id)

            # Obtain ssh_key from the ee, this method will implement retries to allow the ee
            # install libraries and start successfully
            ssh_key = await self._get_ssh_key(ip_addr)
            return ssh_key
        except Exception as e:
            self.log.error("Error obtaining ee ssh_key: {}".format(e), exc_info=True)
            raise N2VCException("Error obtaining ee ssh_key: {}".format(e))

    async def exec_primitive(self, ee_id: str, primitive_name: str, params_dict: dict, db_dict: dict = None,
                             progress_timeout: float = None, total_timeout: float = None) -> str:
        """
        Execute a primitive in the execution environment

        :param str ee_id: the one returned by create_execution_environment or
            register_execution_environment with the format namespace.helm_id
        :param str primitive_name: must be one defined in the software. There is one
            called 'config', where, for the proxy case, the 'credentials' of VM are
            provided
        :param dict params_dict: parameters of the action
        :param dict db_dict: where to write into database when the status changes.
                        It contains a dict with
                            {collection: <str>, filter: {}, path: <str>},
                            e.g. {collection: "nslcmops", filter:
                                {_id: <nslcmop_id>, path: "_admin.VCA"}
                        It will be used to store information about intermediate notifications
        :param float progress_timeout: not used
        :param float total_timeout: not used
        :returns str: "CONFIG OK" for the 'config' primitive, otherwise the detailed message
            returned by the ee. It raises exceptions in case of fail
        :raises N2VCBadArgumentsException: ee_id or primitive_name is missing
        :raises N2VCException: the ee address cannot be resolved
        :raises N2VCExecutionException: the primitive failed or returned a not ok status
        """

        self.log.info("exec primitive for ee_id : {}, primitive_name: {}, params_dict: {}, db_dict: {}".format(
            ee_id, primitive_name, params_dict, db_dict
        ))

        # check arguments
        if not ee_id:
            raise N2VCBadArgumentsException(
                message="ee_id is mandatory", bad_args=["ee_id"]
            )
        if not primitive_name:
            raise N2VCBadArgumentsException(
                message="action_name is mandatory", bad_args=["action_name"]
            )
        if params_dict is None:
            params_dict = dict()

        try:
            namespace, helm_id = self._get_ee_id_parts(ee_id)
            # NOTE: socket.gethostbyname is a blocking call
            ip_addr = socket.gethostbyname(helm_id)
        except Exception as e:
            self.log.error("Error getting ee ip ee: {}".format(e))
            raise N2VCException("Error getting ee ip ee: {}".format(e))

        if primitive_name == "config":
            try:
                # Execute config primitive, higher timeout to check the case ee is starting
                status, detailed_message = await self._execute_config_primitive(ip_addr, params_dict, db_dict=db_dict)
                self.log.debug("Executed config primitive ee_id_ {}, status: {}, message: {}".format(
                    ee_id, status, detailed_message))
                if status != "OK":
                    self.log.error("Error configuring helm ee, status: {}, message: {}".format(
                        status, detailed_message))
                    raise N2VCExecutionException(
                        message="Error configuring helm ee_id: {}, status: {}, message: {}: ".format(
                            ee_id, status, detailed_message
                        ),
                        primitive_name=primitive_name,
                    )
            except N2VCExecutionException:
                # already logged and well formed, do not wrap it a second time
                raise
            except Exception as e:
                self.log.error("Error configuring helm ee: {}".format(e))
                raise N2VCExecutionException(
                    message="Error configuring helm ee_id: {}, {}".format(
                        ee_id, e
                    ),
                    primitive_name=primitive_name,
                )
            return "CONFIG OK"
        else:
            try:
                # Execute primitive
                status, detailed_message = await self._execute_primitive(ip_addr, primitive_name,
                                                                         params_dict, db_dict=db_dict)
                self.log.debug("Executed primitive {} ee_id_ {}, status: {}, message: {}".format(
                    primitive_name, ee_id, status, detailed_message))
                if status != "OK" and status != "PROCESSING":
                    self.log.error(
                        "Execute primitive {} returned not ok status: {}, message: {}".format(
                            primitive_name, status, detailed_message)
                    )
                    raise N2VCExecutionException(
                        message="Execute primitive {} returned not ok status: {}, message: {}".format(
                            primitive_name, status, detailed_message
                        ),
                        primitive_name=primitive_name,
                    )
            except N2VCExecutionException:
                # already logged and well formed, do not wrap it a second time
                raise
            except Exception as e:
                self.log.error(
                    "Error executing primitive {}: {}".format(primitive_name, e)
                )
                raise N2VCExecutionException(
                    message="Error executing primitive {} into ee={} : {}".format(
                        primitive_name, ee_id, e
                    ),
                    primitive_name=primitive_name,
                )
            return detailed_message

    async def deregister_execution_environments(self):
        """Nothing to be done for this connector."""
        # nothing to be done
        pass

    async def delete_execution_environment(self, ee_id: str, db_dict: dict = None, total_timeout: float = None):
        """
        Delete an execution environment
        :param str ee_id: id of the execution environment to delete, included namespace.helm_id
        :param dict db_dict: where to write into database when the status changes.
                        It contains a dict with
                            {collection: <str>, filter: {}, path: <str>},
                            e.g. {collection: "nsrs", filter:
                                {_id: <nsd-id>, path: "_admin.deployed.VCA.3"}
                        Not used by this method
        :param float total_timeout: not used
        :raises N2VCBadArgumentsException: ee_id is missing or empty
        :raises N2VCException: any error while uninstalling the chart
        """

        self.log.info("ee_id: {}".format(ee_id))

        # check arguments; an empty id is rejected too, otherwise an empty helm release name
        # would be sent to uninstall
        if not ee_id:
            raise N2VCBadArgumentsException(
                message="ee_id is mandatory", bad_args=["ee_id"]
            )

        try:

            # Obtain cluster_uuid
            system_cluster_uuid = self._get_system_cluster_id()

            # Get helm_id
            namespace, helm_id = self._get_ee_id_parts(ee_id)

            # Uninstall chart
            await self._k8sclusterhelm.uninstall(system_cluster_uuid, helm_id)
            self.log.info("ee_id: {} deleted".format(ee_id))
        except N2VCException:
            raise
        except Exception as e:
            self.log.error("Error deleting ee id: {}: {}".format(ee_id, e), exc_info=True)
            raise N2VCException("Error deleting ee id {}: {}".format(ee_id, e))

    async def delete_namespace(self, namespace: str, db_dict: dict = None, total_timeout: float = None):
        """Not implemented for this connector, execution environments must be deleted individually."""
        # method not implemented for this connector, execution environments must be deleted individually
        pass

    async def install_k8s_proxy_charm(
        self,
        charm_name: str,
        namespace: str,
        artifact_path: str,
        db_dict: dict,
        progress_timeout: float = None,
        total_timeout: float = None,
        config: dict = None,
    ) -> str:
        """Not implemented for this connector, returns None."""
        pass

    @retryer(max_wait_time=_MAX_INITIAL_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
    async def _get_ssh_key(self, ip_addr):
        """
        Ask the ee listening at ip_addr for its ssh public key (gRPC GetSshKey).

        Wrapped by retryer with the initial retry time, as the ee may still be starting.
        :returns: the message field of the reply
        """
        channel = Channel(ip_addr, self._ee_service_port)
        try:
            stub = FrontendExecutorStub(channel)
            self.log.debug("get ssh key, ip_addr: {}".format(ip_addr))
            reply: SshKeyReply = await stub.GetSshKey(SshKeyRequest())
            return reply.message
        finally:
            channel.close()

    @retryer(max_wait_time=_MAX_INITIAL_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
    async def _execute_config_primitive(self, ip_addr, params, db_dict=None):
        """Run the "config" primitive, wrapped by retryer with the initial retry time."""
        return await self._execute_primitive_internal(ip_addr, "config", params, db_dict=db_dict)

    @retryer(max_wait_time=_MAX_RETRY_TIME, delay_time=_EE_RETRY_DELAY)
    async def _execute_primitive(self, ip_addr, primitive_name, params, db_dict=None):
        """Run a primitive, wrapped by retryer with the regular retry time."""
        return await self._execute_primitive_internal(ip_addr, primitive_name, params, db_dict=db_dict)

    async def _execute_primitive_internal(self, ip_addr, primitive_name, params, db_dict=None):
        """
        Send a RunPrimitive request to the ee and consume the stream of replies.

        :param ip_addr: address of the ee gRPC service
        :param primitive_name: name of the primitive to execute
        :param params: primitive parameters, sent to the ee dumped as yaml
        :param db_dict: if provided, every reply received is written to the database with
            _write_op_detailed_status
        :returns: (status, detailed_message) of the last reply received, or
            ("ERROR", "No result received") if the stream ends without any reply
        """

        channel = Channel(ip_addr, self._ee_service_port)
        try:
            stub = FrontendExecutorStub(channel)
            async with stub.RunPrimitive.open() as stream:
                primitive_id = str(uuid.uuid1())
                result = None
                self.log.debug("Execute primitive internal: id:{}, name:{}, params: {}".
                               format(primitive_id, primitive_name, params))
                await stream.send_message(
                    PrimitiveRequest(id=primitive_id, name=primitive_name, params=yaml.dump(params)), end=True)
                async for reply in stream:
                    self.log.debug("Received reply: {}".format(reply))
                    result = reply
                    # If db_dict provided write notifs in database
                    if db_dict:
                        self._write_op_detailed_status(db_dict, reply.status, reply.detailed_message)
                # use the saved result, the loop variable is not bound if no reply arrived
                if result is not None:
                    return result.status, result.detailed_message
                else:
                    return "ERROR", "No result received"
        finally:
            channel.close()

    def _write_op_detailed_status(self, db_dict, status, detailed_message):
        """
        Write "<status>: <detailed_message>" as "detailed-status" in the database entry
        selected by db_dict["collection"] and db_dict["filter"].

        Errors are logged and not propagated, except asyncio.CancelledError.
        """

        # write detailed-status to database
        try:
            the_table = db_dict["collection"]
            the_filter = db_dict["filter"]
            update_dict = {"detailed-status": "{}: {}".format(status, detailed_message)}
            self.db.set_one(
                table=the_table,
                q_filter=the_filter,
                update_dict=update_dict,
                fail_on_empty=True,
            )
        except asyncio.CancelledError:
            raise
        except Exception as e:
            self.log.error("Error writing detailedStatus to database: {}".format(e))

    def _get_system_cluster_id(self):
        """
        Return the helm-chart id of the osm system cluster, reading it from the
        "k8sclusters" collection the first time and caching it afterwards.

        :raises N2VCException: the cluster entry has no _admin.helm-chart.id
        """
        if not self._system_cluster_id:
            db_k8cluster = self.db.get_one("k8sclusters", {"name": self._KUBECTL_OSM_CLUSTER_NAME})
            k8s_hc_id = deep_get(db_k8cluster, ("_admin", "helm-chart", "id"))
            if not k8s_hc_id:
                self.log.error("osm system cluster has not been properly initialized for helm connector, "
                               "helm-chart id is not defined")
                raise N2VCException("osm system cluster has not been properly initialized for helm connector")
            self._system_cluster_id = k8s_hc_id
        return self._system_cluster_id

    def _get_ee_id_parts(self, ee_id):
        """
        Split an ee id with format namespace.helm_id at the first dot.

        :returns: (namespace, helm_id); helm_id is an empty string if ee_id has no dot
        """
        namespace, _, helm_id = ee_id.partition('.')
        return namespace, helm_id