X-Git-Url: https://osm.etsi.org/gitweb/?a=blobdiff_plain;f=rwlaunchpad%2Fra%2Fpytest%2Fns%2Fpingpong%2Ftest_ha_pingpong.py;fp=rwlaunchpad%2Fra%2Fpytest%2Fns%2Fpingpong%2Ftest_ha_pingpong.py;h=02ed3a5103ce40425ccb310e68970153a5cb4306;hb=4870d0ee29789b859931e4e2c73e13dcb29537d5;hp=0000000000000000000000000000000000000000;hpb=6f1a3fe149e4a6b9803382cb299c902f4cf58ec9;p=osm%2FSO.git

diff --git a/rwlaunchpad/ra/pytest/ns/pingpong/test_ha_pingpong.py b/rwlaunchpad/ra/pytest/ns/pingpong/test_ha_pingpong.py
new file mode 100644
index 00000000..02ed3a51
--- /dev/null
+++ b/rwlaunchpad/ra/pytest/ns/pingpong/test_ha_pingpong.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+"""
+#
+# Copyright 2016 RIFT.IO Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+@file test_ha_pingpong.py
+@author Paul Laidler (Paul.Laidler@riftio.com)
+@date 07/07/2016
+@brief High-availability system test that runs the ping pong workflow
+"""
+
+import gi
+import logging
+import os
+import pytest
+import random
+import re
+import subprocess
+import sys
+import time
+import uuid
+
+from contextlib import contextmanager
+
+import rift.auto.mano
+import rift.auto.session
+import rift.auto.descriptor
+import rift.vcs.vcs  # provides wait_until_system_started(), used by HASession below
+
+gi.require_version('RwVnfrYang', '1.0')
+from gi.repository import (
+    NsrYang,
+    RwProjectNsdYang,
+    VnfrYang,
+    RwNsrYang,
+    RwVnfrYang,
+    RwBaseYang,
+)
+
+gi.require_version('RwKeyspec', '1.0')
+from gi.repository.RwKeyspec import quoted_key
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+@pytest.mark.setup('seed_random')
+class TestSeedRandom:
+    def test_seed_random(self, random_seed):
+        logger.info("Seeding number generator with seed {}".format(random_seed))
+        random.seed(random_seed)
+
+class MaxRetriesExceededException(Exception):
+    '''Indicates the maximum allowed number of retries has been exceeded for an operation
+    '''
+    pass
+
+class HAVerifyException(Exception):
+    '''Indicates a failure to verify correct HA behaviour
+    '''
+    pass
+
+
+class HASession:
+    ''' Wrapper around a management session, which kills off system components
+    in order to trigger HA functionality
+    '''
+
+    DEFAULT_ATTEMPTS=3
+    DEFAULT_MIN_DELAY=0.0
+    DEFAULT_MAX_DELAY=1
+    DEFAULT_FREQUENCY=1
+    DEFAULT_RECOVERY_TIMEOUT=120
+
+    def __init__(self, session):
+        ''' Create a new HASession instance
+
+        Returns:
+            instance of HASession
+        '''
+        self.session = session
+        self.set_config()
+
+    @contextmanager
+    def config(self, *args, **kwargs):
+        ''' Context manager to allow HASession to temporarily have its config modified
+        '''
+        current_config = self.get_config()
+        self.set_config(*args, **kwargs)
+        yield
+        self.set_config(*current_config)
+
+    def get_config(self):
+        ''' Returns the current HA session config
+        '''
+        return (self.attempts, self.min_delay, self.max_delay, self.ha_frequency, self.recovery_timeout)
+
+    def set_config(self, attempts=None, min_delay=None, max_delay=None, ha_frequency=None, recovery_timeout=None):
+        ''' Set the HA session config, using default values for any config options not provided
+
+        Arguments:
+            attempts - Number of times to attempt an operation before failing
+            min_delay - minimum time that must elapse before the session is allowed to kill a component
+            max_delay - maximum time that may elapse before killing a component
+            ha_frequency - frequency at which operations are tested for HA
+            recovery_timeout - time allowed for the system to recover after a component is killed
+        '''
+        if not attempts:
+            attempts = HASession.DEFAULT_ATTEMPTS
+        if not min_delay:
+            min_delay = HASession.DEFAULT_MIN_DELAY
+        if not max_delay:
+            max_delay = HASession.DEFAULT_MAX_DELAY
+        if not ha_frequency:
+            ha_frequency = HASession.DEFAULT_FREQUENCY
+        if not recovery_timeout:
+            recovery_timeout = HASession.DEFAULT_RECOVERY_TIMEOUT
+
+        self.attempts = attempts
+        self.min_delay = min_delay
+        self.max_delay = max_delay
+        self.ha_frequency = ha_frequency
+        self.recovery_timeout = recovery_timeout
+
+    def call(self, operation, *args, **kwargs):
+        ''' Call an operation using the wrapped management session, kill off
+        a system component while it runs, and verify the operation still succeeds
+
+        Arguments:
+            operation - operation to be invoked
+        '''
+        # Choose to make the normal session call or do the HA test
+        if random.choice(range(0,int(1/self.ha_frequency))) != 0:
+            return operation(*args, **kwargs)
+
+        # Make sure we're starting from a running system
+        rift.vcs.vcs.wait_until_system_started(self.session)
+
+        def choose_any_tasklet(vcs_info):
+            tasklets = [component_info.component_name for component_info in vcs_info.components.component_info]
+            return random.choice(tasklets)
+
+        def choose_restartable_tasklet(vcs_info):
+            restartable_tasklets = [
+                component_info.component_name
+                for component_info in vcs_info.components.component_info
+                if component_info.recovery_action == 'RESTART'
+                and component_info.component_type == 'RWTASKLET'
+            ]
+            return random.choice(restartable_tasklets)
+
+        vcs_info = self.session.proxy(RwBaseYang).get('/vcs/info')
+        component_name = choose_restartable_tasklet(vcs_info)
+
+        ssh_cmd = 'ssh {} -o StrictHostKeyChecking=no -o BatchMode=yes'.format(self.session.host)
+        def get_component_process_pid(component_name):
+            cmd = '{} -- \'ps -ef | grep -v "grep" | grep rwmain | grep "{}" | tr -s " " | cut -d " " -f 2\''.format(ssh_cmd, component_name)
+            logger.info("Finding component [{}] pid using cmd: {}".format(component_name, cmd))
+            output = subprocess.check_output(cmd, shell=True)
+            return output.decode('ascii').strip()
+        process_pid = get_component_process_pid(component_name)
+        logger.info('{} has pid {}'.format(component_name, process_pid))
+
+        # Kick off a background process to kill the tasklet after some delay
+        delay = self.min_delay + (self.max_delay-self.min_delay)*random.random()
+        logger.info("Killing {} [{}] in {}".format(component_name, process_pid, delay))
+        cmd = '(sleep {} && {} -- "sudo kill -9 {}") &'.format(delay, ssh_cmd, process_pid)
+        os.system(cmd)
+
+        # Invoke session operation
+        now = time.time()
+        result = None
+        attempt = 0
+        while attempt < self.attempts:
+            try:
+                result = operation(*args, **kwargs)
+                # Possible improvement: implement optional verify step here
+                break
+            except Exception:
+                logger.error('operation failed - {}'.format(operation))
+                attempt += 1
+                # If the operation failed, wait until recovery occurs to re-attempt
+                rift.vcs.vcs.wait_until_system_started(self.session)
+
+        if attempt >= self.attempts:
+            raise MaxRetriesExceededException("Killed {} [{}] - Subsequently failed operation : {} {} {}".format(component_name, process_pid, operation, args, kwargs))
+
+        # Wait until the kill has definitely happened
+        elapsed = time.time() - now
+        remaining = delay - elapsed
+        if remaining > 0:
+            time.sleep(remaining)
+        time.sleep(3)
+
+        # Verify system reaches running status again
+        rift.vcs.vcs.wait_until_system_started(self.session)
+
+        # Verify the tasklet process was actually restarted (got a new pid)
+        new_pid = get_component_process_pid(component_name)
+        if process_pid == new_pid:
+            raise HAVerifyException("Process pid unchanged : {} == {} ~ didn't die?".format(process_pid, new_pid))
+
+        return result
+
+@pytest.fixture
+def ha_session(mgmt_session):
+    return HASession(mgmt_session)
+
+@pytest.mark.depends('seed_random')
+@pytest.mark.setup('launchpad')
+@pytest.mark.incremental
+class TestLaunchpadSetup:
+    def test_create_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
+        '''Configure cloud accounts
+
+        Asserts:
+            Cloud name and cloud type details
+        '''
+        for cloud_account in cloud_accounts:
+            xpath = "{cloud_xpath}[name={cloud_account_name}]".format(
+                cloud_xpath=cloud_xpath,
+                cloud_account_name=quoted_key(cloud_account.name)
+            )
+            ha_session.call(mgmt_session.proxy(cloud_module).replace_config, xpath, cloud_account)
+            response = ha_session.call(mgmt_session.proxy(cloud_module).get, xpath)
+            assert response.name == cloud_account.name
+            assert response.account_type == cloud_account.account_type
+
+@pytest.mark.teardown('launchpad')
+@pytest.mark.incremental
+class TestLaunchpadTeardown:
+    def test_delete_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
+        '''Unconfigure cloud accounts'''
+        for cloud_account in cloud_accounts:
+            xpath = "{cloud_xpath}[name={cloud_account_name}]".format(
+                cloud_xpath=cloud_xpath,
+                cloud_account_name=quoted_key(cloud_account.name)
+            )
+            ha_session.call(mgmt_session.proxy(cloud_module).delete_config, xpath)
+
+@pytest.mark.setup('pingpong')
+@pytest.mark.depends('launchpad')
+@pytest.mark.incremental
+class TestSetupPingpong(object):
+    def test_onboard(self, ha_session, mgmt_session, descriptors):
+        for descriptor in descriptors:
+            with ha_session.config(max_delay=15):
+                ha_session.call(rift.auto.descriptor.onboard, mgmt_session, descriptor)
+
+    def test_instantiate(self, ha_session, mgmt_session, cloud_account_name):
+        catalog = ha_session.call(mgmt_session.proxy(RwProjectNsdYang).get_config, '/nsd-catalog')
+        nsd = catalog.nsd[0]
+        nsr = rift.auto.descriptor.create_nsr(cloud_account_name, "pingpong_1", nsd)
+        ha_session.call(mgmt_session.proxy(RwNsrYang).create_config, '/ns-instance-config/nsr', nsr)
+
+@pytest.mark.depends('pingpong')
+@pytest.mark.teardown('pingpong')
+@pytest.mark.incremental
+class TestTeardownPingpong(object):
+    def test_teardown(self, ha_session, mgmt_session):
+        ns_instance_config = ha_session.call(mgmt_session.proxy(RwNsrYang).get_config, '/ns-instance-config')
+        for nsr in ns_instance_config.nsr:
+            ha_session.call(mgmt_session.proxy(RwNsrYang).delete_config, "/ns-instance-config/nsr[id={}]".format(quoted_key(nsr.id)))
+
+        time.sleep(60)
+        vnfr_catalog = ha_session.call(mgmt_session.proxy(RwVnfrYang).get, '/vnfr-catalog')
+        assert vnfr_catalog is None or len(vnfr_catalog.vnfr) == 0
+
+@pytest.mark.depends('launchpad')
+@pytest.mark.incremental
+class TestLaunchpad:
+    def test_account_connection_status(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
+        '''Verify connection status on each cloud account
+
+        Asserts:
+            Cloud account is successfully connected
+        '''
+        for cloud_account in cloud_accounts:
+            with ha_session.config(attempts=2):
+                ha_session.call(
+                    mgmt_session.proxy(cloud_module).wait_for,
+                    '{}[name={}]/connection-status/status'.format(cloud_xpath, quoted_key(cloud_account.name)),
+                    'success',
+                    timeout=60,
+                    fail_on=['failure']
+                )
+
+@pytest.mark.depends('pingpong')
+@pytest.mark.incremental
+class TestPingpong:
+    def test_service_started(self, ha_session, mgmt_session):
+        nsr_opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
+        nsrs = nsr_opdata.nsr
+
+        for nsr in nsrs:
+            xpath = (
+                "/ns-instance-opdata/nsr[ns-instance-config-ref={ns_instance_config_ref}]/operational-status"
+            ).format(
+                ns_instance_config_ref=quoted_key(nsr.ns_instance_config_ref)
+            )
+
+            with ha_session.config(attempts=2, max_delay=60):
+                ha_session.call(mgmt_session.proxy(RwNsrYang).wait_for, xpath, "running", fail_on=['failed'], timeout=300)
+
+    def test_service_configured(self, ha_session, mgmt_session):
+        nsr_opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
+        nsrs = nsr_opdata.nsr
+
+        for nsr in nsrs:
+            xpath = (
+                "/ns-instance-opdata/nsr[ns-instance-config-ref={}]/config-status"
+            ).format(
+                quoted_key(nsr.ns_instance_config_ref)
+            )
+
+            with ha_session.config(attempts=2, max_delay=60):
+                ha_session.call(mgmt_session.proxy(RwNsrYang).wait_for, xpath, "configured", fail_on=['failed'], timeout=300)
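
Usage note (not part of the commit above): every test in this module follows the same pattern -- wrap each management-session operation in ha_session.call so that a randomly selected restartable tasklet is killed while the operation is in flight, and use the ha_session.config context manager to temporarily relax the attempt count or kill delay for slower operations. The sketch below restates that pattern in isolation. It mirrors test_create_cloud_accounts and test_account_connection_status; the test function name is hypothetical, and the ha_session, mgmt_session, cloud_module, cloud_xpath and cloud_accounts pytest fixtures are assumed to be supplied by the surrounding test harness, as they are for the tests in the diff.

    # Illustrative sketch only -- not part of test_ha_pingpong.py.
    def test_ha_wrapped_read(ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
        for cloud_account in cloud_accounts:
            xpath = "{}[name={}]".format(cloud_xpath, quoted_key(cloud_account.name))
            # Allow two attempts and give the injected tasklet kill up to 30 seconds to fire;
            # the wrapped read is retried after the system recovers if the kill makes it fail.
            with ha_session.config(attempts=2, max_delay=30):
                response = ha_session.call(mgmt_session.proxy(cloud_module).get, xpath)
            assert response.name == cloud_account.name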