rwlaunchpad/ra/pytest/ns/pingpong/test_ha_pingpong.py

   1 #!/usr/bin/env python3
   2 """
   3 #
   4 #   Copyright 2016 RIFT.IO Inc
   5 #
   6 #   Licensed under the Apache License, Version 2.0 (the "License");
   7 #   you may not use this file except in compliance with the License.
   8 #   You may obtain a copy of the License at
   9 #
  10 #       http://www.apache.org/licenses/LICENSE-2.0
  11 #
  12 #   Unless required by applicable law or agreed to in writing, software
  13 #   distributed under the License is distributed on an "AS IS" BASIS,
  14 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 #   See the License for the specific language governing permissions and
  16 #   limitations under the License.
  17 #
  18
  19 @file test_ha_pingpong.py
  20 @author Paul Laidler (Paul.Laidler@riftio.com)
  21 @date 07/07/2016
  22 @brief High-availability system test that runs ping pong workflow
  23 """
  24
  25 import gi
  26 import logging
  27 import os
  28 import pytest
  29 import random
  30 import re
  31 import subprocess
  32 import sys
  33 import time
  34 import uuid
  35
  36 from contextlib import contextmanager
  37
  38 import rift.auto.mano
  39 import rift.auto.session
  40 import rift.auto.descriptor
  41
  42 gi.require_version('RwVnfrYang', '1.0')
  43 from gi.repository import (
  44     NsrYang,
  45     RwProjectNsdYang,
  46     VnfrYang,
  47     RwNsrYang,
  48     RwVnfrYang,
  49     RwBaseYang,
  50 )
  51
  52 gi.require_version('RwKeyspec', '1.0')
  53 from gi.repository.RwKeyspec import quoted_key
  54
  55 logging.basicConfig(level=logging.DEBUG)
  56 logger = logging.getLogger(__name__)
  57
  58 @pytest.mark.setup('seed_random')
  59 class TestSeedRandom:
  60     def test_seed_random(self, random_seed):
  61         logger.info("Seeding number generator with seed {}".format(random_seed))
  62         random.seed(random_seed)
  63
  64 class MaxRetriesExceededException(Exception):
  65     '''Indicates the maximum allowed number of retries has been exceeded for an operation
  66     '''
  67     pass
  68
  69 class HAVerifyException(Exception):
  70     '''Indicates a failure to verify correct HA behaviour
  71     '''
  72     pass
  73
  74
  75 class HASession:
  76     ''' Wrapper around management session, which kills off system components
  77     in order to trigger HA functionality
  78     '''
  79
  80     DEFAULT_ATTEMPTS=3
  81     DEFAULT_MIN_DELAY=0.0
  82     DEFAULT_MAX_DELAY=1
  83     DEFAULT_FREQUENCY=1
  84     DEFAULT_RECOVERY_TIMEOUT=120
  85
  86     def __init__(self, session):
  87         ''' Create a new HASession instance
  88
  89         Returns:
  90             instance of HASession
  91         '''
  92         self.session = session
  93         self.set_config()
  94
  95     @contextmanager
  96     def config(self, *args, **kwargs):
  97         ''' Context manager to allow HASession to temporarily have its config modified
  98         '''
  99         current_config = self.get_config()
 100         self.set_config(*args, **kwargs)
 101         yield
 102         self.set_config(*current_config)
 103
 104     def get_config(self):
 105         ''' Returns the current HA session config
 106         '''
 107         return (self.attempts, self.min_delay, self.max_delay, self.ha_frequency, self.recovery_timeout)
 108
 109     def set_config(self, attempts=None, min_delay=None, max_delay=None, ha_frequency=None, recovery_timeout=None):
 110         ''' Set the HA session config, set default values for all config options not provided
 111
 112         Arguments:
 113             attempts - Number of times to attempt an operation before failing
 114             min_delay - minimum time that must elapse before session is allowed to kill a component
 115             max_delay - maximum time that may elapse before killing a component
 116             ha_frequency - frequency at which operations are tested for ha
 117             recovery_timeout - time allowed for system to recovery after a component is killed
 118         '''
 119         if not attempts:
 120             attempts = HASession.DEFAULT_ATTEMPTS
 121         if not min_delay:
 122             min_delay = HASession.DEFAULT_MIN_DELAY
 123         if not max_delay:
 124             max_delay = HASession.DEFAULT_MAX_DELAY
 125         if not ha_frequency:
 126             ha_frequency = HASession.DEFAULT_FREQUENCY
 127         if not recovery_timeout:
 128             recovery_timeout = HASession.DEFAULT_RECOVERY_TIMEOUT
 129
 130         self.attempts = attempts
 131         self.min_delay = min_delay
 132         self.max_delay = max_delay
 133         self.ha_frequency = ha_frequency
 134         self.recovery_timeout = recovery_timeout
 135
 136     def call(self, operation, *args, **kwargs):
 137         ''' Call an operation using the wrapped management session, then
 138         kill off a system component, and verify the operation still succeeds
 139
 140         Arguments:
 141             operation - operation to be invoked
 142         '''
 143         # Choose to make the normal session call or do the HA test
 144         if random.choice(range(0,int(1/self.ha_frequency))) != 0:
 145             return operation(*args, **kwargs)
 146
 147         # Make sure we're starting from a running system
 148         rift.vcs.vcs.wait_until_system_started(self.session)
 149
 150         def choose_any_tasklet(vcs_info):
 151             tasklets = [component_info.component_name for component_info in vcs_info.components.component_info]
 152             return random.choice(tasklets)
 153
 154         def choose_restartable_tasklet(vcs_info):
 155             restartable_tasklets = [
 156                 component_info.component_name
 157                 for component_info in vcs_info.components.component_info
 158                     if component_info.recovery_action == 'RESTART'
 159                     and component_info.component_type == 'RWTASKLET'
 160             ]
 161             return random.choice(restartable_tasklets)
 162
 163         vcs_info = self.session.proxy(RwBaseYang).get('/vcs/info')
 164         component_name = choose_restartable_tasklet(vcs_info)
 165
 166         ssh_cmd = 'ssh {} -o StrictHostKeyChecking=no -o BatchMode=yes'.format(self.session.host)
 167         def get_component_process_pid(component_name):
 168             cmd = '{} -- \'ps -ef | grep -v "grep" | grep rwmain | grep "{}" | tr -s " " | cut -d " " -f 2\''.format(ssh_cmd, component_name)
 169             logger.info("Finding component [{}] pid using cmd: {}".format(component_name, cmd))
 170             output = subprocess.check_output(cmd, shell=True)
 171             return output.decode('ascii').strip()
 172         process_pid = get_component_process_pid(component_name)
 173         logger.info('{} has pid {}'.format(component_name, process_pid))
 174
 175         # Kick off a background process to kill the tasklet after some delay
 176         delay = self.min_delay + (self.max_delay-self.min_delay)*random.random()
 177         logger.info("Killing {} [{}] in {}".format(component_name, process_pid, delay))
 178         cmd = '(sleep {} && {} -- "sudo kill -9 {}") &'.format(delay, ssh_cmd, process_pid)
 179         os.system(cmd)
 180
 181         # Invoke session operation
 182         now = time.time()
 183         result = None
 184         attempt = 0
 185         while attempt < self.attempts:
 186             try:
 187                 result = operation(*args, **kwargs)
 188                 # Possible improvement:  implement optional verify step here
 189                 break
 190             except Exception:
 191                 logger.error('operation failed - {}'.format(operation))
 192                 attempt += 1
 193             # If the operation failed, wait until recovery occurs to re-attempt
 194             rift.vcs.vcs.wait_until_system_started(self.session)
 195
 196         if attempt >= self.attempts:
 197             raise MaxRetriesExceededException("Killed %s [%d] - Subsequently failed operation : %s %s %s", component_name, process_pid, operation, args, kwargs )
 198
 199         # Wait until kill has definitely happened
 200         elapsed = now - time.time()
 201         remaining = delay - elapsed
 202         if remaining > 0:
 203             time.sleep(remaining)
 204         time.sleep(3)
 205
 206         # Verify system reaches running status again
 207         rift.vcs.vcs.wait_until_system_started(self.session)
 208
 209         # TODO: verify the tasklet process was actually restarted (got a new pid)
 210         new_pid = get_component_process_pid(component_name)
 211         if process_pid == new_pid:
 212             raise HAVerifyException("Process pid unchanged : %d == %d ~ didn't die?" % (process_pid, new_pid))
 213
 214         return result
 215
 216 @pytest.fixture
 217 def ha_session(mgmt_session):
 218    return HASession(mgmt_session)
 219
 220 @pytest.mark.depends('seed_random')
 221 @pytest.mark.setup('launchpad')
 222 @pytest.mark.incremental
 223 class TestLaunchpadSetup:
 224     def test_create_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
 225         '''Configure cloud accounts
 226
 227         Asserts:
 228             Cloud name and cloud type details
 229         '''
 230         for cloud_account in cloud_accounts:
 231             xpath = "{cloud_xpath}[name={cloud_account_name}]".format(
 232                 cloud_xpath=cloud_xpath,
 233                 cloud_account_name=quoted_key(cloud_account.name)
 234             )
 235             ha_session.call(mgmt_session.proxy(cloud_module).replace_config, xpath, cloud_account)
 236             response = ha_session.call(mgmt_session.proxy(cloud_module).get, xpath)
 237             assert response.name == cloud_account.name
 238             assert response.account_type == cloud_account.account_type
 239
 240 @pytest.mark.teardown('launchpad')
 241 @pytest.mark.incremental
 242 class TestLaunchpadTeardown:
 243     def test_delete_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
 244         '''Unconfigure cloud_account'''
 245         for cloud_account in cloud_accounts:
 246             xpath = "{cloud_xpath}[name={cloud_account_name}]".format(
 247                 cloud_xpath=cloud_xpath,
 248                 cloud_account_name=quoted_key(cloud_account.name)
 249             )
 250             ha_session.call(mgmt_session.proxy(cloud_module).delete_config, xpath)
 251
 252 @pytest.mark.setup('pingpong')
 253 @pytest.mark.depends('launchpad')
 254 @pytest.mark.incremental
 255 class TestSetupPingpong(object):
 256     def test_onboard(self, ha_session, mgmt_session, descriptors):
 257         for descriptor in descriptors:
 258             with ha_session.config(max_delay=15):
 259                 ha_session.call(rift.auto.descriptor.onboard, mgmt_session, descriptor)
 260
 261     def test_instantiate(self, ha_session, mgmt_session, cloud_account_name):
 262         catalog = ha_session.call(mgmt_session.proxy(RwProjectNsdYang).get_config, '/nsd-catalog')
 263         nsd = catalog.nsd[0]
 264         nsr = rift.auto.descriptor.create_nsr(cloud_account_name, "pingpong_1", nsd)
 265         ha_session.call(mgmt_session.proxy(RwNsrYang).create_config, '/ns-instance-config/nsr', nsr)
 266
 267 @pytest.mark.depends('pingpong')
 268 @pytest.mark.teardown('pingpong')
 269 @pytest.mark.incremental
 270 class TestTeardownPingpong(object):
 271     def test_teardown(self, ha_session, mgmt_session):
 272         ns_instance_config = ha_session.call(mgmt_session.proxy(RwNsrYang).get_config, '/ns-instance-config')
 273         for nsr in ns_instance_config.nsr:
 274             ha_session.call(mgmt_session.proxy(RwNsrYang).delete_config, "/ns-instance-config/nsr[id={}]".format(quoted_key(nsr.id)))
 275
 276         time.sleep(60)
 277         vnfr_catalog = ha_session.call(mgmt_session.proxy(RwVnfrYang).get, '/vnfr-catalog')
 278         assert vnfr_catalog is None or len(vnfr_catalog.vnfr) == 0
 279
 280 @pytest.mark.depends('launchpad')
 281 @pytest.mark.incremental
 282 class TestLaunchpad:
 283     def test_account_connection_status(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
 284         '''Verify connection status on each cloud account
 285
 286         Asserts:
 287             Cloud account is successfully connected
 288         '''
 289         for cloud_account in cloud_accounts:
 290             with ha_session.config(attempts=2):
 291                 ha_session.call(
 292                     mgmt_session.proxy(cloud_module).wait_for,
 293                     '{}[name={}]/connection-status/status'.format(cloud_xpath, quoted_key(cloud_account.name)),
 294                     'success',
 295                     timeout=60,
 296                     fail_on=['failure']
 297                 )
 298
 299 @pytest.mark.depends('pingpong')
 300 @pytest.mark.incremental
 301 class TestPingpong:
 302     def test_service_started(self, ha_session, mgmt_session):
 303         nsr_opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
 304         nsrs = nsr_opdata.nsr
 305
 306         for nsr in nsrs:
 307             xpath = (
 308                 "/ns-instance-opdata/nsr[ns-instance-config-ref={ns_instance_config_ref}]/operational-status"
 309             ).format(
 310                 ns_instance_config_ref=quoted_key(nsr.ns_instance_config_ref)
 311             )
 312
 313             with ha_session.config(attempts=2, max_delay=60):
 314                 ha_session.call(mgmt_session.proxy(RwNsrYang).wait_for, xpath, "running", fail_on=['failed'], timeout=300)
 315
 316     def test_service_configured(self, ha_session, mgmt_session):
 317         nsr_opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
 318         nsrs = nsr_opdata.nsr
 319
 320         for nsr in nsrs:
 321             xpath = (
 322                 "/ns-instance-opdata/nsr[ns-instance-config-ref={}]/config-status"
 323             ).format(
 324                 quoted_key(nsr.ns_instance_config_ref)
 325             )
 326
 327             with ha_session.config(attempts=2, max_delay=60):
 328                 ha_session.call(mgmt_session.proxy(RwNsrYang).wait_for, xpath, "configured", fail_on=['failed'], timeout=300)
 329