02ed3a5103ce40425ccb310e68970153a5cb4306
[osm/SO.git] / rwlaunchpad / ra / pytest / ns / pingpong / test_ha_pingpong.py
1 #!/usr/bin/env python3
2 """
3 #
4 # Copyright 2016 RIFT.IO Inc
5 #
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
9 #
10 # http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
17 #
18
19 @file test_ha_pingpong.py
20 @author Paul Laidler (Paul.Laidler@riftio.com)
21 @date 07/07/2016
22 @brief High-availability system test that runs ping pong workflow
23 """
24
25 import gi
26 import logging
27 import os
28 import pytest
29 import random
30 import re
31 import subprocess
32 import sys
33 import time
34 import uuid
35
36 from contextlib import contextmanager
37
38 import rift.auto.mano
39 import rift.auto.session
40 import rift.auto.descriptor
41
42 gi.require_version('RwVnfrYang', '1.0')
43 from gi.repository import (
44 NsrYang,
45 RwProjectNsdYang,
46 VnfrYang,
47 RwNsrYang,
48 RwVnfrYang,
49 RwBaseYang,
50 )
51
52 gi.require_version('RwKeyspec', '1.0')
53 from gi.repository.RwKeyspec import quoted_key
54
55 logging.basicConfig(level=logging.DEBUG)
56 logger = logging.getLogger(__name__)
57
@pytest.mark.setup('seed_random')
class TestSeedRandom:
    ''' Seeds the random module so the rest of the run uses a known seed
    '''

    def test_seed_random(self, random_seed):
        ''' Log the seed supplied by the random_seed fixture and apply it

        Arguments:
            random_seed - seed value passed to random.seed
        '''
        message = "Seeding number generator with seed {}".format(random_seed)
        logger.info(message)
        random.seed(random_seed)
63
class MaxRetriesExceededException(Exception):
    '''Raised when an operation has been retried the maximum allowed
    number of times without succeeding
    '''
68
class HAVerifyException(Exception):
    '''Raised when correct HA behaviour could not be verified
    '''
73
74
class HASession:
    ''' Wrapper around management session, which kills off system components
    in order to trigger HA functionality
    '''

    DEFAULT_ATTEMPTS = 3
    DEFAULT_MIN_DELAY = 0.0
    DEFAULT_MAX_DELAY = 1
    DEFAULT_FREQUENCY = 1
    DEFAULT_RECOVERY_TIMEOUT = 120

    def __init__(self, session):
        ''' Create a new HASession instance using the default config

        Arguments:
            session - management session to wrap
        '''
        self.session = session
        self.set_config()

    @contextmanager
    def config(self, *args, **kwargs):
        ''' Context manager to allow HASession to temporarily have its config modified

        Accepts the same arguments as set_config. The previous config is
        restored when the with-block exits, including when it exits by
        raising an exception.
        '''
        current_config = self.get_config()
        self.set_config(*args, **kwargs)
        try:
            yield
        finally:
            # Restore even if the body raised, so a failed call does not leak
            # its temporary config into later calls on the same session
            self.set_config(*current_config)

    def get_config(self):
        ''' Returns the current HA session config

        Returns:
            tuple of (attempts, min_delay, max_delay, ha_frequency, recovery_timeout),
            in the same order as the positional arguments of set_config
        '''
        return (self.attempts, self.min_delay, self.max_delay, self.ha_frequency, self.recovery_timeout)

    def set_config(self, attempts=None, min_delay=None, max_delay=None, ha_frequency=None, recovery_timeout=None):
        ''' Set the HA session config, set default values for all config options not provided

        Arguments:
            attempts - Number of times to attempt an operation before failing
            min_delay - minimum time that must elapse before session is allowed to kill a component
            max_delay - maximum time that may elapse before killing a component
            ha_frequency - frequency at which operations are tested for ha
            recovery_timeout - time allowed for system to recover after a component is killed
        '''
        # Zero is not a usable value for attempts (nothing would ever be tried)
        # or ha_frequency (it is used as a divisor), so any falsy value falls
        # back to the default for these two options
        if not attempts:
            attempts = HASession.DEFAULT_ATTEMPTS
        if not ha_frequency:
            ha_frequency = HASession.DEFAULT_FREQUENCY

        # An explicit 0 is a legitimate delay/timeout, so only None means
        # "not provided" for the remaining options
        if min_delay is None:
            min_delay = HASession.DEFAULT_MIN_DELAY
        if max_delay is None:
            max_delay = HASession.DEFAULT_MAX_DELAY
        if recovery_timeout is None:
            recovery_timeout = HASession.DEFAULT_RECOVERY_TIMEOUT

        self.attempts = attempts
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.ha_frequency = ha_frequency
        self.recovery_timeout = recovery_timeout

    def call(self, operation, *args, **kwargs):
        ''' Call an operation using the wrapped management session, then
        kill off a system component, and verify the operation still succeeds

        Arguments:
            operation - operation to be invoked
            *args, **kwargs - passed through to operation unchanged

        Returns:
            the value returned by operation

        Raises:
            MaxRetriesExceededException - operation failed on every attempt
            HAVerifyException - the component pid could not be found, or was
                unchanged after the kill
        '''
        # Choose to make the normal session call or do the HA test
        # max() keeps the range non-empty when ha_frequency is greater than 1
        if random.choice(range(0, max(1, int(1 / self.ha_frequency)))) != 0:
            return operation(*args, **kwargs)

        # Make sure we're starting from a running system
        # NOTE(review): the imports visible at the top of this file only name
        # rift.auto.*; presumably rift.vcs.vcs is loaded as a side effect of
        # one of them - confirm
        rift.vcs.vcs.wait_until_system_started(self.session)

        def choose_restartable_tasklet(vcs_info):
            restartable_tasklets = [
                component_info.component_name
                for component_info in vcs_info.components.component_info
                if component_info.recovery_action == 'RESTART'
                and component_info.component_type == 'RWTASKLET'
            ]
            return random.choice(restartable_tasklets)

        vcs_info = self.session.proxy(RwBaseYang).get('/vcs/info')
        component_name = choose_restartable_tasklet(vcs_info)

        ssh_cmd = 'ssh {} -o StrictHostKeyChecking=no -o BatchMode=yes'.format(self.session.host)

        def get_component_process_pid(component_name):
            # Returns the pid as a string (stripped output of the remote ps pipeline)
            cmd = '{} -- \'ps -ef | grep -v "grep" | grep rwmain | grep "{}" | tr -s " " | cut -d " " -f 2\''.format(ssh_cmd, component_name)
            logger.info("Finding component [{}] pid using cmd: {}".format(component_name, cmd))
            output = subprocess.check_output(cmd, shell=True)
            return output.decode('ascii').strip()

        process_pid = get_component_process_pid(component_name)
        logger.info('{} has pid {}'.format(component_name, process_pid))
        if not process_pid:
            # Without a pid the kill below would be a no-op and the pid
            # comparison at the end would be meaningless
            raise HAVerifyException("Unable to find pid for component {}".format(component_name))

        # Kick off a background process to kill the tasklet after some delay
        delay = self.min_delay + (self.max_delay-self.min_delay)*random.random()
        logger.info("Killing {} [{}] in {}".format(component_name, process_pid, delay))
        cmd = '(sleep {} && {} -- "sudo kill -9 {}") &'.format(delay, ssh_cmd, process_pid)
        os.system(cmd)

        # Invoke session operation
        started_at = time.time()
        result = None
        attempt = 0
        while attempt < self.attempts:
            try:
                result = operation(*args, **kwargs)
                # Possible improvement: implement optional verify step here
                break
            except Exception:
                # logger.exception keeps the traceback of the failed attempt
                logger.exception('operation failed - {}'.format(operation))
                attempt += 1
                # If the operation failed, wait until recovery occurs to re-attempt
                rift.vcs.vcs.wait_until_system_started(self.session)

        if attempt >= self.attempts:
            raise MaxRetriesExceededException(
                "Killed {} [{}] - Subsequently failed operation : {} {} {}".format(
                    component_name, process_pid, operation, args, kwargs
                )
            )

        # Wait until kill has definitely happened
        elapsed = time.time() - started_at
        remaining = delay - elapsed
        if remaining > 0:
            time.sleep(remaining)
        # Extra fixed wait after the scheduled kill time
        time.sleep(3)

        # Verify system reaches running status again
        rift.vcs.vcs.wait_until_system_started(self.session)

        # Verify the tasklet process was actually restarted (got a new pid)
        new_pid = get_component_process_pid(component_name)
        if process_pid == new_pid:
            # pids are strings here, so they are formatted with %s
            raise HAVerifyException("Process pid unchanged : %s == %s ~ didn't die?" % (process_pid, new_pid))

        return result
215
@pytest.fixture
def ha_session(mgmt_session):
    ''' Fixture providing an HASession that wraps the mgmt_session fixture
    '''
    wrapped_session = HASession(mgmt_session)
    return wrapped_session
219
@pytest.mark.depends('seed_random')
@pytest.mark.setup('launchpad')
@pytest.mark.incremental
class TestLaunchpadSetup:
    ''' Launchpad setup steps, issued through the HA session
    '''

    def test_create_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
        '''Configure each cloud account and read it back

        Asserts:
            The account read back has the configured name and account type
        '''
        xpath_template = "{cloud_xpath}[name={cloud_account_name}]"
        for account in cloud_accounts:
            account_xpath = xpath_template.format(
                cloud_xpath=cloud_xpath,
                cloud_account_name=quoted_key(account.name),
            )

            # Write the account config, then fetch it again for comparison
            ha_session.call(mgmt_session.proxy(cloud_module).replace_config, account_xpath, account)
            stored = ha_session.call(mgmt_session.proxy(cloud_module).get, account_xpath)

            assert stored.name == account.name
            assert stored.account_type == account.account_type
239
@pytest.mark.teardown('launchpad')
@pytest.mark.incremental
class TestLaunchpadTeardown:
    ''' Launchpad teardown steps, issued through the HA session
    '''

    def test_delete_cloud_accounts(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
        '''Delete the config of every cloud account'''
        xpath_template = "{cloud_xpath}[name={cloud_account_name}]"
        for account in cloud_accounts:
            account_xpath = xpath_template.format(
                cloud_xpath=cloud_xpath,
                cloud_account_name=quoted_key(account.name),
            )
            ha_session.call(mgmt_session.proxy(cloud_module).delete_config, account_xpath)
251
@pytest.mark.setup('pingpong')
@pytest.mark.depends('launchpad')
@pytest.mark.incremental
class TestSetupPingpong(object):
    ''' Onboards the descriptors and instantiates the pingpong network service
    '''

    def test_onboard(self, ha_session, mgmt_session, descriptors):
        ''' Onboard every descriptor, with max_delay raised to 15 for each call
        '''
        for package in descriptors:
            with ha_session.config(max_delay=15):
                ha_session.call(rift.auto.descriptor.onboard, mgmt_session, package)

    def test_instantiate(self, ha_session, mgmt_session, cloud_account_name):
        ''' Create an NSR named "pingpong_1" from the first NSD in the catalog
        '''
        nsd_catalog = ha_session.call(mgmt_session.proxy(RwProjectNsdYang).get_config, '/nsd-catalog')
        first_nsd = nsd_catalog.nsd[0]
        new_nsr = rift.auto.descriptor.create_nsr(cloud_account_name, "pingpong_1", first_nsd)
        ha_session.call(mgmt_session.proxy(RwNsrYang).create_config, '/ns-instance-config/nsr', new_nsr)
266
@pytest.mark.depends('pingpong')
@pytest.mark.teardown('pingpong')
@pytest.mark.incremental
class TestTeardownPingpong(object):
    ''' Removes the network service instances created by the pingpong setup
    '''

    def test_teardown(self, ha_session, mgmt_session):
        ''' Delete every configured NSR, then check the VNFR catalog

        Asserts:
            The VNFR catalog is absent or has no entries 60 seconds after
            the deletes were issued
        '''
        nsr_config = ha_session.call(mgmt_session.proxy(RwNsrYang).get_config, '/ns-instance-config')
        for instance in nsr_config.nsr:
            instance_xpath = "/ns-instance-config/nsr[id={}]".format(quoted_key(instance.id))
            ha_session.call(mgmt_session.proxy(RwNsrYang).delete_config, instance_xpath)

        # Fixed wait before inspecting the VNFR catalog
        time.sleep(60)
        catalog = ha_session.call(mgmt_session.proxy(RwVnfrYang).get, '/vnfr-catalog')
        if catalog is not None:
            assert len(catalog.vnfr) == 0
279
@pytest.mark.depends('launchpad')
@pytest.mark.incremental
class TestLaunchpad:
    ''' Launchpad checks, issued through the HA session
    '''

    def test_account_connection_status(self, ha_session, mgmt_session, cloud_module, cloud_xpath, cloud_accounts):
        '''Wait for each cloud account to report a successful connection

        Each wait is made with the HA session limited to 2 attempts; it
        waits up to 60 for status 'success' and fails on 'failure'.
        '''
        for account in cloud_accounts:
            status_xpath = '{}[name={}]/connection-status/status'.format(
                cloud_xpath, quoted_key(account.name)
            )
            with ha_session.config(attempts=2):
                wait_for = mgmt_session.proxy(cloud_module).wait_for
                ha_session.call(wait_for, status_xpath, 'success', timeout=60, fail_on=['failure'])
298
@pytest.mark.depends('pingpong')
@pytest.mark.incremental
class TestPingpong:
    ''' Checks on the instantiated pingpong network service
    '''

    def test_service_started(self, ha_session, mgmt_session):
        ''' Wait for every NSR's operational-status to become "running"

        Each wait uses 2 attempts and a max_delay of 60, waits up to 300,
        and fails on "failed".
        '''
        opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
        status_template = "/ns-instance-opdata/nsr[ns-instance-config-ref={ns_instance_config_ref}]/operational-status"

        for record in opdata.nsr:
            status_xpath = status_template.format(
                ns_instance_config_ref=quoted_key(record.ns_instance_config_ref)
            )
            with ha_session.config(attempts=2, max_delay=60):
                wait_for = mgmt_session.proxy(RwNsrYang).wait_for
                ha_session.call(wait_for, status_xpath, "running", fail_on=['failed'], timeout=300)

    def test_service_configured(self, ha_session, mgmt_session):
        ''' Wait for every NSR's config-status to become "configured"

        Each wait uses 2 attempts and a max_delay of 60, waits up to 300,
        and fails on "failed".
        '''
        opdata = ha_session.call(mgmt_session.proxy(RwNsrYang).get, '/ns-instance-opdata')
        status_template = "/ns-instance-opdata/nsr[ns-instance-config-ref={}]/config-status"

        for record in opdata.nsr:
            status_xpath = status_template.format(quoted_key(record.ns_instance_config_ref))
            with ha_session.config(attempts=2, max_delay=60):
                wait_for = mgmt_session.proxy(RwNsrYang).wait_for
                ha_session.call(wait_for, status_xpath, "configured", fail_on=['failed'], timeout=300)
329