Fix bug 1236: Retry if leader unit is not obtained 32/9832/4
authorDavid Garcia <david.garcia@canonical.com>
Thu, 15 Oct 2020 11:16:45 +0000 (13:16 +0200)
committerDavid Garcia <david.garcia@canonical.com>
Wed, 21 Oct 2020 11:28:23 +0000 (13:28 +0200)
There a racing condition when N2VC tries to get the leader unit, but there
leader-elected hook has not been triggered. In that case, day-1 action might fail.

This commit solved this issue by retrying a couple of times.

Change-Id: If79cd243aa9ebdf8ed0e6235481eeb9fd2640612
Signed-off-by: David Garcia <david.garcia@canonical.com>
n2vc/libjuju.py
n2vc/tests/unit/test_libjuju.py

index a00fa58..28474a0 100644 (file)
@@ -22,6 +22,7 @@ from juju.errors import JujuAPIError
 from juju.model import Model
 from juju.machine import Machine
 from juju.application import Application
+from juju.unit import Unit
 from juju.client._definitions import (
     FullStatus,
     QueryApplicationOffersResults,
@@ -629,11 +630,21 @@ class Libjuju:
             if application is None:
                 raise JujuApplicationNotFound("Cannot execute action")
 
-            # Get unit
+            # Get leader unit
+            # Racing condition:
+            #   Ocassionally, self._get_leader_unit() will return None
+            #   because the leader elected hook has not been triggered yet.
+            #   Therefore, we are doing some retries. If it happens again,
+            #   re-open bug 1236
+            attempts = 3
+            time_between_retries = 10
             unit = None
-            for u in application.units:
-                if await u.is_leader_from_status():
-                    unit = u
+            for _ in range(attempts):
+                unit = await self._get_leader_unit(application)
+                if unit is None:
+                    await asyncio.sleep(time_between_retries)
+                else:
+                    break
             if unit is None:
                 raise JujuLeaderUnitNotFound(
                     "Cannot execute action: leader unit not found"
@@ -1166,3 +1177,11 @@ class Libjuju:
             await controller.remove_cloud(name)
         finally:
             await self.disconnect_controller(controller)
+
+    async def _get_leader_unit(self, application: Application) -> Unit:
+        unit = None
+        for u in application.units:
+            if await u.is_leader_from_status():
+                unit = u
+                break
+        return unit
index 8c16c9d..123da4a 100644 (file)
@@ -30,6 +30,7 @@ from n2vc.exceptions import (
     JujuActionNotFound,
     JujuApplicationExists,
     JujuInvalidK8sConfiguration,
+    JujuLeaderUnitNotFound,
 )
 
 
@@ -579,7 +580,35 @@ class ExecuteActionTest(LibjujuTestCase):
         mock_disconnect_controller.assert_called()
         mock_disconnect_model.assert_called()
 
-    # TODO no leader unit found exception
+    @asynctest.mock.patch("asyncio.sleep")
+    @asynctest.mock.patch("n2vc.tests.unit.utils.FakeUnit.is_leader_from_status")
+    def test_no_leader(
+        self,
+        mock_is_leader_from_status,
+        mock_sleep,
+        mock_get_action_status,
+        mock_get_action_output,
+        mock_wait_for,
+        mock__get_application,
+        mock_disconnect_controller,
+        mock_disconnect_model,
+        mock_get_model,
+        mock_get_controller,
+    ):
+        mock_get_model.return_value = juju.model.Model()
+        mock__get_application.return_value = FakeApplication()
+        mock_is_leader_from_status.return_value = False
+        output = None
+        status = None
+        with self.assertRaises(JujuLeaderUnitNotFound):
+            output, status = self.loop.run_until_complete(
+                self.libjuju.execute_action("app", "model", "action",)
+            )
+        self.assertIsNone(output)
+        self.assertIsNone(status)
+
+        mock_disconnect_controller.assert_called()
+        mock_disconnect_model.assert_called()
 
     def test_succesful_exec(
         self,