From 0fa282eee7697427d70eef5255c2113397c7d816 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Thu, 28 May 2020 18:01:13 +0200
Subject: [PATCH] agent: do not hard power off if soft power off actually
 succeeds

Change-Id: I747e7bd2a341c828006e9f2216c81dbf9a01eec6
---
 ironic/drivers/modules/agent_base.py          | 17 +++++--
 .../unit/drivers/modules/test_agent_base.py   | 45 ++++++++++++++++++-
 .../agent-power-off-2115fcfaac030bd0.yaml     |  5 +++
 3 files changed, 62 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml

diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index 0cdd854703..61ef16451c 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -1015,16 +1015,25 @@ class AgentDeployMixin(HeartbeatMixin):
             if not oob_power_off:
                 try:
                     self._client.power_off(node)
-                    _wait_until_powered_off(task)
                 except Exception as e:
-                    LOG.warning('Failed to soft power off node %(node_uuid)s '
-                                'in at least %(timeout)d seconds. '
+                    LOG.warning('Failed to soft power off node %(node_uuid)s. '
                                 '%(cls)s: %(error)s',
                                 {'node_uuid': node.uuid,
-                                 'timeout': (wait * (attempts - 1)) / 1000,
                                  'cls': e.__class__.__name__, 'error': e},
                                 exc_info=not isinstance(
                                     e, exception.IronicException))
+
+                # NOTE(dtantsur): in rare cases it may happen that the power
+                # off request comes through but we never receive the response.
+                # Check the power state before trying to force off.
+                try:
+                    _wait_until_powered_off(task)
+                except Exception:
+                    LOG.warning('Failed to soft power off node %(node_uuid)s '
+                                'in at least %(timeout)d seconds. Forcing '
+                                'hard power off and proceeding.',
+                                {'node_uuid': node.uuid,
+                                 'timeout': (wait * (attempts - 1)) / 1000})
                     manager_utils.node_power_action(task, states.POWER_OFF)
             else:
                 # Flush the file system prior to hard rebooting the node
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index 31a4ade11d..d684fda9b5 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -926,7 +926,10 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
     @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy',
                        autospec=True)
     @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
+    @mock.patch.object(time, 'sleep', lambda seconds: None)
     @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    @mock.patch.object(fake.FakePower, 'get_power_state',
+                       spec=types.FunctionType)
     @mock.patch.object(agent_client.AgentClient, 'power_off',
                        spec=types.FunctionType)
     @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.'
@@ -935,13 +938,15 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
                 'configure_tenant_networks', spec_set=True, autospec=True)
     def test_reboot_and_finish_deploy_soft_poweroff_fails(
             self, configure_tenant_net_mock, remove_provisioning_net_mock,
-            power_off_mock, node_power_action_mock, mock_collect, resume_mock):
+            power_off_mock, get_power_state_mock, node_power_action_mock,
+            mock_collect, resume_mock):
         power_off_mock.side_effect = RuntimeError("boom")
         self.node.provision_state = states.DEPLOYING
         self.node.target_provision_state = states.ACTIVE
         self.node.save()
         with task_manager.acquire(self.context, self.node.uuid,
                                   shared=True) as task:
+            get_power_state_mock.return_value = states.POWER_ON
             self.deploy.reboot_and_finish_deploy(task)
             power_off_mock.assert_called_once_with(task.node)
             node_power_action_mock.assert_has_calls([
@@ -954,6 +959,44 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
             self.assertEqual(states.ACTIVE, task.node.target_provision_state)
             self.assertFalse(mock_collect.called)
 
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy',
+                       autospec=True)
+    @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
+    @mock.patch.object(time, 'sleep', lambda seconds: None)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    @mock.patch.object(fake.FakePower, 'get_power_state',
+                       spec=types.FunctionType)
+    @mock.patch.object(agent_client.AgentClient, 'power_off',
+                       spec=types.FunctionType)
+    @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.'
+                'remove_provisioning_network', spec_set=True, autospec=True)
+    @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.'
+                'configure_tenant_networks', spec_set=True, autospec=True)
+    def test_reboot_and_finish_deploy_soft_poweroff_race(
+            self, configure_tenant_net_mock, remove_provisioning_net_mock,
+            power_off_mock, get_power_state_mock, node_power_action_mock,
+            mock_collect, resume_mock):
+        # Test the situation when soft power off works, but ironic doesn't
+        # learn about it.
+        power_off_mock.side_effect = RuntimeError("boom")
+        self.node.provision_state = states.DEPLOYING
+        self.node.target_provision_state = states.ACTIVE
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=True) as task:
+            get_power_state_mock.side_effect = [states.POWER_ON,
+                                                states.POWER_OFF]
+            self.deploy.reboot_and_finish_deploy(task)
+            power_off_mock.assert_called_once_with(task.node)
+            node_power_action_mock.assert_called_once_with(
+                task, states.POWER_ON)
+            remove_provisioning_net_mock.assert_called_once_with(mock.ANY,
+                                                                 task)
+            configure_tenant_net_mock.assert_called_once_with(mock.ANY, task)
+            self.assertEqual(states.DEPLOYWAIT, task.node.provision_state)
+            self.assertEqual(states.ACTIVE, task.node.target_provision_state)
+            self.assertFalse(mock_collect.called)
+
     @mock.patch.object(manager_utils, 'power_on_node_if_needed')
     @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy',
                        autospec=True)
diff --git a/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml b/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml
new file mode 100644
index 0000000000..74f84fbab3
--- /dev/null
+++ b/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes a rare issue when agent successfully powers off a node after
+    deployment, but ironic never learns about it and does another reboot.