From 7dd611dc5eda53a25a942be237731dca0d7e23d9 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Fri, 21 Aug 2020 12:38:45 +0200
Subject: [PATCH] Ensure in-band deploy steps are present in time for
 fast-track deployments

Currently we load in-band deploy steps on the first heartbeat in DEPLOYWAIT.
With fast-track, however, this heartbeat may happen way too late, because
the node is up and heartbeating before it's moved to DEPLOYWAIT. This
results in in-band deploy steps not loaded. This change forces a refresh
of deploy steps in deploy.deploy if fast-track is used.

Also make sure that cached steps are cleared on reboot or power off since
they may become outdated next time the agent boots.

Change-Id: I003543452717183b9b3359e368757bcd824a5ce7
---
 ironic/conductor/utils.py                      | 18 +++++++++++++++---
 ironic/drivers/modules/agent.py                |  3 +++
 ironic/drivers/modules/iscsi_deploy.py         |  3 +++
 ironic/tests/unit/conductor/test_utils.py      |  5 ++++-
 .../tests/unit/drivers/modules/test_agent.py   |  5 ++++-
 .../unit/drivers/modules/test_iscsi_deploy.py  |  7 ++++++-
 .../fast-track-steps-81bd79a2a91e1b30.yaml     |  7 +++++++
 7 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 releasenotes/notes/fast-track-steps-81bd79a2a91e1b30.yaml

diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index b30fdee5e1..413cb0ce4a 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -284,14 +284,13 @@ def node_power_action(task, new_state, timeout=None):
         driver_internal_info = node.driver_internal_info
         driver_internal_info['last_power_state_change'] = str(
             timeutils.utcnow().isoformat())
+        node.driver_internal_info = driver_internal_info
         # NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in
         # fast-track (or an accidentally booted agent) will cause subsequent
         # actions to fail.
         if target_state in (states.POWER_OFF, states.SOFT_POWER_OFF,
                             states.REBOOT, states.SOFT_REBOOT):
-            if not is_agent_token_pregenerated(node):
-                driver_internal_info.pop('agent_secret_token', False)
-        node.driver_internal_info = driver_internal_info
+            wipe_internal_info_on_power_off(node)
         node.save()
 
     # take power action
@@ -451,6 +450,19 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
         task.process_event('fail', target_state=target_state)
 
 
+def wipe_internal_info_on_power_off(node):
+    """Wipe information that should not survive reboot/power off."""
+    driver_internal_info = node.driver_internal_info
+    if not is_agent_token_pregenerated(node):
+        # Wipe the token if it's not pre-generated, otherwise we'll refuse to
+        # generate it again for the newly booted agent.
+        driver_internal_info.pop('agent_secret_token', False)
+    # Wipe cached steps since they may change after reboot.
+    driver_internal_info.pop('agent_cached_deploy_steps', None)
+    driver_internal_info.pop('agent_cached_clean_steps', None)
+    node.driver_internal_info = driver_internal_info
+
+
 def wipe_token_and_url(task):
     """Remove agent URL and token from the task."""
     info = task.node.driver_internal_info
diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py
index 1466cb7650..8418c533ce 100644
--- a/ironic/drivers/modules/agent.py
+++ b/ironic/drivers/modules/agent.py
@@ -489,6 +489,9 @@ class AgentDeploy(AgentDeployMixin, agent_base.AgentBaseMixin,
             # immediately to the next deploy step.
             LOG.debug('Performing a fast track deployment for %(node)s.',
                       {'node': task.node.uuid})
+            # NOTE(dtantsur): while the node is up and heartbeating, we don't
+            # necessary have the deploy steps cached. Force a refresh here.
+            self.refresh_steps(task, 'deploy')
         elif task.driver.storage.should_write_image(task):
             # Check if the driver has already performed a reboot in a previous
             # deploy step.
diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py
index 8a48431363..c1f15d990b 100644
--- a/ironic/drivers/modules/iscsi_deploy.py
+++ b/ironic/drivers/modules/iscsi_deploy.py
@@ -655,6 +655,9 @@ class ISCSIDeploy(agent_base.AgentDeployMixin, agent_base.AgentBaseMixin,
                       {'node': task.node.uuid})
             deploy_utils.cache_instance_image(task.context, node)
             check_image_size(task)
+            # NOTE(dtantsur): while the node is up and heartbeating, we don't
+            # necessary have the deploy steps cached. Force a refresh here.
+            self.refresh_steps(task, 'deploy')
         elif task.driver.storage.should_write_image(task):
             # Standard deploy process
             deploy_utils.cache_instance_image(task.context, node)
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index 0dea519e21..776a072179 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -225,7 +225,8 @@ class NodePowerActionTestCase(db_base.DbTestCase):
     @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
     def test_node_power_action_power_off(self, get_power_mock):
         """Test node_power_action to turn node power off."""
-        dii = {'agent_secret_token': 'token'}
+        dii = {'agent_secret_token': 'token',
+               'agent_cached_deploy_steps': ['steps']}
         node = obj_utils.create_test_node(self.context,
                                           uuid=uuidutils.generate_uuid(),
                                           driver='fake-hardware',
@@ -243,6 +244,8 @@ class NodePowerActionTestCase(db_base.DbTestCase):
         self.assertIsNone(node['target_power_state'])
         self.assertIsNone(node['last_error'])
         self.assertNotIn('agent_secret_token', node['driver_internal_info'])
+        self.assertNotIn('agent_cached_deploy_steps',
+                         node['driver_internal_info'])
 
     @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
     def test_node_power_action_power_off_pregenerated_token(self,
diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py
index 7db79b066b..926f665240 100644
--- a/ironic/tests/unit/drivers/modules/test_agent.py
+++ b/ironic/tests/unit/drivers/modules/test_agent.py
@@ -467,13 +467,15 @@ class TestAgentDeploy(db_base.DbTestCase):
             self.assertIsNone(driver_return)
             self.assertFalse(mock_power.called)
 
+    @mock.patch.object(agent.AgentDeploy, 'refresh_steps', autospec=True)
     @mock.patch.object(agent_client.AgentClient, 'prepare_image',
                        autospec=True)
     @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True)
     @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
     @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
     def test_deploy_fast_track(self, power_mock, mock_pxe_instance,
-                               mock_is_fast_track, prepare_image_mock):
+                               mock_is_fast_track, prepare_image_mock,
+                               refresh_mock):
         mock_is_fast_track.return_value = True
         self.node.target_provision_state = states.ACTIVE
         self.node.provision_state = states.DEPLOYING
@@ -488,6 +490,7 @@ class TestAgentDeploy(db_base.DbTestCase):
             self.assertEqual(states.DEPLOYING, task.node.provision_state)
             self.assertEqual(states.ACTIVE,
                              task.node.target_provision_state)
+            refresh_mock.assert_called_once_with(self.driver, task, 'deploy')
 
     @mock.patch.object(noop_storage.NoopStorage, 'detach_volumes',
                        autospec=True)
diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
index b6eb127717..1cd12feb00 100644
--- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
+++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py
@@ -855,6 +855,8 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
             self.assertIsNone(ret)
             self.assertFalse(mock_node_power_action.called)
 
+    @mock.patch.object(iscsi_deploy.ISCSIDeploy, 'refresh_steps',
+                       autospec=True)
     @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True)
     @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True)
     @mock.patch.object(iscsi_deploy.ISCSIDeploy, 'write_image',
@@ -864,7 +866,8 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
     @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
     def test_deploy_fast_track(self, power_mock, mock_pxe_instance,
                                mock_is_fast_track, write_image_mock,
-                               cache_image_mock, check_image_size_mock):
+                               cache_image_mock, check_image_size_mock,
+                               refresh_mock):
         mock_is_fast_track.return_value = True
         self.node.target_provision_state = states.ACTIVE
         self.node.provision_state = states.DEPLOYING
@@ -885,6 +888,8 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
             cache_image_mock.assert_called_with(mock.ANY, task.node)
             check_image_size_mock.assert_called_with(task)
             self.assertFalse(write_image_mock.called)
+            refresh_mock.assert_called_once_with(task.driver.deploy,
+                                                 task, 'deploy')
 
     @mock.patch.object(noop_storage.NoopStorage, 'detach_volumes',
                        autospec=True)
diff --git a/releasenotes/notes/fast-track-steps-81bd79a2a91e1b30.yaml b/releasenotes/notes/fast-track-steps-81bd79a2a91e1b30.yaml
new file mode 100644
index 0000000000..68e2ecce1c
--- /dev/null
+++ b/releasenotes/notes/fast-track-steps-81bd79a2a91e1b30.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue that caused in-band deploy steps inserted before
+    ``write_image`` to be skipped when fast-track is used.
+  - |
+    Makes sure in-band deploy and clean steps are not cached across reboots.