From 5db194c5030f2e71a2bdb6c8bf9ec3d4cedc9f9a Mon Sep 17 00:00:00 2001
From: Julia Kreger <juliaashleykreger@gmail.com>
Date: Wed, 29 Jan 2025 12:49:50 -0800
Subject: [PATCH] Fix agent from being locked out with complex steps

When mixing in-band out-of-band steps, the out-of-band status polling
flag was not being cleared, and was being left to remain in the node
driver_internal_info field, thus preventing future heartbeat operations
from the baremetal node from being processed to check the actual
completion status of a step.

We now always clear the field based upon the workflow in-progress
before starting a new step and should asynchronous steps also
be recorded as a result of any step's actions such as if a reboot
is required.

Special thanks goes to keekz for promptly providing upstream with
the information necessary for us to identify the root cause.

Closes-Bug: 2096938
Change-Id: I5198d9169cff8474c7a990332639b2d0758e6e1a
---
 ironic/common/async_steps.py                        |  6 +++++-
 ironic/conductor/cleaning.py                        |  5 +++++
 ironic/conductor/deployments.py                     |  4 ++++
 ironic/conductor/servicing.py                       |  4 ++++
 ironic/tests/unit/conductor/test_cleaning.py        |  4 +++-
 ironic/tests/unit/conductor/test_deployments.py     |  4 +++-
 ironic/tests/unit/conductor/test_servicing.py       |  4 +++-
 .../tests/unit/drivers/modules/test_deploy_utils.py | 12 ++++++++++++
 ...-polling-lockout-for-steps-b9645f0cae18da1e.yaml | 13 +++++++++++++
 tox.ini                                             |  2 +-
 10 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml

diff --git a/ironic/common/async_steps.py b/ironic/common/async_steps.py
index 8bec847627..d21fc42801 100644
--- a/ironic/common/async_steps.py
+++ b/ironic/common/async_steps.py
@@ -106,7 +106,8 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None,
     :param polling: Boolean value to set for node's driver_internal_info flag
         deployment_polling, servicing_polling or cleaning_polling. If it is
         None, the corresponding polling flag is not set in the node's
-        driver_internal_info.
+        driver_internal_info. A polling flag is otherwise deleted from
+        the node's driver_internal_info.
     :param step_type: The type of steps to process: 'clean', 'service'
         or 'deploy'. If None, detected from the node.
     """
@@ -134,6 +135,9 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None,
         node.set_driver_internal_info(skip_field, skip_current_step)
     if polling is not None:
         node.set_driver_internal_info(polling_field, polling)
+    else:
+        # Always unset the value *if* previously set.
+        node.del_driver_internal_info(polling_field)
     node.save()
 
 
diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py
index 044b64baf6..9c8b052805 100644
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@@ -175,6 +175,11 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
         eocn = step.get('execute_on_child_nodes', False)
         result = None
         try:
+            if async_steps.CLEANING_POLLING in node.driver_internal_info:
+                # We're executing a new step, so a prior polling
+                # flag should be dictated by the new step, *not* a
+                # prior step.
+                node.del_driver_internal_info(async_steps.CLEANING_POLLING)
             if not eocn:
                 LOG.info('Executing %(step)s on node %(node)s',
                          {'step': step, 'node': node.uuid})
diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py
index 06d7ffa308..6a4dbcf074 100644
--- a/ironic/conductor/deployments.py
+++ b/ironic/conductor/deployments.py
@@ -340,6 +340,10 @@ def do_next_deploy_step(task, step_index):
         child_node_execution = step.get('execute_on_child_nodes', False)
         result = None
         try:
+            if async_steps.DEPLOYMENT_POLLING in node.driver_internal_info:
+                # We're going to execute a new step, we should delete any
+                # older/out of date option.
+                node.del_driver_internal_info(async_steps.DEPLOYMENT_POLLING)
             if not child_node_execution:
                 interface = getattr(task.driver, step.get('interface'))
                 LOG.info('Executing %(step)s on node %(node)s',
diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py
index 3264643926..8b385d7c7d 100644
--- a/ironic/conductor/servicing.py
+++ b/ironic/conductor/servicing.py
@@ -133,6 +133,10 @@ def do_next_service_step(task, step_index, disable_ramdisk=None):
         eocn = step.get('execute_on_child_nodes', False)
         result = None
         try:
+            if async_steps.SERVICING_POLLING in node.driver_internal_info:
+                # We're going to execute a new step, we should delete any
+                # older/out of date option.
+                node.del_driver_internal_info(async_steps.SERVICING_POLLING)
             if not eocn:
                 LOG.info('Executing %(step)s on node %(node)s',
                          {'step': step, 'node': node.uuid})
diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py
index 1906ce430b..6f17585706 100644
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@@ -607,7 +607,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
             target_provision_state=tgt_prov_state,
             last_error=None,
             driver_internal_info={'clean_steps': self.clean_steps,
-                                  'clean_step_index': 0},
+                                  'clean_step_index': 0,
+                                  'cleaning_polling': True},
             clean_step=self.clean_steps[0])
         mock_execute.return_value = return_state
 
@@ -623,6 +624,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
         self.assertEqual(1, node.driver_internal_info['clean_step_index'])
         mock_execute.assert_called_once_with(
             mock.ANY, mock.ANY, self.clean_steps[1])
+        self.assertNotIn('cleaning_polling', node.driver_internal_info)
 
     def test_do_next_clean_step_continue_from_last_cleaning(self):
         self._do_next_clean_step_continue_from_last_cleaning(states.CLEANWAIT)
diff --git a/ironic/tests/unit/conductor/test_deployments.py b/ironic/tests/unit/conductor/test_deployments.py
index c96a6fae1a..a9a6e493b8 100644
--- a/ironic/tests/unit/conductor/test_deployments.py
+++ b/ironic/tests/unit/conductor/test_deployments.py
@@ -667,7 +667,8 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
                 autospec=True)
     def test__do_next_deploy_step_in_deploywait(self, mock_execute):
         driver_internal_info = {'deploy_step_index': None,
-                                'deploy_steps': self.deploy_steps}
+                                'deploy_steps': self.deploy_steps,
+                                'deployment_polling': True}
         self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake-hardware',
@@ -692,6 +693,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
         self.assertEqual(states.ACTIVE, node.target_provision_state)
         self.assertEqual(expected_first_step, node.deploy_step)
         self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
+        self.assertNotIn('deployment_polling', node.driver_internal_info)
         mock_execute.assert_called_once_with(mock.ANY, task,
                                              self.deploy_steps[0])
 
diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py
index 81e0d5d7e5..22c2612d96 100644
--- a/ironic/tests/unit/conductor/test_servicing.py
+++ b/ironic/tests/unit/conductor/test_servicing.py
@@ -374,7 +374,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
             target_provision_state=tgt_prov_state,
             last_error=None,
             driver_internal_info={'service_steps': self.service_steps,
-                                  'service_step_index': 0},
+                                  'service_step_index': 0,
+                                  'servicing_polling': True},
             service_step=self.service_steps[0])
         mock_execute.return_value = return_state
 
@@ -390,6 +391,7 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
         self.assertEqual(1, node.driver_internal_info['service_step_index'])
         mock_execute.assert_called_once_with(
             mock.ANY, mock.ANY, self.service_steps[1])
+        self.assertNotIn('servicing_polling', node.driver_internal_info)
 
     def test_do_next_clean_step_continue_from_last_cleaning(self):
         self._do_next_clean_step_continue_from_last_cleaning(
diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
index c8940ff5ec..c9f6854581 100644
--- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py
+++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
@@ -3307,3 +3307,15 @@ class AsyncStepTestCase(db_base.DbTestCase):
                                    skip_current_step=True,
                                    polling=True)
         self.assertEqual(expected, self.node.driver_internal_info)
+
+    def test_set_async_step_flags_clears_polling_if_not_set(self):
+        self.node.clean_step = {'step': 'create_configuration',
+                                'interface': 'raid'}
+        self.node.driver_internal_info = {'agent_secret_token': 'test',
+                                          'cleaning_polling': True}
+        expected = {'cleaning_reboot': True,
+                    'skip_current_clean_step': True}
+        self.node.save()
+        utils.set_async_step_flags(self.node, reboot=True,
+                                   skip_current_step=True)
+        self.assertEqual(expected, self.node.driver_internal_info)
diff --git a/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml b/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml
new file mode 100644
index 0000000000..d5e22db4e6
--- /dev/null
+++ b/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Fixes an issue where operators executing complex arrangement of steps
+    which include out-of-band and in-band steps, for example a hardware
+    RAID ``create_configuration`` step followed by in-band steps inside of
+    the agent, would effectively get the agent stuck in a ``wait`` state in
+    the Cleaning, Servicing, or Deploying workflows.
+    This was related to the way out-of-band steps are executed and monitored.
+    Ironic, before starting to execute a new step, now cleans the polling
+    lockout flag for the respective workflow being executed to prevent the
+    agent from getting stuck. For more information, please see
+    `bug 2096938 <https://bugs.launchpad.net/ironic/+bug/2096938>`_.
diff --git a/tox.ini b/tox.ini
index 1eb65623a4..801b9f7aee 100644
--- a/tox.ini
+++ b/tox.ini
@@ -162,7 +162,7 @@ filename = *.py,app.wsgi
 exclude=.*,dist,doc,*lib/python*,*egg,build
 import-order-style = pep8
 application-import-names = ironic
-max-complexity=19
+max-complexity=20
 # [H106] Don't put vim configuration in source files.
 # [H203] Use assertIs(Not)None to check for None.
 # [H204] Use assert(Not)Equal to check for equality.