From 4f924f2d648712acabcbdfeaa0733ec140e582ca Mon Sep 17 00:00:00 2001
From: Przemyslaw Szczerbik <przemyslaw.szczerbik@intel.com>
Date: Wed, 29 May 2024 03:25:10 -0700
Subject: [PATCH] Fix execution of node servicing steps exposed by IPA's
 HardwareManager

Implement execute_service_step() in AgentBaseMixin that will
asynchronously execute service step on the agent. Without it, Ironic
will try to find <step_name> attribute on the object that implements
interface specified by the servicing step.

Example:

Step: [{"interface": "deploy", "step": "burnin_cpu"}]
Error: AttributeError: 'AgentDeploy' object has no attribute 'burnin_cpu'

Closes-Bug: #2069430

Change-Id: Idb1d5b50656c3765ea5c9e21b7844946ae4cfc67
Signed-off-by: Przemyslaw Szczerbik <przemyslaw.szczerbik@intel.com>
---
 ironic/common/exception.py                    |  4 ++
 ironic/drivers/modules/agent_base.py          | 53 +++++++++++++++----
 .../unit/drivers/modules/test_agent_base.py   | 15 ++++++
 .../notes/bug-2069430-cb58c9beaa7a35de.yaml   |  6 +++
 4 files changed, 69 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/bug-2069430-cb58c9beaa7a35de.yaml

diff --git a/ironic/common/exception.py b/ironic/common/exception.py
index 59d60f3ba0..442ed2dd4b 100644
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@@ -567,6 +567,10 @@ class NodeCleaningFailure(IronicException):
     _msg_fmt = _("Failed to clean node %(node)s: %(reason)s")
 
 
+class NodeServicingFailure(IronicException):
+    _msg_fmt = _("Failed to service node %(node)s: %(reason)s")
+
+
 class PathNotFound(IronicException):
     _msg_fmt = _("Path %(dir)s does not exist.")
 
diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index f9cdd08599..6d915490b0 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -104,6 +104,8 @@ _FASTTRACK_HEARTBEAT_ALLOWED = _HEARTBEAT_ALLOWED + (states.MANAGEABLE,
                                                      states.ENROLL)
 FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED)
 
+_VALID_STEP_TYPES = ('clean', 'deploy', 'service')
+
 
 @METRICS.timer('AgentBase.post_clean_step_hook')
 def post_clean_step_hook(interface, step):
@@ -354,11 +356,24 @@ def find_step(task, step_type, interface, name):
         steps, {'interface': interface, 'step': name})
 
 
+def _validate_step_type(step_type):
+    if step_type not in _VALID_STEP_TYPES:
+        err_msg = _('Invalid step type "%(step_type)s". Valid step types: '
+                    '%(valid_step_types)s') % {
+                        'step_type': step_type,
+                        'valid_step_types': _VALID_STEP_TYPES}
+        LOG.error(err_msg)
+        raise exception.InvalidParameterValue(err_msg)
+
+
 def _raise(step_type, msg):
-    assert step_type in ('clean', 'deploy')
-    exc = (exception.NodeCleaningFailure if step_type == 'clean'
-           else exception.InstanceDeployFailure)
-    raise exc(msg)
+    _validate_step_type(step_type)
+    step_to_exc = {
+        'clean': exception.NodeCleaningFailure,
+        'deploy': exception.InstanceDeployFailure,
+        'service': exception.NodeServicingFailure,
+    }
+    raise step_to_exc[step_type](msg)
 
 
 def execute_step(task, step, step_type, client=None):
@@ -369,9 +384,10 @@ def execute_step(task, step, step_type, client=None):
     :param step_type: 'clean' or 'deploy'
     :param client: agent client (if available)
     :raises: NodeCleaningFailure (clean step) or InstanceDeployFailure (deploy
-        step) if the agent does not return a command status.
-    :returns: states.CLEANWAIT/DEPLOYWAIT to signify the step will be
-        completed async
+        step) or NodeServicingFailure (service step) if the agent does not
+        return a command status.
+    :returns: states.CLEANWAIT/DEPLOYWAIT/SERVICEWAIT to signify the step will
+        be completed async
     """
     if client is None:
         client = agent_client.get_client(task)
@@ -383,7 +399,13 @@ def execute_step(task, step, step_type, client=None):
         _raise(step_type, _(
             'Agent on node %(node)s returned bad command result: '
             '%(result)s') % {'node': task.node.uuid, 'result': result})
-    return states.CLEANWAIT if step_type == 'clean' else states.DEPLOYWAIT
+    _validate_step_type(step_type)
+    step_to_state = {
+        'clean': states.CLEANWAIT,
+        'deploy': states.DEPLOYWAIT,
+        'service': states.SERVICEWAIT,
+    }
+    return step_to_state[step_type]
 
 
 def execute_clean_step(task, step):
@@ -952,6 +974,19 @@ class AgentBaseMixin(object):
         """
         return execute_step(task, step, 'clean')
 
+    @METRICS.timer('AgentBaseMixin.execute_service_step')
+    def execute_service_step(self, task, step):
+        """Execute a service step asynchronously on the agent.
+
+        :param task: a TaskManager object containing the node
+        :param step: a service step dictionary to execute
+        :raises: NodeServicingFailure if the agent does not return a command
+            status
+        :returns: states.SERVICEWAIT to signify the step will be completed
+            async
+        """
+        return execute_step(task, step, 'service')
+
     def _process_version_mismatch(self, task, step_type):
         node = task.node
         # For manual clean, the target provision state is MANAGEABLE, whereas
@@ -1029,7 +1064,7 @@ class AgentBaseMixin(object):
         set to True, this method will coordinate the reboot once the step is
         completed.
         """
-        assert step_type in ('clean', 'deploy', 'service')
+        _validate_step_type(step_type)
 
         node = task.node
         client = agent_client.get_client(task)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index e0ba42447a..f7a14fb7b3 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -2520,6 +2520,21 @@ class StepMethodsTestCase(db_base.DbTestCase):
                 task, self.clean_steps['deploy'][0], 'clean')
             self.assertEqual(states.CLEANWAIT, response)
 
+    @mock.patch('ironic.objects.Port.list_by_node_id',
+                spec_set=types.FunctionType)
+    @mock.patch.object(agent_client.AgentClient, 'execute_service_step',
+                       autospec=True)
+    def test_execute_service_step(self, client_mock, list_ports_mock):
+        client_mock.return_value = {
+            'command_status': 'SUCCEEDED'}
+        list_ports_mock.return_value = self.ports
+
+        with task_manager.acquire(
+                self.context, self.node.uuid, shared=False) as task:
+            response = agent_base.execute_step(
+                task, self.clean_steps['deploy'][0], 'service')
+            self.assertEqual(states.SERVICEWAIT, response)
+
     @mock.patch('ironic.objects.Port.list_by_node_id',
                 spec_set=types.FunctionType)
     @mock.patch.object(agent_client.AgentClient, 'execute_deploy_step',
diff --git a/releasenotes/notes/bug-2069430-cb58c9beaa7a35de.yaml b/releasenotes/notes/bug-2069430-cb58c9beaa7a35de.yaml
new file mode 100644
index 0000000000..115017779e
--- /dev/null
+++ b/releasenotes/notes/bug-2069430-cb58c9beaa7a35de.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    [`bug 2069430 <https://bugs.launchpad.net/ironic/+bug/2069430>`_]
+    Fixes an issue that prevented Ironic from being able to execute node
+    servicing steps exposed by IPA's HardwareManager