Fix fast-track with the direct deploy interface
Several small fixes:

1) Make sure the deploy.deploy step returns DEPLOYWAIT after running prepare_image; otherwise the conductor wrongly assumes that the deploy is done.
2) Handle the case where provision_state == DEPLOYWAIT when returning from an asynchronous deploy step.
3) Do not assume that prepare_image is always the last command to run; sometimes get_deploy_steps sneaks in.
4) Do not issue a deprecation warning when receiving "agent is busy" from get_deploy_steps; this is normal for fast-track.

Change-Id: I19274c48bd36fca19961a7d78467ec8c29f85905
parent df439697ae
commit f0803493de
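For illustration, here is a minimal standalone sketch of the lookup described in point 3: rather than assuming the last agent command is prepare_image, search backwards for the most recent prepare_image entry and consider the deploy done once it is no longer RUNNING. The sample command payloads are hypothetical; the real logic lives in deploy_is_done in the second hunk below.

    # Minimal sketch (not ironic's actual module) of the lookup from point 3:
    # find the most recent prepare_image command instead of assuming it is last.
    # The command dicts below are hypothetical examples of agent command results.

    def deploy_is_done(commands):
        try:
            last_prepare = next(cmd for cmd in reversed(commands)
                                if cmd['command_name'] == 'prepare_image')
        except StopIteration:
            # prepare_image has not been issued yet, so the deploy has not started
            return False
        return last_prepare['command_status'] != 'RUNNING'


    commands = [
        {'command_name': 'prepare_image', 'command_status': 'SUCCESS'},
        {'command_name': 'get_deploy_steps', 'command_status': 'SUCCESS'},
    ]
    print(deploy_is_done(commands))  # True, even though get_deploy_steps ran last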
@@ -291,7 +291,8 @@ def do_next_deploy_step(task, step_index, conductor_id):
             LOG.info('Deploy step %(step)s on node %(node)s being '
                      'executed asynchronously, waiting for driver.',
                      {'node': node.uuid, 'step': step})
-            task.process_event('wait')
+            if task.node.provision_state != states.DEPLOYWAIT:
+                task.process_event('wait')
             return
         elif result is not None:
             # NOTE(rloo): This is an internal/dev error; shouldn't happen.
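To make points 1 and 2 concrete, below is a hedged, self-contained sketch of the conductor-side contract shown in the hunk above; DEPLOYWAIT, NodeStub and TaskStub are illustrative stand-ins for ironic's states module, node object and task manager, not the real API. An asynchronous step signals DEPLOYWAIT, and the conductor fires the 'wait' event only if the step has not already moved the node there.

    # Hedged sketch of the conductor-side handling shown in the hunk above.
    # DEPLOYWAIT, NodeStub and TaskStub are illustrative stand-ins, not ironic APIs.
    DEPLOYWAIT = 'deploy wait'


    class NodeStub:
        def __init__(self, provision_state):
            self.provision_state = provision_state


    class TaskStub:
        def __init__(self, node):
            self.node = node
            self.events = []

        def process_event(self, event):
            # Record the event; a real state machine would reject a duplicate
            # 'wait' once the node is already in DEPLOYWAIT.
            self.events.append(event)
            if event == 'wait':
                self.node.provision_state = DEPLOYWAIT


    def handle_step_result(task, result):
        if result == DEPLOYWAIT:
            # Fast-track steps may have already put the node into DEPLOYWAIT
            # themselves; only fire 'wait' when that has not happened yet.
            if task.node.provision_state != DEPLOYWAIT:
                task.process_event('wait')
            return True  # stop the step loop and wait for the agent callback
        return False


    # A fast-track step already left the node in DEPLOYWAIT: no duplicate event.
    task = TaskStub(NodeStub(provision_state=DEPLOYWAIT))
    handle_step_result(task, DEPLOYWAIT)
    print(task.events)  # []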
@@ -186,17 +186,13 @@ class AgentDeployMixin(agent_base.AgentDeployMixin):
         if not commands:
             return False
 
-        last_command = commands[-1]
-
-        if last_command['command_name'] != 'prepare_image':
-            # catches race condition where prepare_image is still processing
-            # so deploy hasn't started yet
-            return False
-
-        if last_command['command_status'] != 'RUNNING':
-            return True
-
-        return False
+        try:
+            last_command = next(cmd for cmd in reversed(commands)
+                                if cmd['command_name'] == 'prepare_image')
+        except StopIteration:
+            return False
+        else:
+            return last_command['command_status'] != 'RUNNING'
 
     @METRICS.timer('AgentDeployMixin.continue_deploy')
     @task_manager.require_exclusive_lock
@@ -487,6 +483,7 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
             # the state machine state going from DEPLOYWAIT -> DEPLOYING
             task.process_event('wait')
             self.continue_deploy(task)
+            return states.DEPLOYWAIT
         elif task.driver.storage.should_write_image(task):
             # Check if the driver has already performed a reboot in a previous
             # deploy step.
@@ -722,10 +722,15 @@ class AgentDeployMixin(HeartbeatMixin):
                        'steps': previous_steps})
 
         call = getattr(self._client, 'get_%s_steps' % step_type)
-        # TODO(dtantsur): remove the error handling in the V release.
         try:
             agent_result = call(node, task.ports).get('command_result', {})
         except exception.AgentAPIError as exc:
+            if 'agent is busy' in str(exc):
+                LOG.debug('Agent is busy with a command, will refresh steps '
+                          'on the next heartbeat')
+                return
+
+            # TODO(dtantsur): change to just 'raise'
             if step_type == 'clean':
                 raise
             else:
@@ -411,6 +411,39 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
         mock_execute.assert_called_once_with(mock.ANY, task,
                                              self.deploy_steps[0])
 
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
+                autospec=True)
+    def test__do_next_deploy_step_in_deploywait(self, mock_execute):
+        driver_internal_info = {'deploy_step_index': None,
+                                'deploy_steps': self.deploy_steps}
+        self._start_service()
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            driver_internal_info=driver_internal_info,
+            deploy_step={})
+
+        def fake_execute(interface, task, step):
+            # A deploy step leaves the node in DEPLOYWAIT
+            task.process_event('wait')
+            return states.DEPLOYWAIT
+
+        mock_execute.side_effect = fake_execute
+        expected_first_step = node.driver_internal_info['deploy_steps'][0]
+        task = task_manager.TaskManager(self.context, node.uuid)
+        task.process_event('deploy')
+
+        deployments.do_next_deploy_step(task, 0, self.service.conductor.id)
+
+        node.refresh()
+        self.assertIsNone(node.last_error)
+        self.assertEqual(states.DEPLOYWAIT, node.provision_state)
+        self.assertEqual(states.ACTIVE, node.target_provision_state)
+        self.assertEqual(expected_first_step, node.deploy_step)
+        self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
+        self.assertEqual(self.service.conductor.id, node.conductor_affinity)
+        mock_execute.assert_called_once_with(mock.ANY, task,
+                                             self.deploy_steps[0])
+
     @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
                 autospec=True)
     def test__do_next_deploy_step_continue_from_last_step(self, mock_execute):
@@ -492,7 +492,7 @@ class TestAgentDeploy(db_base.DbTestCase):
         self.node.save()
         with task_manager.acquire(
                 self.context, self.node['uuid'], shared=False) as task:
-            self.driver.deploy(task)
+            self.assertEqual(states.DEPLOYWAIT, self.driver.deploy(task))
             self.assertFalse(power_mock.called)
             self.assertFalse(mock_pxe_instance.called)
             task.node.refresh()
@@ -1739,6 +1739,27 @@ class TestAgentDeploy(db_base.DbTestCase):
                                           'command_status': 'RUNNING'}]
             self.assertFalse(task.driver.deploy.deploy_is_done(task))
 
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_deploy_is_done_several_results(self, mock_get_cmd):
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            mock_get_cmd.return_value = [
+                {'command_name': 'prepare_image', 'command_status': 'SUCCESS'},
+                {'command_name': 'other_command', 'command_status': 'SUCCESS'},
+                {'command_name': 'prepare_image', 'command_status': 'RUNNING'},
+            ]
+            self.assertFalse(task.driver.deploy.deploy_is_done(task))
+
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_deploy_is_done_not_the_last(self, mock_get_cmd):
+        with task_manager.acquire(self.context, self.node.uuid) as task:
+            mock_get_cmd.return_value = [
+                {'command_name': 'prepare_image', 'command_status': 'SUCCESS'},
+                {'command_name': 'other_command', 'command_status': 'SUCCESS'},
+            ]
+            self.assertTrue(task.driver.deploy.deploy_is_done(task))
+
     @mock.patch.object(manager_utils, 'restore_power_state_if_needed',
                        autospec=True)
     @mock.patch.object(manager_utils, 'power_on_node_if_needed',
@@ -2235,6 +2235,25 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
             self.assertEqual([self.clean_steps['clean_steps'][
                 'SpecificHardwareManager'][1]], steps['raid'])
 
+    @mock.patch.object(agent_base.LOG, 'warning', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_deploy_steps',
+                       autospec=True)
+    def test_refresh_steps_busy(self, client_mock, log_mock):
+        client_mock.side_effect = exception.AgentAPIError(
+            node="node", status="500", error='agent is busy')
+
+        with task_manager.acquire(
+                self.context, self.node.uuid, shared=False) as task:
+            self.deploy.refresh_steps(task, 'deploy')
+
+            client_mock.assert_called_once_with(mock.ANY, task.node,
+                                                task.ports)
+            self.assertNotIn('agent_cached_deploy_steps_refreshed',
+                             task.node.driver_internal_info)
+            self.assertIsNone(task.node.driver_internal_info.get(
+                'agent_cached_deploy_steps'))
+            self.assertFalse(log_mock.called)
+
     @mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
                        autospec=True)
     def test_refresh_steps_missing_steps(self, client_mock):
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes fast-track deployments with the ``direct`` deploy interface that
+    used to hang previously.