From fcb6a1096f48336f19a5e8aa80445f57d4480c08 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Wed, 23 Jun 2021 17:11:16 +0200
Subject: [PATCH] Cache AgentClient on Task, not globally

In order to avoid potential cache coherency issues
when using a globally cached AgentClient, e.g. with
TSL certificates from the IPA, cache the AgentClient
on a per task basis.

Co-Authored-By: Arne Wiebalck <arne.wiebalck@cern.ch>

Story: #2009004
Task: #42678

Change-Id: I0c458c8d9ae673181beb6d85c2ee68235ccef239
---
 ironic/drivers/modules/agent.py               |  7 +++--
 ironic/drivers/modules/agent_base.py          | 31 +++++++++----------
 ironic/drivers/modules/agent_client.py        |  9 ++++++
 ironic/drivers/modules/ansible/deploy.py      |  6 +---
 .../tests/unit/drivers/modules/test_agent.py  |  6 ++--
 .../unit/drivers/modules/test_agent_base.py   |  3 +-
 ...agentclient-per-task-ec2231684e6876d9.yaml |  5 +++
 7 files changed, 40 insertions(+), 27 deletions(-)
 create mode 100644 releasenotes/notes/cache-agentclient-per-task-ec2231684e6876d9.yaml

diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py
index 3007689692..c38b0cba68 100644
--- a/ironic/drivers/modules/agent.py
+++ b/ironic/drivers/modules/agent.py
@@ -32,6 +32,7 @@ from ironic.conductor import utils as manager_utils
 from ironic.conf import CONF
 from ironic.drivers import base
 from ironic.drivers.modules import agent_base
+from ironic.drivers.modules import agent_client
 from ironic.drivers.modules import boot_mode_utils
 from ironic.drivers.modules import deploy_utils
 
@@ -572,8 +573,9 @@ class AgentDeploy(CustomAgentDeploy):
                     'step': 'write_image',
                     'args': {'image_info': image_info,
                              'configdrive': configdrive}}
+        client = agent_client.get_client(task)
         return agent_base.execute_step(task, new_step, 'deploy',
-                                       client=self._client)
+                                       client=client)
 
     @METRICS.timer('AgentDeployMixin.prepare_instance_boot')
     @base.deploy_step(priority=60)
@@ -602,7 +604,8 @@ class AgentDeploy(CustomAgentDeploy):
         # ppc64* hardware we need to provide the 'PReP_Boot_partition_uuid' to
         # direct where the bootloader should be installed.
         driver_internal_info = task.node.driver_internal_info
-        partition_uuids = self._client.get_partition_uuids(node).get(
+        client = agent_client.get_client(task)
+        partition_uuids = client.get_partition_uuids(node).get(
             'command_result') or {}
         root_uuid = partition_uuids.get('root uuid')
 
diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index 2db1ae82c1..1263b2df35 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -99,11 +99,6 @@ _FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT,
 FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED)
 
 
-def _get_client():
-    client = agent_client.AgentClient()
-    return client
-
-
 @METRICS.timer('post_clean_step_hook')
 def post_clean_step_hook(interface, step):
     """Decorator method for adding a post clean step hook.
@@ -372,7 +367,7 @@ def execute_step(task, step, step_type, client=None):
         completed async
     """
     if client is None:
-        client = _get_client()
+        client = agent_client.get_client(task)
     ports = objects.Port.list_by_node_id(
         task.context, task.node.id)
     call = getattr(client, 'execute_%s_step' % step_type)
@@ -401,8 +396,7 @@ def _step_failure_handler(task, msg, step_type, traceback=False):
 class HeartbeatMixin(object):
     """Mixin class implementing heartbeat processing."""
 
-    def __init__(self):
-        self._client = _get_client()
+    collect_deploy_logs = True
 
     def reboot_to_instance(self, task):
         """Method invoked after the deployment is completed.
@@ -498,7 +492,7 @@ class HeartbeatMixin(object):
             # Do not call the error handler is the node is already DEPLOYFAIL
             if node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                 deploy_utils.set_failed_state(
-                    task, last_error, collect_logs=bool(self._client))
+                    task, last_error, collect_logs=self.collect_deploy_logs)
 
     def _heartbeat_clean_wait(self, task):
         node = task.node
@@ -621,7 +615,8 @@ class HeartbeatMixin(object):
         """
         node = task.node
         try:
-            result = self._client.finalize_rescue(node)
+            client = agent_client.get_client(task)
+            result = client.finalize_rescue(node)
         except exception.IronicException as e:
             raise exception.InstanceRescueFailure(node=node.uuid,
                                                   instance=node.instance_uuid,
@@ -853,7 +848,8 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
                   {'node': node.uuid, 'type': step_type,
                    'steps': previous_steps})
 
-        call = getattr(self._client, 'get_%s_steps' % step_type)
+        client = agent_client.get_client(task)
+        call = getattr(client, 'get_%s_steps' % step_type)
         try:
             agent_result = call(node, task.ports).get('command_result', {})
         except exception.AgentInProgress as exc:
@@ -1039,7 +1035,8 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
         assert step_type in ('clean', 'deploy')
 
         node = task.node
-        agent_commands = self._client.get_commands_status(task.node)
+        client = agent_client.get_client(task)
+        agent_commands = client.get_commands_status(task.node)
 
         if _freshly_booted(agent_commands, step_type):
             field = ('cleaning_reboot' if step_type == 'clean'
@@ -1148,16 +1145,17 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
         can_power_on = (states.POWER_ON in
                         task.driver.power.get_supported_power_states(task))
 
+        client = agent_client.get_client(task)
         try:
             if not can_power_on:
                 LOG.info('Power interface of node %(node)s does not support '
                          'power on, using reboot to switch to the instance',
                          node.uuid)
-                self._client.sync(node)
+                client.sync(node)
                 manager_utils.node_power_action(task, states.REBOOT)
             elif not oob_power_off:
                 try:
-                    self._client.power_off(node)
+                    client.power_off(node)
                 except Exception as e:
                     LOG.warning('Failed to soft power off node %(node_uuid)s. '
                                 '%(cls)s: %(error)s',
@@ -1180,7 +1178,7 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
                     manager_utils.node_power_action(task, states.POWER_OFF)
             else:
                 # Flush the file system prior to hard rebooting the node
-                result = self._client.sync(node)
+                result = client.sync(node)
                 error = result.get('faultstring')
                 if error:
                     if 'Unknown command' in error:
@@ -1330,7 +1328,8 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
                       'partition %(part)s, EFI system partition %(efi)s',
                       {'node': node.uuid, 'part': root_uuid,
                        'efi': efi_system_part_uuid})
-            result = self._client.install_bootloader(
+            client = agent_client.get_client(task)
+            result = client.install_bootloader(
                 node, root_uuid=root_uuid,
                 efi_system_part_uuid=efi_system_part_uuid,
                 prep_boot_part_uuid=prep_boot_part_uuid,
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index 134d997143..b24d1812c4 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -36,6 +36,15 @@ DEFAULT_IPA_PORTAL_PORT = 3260
 REBOOT_COMMAND = 'run_image'
 
 
+def get_client(task):
+    """Get client for this node."""
+    try:
+        return task.cached_agent_client
+    except AttributeError:
+        task.cached_agent_client = AgentClient()
+        return task.cached_agent_client
+
+
 def get_command_error(command):
     """Extract an error string from the command result.
 
diff --git a/ironic/drivers/modules/ansible/deploy.py b/ironic/drivers/modules/ansible/deploy.py
index 218d046b2e..8e7b1a3a61 100644
--- a/ironic/drivers/modules/ansible/deploy.py
+++ b/ironic/drivers/modules/ansible/deploy.py
@@ -381,11 +381,7 @@ class AnsibleDeploy(agent_base.HeartbeatMixin,
                     base.DeployInterface):
     """Interface for deploy-related actions."""
 
-    def __init__(self):
-        super(AnsibleDeploy, self).__init__()
-        # NOTE(pas-ha) overriding agent creation as we won't be
-        # communicating with it, only processing heartbeats
-        self._client = None
+    collect_deploy_logs = False
 
     def get_properties(self):
         """Return the properties of the interface."""
diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py
index 2fad169aac..8cc3ef8093 100644
--- a/ironic/tests/unit/drivers/modules/test_agent.py
+++ b/ironic/tests/unit/drivers/modules/test_agent.py
@@ -1380,7 +1380,7 @@ class TestAgentDeploy(CommonTestsMixin, db_base.DbTestCase):
 
         with task_manager.acquire(self.context, self.node.uuid,
                                   shared=False) as task:
-            task.driver.deploy._client = client_mock
+            task.cached_agent_client = client_mock
             task.driver.deploy.write_image(task)
 
             step['args'] = {'image_info': expected_image_info,
@@ -1470,7 +1470,7 @@ class TestAgentDeploy(CommonTestsMixin, db_base.DbTestCase):
 
         with task_manager.acquire(self.context, self.node.uuid,
                                   shared=False) as task:
-            task.driver.deploy._client = client_mock
+            task.cached_agent_client = client_mock
             task.driver.deploy.write_image(task)
 
             step = {'step': 'write_image', 'interface': 'deploy',
@@ -1535,7 +1535,7 @@ class TestAgentDeploy(CommonTestsMixin, db_base.DbTestCase):
 
         with task_manager.acquire(self.context, self.node.uuid,
                                   shared=False) as task:
-            task.driver.deploy._client = client_mock
+            task.cached_agent_client = client_mock
             task.driver.deploy.write_image(task)
 
             step = {'step': 'write_image', 'interface': 'deploy',
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index 937312b874..23d45feb1e 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -828,7 +828,8 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
             self.assertEqual(states.ACTIVE, task.node.target_provision_state)
             collect_mock.assert_called_once_with(task.node)
             self.assertFalse(power_on_node_if_needed_mock.called)
-            sync_mock.assert_called_once_with(self.deploy._client, task.node)
+            sync_mock.assert_called_once_with(agent_client.get_client(task),
+                                              task.node)
 
     @mock.patch.object(manager_utils, 'restore_power_state_if_needed',
                        autospec=True)
diff --git a/releasenotes/notes/cache-agentclient-per-task-ec2231684e6876d9.yaml b/releasenotes/notes/cache-agentclient-per-task-ec2231684e6876d9.yaml
new file mode 100644
index 0000000000..d6822b7347
--- /dev/null
+++ b/releasenotes/notes/cache-agentclient-per-task-ec2231684e6876d9.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes potential cache coherency issues by caching the AgentClient
+    per task, rather than globally.