From 242775ae184582ce51a006e5797e7017133617f3 Mon Sep 17 00:00:00 2001
From: Julia Kreger <juliaashleykreger@gmail.com>
Date: Thu, 26 Mar 2020 09:14:30 -0700
Subject: [PATCH] Retry agent get_command_status upon failures

The agent command status code lacks any retry mechanism
which meant if any intermittent failure such as a dropped
packet or an overloaded firewall could potentially begin
to cause the entire deployment or cleaning process to derail
and fail.

This fix addes logic to ensure we retry upon such failures.

Worth noting, the exact same logic has been used elsewhere
in the agent client code for the exact same problem when
issuing commands.

Change-Id: I4f6581b7fb895ed2b1d505b9947e363665551b57
Story: 2007470
Task: 39158
---
 ironic/drivers/modules/agent_client.py            | 15 ++++++++++++++-
 .../unit/drivers/modules/test_agent_client.py     | 11 +++++++++++
 ...ent-command-status-retry-f9b6f53a823c6b01.yaml |  6 ++++++
 3 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml

diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index 32427c6e71..fed6846707 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -137,6 +137,10 @@ class AgentClient(object):
         return result
 
     @METRICS.timer('AgentClient.get_commands_status')
+    @retrying.retry(
+        retry_on_exception=(
+            lambda e: isinstance(e, exception.AgentConnectionFailed)),
+        stop_max_attempt_number=CONF.agent.max_command_attempts)
     def get_commands_status(self, node):
         """Get command status from agent.
 
@@ -166,7 +170,16 @@ class AgentClient(object):
         """
         url = self._get_command_url(node)
         LOG.debug('Fetching status of agent commands for node %s', node.uuid)
-        resp = self.session.get(url, timeout=CONF.agent.command_timeout)
+        try:
+            resp = self.session.get(url, timeout=CONF.agent.command_timeout)
+        except (requests.ConnectionError, requests.Timeout) as e:
+            msg = (_('Failed to connect to the agent running on node %(node)s '
+                     'to collect commands status. '
+                     'Error: %(error)s') %
+                   {'node': node.uuid, 'error': e})
+            LOG.error(msg)
+            raise exception.AgentConnectionFailed(reason=msg)
+
         result = resp.json()['commands']
         status = '; '.join('%(cmd)s: result "%(res)s", error "%(err)s"' %
                            {'cmd': r.get('command_name'),
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index a39477f02b..78812b5a89 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -191,6 +191,17 @@ class TestAgentClient(base.TestCase):
                     'api_version': CONF.agent.agent_api_version},
                 timeout=CONF.agent.command_timeout)
 
+    def test_get_commands_status_retries(self):
+        with mock.patch.object(self.client.session, 'get',
+                               autospec=True) as mock_get:
+            res = mock.MagicMock(spec_set=['json'])
+            res.json.return_value = {'commands': []}
+            mock_get.side_effect = [
+                requests.ConnectionError('boom'),
+                res]
+            self.assertEqual([], self.client.get_commands_status(self.node))
+            self.assertEqual(2, mock_get.call_count)
+
     def test_prepare_image(self):
         self.client._command = mock.MagicMock(spec_set=[])
         image_info = {'image_id': 'image'}
diff --git a/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml
new file mode 100644
index 0000000000..df49202525
--- /dev/null
+++ b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixes an issue with the agent client code where checks of the agent
+    command status had no logic to prevent an intermittent or transient
+    connection failure from causing the entire operation to fail.