From 0256e1e69f4dd38cf0a80532337fa30c89502cb4 Mon Sep 17 00:00:00 2001
From: Dmitry Galkin <galkindmitrii@gmail.com>
Date: Wed, 19 Sep 2018 14:02:48 +0000
Subject: [PATCH] Fix node exclusive lock not released on console
 start/restart.

This patch forces kill of console process with SIGKILL if it did not
terminate on SIGTERM within the CONF.console.kill_timeout and reads the
shellinabox subprocess stdout/stderr after CONF.console.subprocess_timeout
or if subprocess exited with non zero code.

Change-Id: I55a112d877d94f31d27487846ff59fe27f602f8b
Closes-Bug: 1587313
Story: 1587313
Task: 9654
---
 ironic/conf/console.py                        |  4 ++
 ironic/drivers/modules/console_utils.py       | 55 ++++++++++++-------
 .../drivers/modules/test_console_utils.py     | 30 ++++++++--
 ...llinabox-locking-fix-2fae2a451a8a489a.yaml |  8 +++
 4 files changed, 71 insertions(+), 26 deletions(-)
 create mode 100644 releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml

diff --git a/ironic/conf/console.py b/ironic/conf/console.py
index 2713511132..2c361c1816 100644
--- a/ironic/conf/console.py
+++ b/ironic/conf/console.py
@@ -45,6 +45,10 @@ opts = [
                default=10,
                help=_('Time (in seconds) to wait for the console subprocess '
                       'to start.')),
+    cfg.IntOpt('kill_timeout',
+               default=1,
+               help=_('Time (in seconds) to wait for the shellinabox console '
+                      'subprocess to exit before sending SIGKILL signal.')),
     cfg.IPOpt('socat_address',
               default='$my_ip',
               help=_('IP address of Socat service running on the host of '
diff --git a/ironic/drivers/modules/console_utils.py b/ironic/drivers/modules/console_utils.py
index f28b1d4e4f..84eae03e62 100644
--- a/ironic/drivers/modules/console_utils.py
+++ b/ironic/drivers/modules/console_utils.py
@@ -101,6 +101,23 @@ def _stop_console(node_uuid):
         console_pid = _get_console_pid(node_uuid)
 
         os.kill(console_pid, signal.SIGTERM)
+
+        # make sure the process gets killed hard if required
+        attempt = 0
+        max_attempts = CONF.console.kill_timeout // 0.2
+
+        while attempt < max_attempts:
+            if psutil.pid_exists(console_pid):
+                if attempt == max_attempts - 1:
+                    os.kill(console_pid, signal.SIGKILL)
+                LOG.debug("Waiting for the console process with PID %(pid)s "
+                          "to exit. Node: %(node)s.",
+                          {'pid': console_pid, 'node': node_uuid})
+                time.sleep(0.2)
+                attempt += 1
+            else:
+                break
+
     except OSError as exc:
         if exc.errno != errno.ESRCH:
             msg = (_("Could not stop the console for node '%(node)s'. "
@@ -198,28 +215,24 @@ def start_shellinabox_console(node_uuid, port, console_cmd):
         # check if the console pid is created and the process is running.
         # if it is, then the shellinaboxd is invoked successfully as a daemon.
         # otherwise check the error.
-        if locals['returncode'] is not None:
-            if (locals['returncode'] == 0 and os.path.exists(pid_file)
-                and psutil.pid_exists(_get_console_pid(node_uuid))):
-                raise loopingcall.LoopingCallDone()
-            else:
-                (stdout, stderr) = popen_obj.communicate()
-                locals['errstr'] = _(
-                    "Command: %(command)s.\n"
-                    "Exit code: %(return_code)s.\n"
-                    "Stdout: %(stdout)r\n"
-                    "Stderr: %(stderr)r") % {
-                        'command': ' '.join(args),
-                        'return_code': locals['returncode'],
-                        'stdout': stdout,
-                        'stderr': stderr}
-                LOG.warning(locals['errstr'])
-                raise loopingcall.LoopingCallDone()
+        if (locals['returncode'] == 0 and os.path.exists(pid_file)
+            and psutil.pid_exists(_get_console_pid(node_uuid))):
+            raise loopingcall.LoopingCallDone()
 
-        if (time.time() > expiration):
-            locals['errstr'] = (_("Timeout while waiting for console "
-                                  "subprocess to start for node %s.") %
-                                node_uuid)
+        if (time.time() > expiration or locals['returncode'] is not None):
+            (stdout, stderr) = popen_obj.communicate()
+            locals['errstr'] = _(
+                "Timeout or error while waiting for console "
+                "subprocess to start for node: %(node)s.\n"
+                "Command: %(command)s.\n"
+                "Exit code: %(return_code)s.\n"
+                "Stdout: %(stdout)r\n"
+                "Stderr: %(stderr)r") % {
+                    'node': node_uuid,
+                    'command': ' '.join(args),
+                    'return_code': locals['returncode'],
+                    'stdout': stdout,
+                    'stderr': stderr}
             LOG.warning(locals['errstr'])
             raise loopingcall.LoopingCallDone()
 
diff --git a/ironic/tests/unit/drivers/modules/test_console_utils.py b/ironic/tests/unit/drivers/modules/test_console_utils.py
index 8f998bb3d9..6355ad0494 100644
--- a/ironic/tests/unit/drivers/modules/test_console_utils.py
+++ b/ironic/tests/unit/drivers/modules/test_console_utils.py
@@ -145,8 +145,28 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
         console_utils._stop_console(self.info['uuid'])
 
         mock_pid.assert_called_once_with(self.info['uuid'])
-        mock_kill.assert_called_once_with(mock_pid.return_value,
-                                          signal.SIGTERM)
+
+        # a check if process still exist (signal 0) in a loop
+        mock_kill.assert_any_call(mock_pid.return_value, signal.SIG_DFL)
+        # and that it receives the SIGTERM
+        mock_kill.assert_any_call(mock_pid.return_value, signal.SIGTERM)
+        mock_unlink.assert_called_once_with(pid_file)
+
+    @mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
+    @mock.patch.object(os, 'kill', autospec=True)
+    @mock.patch.object(psutil, 'pid_exists', autospec=True, return_value=True)
+    @mock.patch.object(console_utils, '_get_console_pid', autospec=True)
+    def test__stop_console_forced_kill(self, mock_pid, mock_psutil, mock_kill,
+                                       mock_unlink):
+        pid_file = console_utils._get_console_pid_file(self.info['uuid'])
+        mock_pid.return_value = 12345
+
+        console_utils._stop_console(self.info['uuid'])
+
+        mock_pid.assert_called_once_with(self.info['uuid'])
+
+        # Make sure console process receives hard SIGKILL
+        mock_kill.assert_any_call(mock_pid.return_value, signal.SIGKILL)
         mock_unlink.assert_called_once_with(pid_file)
 
     @mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
@@ -322,7 +342,7 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
         mock_popen.assert_called_once_with(mock.ANY,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)
-        mock_popen.return_value.poll.assert_called_once_with()
+        mock_popen.return_value.poll.assert_called_with()
 
     @mock.patch.object(console_utils, 'open',
                        mock.mock_open(read_data='12345\n'))
@@ -350,11 +370,11 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
 
         mock_stop.assert_called_once_with(self.info['uuid'])
         mock_dir_exists.assert_called_once_with()
-        mock_pid_exists.assert_called_once_with(12345)
+        mock_pid_exists.assert_called_with(12345)
         mock_popen.assert_called_once_with(mock.ANY,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)
-        mock_popen.return_value.poll.assert_called_once_with()
+        mock_popen.return_value.poll.assert_called_with()
 
     @mock.patch.object(subprocess, 'Popen', autospec=True)
     @mock.patch.object(console_utils, '_ensure_console_pid_dir_exists',
diff --git a/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml b/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml
new file mode 100644
index 0000000000..12ee688b6a
--- /dev/null
+++ b/releasenotes/notes/shellinabox-locking-fix-2fae2a451a8a489a.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes a locking issue where ``ipmitool-shellinabox`` console
+    interface users may encounter a situation where the bare metal node is
+    locked until the conductor is restarted.
+    See story `1587313 <https://storyboard.openstack.org/#!/story/1587313>`_
+    for additional information.