From ae79881bbc92fc8a8b1d8867ddd9bb7763d3813e Mon Sep 17 00:00:00 2001
From: Yuriy Zveryanskyy
Date: Mon, 24 Feb 2014 15:51:49 +0200
Subject: [PATCH] Add timeout for waiting for callback from deploy ramdisk

A 'deploy_callback_timeout' option is added to the conductor; if the
timeout is reached, the node is switched to an error state. A new
periodic task checks for these timeouts.

Related-Bug: #1270986
Change-Id: I3084d529baa13d4f848a7d050b8299582a28d7e9
---
 etc/ironic/ironic.conf.sample          |  8 +++
 ironic/conductor/manager.py            | 52 ++++++++++++++++++
 ironic/conductor/utils.py              | 27 ++++++++++
 ironic/tests/conductor/test_manager.py | 73 ++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)

diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample
index 1480e068f2..719a9010be 100644
--- a/etc/ironic/ironic.conf.sample
+++ b/etc/ironic/ironic.conf.sample
@@ -489,6 +489,14 @@
 # database, in seconds. (integer value)
 #sync_power_state_interval=60
 
+# Interval between checks of provision timeouts, in seconds.
+# (integer value)
+#check_provision_state_interval=60
+
+# Timeout (seconds) to wait for a callback from the deploy ramdisk.
+# Set to 0 to disable the timeout. (integer value)
+#deploy_callback_timeout=1800
+
 
 [database]
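For operators, the new behaviour is controlled entirely by the two options
above. A minimal ironic.conf sketch (the values shown are the defaults from
this patch, not tuning advice):

    [conductor]
    # Scan for timed-out deploys once a minute.
    check_provision_state_interval=60
    # Fail a deploy if the ramdisk has not called back within 30 minutes;
    # 0 disables the timeout.
    deploy_callback_timeout=1800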
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 9013cf02bd..f98df7db67 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -42,6 +42,8 @@ building or tearing down the TFTP environment for a node, notifying Neutron of
 a change, etc.
 """
 
+import datetime
+
 from eventlet import greenpool
 from oslo.config import cfg
 
@@ -59,6 +61,7 @@ from ironic.openstack.common import excutils
 from ironic.openstack.common import lockutils
 from ironic.openstack.common import log
 from ironic.openstack.common import periodic_task
+from ironic.openstack.common import timeutils
 
 MANAGER_TOPIC = 'ironic.conductor_manager'
 WORKER_SPAWN_lOCK = "conductor_worker_spawn"
@@ -82,6 +85,14 @@ conductor_opts = [
                    default=60,
                    help='Interval between syncing the node power state to the '
                         'database, in seconds.'),
+        cfg.IntOpt('check_provision_state_interval',
+                   default=60,
+                   help='Interval between checks of provision timeouts, '
+                        'in seconds.'),
+        cfg.IntOpt('deploy_callback_timeout',
+                   default=1800,
+                   help='Timeout (seconds) to wait for a callback from the '
+                        'deploy ramdisk. Set to 0 to disable the timeout.'),
 ]
 
 CONF = cfg.CONF
@@ -430,6 +441,47 @@ class ConductorManager(service.PeriodicService):
                              {'node': node_uuid})
                     continue
 
+    @periodic_task.periodic_task(
+            spacing=CONF.conductor.check_provision_state_interval)
+    def _check_deploy_timeouts(self, context):
+        if not CONF.conductor.deploy_callback_timeout:
+            return
+
+        filters = {'reserved': False, 'maintenance': False}
+        columns = ['uuid', 'driver', 'provision_state', 'provision_updated_at']
+        node_list = self.dbapi.get_nodeinfo_list(columns=columns,
+                                                 filters=filters)
+
+        for (node_uuid, driver, state, update_time) in node_list:
+            mapped_hosts = self.driver_rings[driver].get_hosts(node_uuid)
+            if self.host not in mapped_hosts:
+                continue
+
+            if state == states.DEPLOYWAIT:
+                limit = (timeutils.utcnow() - datetime.timedelta(
+                         seconds=CONF.conductor.deploy_callback_timeout))
+                if timeutils.normalize_time(update_time) <= limit:
+                    try:
+                        task = task_manager.TaskManager(context, node_uuid)
+                    except (exception.NodeLocked, exception.NodeNotFound):
+                        continue
+
+                    node = task.node
+                    node.provision_state = states.DEPLOYFAIL
+                    node.target_provision_state = states.NOSTATE
+                    msg = (_('Timeout reached while waiting for callback '
+                             'for node %s') % node_uuid)
+                    node.last_error = msg
+                    LOG.error(msg)
+                    node.save(task.context)
+
+                    try:
+                        thread = self._spawn_worker(
+                            utils.cleanup_after_timeout, task)
+                        thread.link(lambda t: task.release_resources())
+                    except exception.NoFreeConductorWorker:
+                        task.release_resources()
+
     def _get_current_driver_rings(self):
         """Build the current hash ring for this ConductorManager's drivers."""
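The heart of _check_deploy_timeouts is plain datetime arithmetic: a node in
DEPLOYWAIT whose provision_updated_at is older than the configured window is
failed. A self-contained sketch of just that comparison (the function and
constant names here are illustrative, not Ironic APIs):

    import datetime

    # Mirrors CONF.conductor.deploy_callback_timeout (seconds).
    DEPLOY_CALLBACK_TIMEOUT = 1800

    def deploy_timed_out(provision_updated_at, now=None):
        # Timed out when the last provision-state update is older than
        # the callback window.
        now = now or datetime.datetime.utcnow()
        limit = now - datetime.timedelta(seconds=DEPLOY_CALLBACK_TIMEOUT)
        return provision_updated_at <= limit

    # A node last updated 31 minutes ago exceeds the 30-minute window...
    stale = datetime.datetime.utcnow() - datetime.timedelta(minutes=31)
    assert deploy_timed_out(stale)
    # ...while one updated 10 minutes ago does not.
    fresh = datetime.datetime.utcnow() - datetime.timedelta(minutes=10)
    assert not deploy_timed_out(fresh)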
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 627adbdc33..fc15fc4d8a 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -13,6 +13,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+from ironic.common import exception
 from ironic.common import states
 from ironic.conductor import task_manager
 from ironic.openstack.common import excutils
@@ -95,3 +96,29 @@ def node_power_action(task, node, state):
     finally:
         node['target_power_state'] = states.NOSTATE
         node.save(context)
+
+
+@task_manager.require_exclusive_lock
+def cleanup_after_timeout(task):
+    """Cleanup deploy task after timeout.
+
+    :param task: a TaskManager instance.
+    """
+    node = task.node
+    context = task.context
+    error_msg = _('Cleanup failed for node %(node)s after deploy timeout: '
+                  '%(error)s')
+    try:
+        task.driver.deploy.clean_up(task, node)
+    except exception.IronicException as e:
+        msg = error_msg % {'node': node.uuid, 'error': e}
+        LOG.error(msg)
+        node.last_error = msg
+        node.save(context)
+    except Exception as e:
+        msg = error_msg % {'node': node.uuid, 'error': e}
+        LOG.error(msg)
+        node.last_error = _('Deploy timed out, but an unhandled exception was '
+                            'encountered while aborting. More info may be '
+                            'found in the log file.')
+        node.save(context)
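cleanup_after_timeout makes a deliberate distinction: an expected
IronicException from the driver is surfaced verbatim in the node's last_error,
while an unexpected exception is logged but replaced with a generic message
for the user. A sketch of the same pattern in isolation (KnownError and the
plain-dict node stand in for Ironic's types; LOG.exception is used here so
the traceback lands in the log):

    import logging

    LOG = logging.getLogger(__name__)

    class KnownError(Exception):
        """Stands in for exception.IronicException."""

    def record_cleanup_failure(node, clean_up):
        try:
            clean_up(node)
        except KnownError as e:
            # Expected failure: the driver's message is safe to surface.
            node['last_error'] = ('Cleanup failed for node %(node)s: '
                                  '%(error)s' % {'node': node['uuid'],
                                                 'error': e})
        except Exception:
            # Unexpected failure: full details go to the log only; the
            # user sees a generic message.
            LOG.exception('Unhandled cleanup error for node %s',
                          node['uuid'])
            node['last_error'] = ('Deploy timed out, but an unhandled '
                                  'exception occurred while aborting.')

    def failing_clean_up(node):
        raise KnownError('no image')

    node = {'uuid': '1be26c0b', 'last_error': None}
    record_cleanup_failure(node, failing_clean_up)
    assert 'no image' in node['last_error']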
diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py
index 64e47289c0..dcfd9f3104 100644
--- a/ironic/tests/conductor/test_manager.py
+++ b/ironic/tests/conductor/test_manager.py
@@ -19,6 +19,7 @@
 
 """Test class for Ironic ManagerService."""
 
+import datetime
 import time
 
 import mock
@@ -35,6 +36,7 @@ from ironic.conductor import utils as conductor_utils
 from ironic.db import api as dbapi
 from ironic import objects
 from ironic.openstack.common import context
+from ironic.openstack.common import timeutils
 from ironic.tests.conductor import utils as mgr_utils
 from ironic.tests.db import base
 from ironic.tests.db import utils
@@ -749,3 +751,74 @@ class ManagerTestCase(base.DbTestCase):
         # Verify reservation was released.
         node.refresh(self.context)
         self.assertIsNone(node.reservation)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_timeout(self, mock_utcnow):
+        self.config(deploy_callback_timeout=60, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            self.service._check_deploy_timeouts(self.context)
+            self.service._worker_pool.waitall()
+            node.refresh(self.context)
+            self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+            self.assertEqual(states.NOSTATE, node.target_provision_state)
+            self.assertIsNotNone(node.last_error)
+            clean_mock.assert_called_once_with(mock.ANY, mock.ANY)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_no_timeout(self, mock_utcnow):
+        self.config(deploy_callback_timeout=600, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            self.service._check_deploy_timeouts(self.context)
+            node.refresh(self.context)
+            self.assertEqual(states.DEPLOYWAIT, node.provision_state)
+            self.assertEqual(states.DEPLOYDONE, node.target_provision_state)
+            self.assertIsNone(node.last_error)
+            self.assertFalse(clean_mock.called)
+
+    def test__check_deploy_timeouts_disabled(self):
+        self.config(deploy_callback_timeout=0, group='conductor')
+        self.service.start()
+        with mock.patch.object(self.dbapi, 'get_nodeinfo_list') as get_mock:
+            self.service._check_deploy_timeouts(self.context)
+            self.assertFalse(get_mock.called)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_cleanup_failed(self, mock_utcnow):
+        self.config(deploy_callback_timeout=60, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            error = 'test-123'
+            clean_mock.side_effect = exception.IronicException(message=error)
+            self.service._check_deploy_timeouts(self.context)
+            self.service._worker_pool.waitall()
+            node.refresh(self.context)
+            self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+            self.assertEqual(states.NOSTATE, node.target_provision_state)
+            self.assertIn(error, node.last_error)
+            self.assertIsNone(node.reservation)
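A note on the test technique: rather than sleeping, the tests pin the clock by
patching timeutils.utcnow, returning `past` while the node is created and
`present` before the periodic task runs. The same idea reduced to its
essentials (age_in_seconds is an illustrative helper, not Ironic code; the
standalone mock library is the one the tests already import):

    import datetime

    import mock

    def age_in_seconds(last_update, utcnow=datetime.datetime.utcnow):
        """Seconds elapsed since last_update, with an injectable clock."""
        return (utcnow() - last_update).total_seconds()

    past = datetime.datetime(2000, 1, 1, 0, 0)
    present = past + datetime.timedelta(minutes=5)

    # Freeze "now" at five minutes past the node's last update; no sleeping.
    fake_clock = mock.Mock(return_value=present)
    assert age_in_seconds(past, utcnow=fake_clock) == 300.0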