Merge "Add timeout for waiting callback from deploy ramdisk"

This commit is contained in:
Jenkins 2014-02-28 11:42:21 +00:00 committed by Gerrit Code Review
commit 5638531a91
4 changed files with 160 additions and 0 deletions

View File

@ -489,6 +489,14 @@
# database, in seconds. (integer value) # database, in seconds. (integer value)
#sync_power_state_interval=60 #sync_power_state_interval=60
# Interval between checks of provision timeouts, in seconds.
# (integer value)
#check_provision_state_interval=60
# Timeout (seconds) for waiting callback from deploy ramdisk.
# 0 - unlimited. (integer value)
#deploy_callback_timeout=1800
[database] [database]

View File

@ -42,6 +42,8 @@ building or tearing down the TFTP environment for a node, notifying Neutron of
a change, etc. a change, etc.
""" """
import datetime
from eventlet import greenpool from eventlet import greenpool
from oslo.config import cfg from oslo.config import cfg
@ -59,6 +61,7 @@ from ironic.openstack.common import excutils
from ironic.openstack.common import lockutils from ironic.openstack.common import lockutils
from ironic.openstack.common import log from ironic.openstack.common import log
from ironic.openstack.common import periodic_task from ironic.openstack.common import periodic_task
from ironic.openstack.common import timeutils
MANAGER_TOPIC = 'ironic.conductor_manager' MANAGER_TOPIC = 'ironic.conductor_manager'
WORKER_SPAWN_lOCK = "conductor_worker_spawn" WORKER_SPAWN_lOCK = "conductor_worker_spawn"
@ -82,6 +85,14 @@ conductor_opts = [
default=60, default=60,
help='Interval between syncing the node power state to the ' help='Interval between syncing the node power state to the '
'database, in seconds.'), 'database, in seconds.'),
cfg.IntOpt('check_provision_state_interval',
default=60,
help='Interval between checks of provision timeouts, '
'in seconds.'),
cfg.IntOpt('deploy_callback_timeout',
default=1800,
help='Timeout (seconds) for waiting callback from deploy '
'ramdisk. 0 - unlimited.'),
] ]
CONF = cfg.CONF CONF = cfg.CONF
@ -421,6 +432,47 @@ class ConductorManager(service.PeriodicService):
{'node': node_uuid}) {'node': node_uuid})
continue continue
@periodic_task.periodic_task(
    spacing=CONF.conductor.check_provision_state_interval)
def _check_deploy_timeouts(self, context):
    """Fail nodes that waited too long for the deploy ramdisk callback.

    Runs periodically, every ``check_provision_state_interval`` seconds.
    Every unreserved, non-maintenance node mapped to this conductor that
    is in DEPLOYWAIT and whose ``provision_updated_at`` is at least
    ``deploy_callback_timeout`` seconds old is moved to DEPLOYFAIL, and
    ``utils.cleanup_after_timeout`` is spawned for it in a worker.

    Does nothing when ``deploy_callback_timeout`` is 0 (unlimited).

    :param context: request context passed to TaskManager.
    """
    callback_timeout = CONF.conductor.deploy_callback_timeout
    if not callback_timeout:
        return

    max_wait = datetime.timedelta(seconds=callback_timeout)
    filters = {'reserved': False, 'maintenance': False}
    columns = ['uuid', 'driver', 'provision_state', 'provision_updated_at']
    node_list = self.dbapi.get_nodeinfo_list(columns=columns,
                                             filters=filters)

    for (node_uuid, driver, state, update_time) in node_list:
        # Only handle nodes that the hash ring maps to this conductor.
        mapped_hosts = self.driver_rings[driver].get_hosts(node_uuid)
        if self.host not in mapped_hosts:
            continue

        if state != states.DEPLOYWAIT:
            continue

        limit = timeutils.utcnow() - max_wait
        if timeutils.normalize_time(update_time) > limit:
            continue

        try:
            task = task_manager.TaskManager(context, node_uuid)
        except (exception.NodeLocked, exception.NodeNotFound):
            continue

        node = task.node
        # The state above came from a snapshot taken before the lock was
        # acquired; the callback may have arrived in the meantime. Do not
        # fail a node that is no longer waiting.
        if node.provision_state != states.DEPLOYWAIT:
            task.release_resources()
            continue

        node.provision_state = states.DEPLOYFAIL
        node.target_provision_state = states.NOSTATE
        msg = (_('Timeout reached when waiting callback for '
                 'node %s') % node_uuid)
        node.last_error = msg
        LOG.error(msg)
        node.save(task.context)

        try:
            thread = self._spawn_worker(
                utils.cleanup_after_timeout, task)
            # Bind the current task as a default argument: a plain closure
            # would look up ``task`` when the worker finishes, by which
            # time the loop may have rebound it to another node's task.
            thread.link(lambda t, task=task: task.release_resources())
        except exception.NoFreeConductorWorker:
            task.release_resources()
def _get_current_driver_rings(self): def _get_current_driver_rings(self):
"""Build the current hash ring for this ConductorManager's drivers.""" """Build the current hash ring for this ConductorManager's drivers."""

View File

@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
from ironic.common import exception
from ironic.common import states from ironic.common import states
from ironic.conductor import task_manager from ironic.conductor import task_manager
from ironic.openstack.common import excutils from ironic.openstack.common import excutils
@ -95,3 +96,29 @@ def node_power_action(task, node, state):
finally: finally:
node['target_power_state'] = states.NOSTATE node['target_power_state'] = states.NOSTATE
node.save(context) node.save(context)
@task_manager.require_exclusive_lock
def cleanup_after_timeout(task):
    """Cleanup deploy task after timeout.

    Calls the deploy driver's ``clean_up`` for the task's node. Exceptions
    raised by the driver are not propagated: they are logged and recorded
    in the node's ``last_error``, which is then saved.

    :param task: a TaskManager instance.
    """
    node = task.node
    context = task.context
    error_msg = _('Cleanup failed for node %(node)s after deploy timeout: '
                  ' %(error)s')
    try:
        task.driver.deploy.clean_up(task, node)
    except exception.IronicException as e:
        # Known Ironic error: its message is exposed to the user as-is.
        msg = error_msg % {'node': node.uuid, 'error': e}
        LOG.error(msg)
        node.last_error = msg
        node.save(context)
    except Exception as e:
        # Unexpected error: keep the user-facing text generic, and log the
        # traceback so the log file really does hold the extra information
        # that last_error points the operator to.
        msg = error_msg % {'node': node.uuid, 'error': e}
        LOG.exception(msg)
        node.last_error = _('Deploy timed out, but an unhandled exception was '
                            'encountered while aborting. More info may be '
                            'found in the log file.')
        node.save(context)

View File

@ -19,6 +19,7 @@
"""Test class for Ironic ManagerService.""" """Test class for Ironic ManagerService."""
import datetime
import time import time
import mock import mock
@ -35,6 +36,7 @@ from ironic.conductor import utils as conductor_utils
from ironic.db import api as dbapi from ironic.db import api as dbapi
from ironic import objects from ironic import objects
from ironic.openstack.common import context from ironic.openstack.common import context
from ironic.openstack.common import timeutils
from ironic.tests.conductor import utils as mgr_utils from ironic.tests.conductor import utils as mgr_utils
from ironic.tests.db import base from ironic.tests.db import base
from ironic.tests.db import utils from ironic.tests.db import utils
@ -724,3 +726,74 @@ class ManagerTestCase(base.DbTestCase):
# Verify reservation was released. # Verify reservation was released.
node.refresh(self.context) node.refresh(self.context)
self.assertIsNone(node.reservation) self.assertIsNone(node.reservation)
@mock.patch.object(timeutils, 'utcnow')
def test__check_deploy_timeouts_timeout(self, mock_utcnow):
    # A node left in DEPLOYWAIT for 5 minutes with a 60 second timeout
    # must be moved to DEPLOYFAIL and have the driver clean_up invoked.
    self.config(deploy_callback_timeout=60, group='conductor')
    created_at = datetime.datetime(2000, 1, 1, 0, 0)
    checked_at = created_at + datetime.timedelta(minutes=5)

    # Create the node while the clock reads "created_at".
    mock_utcnow.return_value = created_at
    self.service.start()
    node_attrs = {
        'provision_state': states.DEPLOYWAIT,
        'target_provision_state': states.DEPLOYDONE,
        'provision_updated_at': created_at,
    }
    node = self.dbapi.create_node(utils.get_test_node(**node_attrs))

    # Advance the clock past the timeout before running the check.
    mock_utcnow.return_value = checked_at
    deploy_iface = self.driver.deploy
    with mock.patch.object(deploy_iface, 'clean_up') as clean_mock:
        self.service._check_deploy_timeouts(self.context)
        # Cleanup runs in a worker; wait for it before asserting.
        self.service._worker_pool.waitall()
        node.refresh(self.context)
        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
        self.assertIsNotNone(node.last_error)
        clean_mock.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch.object(timeutils, 'utcnow')
def test__check_deploy_timeouts_no_timeout(self, mock_utcnow):
    # A node that has waited 5 minutes with a 600 second timeout must be
    # left untouched and the driver clean_up must not be called.
    self.config(deploy_callback_timeout=600, group='conductor')
    created_at = datetime.datetime(2000, 1, 1, 0, 0)
    checked_at = created_at + datetime.timedelta(minutes=5)

    # Create the node while the clock reads "created_at".
    mock_utcnow.return_value = created_at
    self.service.start()
    node_attrs = {
        'provision_state': states.DEPLOYWAIT,
        'target_provision_state': states.DEPLOYDONE,
        'provision_updated_at': created_at,
    }
    node = self.dbapi.create_node(utils.get_test_node(**node_attrs))

    # Move the clock forward, but not beyond the configured timeout.
    mock_utcnow.return_value = checked_at
    deploy_iface = self.driver.deploy
    with mock.patch.object(deploy_iface, 'clean_up') as clean_mock:
        self.service._check_deploy_timeouts(self.context)
        node.refresh(self.context)
        self.assertEqual(states.DEPLOYWAIT, node.provision_state)
        self.assertEqual(states.DEPLOYDONE, node.target_provision_state)
        self.assertIsNone(node.last_error)
        self.assertFalse(clean_mock.called)
def test__check_deploy_timeouts_disabled(self):
    # With the timeout set to 0 the check must return before it
    # queries the database for nodes.
    self.config(deploy_callback_timeout=0, group='conductor')
    self.service.start()

    list_patcher = mock.patch.object(self.dbapi, 'get_nodeinfo_list')
    with list_patcher as list_mock:
        self.service._check_deploy_timeouts(self.context)
        self.assertFalse(list_mock.called)
@mock.patch.object(timeutils, 'utcnow')
def test__check_deploy_timeouts_cleanup_failed(self, mock_utcnow):
    # When the driver clean_up raises an IronicException after a timeout,
    # the node must still end up in DEPLOYFAIL, carry the error text in
    # last_error, and have its reservation released.
    self.config(deploy_callback_timeout=60, group='conductor')
    created_at = datetime.datetime(2000, 1, 1, 0, 0)
    checked_at = created_at + datetime.timedelta(minutes=5)

    # Create the node while the clock reads "created_at".
    mock_utcnow.return_value = created_at
    self.service.start()
    node_attrs = {
        'provision_state': states.DEPLOYWAIT,
        'target_provision_state': states.DEPLOYDONE,
        'provision_updated_at': created_at,
    }
    node = self.dbapi.create_node(utils.get_test_node(**node_attrs))

    # Advance the clock past the timeout before running the check.
    mock_utcnow.return_value = checked_at
    deploy_iface = self.driver.deploy
    with mock.patch.object(deploy_iface, 'clean_up') as clean_mock:
        error = 'test-123'
        clean_mock.side_effect = exception.IronicException(message=error)
        self.service._check_deploy_timeouts(self.context)
        # Cleanup runs in a worker; wait for it before asserting.
        self.service._worker_pool.waitall()
        node.refresh(self.context)
        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
        self.assertIn(error, node.last_error)
        self.assertIsNone(node.reservation)