Merge "Add timeout for waiting callback from deploy ramdisk"
commit 5638531a91
@@ -489,6 +489,14 @@
 # database, in seconds. (integer value)
 #sync_power_state_interval=60
 
+# Interval between checks of provision timeouts, in seconds.
+# (integer value)
+#check_provision_state_interval=60
+
+# Timeout (seconds) for waiting callback from deploy ramdisk.
+# 0 - unlimited. (integer value)
+#deploy_callback_timeout=1800
+
 
 [database]
 
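For context, the two new options land in the conductor section of the sample config. A deployer who wants deploys to fail faster than the 30-minute default would uncomment and override them; the excerpt below is a hypothetical ironic.conf, not part of this change:

```ini
[conductor]
# Fail a deploy if the ramdisk has not called back within 10 minutes.
deploy_callback_timeout=600
# Scan for provision-state timeouts once a minute (the default).
check_provision_state_interval=60
```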
@@ -42,6 +42,8 @@ building or tearing down the TFTP environment for a node, notifying Neutron of
 a change, etc.
 """
 
+import datetime
+
 from eventlet import greenpool
 
 from oslo.config import cfg
@@ -59,6 +61,7 @@ from ironic.openstack.common import excutils
 from ironic.openstack.common import lockutils
 from ironic.openstack.common import log
 from ironic.openstack.common import periodic_task
+from ironic.openstack.common import timeutils
 
 MANAGER_TOPIC = 'ironic.conductor_manager'
 WORKER_SPAWN_lOCK = "conductor_worker_spawn"
@@ -82,6 +85,14 @@ conductor_opts = [
                    default=60,
                    help='Interval between syncing the node power state to the '
                         'database, in seconds.'),
+        cfg.IntOpt('check_provision_state_interval',
+                   default=60,
+                   help='Interval between checks of provision timeouts, '
+                        'in seconds.'),
+        cfg.IntOpt('deploy_callback_timeout',
+                   default=1800,
+                   help='Timeout (seconds) for waiting callback from deploy '
+                        'ramdisk. 0 - unlimited.'),
 ]
 
 CONF = cfg.CONF
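The options above are plain oslo.config IntOpts registered under the 'conductor' group, so the rest of the code reads them as CONF.conductor.&lt;name&gt;. A minimal, self-contained sketch of that pattern follows; the standalone registration is illustrative only, not how ironic wires it up:

```python
# Minimal sketch of registering and reading the new options with
# oslo.config; ironic registers its conductor_opts the same way.
from oslo.config import cfg

opts = [
    cfg.IntOpt('deploy_callback_timeout',
               default=1800,
               help='Timeout (seconds) for waiting callback from deploy '
                    'ramdisk. 0 - unlimited.'),
]

CONF = cfg.CONF
CONF.register_opts(opts, group='conductor')

# A value of 0 disables the periodic timeout check entirely.
if CONF.conductor.deploy_callback_timeout:
    print('deploys fail after %d seconds without a callback'
          % CONF.conductor.deploy_callback_timeout)
```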
@@ -421,6 +432,47 @@ class ConductorManager(service.PeriodicService):
                          {'node': node_uuid})
                 continue
 
+    @periodic_task.periodic_task(
+            spacing=CONF.conductor.check_provision_state_interval)
+    def _check_deploy_timeouts(self, context):
+        if not CONF.conductor.deploy_callback_timeout:
+            return
+
+        filters = {'reserved': False, 'maintenance': False}
+        columns = ['uuid', 'driver', 'provision_state', 'provision_updated_at']
+        node_list = self.dbapi.get_nodeinfo_list(columns=columns,
+                                                 filters=filters)
+
+        for (node_uuid, driver, state, update_time) in node_list:
+            mapped_hosts = self.driver_rings[driver].get_hosts(node_uuid)
+            if self.host not in mapped_hosts:
+                continue
+
+            if state == states.DEPLOYWAIT:
+                limit = (timeutils.utcnow() - datetime.timedelta(
+                         seconds=CONF.conductor.deploy_callback_timeout))
+                if timeutils.normalize_time(update_time) <= limit:
+                    try:
+                        task = task_manager.TaskManager(context, node_uuid)
+                    except (exception.NodeLocked, exception.NodeNotFound):
+                        continue
+
+                    node = task.node
+                    node.provision_state = states.DEPLOYFAIL
+                    node.target_provision_state = states.NOSTATE
+                    msg = (_('Timeout reached when waiting callback for '
+                             'node %s') % node_uuid)
+                    node.last_error = msg
+                    LOG.error(msg)
+                    node.save(task.context)
+
+                    try:
+                        thread = self._spawn_worker(
+                                utils.cleanup_after_timeout, task)
+                        thread.link(lambda t: task.release_resources())
+                    except exception.NoFreeConductorWorker:
+                        task.release_resources()
+
     def _get_current_driver_rings(self):
         """Build the current hash ring for this ConductorManager's drivers."""
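The core of the new periodic task is a cutoff comparison: a node still in DEPLOYWAIT times out when its provision_updated_at is older than utcnow() minus deploy_callback_timeout (normalize_time strips tzinfo so naive and aware timestamps compare safely). A standalone sketch of that predicate, using plain datetime in place of ironic's timeutils; deploy_timed_out() is an illustrative helper, not ironic code:

```python
# Sketch of the timeout predicate from _check_deploy_timeouts above.
import datetime

def deploy_timed_out(update_time, timeout_seconds, now=None):
    """True if the node last reported before the cutoff."""
    if not timeout_seconds:  # 0 means wait forever
        return False
    now = now or datetime.datetime.utcnow()
    limit = now - datetime.timedelta(seconds=timeout_seconds)
    return update_time <= limit

# A node last updated 31 minutes ago trips the default 1800 s timeout;
# one updated 16 minutes ago does not.
now = datetime.datetime(2000, 1, 1, 0, 31)
assert deploy_timed_out(datetime.datetime(2000, 1, 1, 0, 0), 1800, now=now)
assert not deploy_timed_out(datetime.datetime(2000, 1, 1, 0, 15), 1800, now=now)
```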
@@ -13,6 +13,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+from ironic.common import exception
 from ironic.common import states
 from ironic.conductor import task_manager
 from ironic.openstack.common import excutils
@@ -95,3 +96,29 @@ def node_power_action(task, node, state):
     finally:
         node['target_power_state'] = states.NOSTATE
         node.save(context)
+
+
+@task_manager.require_exclusive_lock
+def cleanup_after_timeout(task):
+    """Cleanup deploy task after timeout.
+
+    :param task: a TaskManager instance.
+    """
+    node = task.node
+    context = task.context
+    error_msg = _('Cleanup failed for node %(node)s after deploy timeout: '
+                  ' %(error)s')
+    try:
+        task.driver.deploy.clean_up(task, node)
+    except exception.IronicException as e:
+        msg = error_msg % {'node': node.uuid, 'error': e}
+        LOG.error(msg)
+        node.last_error = msg
+        node.save(context)
+    except Exception as e:
+        msg = error_msg % {'node': node.uuid, 'error': e}
+        LOG.error(msg)
+        node.last_error = _('Deploy timed out, but an unhandled exception was '
+                            'encountered while aborting. More info may be '
+                            'found in the log file.')
+        node.save(context)
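cleanup_after_timeout runs in a conductor worker green thread, and _check_deploy_timeouts attaches task.release_resources() as a link callback so the node lock is dropped whether the cleanup succeeds or raises. A toy sketch of that eventlet spawn-then-link pattern, with illustrative cleanup/release stand-ins rather than ironic's real task plumbing:

```python
# Toy sketch of the spawn-then-link pattern used above: the release
# callback fires after the green thread exits, success or failure.
from eventlet import greenpool

def cleanup(name):
    print('cleaning up %s' % name)

def release(name):
    print('releasing %s' % name)

pool = greenpool.GreenPool(size=4)
thread = pool.spawn(cleanup, 'node-1')
thread.link(lambda gt: release('node-1'))  # runs even if cleanup raised
pool.waitall()
```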
@@ -19,6 +19,7 @@
 
 """Test class for Ironic ManagerService."""
 
+import datetime
 import time
 
 import mock
@@ -35,6 +36,7 @@ from ironic.conductor import utils as conductor_utils
 from ironic.db import api as dbapi
 from ironic import objects
 from ironic.openstack.common import context
+from ironic.openstack.common import timeutils
 from ironic.tests.conductor import utils as mgr_utils
 from ironic.tests.db import base
 from ironic.tests.db import utils
@@ -724,3 +726,74 @@ class ManagerTestCase(base.DbTestCase):
         # Verify reservation was released.
         node.refresh(self.context)
         self.assertIsNone(node.reservation)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_timeout(self, mock_utcnow):
+        self.config(deploy_callback_timeout=60, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            self.service._check_deploy_timeouts(self.context)
+            self.service._worker_pool.waitall()
+            node.refresh(self.context)
+            self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+            self.assertEqual(states.NOSTATE, node.target_provision_state)
+            self.assertIsNotNone(node.last_error)
+            clean_mock.assert_called_once_with(mock.ANY, mock.ANY)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_no_timeout(self, mock_utcnow):
+        self.config(deploy_callback_timeout=600, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            self.service._check_deploy_timeouts(self.context)
+        node.refresh(self.context)
+        self.assertEqual(states.DEPLOYWAIT, node.provision_state)
+        self.assertEqual(states.DEPLOYDONE, node.target_provision_state)
+        self.assertIsNone(node.last_error)
+        self.assertFalse(clean_mock.called)
+
+    def test__check_deploy_timeouts_disabled(self):
+        self.config(deploy_callback_timeout=0, group='conductor')
+        self.service.start()
+        with mock.patch.object(self.dbapi, 'get_nodeinfo_list') as get_mock:
+            self.service._check_deploy_timeouts(self.context)
+            self.assertFalse(get_mock.called)
+
+    @mock.patch.object(timeutils, 'utcnow')
+    def test__check_deploy_timeouts_cleanup_failed(self, mock_utcnow):
+        self.config(deploy_callback_timeout=60, group='conductor')
+        past = datetime.datetime(2000, 1, 1, 0, 0)
+        present = past + datetime.timedelta(minutes=5)
+        mock_utcnow.return_value = past
+        self.service.start()
+        n = utils.get_test_node(provision_state=states.DEPLOYWAIT,
+                                target_provision_state=states.DEPLOYDONE,
+                                provision_updated_at=past)
+        node = self.dbapi.create_node(n)
+        mock_utcnow.return_value = present
+        with mock.patch.object(self.driver.deploy, 'clean_up') as clean_mock:
+            error = 'test-123'
+            clean_mock.side_effect = exception.IronicException(message=error)
+            self.service._check_deploy_timeouts(self.context)
+            self.service._worker_pool.waitall()
+            node.refresh(self.context)
+            self.assertEqual(states.DEPLOYFAIL, node.provision_state)
+            self.assertEqual(states.NOSTATE, node.target_provision_state)
+            self.assertIn(error, node.last_error)
+            self.assertIsNone(node.reservation)
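All of the time-sensitive tests use the same trick: patch timeutils.utcnow so the clock reads past while the node is created, then present before the periodic task runs, making the five-minute gap deterministic with no real waiting. A minimal sketch of the technique against a stand-in module; clock and seconds_since are illustrative, not ironic code:

```python
# Minimal sketch of freezing the clock with mock, as the tests above do
# against ironic's timeutils module.
import datetime

import mock  # unittest.mock in modern Python

class clock(object):
    @staticmethod
    def utcnow():
        return datetime.datetime.utcnow()

def seconds_since(start):
    # Code under test calls the module-level utcnow(), so patching the
    # attribute on 'clock' redirects this call.
    return (clock.utcnow() - start).total_seconds()

with mock.patch.object(clock, 'utcnow') as mock_utcnow:
    past = datetime.datetime(2000, 1, 1, 0, 0)
    mock_utcnow.return_value = past + datetime.timedelta(minutes=5)
    assert seconds_since(past) == 300  # deterministic, no sleeping
```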