Merge "Clean nodes stuck in CLEANING state when ir-cond restarts"
This commit is contained in:
commit
bc5e21a3ec
@ -223,21 +223,14 @@ class BaseConductorManager(object):
|
||||
self._periodic_tasks_worker.add_done_callback(
|
||||
self._on_periodic_tasks_stop)
|
||||
|
||||
# NOTE(lucasagomes): If the conductor server dies abruptly
|
||||
# mid deployment (OMM Killer, power outage, etc...) we
|
||||
# can not resume the deployment even if the conductor
|
||||
# comes back online. Cleaning the reservation of the nodes
|
||||
# (dbapi.clear_node_reservations_for_conductor) is not enough to
|
||||
# unstick it, so let's gracefully fail the deployment so the node
|
||||
# can go through the steps (deleting & cleaning) to make itself
|
||||
# available again.
|
||||
filters = {'reserved': False,
|
||||
'provision_state': states.DEPLOYING}
|
||||
last_error = (_("The deployment can't be resumed by conductor "
|
||||
"%s. Moving to fail state.") % self.host)
|
||||
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
|
||||
states.DEPLOYING, 'provision_updated_at',
|
||||
last_error=last_error)
|
||||
self._fail_transient_state(
|
||||
states.DEPLOYING,
|
||||
_("The deployment can't be resumed by conductor "
|
||||
"%s. Moving to fail state.") % self.host)
|
||||
self._fail_transient_state(
|
||||
states.CLEANING,
|
||||
_("The cleaning can't be resumed by conductor "
|
||||
"%s. Moving to fail state.") % self.host)
|
||||
|
||||
# Start consoles if it set enabled in a greenthread.
|
||||
try:
|
||||
@ -259,6 +252,20 @@ class BaseConductorManager(object):
|
||||
|
||||
self._started = True
|
||||
|
||||
def _fail_transient_state(self, state, last_error):
|
||||
"""Apply "fail" transition to nodes in a transient state.
|
||||
|
||||
If the conductor server dies abruptly mid deployment or cleaning
|
||||
(OMM Killer, power outage, etc...) we can not resume the process even
|
||||
if the conductor comes back online. Cleaning the reservation of
|
||||
the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
|
||||
to unstick it, so let's gracefully fail the process.
|
||||
"""
|
||||
filters = {'reserved': False, 'provision_state': state}
|
||||
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
|
||||
state, 'provision_updated_at',
|
||||
last_error=last_error)
|
||||
|
||||
def del_host(self, deregister=True):
|
||||
# Conductor deregistration fails if called on non-initialized
|
||||
# conductor (e.g. when rpc server is unreachable).
|
||||
|
@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
ht_mock.assert_called_once_with()
|
||||
|
||||
@mock.patch.object(base_manager, 'LOG')
|
||||
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
|
||||
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
|
||||
autospec=True)
|
||||
@mock.patch.object(driver_factory, 'DriverFactory')
|
||||
def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
|
||||
log_mock):
|
||||
@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self.assertFalse(del_mock.called)
|
||||
|
||||
@mock.patch.object(base_manager, 'LOG')
|
||||
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
|
||||
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
|
||||
autospec=True)
|
||||
@mock.patch.object(driver_factory, 'HardwareTypesFactory')
|
||||
def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
|
||||
log_mock):
|
||||
|
@ -2351,13 +2351,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
|
||||
def test__do_node_clean_automated_disabled(self, mock_validate):
|
||||
self.config(automated_clean=False, group='conductor')
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
target_provision_state=states.AVAILABLE,
|
||||
last_error=None)
|
||||
|
||||
self._start_service()
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_node_clean(task)
|
||||
@ -2469,6 +2469,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
else:
|
||||
tgt_prov_state = states.AVAILABLE
|
||||
driver_info = {'clean_steps': self.clean_steps}
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2477,7 +2479,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
power_state=states.POWER_OFF,
|
||||
driver_internal_info=driver_info)
|
||||
|
||||
self._start_service()
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_node_clean(task, clean_steps=clean_steps)
|
||||
@ -2515,6 +2516,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
tgt_prov_state = states.AVAILABLE
|
||||
driver_internal_info['clean_steps'] = self.clean_steps
|
||||
|
||||
self._start_service()
|
||||
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2525,8 +2528,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_execute.return_value = return_state
|
||||
expected_first_step = node.driver_internal_info['clean_steps'][0]
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
@ -2553,6 +2554,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
manual=False):
|
||||
# Resume an in-progress cleaning after the first async step
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2563,8 +2566,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
clean_step=self.clean_steps[0])
|
||||
mock_execute.return_value = return_state
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, self.next_clean_step_index)
|
||||
@ -2591,6 +2592,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
info = {'clean_steps': self.clean_steps,
|
||||
'clean_step_index': len(self.clean_steps) - 1}
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2599,8 +2602,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
driver_internal_info=info,
|
||||
clean_step=self.clean_steps[-1])
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, None)
|
||||
@ -2628,6 +2629,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_power_execute, manual=False):
|
||||
# Run all steps from start to finish (all synchronous)
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2639,8 +2642,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_deploy_execute.return_value = None
|
||||
mock_power_execute.return_value = None
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
@ -2673,6 +2674,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
manual=False):
|
||||
# When a clean step fails, go to CLEANFAIL
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2683,8 +2686,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
clean_step={})
|
||||
mock_execute.side_effect = Exception()
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
@ -2718,6 +2719,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
|
||||
manual=True):
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
power_exec_mock.return_value = None
|
||||
tear_mock.side_effect = Exception('boom')
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
@ -2775,6 +2776,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
{'clean_steps': None}):
|
||||
# Resume where there are no steps, should be a noop
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
uuid=uuidutils.generate_uuid(),
|
||||
@ -2784,8 +2787,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
driver_internal_info=info,
|
||||
clean_step={})
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, None)
|
||||
@ -2813,6 +2814,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self, deploy_exec_mock, power_exec_mock, manual=False):
|
||||
# When a clean step fails, go to CLEANFAIL
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@ -2823,8 +2826,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
clean_step={})
|
||||
deploy_exec_mock.return_value = "foo"
|
||||
|
||||
self._start_service()
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
@ -3425,11 +3426,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
|
||||
self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
|
||||
mock_nodeinfo_list.assert_called_once_with(
|
||||
columns=self.columns, filters=mock.sentinel.filters)
|
||||
mock_fail_if_state.assert_called_once_with(
|
||||
mock.ANY, mock.ANY,
|
||||
{'provision_state': 'deploying', 'reserved': False},
|
||||
'deploying', 'provision_updated_at',
|
||||
last_error=mock.ANY)
|
||||
expected_calls = [mock.call(mock.ANY, mock.ANY,
|
||||
{'provision_state': 'deploying',
|
||||
'reserved': False},
|
||||
'deploying',
|
||||
'provision_updated_at',
|
||||
last_error=mock.ANY),
|
||||
mock.call(mock.ANY, mock.ANY,
|
||||
{'provision_state': 'cleaning',
|
||||
'reserved': False},
|
||||
'cleaning',
|
||||
'provision_updated_at',
|
||||
last_error=mock.ANY)]
|
||||
mock_fail_if_state.assert_has_calls(expected_calls)
|
||||
|
||||
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
|
||||
def test_iter_nodes_shutdown(self, mock_nodeinfo_list):
|
||||
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- When a conductor managing a node dies mid-cleaning the node would get stuck
|
||||
in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
|
||||
will be moved to the CLEANFAIL state.
|
Loading…
x
Reference in New Issue
Block a user