Merge "Clean nodes stuck in CLEANING state when ir-cond restarts"

This commit is contained in:
Zuul 2018-02-19 17:17:49 +00:00 committed by Gerrit Code Review
commit bc5e21a3ec
4 changed files with 64 additions and 41 deletions

View File

@ -223,21 +223,14 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop)
# NOTE(lucasagomes): If the conductor server dies abruptly
# mid deployment (OMM Killer, power outage, etc...) we
# can not resume the deployment even if the conductor
# comes back online. Cleaning the reservation of the nodes
# (dbapi.clear_node_reservations_for_conductor) is not enough to
# unstick it, so let's gracefully fail the deployment so the node
# can go through the steps (deleting & cleaning) to make itself
# available again.
filters = {'reserved': False,
'provision_state': states.DEPLOYING}
last_error = (_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
states.DEPLOYING, 'provision_updated_at',
last_error=last_error)
self._fail_transient_state(
states.DEPLOYING,
_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
self._fail_transient_state(
states.CLEANING,
_("The cleaning can't be resumed by conductor "
"%s. Moving to fail state.") % self.host)
# Start consoles if it set enabled in a greenthread.
try:
@ -259,6 +252,20 @@ class BaseConductorManager(object):
self._started = True
def _fail_transient_state(self, state, last_error):
"""Apply "fail" transition to nodes in a transient state.
If the conductor server dies abruptly mid deployment or cleaning
(OMM Killer, power outage, etc...) we can not resume the process even
if the conductor comes back online. Cleaning the reservation of
the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
to unstick it, so let's gracefully fail the process.
"""
filters = {'reserved': False, 'provision_state': state}
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
state, 'provision_updated_at',
last_error=last_error)
def del_host(self, deregister=True):
# Conductor deregistration fails if called on non-initialized
# conductor (e.g. when rpc server is unreachable).

View File

@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
ht_mock.assert_called_once_with()
@mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'DriverFactory')
def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
log_mock):
@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertFalse(del_mock.called)
@mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'HardwareTypesFactory')
def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
log_mock):

View File

@ -2351,13 +2351,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_clean_automated_disabled(self, mock_validate):
self.config(automated_clean=False, group='conductor')
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE,
last_error=None)
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task)
@ -2469,6 +2469,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
else:
tgt_prov_state = states.AVAILABLE
driver_info = {'clean_steps': self.clean_steps}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2477,7 +2479,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_state=states.POWER_OFF,
driver_internal_info=driver_info)
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task, clean_steps=clean_steps)
@ -2515,6 +2516,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.AVAILABLE
driver_internal_info['clean_steps'] = self.clean_steps
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2525,8 +2528,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_execute.return_value = return_state
expected_first_step = node.driver_internal_info['clean_steps'][0]
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2553,6 +2554,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# Resume an in-progress cleaning after the first async step
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2563,8 +2566,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step=self.clean_steps[0])
mock_execute.return_value = return_state
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, self.next_clean_step_index)
@ -2591,6 +2592,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
info = {'clean_steps': self.clean_steps,
'clean_step_index': len(self.clean_steps) - 1}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2599,8 +2602,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step=self.clean_steps[-1])
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@ -2628,6 +2629,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_power_execute, manual=False):
# Run all steps from start to finish (all synchronous)
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2639,8 +2642,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_deploy_execute.return_value = None
mock_power_execute.return_value = None
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2673,6 +2674,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2683,8 +2686,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
mock_execute.side_effect = Exception()
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2718,6 +2719,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
manual=True):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_exec_mock.return_value = None
tear_mock.side_effect = Exception('boom')
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -2775,6 +2776,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
{'clean_steps': None}):
# Resume where there are no steps, should be a noop
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
uuid=uuidutils.generate_uuid(),
@ -2784,8 +2787,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info,
clean_step={})
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
@ -2813,6 +2814,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, deploy_exec_mock, power_exec_mock, manual=False):
# When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
@ -2823,8 +2826,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={})
deploy_exec_mock.return_value = "foo"
self._start_service()
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
@ -3425,11 +3426,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
mock_nodeinfo_list.assert_called_once_with(
columns=self.columns, filters=mock.sentinel.filters)
mock_fail_if_state.assert_called_once_with(
mock.ANY, mock.ANY,
{'provision_state': 'deploying', 'reserved': False},
'deploying', 'provision_updated_at',
last_error=mock.ANY)
expected_calls = [mock.call(mock.ANY, mock.ANY,
{'provision_state': 'deploying',
'reserved': False},
'deploying',
'provision_updated_at',
last_error=mock.ANY),
mock.call(mock.ANY, mock.ANY,
{'provision_state': 'cleaning',
'reserved': False},
'cleaning',
'provision_updated_at',
last_error=mock.ANY)]
mock_fail_if_state.assert_has_calls(expected_calls)
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
def test_iter_nodes_shutdown(self, mock_nodeinfo_list):

View File

@ -0,0 +1,5 @@
---
fixes:
- When a conductor managing a node dies mid-cleaning the node would get stuck
in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
will be moved to the CLEANFAIL state.