From 65098bcf6576526200a5c946c8aa12352d8e33d3 Mon Sep 17 00:00:00 2001 From: Rohan Kanade Date: Sun, 2 Mar 2014 11:27:44 +0100 Subject: [PATCH] Adds max retry limit to sync_power_state task * sync_power_state periodic task tries to set node's actual power state. In case of failure to sync power state, Ironic will keep trying to sync power state, Hence introduce a max retry per node sync power state limit * Set Node's DB power state as the actual Node power state if max retries exceeded * Set the Node's last_error msg after max retries exceed and set the node in maintainence mode Change-Id: Idbd3afab746f81b9097e97afb1d12a81686a4564 Closes-Bug: #1285806 --- etc/ironic/ironic.conf.sample | 5 + ironic/conductor/manager.py | 156 ++++++++++++++++--------- ironic/tests/conductor/test_manager.py | 29 +++++ 3 files changed, 138 insertions(+), 52 deletions(-) diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index 9447c4fc94..8253119339 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -493,6 +493,11 @@ # (boolean value) #force_power_state_during_sync=true +# During sync_power_state failures, limit the number of times +# Ironic should try syncing the hardware node power state with +# the node power state in DB (integer value) +#power_state_sync_max_retries=3 + [database] diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 262df2679a..4b8be8efe8 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -41,6 +41,7 @@ building or tearing down the TFTP environment for a node, notifying Neutron of a change, etc. """ +import collections import datetime from eventlet import greenpool @@ -99,62 +100,18 @@ conductor_opts = [ 'state be set to the state recorded in the database ' '(True) or should the database be updated based on ' 'the hardware state (False).'), + cfg.IntOpt('power_state_sync_max_retries', + default=3, + help='During sync_power_state failures, limit the ' + 'number of times Ironic should try syncing the ' + 'hardware node power state with the node power state ' + 'in DB') ] CONF = cfg.CONF CONF.register_opts(conductor_opts, 'conductor') -def _do_sync_power_state(task): - node = task.node - - try: - power_state = task.driver.power.get_power_state(task, node) - except Exception as e: - # TODO(rloo): change to IronicException, after - # https://bugs.launchpad.net/ironic/+bug/1267693 - LOG.warning(_("During sync_power_state, could not get power state for " - "node %(node)s. Error: %(err)s."), - {'node': node.uuid, 'err': e}) - return - - if node.power_state is None: - LOG.info(_("During sync_power_state, node %(node)s has no previous " - "known state. Recording current state '%(state)s'."), - {'node': node.uuid, 'state': power_state}) - node.power_state = power_state - node.save(task.context) - - if power_state == node.power_state: - return - - if CONF.conductor.force_power_state_during_sync: - LOG.warning(_("During sync_power_state, node %(node)s state " - "'%(actual)s' does not match expected state. " - "Changing hardware state to '%(state)s'."), - {'node': node.uuid, 'actual': power_state, - 'state': node.power_state}) - try: - # node_power_action will update the node record - # so don't do that again here. - utils.node_power_action(task, task.node, - node.power_state) - except Exception as e: - # TODO(rloo): change to IronicException after - # https://bugs.launchpad.net/ironic/+bug/1267693 - LOG.error(_("Failed to change power state of node %(node)s " - "to '%(state)s'."), {'node': node.uuid, - 'state': node.power_state}) - else: - LOG.warning(_("During sync_power_state, node %(node)s state " - "does not match expected state '%(state)s'. " - "Updating recorded state to '%(actual)s'."), - {'node': node.uuid, 'actual': power_state, - 'state': node.power_state}) - node.power_state = power_state - node.save(task.context) - - class ConductorManager(service.PeriodicService): """Ironic Conductor service main class.""" @@ -165,6 +122,8 @@ class ConductorManager(service.PeriodicService): super(ConductorManager, self).__init__(host, topic, serializer=serializer) + self.power_state_sync_count = collections.defaultdict(int) + def start(self): super(ConductorManager, self).start() self.dbapi = dbapi.get_instance() @@ -493,10 +452,103 @@ class ConductorManager(service.PeriodicService): def _conductor_service_record_keepalive(self, context): self.dbapi.touch_conductor(self.host) + def _handle_sync_power_state_max_retries_exceeded(self, task, + actual_power_state): + node = task.node + msg = (_("During sync_power_state, max retries exceeded " + "for node %(node)s, node state %(actual)s " + "does not match expected state '%(state)s'. " + "Updating DB state to '%(actual)s' " + "Switching node to maintenance mode."), + {'node': node.uuid, 'actual': actual_power_state, + 'state': node.power_state}) + node.power_state = actual_power_state + node.last_error = msg + node.maintenance = True + node.save(task.context) + LOG.error(msg) + + def _do_sync_power_state(self, task): + node = task.node + power_state = None + + try: + power_state = task.driver.power.get_power_state(task, node) + except Exception as e: + # TODO(rloo): change to IronicException, after + # https://bugs.launchpad.net/ironic/+bug/1267693 + LOG.warning(_("During sync_power_state, could not get power " + "state for node %(node)s. Error: %(err)s."), + {'node': node.uuid, 'err': e}) + self.power_state_sync_count[node.uuid] += 1 + + if (self.power_state_sync_count[node.uuid] >= + CONF.conductor.power_state_sync_max_retries): + self._handle_sync_power_state_max_retries_exceeded(task, + power_state) + return + + if node.power_state is None: + LOG.info(_("During sync_power_state, node %(node)s has no " + "previous known state. Recording current state " + "'%(state)s'."), + {'node': node.uuid, 'state': power_state}) + node.power_state = power_state + node.save(task.context) + + if power_state == node.power_state: + if node.uuid in self.power_state_sync_count: + del self.power_state_sync_count[node.uuid] + return + + if not CONF.conductor.force_power_state_during_sync: + LOG.warning(_("During sync_power_state, node %(node)s state " + "does not match expected state '%(state)s'. " + "Updating recorded state to '%(actual)s'."), + {'node': node.uuid, 'actual': power_state, + 'state': node.power_state}) + node.power_state = power_state + node.save(task.context) + return + + if (self.power_state_sync_count[node.uuid] >= + CONF.conductor.power_state_sync_max_retries): + self._handle_sync_power_state_max_retries_exceeded(task, + power_state) + return + + # Force actual power_state of node equal to DB power_state of node + LOG.warning(_("During sync_power_state, node %(node)s state " + "'%(actual)s' does not match expected state. " + "Changing hardware state to '%(state)s'."), + {'node': node.uuid, 'actual': power_state, + 'state': node.power_state}) + try: + # node_power_action will update the node record + # so don't do that again here. + utils.node_power_action(task, task.node, + node.power_state) + except Exception as e: + # TODO(rloo): change to IronicException after + # https://bugs.launchpad.net/ironic/+bug/1267693 + LOG.error(_("Failed to change power state of node %(node)s " + "to '%(state)s'."), {'node': node.uuid, + 'state': node.power_state}) + attempts_left = (CONF.conductor.power_state_sync_max_retries - + self.power_state_sync_count[node.uuid]) - 1 + msg = (_("%(left)s attempts remaining to " + "sync_power_state for node %(node)s"), + {'left': attempts_left, + 'node': node.uuid}) + LOG.warning(msg) + finally: + # Update power state sync count for current node + self.power_state_sync_count[node.uuid] += 1 + @periodic_task.periodic_task( spacing=CONF.conductor.sync_power_state_interval) def _sync_power_states(self, context): - filters = {'reserved': False} + filters = {'reserved': False, 'maintenance': False} columns = ['id', 'uuid', 'driver'] node_list = self.dbapi.get_nodeinfo_list(columns=columns, filters=filters) @@ -513,7 +565,7 @@ class ConductorManager(service.PeriodicService): continue with task_manager.acquire(context, node_id) as task: - _do_sync_power_state(task) + self._do_sync_power_state(task) except exception.NodeNotFound: LOG.info(_("During sync_power_state, node %(node)s was not " diff --git a/ironic/tests/conductor/test_manager.py b/ironic/tests/conductor/test_manager.py index 6c8ed5d954..df98cce2d7 100644 --- a/ironic/tests/conductor/test_manager.py +++ b/ironic/tests/conductor/test_manager.py @@ -148,6 +148,35 @@ class ManagerTestCase(base.DbTestCase): states.POWER_ON) self.assertFalse(save_mock.called) + @mock.patch.object(conductor_utils, 'node_power_action') + @mock.patch.object(objects.node.Node, 'save') + def test__sync_power_state_max_retries_exceeded(self, save_mock, npa_mock): + """Force sync node power state and check if max retry + limit for force sync is honoured + """ + self.service.start() + self.config(force_power_state_during_sync=True, group='conductor') + self.config(power_state_sync_max_retries=1, group='conductor') + n = utils.get_test_node(driver='fake', power_state=states.POWER_ON) + self.dbapi.create_node(n) + with mock.patch.object(self.driver.power, + 'get_power_state') as get_power_mock: + get_power_mock.return_value = states.POWER_OFF + self.service._sync_power_states(self.context) + self.assertEqual(1, get_power_mock.call_count) + npa_mock.assert_called_once_with(mock.ANY, mock.ANY, + states.POWER_ON) + self.assertEqual(0, save_mock.call_count) + + npa_mock.reset_mock() + save_mock.reset_mock() + get_power_mock.reset_mock() + # This try wont succeed + self.service._sync_power_states(self.context) + self.assertEqual(0, npa_mock.call_count) + self.assertEqual(1, get_power_mock.call_count) + self.assertEqual(1, save_mock.call_count) + @mock.patch.object(conductor_utils, 'node_power_action') @mock.patch.object(objects.node.Node, 'save') def test__sync_power_state_changed_failed(self, save_mock, npa_mock):