Adds max retry limit to sync_power_state task

* sync_power_state periodic task tries to set node's actual power state.
  In case of failure to sync power state, Ironic will keep trying to
  sync power state, Hence introduce a max retry per node
  sync power state limit
* Set Node's DB power state as the actual Node power state if max retries
  exceeded
* Set the Node's last_error msg after max retries exceed and set the
  node in maintainence mode

Change-Id: Idbd3afab746f81b9097e97afb1d12a81686a4564
Closes-Bug: #1285806
This commit is contained in:
Rohan Kanade 2014-03-02 11:27:44 +01:00 committed by Rohan Kanade
parent 882e007890
commit 65098bcf65
3 changed files with 138 additions and 52 deletions

View File

@ -493,6 +493,11 @@
# (boolean value)
#force_power_state_during_sync=true
# During sync_power_state failures, limit the number of times
# Ironic should try syncing the hardware node power state with
# the node power state in DB (integer value)
#power_state_sync_max_retries=3
[database]

View File

@ -41,6 +41,7 @@ building or tearing down the TFTP environment for a node, notifying Neutron of
a change, etc.
"""
import collections
import datetime
from eventlet import greenpool
@ -99,62 +100,18 @@ conductor_opts = [
'state be set to the state recorded in the database '
'(True) or should the database be updated based on '
'the hardware state (False).'),
cfg.IntOpt('power_state_sync_max_retries',
default=3,
help='During sync_power_state failures, limit the '
'number of times Ironic should try syncing the '
'hardware node power state with the node power state '
'in DB')
]
CONF = cfg.CONF
CONF.register_opts(conductor_opts, 'conductor')
def _do_sync_power_state(task):
node = task.node
try:
power_state = task.driver.power.get_power_state(task, node)
except Exception as e:
# TODO(rloo): change to IronicException, after
# https://bugs.launchpad.net/ironic/+bug/1267693
LOG.warning(_("During sync_power_state, could not get power state for "
"node %(node)s. Error: %(err)s."),
{'node': node.uuid, 'err': e})
return
if node.power_state is None:
LOG.info(_("During sync_power_state, node %(node)s has no previous "
"known state. Recording current state '%(state)s'."),
{'node': node.uuid, 'state': power_state})
node.power_state = power_state
node.save(task.context)
if power_state == node.power_state:
return
if CONF.conductor.force_power_state_during_sync:
LOG.warning(_("During sync_power_state, node %(node)s state "
"'%(actual)s' does not match expected state. "
"Changing hardware state to '%(state)s'."),
{'node': node.uuid, 'actual': power_state,
'state': node.power_state})
try:
# node_power_action will update the node record
# so don't do that again here.
utils.node_power_action(task, task.node,
node.power_state)
except Exception as e:
# TODO(rloo): change to IronicException after
# https://bugs.launchpad.net/ironic/+bug/1267693
LOG.error(_("Failed to change power state of node %(node)s "
"to '%(state)s'."), {'node': node.uuid,
'state': node.power_state})
else:
LOG.warning(_("During sync_power_state, node %(node)s state "
"does not match expected state '%(state)s'. "
"Updating recorded state to '%(actual)s'."),
{'node': node.uuid, 'actual': power_state,
'state': node.power_state})
node.power_state = power_state
node.save(task.context)
class ConductorManager(service.PeriodicService):
"""Ironic Conductor service main class."""
@ -165,6 +122,8 @@ class ConductorManager(service.PeriodicService):
super(ConductorManager, self).__init__(host, topic,
serializer=serializer)
self.power_state_sync_count = collections.defaultdict(int)
def start(self):
super(ConductorManager, self).start()
self.dbapi = dbapi.get_instance()
@ -493,10 +452,103 @@ class ConductorManager(service.PeriodicService):
def _conductor_service_record_keepalive(self, context):
self.dbapi.touch_conductor(self.host)
def _handle_sync_power_state_max_retries_exceeded(self, task,
actual_power_state):
node = task.node
msg = (_("During sync_power_state, max retries exceeded "
"for node %(node)s, node state %(actual)s "
"does not match expected state '%(state)s'. "
"Updating DB state to '%(actual)s' "
"Switching node to maintenance mode."),
{'node': node.uuid, 'actual': actual_power_state,
'state': node.power_state})
node.power_state = actual_power_state
node.last_error = msg
node.maintenance = True
node.save(task.context)
LOG.error(msg)
def _do_sync_power_state(self, task):
node = task.node
power_state = None
try:
power_state = task.driver.power.get_power_state(task, node)
except Exception as e:
# TODO(rloo): change to IronicException, after
# https://bugs.launchpad.net/ironic/+bug/1267693
LOG.warning(_("During sync_power_state, could not get power "
"state for node %(node)s. Error: %(err)s."),
{'node': node.uuid, 'err': e})
self.power_state_sync_count[node.uuid] += 1
if (self.power_state_sync_count[node.uuid] >=
CONF.conductor.power_state_sync_max_retries):
self._handle_sync_power_state_max_retries_exceeded(task,
power_state)
return
if node.power_state is None:
LOG.info(_("During sync_power_state, node %(node)s has no "
"previous known state. Recording current state "
"'%(state)s'."),
{'node': node.uuid, 'state': power_state})
node.power_state = power_state
node.save(task.context)
if power_state == node.power_state:
if node.uuid in self.power_state_sync_count:
del self.power_state_sync_count[node.uuid]
return
if not CONF.conductor.force_power_state_during_sync:
LOG.warning(_("During sync_power_state, node %(node)s state "
"does not match expected state '%(state)s'. "
"Updating recorded state to '%(actual)s'."),
{'node': node.uuid, 'actual': power_state,
'state': node.power_state})
node.power_state = power_state
node.save(task.context)
return
if (self.power_state_sync_count[node.uuid] >=
CONF.conductor.power_state_sync_max_retries):
self._handle_sync_power_state_max_retries_exceeded(task,
power_state)
return
# Force actual power_state of node equal to DB power_state of node
LOG.warning(_("During sync_power_state, node %(node)s state "
"'%(actual)s' does not match expected state. "
"Changing hardware state to '%(state)s'."),
{'node': node.uuid, 'actual': power_state,
'state': node.power_state})
try:
# node_power_action will update the node record
# so don't do that again here.
utils.node_power_action(task, task.node,
node.power_state)
except Exception as e:
# TODO(rloo): change to IronicException after
# https://bugs.launchpad.net/ironic/+bug/1267693
LOG.error(_("Failed to change power state of node %(node)s "
"to '%(state)s'."), {'node': node.uuid,
'state': node.power_state})
attempts_left = (CONF.conductor.power_state_sync_max_retries -
self.power_state_sync_count[node.uuid]) - 1
msg = (_("%(left)s attempts remaining to "
"sync_power_state for node %(node)s"),
{'left': attempts_left,
'node': node.uuid})
LOG.warning(msg)
finally:
# Update power state sync count for current node
self.power_state_sync_count[node.uuid] += 1
@periodic_task.periodic_task(
spacing=CONF.conductor.sync_power_state_interval)
def _sync_power_states(self, context):
filters = {'reserved': False}
filters = {'reserved': False, 'maintenance': False}
columns = ['id', 'uuid', 'driver']
node_list = self.dbapi.get_nodeinfo_list(columns=columns,
filters=filters)
@ -513,7 +565,7 @@ class ConductorManager(service.PeriodicService):
continue
with task_manager.acquire(context, node_id) as task:
_do_sync_power_state(task)
self._do_sync_power_state(task)
except exception.NodeNotFound:
LOG.info(_("During sync_power_state, node %(node)s was not "

View File

@ -148,6 +148,35 @@ class ManagerTestCase(base.DbTestCase):
states.POWER_ON)
self.assertFalse(save_mock.called)
@mock.patch.object(conductor_utils, 'node_power_action')
@mock.patch.object(objects.node.Node, 'save')
def test__sync_power_state_max_retries_exceeded(self, save_mock, npa_mock):
"""Force sync node power state and check if max retry
limit for force sync is honoured
"""
self.service.start()
self.config(force_power_state_during_sync=True, group='conductor')
self.config(power_state_sync_max_retries=1, group='conductor')
n = utils.get_test_node(driver='fake', power_state=states.POWER_ON)
self.dbapi.create_node(n)
with mock.patch.object(self.driver.power,
'get_power_state') as get_power_mock:
get_power_mock.return_value = states.POWER_OFF
self.service._sync_power_states(self.context)
self.assertEqual(1, get_power_mock.call_count)
npa_mock.assert_called_once_with(mock.ANY, mock.ANY,
states.POWER_ON)
self.assertEqual(0, save_mock.call_count)
npa_mock.reset_mock()
save_mock.reset_mock()
get_power_mock.reset_mock()
# This try wont succeed
self.service._sync_power_states(self.context)
self.assertEqual(0, npa_mock.call_count)
self.assertEqual(1, get_power_mock.call_count)
self.assertEqual(1, save_mock.call_count)
@mock.patch.object(conductor_utils, 'node_power_action')
@mock.patch.object(objects.node.Node, 'save')
def test__sync_power_state_changed_failed(self, save_mock, npa_mock):