Run ipmi power status less aggressively
Ironic runs IPMI 'power status' in a loop until the desired state is reached. Running repeated IPMI commands in a tight loop (1 second sleep) seems to occasionally cause 'insufficient resources' errors with some hardware types. In this commit we update the IPMI retry timer to use a DynamicLoopingCall which can wait longer for each retry of the power status command. We increase the wait time exponentially between each call, until the total time would exceed the max retry time on the next iteration. This patch also removes the spurious initial 'power status' call which preceeded the 'power on' or 'power off' call without any gap. Since Ironic has a periodic task to sync the power state, it is safe to assume that the hardware's current power state is already known and does not need to be verified inside the set_power_state method. As part of this change we also increase the default IPMI retry_timeout config to 60 seconds. Co-authored-by: Dan Prince <dprince@redhat.com> Change-Id: Ie0e29239592bb64c7a10ecedae299df2c5053359 Closes-bug: #1291991
This commit is contained in:
parent
809cbf9281
commit
a6146ed691
@ -654,7 +654,7 @@
|
||||
|
||||
# Maximum time in seconds to retry IPMI operations. (integer
|
||||
# value)
|
||||
#retry_timeout=10
|
||||
#retry_timeout=60
|
||||
|
||||
|
||||
[keystone_authtoken]
|
||||
|
@ -31,7 +31,7 @@ from pyghmi.ipmi import command as ipmi_command
|
||||
|
||||
opts = [
|
||||
cfg.IntOpt('retry_timeout',
|
||||
default=10,
|
||||
default=60,
|
||||
help='Maximum time in seconds to retry IPMI operations.'),
|
||||
]
|
||||
|
||||
|
@ -138,6 +138,80 @@ def _exec_ipmitool(driver_info, command):
|
||||
return out, err
|
||||
|
||||
|
||||
def _sleep_time(iter):
|
||||
"""Return the time-to-sleep for the n'th iteration of a retry loop.
|
||||
This implementation increases exponentially.
|
||||
|
||||
:param iter: iteration number
|
||||
:returns: number of seconds to sleep
|
||||
|
||||
"""
|
||||
if iter <= 1:
|
||||
return 1
|
||||
return iter ** 2
|
||||
|
||||
|
||||
def _set_and_wait(target_state, driver_info):
|
||||
"""Helper function for DynamicLoopingCall.
|
||||
|
||||
This method changes the power state and polls the BMCuntil the desired
|
||||
power state is reached, or CONF.ipmi.retry_timeout would be exceeded by the
|
||||
next iteration.
|
||||
|
||||
This method assumes the caller knows the current power state and does not
|
||||
check it prior to changing the power state. Most BMCs should be fine, but
|
||||
if a driver is concerned, the state should be checked prior to calling this
|
||||
method.
|
||||
|
||||
:param target_state: desired power state
|
||||
:param driver_info: the ipmitool parameters for accessing a node.
|
||||
:returns: one of ironic.common.states
|
||||
:raises: IPMIFailure on an error from ipmitool (from _power_status call).
|
||||
|
||||
"""
|
||||
if target_state == states.POWER_ON:
|
||||
state_name = "on"
|
||||
elif target_state == states.POWER_OFF:
|
||||
state_name = "off"
|
||||
|
||||
def _wait(mutable):
|
||||
try:
|
||||
# Only issue power change command once
|
||||
if mutable['iter'] < 0:
|
||||
_exec_ipmitool(driver_info, "power %s" % state_name)
|
||||
else:
|
||||
mutable['power'] = _power_status(driver_info)
|
||||
except Exception:
|
||||
# Log failures but keep trying
|
||||
LOG.warning(_("IPMI power %(state)s failed for node %(node)s."),
|
||||
{'state': state_name, 'node': driver_info['uuid']})
|
||||
finally:
|
||||
mutable['iter'] += 1
|
||||
|
||||
if mutable['power'] == target_state:
|
||||
raise loopingcall.LoopingCallDone()
|
||||
|
||||
sleep_time = _sleep_time(mutable['iter'])
|
||||
if (sleep_time + mutable['total_time']) > CONF.ipmi.retry_timeout:
|
||||
# Stop if the next loop would exceed maximum retry_timeout
|
||||
LOG.error(_('IPMI power %(state)s timed out after '
|
||||
'%(tries)s retries.'),
|
||||
{'state': state_name, 'tries': mutable['iter']})
|
||||
mutable['power'] = states.ERROR
|
||||
raise loopingcall.LoopingCallDone()
|
||||
else:
|
||||
mutable['total_time'] += sleep_time
|
||||
return sleep_time
|
||||
|
||||
# Use mutable objects so the looped method can change them.
|
||||
# Start 'iter' from -1 so that the first two checks are one second apart.
|
||||
status = {'power': None, 'iter': -1, 'total_time': 0}
|
||||
|
||||
timer = loopingcall.DynamicLoopingCall(_wait, status)
|
||||
timer.start().wait()
|
||||
return status['power']
|
||||
|
||||
|
||||
def _power_on(driver_info):
|
||||
"""Turn the power ON for this node.
|
||||
|
||||
@ -146,36 +220,7 @@ def _power_on(driver_info):
|
||||
:raises: IPMIFailure on an error from ipmitool (from _power_status call).
|
||||
|
||||
"""
|
||||
# use mutable objects so the looped method can change them
|
||||
state = [None]
|
||||
retries = [0]
|
||||
|
||||
def _wait_for_power_on(state, retries):
|
||||
"""Called at an interval until the node's power is on."""
|
||||
|
||||
state[0] = _power_status(driver_info)
|
||||
if state[0] == states.POWER_ON:
|
||||
raise loopingcall.LoopingCallDone()
|
||||
|
||||
if retries[0] > CONF.ipmi.retry_timeout:
|
||||
LOG.error(_('IPMI power on timed out after %(tries)s retries.'),
|
||||
{'tries': retries[0]})
|
||||
state[0] = states.ERROR
|
||||
raise loopingcall.LoopingCallDone()
|
||||
try:
|
||||
# only issue "power on" once
|
||||
if retries[0] == 0:
|
||||
_exec_ipmitool(driver_info, "power on")
|
||||
retries[0] += 1
|
||||
except Exception:
|
||||
# Log failures but keep trying
|
||||
LOG.warning(_("IPMI power on failed for node %s.")
|
||||
% driver_info['uuid'])
|
||||
|
||||
timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_on,
|
||||
state, retries)
|
||||
timer.start(interval=1.0).wait()
|
||||
return state[0]
|
||||
return _set_and_wait(states.POWER_ON, driver_info)
|
||||
|
||||
|
||||
def _power_off(driver_info):
|
||||
@ -186,36 +231,7 @@ def _power_off(driver_info):
|
||||
:raises: IPMIFailure on an error from ipmitool (from _power_status call).
|
||||
|
||||
"""
|
||||
# use mutable objects so the looped method can change them
|
||||
state = [None]
|
||||
retries = [0]
|
||||
|
||||
def _wait_for_power_off(state, retries):
|
||||
"""Called at an interval until the node's power is off."""
|
||||
|
||||
state[0] = _power_status(driver_info)
|
||||
if state[0] == states.POWER_OFF:
|
||||
raise loopingcall.LoopingCallDone()
|
||||
|
||||
if retries[0] > CONF.ipmi.retry_timeout:
|
||||
LOG.error(_('IPMI power off timed out after %(tries)s retries.'),
|
||||
{'tries': retries[0]})
|
||||
state[0] = states.ERROR
|
||||
raise loopingcall.LoopingCallDone()
|
||||
try:
|
||||
# only issue "power off" once
|
||||
if retries[0] == 0:
|
||||
_exec_ipmitool(driver_info, "power off")
|
||||
retries[0] += 1
|
||||
except Exception:
|
||||
# Log failures but keep trying
|
||||
LOG.warning(_("IPMI power off failed for node %s.")
|
||||
% driver_info['uuid'])
|
||||
|
||||
timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_off,
|
||||
state=state, retries=retries)
|
||||
timer.start(interval=1.0).wait()
|
||||
return state[0]
|
||||
return _set_and_wait(states.POWER_OFF, driver_info)
|
||||
|
||||
|
||||
def _power_status(driver_info):
|
||||
|
@ -251,9 +251,7 @@ class IPMIToolPrivateMethodTestCase(base.TestCase):
|
||||
autospec=True) as mock_exec:
|
||||
mock_exec.side_effect = side_effect
|
||||
|
||||
expected = [mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power on"),
|
||||
mock.call(self.info, "power status"),
|
||||
expected = [mock.call(self.info, "power on"),
|
||||
mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power status")]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user