Merge "Allow retrying PXE boot if it takes too long"
commit 0450c4f2fd
@@ -88,9 +88,19 @@ IRONIC_USE_MOD_WSGI=$(trueorfalse $ENABLE_HTTPD_MOD_WSGI_SERVICES IRONIC_USE_MOD
 # Defaults to the (now confusingly named) IRONIC_USE_MOD_WSGI for backward compat
 IRONIC_USE_WSGI=$(trueorfalse $IRONIC_USE_MOD_WSGI IRONIC_USE_WSGI)
 
+# Whether DevStack will be setup for bare metal or VMs
+IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
+
 # Deploy callback timeout can be changed from its default (1800), if required.
 IRONIC_CALLBACK_TIMEOUT=${IRONIC_CALLBACK_TIMEOUT:-}
 
+# Timeout before retrying PXE boot. Set low to help the CI.
+if [[ "$IRONIC_IS_HARDWARE" == False ]]; then
+    IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-600}
+else
+    IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-}
+fi
+
 # Ping timeout after the node becomes active
 IRONIC_PING_TIMEOUT=${IRONIC_PING_TIMEOUT:-}
 
@@ -327,9 +337,6 @@ IRONIC_HTTP_PORT=${IRONIC_HTTP_PORT:-3928}
 IRONIC_RPC_TRANSPORT=${IRONIC_RPC_TRANSPORT:-oslo}
 IRONIC_JSON_RPC_PORT=${IRONIC_JSON_RPC_PORT:-8089}
 
-# Whether DevStack will be setup for bare metal or VMs
-IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
-
 # The first port in the range to bind the Virtual BMCs. The number of
 # ports that will be used depends on $IRONIC_VM_COUNT variable, e.g if
 # $IRONIC_VM_COUNT=3 the ports 6230, 6231 and 6232 will be used for the
@@ -1479,6 +1486,9 @@ function configure_ironic_conductor {
     iniset $IRONIC_CONF_FILE pxe tftp_server $IRONIC_TFTPSERVER_IP
     iniset $IRONIC_CONF_FILE pxe tftp_root $IRONIC_TFTPBOOT_DIR
     iniset $IRONIC_CONF_FILE pxe tftp_master_path $IRONIC_TFTPBOOT_DIR/master_images
+    if [[ -n "$IRONIC_PXE_BOOT_RETRY_TIMEOUT" ]]; then
+        iniset $IRONIC_CONF_FILE pxe boot_retry_timeout $IRONIC_PXE_BOOT_RETRY_TIMEOUT
+    fi
     if [[ "$IRONIC_TERMINAL_SSL" == "True" ]]; then
         # Make sure the cert directory exist
         sudo mkdir -p $IRONIC_TERMINAL_CERT_DIR
@@ -272,6 +272,10 @@ def node_power_action(task, new_state, timeout=None):
     if node['target_power_state'] != target_state:
         node['target_power_state'] = target_state
         node['last_error'] = None
+        driver_internal_info = node.driver_internal_info
+        driver_internal_info['last_power_state_change'] = str(
+            timeutils.utcnow().isoformat())
+        node.driver_internal_info = driver_internal_info
         node.save()
 
     # take power action
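The hunk above stores the power-state change time as a plain ISO-8601 string in driver_internal_info. A small standalone sketch (illustrative only, not part of the change) of that round trip, using the same "%Y-%m-%dT%H:%M:%S.%f" format that the new value_within_timeout() helper further down parses; the sample timestamp is made up:

    import datetime

    # What node_power_action() stores: an ISO-8601 string with microseconds.
    stamp = str(datetime.datetime(2019, 4, 1, 12, 0, 0, 123456).isoformat())
    print(stamp)  # 2019-04-01T12:00:00.123456

    # How value_within_timeout() turns it back into a datetime for comparison.
    parsed = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S.%f")
    assert parsed == datetime.datetime(2019, 4, 1, 12, 0, 0, 123456)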
@@ -872,6 +876,22 @@ def fast_track_able(task):
             and task.node.last_error is None)
 
 
+def value_within_timeout(value, timeout):
+    """Checks if the time is within the previous timeout seconds from now.
+
+    :param value: a string representing date and time or None.
+    :param timeout: timeout in seconds.
+    """
+    # use native datetime objects for conversion and compare
+    # slightly odd because py2 compatability :(
+    last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
+                                      "%Y-%m-%dT%H:%M:%S.%f")
+    # If we found nothing, we assume that the time is essentially epoch.
+    time_delta = datetime.timedelta(seconds=timeout)
+    last_valid = timeutils.utcnow() - time_delta
+    return last_valid <= last
+
+
 def is_fast_track(task):
     """Checks a fast track is available.
 
@@ -888,19 +908,8 @@ def is_fast_track(task):
     :returns: True if the last heartbeat that was recorded was within
         the [deploy]fast_track_timeout setting.
     """
-    if not fast_track_able(task):
-        return False
-    # use native datetime objects for conversion and compare
-    # slightly odd because py2 compatability :(
-    last = datetime.datetime.strptime(
-        task.node.driver_internal_info.get(
-            'agent_last_heartbeat',
-            '1970-01-01T00:00:00.000000'),
-        "%Y-%m-%dT%H:%M:%S.%f")
-    # If we found nothing, we assume that the time is essentially epoch.
-    time_delta = datetime.timedelta(seconds=CONF.deploy.fast_track_timeout)
-    last_valid = timeutils.utcnow() - time_delta
-    # Checking the power state, because if we find the machine off due to
-    # any action, we can't actually fast track the node. :(
-    return (last_valid <= last
+    return (fast_track_able(task)
+            and value_within_timeout(
+                task.node.driver_internal_info.get('agent_last_heartbeat'),
+                CONF.deploy.fast_track_timeout)
             and task.driver.power.get_power_state(task) == states.POWER_ON)
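A hedged usage sketch of the helper introduced above. The body is reproduced here so the example runs standalone (assuming oslo.utils is installed); the timestamps are invented for illustration:

    import datetime

    from oslo_utils import timeutils


    def value_within_timeout(value, timeout):
        # Reproduction of the helper added above, for a standalone demo.
        last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
                                          "%Y-%m-%dT%H:%M:%S.%f")
        return timeutils.utcnow() - datetime.timedelta(seconds=timeout) <= last


    # A heartbeat recorded 10 seconds ago is within a 300 second timeout ...
    recent = (timeutils.utcnow()
              - datetime.timedelta(seconds=10)).strftime("%Y-%m-%dT%H:%M:%S.%f")
    print(value_within_timeout(recent, 300))  # True

    # ... while a missing record is treated as the epoch and therefore is not.
    print(value_within_timeout(None, 300))    # False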
@@ -126,6 +126,16 @@ opts = [
                default=0,
                help=_('Timeout value (in seconds) for downloading an image '
                       'via iPXE. Defaults to 0 (no timeout)')),
+    cfg.IntOpt('boot_retry_timeout',
+               min=60,
+               help=_('Timeout (in seconds) after which PXE boot should be '
+                      'retried. Must be less than [conductor]'
+                      'deploy_callback_timeout. Disabled by default.')),
+    cfg.IntOpt('boot_retry_check_interval',
+               default=90, min=1,
+               help=_('Interval (in seconds) between periodic checks on PXE '
+                      'boot retry. Has no effect if boot_retry_timeout '
+                      'is not set.')),
     cfg.StrOpt('ip_version',
                default='4',
                choices=[('4', _('IPv4')),
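A minimal sketch (not part of the change) of how the two new [pxe] options behave once registered with oslo.config. The option names, default and bounds come from the hunk above; the standalone registration and the override value are illustrative:

    from oslo_config import cfg

    CONF = cfg.CONF
    CONF.register_opts(
        [cfg.IntOpt('boot_retry_timeout', min=60),
         cfg.IntOpt('boot_retry_check_interval', default=90, min=1)],
        group='pxe')

    print(CONF.pxe.boot_retry_timeout)         # None: retries disabled by default
    print(CONF.pxe.boot_retry_check_interval)  # 90

    # Setting a timeout (e.g. the 600 seconds devstack uses above) enables retries.
    CONF.set_override('boot_retry_timeout', 600, group='pxe')
    print(bool(CONF.pxe.boot_retry_timeout))   # True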
@@ -279,20 +279,28 @@ def _zip_matching(a, b, key):
 class Connection(api.Connection):
     """SqlAlchemy connection."""
 
+    _NODE_QUERY_FIELDS = {'console_enabled', 'maintenance', 'driver',
+                          'resource_class', 'provision_state', 'uuid', 'id',
+                          'fault', 'conductor_group', 'owner'}
+    _NODE_IN_QUERY_FIELDS = {'%s_in' % field: field
+                             for field in ('uuid', 'provision_state')}
+    _NODE_NON_NULL_FILTERS = {'associated': 'instance_uuid',
+                              'reserved': 'reservation',
+                              'with_power_state': 'power_state'}
+    _NODE_FILTERS = ({'chassis_uuid', 'reserved_by_any_of',
+                      'provisioned_before', 'inspection_started_before',
+                      'description_contains'}
+                     | _NODE_QUERY_FIELDS
+                     | set(_NODE_IN_QUERY_FIELDS)
+                     | set(_NODE_NON_NULL_FILTERS))
+
     def __init__(self):
         pass
 
     def _validate_nodes_filters(self, filters):
         if filters is None:
             filters = dict()
-        supported_filters = {'console_enabled', 'maintenance', 'driver',
-                             'resource_class', 'provision_state', 'uuid', 'id',
-                             'chassis_uuid', 'associated', 'reserved',
-                             'reserved_by_any_of', 'provisioned_before',
-                             'inspection_started_before', 'fault',
-                             'conductor_group', 'owner', 'uuid_in',
-                             'with_power_state', 'description_contains'}
-        unsupported_filters = set(filters).difference(supported_filters)
+        unsupported_filters = set(filters).difference(self._NODE_FILTERS)
         if unsupported_filters:
             msg = _("SqlAlchemy API does not support "
                     "filtering by %s") % ', '.join(unsupported_filters)
@@ -301,26 +309,26 @@ class Connection(api.Connection):
 
     def _add_nodes_filters(self, query, filters):
         filters = self._validate_nodes_filters(filters)
-        for field in ['console_enabled', 'maintenance', 'driver',
-                      'resource_class', 'provision_state', 'uuid', 'id',
-                      'fault', 'conductor_group', 'owner']:
+        for field in self._NODE_QUERY_FIELDS:
             if field in filters:
                 query = query.filter_by(**{field: filters[field]})
+        for key, field in self._NODE_IN_QUERY_FIELDS.items():
+            if key in filters:
+                query = query.filter(
+                    getattr(models.Node, field).in_(filters[key]))
+        for key, field in self._NODE_NON_NULL_FILTERS.items():
+            if key in filters:
+                column = getattr(models.Node, field)
+                if filters[key]:
+                    query = query.filter(column != sql.null())
+                else:
+                    query = query.filter(column == sql.null())
+
         if 'chassis_uuid' in filters:
             # get_chassis_by_uuid() to raise an exception if the chassis
             # is not found
             chassis_obj = self.get_chassis_by_uuid(filters['chassis_uuid'])
             query = query.filter_by(chassis_id=chassis_obj.id)
-        if 'associated' in filters:
-            if filters['associated']:
-                query = query.filter(models.Node.instance_uuid != sql.null())
-            else:
-                query = query.filter(models.Node.instance_uuid == sql.null())
-        if 'reserved' in filters:
-            if filters['reserved']:
-                query = query.filter(models.Node.reservation != sql.null())
-            else:
-                query = query.filter(models.Node.reservation == sql.null())
         if 'reserved_by_any_of' in filters:
             query = query.filter(models.Node.reservation.in_(
                 filters['reserved_by_any_of']))
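A short illustrative sketch (not part of the change) of what the new declarative maps expand to; this is what lets callers such as the periodic PXE retry check below pass a provision_state_in filter:

    # Mirrors the _NODE_IN_QUERY_FIELDS comprehension added above.
    _NODE_IN_QUERY_FIELDS = {'%s_in' % field: field
                             for field in ('uuid', 'provision_state')}
    print(_NODE_IN_QUERY_FIELDS)
    # {'uuid_in': 'uuid', 'provision_state_in': 'provision_state'}

    # In _add_nodes_filters() a filter such as
    #   filters={'provision_state_in': [states.DEPLOYWAIT, states.CLEANWAIT]}
    # is therefore translated into
    #   query.filter(models.Node.provision_state.in_([...]))
    # via getattr(models.Node, field).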
@@ -334,13 +342,6 @@ class Connection(api.Connection):
                      - (datetime.timedelta(
                          seconds=filters['inspection_started_before'])))
             query = query.filter(models.Node.inspection_started_at < limit)
-        if 'uuid_in' in filters:
-            query = query.filter(models.Node.uuid.in_(filters['uuid_in']))
-        if 'with_power_state' in filters:
-            if filters['with_power_state']:
-                query = query.filter(models.Node.power_state != sql.null())
-            else:
-                query = query.filter(models.Node.power_state == sql.null())
         if 'description_contains' in filters:
             keyword = filters['description_contains']
             if keyword is not None:
@@ -17,7 +17,6 @@ PXE Boot Interface
 
 from ironic_lib import metrics_utils
 from oslo_log import log as logging
-from oslo_utils import strutils
 
 from ironic.common import boot_devices
 from ironic.common import dhcp_factory
@@ -174,14 +173,7 @@ class PXEBoot(pxe_base.PXEBaseMixin, base.BootInterface):
                     pxe_config_template,
                     ipxe_enabled=CONF.pxe.ipxe_enabled)
 
-            persistent = False
-            value = node.driver_info.get('force_persistent_boot_device',
-                                         'Default')
-            if value in {'Always', 'Default', 'Never'}:
-                if value == 'Always':
-                    persistent = True
-            else:
-                persistent = strutils.bool_from_string(value, False)
+            persistent = self._persistent_ramdisk_boot(node)
             manager_utils.node_set_boot_device(task, boot_devices.PXE,
                                                persistent=persistent)
 
@@ -13,13 +13,24 @@
 Base PXE Interface Methods
 """
 
+from futurist import periodics
 from ironic_lib import metrics_utils
+from oslo_config import cfg
 from oslo_log import log as logging
+from oslo_utils import strutils
 
+from ironic.common import boot_devices
 from ironic.common import exception
 from ironic.common.i18n import _
 from ironic.common import pxe_utils
+from ironic.common import states
+from ironic.conductor import task_manager
+from ironic.conductor import utils as manager_utils
 from ironic.drivers.modules import deploy_utils
 
+CONF = cfg.CONF
+
 LOG = logging.getLogger(__name__)
 
 METRICS = metrics_utils.get_metrics_logger(__name__)
@@ -99,3 +110,82 @@ class PXEBaseMixin(object):
             parameters
         """
         pxe_utils.parse_driver_info(task.node, mode='rescue')
+
+    def _persistent_ramdisk_boot(self, node):
+        """If the ramdisk should be configured as a persistent boot device."""
+        value = node.driver_info.get('force_persistent_boot_device', 'Default')
+        if value in {'Always', 'Default', 'Never'}:
+            return value == 'Always'
+        else:
+            return strutils.bool_from_string(value, False)
+
+    _RETRY_ALLOWED_STATES = {states.DEPLOYWAIT, states.CLEANWAIT,
+                             states.RESCUEWAIT}
+
+    @METRICS.timer('PXEBaseMixin._check_boot_timeouts')
+    @periodics.periodic(spacing=CONF.pxe.boot_retry_check_interval,
+                        enabled=bool(CONF.pxe.boot_retry_timeout))
+    def _check_boot_timeouts(self, manager, context):
+        """Periodically checks whether boot has timed out and retry it.
+
+        :param manager: conductor manager.
+        :param context: request context.
+        """
+        filters = {'provision_state_in': self._RETRY_ALLOWED_STATES,
+                   'reserved': False,
+                   'maintenance': False,
+                   'provisioned_before': CONF.pxe.boot_retry_timeout}
+        node_iter = manager.iter_nodes(filters=filters)
+
+        for node_uuid, driver, conductor_group in node_iter:
+            try:
+                lock_purpose = 'checking PXE boot status'
+                with task_manager.acquire(context, node_uuid,
+                                          shared=True,
+                                          purpose=lock_purpose) as task:
+                    self._check_boot_status(task)
+            except (exception.NodeLocked, exception.NodeNotFound):
+                continue
+
+    def _check_boot_status(self, task):
+        if not isinstance(task.driver.boot, PXEBaseMixin):
+            return
+
+        if not _should_retry_boot(task.node):
+            return
+
+        task.upgrade_lock(purpose='retrying PXE boot')
+
+        # Retry critical checks after acquiring the exclusive lock.
+        if (task.node.maintenance or task.node.provision_state
+                not in self._RETRY_ALLOWED_STATES
+                or not _should_retry_boot(task.node)):
+            return
+
+        LOG.info('Booting the ramdisk on node %(node)s is taking more than '
+                 '%(timeout)d seconds, retrying boot',
+                 {'node': task.node.uuid,
+                  'timeout': CONF.pxe.boot_retry_timeout})
+
+        manager_utils.node_power_action(task, states.POWER_OFF)
+        # NOTE(dtantsur): retry even persistent boot setting in case it did not
+        # work for some reason.
+        persistent = self._persistent_ramdisk_boot(task.node)
+        manager_utils.node_set_boot_device(task, boot_devices.PXE,
+                                           persistent=persistent)
+        manager_utils.node_power_action(task, states.POWER_ON)
+
+
+def _should_retry_boot(node):
+    # NOTE(dtantsur): this assumes IPA, do we need to make it generic?
+    for field in ('agent_last_heartbeat', 'last_power_state_change'):
+        if manager_utils.value_within_timeout(
+                node.driver_internal_info.get(field),
+                CONF.pxe.boot_retry_timeout):
+            # Alive and heartbeating, probably busy with something long
+            LOG.debug('Not retrying PXE boot for node %(node)s; its '
+                      '%(event)s happened less than %(timeout)d seconds ago',
+                      {'node': node.uuid, 'event': field,
+                       'timeout': CONF.pxe.boot_retry_timeout})
+            return False
+    return True
@@ -231,7 +231,8 @@ class DbNodeTestCase(base.DbTestCase):
 
         # node with provision_updated timeout
         node1 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
-                                       provision_updated_at=past)
+                                       provision_updated_at=past,
+                                       provision_state=states.DEPLOYING)
         # node with None in provision_updated_at
         node2 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
                                        provision_state=states.DEPLOYWAIT)
@@ -247,6 +248,10 @@ class DbNodeTestCase(base.DbTestCase):
                      states.DEPLOYWAIT})
         self.assertEqual([node2.id], [r[0] for r in res])
 
+        res = self.dbapi.get_nodeinfo_list(
+            filters={'provision_state_in': [states.ACTIVE, states.DEPLOYING]})
+        self.assertEqual([node1.id], [r[0] for r in res])
+
     @mock.patch.object(timeutils, 'utcnow', autospec=True)
     def test_get_nodeinfo_list_inspection(self, mock_utcnow):
         past = datetime.datetime(2000, 1, 1, 0, 0)
@@ -21,6 +21,7 @@ import tempfile
 import mock
 from oslo_config import cfg
 from oslo_serialization import jsonutils as json
+from oslo_utils import timeutils
 from oslo_utils import uuidutils
 
 from ironic.common import boot_devices
@@ -1162,3 +1163,98 @@ class PXEValidateRescueTestCase(db_base.DbTestCase):
             self.assertRaisesRegex(exception.MissingParameterValue,
                                    'Missing.*rescue_kernel',
                                    task.driver.boot.validate_rescue, task)
+
+
+@mock.patch.object(ipxe.iPXEBoot, '__init__', lambda self: None)
+@mock.patch.object(pxe.PXEBoot, '__init__', lambda self: None)
+@mock.patch.object(manager_utils, 'node_set_boot_device', autospec=True)
+@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+class PXEBootRetryTestCase(db_base.DbTestCase):
+
+    boot_interface = 'pxe'
+
+    def setUp(self):
+        super(PXEBootRetryTestCase, self).setUp()
+        self.config(enabled_boot_interfaces=['pxe', 'ipxe', 'fake'])
+        self.config(boot_retry_timeout=300, group='pxe')
+        self.node = obj_utils.create_test_node(
+            self.context,
+            driver='fake-hardware',
+            boot_interface=self.boot_interface,
+            provision_state=states.DEPLOYWAIT)
+
+    @mock.patch.object(pxe.PXEBoot, '_check_boot_status', autospec=True)
+    def test_check_boot_timeouts(self, mock_check_status, mock_power,
+                                 mock_boot_dev):
+        def _side_effect(iface, task):
+            self.assertEqual(self.node.uuid, task.node.uuid)
+
+        mock_check_status.side_effect = _side_effect
+        manager = mock.Mock(spec=['iter_nodes'])
+        manager.iter_nodes.return_value = [
+            (uuidutils.generate_uuid(), 'fake-hardware', ''),
+            (self.node.uuid, self.node.driver, self.node.conductor_group)
+        ]
+        iface = pxe.PXEBoot()
+        iface._check_boot_timeouts(manager, self.context)
+        mock_check_status.assert_called_once_with(iface, mock.ANY)
+
+    def test_check_boot_status_another_boot_interface(self, mock_power,
+                                                      mock_boot_dev):
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=True) as task:
+            task.driver.boot = fake.FakeBoot()
+            pxe.PXEBoot()._check_boot_status(task)
+            self.assertTrue(task.shared)
+        self.assertFalse(mock_power.called)
+        self.assertFalse(mock_boot_dev.called)
+
+    def test_check_boot_status_recent_power_change(self, mock_power,
+                                                   mock_boot_dev):
+        for field in ('agent_last_heartbeat', 'last_power_state_change'):
+            with task_manager.acquire(self.context, self.node.uuid,
+                                      shared=True) as task:
+                task.node.driver_internal_info = {
+                    field: str(timeutils.utcnow().isoformat())
+                }
+                task.driver.boot._check_boot_status(task)
+                self.assertTrue(task.shared)
+        self.assertFalse(mock_power.called)
+        self.assertFalse(mock_boot_dev.called)
+
+    def test_check_boot_status_maintenance(self, mock_power, mock_boot_dev):
+        self.node.maintenance = True
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=True) as task:
+            task.driver.boot._check_boot_status(task)
+            self.assertFalse(task.shared)
+        self.assertFalse(mock_power.called)
+        self.assertFalse(mock_boot_dev.called)
+
+    def test_check_boot_status_wrong_state(self, mock_power, mock_boot_dev):
+        self.node.provision_state = states.DEPLOYING
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=True) as task:
+            task.driver.boot._check_boot_status(task)
+            self.assertFalse(task.shared)
+        self.assertFalse(mock_power.called)
+        self.assertFalse(mock_boot_dev.called)
+
+    def test_check_boot_status_retry(self, mock_power, mock_boot_dev):
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=True) as task:
+            task.driver.boot._check_boot_status(task)
+            self.assertFalse(task.shared)
+        mock_power.assert_has_calls([
+            mock.call(task, states.POWER_OFF),
+            mock.call(task, states.POWER_ON)
+        ])
+        mock_boot_dev.assert_called_once_with(task, 'pxe',
+                                              persistent=False)
+
+
+class iPXEBootRetryTestCase(PXEBootRetryTestCase):
+
+    boot_interface = 'ipxe'
releasenotes/notes/pxe-retry-762a00ba1089bd75.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    Allows retrying PXE/iPXE boot during deployment, cleaning and rescuing.
+    This feature is disabled by default and can be enabled by setting
+    ``[pxe]boot_retry_timeout`` to the timeout (in seconds) after which
+    the boot should be retried.
+
+    The new option ``[pxe]boot_retry_check_interval`` defines how often to
+    check the nodes for timeout and defaults to 90 seconds.