Handle deployment interruption

This commit improves the handling of interruptions
during a USM upgrade.

If an interruption, such as a host reboot, occurs while the USM
upgrade is in the deploy-start, deploy-host, activate or
activate-rollback stage, the deploy state is set to the failed
state of that stage. Once the state has been set to failed, the
USM upgrade can be retried.

Test Plan:

PASS: build and deploy iso
PASS: SX upgrade with deploy start interruption
PASS: DX upgrade with deploy start interruption

Task: 2011357
Story: 51849

Change-Id: I37341d9be5c17d1da7161e08c7b46fd86f28f589
Signed-off-by: junfeng-li <junfeng.li@windriver.com>
This commit is contained in:
junfeng-li 2025-03-24 23:52:33 +00:00
parent a918790698
commit 2f8effc929
4 changed files with 103 additions and 3 deletions

View File

@ -8,6 +8,7 @@ oslo.policy
oslo.serialization
netaddr
pecan
psutil
psycopg2-binary
pycryptodomex
PyGObject

View File

@ -35,9 +35,12 @@ import software.apt_utils as apt_utils
import software.ostree_utils as ostree_utils
from software.api import app
from software.authapi import app as auth_app
from software.constants import CONTROLLER_0_HOSTNAME
from software.constants import CONTROLLER_1_HOSTNAME
from software.constants import INSTALL_LOCAL_FLAG
from software.states import DEPLOY_STATES
from software.states import DEPLOY_HOST_STATES
from software.states import DEPLOY_STATES
from software.states import INTERRUPTION_RECOVERY_STATES
from software.base import PatchService
from software.dc_utils import get_subcloud_groupby_version
from software.deploy_state import require_deploy_state
@ -88,6 +91,7 @@ from software.software_functions import get_release_from_patch
from software.software_functions import clean_up_deployment_data
from software.software_functions import run_remove_temporary_data_script
from software.release_state import ReleaseState
from software.utilities.deploy_set_failed import start_set_fail
from software.deploy_host_state import DeployHostState
from software.deploy_state import DeployState
from software.release_verify import verify_files
@ -996,6 +1000,7 @@ class PatchController(PatchService):
self.hosts = {}
self.controller_neighbours = {}
self.host_mgmt_ip = []
self.db_api_instance = get_instance()
@ -4385,6 +4390,42 @@ class PatchController(PatchService):
return None
def is_host_active_controller(self):
    """
    Tell whether this host currently acts as the active controller.

    The host is considered active when the management floating IP is one of
    the addresses configured on its management interface.

    :return: True if the floating IP is assigned to this host, False otherwise
             (including when initial configuration is not complete or the
             floating hostname cannot be resolved)
    """
    # Before the initial configuration completes there is nothing to check.
    if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
        return False

    floating_ip = utils.gethostbyname(constants.CONTROLLER_FLOATING_HOSTNAME)
    if not floating_ip:
        return False

    family = utils.get_management_family()
    iface = cfg.get_mgmt_iface()
    local_ips = utils.get_iface_ip(iface, family)

    # An empty (or missing) address list can never hold the floating IP.
    return bool(local_ips) and floating_ip in local_ips
def set_interruption_fail_state(self):
    """
    Flag an interrupted deployment as failed, depending on the current deploy state.

    Nothing is done unless this host is the active controller, the initial
    configuration is complete and get_software_upgrade() reports an upgrade.
    """
    upgrade_status = self.get_software_upgrade()

    if not self.is_host_active_controller():
        return
    if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
        return
    if not upgrade_status:
        return

    deploy_state = upgrade_status.get('state')

    if deploy_state == DEPLOY_STATES.HOST.value and not is_simplex():
        # In DX, while in deploy-host state, only the standby controller
        # (the peer of this active host) can be set to fail.
        if self.hostname == CONTROLLER_1_HOSTNAME:
            peer_hostname = CONTROLLER_0_HOSTNAME
        else:
            peer_hostname = CONTROLLER_1_HOSTNAME
        start_set_fail(True, peer_hostname)
    elif deploy_state in INTERRUPTION_RECOVERY_STATES:
        # The deployment was interrupted: the deployment state has to be
        # updated first, using this host.
        start_set_fail(True, self.hostname)
class PatchControllerApiThread(threading.Thread):
def __init__(self):
@ -4531,6 +4572,12 @@ class PatchControllerMainThread(threading.Thread):
sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False')
LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors)
LOG.info("software-controller-daemon is starting")
LOG.info("%s is active controller: %s", sc.hostname, sc.is_host_active_controller())
sc.set_interruption_fail_state()
try:
if sc.pre_bootstrap and cfg.get_mgmt_ip():
sc.pre_bootstrap = False

View File

@ -149,3 +149,13 @@ VALID_HOST_DEPLOY_STATE = [
DEPLOY_HOST_STATES.ROLLBACK_FAILED,
DEPLOY_HOST_STATES.ROLLBACK_PENDING,
]
# Deploy states (stored as their string values) that are recoverable after
# an interruption: only when the deployment is in one of these states is it
# set to failed once the interruption is detected.
INTERRUPTION_RECOVERY_STATES = [
    state.value
    for state in (
        DEPLOY_STATES.START,
        DEPLOY_STATES.HOST,
        DEPLOY_STATES.HOST_ROLLBACK,
        DEPLOY_STATES.ACTIVATE,
        DEPLOY_STATES.ACTIVATE_ROLLBACK,
    )
]

View File

@ -13,6 +13,7 @@ from netaddr import IPAddress
import os
from oslo_config import cfg as oslo_cfg
from packaging import version
import psutil
import re
import shutil
import socket
@ -114,7 +115,8 @@ def get_component_and_versions(release_name):
match = pattern.match(release_name)
if match:
component = match.group(2) or None
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}" if match.group(5) else ".0")
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}"
if match.group(5) else ".0")
software_version = f"{match.group(3)}.{match.group(4)}"
patch_version = match.group(5) or '0'
return component, release_version, software_version, patch_version
@ -274,7 +276,8 @@ def save_temp_file(file_item, temp_dir=constants.SCRATCH_DIR):
LOG.error("Not enough space to save file %s in %s \n \
Available %s bytes. File size %s", file_name, temp_dir, avail_space, file_size)
except Exception:
msg = "Failed to get file size in bytes for {} or disk space for {}".format(file_item, temp_dir)
msg = "Failed to get file size in bytes for {} or disk space for {}".format(
file_item, temp_dir)
LOG.exception(msg)
raise Exception(msg)
@ -519,3 +522,42 @@ def find_file_by_regex(dir_path, pattern):
except Exception:
LOG.error("Can't find files by regex pattern in directory %s." % dir_path)
return []
def get_iface_ip(iface_name: str, ip_family: int = socket.AF_INET) -> list[str]:
    """Get IP addresses for a network interface filtered by address family.

    Addresses reported under labelled secondary entries of the interface
    (named ``<iface_name>:<label>``, e.g. ``enp0s8:2``) are included.
    Interfaces that merely share a name prefix with ``iface_name``
    (e.g. ``enp0s80`` when querying ``enp0s8``) are not.

    :param iface_name: Name of the network interface to query
    :param ip_family: Address family to filter by (socket.AF_INET or socket.AF_INET6)
    :return: List of IP addresses matching the specified family; an empty
             list when the interface is not found or the lookup fails
    :raises ValueError: if iface_name is empty or not a string
    :raises TypeError: if ip_family is neither socket.AF_INET nor socket.AF_INET6
    """
    # Input validation
    if not iface_name or not isinstance(iface_name, str):
        raise ValueError("Interface name must be a non-empty string")

    if ip_family not in (socket.AF_INET, socket.AF_INET6):
        raise TypeError(f"Invalid address family: {ip_family}")

    try:
        # Mapping of interface name -> list of address entries
        interface_addresses = psutil.net_if_addrs()

        # Return early if interface not found
        if iface_name not in interface_addresses:
            LOG.error("Interface %s not found", iface_name)
            return []

        # Secondary IP config e.g. enp0s8:2 needs to be handled accordingly.
        # Match on "<iface>:" rather than on a bare prefix, otherwise an
        # unrelated interface such as enp0s80 would be picked up as well.
        alias_prefix = iface_name + ":"
        return [
            addr.address
            for name, addrs in interface_addresses.items()
            if name == iface_name or name.startswith(alias_prefix)
            for addr in addrs
            if addr.family == ip_family
        ]

    except Exception as e:
        # Best effort: callers treat an empty list as "no address found".
        LOG.error("Error getting IP for interface %s: %s", iface_name, e)
        return []