From 2f8effc9290d1943aa05cc4c57728e9e67f33557 Mon Sep 17 00:00:00 2001 From: junfeng-li Date: Mon, 24 Mar 2025 23:52:33 +0000 Subject: [PATCH] Handle deployment interruption This commit is an enhancement for handling interruptions during the USM upgrade. When the USM upgrade is in the deploy-start, deploy-host, activate or activate-rollback stage and an interruption occurs, such as a host reboot, the deploy state will be set to failed for the interrupted stage. Once the state is set to failed, the USM upgrade can be retried. Test Plan: PASS: build and deploy iso PASS: SX upgrade with deploy start interruption PASS: DX upgrade with deploy start interruption Task: 2011357 Story: 51849 Change-Id: I37341d9be5c17d1da7161e08c7b46fd86f28f589 Signed-off-by: junfeng-li --- software/requirements.txt | 1 + software/software/software_controller.py | 49 +++++++++++++++++++++++- software/software/states.py | 10 +++++ software/software/utils.py | 46 +++++++++++++++++++++- 4 files changed, 103 insertions(+), 3 deletions(-) diff --git a/software/requirements.txt b/software/requirements.txt index beede63e..6537d782 100644 --- a/software/requirements.txt +++ b/software/requirements.txt @@ -8,6 +8,7 @@ oslo.policy oslo.serialization netaddr pecan +psutil psycopg2-binary pycryptodomex PyGObject diff --git a/software/software/software_controller.py b/software/software/software_controller.py index f73bf63c..62057890 100644 --- a/software/software/software_controller.py +++ b/software/software/software_controller.py @@ -35,9 +35,12 @@ import software.apt_utils as apt_utils import software.ostree_utils as ostree_utils from software.api import app from software.authapi import app as auth_app +from software.constants import CONTROLLER_0_HOSTNAME +from software.constants import CONTROLLER_1_HOSTNAME from software.constants import INSTALL_LOCAL_FLAG -from software.states import DEPLOY_STATES from software.states import DEPLOY_HOST_STATES +from software.states import DEPLOY_STATES +from 
software.states import INTERRUPTION_RECOVERY_STATES from software.base import PatchService from software.dc_utils import get_subcloud_groupby_version from software.deploy_state import require_deploy_state @@ -88,6 +91,7 @@ from software.software_functions import get_release_from_patch from software.software_functions import clean_up_deployment_data from software.software_functions import run_remove_temporary_data_script from software.release_state import ReleaseState +from software.utilities.deploy_set_failed import start_set_fail from software.deploy_host_state import DeployHostState from software.deploy_state import DeployState from software.release_verify import verify_files @@ -996,6 +1000,7 @@ class PatchController(PatchService): self.hosts = {} self.controller_neighbours = {} + self.host_mgmt_ip = [] self.db_api_instance = get_instance() @@ -4385,6 +4390,42 @@ class PatchController(PatchService): return None + def is_host_active_controller(self): + """ + Check if current host is active controller by checking if floating ip is assigned + to the host + :return: True if it is active controller, False otherwise + """ + if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG): + return False + + floating_mgmt_ip = utils.gethostbyname(constants.CONTROLLER_FLOATING_HOSTNAME) + if not floating_mgmt_ip: + return False + + ip_family = utils.get_management_family() + mgmt_iface = cfg.get_mgmt_iface() + + host_mgmt_ip_list = utils.get_iface_ip(mgmt_iface, ip_family) + return floating_mgmt_ip in host_mgmt_ip_list if host_mgmt_ip_list else False + + def set_interruption_fail_state(self): + """ + Set the host failed state after an interruption based on current deployment state + """ + upgrade_status = self.get_software_upgrade() + if self.is_host_active_controller() and os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG) and upgrade_status: + + if upgrade_status.get('state') == DEPLOY_STATES.HOST.value and not is_simplex(): + to_fail_hostname = CONTROLLER_0_HOSTNAME if self.hostname == 
CONTROLLER_1_HOSTNAME else \ + CONTROLLER_1_HOSTNAME + # In DX, when it is in deploy-host state, we can only set the standby controller to fail + start_set_fail(True, to_fail_hostname) + + elif upgrade_status.get('state') in INTERRUPTION_RECOVERY_STATES: + # The deployment was interrupted. We need to update the deployment state first + start_set_fail(True, self.hostname) + class PatchControllerApiThread(threading.Thread): def __init__(self): @@ -4531,6 +4572,12 @@ class PatchControllerMainThread(threading.Thread): sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False') LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors) + LOG.info("software-controller-daemon is starting") + + LOG.info("%s is active controller: %s", sc.hostname, sc.is_host_active_controller()) + + sc.set_interruption_fail_state() + try: if sc.pre_bootstrap and cfg.get_mgmt_ip(): sc.pre_bootstrap = False diff --git a/software/software/states.py b/software/software/states.py index fc85c320..0032d31f 100644 --- a/software/software/states.py +++ b/software/software/states.py @@ -149,3 +149,13 @@ VALID_HOST_DEPLOY_STATE = [ DEPLOY_HOST_STATES.ROLLBACK_FAILED, DEPLOY_HOST_STATES.ROLLBACK_PENDING, ] + +# Only in these states, the state will be +# set to failed after interruption +INTERRUPTION_RECOVERY_STATES = [ + DEPLOY_STATES.START.value, + DEPLOY_STATES.HOST.value, + DEPLOY_STATES.HOST_ROLLBACK.value, + DEPLOY_STATES.ACTIVATE.value, + DEPLOY_STATES.ACTIVATE_ROLLBACK.value, +] diff --git a/software/software/utils.py b/software/software/utils.py index c6f864ad..4d01fd05 100644 --- a/software/software/utils.py +++ b/software/software/utils.py @@ -13,6 +13,7 @@ from netaddr import IPAddress import os from oslo_config import cfg as oslo_cfg from packaging import version +import psutil import re import shutil import socket @@ -114,7 +115,8 @@ def get_component_and_versions(release_name): match = pattern.match(release_name) if match: component = match.group(2) or None - release_version = 
f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}" if match.group(5) else ".0") + release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}" + if match.group(5) else ".0") software_version = f"{match.group(3)}.{match.group(4)}" patch_version = match.group(5) or '0' return component, release_version, software_version, patch_version @@ -274,7 +276,8 @@ def save_temp_file(file_item, temp_dir=constants.SCRATCH_DIR): LOG.error("Not enough space to save file %s in %s \n \ Available %s bytes. File size %s", file_name, temp_dir, avail_space, file_size) except Exception: - msg = "Failed to get file size in bytes for {} or disk space for {}".format(file_item, temp_dir) + msg = "Failed to get file size in bytes for {} or disk space for {}".format( + file_item, temp_dir) LOG.exception(msg) raise Exception(msg) @@ -519,3 +522,42 @@ def find_file_by_regex(dir_path, pattern): except Exception: LOG.error("Can't find files by regex pattern in directory %s." % dir_path) return [] + + +def get_iface_ip(iface_name: str, ip_family: int = socket.AF_INET) -> list[str]: + """Get IP addresses for a network interface filtered by address family. + + :param iface_name: Name of the network interface to query + :param ip_family: Address family to filter by (socket.AF_INET or socket.AF_INET6) + + return: List of IP addresses matching the specified family + """ + # Input validation + if not iface_name or not isinstance(iface_name, str): + raise ValueError("Interface name must be a non-empty string") + + if ip_family not in (socket.AF_INET, socket.AF_INET6): + raise TypeError(f"Invalid address family: {ip_family}") + + try: + # Get network interface addresses + interface_addresses = psutil.net_if_addrs() + + # Return early if interface not found + if iface_name not in interface_addresses: + LOG.error("Interface %s not found", iface_name) + return [] + + # Filter interfaces and collect IP addresses in one pass + # Secondary IP config e.g. 
enp0s8:2 needs to be handled accordingly + return [ + addr.address + for name, addrs in interface_addresses.items() + if name.startswith(iface_name) + for addr in addrs + if addr.family == ip_family + ] + + except Exception as e: + LOG.error("Error getting IP for interface %s: %s", iface_name, str(e)) + return []