Handle deployment interruption
This commit improves the handling of interruptions during a USM upgrade.

If an interruption, such as a host reboot, occurs while the USM upgrade is in the deploy-start, deploy-host, activate or activate-rollback stage, the deploy state is set to the failed state of the stage that was in progress. Once the deployment is in the failed state, the USM upgrade can be retried.

Test Plan:
PASS: build and deploy iso
PASS: SX upgrade with deploy start interruption
PASS: DX upgrade with deploy start interruption

Task: 2011357
Story: 51849
Change-Id: I37341d9be5c17d1da7161e08c7b46fd86f28f589
Signed-off-by: junfeng-li <junfeng.li@windriver.com>
This commit is contained in:
parent
a918790698
commit
2f8effc929
@ -8,6 +8,7 @@ oslo.policy
|
||||
oslo.serialization
|
||||
netaddr
|
||||
pecan
|
||||
psutil
|
||||
psycopg2-binary
|
||||
pycryptodomex
|
||||
PyGObject
|
||||
|
@ -35,9 +35,12 @@ import software.apt_utils as apt_utils
|
||||
import software.ostree_utils as ostree_utils
|
||||
from software.api import app
|
||||
from software.authapi import app as auth_app
|
||||
from software.constants import CONTROLLER_0_HOSTNAME
|
||||
from software.constants import CONTROLLER_1_HOSTNAME
|
||||
from software.constants import INSTALL_LOCAL_FLAG
|
||||
from software.states import DEPLOY_STATES
|
||||
from software.states import DEPLOY_HOST_STATES
|
||||
from software.states import DEPLOY_STATES
|
||||
from software.states import INTERRUPTION_RECOVERY_STATES
|
||||
from software.base import PatchService
|
||||
from software.dc_utils import get_subcloud_groupby_version
|
||||
from software.deploy_state import require_deploy_state
|
||||
@ -88,6 +91,7 @@ from software.software_functions import get_release_from_patch
|
||||
from software.software_functions import clean_up_deployment_data
|
||||
from software.software_functions import run_remove_temporary_data_script
|
||||
from software.release_state import ReleaseState
|
||||
from software.utilities.deploy_set_failed import start_set_fail
|
||||
from software.deploy_host_state import DeployHostState
|
||||
from software.deploy_state import DeployState
|
||||
from software.release_verify import verify_files
|
||||
@ -996,6 +1000,7 @@ class PatchController(PatchService):
|
||||
|
||||
self.hosts = {}
|
||||
self.controller_neighbours = {}
|
||||
self.host_mgmt_ip = []
|
||||
|
||||
self.db_api_instance = get_instance()
|
||||
|
||||
@ -4385,6 +4390,42 @@ class PatchController(PatchService):
|
||||
|
||||
return None
|
||||
|
||||
def is_host_active_controller(self):
    """
    Tell whether this host is currently the active controller.

    The host is treated as active when the address resolved for
    constants.CONTROLLER_FLOATING_HOSTNAME is one of the addresses
    reported for the host's management interface. Before the initial
    configuration is complete (INITIAL_CONFIG_COMPLETE_FLAG is absent)
    the answer is always False.

    :return: True if it is active controller, False otherwise
    """
    # No floating address is looked up before initial configuration is done.
    if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
        return False

    floating_ip = utils.gethostbyname(constants.CONTROLLER_FLOATING_HOSTNAME)
    if not floating_ip:
        return False

    family = utils.get_management_family()
    iface = cfg.get_mgmt_iface()
    local_ips = utils.get_iface_ip(iface, family)

    # An empty/None address list means the host cannot own the floating IP.
    if not local_ips:
        return False
    return floating_ip in local_ips
|
||||
|
||||
def set_interruption_fail_state(self):
    """
    Flag the deployment as failed when it was interrupted mid-stage.

    Nothing is done unless this host is the active controller, the
    INITIAL_CONFIG_COMPLETE_FLAG file exists and get_software_upgrade()
    reports a deployment. The host handed to start_set_fail() depends on
    the current deploy state:

    * deploy-host state on a non-simplex system: the other controller
    * any other state in INTERRUPTION_RECOVERY_STATES: this host
    """
    upgrade_status = self.get_software_upgrade()

    # Guards are evaluated in the same order as the original compound check.
    if not self.is_host_active_controller():
        return
    # NOTE: is_host_active_controller() already returns False when this
    # flag is missing; the check is kept as a defensive repeat.
    if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
        return
    if not upgrade_status:
        return

    deploy_state = upgrade_status.get('state')

    if deploy_state == DEPLOY_STATES.HOST.value and not is_simplex():
        # In DX, when it is in deploy-host state, we can only set the
        # standby controller to fail, i.e. the peer of this host.
        if self.hostname == CONTROLLER_1_HOSTNAME:
            peer_hostname = CONTROLLER_0_HOSTNAME
        else:
            peer_hostname = CONTROLLER_1_HOSTNAME
        start_set_fail(True, peer_hostname)
    elif deploy_state in INTERRUPTION_RECOVERY_STATES:
        # The deployment was interrupted. We need to update the
        # deployment state first.
        start_set_fail(True, self.hostname)
|
||||
|
||||
|
||||
class PatchControllerApiThread(threading.Thread):
|
||||
def __init__(self):
|
||||
@ -4531,6 +4572,12 @@ class PatchControllerMainThread(threading.Thread):
|
||||
sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False')
|
||||
LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors)
|
||||
|
||||
LOG.info("software-controller-daemon is starting")
|
||||
|
||||
LOG.info("%s is active controller: %s", sc.hostname, sc.is_host_active_controller())
|
||||
|
||||
sc.set_interruption_fail_state()
|
||||
|
||||
try:
|
||||
if sc.pre_bootstrap and cfg.get_mgmt_ip():
|
||||
sc.pre_bootstrap = False
|
||||
|
@ -149,3 +149,13 @@ VALID_HOST_DEPLOY_STATE = [
|
||||
DEPLOY_HOST_STATES.ROLLBACK_FAILED,
|
||||
DEPLOY_HOST_STATES.ROLLBACK_PENDING,
|
||||
]
|
||||
|
||||
# Deploy states that are moved to a failed state after an interruption.
# A deployment found in any other state is left as it is.
INTERRUPTION_RECOVERY_STATES = [
    state.value
    for state in (
        DEPLOY_STATES.START,
        DEPLOY_STATES.HOST,
        DEPLOY_STATES.HOST_ROLLBACK,
        DEPLOY_STATES.ACTIVATE,
        DEPLOY_STATES.ACTIVATE_ROLLBACK,
    )
]
|
||||
|
@ -13,6 +13,7 @@ from netaddr import IPAddress
|
||||
import os
|
||||
from oslo_config import cfg as oslo_cfg
|
||||
from packaging import version
|
||||
import psutil
|
||||
import re
|
||||
import shutil
|
||||
import socket
|
||||
@ -114,7 +115,8 @@ def get_component_and_versions(release_name):
|
||||
match = pattern.match(release_name)
|
||||
if match:
|
||||
component = match.group(2) or None
|
||||
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}" if match.group(5) else ".0")
|
||||
release_version = f"{match.group(3)}.{match.group(4)}" + (f".{match.group(5)}"
|
||||
if match.group(5) else ".0")
|
||||
software_version = f"{match.group(3)}.{match.group(4)}"
|
||||
patch_version = match.group(5) or '0'
|
||||
return component, release_version, software_version, patch_version
|
||||
@ -274,7 +276,8 @@ def save_temp_file(file_item, temp_dir=constants.SCRATCH_DIR):
|
||||
LOG.error("Not enough space to save file %s in %s \n \
|
||||
Available %s bytes. File size %s", file_name, temp_dir, avail_space, file_size)
|
||||
except Exception:
|
||||
msg = "Failed to get file size in bytes for {} or disk space for {}".format(file_item, temp_dir)
|
||||
msg = "Failed to get file size in bytes for {} or disk space for {}".format(
|
||||
file_item, temp_dir)
|
||||
LOG.exception(msg)
|
||||
raise Exception(msg)
|
||||
|
||||
@ -519,3 +522,42 @@ def find_file_by_regex(dir_path, pattern):
|
||||
except Exception:
|
||||
LOG.error("Can't find files by regex pattern in directory %s." % dir_path)
|
||||
return []
|
||||
|
||||
|
||||
def get_iface_ip(iface_name: str, ip_family: int = socket.AF_INET) -> list[str]:
    """Get IP addresses for a network interface filtered by address family.

    Addresses are collected from the interface itself and from its alias
    (secondary IP) entries, which are reported under names of the form
    ``<iface_name>:<label>``, e.g. ``enp0s8:2`` for ``enp0s8``. Other
    interfaces that merely share the same name prefix (``enp0s80``,
    ``enp0s8.100``) are not included.

    :param iface_name: Name of the network interface to query
    :param ip_family: Address family to filter by (socket.AF_INET or socket.AF_INET6)

    :return: List of IP addresses matching the specified family. The list
             is empty when the interface is not found or the lookup fails.
    :raises ValueError: if iface_name is empty or not a string
    :raises TypeError: if ip_family is neither socket.AF_INET nor socket.AF_INET6
    """
    # Input validation
    if not iface_name or not isinstance(iface_name, str):
        raise ValueError("Interface name must be a non-empty string")

    if ip_family not in (socket.AF_INET, socket.AF_INET6):
        raise TypeError(f"Invalid address family: {ip_family}")

    try:
        # Get network interface addresses
        interface_addresses = psutil.net_if_addrs()

        # Return early if interface not found
        if iface_name not in interface_addresses:
            LOG.error("Interface %s not found", iface_name)
            return []

        # Secondary IP config e.g. enp0s8:2 needs to be handled accordingly.
        # Match on "<iface>:" rather than a bare prefix so that unrelated
        # interfaces whose names start with the same characters are excluded.
        alias_prefix = iface_name + ":"
        return [
            addr.address
            for name, addrs in interface_addresses.items()
            if name == iface_name or name.startswith(alias_prefix)
            for addr in addrs
            if addr.family == ip_family
        ]

    except Exception as e:
        # Best effort: a failed lookup is logged and reported as "no addresses".
        LOG.error("Error getting IP for interface %s: %s", iface_name, str(e))
        return []
|
||||
|
Loading…
x
Reference in New Issue
Block a user