Raise deploy state out of sync alarm
This commit raises the deploy state out-of-sync alarm when the deploy state in the software.json files on the two controllers differs. The deploy state is checked every 30 seconds during the deploying stage. If the states are in sync, the alarm is cleared. Depends-on: https://review.opendev.org/c/starlingx/fault/+/913581 Test Plan: PASS: the alarm is raised when the state is out of sync in both DX and SX PASS: the alarm is cleared when the state is in sync in both DX and SX Task: 49737 Story: 2010676 Change-Id: Ic31c7166135d03591fa4696445783895254dfc95 Signed-off-by: junfeng-li <junfeng.li@windriver.com>
This commit is contained in:
parent
4b433c3d46
commit
979bd27d90
@ -53,6 +53,7 @@
|
||||
nodeset: debian-bullseye
|
||||
required-projects:
|
||||
- starlingx/config
|
||||
- starlingx/fault
|
||||
files:
|
||||
- software/*
|
||||
vars:
|
||||
@ -66,6 +67,7 @@
|
||||
nodeset: debian-bullseye
|
||||
required-projects:
|
||||
- starlingx/config
|
||||
- starlingx/fault
|
||||
files:
|
||||
- software/*
|
||||
vars:
|
||||
|
@ -9,6 +9,7 @@ Build-Depends: debhelper-compat (= 13),
|
||||
python3-wheel,
|
||||
build-info-dev
|
||||
Build-Depends-Indep:
|
||||
python3-fm-api,
|
||||
python3-keystonemiddleware,
|
||||
python3-oslo.config
|
||||
Standards-Version: 4.5.1
|
||||
@ -20,7 +21,8 @@ Architecture: all
|
||||
Depends: ${python3:Depends},
|
||||
${misc:Depends},
|
||||
gir1.2-ostree-1.0,
|
||||
python3-argcomplete
|
||||
python3-argcomplete,
|
||||
python3-fm-api,
|
||||
Description: StarlingX unified software deployment and management
|
||||
StarlingX unified software deployment and management.
|
||||
|
||||
|
@ -179,6 +179,12 @@ WORKER_SUMMARY_DIR = "%s/summary" % SOFTWARE_STORAGE_DIR
|
||||
WORKER_DATETIME_FORMAT = "%Y%m%dT%H%M%S%f"
|
||||
UNKNOWN_SOFTWARE_VERSION = "0.0.0"
|
||||
|
||||
LAST_IN_SYNC = "last_in_sync"
|
||||
|
||||
SYSTEM_MODE_SIMPLEX = "simplex"
|
||||
SYSTEM_MODE_DUPLEX = "duplex"
|
||||
|
||||
|
||||
|
||||
class DEPLOY_STATES(Enum):
|
||||
ACTIVATE = 'activate'
|
||||
|
@ -23,6 +23,10 @@ import threading
|
||||
import time
|
||||
from wsgiref import simple_server
|
||||
|
||||
from fm_api import fm_api
|
||||
from fm_api import constants as fm_constants
|
||||
|
||||
|
||||
from oslo_config import cfg as oslo_cfg
|
||||
|
||||
import software.apt_utils as apt_utils
|
||||
@ -63,6 +67,8 @@ from software.software_functions import LOG
|
||||
from software.software_functions import audit_log_info
|
||||
from software.software_functions import repo_root_dir
|
||||
from software.software_functions import ReleaseData
|
||||
from software.software_functions import is_deploy_state_in_sync
|
||||
from software.software_functions import is_deployment_in_progress
|
||||
from software.release_verify import verify_files
|
||||
import software.config as cfg
|
||||
import software.utils as utils
|
||||
@ -786,6 +792,11 @@ class PatchController(PatchService):
|
||||
self.check_patch_states()
|
||||
self.base_pkgdata = BasePackageData()
|
||||
|
||||
# Alarm cache: remembers whether the deploy state was in sync at the last check
|
||||
self.usm_alarm = {constants.LAST_IN_SYNC: False}
|
||||
self.hostname = socket.gethostname()
|
||||
self.fm_api = fm_api.FaultAPIs()
|
||||
|
||||
self.allow_insvc_patching = True
|
||||
|
||||
if os.path.exists(app_dependency_filename):
|
||||
@ -802,6 +813,14 @@ class PatchController(PatchService):
|
||||
else:
|
||||
self.write_state_file()
|
||||
|
||||
system_mode = utils.get_platform_conf("system_mode")
|
||||
if system_mode == constants.SYSTEM_MODE_SIMPLEX:
|
||||
self.standby_controller = "controller-0"
|
||||
elif system_mode == constants.SYSTEM_MODE_DUPLEX:
|
||||
self.standby_controller = "controller-0" \
|
||||
if self.hostname == "controller-1" \
|
||||
else "controller-1"
|
||||
|
||||
@property
|
||||
def release_collection(self):
|
||||
# for this stage, the SWReleaseCollection behaves as a broker which
|
||||
@ -3062,6 +3081,65 @@ class PatchController(PatchService):
|
||||
func(*args, **kwargs)
|
||||
self._update_state_to_peer()
|
||||
|
||||
def handle_deploy_state_sync(self, alarm_instance_id):
    """
    Handle the deploy state sync.

    The current result of is_deploy_state_in_sync() is compared with the
    value cached in self.usm_alarm from the previous check. Nothing is
    done when it has not changed. Otherwise the out of sync alarm is
    looked up: if it exists and the state is in sync the alarm is cleared,
    if it does not exist and the state is out of sync the alarm is raised.
    The cache is updated after either action.

    Any exception raised while talking to the fault manager is logged and
    swallowed so the caller's loop keeps running.

    :param alarm_instance_id: entity instance id the alarm is raised
        against (the caller builds it as "<host entity type>=<hostname>")
    """
    is_in_sync = is_deploy_state_in_sync()

    # Deploy in sync state is not changed, no need to update the alarm
    if is_in_sync == self.usm_alarm.get(constants.LAST_IN_SYNC):
        return

    alarm_id = fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC

    try:
        # Use this instance's FM client rather than the module-level
        # controller singleton, like every other FM call in this method.
        out_of_sync_alarm_fault = self.fm_api.get_fault(
            alarm_id, alarm_instance_id)

        LOG.info("software.json in sync: %s", is_in_sync)

        if out_of_sync_alarm_fault and is_in_sync:
            # There was an out of sync alarm raised, but local software.json
            # is in sync, we clear the alarm
            LOG.info("Clearing alarm: %s ", out_of_sync_alarm_fault.alarm_id)
            self.fm_api.clear_fault(alarm_id, alarm_instance_id)

            # Deploy in sync state is changed, update the cache
            self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync

        elif (not out_of_sync_alarm_fault) and (not is_in_sync):
            # There was no out of sync alarm raised, but local software.json
            # is not in sync, we raise the alarm
            LOG.info("Raising alarm: %s ", alarm_id)
            out_of_sync_fault = fm_api.Fault(
                alarm_id=alarm_id,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
                entity_instance_id=alarm_instance_id,
                severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
                reason_text="Software deployment in progress",
                alarm_type=fm_constants.FM_ALARM_TYPE_11,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
                proposed_repair_action="Wait for deployment to complete",
                service_affecting=False
            )

            self.fm_api.set_fault(out_of_sync_fault)

            # Deploy in sync state is changed, update the cache
            self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync

        else:
            # Reached when the cached value disagrees with the alarm lookup
            # (lookup found an alarm while out of sync, or found none while
            # in sync). The cache is left unchanged, so the same check is
            # repeated on the next call.
            LOG.error("Unexpected case in handling deploy state sync. ")

    except Exception as ex:
        # Pass the argument to the logger instead of pre-formatting it.
        LOG.exception("Failed in handling deploy state sync. Error: %s", str(ex))
|
||||
|
||||
def _get_software_upgrade(self):
|
||||
"""
|
||||
Get the current software upgrade from/to versions and state
|
||||
@ -3244,7 +3322,15 @@ class PatchControllerMainThread(threading.Thread):
|
||||
# We can only use one interval
|
||||
SEND_MSG_INTERVAL_IN_SECONDS = 30.0
|
||||
|
||||
alarm_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
|
||||
sc.standby_controller)
|
||||
|
||||
try:
|
||||
# Update the out of sync alarm cache when the thread starts
|
||||
out_of_sync_alarm_fault = sc.fm_api.get_fault(
|
||||
fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, alarm_instance_id)
|
||||
sc.usm_alarm[constants.LAST_IN_SYNC] = not out_of_sync_alarm_fault
|
||||
|
||||
sock_in = sc.setup_socket()
|
||||
|
||||
while sock_in is None:
|
||||
@ -3445,11 +3531,12 @@ class PatchControllerMainThread(threading.Thread):
|
||||
SEND_MSG_INTERVAL_IN_SECONDS)
|
||||
|
||||
# Only send the deploy state update from the active controller
|
||||
if utils.is_active_controller():
|
||||
if is_deployment_in_progress(sc.release_data.metadata) and utils.is_active_controller():
|
||||
try:
|
||||
sc.socket_lock.acquire()
|
||||
deploy_state_update = SoftwareMessageDeployStateUpdate()
|
||||
deploy_state_update.send(sc.sock_out)
|
||||
sc.handle_deploy_state_sync(alarm_instance_id)
|
||||
except Exception as e:
|
||||
LOG.exception("Failed to send deploy state update. Error: %s", str(e))
|
||||
finally:
|
||||
|
@ -1287,3 +1287,39 @@ def parse_release_metadata(filename):
|
||||
continue
|
||||
data[child.tag] = child.text
|
||||
return data
|
||||
|
||||
|
||||
def is_deploy_state_in_sync():
    """
    Tell whether the working and the synced software.json agree.

    Both files must exist; their "deploy" and "deploy_host" sections
    (an absent section counts as an empty dict) are then compared.

    :return: bool true if in sync, false otherwise
    """
    json_paths = (constants.SOFTWARE_JSON_FILE,
                  constants.SYNCED_SOFTWARE_JSON_FILE)

    # Either file missing means there is nothing to compare against.
    if not all(os.path.isfile(path) for path in json_paths):
        return False

    working = utils.load_from_json_file(constants.SOFTWARE_JSON_FILE)
    synced = utils.load_from_json_file(constants.SYNCED_SOFTWARE_JSON_FILE)

    for section in ("deploy", "deploy_host"):
        if working.get(section, {}) != synced.get(section, {}):
            return False
    return True
|
||||
|
||||
|
||||
def is_deployment_in_progress(release_metadata):
    """
    Report whether any release is currently in the deploying state.

    :param release_metadata: dict of release metadata; each value is
        indexed with the 'state' key
    :return: bool true if in progress, false otherwise
    """
    for release in release_metadata.values():
        # Stop at the first release found in the deploying state.
        if release['state'] == constants.DEPLOYING:
            return True
    return False
|
||||
|
4
software/software/tests/base.py
Normal file
4
software/software/tests/base.py
Normal file
@ -0,0 +1,4 @@
|
||||
import sys
|
||||
from unittest import mock
|
||||
|
||||
sys.modules['fm_core'] = mock.Mock()
|
@ -3,6 +3,10 @@
|
||||
#
|
||||
# Copyright (c) 2023-2024 Wind River Systems, Inc.
|
||||
#
|
||||
|
||||
# This import has to be first
|
||||
from software.tests import base # pylint: disable=unused-import
|
||||
|
||||
from software.software_controller import PatchController
|
||||
from software.software_controller import ReleaseValidationFailure
|
||||
import unittest
|
||||
@ -144,8 +148,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
@patch('software.software_controller.os.path.isfile')
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_in_sync_controller_api_files_identical(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load,
|
||||
mock_isfile):
|
||||
controller = PatchController()
|
||||
@ -159,8 +167,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
@patch('software.software_controller.os.path.isfile')
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_in_sync_controller_api_files_not_identical(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load,
|
||||
mock_isfile):
|
||||
controller = PatchController()
|
||||
@ -174,8 +186,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
@patch('software.software_controller.os.path.isfile')
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_in_sync_controller_api_files_not_exist(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
mock_isfile):
|
||||
controller = PatchController()
|
||||
@ -188,8 +204,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
@patch('software.software_controller.os.path.isfile')
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_in_sync_controller_api_one_file_exist(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
mock_isfile):
|
||||
controller = PatchController()
|
||||
@ -204,8 +224,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_software_host_upgrade_deployed(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
controller = PatchController()
|
||||
@ -229,8 +253,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_software_host_upgrade_deploying(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
controller = PatchController()
|
||||
@ -254,8 +282,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_all_software_host_upgrade_deploying(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
controller = PatchController()
|
||||
@ -284,22 +316,31 @@ class TestSoftwareController(unittest.TestCase):
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_software_host_upgrade_none_state(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
controller = PatchController()
|
||||
|
||||
# Test when the deploy or deploy_hosts is None
|
||||
controller._get_software_upgrade = MagicMock(return_value=None) # pylint: disable=protected-access
|
||||
controller._get_software_upgrade = MagicMock( # pylint: disable=protected-access
|
||||
return_value=None)
|
||||
controller.db_api_instance.get_deploy_host.return_value = None
|
||||
result = controller.get_one_software_host_upgrade("host1")
|
||||
self.assertIsNone(result)
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_software_upgrade_get_deploy_all(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
|
||||
@ -329,8 +370,12 @@ class TestSoftwareController(unittest.TestCase):
|
||||
|
||||
@patch('software.software_controller.json.load')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
|
||||
@patch('software.software_controller.open', new_callable=mock_open)
|
||||
def test_get_software_upgrade_get_deploy_all_none(self,
|
||||
mock_dummy_open_config, # pylint: disable=unused-argument
|
||||
mock_dummy, # pylint: disable=unused-argument
|
||||
mock_dummy_open, # pylint: disable=unused-argument
|
||||
mock_json_load, # pylint: disable=unused-argument
|
||||
):
|
||||
|
||||
|
@ -22,6 +22,7 @@ import webob
|
||||
import software.constants as constants
|
||||
from software.exceptions import StateValidationFailure
|
||||
from software.exceptions import SoftwareServiceError
|
||||
from tsconfig.tsconfig import PLATFORM_CONF_FILE
|
||||
|
||||
|
||||
LOG = logging.getLogger('main_logger')
|
||||
@ -439,3 +440,22 @@ def is_active_controller():
|
||||
|
||||
keyring_file = f"/opt/platform/.keyring/{constants.SW_VERSION}/.CREDENTIAL"
|
||||
return os.path.exists(keyring_file)
|
||||
|
||||
|
||||
def get_platform_conf(key, conf_file=None):
    """
    Get the value of given key in platform.conf

    Only lines of the form ``<name>=<value>`` whose name equals ``key``
    exactly are considered, so a key that merely appears inside a comment,
    inside another key's name or inside a value is not picked up. The
    value is everything after the first ``=`` with surrounding whitespace
    removed.

    :param key: key to get
    :param conf_file: path of the file to read; PLATFORM_CONF_FILE when
        not given
    :return: value of the first matching entry, or None if the key is
        not present
    """
    if conf_file is None:
        conf_file = PLATFORM_CONF_FILE

    with open(conf_file) as fp:
        for line in fp:
            # partition keeps any further '=' characters inside the value
            name, sep, value = line.partition('=')
            if sep and name.strip() == key:
                return value.strip()

    return None
|
||||
|
@ -16,10 +16,11 @@ allowlist_externals = find
|
||||
basepython = python3
|
||||
deps = -r{toxinidir}/requirements.txt
|
||||
-r{toxinidir}/test-requirements.txt
|
||||
-e{[tox]stxdir}/fault/fm-api/source
|
||||
-e{[tox]stxdir}/config/tsconfig/tsconfig
|
||||
|
||||
install_command = pip install \
|
||||
-c{env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \
|
||||
install_command = pip install -v -v -v \
|
||||
-c {env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \
|
||||
{opts} {packages}
|
||||
passenv =
|
||||
XDG_CACHE_HOME
|
||||
|
Loading…
x
Reference in New Issue
Block a user