Improve DC VIM strategy create/apply error handling

This commit updates the subcloud's error_description with the error
returned by the software API during VIM strategy create and apply.

- Create two custom exceptions for handling these errors.
- Clear the error_description of the target subclouds on strategy creation.

Note: This also updates the default timeout values of the software API
client (show and delete operations).

Test Plan:
PASS - Apply a sw-deploy-strategy and force an error in the
deploy precheck command.
  - Apply should fail in the `create VIM strategy` state
  - dcmanager subcloud errors should be updated
PASS - Apply a sw-deploy-strategy and force an error in the
deploy start command.
  - Apply should fail in `apply VIM strategy` state
  - dcmanager subcloud errors should be updated
PASS - Create a dcmanager sw-deploy-strategy with subcloud errors.
  - Strategy created and subcloud errors should be `No errors present`.

Story: 2010676
Task: 50644

Change-Id: Ib0b0b586d90093088a6af96e5d630e3fe04fd3f7
Signed-off-by: Hugo Brito <hugo.brito@windriver.com>
This commit is contained in:
Hugo Brito 2024-07-22 20:26:27 -03:00
parent 30df79b36a
commit b3d206781b
9 changed files with 240 additions and 93 deletions

View File

@ -23,6 +23,8 @@ REMOVING = "removing"
UNAVAILABLE = "unavailable"
REST_DEFAULT_TIMEOUT = 900
REST_SHOW_TIMEOUT = 150
REST_DELETE_TIMEOUT = 300
class SoftwareClient(base.DriverBase):
@ -58,13 +60,13 @@ class SoftwareClient(base.DriverBase):
response = requests.get(url, headers=self.headers, timeout=timeout)
return self._handle_response(response, operation="List")
def show(self, release, timeout=REST_DEFAULT_TIMEOUT):
def show(self, release, timeout=REST_SHOW_TIMEOUT):
"""Show release"""
url = self.endpoint + f"/release/{release}"
response = requests.get(url, headers=self.headers, timeout=timeout)
return self._handle_response(response, operation="Show")
def delete(self, releases, timeout=REST_DEFAULT_TIMEOUT):
def delete(self, releases, timeout=REST_DELETE_TIMEOUT):
"""Delete release"""
release_str = "/".join(releases)
url = self.endpoint + f"/release/{release_str}"
@ -77,7 +79,7 @@ class SoftwareClient(base.DriverBase):
response = requests.post(url, headers=self.headers, timeout=timeout)
return self._handle_response(response, operation="Deploy precheck")
def deploy_delete(self, timeout=REST_DEFAULT_TIMEOUT):
def deploy_delete(self, timeout=REST_DELETE_TIMEOUT):
"""Deploy delete"""
url = self.endpoint + "/deploy"
response = requests.delete(url, headers=self.headers, timeout=timeout)

View File

@ -262,6 +262,20 @@ class SoftwarePreCheckFailedException(DCManagerException):
message = _("Subcloud %(subcloud)s software deploy precheck failed: %(details)s")
class CreateVIMStrategyFailedException(DCManagerException):
    """Error reported when creating a VIM strategy for a subcloud fails.

    The message template expects the keyword arguments ``subcloud``,
    ``name`` (the strategy name), ``state`` (the VIM strategy state) and
    ``details``.
    """

    message = _(
        "Subcloud %(subcloud)s create VIM %(name)s strategy failed. "
        "State: %(state)s Details: %(details)s"
    )
class ApplyVIMStrategyFailedException(DCManagerException):
    """Error reported when applying a VIM strategy on a subcloud fails.

    The message template expects the keyword arguments ``subcloud``,
    ``name`` (the strategy name), ``state`` (the VIM strategy state) and
    ``details``.
    """

    message = _(
        "Subcloud %(subcloud)s apply VIM %(name)s strategy failed. "
        "State: %(state)s Details: %(details)s"
    )
class SoftwareListFailedException(DCManagerException):
message = _("Subcloud %(subcloud)s software list failed: %(details)s")

View File

@ -6,7 +6,7 @@
import time
from dccommon.drivers.openstack import vim
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import exceptions
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
@ -64,7 +64,13 @@ class ApplyingVIMStrategyState(BaseState):
# Do not raise the default exception if there is no strategy
# because the default exception is unclear: ie: "Get strategy failed"
if subcloud_strategy is None:
raise Exception("(%s) VIM Strategy not found." % self.strategy_name)
message = "VIM Strategy not found."
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
# We have a VIM strategy, but need to check if it is ready to apply
elif subcloud_strategy.state == vim.STATE_READY_TO_APPLY:
@ -87,20 +93,20 @@ class ApplyingVIMStrategyState(BaseState):
vim.STATE_APPLY_FAILED,
vim.STATE_APPLY_TIMEOUT,
]:
# Explicit known failure states
raise Exception(
"(%s) VIM strategy apply failed. %s. %s"
% (
self.strategy_name,
subcloud_strategy.state,
subcloud_strategy.apply_phase.reason,
)
message = "VIM strategy apply failed: "
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message + subcloud_strategy.apply_phase.reason,
)
else:
# Other states are bad
raise Exception(
"(%s) VIM strategy apply failed. Unexpected State: %s."
% (self.strategy_name, subcloud_strategy.state)
message = "VIM strategy unexpected apply state."
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
# wait for new strategy to apply or the existing strategy to complete.
@ -117,14 +123,19 @@ class ApplyingVIMStrategyState(BaseState):
# which would allow the longer 60 second sleep to be broken into
# multiple smaller sleep calls
error_message = None
# If event handler stop has been triggered, fail the state
if self.stopped():
raise StrategyStoppedException()
raise exceptions.StrategyStoppedException()
# break out of the loop if the max number of attempts is reached
wait_count += 1
if wait_count >= self.wait_attempts:
raise Exception(
"Timeout applying (%s) vim strategy." % self.strategy_name
message = "Timeout applying vim strategy."
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
# every loop we wait, even the first one
time.sleep(self.wait_interval)
@ -143,9 +154,12 @@ class ApplyingVIMStrategyState(BaseState):
get_fail_count += 1
if get_fail_count >= self.max_failed_queries:
# We have waited too long.
raise Exception(
"Timeout during recovery of apply (%s) Vim strategy."
% self.strategy_name
message = "Timeout during recovery of VIM strategy."
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
self.debug_log(
strategy_step,
@ -156,8 +170,12 @@ class ApplyingVIMStrategyState(BaseState):
# If an external actor has deleted the strategy, the only option
# is to fail this state.
if subcloud_strategy is None:
raise Exception(
"(%s) VIM Strategy no longer exists." % self.strategy_name
message = "VIM Strategy no longer exists."
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
elif subcloud_strategy.state == vim.STATE_APPLYING:
@ -186,22 +204,26 @@ class ApplyingVIMStrategyState(BaseState):
vim.STATE_APPLY_FAILED,
vim.STATE_APPLY_TIMEOUT,
]:
# Explicit known failure states
raise Exception(
"(%s) Vim strategy apply failed. %s. %s"
% (
self.strategy_name,
subcloud_strategy.state,
subcloud_strategy.apply_phase.reason,
)
)
error_message = "VIM strategy apply failed: "
else:
# Other states are bad
raise Exception(
"(%s) Vim strategy apply failed. Unexpected State: %s."
% (self.strategy_name, subcloud_strategy.state)
error_message = "VIM strategy unexpected apply state."
if error_message:
apply_error = subcloud_strategy.apply_phase.response
# If response is None, use the reason
if not apply_error:
apply_error = subcloud_strategy.apply_phase.reason
db_api.subcloud_update(
self.context,
strategy_step.subcloud_id,
error_description=apply_error,
)
raise exceptions.ApplyVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=error_message + apply_error,
)
# end of loop
# Success, state machine can proceed to the next state
return self.next_state

View File

@ -8,8 +8,9 @@ import time
from dccommon.drivers.openstack import vim
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import exceptions
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
# Max time: 30 minutes = 180 queries x 10 seconds between
@ -59,8 +60,12 @@ class CreatingVIMStrategyState(BaseState):
# a successful API call to create MUST set the state be 'building'
if subcloud_strategy.state != vim.STATE_BUILDING:
raise Exception(
"Unexpected VIM strategy build state: %s" % subcloud_strategy.state
message = "Unexpected VIM strategy build state."
raise exceptions.CreateVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
return subcloud_strategy
@ -91,9 +96,9 @@ class CreatingVIMStrategyState(BaseState):
strategy_step,
"VIM strategy exists with state: %s" % subcloud_strategy.state,
)
# if a strategy exists in any type of failed state or aborted
# state it should be deleted.
# applied state should also be deleted from previous success runs.
# if a strategy exists in any type of failed state or aborted state it
# should be deleted. Applied state should also be deleted from previous
# success runs.
if subcloud_strategy.state in [
vim.STATE_BUILDING,
vim.STATE_APPLYING,
@ -101,12 +106,16 @@ class CreatingVIMStrategyState(BaseState):
]:
# Can't delete a strategy in these states
message = (
"Failed to create a VIM strategy for %s. "
"There already is an existing strategy in %s state"
% (region, subcloud_strategy.state)
"Failed to create a VIM strategy. There already is an existing "
"strategy in this state."
)
self.warn_log(strategy_step, message)
raise Exception(message)
raise exceptions.CreateVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=message,
)
# if strategy exists in any other type of state, delete and create
self.info_log(strategy_step, "Deleting existing VIM strategy")
@ -120,12 +129,17 @@ class CreatingVIMStrategyState(BaseState):
# Loop until the strategy is done building Repeatedly query the API
counter = 0
while True:
error_message = None
# If event handler stop has been triggered, fail the state
if self.stopped():
raise StrategyStoppedException()
raise exceptions.StrategyStoppedException()
if counter >= self.max_queries:
raise Exception(
"Timeout building vim strategy. state: %s" % subcloud_strategy.state
details = "Timeout building VIM strategy."
raise exceptions.CreateVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=details,
)
counter += 1
time.sleep(self.sleep_duration)
@ -151,17 +165,26 @@ class CreatingVIMStrategyState(BaseState):
# This is the expected state while creating the strategy
pass
elif subcloud_strategy.state == vim.STATE_BUILD_FAILED:
raise Exception(
"VIM strategy build failed: %s. %s."
% (subcloud_strategy.state, subcloud_strategy.build_phase.reason)
)
error_message = "VIM strategy build failed: "
elif subcloud_strategy.state == vim.STATE_BUILD_TIMEOUT:
raise Exception(
"VIM strategy build timed out: %s." % subcloud_strategy.state
)
error_message = "VIM strategy build timed out: "
else:
raise Exception(
"VIM strategy unexpected build state: %s" % subcloud_strategy.state
error_message = "VIM strategy unexpected build state."
if error_message:
build_error = subcloud_strategy.build_phase.response
# If response is None, use the reason
if not build_error:
build_error = subcloud_strategy.build_phase.reason
db_api.subcloud_update(
self.context,
strategy_step.subcloud_id,
error_description=build_error,
)
raise exceptions.CreateVIMStrategyFailedException(
subcloud=strategy_step.subcloud.name,
name=self.strategy_name,
state=subcloud_strategy.state,
details=error_message + build_error,
)
# Success, state machine can proceed to the next state

View File

@ -7,7 +7,6 @@
from dccommon.drivers.openstack import software_v1
from dcmanager.common import consts
from dcmanager.common import exceptions
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.software.cache.cache_specifications import (
REGION_ONE_RELEASE_USM_CACHE_TYPE,
@ -28,8 +27,6 @@ class FinishStrategyState(BaseState):
self.info_log(strategy_step, "Finishing software strategy")
subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)
regionone_deployed_releases = self._read_from_cache(
REGION_ONE_RELEASE_USM_CACHE_TYPE, state=software_v1.DEPLOYED
)
@ -46,7 +43,7 @@ class FinishStrategyState(BaseState):
message = "Cannot retrieve subcloud releases. Please see logs for details."
self.exception_log(strategy_step, message)
raise exceptions.SoftwareListFailedException(
subcloud=subcloud.name,
subcloud=strategy_step.subcloud.name,
details=message,
)
@ -72,7 +69,7 @@ class FinishStrategyState(BaseState):
if releases_to_delete:
self._handle_release_delete(
strategy_step, software_client, subcloud, releases_to_delete
strategy_step, software_client, releases_to_delete
)
if self.stopped():
@ -80,14 +77,13 @@ class FinishStrategyState(BaseState):
if releases_to_commit:
self._handle_deploy_commit(
strategy_step, software_client, subcloud, releases_to_commit
strategy_step, software_client, releases_to_commit
)
if releases_to_deploy_delete:
self._handle_deploy_delete(
strategy_step,
software_client,
subcloud,
releases_to_deploy_delete,
regionone_deployed_releases,
)
@ -95,7 +91,7 @@ class FinishStrategyState(BaseState):
return self.next_state
def _handle_release_delete(
self, strategy_step, software_client, subcloud, releases_to_delete
self, strategy_step, software_client, releases_to_delete
):
self.info_log(strategy_step, f"Deleting releases {releases_to_delete}")
try:
@ -106,13 +102,11 @@ class FinishStrategyState(BaseState):
)
self.exception_log(strategy_step, message)
raise exceptions.SoftwareDeleteFailedException(
subcloud=subcloud.name,
subcloud=strategy_step.subcloud.name,
details=message,
)
def _handle_deploy_commit(
self, strategy_step, software_client, subcloud, releases_to_commit
):
def _handle_deploy_commit(self, strategy_step, software_client, releases_to_commit):
raise NotImplementedError()
# If there are releases in deploying state and it's deployed in the regionone,
@ -121,7 +115,6 @@ class FinishStrategyState(BaseState):
self,
strategy_step,
software_client,
subcloud,
releases_to_deploy_delete,
regionone_deployed_releases,
):
@ -131,12 +124,12 @@ class FinishStrategyState(BaseState):
for release_regionone in regionone_deployed_releases
):
message = (
f"There is a deploying release on subcloud {subcloud.name} "
"that is not deployed in System Controller. Aborting."
f"Deploying release found on subcloud {strategy_step.subcloud.name} "
"and is not deployed in System Controller. Aborting."
)
self.error_log(strategy_step, message)
raise exceptions.SoftwareDeployDeleteFailedException(
subcloud=subcloud.name,
subcloud=strategy_step.subcloud.name,
details=message,
)
self.info_log(
@ -151,6 +144,6 @@ class FinishStrategyState(BaseState):
)
self.exception_log(strategy_step, message)
raise exceptions.SoftwareDeployDeleteFailedException(
subcloud=subcloud.name,
subcloud=strategy_step.subcloud.name,
details=message,
)

View File

@ -462,6 +462,14 @@ class SwUpdateManager(manager.Manager):
state=consts.STRATEGY_STATE_INITIAL,
details="",
)
# Clear the error_description field for all subclouds that will
# perform orchestration.
update_form = {"error_description": consts.ERROR_DESC_EMPTY}
db_api.subcloud_bulk_update_by_ids(
context,
[subcloud.id for subcloud, _ in valid_subclouds],
update_form,
)
LOG.info(
f"Finished creating software update strategy of type {payload['type']}."

View File

@ -65,6 +65,17 @@ class FakeVimStrategy(object):
self.abort_phase = abort_phase
class FakeVimStrategyPhase(object):
    """Test double for a VIM StrategyPhase object.

    The real object is defined in:
    starlingx/nfv/nfv-client/nfv_client/openstack/sw_update.py

    Only the ``response`` and ``reason`` attributes are modelled; both
    default to None.
    """

    def __init__(self, response=None, reason=None):
        # Store both phase fields exactly as given.
        self.response, self.reason = response, reason
class SwUpdateStrategy(object):
def __init__(self, id, data):
self.id = id

View File

@ -8,14 +8,20 @@ import mock
from dccommon.drivers.openstack import vim
from dcmanager.common import consts
from dcmanager.common import exceptions
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.fakes import FakeVimStrategy
from dcmanager.tests.unit.orchestrator.states.software.test_base import \
TestSoftwareOrchestrator
from dcmanager.tests.unit import fakes
from dcmanager.tests.unit.orchestrator.states.software.test_base import (
TestSoftwareOrchestrator,
)
STRATEGY_READY_TO_APPLY = FakeVimStrategy(state=vim.STATE_READY_TO_APPLY)
STRATEGY_APPLYING = FakeVimStrategy(state=vim.STATE_APPLYING)
STRATEGY_APPLIED = FakeVimStrategy(state=vim.STATE_APPLIED)
STRATEGY_READY_TO_APPLY = fakes.FakeVimStrategy(state=vim.STATE_READY_TO_APPLY)
STRATEGY_APPLYING = fakes.FakeVimStrategy(state=vim.STATE_APPLYING)
STRATEGY_APPLIED = fakes.FakeVimStrategy(state=vim.STATE_APPLIED)
APPLY_PHASE_ERROR = fakes.FakeVimStrategyPhase(response="Deploy Start Failed")
STRATEGY_APPLY_FAILED = fakes.FakeVimStrategy(
state=vim.STATE_APPLY_FAILED, apply_phase=APPLY_PHASE_ERROR
)
RELEASE_ID = "starlingx-9.0.1"
@ -36,7 +42,8 @@ class TestApplyVIMSoftwareStrategyState(TestSoftwareOrchestrator):
# Add the strategy_step state being processed by this unit test
self.strategy_step = self.setup_strategy_step(
self.subcloud.id, consts.STRATEGY_STATE_SW_APPLY_VIM_STRATEGY)
self.subcloud.id, consts.STRATEGY_STATE_SW_APPLY_VIM_STRATEGY
)
# Mock the API calls made by the state
self.vim_client.get_strategy = mock.MagicMock()
@ -52,9 +59,7 @@ class TestApplyVIMSoftwareStrategyState(TestSoftwareOrchestrator):
"DEFAULT_MAX_WAIT_ATTEMPTS",
3,
)
@mock.patch(
"dcmanager.orchestrator.states.applying_vim_strategy.WAIT_INTERVAL", 1
)
@mock.patch("dcmanager.orchestrator.states.applying_vim_strategy.WAIT_INTERVAL", 1)
@mock.patch(
"dcmanager.orchestrator.states.applying_vim_strategy."
"ApplyingVIMStrategyState.__init__.__defaults__",
@ -80,5 +85,44 @@ class TestApplyVIMSoftwareStrategyState(TestSoftwareOrchestrator):
self.vim_client.apply_strategy.assert_called_with(strategy_name="sw-upgrade")
# On success, the state should transition to the next state
self.assert_step_updated(
self.strategy_step.subcloud_id, self.on_success_state)
self.assert_step_updated(self.strategy_step.subcloud_id, self.on_success_state)
# Failure-path test for the apply-VIM-strategy state: the strategy is ready to
# apply, starts applying, and then reports an apply-failed state.
# The patches below lower the retry/wait constants of applying_vim_strategy
# (presumably so the polling loop finishes quickly in unit tests).
@mock.patch(
"dcmanager.orchestrator.states.applying_vim_strategy."
"DEFAULT_MAX_FAILED_QUERIES",
3,
)
@mock.patch(
"dcmanager.orchestrator.states.applying_vim_strategy."
"DEFAULT_MAX_WAIT_ATTEMPTS",
3,
)
@mock.patch("dcmanager.orchestrator.states.applying_vim_strategy.WAIT_INTERVAL", 1)
@mock.patch(
"dcmanager.orchestrator.states.applying_vim_strategy."
"ApplyingVIMStrategyState.__init__.__defaults__",
(3, 1),
)
# Replace the exception class with a mock so the keyword arguments used to
# build it can be asserted; this is the only patch that injects an argument.
# NOTE(review): with the class mocked, the state ends up raising a mock
# instance, which is not a BaseException, so Python raises TypeError at that
# point instead - confirm perform_state_action tolerates this.
@mock.patch.object(exceptions, "ApplyVIMStrategyFailedException")
def test_apply_vim_software_strategy_apply_failed(self, mock_exception):
"""Test apply vim software strategy apply failed"""
# Successive get_strategy queries: ready to apply, applying, then failed.
self.vim_client.get_strategy.side_effect = [
STRATEGY_READY_TO_APPLY,
STRATEGY_APPLYING,
STRATEGY_APPLY_FAILED,
]
# API calls act as expected
self.vim_client.apply_strategy.return_value = STRATEGY_APPLYING
self.worker.perform_state_action(self.strategy_step)
# Assert ApplyVIMStrategyFailedException is called with the correct parameters
# The details are the fixed prefix followed by the apply phase response
# (APPLY_PHASE_ERROR sets only `response`; its `reason` stays None).
expected_message = f"VIM strategy apply failed: {APPLY_PHASE_ERROR.response}"
mock_exception.assert_called_once_with(
subcloud=self.subcloud.name,
name=vim.STRATEGY_NAME_SW_USM,
state=vim.STATE_APPLY_FAILED,
details=expected_message,
)

View File

@ -8,14 +8,21 @@ import mock
from dccommon.drivers.openstack import vim
from dcmanager.common import consts
from dcmanager.common import exceptions
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.fakes import FakeVimStrategy
from dcmanager.tests.unit import fakes
from dcmanager.tests.unit.orchestrator.states.software.test_base import (
TestSoftwareOrchestrator,
)
STRATEGY_BUILDING = FakeVimStrategy(state=vim.STATE_BUILDING)
STRATEGY_DONE_BUILDING = FakeVimStrategy(state=vim.STATE_READY_TO_APPLY)
STRATEGY_BUILDING = fakes.FakeVimStrategy(state=vim.STATE_BUILDING)
BUILD_PHASE_ERROR = fakes.FakeVimStrategyPhase(
response="Installed license is valid: [FAIL]"
)
STRATEGY_BUILDING_FAILED = fakes.FakeVimStrategy(
state=vim.STATE_BUILD_FAILED, build_phase=BUILD_PHASE_ERROR
)
STRATEGY_DONE_BUILDING = fakes.FakeVimStrategy(state=vim.STATE_READY_TO_APPLY)
RELEASE_ID = "starlingx-9.0.1"
@ -72,3 +79,26 @@ class TestCreateVIMSoftwareStrategyState(TestSoftwareOrchestrator):
# On success, the state should transition to the next state
self.assert_step_updated(self.strategy_step.subcloud_id, self.on_success_state)
# Failure-path test for the create-VIM-strategy state: the strategy is created
# in the building state and the following query reports a build failure.
# The exception class is replaced with a mock so the keyword arguments used to
# build it can be asserted.
# NOTE(review): with the class mocked, the state ends up raising a mock
# instance, which is not a BaseException, so Python raises TypeError at that
# point instead - confirm perform_state_action tolerates this.
@mock.patch.object(exceptions, "CreateVIMStrategyFailedException")
def test_create_vim_software_strategy_build_failed(self, mock_exception):
"""Test create vim software strategy build failed"""
# First query returns None (presumably "no existing strategy", which leads
# to create_strategy being called); the second reports the build failure.
self.vim_client.get_strategy.side_effect = [
None,
STRATEGY_BUILDING_FAILED,
]
# API calls act as expected
self.vim_client.create_strategy.return_value = STRATEGY_BUILDING
self.worker.perform_state_action(self.strategy_step)
# Assert CreateVIMStrategyFailedException is called with the correct parameters
# The details are the fixed prefix followed by the build phase response
# (BUILD_PHASE_ERROR sets only `response`; its `reason` stays None).
expected_message = f"VIM strategy build failed: {BUILD_PHASE_ERROR.response}"
mock_exception.assert_called_once_with(
subcloud=self.subcloud.name,
name=vim.STRATEGY_NAME_SW_USM,
state=vim.STATE_BUILD_FAILED,
details=expected_message,
)