From 4fc1abf91fe622b3eca52547dca086396acf9436 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Mon, 13 Sep 2021 09:46:08 -0700 Subject: [PATCH] Fix driver task pattern to reduce periodic db load Previously, a pattern of periodic tasks was created where nodes, and in many cases, all nodes not actively locked nor those in maintenance state, were pulled in by a periodic task. These periodic tasks would then create tasks which generated additional database queries in order to populate the task object. With the task object populated, the driver would then evaluate if the driver in question was for the driver interface in question and *then* evaluate if work had to be performed. However, that field containing an indicator of whether work needed to be performed was often already queried from the database on the very initial query to generate the list of nodes to evaluate. In essence, we've moved this up in the sequence so we evaluate that field in question prior to creating the task, potentially across every conductor, depending on the query, and ultimately which drivers are enabled. This potentially saves hundreds of thousands of needless database queries on a medium size deployment per single day, depending on which drivers and driver interfaces are in use. 
Change-Id: I409e87de2808d442d39e4d0ae6e995668230cbba --- ironic/drivers/modules/drac/bios.py | 8 ++++++ ironic/drivers/modules/drac/management.py | 15 ++++++++--- ironic/drivers/modules/drac/raid.py | 12 ++++++--- ironic/drivers/modules/irmc/raid.py | 9 +++++-- ironic/drivers/modules/redfish/management.py | 27 ++++++++++++------- ironic/drivers/modules/redfish/raid.py | 27 ++++++++++++------- ...-driver-task-pattern-322e02b6a2233919.yaml | 8 ++++++ 7 files changed, 76 insertions(+), 30 deletions(-) create mode 100644 releasenotes/notes/optimize-driver-task-pattern-322e02b6a2233919.yaml diff --git a/ironic/drivers/modules/drac/bios.py b/ironic/drivers/modules/drac/bios.py index 52ccc8e66b..69955f6936 100644 --- a/ironic/drivers/modules/drac/bios.py +++ b/ironic/drivers/modules/drac/bios.py @@ -169,6 +169,14 @@ class DracWSManBIOS(base.BIOSInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + # NOTE(TheJulia) Evaluate if work is actually required before + # creating a task for every node in the deployment which does + # not have a lock and is not in maintenance mode. 
+ if (not driver_internal_info.get("bios_config_job_ids") + and not driver_internal_info.get( + "factory_reset_time_before_reboot")): + continue + lock_purpose = 'checking async bios configuration jobs' # Performing read-only/non-destructive work with shared lock with task_manager.acquire(context, node_uuid, diff --git a/ironic/drivers/modules/drac/management.py b/ironic/drivers/modules/drac/management.py index 3bacde9623..96e8dd5b59 100644 --- a/ironic/drivers/modules/drac/management.py +++ b/ironic/drivers/modules/drac/management.py @@ -502,6 +502,17 @@ class DracRedfishManagement(redfish_management.RedfishManagement): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + + task_monitor_url = driver_internal_info.get( + 'import_task_monitor_url') + # NOTE(TheJulia): Evaluate if a task monitor URL exists + # based upon our initial DB query before pulling a task for + # every node in the deployment which reduces the overall + # number of DB queries triggering in the background where + # no work is required. 
+ if not task_monitor_url: + continue + lock_purpose = 'checking async import configuration task' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -509,10 +520,6 @@ class DracRedfishManagement(redfish_management.RedfishManagement): if not isinstance(task.driver.management, DracRedfishManagement): continue - task_monitor_url = driver_internal_info.get( - 'import_task_monitor_url') - if not task_monitor_url: - continue self._check_import_configuration_task( task, task_monitor_url) except exception.NodeNotFound: diff --git a/ironic/drivers/modules/drac/raid.py b/ironic/drivers/modules/drac/raid.py index b42b5b21fc..4bb41c1f67 100644 --- a/ironic/drivers/modules/drac/raid.py +++ b/ironic/drivers/modules/drac/raid.py @@ -1482,6 +1482,14 @@ class DracWSManRAID(base.RAIDInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + + job_ids = driver_internal_info.get('raid_config_job_ids') + # NOTE(TheJulia): Evaluate if there is work to be done + # based upon the original DB query's results so we don't + # proceed creating tasks for every node in the deployment. 
+ if not job_ids: + continue + lock_purpose = 'checking async raid configuration jobs' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -1489,10 +1497,6 @@ class DracWSManRAID(base.RAIDInterface): if not isinstance(task.driver.raid, DracWSManRAID): continue - job_ids = driver_internal_info.get('raid_config_job_ids') - if not job_ids: - continue - self._check_node_raid_jobs(task) except exception.NodeNotFound: diff --git a/ironic/drivers/modules/irmc/raid.py b/ironic/drivers/modules/irmc/raid.py index 9016956322..34d1c3f382 100644 --- a/ironic/drivers/modules/irmc/raid.py +++ b/ironic/drivers/modules/irmc/raid.py @@ -434,6 +434,13 @@ class IRMCRAID(base.RAIDInterface): node_list = manager.iter_nodes(fields=fields, filters=filters) for (node_uuid, driver, conductor_group, raid_config) in node_list: try: + # NOTE(TheJulia): Evaluate based upon presence of raid + # configuration before triggering a task, as opposed to after + # so we don't create excess node task objects with related + # DB queries. 
+ if not raid_config or raid_config.get('fgi_status'): + continue + lock_purpose = 'checking async RAID configuration tasks' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -444,8 +451,6 @@ class IRMCRAID(base.RAIDInterface): continue if task.node.target_raid_config is None: continue - if not raid_config or raid_config.get('fgi_status'): - continue task.upgrade_lock() if node.provision_state != states.CLEANWAIT: continue diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 4d554925b2..5e572f84a8 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -872,6 +872,15 @@ class RedfishManagement(base.ManagementInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + firmware_updates = driver_internal_info.get( + 'firmware_updates') + # NOTE(TheJulia): If we don't have an entry upfront, we can + # safely skip past the node as we know work here is not + # required, otherwise minimizing the number of potential + # nodes to visit. + if not firmware_updates: + continue + lock_purpose = 'checking async firmware update failed.' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -880,11 +889,6 @@ class RedfishManagement(base.ManagementInterface): RedfishManagement): continue - firmware_updates = driver_internal_info.get( - 'firmware_updates') - if not firmware_updates: - continue - node = task.node # A firmware update failed. Discard any remaining firmware @@ -921,6 +925,14 @@ class RedfishManagement(base.ManagementInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + firmware_updates = driver_internal_info.get( + 'firmware_updates') + # NOTE(TheJulia): Check and skip upfront before creating a + # task so we don't generate additional tasks and db queries + # for every node in CLEANWAIT which is not locked. 
+ if not firmware_updates: + continue + lock_purpose = 'checking async firmware update tasks.' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -929,11 +941,6 @@ class RedfishManagement(base.ManagementInterface): RedfishManagement): continue - firmware_updates = driver_internal_info.get( - 'firmware_updates') - if not firmware_updates: - continue - self._check_node_firmware_update(task) except exception.NodeNotFound: diff --git a/ironic/drivers/modules/redfish/raid.py b/ironic/drivers/modules/redfish/raid.py index df0d0b99d6..f726d9d57a 100644 --- a/ironic/drivers/modules/redfish/raid.py +++ b/ironic/drivers/modules/redfish/raid.py @@ -1032,6 +1032,15 @@ class RedfishRAID(base.RAIDInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + raid_configs = driver_internal_info.get( + 'raid_configs') + # NOTE(TheJulia): Evaluate the presence of raid configuration + # activity before pulling the task, so we don't needlessly + # create database queries with tasks which would be skipped + # anyhow. + if not raid_configs: + continue + lock_purpose = 'checking async RAID config failed.' with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -1039,11 +1048,6 @@ class RedfishRAID(base.RAIDInterface): if not isinstance(task.driver.raid, RedfishRAID): continue - raid_configs = driver_internal_info.get( - 'raid_configs') - if not raid_configs: - continue - node = task.node # A RAID config failed. Discard any remaining RAID @@ -1080,6 +1084,14 @@ class RedfishRAID(base.RAIDInterface): for (node_uuid, driver, conductor_group, driver_internal_info) in node_list: try: + raid_configs = driver_internal_info.get( + 'raid_configs') + # NOTE(TheJulia): Skip to next record if we do not + # have raid configuration tasks, so we don't pull tasks + # for every unrelated node in CLEANWAIT. + if not raid_configs: + continue + lock_purpose = 'checking async RAID config tasks.' 
with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -1087,11 +1099,6 @@ class RedfishRAID(base.RAIDInterface): if not isinstance(task.driver.raid, RedfishRAID): continue - raid_configs = driver_internal_info.get( - 'raid_configs') - if not raid_configs: - continue - self._check_node_raid_config(task) except exception.NodeNotFound: diff --git a/releasenotes/notes/optimize-driver-task-pattern-322e02b6a2233919.yaml b/releasenotes/notes/optimize-driver-task-pattern-322e02b6a2233919.yaml new file mode 100644 index 0000000000..8f5c2b1929 --- /dev/null +++ b/releasenotes/notes/optimize-driver-task-pattern-322e02b6a2233919.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes the pattern of execution for periodic tasks such that the majority + of drivers now evaluate *if* work needs to be performed in advance of + creating a node task. Depending on the individual driver query pattern, + this prevents excess database queries from being triggered with every + task execution.