From 757bf27cac63b94027d2e3b8afa40624a715cca0 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 22 Aug 2023 15:37:30 -0700 Subject: [PATCH] Task Manager: Log if the lock takes too long In order to troubleshoot some of the locking issues, we need to understand if we're getting held up on the start of the lock or on the far end of the log upon completing the actual needful work. This change adds additional logging on the front end if it is on the front end, and also sets the rpc object to None explicitly before returning the update_node result, since it appears a stream of them can cause issues for sqlite's release of the database from one thread to another. Change-Id: I8ef68ff62e8f93f7bc5bd9ee7c6ac8d96328f929 --- ironic/api/controllers/v1/node.py | 1 - ironic/conductor/task_manager.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ironic/api/controllers/v1/node.py b/ironic/api/controllers/v1/node.py index 0dde80f1fe..0f9556bc82 100644 --- a/ironic/api/controllers/v1/node.py +++ b/ironic/api/controllers/v1/node.py @@ -3204,7 +3204,6 @@ class NodesController(rest.RestController): chassis_uuid = api_node.get('chassis_uuid') notify.emit_end_notification(context, new_node, 'update', chassis_uuid=chassis_uuid) - return api_node @METRICS.timer('NodesController.delete') diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py index 922e74cf63..817c2bd814 100644 --- a/ironic/conductor/task_manager.py +++ b/ironic/conductor/task_manager.py @@ -338,6 +338,10 @@ class TaskManager(object): else: stop_after = tenacity.stop_after_attempt(1) + max_lock_time = \ + CONF.conductor.node_locked_retry_interval * \ + CONF.conductor.node_locked_retry_attempts + # NodeLocked exceptions can be annoying. Let's try to alleviate # some of that pain by retrying our lock attempts. @tenacity.retry( @@ -347,6 +351,13 @@ class TaskManager(object): CONF.conductor.node_locked_retry_interval), reraise=True) def reserve_node(): + if self._debug_timer.elapsed() > max_lock_time: + LOG.warning('We have exceeded the normal maximum time window ' + 'to complete a node lock attempting to reserve ' + 'node %(node)s for purpose %(purpose)s. At ' + '%(time).2f seconds.', + {'node': self.node_id, 'purpose': self._purpose, + 'time': self._debug_timer.elapsed()}) self.node = objects.Node.reserve(self.context, CONF.host, self.node_id) LOG.debug("Node %(node)s successfully reserved for %(purpose)s "