Minor improvements to nodescan state machine

* Change the state change logging level to debug -- it's chatty

* Don't allow individual connection attempts to take > 10 seconds

  This is a behavior that is in the old nodescan method that wasn't ported over but should be. As a port comes online as part of the boot process, early connection attempts may hang while later ones may succeed. We want to continually try new connections whether they return an error or hang.

* Fall through to the complete state even if the last key is ignored

  Previously, if the last key we scanned was not compatible, the state machine would need to go through one extra state transition in order to set the complete flag, due to an early return call. We now rearrange that state transition so that we fall through to completion regardless of whether the last key was added.

Change-Id: Ic6fd1551c3ef1bbd8eaf3b733e9ecc2609bce47f
2023-11-07 10:36:02 -08:00 · 2023-11-07 10:36:02 -08:00 · 49e7dab5f5
commit 49e7dab5f5
parent 5984a2638a
1 changed files with 16 additions and 4 deletions
--- a/nodepool/driver/statemachine.py
+++ b/nodepool/driver/statemachine.py
@ -1169,7 +1169,7 @@ class NodescanWorker:
        old_state = request.state
        request.advance(socket_ready)
        if request.state != old_state:
-            request.log.info(
+            request.log.debug(
                "Nodescan request for %s advanced "
                "from %s to %s %s",
                request.node.id, old_state, request.state, request.iteration)
@ -1254,6 +1254,7 @@ class NodescanRequest:
        self.start_time = time.monotonic()
        self.worker = None
        self.exception = None
+        self.connect_start_time = None

    def setWorker(self, worker):
        """Store a reference to the worker thread so we register and unregister
@ -1318,6 +1319,7 @@ class NodescanRequest:
            self.sock.connect(self.sockaddr)
        except BlockingIOError:
            self.state = self.CONNECTING_INIT
+        self.connect_start_time = time.monotonic()
        self.worker.registerDescriptor(self.sock)

    def _start(self):
@ -1361,7 +1363,13 @@ class NodescanRequest:

        if self.state == self.CONNECTING_INIT:
            if not socket_ready:
+                # Check the overall timeout
                self._checkTimeout()
+                # If we're still here, then don't let any individual
+                # connection attempt last more than 10 seconds:
+                if time.monotonic() - self.connect_start_time >= 10:
+                    self._close()
+                    self.state = self.START
                return
            eno = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
            if eno:
@ -1369,10 +1377,12 @@ class NodescanRequest:
                    self.log.exception(
                        f"Error {eno} connecting to {self.ip} "
                        f"on port {self.port}")
-                # Try again
+                # Try again.  Don't immediately start to reconnect
+                # since econnrefused can happen very quickly, so we
+                # could end up busy-waiting.
+                self._close()
                self.state = self.START
                self._checkTimeout()
-                self._connect()
                return
            if self.gather_hostkeys:
                self._start()
@ -1434,7 +1444,9 @@ class NodescanRequest:
                        f"SSH error connecting to {self.ip} "
                        f"on port {self.port}")
                self._nextKey()
-                return
+
+        # Check if we're still in the same state
+        if self.state == self.NEGOTIATING_KEY:
            key = self.transport.get_remote_server_key()
            if key:
                self.keys.append("%s %s" % (key.get_name(), key.get_base64()))