Minor improvements to nodescan state machine
* Change the state change logging level to debug -- it's chatty
* Don't allow individual connection attempts to take > 10 seconds
  This is a behavior that is in the old nodescan method that wasn't ported over but should be. As a port comes online as part of the boot process, early connection attempts may hang while later ones may succeed. We want to continually try new connections whether they return an error or hang.
* Fall through to the complete state even if the last key is ignored
  Previously, if the last key we scanned was not compatible, the state machine would need to go through one extra state transition in order to set the complete flag, due to an early return call. We now rearrange that state transition so that we fall through to completion regardless of whether the last key was added.
Change-Id: Ic6fd1551c3ef1bbd8eaf3b733e9ecc2609bce47f
This commit is contained in:
parent
5984a2638a
commit
49e7dab5f5
@ -1169,7 +1169,7 @@ class NodescanWorker:
|
||||
old_state = request.state
|
||||
request.advance(socket_ready)
|
||||
if request.state != old_state:
|
||||
request.log.info(
|
||||
request.log.debug(
|
||||
"Nodescan request for %s advanced "
|
||||
"from %s to %s %s",
|
||||
request.node.id, old_state, request.state, request.iteration)
|
||||
@ -1254,6 +1254,7 @@ class NodescanRequest:
|
||||
self.start_time = time.monotonic()
|
||||
self.worker = None
|
||||
self.exception = None
|
||||
self.connect_start_time = None
|
||||
|
||||
def setWorker(self, worker):
|
||||
"""Store a reference to the worker thread so we register and unregister
|
||||
@ -1318,6 +1319,7 @@ class NodescanRequest:
|
||||
self.sock.connect(self.sockaddr)
|
||||
except BlockingIOError:
|
||||
self.state = self.CONNECTING_INIT
|
||||
self.connect_start_time = time.monotonic()
|
||||
self.worker.registerDescriptor(self.sock)
|
||||
|
||||
def _start(self):
|
||||
@ -1361,7 +1363,13 @@ class NodescanRequest:
|
||||
|
||||
if self.state == self.CONNECTING_INIT:
|
||||
if not socket_ready:
|
||||
# Check the overall timeout
|
||||
self._checkTimeout()
|
||||
# If we're still here, then don't let any individual
|
||||
# connection attempt last more than 10 seconds:
|
||||
if time.monotonic() - self.connect_start_time >= 10:
|
||||
self._close()
|
||||
self.state = self.START
|
||||
return
|
||||
eno = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
|
||||
if eno:
|
||||
@ -1369,10 +1377,12 @@ class NodescanRequest:
|
||||
self.log.exception(
|
||||
f"Error {eno} connecting to {self.ip} "
|
||||
f"on port {self.port}")
|
||||
# Try again
|
||||
# Try again. Don't immediately start to reconnect
|
||||
# since econnrefused can happen very quickly, so we
|
||||
# could end up busy-waiting.
|
||||
self._close()
|
||||
self.state = self.START
|
||||
self._checkTimeout()
|
||||
self._connect()
|
||||
return
|
||||
if self.gather_hostkeys:
|
||||
self._start()
|
||||
@ -1434,7 +1444,9 @@ class NodescanRequest:
|
||||
f"SSH error connecting to {self.ip} "
|
||||
f"on port {self.port}")
|
||||
self._nextKey()
|
||||
return
|
||||
|
||||
# Check if we're still in the same state
|
||||
if self.state == self.NEGOTIATING_KEY:
|
||||
key = self.transport.get_remote_server_key()
|
||||
if key:
|
||||
self.keys.append("%s %s" % (key.get_name(), key.get_base64()))
|
||||
|
Loading…
x
Reference in New Issue
Block a user