Add host-key-checking to metastatic driver
If a long-running backing node used by the metastatic driver develops
problems, performing a host-key check each time we allocate a new
metastatic node may detect these problems.  If that happens, mark the
backing node as failed so that no more nodes are allocated to it and
it is eventually removed.

Change-Id: Ib1763cf8c6e694a4957cb158b3b6afa53d20e606
parent f89b41f6ad
commit 21f1b88b75
@@ -107,6 +107,23 @@ itself, which is "meta".

            to apply to all pools within that provider, or it can be
            overridden here for a specific pool.

         .. attr:: host-key-checking
            :type: bool
            :default: False

            Whether to validate SSH host keys.  When true, this helps
            ensure that nodes are ready to receive SSH connections
            before they are supplied to the requestor.  When set to
            false, nodepool-launcher will not attempt to ssh-keyscan
            nodes after they are booted.  Unlike other drivers, this
            defaults to false here because it is presumed that the
            backing node has already been checked for connectivity.
            Enabling it here will cause the launcher to check
            connectivity each time it allocates a new slot on the
            backing node, and if a check fails, it will mark the backing
            node as failed and stop allocating any more slots on that
            node.

         .. attr:: node-attributes
            :type: dict
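For illustration, enabling the check at the pool level might look like the
following sketch; the provider and label names here are placeholders and are
not taken from this change:

providers:
  - name: meta-provider            # hypothetical provider name
    driver: metastatic
    pools:
      - name: main
        max-servers: 10
        host-key-checking: true    # re-scan the backing node for each new slot
        labels:
          - name: small-node       # hypothetical metastatic label
            backing-label: backing-label
            max-parallel-jobs: 2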
@@ -175,3 +192,23 @@ itself, which is "meta".

            last requested node is deleted to see if new requests
            arrive for this label before deleting the backing node.
            Set this value to the amount of time in seconds to wait.

         .. attr:: host-key-checking
            :type: bool
            :default: False

            Whether to validate SSH host keys.  When true, this helps
            ensure that nodes are ready to receive SSH connections
            before they are supplied to the requestor.  When set to
            false, nodepool-launcher will not attempt to ssh-keyscan
            nodes after they are booted.  Unlike other drivers, this
            defaults to false here because it is presumed that the
            backing node has already been checked for connectivity.
            Enabling it here will cause the launcher to check
            connectivity each time it allocates a new slot on the
            backing node, and if a check fails, it will mark the backing
            node as failed and stop allocating any more slots on that
            node.

            .. note:: This value will override the value for
                      :attr:`providers.[metastatic].pools.host-key-checking`.
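A label-level override, which takes precedence over the pool setting as the
note above describes, might look like this sketch (names again illustrative):

pools:
  - name: main
    host-key-checking: false       # pool default left disabled
    labels:
      - name: big-node             # hypothetical label
        backing-label: backing-label
        host-key-checking: true    # but checked when allocating this label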
@@ -220,6 +220,8 @@ class BackingNodeRecord:
        self.last_used = time.time()

    def hasAvailableSlot(self):
        if self.failed:
            return False
        return None in self.allocated_nodes

    def isEmpty(self):
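As context for the change above, the slot accounting that hasAvailableSlot()
operates on can be pictured with this simplified Python sketch; it is not the
real BackingNodeRecord, which also tracks the label, node id, and timestamps:

class SlotTracker:
    """Minimal stand-in for BackingNodeRecord's slot bookkeeping."""

    def __init__(self, slot_count):
        # One entry per slot; None marks a free slot.
        self.allocated_nodes = [None] * slot_count
        self.failed = False

    def has_available_slot(self):
        # A failed backing node never offers slots, even if some are
        # free, so no further metastatic nodes land on it.
        if self.failed:
            return False
        return None in self.allocated_nodes

    def allocate_slot(self, node_id):
        slot = self.allocated_nodes.index(None)
        self.allocated_nodes[slot] = node_id
        return slot

    def deallocate_slot(self, node_id):
        slot = self.allocated_nodes.index(node_id)
        self.allocated_nodes[slot] = None
        return slot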
@@ -297,6 +299,8 @@ class MetastaticAdapter(statemachine.Adapter):
                    # The label doesn't exist in our config any more,
                    # it must have been removed.
                    grace_time = 0
                if bnr.failed:
                    grace_time = 0
                if (bnr.isEmpty() and
                        now - bnr.last_used > grace_time):
                    self.log.info("Backing node %s has been idle for "
@@ -325,6 +329,21 @@ class MetastaticAdapter(statemachine.Adapter):
    def getQuotaForLabel(self, label):
        return QuotaInformation(instances=1)

    def notifyNodescanFailure(self, label, external_id):
        with self.allocation_lock:
            backing_node_records = self.backing_node_records.get(
                label.name, [])
            for bnr in backing_node_records:
                if bnr.backsNode(external_id):
                    self.log.info(
                        "Nodescan failure of %s on %s, failing backing node",
                        external_id, bnr.node_id)
                    bnr.failed = True
                    backing_node = self._getNode(bnr.node_id)
                    backing_node.user_data = self._makeBackingNodeUserData(bnr)
                    self.zk.storeNode(backing_node)
                    return

    # Local implementation below

    def _init(self):
@@ -352,6 +371,7 @@ class MetastaticAdapter(statemachine.Adapter):
                backing_node_record = BackingNodeRecord(user_data['label'],
                                                        user_data['slots'])
                backing_node_record.node_id = node.id
                backing_node_record.failed = user_data.get('failed', False)
                self.log.info("Found backing node %s for %s",
                              node.id, user_data['label'])
                self._addBackingNode(user_data['label'],
@@ -381,6 +401,9 @@ class MetastaticAdapter(statemachine.Adapter):
        # if we have room for the label, allocate and return existing slot
        # otherwise, make a new backing node
        with self.allocation_lock:
            # First, find out if we're retrying a request for the same
            # node id; if so, immediately deallocate the old one.
            self._deallocateBackingNodeInner(node_id)
            backing_node_record = None
            for bnr in self.backing_node_records.get(label.name, []):
                if bnr.hasAvailableSlot():
@@ -411,15 +434,26 @@ class MetastaticAdapter(statemachine.Adapter):
    def _deallocateBackingNode(self, node_id):
        self._init()
        with self.allocation_lock:
            for label_name, backing_node_records in \
                    self.backing_node_records.items():
                for bn in backing_node_records:
                    if bn.backsNode(node_id):
                        slot = bn.deallocateSlot(node_id)
                        self.log.info(
                            "Unassigned node %s from backing node %s slot %s",
                            node_id, bn.node_id, slot)
                        return
            self._deallocateBackingNodeInner(node_id)

    def _deallocateBackingNodeInner(self, node_id):
        for label_name, backing_node_records in \
                self.backing_node_records.items():
            for bnr in backing_node_records:
                if bnr.backsNode(node_id):
                    slot = bnr.deallocateSlot(node_id)
                    self.log.info(
                        "Unassigned node %s from backing node %s slot %s",
                        node_id, bnr.node_id, slot)
                    return

    def _makeBackingNodeUserData(self, bnr):
        return json.dumps({
            'owner': self.my_id,
            'label': bnr.label_name,
            'slots': bnr.slot_count,
            'failed': bnr.failed,
        })

    def _checkBackingNodeRequests(self):
        self._init()
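For example, the user data written for a failed backing node serializes to
JSON along these lines; the owner id, label, and slot count below are made-up
values for illustration:

import json

user_data = json.dumps({
    'owner': 'launcher-example-01',   # hypothetical launcher id
    'label': 'backing-label',
    'slots': 2,
    'failed': True,
})
# When the adapter starts up and rediscovers existing backing nodes,
# it restores the flag via user_data.get('failed', False), so a failed
# backing node stays out of service across launcher restarts.
assert json.loads(user_data)['failed'] is True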
@@ -452,11 +486,7 @@ class MetastaticAdapter(statemachine.Adapter):
                node = self._getNode(node_id)
                self.zk.lockNode(node, blocking=True, timeout=30,
                                 ephemeral=False, identifier=self.my_id)
                node.user_data = json.dumps({
                    'owner': self.my_id,
                    'label': bnr.label_name,
                    'slots': bnr.slot_count,
                })
                node.user_data = self._makeBackingNodeUserData(bnr)
                node.state = zk.IN_USE
                self.zk.storeNode(node)
                self.zk.deleteNodeRequest(request)
@@ -45,7 +45,8 @@ class MetastaticLabel(ConfigValue):
        self.cloud_image = MetastaticCloudImage()
        self.max_parallel_jobs = label.get('max-parallel-jobs', 1)
        self.grace_time = label.get('grace-time', 60)
        self.host_key_checking = self.pool.host_key_checking
        self.host_key_checking = label.get('host-key-checking',
                                           self.pool.host_key_checking)

    @staticmethod
    def getSchema():
@@ -54,6 +55,7 @@ class MetastaticLabel(ConfigValue):
            v.Required('backing-label'): str,
            'max-parallel-jobs': int,
            'grace-time': int,
            'host-key-checking': bool,
        }

    def isBackingConfigEqual(self, other):
@@ -74,7 +76,9 @@ class MetastaticPool(ConfigPool):
        self.labels = {}
        # We will just use the interface_ip of the backing node
        self.use_internal_ip = False
        self.host_key_checking = False
        # Allow extra checking of the backing node to detect failure
        # after it's already running.
        self.host_key_checking = pool_config.get('host-key-checking', False)

        self.load(pool_config)

@@ -95,6 +99,7 @@ class MetastaticPool(ConfigPool):
            v.Required('name'): str,
            v.Required('labels'): [label],
            'max-servers': int,
            'host-key-checking': bool,
        })
        return pool
@@ -344,6 +344,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
            if isinstance(e, exceptions.LaunchKeyscanException):
                try:
                    label = self.handler.pool.labels[node.type[0]]
                    self.manager.adapter.notifyNodescanFailure(
                        label, node.external_id)
                    console = self.manager.adapter.getConsoleLog(
                        label, node.external_id)
                    if console:
@@ -1651,7 +1653,7 @@ class Adapter:
        """
        raise NotImplementedError()

    # The following method is optional
    # The following methods are optional
    def getConsoleLog(self, label, external_id):
        """Return the console log from the specified server

@@ -1659,3 +1661,11 @@
        :param external_id str: The external id of the server
        """
        raise NotImplementedError()

    def notifyNodescanFailure(self, label, external_id):
        """Notify the adapter of a nodescan failure

        :param label ConfigLabel: The label config for the node
        :param external_id str: The external id of the server
        """
        pass
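A minimal sketch of an adapter implementing this optional hook might look like
the following; the class name and log message are illustrative, and only the
metastatic adapter above actually overrides the hook in this change:

import logging

from nodepool.driver import statemachine


class ExampleAdapter(statemachine.Adapter):
    """Illustrative adapter showing the optional nodescan-failure hook."""

    log = logging.getLogger("nodepool.ExampleAdapter")

    def notifyNodescanFailure(self, label, external_id):
        # Called by the launcher when the ssh-keyscan of a booted node
        # fails; a real adapter could use this to take backing capacity
        # out of service, as the metastatic driver does.
        self.log.warning("Nodescan failed for %s (label %s)",
                         external_id, label.name)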
nodepool/tests/fixtures/metastatic.yaml
@@ -49,6 +49,7 @@ providers:
      - name: main
        max-servers: 10
        priority: 1
        host-key-checking: true
        node-attributes:
          metaattr: meta
          testattr: metastatic
@@ -57,3 +58,4 @@ providers:
        backing-label: backing-label
        max-parallel-jobs: 2
        grace-time: 2
        host-key-checking: true
@@ -17,11 +17,13 @@ import json
import logging
import os

import fixtures
import testtools

from nodepool import tests
from nodepool.zk import zookeeper as zk
from nodepool.driver.statemachine import StateMachineProvider
import nodepool.driver.statemachine
from nodepool.cmd.config_validator import ConfigValidator
@@ -228,3 +230,79 @@ class TestDriverMetastatic(tests.DBTestCase):
        self.waitForNodeDeletion(bn1)
        nodes = self._getNodes()
        self.assertEqual(nodes, [])

    def test_metastatic_nodescan(self):
        # Test that a nodescan failure takes a backing node out of service

        counter = -1
        # bn1, node1, node2
        fail = [False, False, True]
        orig_advance = nodepool.driver.statemachine.NodescanRequest.advance

        def handler(*args, **kw):
            nonlocal counter, fail
            counter += 1
            if counter >= len(fail):
                return orig_advance(*args, **kw)
            if fail[counter]:
                raise nodepool.exceptions.ConnectionTimeoutException()
            return orig_advance(*args, **kw)

        self.useFixture(fixtures.MonkeyPatch(
            'nodepool.driver.statemachine.NodescanRequest.advance',
            handler))

        configfile = self.setup_config('metastatic.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.startPool(pool)
        manager = pool.getProviderManager('fake-provider')
        manager.adapter._client.create_image(name="fake-image")

        # Launch one metastatic node on a backing node
        node1 = self._requestNode()
        nodes = self._getNodes()
        self.assertEqual(len(nodes), 2)
        bn1 = nodes[1]
        self.assertEqual(bn1.provider, 'fake-provider')
        self.assertEqual(bn1.id, node1.driver_data['backing_node'])

        # Launch a second one with a failed nodescan; should have a
        # second backing node
        node2 = self._requestNode()
        nodes = self._getNodes()
        bn2 = nodes[3]
        # Reload bn1 since the userdata failed
        self.assertEqual(bn1.id, nodes[1].id)
        bn1 = nodes[1]
        self.assertNotEqual(bn1.id, bn2.id)
        self.assertEqual(nodes, [node1, bn1, node2, bn2])
        self.assertEqual(bn2.id, node2.driver_data['backing_node'])

        # Allocate a third node, should use the second backing node
        node3 = self._requestNode()
        nodes = self._getNodes()
        self.assertEqual(nodes, [node1, bn1, node2, bn2, node3])
        self.assertEqual(bn2.id, node3.driver_data['backing_node'])

        # Delete node3, verify that both backing nodes exist
        node3.state = zk.DELETING
        self.zk.storeNode(node3)
        self.waitForNodeDeletion(node3)
        nodes = self._getNodes()
        self.assertEqual(nodes, [node1, bn1, node2, bn2])

        # Delete node2, verify that only the first backing node exists
        node2.state = zk.DELETING
        self.zk.storeNode(node2)
        self.waitForNodeDeletion(node2)
        self.waitForNodeDeletion(bn2)
        nodes = self._getNodes()
        self.assertEqual(nodes, [node1, bn1])

        # Delete node1, verify that no nodes exist
        node1.state = zk.DELETING
        self.zk.storeNode(node1)
        self.waitForNodeDeletion(node1)
        self.waitForNodeDeletion(bn1)
        nodes = self._getNodes()
        self.assertEqual(nodes, [])
@@ -0,0 +1,9 @@
---
features:
  - |
    The metastatic driver now includes the
    :attr:`providers.[metastatic].pools.host-key-checking` and
    :attr:`providers.[metastatic].pools.labels.host-key-checking`
    options.  This can be used to verify that a backing node is still
    functioning correctly before allocating new metastatic nodes to
    it.