diff --git a/nodepool/driver/__init__.py b/nodepool/driver/__init__.py
index 7383b2534..5ae55c83e 100644
--- a/nodepool/driver/__init__.py
+++ b/nodepool/driver/__init__.py
@@ -300,6 +300,15 @@ class LabelRecorder(object):
         self.data.remove({'label': label, 'node_id': node_id})
         return node_id
 
+    def removeNode(self, id):
+        '''
+        Remove the node with the specified ID.
+        '''
+        for d in self.data:
+            if d['node_id'] == id:
+                self.data.remove(d)
+                return
+
 
 class NodeRequestHandlerNotifications(object):
     """
@@ -645,9 +654,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             return False
 
         # Launches are complete, so populate ready_nodes and failed_nodes.
-        for node in self.nodeset:
+        aborted_nodes = []
+        for node in self.nodeset[:]:
             if node.state == zk.READY:
                 self.ready_nodes.append(node)
+            elif node.state == zk.ABORTED:
+                # ABORTED is a transient error triggered by overquota. In order
+                # to handle this gracefully don't count this as failed so the
+                # node is relaunched within this provider. Unlock the node so
+                # the DeletedNodeWorker cleans up the zNode.
+                aborted_nodes.append(node)
+                self.nodeset.remove(node)
+                self.zk.unlockNode(node)
             else:
                 self.failed_nodes.append(node)
 
@@ -674,6 +692,14 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
             self.log.debug("Declining node request %s because nodes failed",
                           self.request.id)
             self.decline_request()
+        elif aborted_nodes:
+            # Because nodes are added to the satisfied types list before they
+            # are ready we need to remove the aborted nodes again so they can
+            # be created again.
+            for node in aborted_nodes:
+                self._satisfied_types.removeNode(node.id)
+            self.paused = True
+            return False
         else:
             # The assigned nodes must be added to the request in the order
             # in which they were requested.
diff --git a/nodepool/driver/openstack/handler.py b/nodepool/driver/openstack/handler.py
index 946e93141..1f5e64b98 100644
--- a/nodepool/driver/openstack/handler.py
+++ b/nodepool/driver/openstack/handler.py
@@ -246,10 +246,12 @@ class OpenStackNodeLauncher(NodeLauncher):
                     self.zk.storeNode(self.node)
                 if attempts == self._retries:
                     raise
-                # Invalidate the quota cache if we encountered a quota error.
                 if 'quota exceeded' in str(e).lower():
+                    # A quota exception is not directly recoverable so bail
+                    # out immediately with a specific exception.
                     self.log.info("Quota exceeded, invalidating quota cache")
                     self.handler.manager.invalidateQuotaCache()
+                    raise exceptions.QuotaException("Quota exceeded")
                 attempts += 1
 
         self.node.state = zk.READY
@@ -380,8 +382,8 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
         '''
         Check if all launch requests have completed.
 
-        When all of the Node objects have reached a final state (READY or
-        FAILED), we'll know all threads have finished the launch process.
+        When all of the Node objects have reached a final state (READY, FAILED
+        or ABORTED), we'll know all threads have finished the launch process.
         '''
         if not self._threads:
             return True
@@ -392,9 +394,10 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
 
         node_states = [node.state for node in self.nodeset]
 
-        # NOTE: It very important that NodeLauncher always sets one of
+        # NOTE: It's very important that NodeLauncher always sets one of
         # these states, no matter what.
-        if not all(s in (zk.READY, zk.FAILED) for s in node_states):
+        if not all(s in (zk.READY, zk.FAILED, zk.ABORTED)
+                   for s in node_states):
             return False
 
         return True
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index 32f77f6c4..0a760168f 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -22,6 +22,7 @@ import time
 
 from kazoo import exceptions as kze
 
+from nodepool import exceptions
 from nodepool import stats
 from nodepool import zk
 
@@ -69,6 +70,16 @@ class NodeLauncher(threading.Thread,
                            self.node.id)
             self.node.state = zk.FAILED
             statsd_key = 'error.zksession'
+        except exceptions.QuotaException:
+            # We encountered a quota error when trying to launch a
+            # node. In this case we need to abort the launch. The upper
+            # layers will take care of this and reschedule a new node once
+            # the quota is ok again.
+            self.log.info("Aborting node %s due to quota failure" %
+                          self.node.id)
+            self.node.state = zk.ABORTED
+            self.zk.storeNode(self.node)
+            statsd_key = 'error.quota'
         except Exception as e:
             self.log.exception("Launch failed for node %s:", self.node.id)
             self.node.state = zk.FAILED
diff --git a/nodepool/exceptions.py b/nodepool/exceptions.py
index 44cabfe59..f811a4ee7 100755
--- a/nodepool/exceptions.py
+++ b/nodepool/exceptions.py
@@ -45,6 +45,10 @@ class DibFailedError(BuilderError):
     pass
 
 
+class QuotaException(Exception):
+    pass
+
+
 class TimeoutException(Exception):
     pass
 
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index c62c8e84e..e81887e19 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -224,6 +224,8 @@ class PoolWorker(threading.Thread):
             try:
                 if not r.poll():
                     active_handlers.append(r)
+                    if r.paused:
+                        self.paused_handler = r
                 else:
                     self.log.debug("Removing handler for request %s",
                                    r.request.id)
@@ -662,7 +664,7 @@ class DeletedNodeWorker(BaseCleanupWorker):
         Delete instances from providers and nodes entries from ZooKeeper.
         '''
         cleanup_states = (zk.USED, zk.IN_USE, zk.BUILDING, zk.FAILED,
-                          zk.DELETING)
+                          zk.DELETING, zk.ABORTED)
 
         zk_conn = self._nodepool.getZK()
         for node in zk_conn.nodeIterator():
diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py
index c9e088155..87f2732d3 100644
--- a/nodepool/tests/test_launcher.py
+++ b/nodepool/tests/test_launcher.py
@@ -297,32 +297,35 @@ class TestLauncher(tests.DBTestCase):
         # (according to nodepool's quota estimate) fails.
         client.max_instances = 1
 
-        # Request a second node; this request should fail.
+        # Request a second node; this request should pause the handler.
        req2 = zk.NodeRequest()
         req2.state = zk.REQUESTED
         req2.node_types.append('fake-label')
         self.log.debug("Adding second request")
         self.zk.storeNodeRequest(req2)
-        req2 = self.waitForNodeRequest(req2)
-        self.assertEqual(req2.state, zk.FAILED)
 
-        # After the second request failed, the internal quota estimate
-        # should be reset, so the next request should pause to wait
-        # for more quota to become available.
-        req3 = zk.NodeRequest()
-        req3.state = zk.REQUESTED
-        req3.node_types.append('fake-label')
-        self.log.debug("Adding third request")
-        self.zk.storeNodeRequest(req3)
-        req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
-        self.assertEqual(req3.state, zk.PENDING)
-
-        # Wait until there is a paused request handler and verify that
-        # there is still only one server built (from the first
-        # request).
         pool_worker = pool.getPoolWorkers('fake-provider')
         while not pool_worker[0].paused_handler:
+            # self.log.debug("tick")
             time.sleep(0.1)
+        self.log.debug("finished waiting")
+
+        # The handler is paused now and the request should be in state PENDING
+        req2 = self.waitForNodeRequest(req2, zk.PENDING)
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # Now free up the first node
+        self.log.debug("Marking first node as used %s", req1.id)
+        req1_node.state = zk.USED
+        self.zk.storeNode(req1_node)
+        self.zk.unlockNode(req1_node)
+        self.waitForNodeDeletion(req1_node)
+
+        # After the first node is cleaned up the second request should be
+        # able to fulfill now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FULFILLED)
+        self.assertEqual(len(client._server_list), 1)
 
     def test_fail_request_on_launch_failure(self):
diff --git a/nodepool/zk.py b/nodepool/zk.py
index 6ead9199e..bde1ab4e5 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -52,6 +52,9 @@ USED = 'used'
 HOLD = 'hold'
 # Initial node state
 INIT = 'init'
+# Aborted due to a transient error like overquota that should not count as a
+# failed launch attempt
+ABORTED = 'aborted'
 
 
 # NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -484,7 +487,7 @@ class Node(BaseModel):
     Class representing a launched node.
     '''
     VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
-                        HOLD, DELETING, FAILED, INIT])
+                        HOLD, DELETING, FAILED, INIT, ABORTED])
 
     def __init__(self, id=None):
         super(Node, self).__init__(id)
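
The patch above establishes a small contract between a driver launcher and the generic NodeLauncher in nodepool/driver/utils.py: when a launch fails with an over-quota error, the launcher raises exceptions.QuotaException, and the base class records the node as ABORTED instead of FAILED so the request handler pauses and retries rather than declining the request. The sketch below is illustrative only, not part of the change: ExampleNodeLauncher, self._client and create_server() are hypothetical names, and it assumes the base class's run() wraps the subclass's launch() method, as the OpenStack launcher's changes suggest.

# Illustrative sketch: how another driver's launcher could opt in to the
# new quota handling. ExampleNodeLauncher and self._client are hypothetical.
from nodepool import exceptions
from nodepool import zk
from nodepool.driver.utils import NodeLauncher


class ExampleNodeLauncher(NodeLauncher):
    def launch(self):
        try:
            # Provider-specific create call; _client/create_server are
            # placeholders for whatever SDK the driver uses.
            server = self._client.create_server(name=self.node.id)
        except Exception as e:
            if 'quota exceeded' in str(e).lower():
                # Raising QuotaException makes NodeLauncher.run() store the
                # node as ABORTED instead of FAILED, so the request handler
                # pauses and relaunches once capacity frees up.
                raise exceptions.QuotaException("Quota exceeded")
            raise
        self.node.external_id = server.id
        self.node.state = zk.READY
        self.zk.storeNode(self.node)

The OpenStack handler in the diff does essentially this inside its existing retry loop; the sketch only isolates the quota branch.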