From b2367d88a68ec9fa8cd1f273bf4ed0ec216e87f6 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 10 Oct 2013 12:44:33 -0700 Subject: [PATCH] Add the ability to ignore offline targets If a jenkins is in shutdown mode or is offline, ignore that jenkins for the purposes of launching nodes. Node updates (used/complete) for that jenkins will still be processed. This should allow another jenkins to gracefully accept the increased load if one goes offline. Also, log the IP address when spinning up a node. Change-Id: I3a8720dd5aaf154ca91cdc36136decad52eb6afa --- nodepool/fakeprovider.py | 33 ++++++++++++++++++++++++++++++- nodepool/jenkins_manager.py | 10 +++++++++- nodepool/nodepool.py | 39 ++++++++++++++++++++++++++++++------- tools/fake-servers.py | 4 ++-- 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/nodepool/fakeprovider.py b/nodepool/fakeprovider.py index 0bd21e9b4..58fbc7b3f 100644 --- a/nodepool/fakeprovider.py +++ b/nodepool/fakeprovider.py @@ -18,6 +18,7 @@ import uuid import time import threading import novaclient +from jenkins import JenkinsException class Dummy(object): @@ -108,8 +109,14 @@ class FakeSSHClient(object): class FakeJenkins(object): - def __init__(self): + def __init__(self, user): self._nodes = {} + self.quiet = False + self.down = False + if user == 'quiet': + self.quiet = True + if user == 'down': + self.down = True def node_exists(self, name): return name in self._nodes @@ -120,5 +127,29 @@ class FakeJenkins(object): def delete_node(self, name): del self._nodes[name] + def get_info(self): + if self.down: + raise JenkinsException("Jenkins is down") + d = {u'assignedLabels': [{}], + u'description': None, + u'jobs': [{u'color': u'red', + u'name': u'test-job', + u'url': u'https://jenkins.example.com/job/test-job/'}], + u'mode': u'NORMAL', + u'nodeDescription': u'the master Jenkins node', + u'nodeName': u'', + u'numExecutors': 1, + u'overallLoad': {}, + u'primaryView': {u'name': u'Overview', + u'url': 
u'https://jenkins.example.com/'}, + u'quietingDown': self.quiet, + u'slaveAgentPort': 8090, + u'unlabeledLoad': {}, + u'useCrumbs': False, + u'useSecurity': True, + u'views': [ + {u'name': u'test-view', + u'url': u'https://jenkins.example.com/view/test-view/'}]} + return d FAKE_CLIENT = FakeClient() diff --git a/nodepool/jenkins_manager.py b/nodepool/jenkins_manager.py index d33a99964..d94c5f2fe 100644 --- a/nodepool/jenkins_manager.py +++ b/nodepool/jenkins_manager.py @@ -80,6 +80,11 @@ class StartBuildTask(Task): parameters=self.args['params']) +class GetInfoTask(Task): + def main(self, jenkins): + return jenkins.get_info() + + class JenkinsManager(TaskManager): log = logging.getLogger("nodepool.JenkinsManager") @@ -90,7 +95,7 @@ class JenkinsManager(TaskManager): def _getClient(self): if self.target.jenkins_apikey == 'fake': - return fakeprovider.FakeJenkins() + return fakeprovider.FakeJenkins(self.target.jenkins_user) return myjenkins.Jenkins(self.target.jenkins_url, self.target.jenkins_user, self.target.jenkins_apikey) @@ -127,3 +132,6 @@ class JenkinsManager(TaskManager): def startBuild(self, name, params): self.submitTask(StartBuildTask(name=name, params=params)) + + def getInfo(self): + return self.submitTask(GetInfoTask()) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 225163c92..3d7025203 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -293,7 +293,8 @@ class NodeLauncher(threading.Thread): raise Exception("Unable to find public IP of server") self.node.ip = ip - self.log.debug("Node id: %s is running, testing ssh" % self.node.id) + self.log.debug("Node id: %s is running, ip: %s, testing ssh" % + (self.node.id, ip)) connect_kwargs = dict(key_filename=self.image.private_key) if not utils.ssh_connect(ip, self.image.username, connect_kwargs=connect_kwargs, @@ -630,6 +631,7 @@ class NodePool(threading.Thread): t.name = target['name'] newconfig.targets[t.name] = t jenkins = target.get('jenkins') + t.online = True if jenkins: 
t.jenkins_url = jenkins['url'] t.jenkins_user = jenkins['user'] @@ -711,6 +713,18 @@ class NodePool(threading.Thread): for oldmanager in stop_managers: oldmanager.stop() + for t in config.targets.values(): + try: + info = config.jenkins_managers[t.name].getInfo() + if info['quietingDown']: + self.log.info("Target %s is offline" % t.name) + t.online = False + else: + t.online = True + except Exception: + self.log.exception("Unable to check status of %s" % t.name) + t.online = False + def reconfigureCrons(self, config): cron_map = { 'image-update': self._doUpdateImages, @@ -809,22 +823,29 @@ class NodePool(threading.Thread): # Make sure that the current demand includes at least the # configured min_ready values total_image_min_ready = {} + online_targets = set() for target in self.config.targets.values(): + if not target.online: + continue + online_targets.add(target.name) for image in target.images.values(): min_ready = total_image_min_ready.get(image.name, 0) min_ready += image.min_ready total_image_min_ready[image.name] = min_ready + def count_nodes(image_name, state): + nodes = session.getNodes(image_name=image_name, + state=state) + return len([n for n in nodes + if n.target_name in online_targets]) + # Actual need is demand - (ready + building) for image_name in total_image_min_ready: start_demand = image_demand.get(image_name, 0) min_demand = max(start_demand, total_image_min_ready[image_name]) - n_ready = len(session.getNodes(image_name=image_name, - state=nodedb.READY)) - n_building = len(session.getNodes(image_name=image_name, - state=nodedb.BUILDING)) - n_test = len(session.getNodes(image_name=image_name, - state=nodedb.TEST)) + n_ready = count_nodes(image_name, nodedb.READY) + n_building = count_nodes(image_name, nodedb.BUILDING) + n_test = count_nodes(image_name, nodedb.TEST) ready = n_ready + n_building + n_test demand = max(min_demand - ready, 0) image_demand[image_name] = demand @@ -851,6 +872,8 @@ class NodePool(threading.Thread): allocation_requests 
= {} # Set up the request values in the allocation system for target in self.config.targets.values(): + if not target.online: + continue at = allocation.AllocationTarget(target.name) for image in target.images.values(): ar = allocation_requests.get(image.name) @@ -915,6 +938,8 @@ class NodePool(threading.Thread): self.checkForMissingImages(session) nodes_to_launch = self.getNeededNodes(session) for target in self.config.targets.values(): + if not target.online: + continue self.log.debug("Examining target: %s" % target.name) for image in target.images.values(): for provider in image.providers.values(): diff --git a/tools/fake-servers.py b/tools/fake-servers.py index de0c6171b..acd5fed5c 100644 --- a/tools/fake-servers.py +++ b/tools/fake-servers.py @@ -31,9 +31,9 @@ import gear class MyGearmanServer(gear.Server): def handleStatus(self, request): - request.connection.conn.send(("fake_job\t%s\t0\t0\n" % + request.connection.conn.send(("build:fake_job\t%s\t0\t0\n" % self._count).encode('utf8')) - request.connection.conn.send(("fake_job:nodepool-fake\t%s\t0\t0\n" % + request.connection.conn.send(("build:fake_job:devstack-precise\t%s\t0\t0\n" % 0).encode('utf8')) request.connection.conn.send(b'.\n')