Support UUID as builder identifier

We currently use the hostname of the builder host as the
identifier for the owner of built images. This value is
recorded in ZooKeeper and is used to make sure that only
the owner of an image build ever deletes the entry from
ZooKeeper.

Using hostname can be problematic if it ever changes. It will
cause orphaned builds that will not get deleted. This change
allows us to use a UUID as the identifier, deprecating use of
hostname altogether (although we continue storing that info).
The UUID will be stored in a file in the images directory so
that it may persist across nodepool-builder restarts.

In order to help with transitioning existing builders to using
UUID instead of hostname, the code will always compare the UUID
value AND the hostname so that existing ZK entries will be
matched until they age away.

Change-Id: Ifafbab9fb0f41564cc1af595586fa7353ce1d0d0
This commit is contained in:
David Shrewsbury 2017-07-17 12:04:24 -04:00
parent 42e15e2150
commit fcc68b8fd6
5 changed files with 82 additions and 14 deletions

View File

@ -16,6 +16,12 @@ and ``providers`` sections::
providers: providers:
... ...
.. note:: The builder daemon creates a UUID to uniquely identify itself and
to mark image builds in ZooKeeper that it owns. This file will be
named ``builder_id.txt`` and will live in the directory named by the
:ref:`images-dir` option. If this file does not exist, it will be
created on builder startup and a UUID will be created automatically.
The following sections are available. All are required unless The following sections are available. All are required unless
otherwise indicated. otherwise indicated.
@ -46,6 +52,8 @@ Example::
elements-dir: /path/to/elements/dir elements-dir: /path/to/elements/dir
.. _images-dir:
images-dir images-dir
---------- ----------

View File

@ -21,6 +21,7 @@ import subprocess
import threading import threading
import time import time
import shlex import shlex
import uuid
from nodepool import config as nodepool_config from nodepool import config as nodepool_config
from nodepool import exceptions from nodepool import exceptions
@ -107,7 +108,7 @@ class DibImageFile(object):
class BaseWorker(threading.Thread): class BaseWorker(threading.Thread):
def __init__(self, config_path, interval, zk): def __init__(self, builder_id, config_path, interval, zk):
super(BaseWorker, self).__init__() super(BaseWorker, self).__init__()
self.log = logging.getLogger("nodepool.builder.BaseWorker") self.log = logging.getLogger("nodepool.builder.BaseWorker")
self.daemon = True self.daemon = True
@ -118,6 +119,7 @@ class BaseWorker(threading.Thread):
self._hostname = socket.gethostname() self._hostname = socket.gethostname()
self._statsd = stats.get_client() self._statsd = stats.get_client()
self._interval = interval self._interval = interval
self._builder_id = builder_id
def _checkForZooKeeperChanges(self, new_config): def _checkForZooKeeperChanges(self, new_config):
''' '''
@ -144,8 +146,9 @@ class CleanupWorker(BaseWorker):
and any local DIB builds. and any local DIB builds.
''' '''
def __init__(self, name, config_path, interval, zk): def __init__(self, name, builder_id, config_path, interval, zk):
super(CleanupWorker, self).__init__(config_path, interval, zk) super(CleanupWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name) self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name)
self.name = 'CleanupWorker.%s' % name self.name = 'CleanupWorker.%s' % name
@ -236,7 +239,13 @@ class CleanupWorker(BaseWorker):
# NOTE(pabelanger): It is possible we don't have any files because # NOTE(pabelanger): It is possible we don't have any files because
# diskimage-builder failed. So, check to see if we have the correct # diskimage-builder failed. So, check to see if we have the correct
# builder so we can removed the data from zookeeper. # builder so we can removed the data from zookeeper.
if build.builder == self._hostname:
# To maintain backward compatibility with builders that didn't
# use unique builder IDs before, but do now, always compare to
# hostname as well since some ZK data may still reference that.
if (build.builder_id == self._builder_id or
build.builder == self._hostname
):
return True return True
return False return False
@ -510,8 +519,9 @@ class CleanupWorker(BaseWorker):
class BuildWorker(BaseWorker): class BuildWorker(BaseWorker):
def __init__(self, name, config_path, interval, zk, dib_cmd): def __init__(self, name, builder_id, config_path, interval, zk, dib_cmd):
super(BuildWorker, self).__init__(config_path, interval, zk) super(BuildWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name) self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name)
self.name = 'BuildWorker.%s' % name self.name = 'BuildWorker.%s' % name
self.dib_cmd = dib_cmd self.dib_cmd = dib_cmd
@ -573,6 +583,7 @@ class BuildWorker(BaseWorker):
data = zk.ImageBuild() data = zk.ImageBuild()
data.state = zk.BUILDING data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname data.builder = self._hostname
data.formats = list(diskimage.image_types) data.formats = list(diskimage.image_types)
@ -623,6 +634,7 @@ class BuildWorker(BaseWorker):
data = zk.ImageBuild() data = zk.ImageBuild()
data.state = zk.BUILDING data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname data.builder = self._hostname
data.formats = list(diskimage.image_types) data.formats = list(diskimage.image_types)
@ -712,6 +724,7 @@ class BuildWorker(BaseWorker):
time.sleep(SUSPEND_WAIT_TIME) time.sleep(SUSPEND_WAIT_TIME)
build_data = zk.ImageBuild() build_data = zk.ImageBuild()
build_data.builder_id = self._builder_id
build_data.builder = self._hostname build_data.builder = self._hostname
if self._zk.didLoseConnection: if self._zk.didLoseConnection:
@ -778,8 +791,9 @@ class BuildWorker(BaseWorker):
class UploadWorker(BaseWorker): class UploadWorker(BaseWorker):
def __init__(self, name, config_path, interval, zk): def __init__(self, name, builder_id, config_path, interval, zk):
super(UploadWorker, self).__init__(config_path, interval, zk) super(UploadWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name) self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name)
self.name = 'UploadWorker.%s' % name self.name = 'UploadWorker.%s' % name
@ -1055,6 +1069,17 @@ class NodePoolBuilder(object):
# Private methods # Private methods
#======================================================================= #=======================================================================
def _getBuilderID(self, id_file):
if not os.path.exists(id_file):
with open(id_file, "w") as f:
builder_id = str(uuid.uuid4())
f.write(builder_id)
return builder_id
with open(id_file, "r") as f:
builder_id = f.read()
return builder_id
def _getAndValidateConfig(self): def _getAndValidateConfig(self):
config = nodepool_config.loadConfig(self._config_path) config = nodepool_config.loadConfig(self._config_path)
if not config.zookeeper_servers.values(): if not config.zookeeper_servers.values():
@ -1082,6 +1107,10 @@ class NodePoolBuilder(object):
self._config = self._getAndValidateConfig() self._config = self._getAndValidateConfig()
self._running = True self._running = True
builder_id_file = os.path.join(self._config.imagesdir,
"builder_id.txt")
builder_id = self._getBuilderID(builder_id_file)
# All worker threads share a single ZooKeeper instance/connection. # All worker threads share a single ZooKeeper instance/connection.
self.zk = zk.ZooKeeper() self.zk = zk.ZooKeeper()
self.zk.connect(list(self._config.zookeeper_servers.values())) self.zk.connect(list(self._config.zookeeper_servers.values()))
@ -1090,20 +1119,21 @@ class NodePoolBuilder(object):
# Create build and upload worker objects # Create build and upload worker objects
for i in range(self._num_builders): for i in range(self._num_builders):
w = BuildWorker(i, self._config_path, self.build_interval, w = BuildWorker(i, builder_id, self._config_path,
self.zk, self.dib_cmd) self.build_interval, self.zk, self.dib_cmd)
w.start() w.start()
self._build_workers.append(w) self._build_workers.append(w)
for i in range(self._num_uploaders): for i in range(self._num_uploaders):
w = UploadWorker(i, self._config_path, self.upload_interval, w = UploadWorker(i, builder_id, self._config_path,
self.zk) self.upload_interval, self.zk)
w.start() w.start()
self._upload_workers.append(w) self._upload_workers.append(w)
if self.cleanup_interval > 0: if self.cleanup_interval > 0:
self._janitor = CleanupWorker( self._janitor = CleanupWorker(
0, self._config_path, self.cleanup_interval, self.zk) 0, builder_id, self._config_path,
self.cleanup_interval, self.zk)
self._janitor.start() self._janitor.start()
# Wait until all threads are running. Otherwise, we have a race # Wait until all threads are running. Otherwise, we have a race

View File

@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
import os import os
import uuid
import fixtures import fixtures
from nodepool import builder, exceptions, fakeprovider, tests from nodepool import builder, exceptions, fakeprovider, tests
@ -95,6 +96,18 @@ class TestNodePoolBuilder(tests.DBTestCase):
nb.start() nb.start()
nb.stop() nb.stop()
def test_builder_id_file(self):
configfile = self.setup_config('node.yaml')
self._useBuilder(configfile)
path = os.path.join(self._config_images_dir.path, 'builder_id.txt')
# Validate the unique ID file exists and contents are what we expect
self.assertTrue(os.path.exists(path))
with open(path, "r") as f:
the_id = f.read()
obj = uuid.UUID(the_id, version=4)
self.assertEqual(the_id, str(obj))
def test_image_upload_fail(self): def test_image_upload_fail(self):
"""Test that image upload fails are handled properly.""" """Test that image upload fails are handled properly."""

View File

@ -127,12 +127,14 @@ class TestZooKeeper(tests.DBTestCase):
image = "ubuntu-trusty" image = "ubuntu-trusty"
orig_data = zk.ImageBuild() orig_data = zk.ImageBuild()
orig_data.builder = 'host' orig_data.builder = 'host'
orig_data.builder_id = 'ABC-123'
orig_data.state = zk.READY orig_data.state = zk.READY
with self.zk.imageBuildLock(image, blocking=True, timeout=1): with self.zk.imageBuildLock(image, blocking=True, timeout=1):
build_num = self.zk.storeBuild(image, orig_data) build_num = self.zk.storeBuild(image, orig_data)
data = self.zk.getBuild(image, build_num) data = self.zk.getBuild(image, build_num)
self.assertEqual(orig_data.builder, data.builder) self.assertEqual(orig_data.builder, data.builder)
self.assertEqual(orig_data.builder_id, data.builder_id)
self.assertEqual(orig_data.state, data.state) self.assertEqual(orig_data.state, data.state)
self.assertEqual(orig_data.state_time, data.state_time) self.assertEqual(orig_data.state_time, data.state_time)
self.assertEqual(build_num, data.id) self.assertEqual(build_num, data.id)
@ -666,6 +668,7 @@ class TestZKModel(tests.BaseTestCase):
o = zk.ImageBuild('0001') o = zk.ImageBuild('0001')
o.state = zk.BUILDING o.state = zk.BUILDING
o.builder = 'localhost' o.builder = 'localhost'
o.builder_id = 'ABC-123'
o.formats = ['qemu', 'raw'] o.formats = ['qemu', 'raw']
d = o.toDict() d = o.toDict()
@ -674,12 +677,14 @@ class TestZKModel(tests.BaseTestCase):
self.assertIsNotNone(d['state_time']) self.assertIsNotNone(d['state_time'])
self.assertEqual(','.join(o.formats), d['formats']) self.assertEqual(','.join(o.formats), d['formats'])
self.assertEqual(o.builder, d['builder']) self.assertEqual(o.builder, d['builder'])
self.assertEqual(o.builder_id, d['builder_id'])
def test_ImageBuild_fromDict(self): def test_ImageBuild_fromDict(self):
now = int(time.time()) now = int(time.time())
d_id = '0001' d_id = '0001'
d = { d = {
'builder': 'localhost', 'builder': 'localhost',
'builder_id': 'ABC-123',
'formats': 'qemu,raw', 'formats': 'qemu,raw',
'state': zk.BUILDING, 'state': zk.BUILDING,
'state_time': now 'state_time': now
@ -690,6 +695,7 @@ class TestZKModel(tests.BaseTestCase):
self.assertEqual(o.state, d['state']) self.assertEqual(o.state, d['state'])
self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.state_time, d['state_time'])
self.assertEqual(o.builder, d['builder']) self.assertEqual(o.builder, d['builder'])
self.assertEqual(o.builder_id, d['builder_id'])
self.assertEqual(o.formats, d['formats'].split(',')) self.assertEqual(o.formats, d['formats'].split(','))
def test_ImageUpload_toDict(self): def test_ImageUpload_toDict(self):

View File

@ -189,13 +189,21 @@ class BaseModel(object):
class ImageBuild(BaseModel): class ImageBuild(BaseModel):
''' '''
Class representing a DIB image build within the ZooKeeper cluster. Class representing a DIB image build within the ZooKeeper cluster.
Note that the 'builder' attribute used to be used to uniquely identify
the owner of an image build in ZooKeeper. Because hostname was used, if
it ever changed, then we would get orphaned znodes. The 'builder_id'
attribute was added as a replacement, keeping 'builder' to mean the
same thing (which is why this attribute is not called 'hostname' or
similar).
''' '''
VALID_STATES = set([BUILDING, READY, DELETING, FAILED]) VALID_STATES = set([BUILDING, READY, DELETING, FAILED])
def __init__(self, build_id=None): def __init__(self, build_id=None):
super(ImageBuild, self).__init__(build_id) super(ImageBuild, self).__init__(build_id)
self._formats = [] self._formats = []
self.builder = None # Builder hostname self.builder = None # Hostname
self.builder_id = None # Unique ID
def __repr__(self): def __repr__(self):
d = self.toDict() d = self.toDict()
@ -223,6 +231,8 @@ class ImageBuild(BaseModel):
d = super(ImageBuild, self).toDict() d = super(ImageBuild, self).toDict()
if self.builder is not None: if self.builder is not None:
d['builder'] = self.builder d['builder'] = self.builder
if self.builder_id is not None:
d['builder_id'] = self.builder_id
if len(self.formats): if len(self.formats):
d['formats'] = ','.join(self.formats) d['formats'] = ','.join(self.formats)
return d return d
@ -240,6 +250,7 @@ class ImageBuild(BaseModel):
o = ImageBuild(o_id) o = ImageBuild(o_id)
super(ImageBuild, o).fromDict(d) super(ImageBuild, o).fromDict(d)
o.builder = d.get('builder') o.builder = d.get('builder')
o.builder_id = d.get('builder_id')
# Only attempt the split on non-empty string # Only attempt the split on non-empty string
if d.get('formats', ''): if d.get('formats', ''):
o.formats = d.get('formats', '').split(',') o.formats = d.get('formats', '').split(',')