Support UUID as builder identifier
We currently use the hostname of the builder host as the identifier for the
owner of built images. This value is recorded in ZooKeeper and is used to
ensure that only the owner of an image build ever deletes the entry from
ZooKeeper. Using the hostname is problematic if it ever changes: builds
owned under the old name are orphaned and never deleted.

This change allows us to use a UUID as the identifier instead, deprecating
the use of hostname altogether (although we continue storing that info).
The UUID is stored in a file in the images directory so that it persists
across nodepool-builder restarts.

To ease the transition of existing builders from hostname to UUID, the code
always compares the UUID value AND the hostname, so that existing ZK entries
are still matched until they age away.

Change-Id: Ifafbab9fb0f41564cc1af595586fa7353ce1d0d0
parent 42e15e2150
commit fcc68b8fd6
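
In outline, the change is two pieces: a UUID that is generated once and
persisted in the images directory, and an ownership check that accepts
either identifier. A minimal standalone sketch (simplified names; the real
implementation is in the diff below)::

    import os
    import uuid

    def get_builder_id(id_file):
        # Generate and persist a UUID on first startup; reuse it afterwards.
        if not os.path.exists(id_file):
            builder_id = str(uuid.uuid4())
            with open(id_file, "w") as f:
                f.write(builder_id)
            return builder_id
        with open(id_file, "r") as f:
            return f.read()

    def owns_build(build, builder_id, hostname):
        # Match on the UUID, but also on hostname so that pre-UUID
        # ZooKeeper entries are still recognized until they age away.
        return build.builder_id == builder_id or build.builder == hostname
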
@@ -16,6 +16,12 @@ and ``providers`` sections::
   providers:
     ...
 
+.. note:: The builder daemon creates a UUID to uniquely identify itself and
+   to mark image builds in ZooKeeper that it owns. This file will be
+   named ``builder_id.txt`` and will live in the directory named by the
+   :ref:`images-dir` option. If this file does not exist, it will be
+   created on builder startup and a UUID will be created automatically.
+
 The following sections are available. All are required unless
 otherwise indicated.

@@ -46,6 +52,8 @@ Example::
 
   elements-dir: /path/to/elements/dir
 
+.. _images-dir:
+
 images-dir
 ----------
 
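For reference, ``builder_id.txt`` holds nothing but the UUID itself; the
writer does not append a newline, so the file reads back verbatim. An
example value (illustrative only)::

    cbd34e4f-2b2b-4b49-8f34-0f8c9e3a1b2c
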
@@ -21,6 +21,7 @@ import subprocess
 import threading
 import time
 import shlex
+import uuid
 
 from nodepool import config as nodepool_config
 from nodepool import exceptions

@@ -107,7 +108,7 @@ class DibImageFile(object):
 
 
 class BaseWorker(threading.Thread):
-    def __init__(self, config_path, interval, zk):
+    def __init__(self, builder_id, config_path, interval, zk):
         super(BaseWorker, self).__init__()
         self.log = logging.getLogger("nodepool.builder.BaseWorker")
         self.daemon = True

@@ -118,6 +119,7 @@ class BaseWorker(threading.Thread):
         self._hostname = socket.gethostname()
         self._statsd = stats.get_client()
         self._interval = interval
+        self._builder_id = builder_id
 
     def _checkForZooKeeperChanges(self, new_config):
         '''

@@ -144,8 +146,9 @@ class CleanupWorker(BaseWorker):
     and any local DIB builds.
     '''
 
-    def __init__(self, name, config_path, interval, zk):
-        super(CleanupWorker, self).__init__(config_path, interval, zk)
+    def __init__(self, name, builder_id, config_path, interval, zk):
+        super(CleanupWorker, self).__init__(builder_id, config_path,
+                                            interval, zk)
         self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name)
         self.name = 'CleanupWorker.%s' % name
 
@@ -236,7 +239,13 @@ class CleanupWorker(BaseWorker):
         # NOTE(pabelanger): It is possible we don't have any files because
         # diskimage-builder failed. So, check to see if we have the correct
         # builder so we can removed the data from zookeeper.
-        if build.builder == self._hostname:
+
+        # To maintain backward compatibility with builders that didn't
+        # use unique builder IDs before, but do now, always compare to
+        # hostname as well since some ZK data may still reference that.
+        if (build.builder_id == self._builder_id or
+            build.builder == self._hostname
+        ):
             return True
         return False
 
@@ -510,8 +519,9 @@ class CleanupWorker(BaseWorker):
 
 
 class BuildWorker(BaseWorker):
-    def __init__(self, name, config_path, interval, zk, dib_cmd):
-        super(BuildWorker, self).__init__(config_path, interval, zk)
+    def __init__(self, name, builder_id, config_path, interval, zk, dib_cmd):
+        super(BuildWorker, self).__init__(builder_id, config_path,
+                                          interval, zk)
         self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name)
         self.name = 'BuildWorker.%s' % name
         self.dib_cmd = dib_cmd

@@ -573,6 +583,7 @@ class BuildWorker(BaseWorker):
 
         data = zk.ImageBuild()
         data.state = zk.BUILDING
+        data.builder_id = self._builder_id
         data.builder = self._hostname
         data.formats = list(diskimage.image_types)
 
@@ -623,6 +634,7 @@ class BuildWorker(BaseWorker):
 
         data = zk.ImageBuild()
         data.state = zk.BUILDING
+        data.builder_id = self._builder_id
         data.builder = self._hostname
         data.formats = list(diskimage.image_types)
 
@@ -712,6 +724,7 @@ class BuildWorker(BaseWorker):
             time.sleep(SUSPEND_WAIT_TIME)
 
         build_data = zk.ImageBuild()
+        build_data.builder_id = self._builder_id
         build_data.builder = self._hostname
 
         if self._zk.didLoseConnection:

@@ -778,8 +791,9 @@ class BuildWorker(BaseWorker):
 
 
 class UploadWorker(BaseWorker):
-    def __init__(self, name, config_path, interval, zk):
-        super(UploadWorker, self).__init__(config_path, interval, zk)
+    def __init__(self, name, builder_id, config_path, interval, zk):
+        super(UploadWorker, self).__init__(builder_id, config_path,
+                                           interval, zk)
         self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name)
         self.name = 'UploadWorker.%s' % name
 
@@ -1055,6 +1069,17 @@ class NodePoolBuilder(object):
     # Private methods
     #=======================================================================
 
+    def _getBuilderID(self, id_file):
+        if not os.path.exists(id_file):
+            with open(id_file, "w") as f:
+                builder_id = str(uuid.uuid4())
+                f.write(builder_id)
+            return builder_id
+
+        with open(id_file, "r") as f:
+            builder_id = f.read()
+        return builder_id
+
     def _getAndValidateConfig(self):
         config = nodepool_config.loadConfig(self._config_path)
         if not config.zookeeper_servers.values():

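One design note on ``_getBuilderID``: the read path returns the file
contents verbatim, so the ID stays stable only because the write path never
appends a newline; editing the file by hand and leaving a trailing newline
would effectively change the builder's identity. A quick round-trip check
using the sketch above (hypothetical paths, illustration only)::

    import os
    import tempfile

    images_dir = tempfile.mkdtemp()    # stands in for the images-dir option
    id_file = os.path.join(images_dir, "builder_id.txt")

    first = get_builder_id(id_file)    # creates the file with a fresh UUID
    second = get_builder_id(id_file)   # re-reads the same value
    assert first == second
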
@@ -1082,6 +1107,10 @@ class NodePoolBuilder(object):
         self._config = self._getAndValidateConfig()
         self._running = True
 
+        builder_id_file = os.path.join(self._config.imagesdir,
+                                       "builder_id.txt")
+        builder_id = self._getBuilderID(builder_id_file)
+
         # All worker threads share a single ZooKeeper instance/connection.
         self.zk = zk.ZooKeeper()
         self.zk.connect(list(self._config.zookeeper_servers.values()))

@@ -1090,20 +1119,21 @@ class NodePoolBuilder(object):
 
         # Create build and upload worker objects
         for i in range(self._num_builders):
-            w = BuildWorker(i, self._config_path, self.build_interval,
-                            self.zk, self.dib_cmd)
+            w = BuildWorker(i, builder_id, self._config_path,
+                            self.build_interval, self.zk, self.dib_cmd)
             w.start()
             self._build_workers.append(w)
 
         for i in range(self._num_uploaders):
-            w = UploadWorker(i, self._config_path, self.upload_interval,
-                             self.zk)
+            w = UploadWorker(i, builder_id, self._config_path,
+                             self.upload_interval, self.zk)
             w.start()
             self._upload_workers.append(w)
 
         if self.cleanup_interval > 0:
             self._janitor = CleanupWorker(
-                0, self._config_path, self.cleanup_interval, self.zk)
+                0, builder_id, self._config_path,
+                self.cleanup_interval, self.zk)
             self._janitor.start()
 
         # Wait until all threads are running. Otherwise, we have a race

@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+import uuid
 import fixtures
 
 from nodepool import builder, exceptions, fakeprovider, tests

@@ -95,6 +96,18 @@ class TestNodePoolBuilder(tests.DBTestCase):
         nb.start()
         nb.stop()
 
+    def test_builder_id_file(self):
+        configfile = self.setup_config('node.yaml')
+        self._useBuilder(configfile)
+        path = os.path.join(self._config_images_dir.path, 'builder_id.txt')
+
+        # Validate the unique ID file exists and contents are what we expect
+        self.assertTrue(os.path.exists(path))
+        with open(path, "r") as f:
+            the_id = f.read()
+            obj = uuid.UUID(the_id, version=4)
+            self.assertEqual(the_id, str(obj))
+
     def test_image_upload_fail(self):
         """Test that image upload fails are handled properly."""
 
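The round trip through ``str()`` in this test is doing real work:
``uuid.UUID`` accepts several spellings of the same value (uppercase,
braces, URN form), so comparing the file contents against the canonical
form is what actually pins down how the ID was written. For illustration::

    import uuid

    s = "0F8C9E3A-1B2C-4D5E-8F90-A1B2C3D4E5F6"
    u = uuid.UUID(s, version=4)
    print(str(u) == s)          # False: str() is the lowercase canonical form
    print(str(u) == s.lower())  # True
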
@@ -127,12 +127,14 @@ class TestZooKeeper(tests.DBTestCase):
         image = "ubuntu-trusty"
         orig_data = zk.ImageBuild()
         orig_data.builder = 'host'
+        orig_data.builder_id = 'ABC-123'
         orig_data.state = zk.READY
         with self.zk.imageBuildLock(image, blocking=True, timeout=1):
             build_num = self.zk.storeBuild(image, orig_data)
 
         data = self.zk.getBuild(image, build_num)
         self.assertEqual(orig_data.builder, data.builder)
+        self.assertEqual(orig_data.builder_id, data.builder_id)
         self.assertEqual(orig_data.state, data.state)
         self.assertEqual(orig_data.state_time, data.state_time)
         self.assertEqual(build_num, data.id)

@@ -666,6 +668,7 @@ class TestZKModel(tests.BaseTestCase):
         o = zk.ImageBuild('0001')
         o.state = zk.BUILDING
         o.builder = 'localhost'
+        o.builder_id = 'ABC-123'
         o.formats = ['qemu', 'raw']
 
         d = o.toDict()

@@ -674,12 +677,14 @@ class TestZKModel(tests.BaseTestCase):
         self.assertIsNotNone(d['state_time'])
         self.assertEqual(','.join(o.formats), d['formats'])
         self.assertEqual(o.builder, d['builder'])
+        self.assertEqual(o.builder_id, d['builder_id'])
 
     def test_ImageBuild_fromDict(self):
         now = int(time.time())
         d_id = '0001'
         d = {
             'builder': 'localhost',
+            'builder_id': 'ABC-123',
             'formats': 'qemu,raw',
             'state': zk.BUILDING,
             'state_time': now

@@ -690,6 +695,7 @@ class TestZKModel(tests.BaseTestCase):
         self.assertEqual(o.state, d['state'])
         self.assertEqual(o.state_time, d['state_time'])
         self.assertEqual(o.builder, d['builder'])
+        self.assertEqual(o.builder_id, d['builder_id'])
         self.assertEqual(o.formats, d['formats'].split(','))
 
     def test_ImageUpload_toDict(self):

@@ -189,13 +189,21 @@ class BaseModel(object):
 class ImageBuild(BaseModel):
     '''
     Class representing a DIB image build within the ZooKeeper cluster.
+
+    Note that the 'builder' attribute used to be used to uniquely identify
+    the owner of an image build in ZooKeeper. Because hostname was used, if
+    it ever changed, then we would get orphaned znodes. The 'builder_id'
+    attribute was added as a replacement, keeping 'builder' to mean the
+    same thing (which is why this attribute is not called 'hostname' or
+    similar).
     '''
     VALID_STATES = set([BUILDING, READY, DELETING, FAILED])
 
     def __init__(self, build_id=None):
         super(ImageBuild, self).__init__(build_id)
         self._formats = []
-        self.builder = None      # Builder hostname
+        self.builder = None      # Hostname
+        self.builder_id = None   # Unique ID
 
     def __repr__(self):
         d = self.toDict()

@@ -223,6 +231,8 @@ class ImageBuild(BaseModel):
         d = super(ImageBuild, self).toDict()
         if self.builder is not None:
             d['builder'] = self.builder
+        if self.builder_id is not None:
+            d['builder_id'] = self.builder_id
         if len(self.formats):
             d['formats'] = ','.join(self.formats)
         return d

@@ -240,6 +250,7 @@ class ImageBuild(BaseModel):
         o = ImageBuild(o_id)
         super(ImageBuild, o).fromDict(d)
         o.builder = d.get('builder')
+        o.builder_id = d.get('builder_id')
         # Only attempt the split on non-empty string
         if d.get('formats', ''):
             o.formats = d.get('formats', '').split(',')

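One point worth spelling out: because ``fromDict`` uses
``d.get('builder_id')``, entries written by pre-UUID builders come back
with ``builder_id = None``, and ownership then falls through to the
hostname comparison in ``CleanupWorker``. A toy demonstration (stand-in
dicts, not real ZK data)::

    legacy = {'builder': 'old-host'}                           # pre-UUID entry
    modern = {'builder': 'new-host', 'builder_id': 'ABC-123'}  # new entry

    def owns(entry, builder_id, hostname):
        # Mirrors the CleanupWorker check: UUID first, hostname fallback.
        return (entry.get('builder_id') == builder_id or
                entry.get('builder') == hostname)

    print(owns(legacy, 'ABC-123', 'old-host'))  # True, via hostname
    print(owns(modern, 'ABC-123', 'renamed'))   # True, via UUID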