diff --git a/.gitignore b/.gitignore index 9cce615c2..26e93f51e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,5 @@ doc/build/* zuul/versioninfo dist/ venv/ -nodepool.yaml *~ .*.swp diff --git a/.gitreview b/.gitreview index b41fcef7f..9efbf512f 100644 --- a/.gitreview +++ b/.gitreview @@ -2,3 +2,4 @@ host=review.openstack.org port=29418 project=openstack-infra/nodepool.git + diff --git a/.zuul.yaml b/.zuul.yaml index 180d54611..1904788ff 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -1,26 +1,3 @@ -- job: - name: nodepool-functional - parent: legacy-dsvm-base - run: playbooks/nodepool-functional/run.yaml - post-run: playbooks/nodepool-functional/post.yaml - timeout: 5400 - required-projects: - - openstack-infra/devstack-gate - - openstack-infra/nodepool - -- job: - name: nodepool-functional-src - parent: legacy-dsvm-base - run: playbooks/nodepool-functional-src/run.yaml - post-run: playbooks/nodepool-functional-src/post.yaml - timeout: 5400 - required-projects: - - openstack-infra/devstack-gate - - openstack-infra/glean - - openstack-infra/nodepool - - openstack-infra/shade - - openstack/diskimage-builder - - job: name: nodepool-functional-py35 parent: legacy-dsvm-base @@ -44,16 +21,23 @@ - openstack-infra/shade - openstack/diskimage-builder +- job: + name: nodepool-zuul-functional + parent: legacy-base + run: playbooks/nodepool-zuul-functional/run.yaml + post-run: playbooks/nodepool-zuul-functional/post.yaml + timeout: 1800 + required-projects: + - openstack-infra/nodepool + - openstack-infra/zuul + - project: - name: openstack-infra/nodepool check: jobs: + - tox-docs + - tox-cover - tox-pep8 - - tox-py27 - - nodepool-functional: - voting: false - - nodepool-functional-src: - voting: false + - tox-py35 - nodepool-functional-py35: voting: false - nodepool-functional-py35-src: @@ -61,7 +45,7 @@ gate: jobs: - tox-pep8 - - tox-py27 - post: + - tox-py35 + experimental: jobs: - - publish-openstack-python-branch-tarball + - nodepool-zuul-functional diff --git a/README.rst b/README.rst index 4a99ffbc3..73a12e9dd 100644 --- a/README.rst +++ b/README.rst @@ -47,29 +47,6 @@ If the cloud being used has no default_floating_pool defined in nova.conf, you will need to define a pool name using the nodepool yaml file to use floating ips. - -Set up database for interactive testing: - -.. code-block:: bash - - mysql -u root - - mysql> create database nodepool; - mysql> GRANT ALL ON nodepool.* TO 'nodepool'@'localhost'; - mysql> flush privileges; - -Set up database for unit tests: - -.. code-block:: bash - - mysql -u root - mysql> grant all privileges on *.* to 'openstack_citest'@'localhost' identified by 'openstack_citest' with grant option; - mysql> flush privileges; - mysql> create database openstack_citest; - -Note that the script tools/test-setup.sh can be used for the step -above. - Export variable for your ssh key so you can log into the created instances: .. code-block:: bash @@ -83,7 +60,7 @@ to contain your data): export STATSD_HOST=127.0.0.1 export STATSD_PORT=8125 - nodepoold -d -c tools/fake.yaml + nodepool-launcher -d -c tools/fake.yaml All logging ends up in stdout. @@ -92,9 +69,3 @@ Use the following tool to check on progress: .. code-block:: bash nodepool image-list - -After each run (the fake nova provider is only in-memory): - -.. 
code-block:: bash - - mysql> delete from snapshot_image; delete from node; diff --git a/bindep.txt b/bindep.txt index 426c5db61..51c1b57a4 100644 --- a/bindep.txt +++ b/bindep.txt @@ -1,8 +1,8 @@ # This is a cross-platform list tracking distribution packages needed by tests; # see http://docs.openstack.org/infra/bindep/ for additional information. -mysql-client [test] -mysql-server [test] +libffi-devel [platform:rpm] +libffi-dev [platform:dpkg] python-dev [platform:dpkg test] python-devel [platform:rpm test] zookeeperd [platform:dpkg test] diff --git a/devstack/files/debs/nodepool b/devstack/files/debs/nodepool index 1d8877b47..fe8b87512 100644 --- a/devstack/files/debs/nodepool +++ b/devstack/files/debs/nodepool @@ -3,6 +3,3 @@ kpartx debootstrap yum-utils zookeeperd -zypper -# workarond for https://bugs.launchpad.net/ubuntu/+source/zypper/+bug/1639428 -gnupg2 diff --git a/devstack/plugin.sh b/devstack/plugin.sh index 066f9679f..6051589d7 100644 --- a/devstack/plugin.sh +++ b/devstack/plugin.sh @@ -14,8 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. -NODEPOOL_KEY=$HOME/.ssh/id_nodepool -NODEPOOL_KEY_NAME=root NODEPOOL_PUBKEY=$HOME/.ssh/id_nodepool.pub NODEPOOL_INSTALL=$HOME/nodepool-venv NODEPOOL_CACHE_GET_PIP=/opt/stack/cache/files/get-pip.py @@ -34,7 +32,7 @@ function install_shade { # BUT - install shade into a virtualenv so that we don't have issues # with OpenStack constraints affecting the shade dependency install. # This particularly shows up with os-client-config - $NODEPOOL_INSTALL/bin/pip install -e $DEST/shade + $NODEPOOL_INSTALL/bin/pip install $DEST/shade fi } @@ -45,7 +43,7 @@ function install_diskimage_builder { GITBRANCH["diskimage-builder"]=$DISKIMAGE_BUILDER_REPO_REF git_clone_by_name "diskimage-builder" setup_dev_lib "diskimage-builder" - $NODEPOOL_INSTALL/bin/pip install -e $DEST/diskimage-builder + $NODEPOOL_INSTALL/bin/pip install $DEST/diskimage-builder fi } @@ -56,38 +54,30 @@ function install_glean { GITBRANCH["glean"]=$GLEAN_REPO_REF git_clone_by_name "glean" setup_dev_lib "glean" - $NODEPOOL_INSTALL/bin/pip install -e $DEST/glean + $NODEPOOL_INSTALL/bin/pip install $DEST/glean fi } # Install nodepool code function install_nodepool { - virtualenv $NODEPOOL_INSTALL + if python3_enabled; then + VENV="virtualenv -p python${PYTHON3_VERSION}" + else + VENV="virtualenv -p python${PYTHON2_VERSION}" + fi + $VENV $NODEPOOL_INSTALL install_shade install_diskimage_builder install_glean setup_develop $DEST/nodepool - $NODEPOOL_INSTALL/bin/pip install -e $DEST/nodepool + $NODEPOOL_INSTALL/bin/pip install $DEST/nodepool } # requires some globals from devstack, which *might* not be stable api # points. If things break, investigate changes in those globals first. -function nodepool_create_keypairs { - if [[ ! -f $NODEPOOL_KEY ]]; then - ssh-keygen -f $NODEPOOL_KEY -P "" - fi - - cat > /tmp/ssh_wrapper < /tmp/logging.conf < /tmp/secure.conf << EOF -[database] -# The mysql password here may be different depending on your -# devstack install, you should double check it (the devstack var -# is MYSQL_PASSWORD and if unset devstack should prompt you for -# the value). 
-dburi: $dburi +# Empty EOF sudo mv /tmp/secure.conf $NODEPOOL_SECURE @@ -197,131 +181,129 @@ EOF if [ -f $NODEPOOL_CACHE_GET_PIP ] ; then DIB_GET_PIP="DIB_REPOLOCATION_pip_and_virtualenv: file://$NODEPOOL_CACHE_GET_PIP" fi - if [ -f /etc/ci/mirror_info.sh ] ; then - source /etc/ci/mirror_info.sh + if [ -f /etc/nodepool/provider ] ; then + source /etc/nodepool/provider + + NODEPOOL_MIRROR_HOST=${NODEPOOL_MIRROR_HOST:-mirror.$NODEPOOL_REGION.$NODEPOOL_CLOUD.openstack.org} + NODEPOOL_MIRROR_HOST=$(echo $NODEPOOL_MIRROR_HOST|tr '[:upper:]' '[:lower:]') + + NODEPOOL_CENTOS_MIRROR=${NODEPOOL_CENTOS_MIRROR:-http://$NODEPOOL_MIRROR_HOST/centos} + NODEPOOL_DEBIAN_MIRROR=${NODEPOOL_DEBIAN_MIRROR:-http://$NODEPOOL_MIRROR_HOST/debian} + NODEPOOL_UBUNTU_MIRROR=${NODEPOOL_UBUNTU_MIRROR:-http://$NODEPOOL_MIRROR_HOST/ubuntu} DIB_DISTRIBUTION_MIRROR_CENTOS="DIB_DISTRIBUTION_MIRROR: $NODEPOOL_CENTOS_MIRROR" DIB_DISTRIBUTION_MIRROR_DEBIAN="DIB_DISTRIBUTION_MIRROR: $NODEPOOL_DEBIAN_MIRROR" - DIB_DISTRIBUTION_MIRROR_FEDORA="DIB_DISTRIBUTION_MIRROR: $NODEPOOL_FEDORA_MIRROR" DIB_DISTRIBUTION_MIRROR_UBUNTU="DIB_DISTRIBUTION_MIRROR: $NODEPOOL_UBUNTU_MIRROR" DIB_DEBOOTSTRAP_EXTRA_ARGS="DIB_DEBOOTSTRAP_EXTRA_ARGS: '--no-check-gpg'" fi + NODEPOOL_CENTOS_7_MIN_READY=1 + NODEPOOL_DEBIAN_JESSIE_MIN_READY=1 + # TODO(pabelanger): Remove fedora-25 after fedora-26 is online + NODEPOOL_FEDORA_25_MIN_READY=1 + NODEPOOL_FEDORA_26_MIN_READY=1 + NODEPOOL_UBUNTU_TRUSTY_MIN_READY=1 + NODEPOOL_UBUNTU_XENIAL_MIN_READY=1 + + if $NODEPOOL_PAUSE_CENTOS_7_DIB ; then + NODEPOOL_CENTOS_7_MIN_READY=0 + fi + if $NODEPOOL_PAUSE_DEBIAN_JESSIE_DIB ; then + NODEPOOL_DEBIAN_JESSIE_MIN_READY=0 + fi + if $NODEPOOL_PAUSE_FEDORA_25_DIB ; then + NODEPOOL_FEDORA_25_MIN_READY=0 + fi + if $NODEPOOL_PAUSE_FEDORA_26_DIB ; then + NODEPOOL_FEDORA_26_MIN_READY=0 + fi + if $NODEPOOL_PAUSE_UBUNTU_TRUSTY_DIB ; then + NODEPOOL_UBUNTU_TRUSTY_MIN_READY=0 + fi + if $NODEPOOL_PAUSE_UBUNTU_XENIAL_DIB ; then + NODEPOOL_UBUNTU_XENIAL_MIN_READY=0 + fi + cat > /tmp/nodepool.yaml <`_ - for the syntax. Example:: - - dburi='mysql+pymysql://nodepool@localhost/nodepool' - -**optional** - - While it is possible to run Nodepool without any Jenkins targets, - if Jenkins is used, the `target_name` and `url` are required. The - `user`, `apikey` and `credentials` also may be needed depending on - the Jenkins security settings. - - ``target_name`` - Name of the jenkins target. It needs to match with a target - specified in nodepool.yaml, in order to retrieve its settings. - - ``url`` - Url to the Jenkins REST API. - - ``user`` - Jenkins username. - - ``apikey`` - API key generated by Jenkins (not the user password). - - ``credentials`` - If provided, Nodepool will configure the Jenkins slave to use the Jenkins - credential identified by that ID, otherwise it will use the username and - ssh keys configured in the image. - Nodepool reads its configuration from ``/etc/nodepool/nodepool.yaml`` by default. The configuration file follows the standard YAML syntax with a number of sections defined with top level keys. For example, a full configuration file may have the ``diskimages``, ``labels``, -``providers``, and ``targets`` sections:: +and ``providers`` sections:: diskimages: ... @@ -66,12 +15,29 @@ full configuration file may have the ``diskimages``, ``labels``, ... providers: ... - targets: - ... + +.. note:: The builder daemon creates a UUID to uniquely identify itself and + to mark image builds in ZooKeeper that it owns. 
This file will be + named ``builder_id.txt`` and will live in the directory named by the + :ref:`images-dir` option. If this file does not exist, it will be + created on builder startup and a UUID will be created automatically. The following sections are available. All are required unless otherwise indicated. +.. _webapp-conf: + +webapp +------ + +Define the webapp endpoint port and listen address. + +Example:: + + webapp: + port: 8005 + listen_address: '0.0.0.0' + .. _elements-dir: elements-dir @@ -86,6 +52,8 @@ Example:: elements-dir: /path/to/elements/dir +.. _images-dir: + images-dir ---------- @@ -97,44 +65,6 @@ Example:: images-dir: /path/to/images/dir -cron ----- -This section is optional. - -Nodepool runs several periodic tasks. The ``cleanup`` task deletes -old images and servers which may have encountered errors during their -initial deletion. The ``check`` task attempts to log into each node -that is waiting to be used to make sure that it is still operational. -The following illustrates how to change the schedule for these tasks -and also indicates their default values:: - - cron: - cleanup: '27 */6 * * *' - check: '*/15 * * * *' - -zmq-publishers --------------- -Lists the ZeroMQ endpoints for the Jenkins masters. Nodepool uses -this to receive real-time notification that jobs are running on nodes -or are complete and nodes may be deleted. Example:: - - zmq-publishers: - - tcp://jenkins1.example.com:8888 - - tcp://jenkins2.example.com:8888 - -gearman-servers ---------------- -Lists the Zuul Gearman servers that should be consulted for real-time -demand. Nodepool will use information from these servers to determine -if additional nodes should be created to satisfy current demand. -Example:: - - gearman-servers: - - host: zuul.example.com - port: 4730 - -The ``port`` key is optional (default: 4730). - zookeeper-servers ----------------- Lists the ZooKeeper servers uses for coordinating information between @@ -155,83 +85,54 @@ the supplied root path, is also optional and has no default. labels ------ -Defines the types of nodes that should be created. Maps node types to -the images that are used to back them and the providers that are used -to supply them. Jobs should be written to run on nodes of a certain -label (so targets such as Jenkins don't need to know about what -providers or images are used to create them). Example:: +Defines the types of nodes that should be created. Jobs should be +written to run on nodes of a certain label. Example:: labels: - name: my-precise - image: precise + max-ready-age: 3600 min-ready: 2 - providers: - - name: provider1 - - name: provider2 - name: multi-precise - image: precise - subnodes: 2 min-ready: 2 - ready-script: setup_multinode.sh - providers: - - name: provider1 **required** ``name`` Unique name used to tie jobs to those instances. - ``image`` - Refers to providers images, see :ref:`images`. - - ``providers`` (list) - Required if any nodes should actually be created (e.g., the label is not - currently disabled, see ``min-ready`` below). - **optional** + ``max-ready-age`` (int) + Maximum number of seconds the node shall be in ready state. If + this is exceeded the node will be deleted. A value of 0 disables this. + Defaults to 0. + ``min-ready`` (default: 2) Minimum instances that should be in a ready state. Set to -1 to have the label considered disabled. ``min-ready`` is best-effort based on available capacity and is not a guaranteed allocation. - ``subnodes`` - Used to configure multi-node support. 
If a `subnodes` key is supplied to - an image, it indicates that the specified number of additional nodes of the - same image type should be created and associated with each node for that - image. - - Only one node from each such group will be added to the target, the - subnodes are expected to communicate directly with each other. In the - example above, for each Precise node added to the target system, two - additional nodes will be created and associated with it. - - ``ready-script`` - A script to be used to perform any last minute changes to a node after it - has been launched but before it is put in the READY state to receive jobs. - For more information, see :ref:`scripts`. - .. _diskimages: diskimages ---------- This section lists the images to be built using diskimage-builder. The -name of the diskimage is mapped to the :ref:`images` section of the -provider, to determine which providers should received uploads of each +name of the diskimage is mapped to the :ref:`provider_diskimages` section +of the provider, to determine which providers should received uploads of each image. The diskimage will be built in every format required by the providers with which it is associated. Because Nodepool needs to know which formats to build, if the diskimage will only be built if it appears in at least one provider. To remove a diskimage from the system entirely, remove all associated -entries in :ref:`images` and remove its entry from `diskimages`. All -uploads will be deleted as well as the files on disk. +entries in :ref:`provider_diskimages` and remove its entry from `diskimages`. +All uploads will be deleted as well as the files on disk. Example configuration:: diskimages: - - name: precise + - name: ubuntu-precise pause: False rebuild-age: 86400 elements: @@ -245,6 +146,7 @@ Example configuration:: - growroot - infra-package-needs release: precise + username: zuul env-vars: TMPDIR: /opt/dib_tmp DIB_CHECKSUM: '1' @@ -252,7 +154,7 @@ Example configuration:: DIB_APT_LOCAL_CACHE: '0' DIB_DISABLE_APT_CLEANUP: '1' FS_TYPE: ext3 - - name: xenial + - name: ubuntu-xenial pause: True rebuild-age: 86400 formats: @@ -269,6 +171,7 @@ Example configuration:: - growroot - infra-package-needs release: precise + username: ubuntu env-vars: TMPDIR: /opt/dib_tmp DIB_CHECKSUM: '1' @@ -281,7 +184,8 @@ Example configuration:: **required** ``name`` - Identifier to reference the disk image in :ref:`images` and :ref:`labels`. + Identifier to reference the disk image in :ref:`provider_diskimages` + and :ref:`labels`. **optional** @@ -312,124 +216,124 @@ Example configuration:: ``pause`` (bool) When set to True, nodepool-builder will not build the diskimage. + ``username`` (string) + The username that a consumer should use when connecting onto the node. Defaults + to ``zuul``. + .. _provider: -provider +providers --------- -Lists the OpenStack cloud providers Nodepool should use. Within each -provider, the Nodepool image types are also defined (see -:ref:`images` for details). 
Example:: - - providers: - - name: provider1 - cloud: example - region-name: 'region1' - max-servers: 96 - rate: 1.0 - availability-zones: - - az1 - boot-timeout: 120 - launch-timeout: 900 - template-hostname: 'template-{image.name}-{timestamp}' - ipv6-preferred: False - networks: - - name: 'some-network-name' - images: - - name: trusty - min-ram: 8192 - name-filter: 'something to match' - username: jenkins - user-home: '/home/jenkins' - private-key: /var/lib/jenkins/.ssh/id_rsa - meta: - key: value - key2: value - - name: precise - min-ram: 8192 - username: jenkins - user-home: '/home/jenkins' - private-key: /var/lib/jenkins/.ssh/id_rsa - - name: devstack-trusty - min-ram: 30720 - username: jenkins - private-key: /home/nodepool/.ssh/id_rsa - - name: provider2 - username: 'username' - password: 'password' - auth-url: 'http://auth.provider2.example.com/' - project-name: 'project' - service-type: 'compute' - service-name: 'compute' - region-name: 'region1' - max-servers: 96 - rate: 1.0 - template-hostname: '{image.name}-{timestamp}-nodepool-template' - images: - - name: precise - min-ram: 8192 - username: jenkins - user-home: '/home/jenkins' - private-key: /var/lib/jenkins/.ssh/id_rsa - meta: - key: value - key2: value - -**cloud configuration*** - -**preferred** - - ``cloud`` - There are two methods supported for configuring cloud entries. The preferred - method is to create an ``~/.config/openstack/clouds.yaml`` file containing - your cloud configuration information. Then, use ``cloud`` to refer to a - named entry in that file. - - More information about the contents of `clouds.yaml` can be found in - `the os-client-config documentation `_. - -**compatablity** - - For backwards compatibility reasons, you can also include - portions of the cloud configuration directly in ``nodepool.yaml``. Not all - of the options settable via ``clouds.yaml`` are available. - - ``username`` - - ``password`` - - ``project-id`` OR ``project-name`` - Some clouds may refer to the ``project-id`` as ``tenant-id``. - Some clouds may refer to the ``project-name`` as ``tenant-name``. - - ``auth-url`` - Keystone URL. - - ``image-type`` - Specifies the image type supported by this provider. The disk images built - by diskimage-builder will output an image for each ``image-type`` specified - by a provider using that particular diskimage. - - By default, ``image-type`` is set to the value returned from - ``os-client-config`` and can be omitted in most cases. +Lists the providers Nodepool should use. Each provider is associated to +a driver listed below. **required** ``name`` - ``max-servers`` - Maximum number of servers spawnable on this provider. **optional** - ``availability-zones`` (list) - Without it nodepool will rely on nova to schedule an availability zone. + ``driver`` + Default to *openstack* - If it is provided the value should be a list of availability zone names. - Nodepool will select one at random and provide that to nova. This should - give a good distribution of availability zones being used. If you need more - control of the distribution you can use multiple logical providers each - providing a different list of availabiltiy zones. + ``max-concurrency`` + Maximum number of node requests that this provider is allowed to handle + concurrently. The default, if not specified, is to have no maximum. Since + each node request is handled by a separate thread, this can be useful for + limiting the number of threads used by the nodepool-launcher daemon. 
+ + +OpenStack driver +^^^^^^^^^^^^^^^^ + +Within each OpenStack provider the available Nodepool image types are defined +(see :ref:`provider_diskimages`). + +An OpenStack provider's resources are partitioned into groups called "pools" +(see :ref:`pools` for details), and within a pool, the node types which are +to be made available are listed (see :ref:`pool_labels` for +details). + +Example:: + + providers: + - name: provider1 + driver: openstack + cloud: example + region-name: 'region1' + rate: 1.0 + boot-timeout: 120 + launch-timeout: 900 + launch-retries: 3 + image-name-format: '{image_name}-{timestamp}' + hostname-format: '{label.name}-{provider.name}-{node.id}' + diskimages: + - name: trusty + meta: + key: value + key2: value + - name: precise + - name: devstack-trusty + pools: + - name: main + max-servers: 96 + availability-zones: + - az1 + networks: + - some-network-name + labels: + - name: trusty + min-ram: 8192 + diskimage: trusty + console-log: True + - name: precise + min-ram: 8192 + diskimage: precise + - name: devstack-trusty + min-ram: 8192 + diskimage: devstack-trusty + - name: provider2 + driver: openstack + cloud: example2 + region-name: 'region1' + rate: 1.0 + image-name-format: '{image_name}-{timestamp}' + hostname-format: '{label.name}-{provider.name}-{node.id}' + diskimages: + - name: precise + meta: + key: value + key2: value + pools: + - name: main + max-servers: 96 + labels: + - name: trusty + min-ram: 8192 + diskimage: trusty + - name: precise + min-ram: 8192 + diskimage: precise + - name: devstack-trusty + min-ram: 8192 + diskimage: devstack-trusty + +**required** + + ``cloud`` + Name of a cloud configured in ``clouds.yaml``. + + The instances spawned by nodepool will inherit the default security group + of the project specified in the cloud definition in `clouds.yaml`. This means + that when working with Zuul, for example, SSH traffic (TCP/22) must be allowed + in the project's default security group for Zuul to be able to reach instances. + + More information about the contents of `clouds.yaml` can be found in + `the os-client-config documentation `_. + +**optional** ``boot-timeout`` Once an instance is active, how long to try connecting to the @@ -454,31 +358,22 @@ provider, the Nodepool image types are also defined (see Default None - ``networks`` (dict) - Specify custom Neutron networks that get attached to each - node. Specify the ``name`` of the network (a string). + ``launch-retries`` - ``ipv6-preferred`` - If it is set to True, nodepool will try to find ipv6 in public net first - as the ip address for ssh connection to build snapshot images and create - jenkins slave definition. If ipv6 is not found or the key is not - specified or set to False, ipv4 address will be used. + The number of times to retry launching a server before considering the job + failed. - ``api-timeout`` (compatability) - Timeout for the OpenStack API calls client in seconds. Prefer setting - this in `clouds.yaml` - - ``service-type`` (compatability) - Prefer setting this in `clouds.yaml`. - - ``service-name`` (compatability) - Prefer setting this in `clouds.yaml`. + Default 3. ``region-name`` - ``template-hostname`` + ``hostname-format`` Hostname template to use for the spawned instance. - Default ``template-{image.name}-{timestamp}`` + Default ``{label.name}-{provider.name}-{node.id}`` + + ``image-name-format`` + Format for image names that are uploaded to providers. + Default ``{image_name}-{timestamp}`` ``rate`` In seconds, amount to wait between operations on the provider. 
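The ``hostname-format`` and ``image-name-format`` templates shown above follow Python string-formatting syntax. The snippet below is a rough illustration of how such a template could be expanded; the ``SimpleNamespace`` objects are stand-ins used only for this example, not nodepool classes.

.. code-block:: python

    import time
    from types import SimpleNamespace

    # Stand-in objects purely for illustration.
    label = SimpleNamespace(name='trusty')
    provider = SimpleNamespace(name='provider1')
    node = SimpleNamespace(id=123)

    hostname = '{label.name}-{provider.name}-{node.id}'.format(
        label=label, provider=provider, node=node)
    # -> 'trusty-provider1-123'

    image_name = '{image_name}-{timestamp}'.format(
        image_name='ubuntu-xenial', timestamp=int(time.time()))
    # -> e.g. 'ubuntu-xenial-1500000000'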
@@ -489,12 +384,88 @@ provider, the Nodepool image types are also defined (see OpenStack project and will attempt to clean unattached floating ips that may have leaked around restarts. -.. _images: +.. _pools: -images -~~~~~~ +pools +~~~~~ -Each entry in a provider's `images` section must correspond to an +A pool defines a group of resources from an OpenStack provider. Each pool has a +maximum number of nodes which can be launched from it, along with a +number of cloud-related attributes used when launching nodes. + +Example:: + + pools: + - name: main + max-servers: 96 + availability-zones: + - az1 + networks: + - some-network-name + auto-floating-ip: False + labels: + - name: trusty + min-ram: 8192 + diskimage: trusty + console-log: True + - name: precise + min-ram: 8192 + diskimage: precise + - name: devstack-trusty + min-ram: 8192 + diskimage: devstack-trusty + +**required** + + ``name`` + + +**optional** + + ``max-cores`` + Maximum number of cores usable from this pool. This can be used to limit + usage of the tenant. If not defined nodepool can use all cores up to the + quota of the tenant. + + ``max-servers`` + Maximum number of servers spawnable from this pool. This can be used to + limit the number of servers. If not defined nodepool can create as many + servers the tenant allows. + + ``max-ram`` + Maximum ram usable from this pool. This can be used to limit the amount of + ram allocated by nodepool. If not defined nodepool can use as much ram as + the tenant allows. + + ``availability-zones`` (list) + A list of availability zones to use. + + If this setting is omitted, nodepool will fetch the list of all + availability zones from nova. To restrict nodepool to a subset + of availability zones, supply a list of availability zone names + in this setting. + + Nodepool chooses an availability zone from the list at random + when creating nodes but ensures that all nodes for a given + request are placed in the same availability zone. + + ``networks`` (list) + Specify custom Neutron networks that get attached to each + node. Specify the name or id of the network as a string. + + ``auto-floating-ip`` (bool) + Specify custom behavior of allocating floating ip for each node. + When set to False, nodepool-launcher will not apply floating ip + for nodes. When zuul instances and nodes are deployed in the same + internal private network, set the option to False to save floating ip + for cloud provider. The default value is True. + +.. _provider_diskimages: + +diskimages +~~~~~~~~~~ + +Each entry in a provider's `diskimages` section must correspond to an entry in :ref:`diskimages`. Such an entry indicates that the corresponding diskimage should be uploaded for use in this provider. Additionally, any nodes that are created using the uploaded image will @@ -505,16 +476,14 @@ images will be deleted from the provider. Example configuration:: - images: + diskimages: - name: precise pause: False - min-ram: 8192 - name-filter: 'something to match' - username: jenkins - private-key: /var/lib/jenkins/.ssh/id_rsa meta: key: value key2: value + - name: windows + connection-type: winrm **required** @@ -522,86 +491,143 @@ Example configuration:: Identifier to refer this image from :ref:`labels` and :ref:`diskimages` sections. - ``min-ram`` - Determine the flavor to use (e.g. ``m1.medium``, ``m1.large``, - etc). The smallest flavor that meets the ``min-ram`` requirements - will be chosen. To further filter by flavor name, see optional - ``name-filter`` below. 
- **optional** - ``name-filter`` - Additional filter complementing ``min-ram``, will be required to match on - the flavor-name (e.g. Rackspace offer a "Performance" flavour; setting - `name-filter` to ``Performance`` will ensure the chosen flavor also - contains this string as well as meeting `min-ram` requirements). - ``pause`` (bool) When set to True, nodepool-builder will not upload the image to the provider. - ``username`` - Nodepool expects that user to exist after running the script indicated by - ``setup``. Default ``jenkins`` - - ``key-name`` - If provided, named keypair in nova that will be provided to server create. - - ``private-key`` - Default ``/var/lib/jenkins/.ssh/id_rsa`` - ``config-drive`` (boolean) - Whether config drive should be used for the image. Default ``True`` + Whether config drive should be used for the image. Defaults to unset which + will use the cloud's default behavior. ``meta`` (dict) Arbitrary key/value metadata to store for this server using the Nova metadata service. A maximum of five entries is allowed, and both keys and values must be 255 characters or less. -.. _targets: + ``connection-type`` (string) + The connection type that a consumer should use when connecting onto the + node. For most diskimages this is not necessary. However when creating + Windows images this could be 'winrm' to enable access via ansible. -targets -------- -Lists the Jenkins masters to which Nodepool should attach nodes after -they are created. Nodes of each label will be evenly distributed -across all of the targets which are on-line:: +.. _provider_cloud_images: - targets: - - name: jenkins1 - hostname: '{label.name}-{provider.name}-{node_id}' - subnode-hostname: '{label.name}-{provider.name}-{node_id}-{subnode_id}' - - name: jenkins2 - hostname: '{label.name}-{provider.name}-{node_id}' - subnode-hostname: '{label.name}-{provider.name}-{node_id}-{subnode_id}' +cloud-images +~~~~~~~~~~~~ + +Each cloud-image entry in :ref:`labels` refers to an entry in this section. +This is a way for modifying launch parameters of the nodes (currently only +config-drive). + +Example configuration:: + + cloud-images: + - name: trusty-external + config-drive: False + - name: windows-external + connection-type: winrm **required** ``name`` - Identifier for the system an instance is attached to. + Identifier to refer this cloud-image from :ref:`labels` section. + Since this name appears elsewhere in the nodepool configuration + file, you may want to use your own descriptive name here and use + one of ``image-id`` or ``image-name`` to specify the cloud image + so that if the image name or id changes on the cloud, the impact + to your Nodepool configuration will be minimal. However, if + neither of those attributes are provided, this is also assumed to + be the image name or ID in the cloud. **optional** - ``hostname`` - Default ``{label.name}-{provider.name}-{node_id}`` + ``config-drive`` (boolean) + Whether config drive should be used for the cloud image. Defaults to + unset which will use the cloud's default behavior. - ``subnode-hostname`` - Default ``{label.name}-{provider.name}-{node_id}-{subnode_id}`` + ``image-id`` (str) + If this is provided, it is used to select the image from the cloud + provider by ID, rather than name. Mutually exclusive with ``image-name``. - ``rate`` - In seconds. Default 1.0 + ``image-name`` (str) + If this is provided, it is used to select the image from the cloud + provider by this name or ID. Mutually exclusive with ``image-id``. 
- ``jenkins`` (dict) + ``username`` (str) + The username that a consumer should use when connecting onto the node. - ``test-job`` (optional) - Setting this would cause a newly created instance to be in a TEST state. - The job name given will then be executed with the node name as a - parameter. + ``connection-type`` (str) + The connection type that a consumer should use when connecting onto the + node. For most diskimages this is not necessary. However when creating + Windows images this could be 'winrm' to enable access via ansible. - If the job succeeds, move the node into READY state and relabel it with - the appropriate label (from the image name). +.. _pool_labels: - If it fails, immediately delete the node. +labels +~~~~~~ - If the job never runs, the node will eventually be cleaned up by the - periodic cleanup task. +Each entry in a pool`s `labels` section indicates that the +corresponding label is available for use in this pool. When creating +nodes for a label, the flavor-related attributes in that label's +section will be used. + +Example configuration:: + + labels: + - name: precise + min-ram: 8192 + flavor-name: 'something to match' + console-log: True + +**required** + + ``name`` + Identifier to refer this image from :ref:`labels` and :ref:`diskimages` + sections. + +**one of** + + ``diskimage`` + Refers to provider's diskimages, see :ref:`provider_diskimages`. + + ``cloud-image`` + Refers to the name of an externally managed image in the cloud that already + exists on the provider. The value of ``cloud-image`` should match the + ``name`` of a previously configured entry from the ``cloud-images`` section + of the provider. See :ref:`provider_cloud_images`. + +**at least one of** + + ``flavor-name`` + Name or id of the flavor to use. If ``min-ram`` is omitted, it + must be an exact match. If ``min-ram`` is given, ``flavor-name`` will + be used to find flavor names that meet ``min-ram`` and also contain + ``flavor-name``. + + ``min-ram`` + Determine the flavor to use (e.g. ``m1.medium``, ``m1.large``, + etc). The smallest flavor that meets the ``min-ram`` requirements + will be chosen. + +**optional** + + ``boot-from-volume`` (bool) + If given, the label for use in this pool will create a volume from the + image and boot the node from it. + + Default: False + + ``key-name`` + If given, is the name of a keypair that will be used when booting each + server. + + ``console-log`` (default: False) + On the failure of the ssh ready check, download the server console log to + aid in debuging the problem. + + ``volume-size`` + When booting an image from volume, how big should the created volume be. + + In gigabytes. Default 50. diff --git a/doc/source/index.rst b/doc/source/index.rst index e0cf0df7a..ee9eea07e 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -4,7 +4,7 @@ Nodepool Nodepool is a system for launching single-use test nodes on demand based on images built with cached data. It is designed to work with any OpenStack based cloud, and is part of a suite of tools that form a -comprehensive test system including Jenkins and Zuul. +comprehensive test system, including Zuul. 
Contents: @@ -13,7 +13,6 @@ Contents: installation configuration - scripts operation devguide @@ -21,5 +20,6 @@ Indices and tables ================== * :ref:`genindex` +* :ref:`modindex` * :ref:`search` diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 7557b7c34..f258e1c70 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -3,51 +3,12 @@ Installation ============ -Nodepool consists of a set of long-running daemons which use an SQL -database, a ZooKeeper cluster, and communicates with Jenkins using -ZeroMQ. +Nodepool consists of a long-running daemon which uses ZooKeeper +for coordination with Zuul. External Requirements --------------------- -Jenkins -~~~~~~~ - -You should have a Jenkins server running with the `ZMQ Event Publisher -`_ -plugin installed (it is available in the Jenkins Update Center). Be -sure that the machine where you plan to run Nodepool can connect to -the ZMQ port specified by the plugin on your Jenkins master(s). - -Zuul -~~~~ - -If you plan to use Nodepool with Zuul (it is optional), you should -ensure that Nodepool can connect to the gearman port on your Zuul -server (TCP 4730 by default). This will allow Nodepool to respond to -current Zuul demand. If you elect not to connect Nodepool to Zuul, it -will still operate in a node-replacement mode. - -Database -~~~~~~~~ - -Nodepool requires an SQL server. MySQL with the InnoDB storage engine -is tested and recommended. PostgreSQL should work fine. Due to the -high number of concurrent connections from Nodepool, SQLite is not -recommended. When adding or deleting nodes, Nodepool will hold open a -database connection for each node. Be sure to configure the database -server to support at least a number of connections equal to twice the -number of nodes you expect to be in use at once. - -All that is necessary is that the database is created. Nodepool will -handle the schema by itself when it is run. - -MySQL Example:: - - CREATE USER 'nodepool'@'localhost' IDENTIFIED BY ''; - CREATE DATABASE nodepooldb; - GRANT ALL ON nodepooldb.* TO 'nodepool'@'localhost'; - ZooKeeper ~~~~~~~~~ @@ -88,22 +49,28 @@ Or install directly from a git checkout with:: pip install . -Note that some distributions provide a libzmq1 which does not support -RCVTIMEO. Removing this libzmq1 from the system libraries will ensure -pip compiles a libzmq1 with appropriate options for the version of -pyzmq used by nodepool. - Configuration ------------- -Nodepool has two required configuration files: secure.conf and -nodepool.yaml, and an optional logging configuration file logging.conf. -The secure.conf file is used to store nodepool configurations that contain -sensitive data, such as the Nodepool database password and Jenkins -api key. The nodepool.yaml files is used to store all other -configurations. - -The logging configuration file is in the standard python logging -`configuration file format -`_. +Nodepool has one required configuration file, which defaults to +``/etc/nodepool/nodepool.yaml``. This can be changed with the ``-c`` option. +The Nodepool configuration file is described in :ref:`configuration`. + +There is support for a secure file that is used to store nodepool +configurations that contain sensitive data. It currently only supports +specifying ZooKeeper credentials. If ZooKeeper credentials are defined in +both configuration files, the data in the secure file takes precedence. 
+The secure file location can be changed with the ``-s`` option and follows +the same file format as the Nodepool configuration file. + +There is an optional logging configuration file, specified with the ``-l`` +option. The logging configuration file can accept either: + +* the traditional ini python logging `configuration file format + `_. + +* a `.yml` or `.yaml` suffixed file that will be parsed and loaded as the newer + `dictConfig format + `_. + The Nodepool configuration file is described in :ref:`configuration`. diff --git a/doc/source/operation.rst b/doc/source/operation.rst index f245fa0e7..bb53e99dc 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -5,13 +5,17 @@ Operation Nodepool has two components which run as daemons. The ``nodepool-builder`` daemon is responsible for building diskimages and -uploading them to providers, and the ``nodepoold`` daemon is +uploading them to providers, and the ``nodepool-launcher`` daemon is responsible for launching and deleting nodes. Both daemons frequently re-read their configuration file after starting to support adding or removing new images and providers, or otherwise altering the configuration. +These daemons communicate with each other via a Zookeeper database. +You must run Zookeeper and at least one of each of these daemons to +have a functioning Nodepool installation. + Nodepool-builder ---------------- @@ -31,14 +35,14 @@ safe, it is recommended to run a single instance of only a single build thread (the default). -Nodepoold ---------- +Nodepool-launcher +----------------- -The main nodepool daemon is named ``nodepoold`` and is responsible for -launching instances from the images created and uploaded by -``nodepool-builder``. +The main nodepool daemon is named ``nodepool-launcher`` and is +responsible for managing cloud instances launched from the images +created and uploaded by ``nodepool-builder``. -When a new image is created and uploaded, ``nodepoold`` will +When a new image is created and uploaded, ``nodepool-launcher`` will immediately start using it when launching nodes (Nodepool always uses the most recent image for a given provider in the ``ready`` state). Nodepool will delete images if they are not the most recent or second @@ -51,9 +55,9 @@ using the previous image. Daemon usage ------------ -To start the main Nodepool daemon, run **nodepoold**: +To start the main Nodepool daemon, run **nodepool-launcher**: -.. program-output:: nodepoold --help +.. program-output:: nodepool-launcher --help :nostderr: To start the nodepool-builder daemon, run **nodepool--builder**: @@ -77,21 +81,73 @@ When Nodepool creates instances, it will assign the following nova metadata: groups - A json-encoded list containing the name of the image and the name + A comma separated list containing the name of the image and the name of the provider. This may be used by the Ansible OpenStack inventory plugin. - nodepool - A json-encoded dictionary with the following entries: + nodepool_image_name + The name of the image as a string. - image_name - The name of the image as a string. + nodepool_provider_name + The name of the provider as a string. - provider_name - The name of the provider as a string. + nodepool_node_id + The nodepool id of the node as an integer. - node_id - The nodepool id of the node as an integer. +Common Management Tasks +----------------------- + +In the course of running a Nodepool service you will find that there are +some common operations that will be performed. 
Like the services +themselves these are split into two groups, image management and +instance management. + +Image Management +~~~~~~~~~~~~~~~~ + +Before Nodepool can launch any cloud instances it must have images to boot +off of. ``nodepool dib-image-list`` will show you which images are available +locally on disk. These images on disk are then uploaded to clouds, +``nodepool image-list`` will show you what images are bootable in your +various clouds. + +If you need to force a new image to be built to pick up a new feature more +quickly than the normal rebuild cycle (which defaults to 24 hours) you can +manually trigger a rebuild. Using ``nodepool image-build`` you can tell +Nodepool to begin a new image build now. Note that depending on work that +the nodepool-builder is already performing this may queue the build. Check +``nodepool dib-image-list`` to see the current state of the builds. Once +the image is built it is automatically uploaded to all of the clouds +configured to use that image. + +At times you may need to stop using an existing image because it is broken. +Your two major options here are to build a new image to replace the existing +image or to delete the existing image and have Nodepool fall back on using +the previous image. Rebuilding and uploading can be slow so typically the +best option is to simply ``nodepool image-delete`` the most recent image +which will cause Nodepool to fallback on using the previous image. Howevever, +if you do this without "pausing" the image it will be immediately reuploaded. +You will want to pause the image if you need to further investigate why +the image is not being built correctly. If you know the image will be built +correctly you can simple delete the built image and remove it from all clouds +which will cause it to be rebuilt using ``nodepool dib-image-delete``. + +Instance Management +~~~~~~~~~~~~~~~~~~~ + +With working images in providers you should see Nodepool launching instances +in these providers using the images it built. You may find that you need to +debug a particular job failure manually. An easy way to do this is to +``nodepool hold`` an instance then log in to the instance and perform any +necessary debugging steps. Note that this doesn't stop the job running there, +what it will do is prevent Nodepool from automatically deleting this instance +once the job is complete. + +In some circumstances like manually holding an instance above, or wanting to +force a job restart you may want to delete a running instance. You can issue +a ``nodepool delete`` to force nodepool to do this. + +Complete command help info is below. Command Line Tools ------------------ @@ -151,38 +207,11 @@ If Nodepool's database gets out of sync with reality, the following commands can help identify compute instances or images that are unknown to Nodepool: -alien-list -^^^^^^^^^^ -.. program-output:: nodepool alien-list --help - :nostderr: - alien-image-list ^^^^^^^^^^^^^^^^ .. program-output:: nodepool alien-image-list --help :nostderr: -In the case that a job is randomly failing for an unknown cause, it -may be necessary to instruct nodepool to automatically hold a node on -which that job has failed. To do so, use the ``job-create`` -command to specify the job name and how many failed nodes should be -held. When debugging is complete, use ''job-delete'' to disable the -feature. - -job-create -^^^^^^^^^^ -.. program-output:: nodepool job-create --help - :nostderr: - -job-list -^^^^^^^^ -.. 
program-output:: nodepool job-list --help - :nostderr: - -job-delete -^^^^^^^^^^ -.. program-output:: nodepool job-delete --help - :nostderr: - Removing a Provider ------------------- diff --git a/doc/source/scripts.rst b/doc/source/scripts.rst deleted file mode 100644 index b9d389c68..000000000 --- a/doc/source/scripts.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. _scripts: - -Node Ready Scripts -================== - -Each label can specify a ready script with `ready-script`. This script can be -used to perform any last minute changes to a node after it has been launched -but before it is put in the READY state to receive jobs. In particular, it -can read the files in /etc/nodepool to perform multi-node related setup. - -Those files include: - -**/etc/nodepool/role** - Either the string ``primary`` or ``sub`` indicating whether this - node is the primary (the node added to the target and which will run - the job), or a sub-node. -**/etc/nodepool/node** - The IP address of this node. -**/etc/nodepool/node_private** - The private IP address of this node. -**/etc/nodepool/primary_node** - The IP address of the primary node, usable for external access. -**/etc/nodepool/primary_node_private** - The Private IP address of the primary node, for internal communication. -**/etc/nodepool/sub_nodes** - The IP addresses of the sub nodes, one on each line, - usable for external access. -**/etc/nodepool/sub_nodes_private** - The Private IP addresses of the sub nodes, one on each line. -**/etc/nodepool/id_rsa** - An OpenSSH private key generated specifically for this node group. -**/etc/nodepool/id_rsa.pub** - The corresponding public key. -**/etc/nodepool/provider** - Information about the provider in a shell-usable form. This - includes the following information: - - **NODEPOOL_PROVIDER** - The name of the provider - **NODEPOOL_CLOUD** - The name of the cloud - **NODEPOOL_REGION** - The name of the region - **NODEPOOL_AZ** - The name of the availability zone (if available) diff --git a/nodepool/allocation.py b/nodepool/allocation.py deleted file mode 100644 index cf34afb18..000000000 --- a/nodepool/allocation.py +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/env python - -# Copyright (C) 2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module holds classes that represent concepts in nodepool's -allocation algorithm. - -The algorithm is: - - Setup: - - * Establish the node providers with their current available - capacity. - * Establish requests that are to be made of each provider for a - certain label. - * Indicate which providers can supply nodes of that label. - * Indicate to which targets nodes of a certain label from a certain - provider may be distributed (and the weight that should be - given to each target when distributing). - - Run: - - * For each label, set the requested number of nodes from each - provider to be proportional to that providers overall capacity. 
- - * Define the 'priority' of a request as the number of requests for - the same label from other providers. - - * For each provider, sort the requests by the priority. This puts - requests that can be serviced by the fewest providers first. - - * Grant each such request in proportion to that requests portion of - the total amount requested by requests of the same priority. - - * The nodes allocated by a grant are then distributed to the targets - which are associated with the provider and label, in proportion to - that target's portion of the sum of the weights of each target for - that label. -""" - -import functools - -# History allocation tracking - -# The goal of the history allocation tracking is to ensure forward -# progress by not starving any particular label when in over-quota -# situations. For example, if you have two labels, say 'fedora' and -# 'ubuntu', and 'ubuntu' is requesting many more nodes than 'fedora', -# it is quite possible that 'fedora' never gets any allocations. If -# 'fedora' is required for a gate-check job, older changes may wait -# in Zuul's pipelines longer than expected while jobs for newer -# changes continue to receive 'ubuntu' nodes and overall merge -# throughput decreases during such contention. -# -# We track the history of allocations by label. A persistent -# AllocationHistory object should be kept and passed along with each -# AllocationRequest, which records its initial request in the history -# via recordRequest(). -# -# When a sub-allocation gets a grant, it records this via a call to -# AllocationHistory.recordGrant(). All the sub-allocations -# contribute to tracking the total grants for the parent -# AllocationRequest. -# -# When finished requesting grants from all providers, -# AllocationHistory.grantsDone() should be called to store the -# allocation state in the history. -# -# This history is used AllocationProvider.makeGrants() to prioritize -# requests that have not been granted in prior iterations. -# AllocationHistory.getWaitTime will return how many iterations -# each label has been waiting for an allocation. - - -class AllocationHistory(object): - '''A history of allocation requests and grants''' - - def __init__(self, history=100): - # current allocations for this iteration - # keeps elements of type - # label -> (request, granted) - self.current_allocations = {} - - self.history = history - # list of up to previous current_allocation - # dictionaries - self.past_allocations = [] - - def recordRequest(self, label, amount): - try: - a = self.current_allocations[label] - a['requested'] += amount - except KeyError: - self.current_allocations[label] = dict(requested=amount, - allocated=0) - - def recordGrant(self, label, amount): - try: - a = self.current_allocations[label] - a['allocated'] += amount - except KeyError: - # granted but not requested? shouldn't happen - raise - - def grantsDone(self): - # save this round of allocations/grants up to our history - self.past_allocations.insert(0, self.current_allocations) - self.past_allocations = self.past_allocations[:self.history] - self.current_allocations = {} - - def getWaitTime(self, label): - # go through the history of allocations and calculate how many - # previous iterations this label has received none of its - # requested allocations. - wait = 0 - - # We don't look at the current_alloctions here; only - # historical. 
With multiple providers, possibly the first - # provider has given nodes to the waiting label (which would - # be recorded in current_allocations), and a second provider - # should fall back to using the usual ratio-based mechanism? - for i, a in enumerate(self.past_allocations): - if (label in a) and (a[label]['allocated'] == 0): - wait = i + 1 - continue - - # only interested in consecutive failures to allocate. - break - - return wait - - -class AllocationProvider(object): - """A node provider and its capacity.""" - def __init__(self, name, available): - self.name = name - # if this is negative, many of the calcuations turn around and - # we start handing out nodes that don't exist. - self.available = available if available >= 0 else 0 - self.sub_requests = [] - self.grants = [] - - def __repr__(self): - return '' % self.name - - def makeGrants(self): - # build a list of (request,wait-time) tuples - all_reqs = [(x, x.getWaitTime()) for x in self.sub_requests] - - # reqs with no wait time get processed via ratio mechanism - reqs = [x[0] for x in all_reqs if x[1] == 0] - - # we prioritize whoever has been waiting the longest and give - # them whatever is available. If we run out, put them back in - # the ratio queue - waiters = [x for x in all_reqs if x[1] != 0] - waiters.sort(key=lambda x: x[1], reverse=True) - - for w in waiters: - w = w[0] - if self.available > 0: - w.grant(min(int(w.amount), self.available)) - else: - reqs.append(w) - - # Sort the remaining requests by priority so we fill the most - # specific requests first (e.g., if this provider is the only - # one that can supply foo nodes, then it should focus on - # supplying them and leave bar nodes to other providers). - reqs.sort(lambda a, b: cmp(a.getPriority(), b.getPriority())) - - for req in reqs: - total_requested = 0.0 - # Within a specific priority, limit the number of - # available nodes to a value proportionate to the request. - reqs_at_this_level = [r for r in reqs - if r.getPriority() == req.getPriority()] - for r in reqs_at_this_level: - total_requested += r.amount - if total_requested: - ratio = float(req.amount) / total_requested - else: - ratio = 0.0 - - grant = int(round(req.amount)) - grant = min(grant, int(round(self.available * ratio))) - # This adjusts our availability as well as the values of - # other requests, so values will be correct the next time - # through the loop. - req.grant(grant) - - -class AllocationRequest(object): - """A request for a number of labels.""" - - def __init__(self, name, amount, history=None): - self.name = name - self.amount = float(amount) - # Sub-requests of individual providers that make up this - # request. AllocationProvider -> AllocationSubRequest - self.sub_requests = {} - # Targets to which nodes from this request may be assigned. - # AllocationTarget -> AllocationRequestTarget - self.request_targets = {} - - if history is not None: - self.history = history - else: - self.history = AllocationHistory() - - self.history.recordRequest(name, amount) - - # subrequests use these - self.recordGrant = functools.partial(self.history.recordGrant, name) - self.getWaitTime = functools.partial(self.history.getWaitTime, name) - - def __repr__(self): - return '' % (self.amount, self.name) - - def addTarget(self, target, current): - art = AllocationRequestTarget(self, target, current) - self.request_targets[target] = art - - def addProvider(self, provider, target, subnodes): - # Handle being called multiple times with different targets. 
- s = self.sub_requests.get(provider) - if not s: - s = AllocationSubRequest(self, provider, subnodes) - agt = s.addTarget(self.request_targets[target]) - self.sub_requests[provider] = s - if s not in provider.sub_requests: - provider.sub_requests.append(s) - self.makeRequests() - return s, agt - - def makeRequests(self): - # (Re-)distribute this request across all of its providers. - total_available = 0.0 - for sub_request in self.sub_requests.values(): - total_available += sub_request.provider.available - for sub_request in self.sub_requests.values(): - if total_available: - ratio = float(sub_request.provider.available) / total_available - else: - ratio = 0.0 - sub_request.setAmount(ratio * self.amount) - - -class AllocationSubRequest(object): - """A request for a number of images from a specific provider.""" - def __init__(self, request, provider, subnodes): - self.request = request - self.provider = provider - self.amount = 0.0 - self.subnodes = subnodes - self.targets = [] - - def __repr__(self): - return '' % ( - self.amount, self.request.amount, self.request.name, - self.provider.name) - - def addTarget(self, request_target): - agt = AllocationGrantTarget(self, request_target) - self.targets.append(agt) - return agt - - def setAmount(self, amount): - self.amount = amount - - def getPriority(self): - return len(self.request.sub_requests) - - def getWaitTime(self): - return self.request.getWaitTime() - - def grant(self, amount): - # Grant this request (with the supplied amount). Adjust this - # sub-request's value to the actual, as well as the values of - # any remaining sub-requests. - - # fractional amounts don't make sense - assert int(amount) == amount - - # Remove from the set of sub-requests so that this is not - # included in future calculations. - self.provider.sub_requests.remove(self) - del self.request.sub_requests[self.provider] - if amount > 0: - grant = AllocationGrant(self.request, self.provider, - amount, self.targets) - self.request.recordGrant(amount) - # This is now a grant instead of a request. - self.provider.grants.append(grant) - else: - grant = None - amount = 0 - self.amount = amount - # Adjust provider and request values accordingly. - self.request.amount -= amount - subnode_factor = 1 + self.subnodes - self.provider.available -= (amount * subnode_factor) - # Adjust the requested values for related sub-requests. - self.request.makeRequests() - # Allocate these granted nodes to targets. - if grant: - grant.makeAllocations() - - -class AllocationGrant(object): - """A grant of a certain number of nodes of an image from a - specific provider.""" - - def __init__(self, request, provider, amount, targets): - self.request = request - self.provider = provider - self.amount = amount - self.targets = targets - - def __repr__(self): - return '' % ( - self.amount, self.request.name, self.provider.name) - - def makeAllocations(self): - # Allocate this grant to the linked targets. - total_current = 0 - for agt in self.targets: - total_current += agt.request_target.current - amount = self.amount - # Add the nodes in this allocation to the total number of - # nodes for this image so that we're setting our target - # allocations based on a portion of the total future nodes. 
- total_current += amount - remaining_targets = len(self.targets) - for agt in self.targets: - # Evenly distribute the grants across all targets - ratio = 1.0 / remaining_targets - # Take the weight and apply it to the total number of - # nodes to this image to figure out how many of the total - # nodes should ideally be on this target. - desired_count = int(round(ratio * total_current)) - # The number of nodes off from our calculated target. - delta = desired_count - agt.request_target.current - # Use the delta as the allocation for this target, but - # make sure it's bounded by 0 and the number of nodes we - # have available to allocate. - allocation = min(delta, amount) - allocation = max(allocation, 0) - - # The next time through the loop, we have reduced our - # grant by this amount. - amount -= allocation - # Don't consider this target's count in the total number - # of nodes in the next iteration, nor the nodes we have - # just allocated. - total_current -= agt.request_target.current - total_current -= allocation - # Since we aren't considering this target's count, also - # don't consider this target itself when calculating the - # ratio. - remaining_targets -= 1 - # Set the amount of this allocation. - agt.allocate(allocation) - - -class AllocationTarget(object): - """A target to which nodes may be assigned.""" - def __init__(self, name): - self.name = name - - def __repr__(self): - return '' % (self.name) - - -class AllocationRequestTarget(object): - """A request associated with a target to which nodes may be assigned.""" - def __init__(self, request, target, current): - self.target = target - self.request = request - self.current = current - - -class AllocationGrantTarget(object): - """A target for a specific grant to which nodes may be assigned.""" - def __init__(self, sub_request, request_target): - self.sub_request = sub_request - self.request_target = request_target - self.amount = 0 - - def __repr__(self): - return '' % ( - self.amount, self.sub_request.request.name, - self.request_target.target.name) - - def allocate(self, amount): - # This is essentially the output of this system. This - # represents the number of nodes of a specific image from a - # specific provider that should be assigned to a specific - # target. - self.amount = amount - # Update the number of nodes of this image that are assigned - # to this target to assist in other allocation calculations - self.request_target.current += amount diff --git a/nodepool/builder.py b/nodepool/builder.py index c6ceaab63..01a90b238 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -21,20 +21,22 @@ import subprocess import threading import time import shlex -import sys +import uuid -import config as nodepool_config -import exceptions -import provider_manager -import stats -import zk +from nodepool import config as nodepool_config +from nodepool import exceptions +from nodepool import provider_manager +from nodepool import stats +from nodepool import zk MINS = 60 HOURS = 60 * MINS -IMAGE_TIMEOUT = 6 * HOURS # How long to wait for an image save -SUSPEND_WAIT_TIME = 30 # How long to wait between checks for - # ZooKeeper connectivity if it disappears. +# How long to wait for an image save +IMAGE_TIMEOUT = 6 * HOURS + +# How long to wait between checks for ZooKeeper connectivity if it disappears. +SUSPEND_WAIT_TIME = 30 # HP Cloud requires qemu compat with 0.10. 
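The allocation.py module removed above distributed each label request across providers in proportion to how much free capacity each provider reported, with wait time and request priority overriding the plain ratio for starved or highly specific requests. A minimal sketch of just the ratio step, using invented provider names and numbers rather than the real AllocationRequest/AllocationProvider objects:

    # Rough sketch of the proportional split done by the removed
    # AllocationRequest.makeRequests(); names and numbers are illustrative.
    def split_request(amount, provider_capacity):
        """Split `amount` nodes across providers proportionally to capacity."""
        total = float(sum(provider_capacity.values()))
        if not total:
            return {name: 0.0 for name in provider_capacity}
        return {name: amount * (capacity / total)
                for name, capacity in provider_capacity.items()}

    # A request for 20 nodes against providers with 10 and 30 slots free
    # is asked for as 5 from the first and 15 from the second.
    print(split_request(20, {'provider1': 10, 'provider2': 30}))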
That version works elsewhere, # so just hardcode it for all qcow2 building @@ -108,17 +110,19 @@ class DibImageFile(object): class BaseWorker(threading.Thread): - def __init__(self, config_path, interval, zk): + def __init__(self, builder_id, config_path, secure_path, interval, zk): super(BaseWorker, self).__init__() self.log = logging.getLogger("nodepool.builder.BaseWorker") self.daemon = True self._running = False self._config = None self._config_path = config_path + self._secure_path = secure_path self._zk = zk self._hostname = socket.gethostname() self._statsd = stats.get_client() self._interval = interval + self._builder_id = builder_id def _checkForZooKeeperChanges(self, new_config): ''' @@ -129,7 +133,7 @@ class BaseWorker(threading.Thread): ''' if self._config.zookeeper_servers != new_config.zookeeper_servers: self.log.debug("Detected ZooKeeper server changes") - self._zk.resetHosts(new_config.zookeeper_servers.values()) + self._zk.resetHosts(list(new_config.zookeeper_servers.values())) @property def running(self): @@ -145,9 +149,12 @@ class CleanupWorker(BaseWorker): and any local DIB builds. ''' - def __init__(self, name, config_path, interval, zk): - super(CleanupWorker, self).__init__(config_path, interval, zk) - self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name) + def __init__(self, name, builder_id, config_path, secure_path, + interval, zk): + super(CleanupWorker, self).__init__(builder_id, config_path, + secure_path, interval, zk) + self.log = logging.getLogger( + "nodepool.builder.CleanupWorker.%s" % name) self.name = 'CleanupWorker.%s' % name def _buildUploadRecencyTable(self): @@ -178,7 +185,7 @@ class CleanupWorker(BaseWorker): ) # Sort uploads by state_time (upload time) and keep the 2 most recent - for i in self._rtable.keys(): + for i in list(self._rtable.keys()): for p in self._rtable[i].keys(): self._rtable[i][p].sort(key=lambda x: x[2], reverse=True) self._rtable[i][p] = self._rtable[i][p][:2] @@ -222,27 +229,32 @@ class CleanupWorker(BaseWorker): if e.errno != 2: # No such file or directory raise e - def _deleteLocalBuild(self, image, build_id, builder): + def _deleteLocalBuild(self, image, build): ''' Remove expired image build from local disk. :param str image: Name of the image whose build we are deleting. - :param str build_id: ID of the build we want to delete. - :param str builder: hostname of the build. + :param ImageBuild build: The build we want to delete. :returns: True if files were deleted, False if none were found. ''' - base = "-".join([image, build_id]) + base = "-".join([image, build.id]) files = DibImageFile.from_image_id(self._config.imagesdir, base) if not files: # NOTE(pabelanger): It is possible we don't have any files because # diskimage-builder failed. So, check to see if we have the correct # builder so we can removed the data from zookeeper. - if builder == self._hostname: + + # To maintain backward compatibility with builders that didn't + # use unique builder IDs before, but do now, always compare to + # hostname as well since some ZK data may still reference that. 
+ if (build.builder_id == self._builder_id or + build.builder == self._hostname + ): return True return False - self.log.info("Doing cleanup for %s:%s" % (image, build_id)) + self.log.info("Doing cleanup for %s:%s" % (image, build.id)) manifest_dir = None @@ -251,7 +263,8 @@ class CleanupWorker(BaseWorker): if not manifest_dir: path, ext = filename.rsplit('.', 1) manifest_dir = path + ".d" - map(self._removeDibItem, [filename, f.md5_file, f.sha256_file]) + items = [filename, f.md5_file, f.sha256_file] + list(map(self._removeDibItem, items)) try: shutil.rmtree(manifest_dir) @@ -271,8 +284,7 @@ class CleanupWorker(BaseWorker): self._deleteUpload(upload) def _cleanupObsoleteProviderUploads(self, provider, image, build_id): - image_names_for_provider = provider.images.keys() - if image in image_names_for_provider: + if image in provider.diskimages: # This image is in use for this provider return @@ -353,9 +365,7 @@ class CleanupWorker(BaseWorker): for build in builds: base = "-".join([image, build.id]) files = DibImageFile.from_image_id(self._config.imagesdir, base) - # If we have local dib files OR if our hostname matches the - # recorded owner hostname, consider this our build. - if files or (self._hostname == build.builder): + if files: ret.append(build) return ret @@ -388,7 +398,8 @@ class CleanupWorker(BaseWorker): self.log.info("Removing failed upload record: %s" % upload) self._zk.deleteUpload(image, build_id, provider, upload.id) elif upload.state == zk.DELETING: - self.log.info("Removing deleted upload and record: %s" % upload) + self.log.info( + "Removing deleted upload and record: %s" % upload) self._deleteUpload(upload) elif upload.state == zk.FAILED: self.log.info("Removing failed upload and record: %s" % upload) @@ -403,7 +414,7 @@ class CleanupWorker(BaseWorker): all_builds = self._zk.getBuilds(image) builds_to_keep = set([b for b in sorted(all_builds, reverse=True, key=lambda y: y.state_time) - if b.state==zk.READY][:2]) + if b.state == zk.READY][:2]) local_builds = set(self._filterLocalBuilds(image, all_builds)) diskimage = self._config.diskimages.get(image) if not diskimage and not local_builds: @@ -471,7 +482,7 @@ class CleanupWorker(BaseWorker): self._zk.storeBuild(image, build, build.id) # Release the lock here so we can delete the build znode - if self._deleteLocalBuild(image, build.id, build.builder): + if self._deleteLocalBuild(image, build): if not self._zk.deleteBuild(image, build.id): self.log.error("Unable to delete build %s because" " uploads still remain.", build) @@ -483,9 +494,13 @@ class CleanupWorker(BaseWorker): self._running = True while self._running: # Don't do work if we've lost communication with the ZK cluster + did_suspend = False while self._zk and (self._zk.suspended or self._zk.lost): + did_suspend = True self.log.info("ZooKeeper suspended. Waiting") time.sleep(SUSPEND_WAIT_TIME) + if did_suspend: + self.log.info("ZooKeeper available. Resuming") try: self._run() @@ -502,6 +517,8 @@ class CleanupWorker(BaseWorker): Body of run method for exception handling purposes. 
''' new_config = nodepool_config.loadConfig(self._config_path) + if self._secure_path: + nodepool_config.loadSecureConfig(new_config, self._secure_path) if not self._config: self._config = new_config @@ -514,38 +531,14 @@ class CleanupWorker(BaseWorker): class BuildWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk, dib_cmd): - super(BuildWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, secure_path, + interval, zk, dib_cmd): + super(BuildWorker, self).__init__(builder_id, config_path, secure_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name) self.name = 'BuildWorker.%s' % name self.dib_cmd = dib_cmd - def _running_under_virtualenv(self): - # NOTE: borrowed from pip:locations.py - if hasattr(sys, 'real_prefix'): - return True - elif sys.prefix != getattr(sys, "base_prefix", sys.prefix): - return True - return False - - def _activate_virtualenv(self): - """Run as a pre-exec function to activate current virtualenv - - If we are invoked directly as /path/ENV/nodepool-builer (as - done by an init script, for example) then /path/ENV/bin will - not be in our $PATH, meaning we can't find disk-image-create. - Apart from that, dib also needs to run in an activated - virtualenv so it can find utils like dib-run-parts. Run this - before exec of dib to ensure the current virtualenv (if any) - is activated. - """ - if self._running_under_virtualenv(): - activate_this = os.path.join(sys.prefix, "bin", "activate_this.py") - if not os.path.exists(activate_this): - raise exceptions.BuilderError("Running in a virtualenv, but " - "cannot find: %s" % activate_this) - execfile(activate_this, dict(__file__=activate_this)) - def _checkForScheduledImageUpdates(self): ''' Check every DIB image to see if it has aged out and needs rebuilt. @@ -553,7 +546,7 @@ class BuildWorker(BaseWorker): for diskimage in self._config.diskimages.values(): # Check if we've been told to shutdown # or if ZK connection is suspended - if not self.running or self._zk.suspended or self._zk.lost: + if not self._running or self._zk.suspended or self._zk.lost: return try: self._checkImageForScheduledImageUpdates(diskimage) @@ -586,7 +579,8 @@ class BuildWorker(BaseWorker): if (not builds or (now - builds[0].state_time) >= diskimage.rebuild_age or not set(builds[0].formats).issuperset(diskimage.image_types) - ): + ): + try: with self._zk.imageBuildLock(diskimage.name, blocking=False): # To avoid locking each image repeatedly, we have an @@ -595,7 +589,8 @@ class BuildWorker(BaseWorker): # lock acquisition. If it's not the same build as # identified in the first check above, assume another # BuildWorker created the build for us and continue. 
- builds2 = self._zk.getMostRecentBuilds(1, diskimage.name, zk.READY) + builds2 = self._zk.getMostRecentBuilds( + 1, diskimage.name, zk.READY) if builds2 and builds[0].id != builds2[0].id: return @@ -603,6 +598,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -620,7 +616,7 @@ class BuildWorker(BaseWorker): for diskimage in self._config.diskimages.values(): # Check if we've been told to shutdown # or if ZK connection is suspended - if not self.running or self._zk.suspended or self._zk.lost: + if not self._running or self._zk.suspended or self._zk.lost: return try: self._checkImageForManualBuildRequest(diskimage) @@ -653,6 +649,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -719,7 +716,6 @@ class BuildWorker(BaseWorker): shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - preexec_fn=self._activate_virtualenv, env=env) except OSError as e: raise exceptions.BuilderError( @@ -738,19 +734,26 @@ class BuildWorker(BaseWorker): # interrupted during the build. If so, wait for it to return. # It could transition directly from SUSPENDED to CONNECTED, or go # through the LOST state before CONNECTED. + did_suspend = False while self._zk.suspended or self._zk.lost: + did_suspend = True self.log.info("ZooKeeper suspended during build. Waiting") time.sleep(SUSPEND_WAIT_TIME) + if did_suspend: + self.log.info("ZooKeeper available. Resuming") build_data = zk.ImageBuild() + build_data.builder_id = self._builder_id build_data.builder = self._hostname + build_data.username = diskimage.username if self._zk.didLoseConnection: self.log.info("ZooKeeper lost while building %s" % diskimage.name) self._zk.resetLostFlag() build_data.state = zk.FAILED elif p.returncode: - self.log.info("DIB failed creating %s" % diskimage.name) + self.log.info( + "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) build_data.state = zk.FAILED else: self.log.info("DIB image %s is built" % diskimage.name) @@ -760,7 +763,8 @@ class BuildWorker(BaseWorker): if self._statsd: # record stats on the size of each image we create for ext in img_types.split(','): - key = 'nodepool.dib_image_build.%s.%s.size' % (diskimage.name, ext) + key = 'nodepool.dib_image_build.%s.%s.size' % ( + diskimage.name, ext) # A bit tricky because these image files may be sparse # files; we only want the true size of the file for # purposes of watching if we've added too much stuff @@ -780,9 +784,13 @@ class BuildWorker(BaseWorker): self._running = True while self._running: # Don't do work if we've lost communication with the ZK cluster + did_suspend = False while self._zk and (self._zk.suspended or self._zk.lost): + did_suspend = True self.log.info("ZooKeeper suspended. Waiting") time.sleep(SUSPEND_WAIT_TIME) + if did_suspend: + self.log.info("ZooKeeper available. 
Resuming") try: self._run() @@ -798,6 +806,8 @@ class BuildWorker(BaseWorker): ''' # NOTE: For the first iteration, we expect self._config to be None new_config = nodepool_config.loadConfig(self._config_path) + if self._secure_path: + nodepool_config.loadSecureConfig(new_config, self._secure_path) if not self._config: self._config = new_config @@ -809,8 +819,10 @@ class BuildWorker(BaseWorker): class UploadWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk): - super(UploadWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, secure_path, + interval, zk): + super(UploadWorker, self).__init__(builder_id, config_path, + secure_path, interval, zk) self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name) self.name = 'UploadWorker.%s' % name @@ -819,6 +831,8 @@ class UploadWorker(BaseWorker): Reload the nodepool configuration file. ''' new_config = nodepool_config.loadConfig(self._config_path) + if self._secure_path: + nodepool_config.loadSecureConfig(new_config, self._secure_path) if not self._config: self._config = new_config @@ -827,7 +841,8 @@ class UploadWorker(BaseWorker): use_taskmanager=False) self._config = new_config - def _uploadImage(self, build_id, upload_id, image_name, images, provider): + def _uploadImage(self, build_id, upload_id, image_name, images, provider, + username): ''' Upload a local DIB image build to a provider. @@ -837,6 +852,7 @@ class UploadWorker(BaseWorker): :param list images: A list of DibImageFile objects from this build that available for uploading. :param provider: The provider from the parsed config file. + :param username: ''' start_time = time.time() timestamp = int(start_time) @@ -858,19 +874,15 @@ class UploadWorker(BaseWorker): filename = image.to_path(self._config.imagesdir, with_extension=True) - dummy_image = type('obj', (object,), - {'name': image_name, 'id': image.image_id}) - - ext_image_name = provider.template_hostname.format( - provider=provider, image=dummy_image, - timestamp=str(timestamp) + ext_image_name = provider.image_name_format.format( + image_name=image_name, timestamp=str(timestamp) ) self.log.info("Uploading DIB image build %s from %s to %s" % (build_id, filename, provider.name)) manager = self._config.provider_managers[provider.name] - provider_image = provider.images.get(image_name) + provider_image = provider.diskimages.get(image_name) if provider_image is None: raise exceptions.BuilderInvalidCommandError( "Could not find matching provider image for %s" % image_name @@ -910,6 +922,9 @@ class UploadWorker(BaseWorker): data.state = zk.READY data.external_id = external_id data.external_name = ext_image_name + data.format = image.extension + data.username = username + return data def _checkForProviderUploads(self): @@ -920,12 +935,12 @@ class UploadWorker(BaseWorker): to providers, do the upload if they are available on the local disk. ''' for provider in self._config.providers.values(): - for image in provider.images.values(): + for image in provider.diskimages.values(): uploaded = False # Check if we've been told to shutdown # or if ZK connection is suspended - if not self.running or self._zk.suspended or self._zk.lost: + if not self._running or self._zk.suspended or self._zk.lost: return try: uploaded = self._checkProviderImageUpload(provider, image) @@ -952,7 +967,7 @@ class UploadWorker(BaseWorker): :returns: True if an upload was attempted, False otherwise. ''' # Check if image uploads are paused. 
- if provider.images.get(image.name).pause: + if provider.diskimages.get(image.name).pause: return False # Search for the most recent 'ready' image build @@ -1003,11 +1018,14 @@ class UploadWorker(BaseWorker): # New upload number with initial state 'uploading' data = zk.ImageUpload() data.state = zk.UPLOADING + data.username = build.username + upnum = self._zk.storeImageUpload( image.name, build.id, provider.name, data) data = self._uploadImage(build.id, upnum, image.name, - local_images, provider) + local_images, provider, + build.username) # Set final state self._zk.storeImageUpload(image.name, build.id, @@ -1025,9 +1043,13 @@ class UploadWorker(BaseWorker): self._running = True while self._running: # Don't do work if we've lost communication with the ZK cluster + did_suspend = False while self._zk and (self._zk.suspended or self._zk.lost): + did_suspend = True self.log.info("ZooKeeper suspended. Waiting") time.sleep(SUSPEND_WAIT_TIME) + if did_suspend: + self.log.info("ZooKeeper available. Resuming") try: self._reloadConfig() @@ -1051,15 +1073,19 @@ class NodePoolBuilder(object): ''' log = logging.getLogger("nodepool.builder.NodePoolBuilder") - def __init__(self, config_path, num_builders=1, num_uploaders=4): + def __init__(self, config_path, secure_path=None, + num_builders=1, num_uploaders=4, fake=False): ''' Initialize the NodePoolBuilder object. :param str config_path: Path to configuration file. + :param str secure_path: Path to secure configuration file. :param int num_builders: Number of build workers to start. :param int num_uploaders: Number of upload workers to start. + :param bool fake: Whether to fake the image builds. ''' self._config_path = config_path + self._secure_path = secure_path self._config = None self._num_builders = num_builders self._build_workers = [] @@ -1070,7 +1096,11 @@ class NodePoolBuilder(object): self.cleanup_interval = 60 self.build_interval = 10 self.upload_interval = 10 - self.dib_cmd = 'disk-image-create' + if fake: + self.dib_cmd = os.path.join(os.path.dirname(__file__), '..', + 'nodepool/tests/fake-image-create') + else: + self.dib_cmd = 'disk-image-create' self.zk = None # This lock is needed because the run() method is started in a @@ -1079,21 +1109,34 @@ class NodePoolBuilder(object): # startup process has completed. 
self._start_lock = threading.Lock() - #======================================================================= + # ====================================================================== # Private methods - #======================================================================= + # ====================================================================== + + def _getBuilderID(self, id_file): + if not os.path.exists(id_file): + with open(id_file, "w") as f: + builder_id = str(uuid.uuid4()) + f.write(builder_id) + return builder_id + + with open(id_file, "r") as f: + builder_id = f.read() + return builder_id def _getAndValidateConfig(self): config = nodepool_config.loadConfig(self._config_path) + if self._secure_path: + nodepool_config.loadSecureConfig(config, self._secure_path) if not config.zookeeper_servers.values(): raise RuntimeError('No ZooKeeper servers specified in config.') if not config.imagesdir: raise RuntimeError('No images-dir specified in config.') return config - #======================================================================= + # ====================================================================== # Public methods - #======================================================================= + # ====================================================================== def start(self): ''' @@ -1110,28 +1153,36 @@ class NodePoolBuilder(object): self._config = self._getAndValidateConfig() self._running = True + builder_id_file = os.path.join(self._config.imagesdir, + "builder_id.txt") + builder_id = self._getBuilderID(builder_id_file) + # All worker threads share a single ZooKeeper instance/connection. self.zk = zk.ZooKeeper() - self.zk.connect(self._config.zookeeper_servers.values()) + self.zk.connect(list(self._config.zookeeper_servers.values())) self.log.debug('Starting listener for build jobs') # Create build and upload worker objects for i in range(self._num_builders): - w = BuildWorker(i, self._config_path, self.build_interval, - self.zk, self.dib_cmd) + w = BuildWorker(i, builder_id, + self._config_path, self._secure_path, + self.build_interval, self.zk, self.dib_cmd) w.start() self._build_workers.append(w) for i in range(self._num_uploaders): - w = UploadWorker(i, self._config_path, self.upload_interval, - self.zk) + w = UploadWorker(i, builder_id, + self._config_path, self._secure_path, + self.upload_interval, self.zk) w.start() self._upload_workers.append(w) if self.cleanup_interval > 0: self._janitor = CleanupWorker( - 0, self._config_path, self.cleanup_interval, self.zk) + 0, builder_id, + self._config_path, self._secure_path, + self.cleanup_interval, self.zk) self._janitor.start() # Wait until all threads are running. Otherwise, we have a race @@ -1154,7 +1205,14 @@ class NodePoolBuilder(object): ''' with self._start_lock: self.log.debug("Stopping. NodePoolBuilder shutting down workers") - workers = self._build_workers + self._upload_workers + # Note we do not add the upload workers to this list intentionally. + # The reason for this is that uploads can take many hours and there + # is no good way to stop the blocking writes performed by the + # uploads in order to join() below on a reasonable amount of time. + # Killing the process will stop the upload then both the record + # in zk and in the cloud will be deleted by any other running + # builders or when this builder starts again. 
+ workers = self._build_workers if self._janitor: workers += [self._janitor] for worker in (workers): diff --git a/nodepool/cmd/__init__.py b/nodepool/cmd/__init__.py index 3d388e74c..8ccf480dc 100644 --- a/nodepool/cmd/__init__.py +++ b/nodepool/cmd/__init__.py @@ -14,6 +14,10 @@ # License for the specific language governing permissions and limitations # under the License. +import argparse +import daemon +import errno +import extras import logging import logging.config import os @@ -22,6 +26,37 @@ import sys import threading import traceback +import yaml + +from nodepool.version import version_info as npd_version_info + + +# as of python-daemon 1.6 it doesn't bundle pidlockfile anymore +# instead it depends on lockfile-0.9.1 which uses pidfile. +pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) + + +def is_pidfile_stale(pidfile): + """ Determine whether a PID file is stale. + + Return 'True' ("stale") if the contents of the PID file are + valid but do not match the PID of a currently-running process; + otherwise return 'False'. + + """ + result = False + + pidfile_pid = pidfile.read_pid() + if pidfile_pid is not None: + try: + os.kill(pidfile_pid, 0) + except OSError as exc: + if exc.errno == errno.ESRCH: + # The specified PID does not exist + result = True + + return result + def stack_dump_handler(signum, frame): signal.signal(signal.SIGUSR2, signal.SIG_IGN) @@ -45,17 +80,99 @@ def stack_dump_handler(signum, frame): class NodepoolApp(object): + app_name = None + app_description = 'Node pool.' + def __init__(self): + self.parser = None self.args = None + def create_parser(self): + parser = argparse.ArgumentParser(description=self.app_description) + + parser.add_argument('-l', + dest='logconfig', + help='path to log config file') + + parser.add_argument('--version', + action='version', + version=npd_version_info.version_string()) + + return parser + def setup_logging(self): if self.args.logconfig: fp = os.path.expanduser(self.args.logconfig) + if not os.path.exists(fp): - raise Exception("Unable to read logging config file at %s" % - fp) - logging.config.fileConfig(fp) + m = "Unable to read logging config file at %s" % fp + raise Exception(m) + + if os.path.splitext(fp)[1] in ('.yml', '.yaml'): + with open(fp, 'r') as f: + logging.config.dictConfig(yaml.safe_load(f)) + + else: + logging.config.fileConfig(fp) + else: - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(name)s: ' - '%(message)s') + m = '%(asctime)s %(levelname)s %(name)s: %(message)s' + logging.basicConfig(level=logging.DEBUG, format=m) + + def _main(self, argv=None): + if argv is None: + argv = sys.argv[1:] + + self.parser = self.create_parser() + self.args = self.parser.parse_args() + return self._do_run() + + def _do_run(self): + # NOTE(jamielennox): setup logging a bit late so it's not done until + # after a DaemonContext is created. 
+ self.setup_logging() + return self.run() + + @classmethod + def main(cls, argv=None): + return cls()._main(argv=argv) + + def run(self): + """The app's primary function, override it with your logic.""" + raise NotImplementedError() + + +class NodepoolDaemonApp(NodepoolApp): + + def create_parser(self): + parser = super(NodepoolDaemonApp, self).create_parser() + + parser.add_argument('-p', + dest='pidfile', + help='path to pid file', + default='/var/run/nodepool/%s.pid' % self.app_name) + + parser.add_argument('-d', + dest='nodaemon', + action='store_true', + help='do not run as a daemon') + + return parser + + def _do_run(self): + if self.args.nodaemon: + return super(NodepoolDaemonApp, self)._do_run() + + else: + pid = pid_file_module.TimeoutPIDLockFile(self.args.pidfile, 10) + + if is_pidfile_stale(pid): + pid.break_lock() + + with daemon.DaemonContext(pidfile=pid): + return super(NodepoolDaemonApp, self)._do_run() + + @classmethod + def main(cls, argv=None): + signal.signal(signal.SIGUSR2, stack_dump_handler) + return super(NodepoolDaemonApp, cls).main(argv) diff --git a/nodepool/cmd/builder.py b/nodepool/cmd/builder.py index 56d96188f..7ac993b2a 100644 --- a/nodepool/cmd/builder.py +++ b/nodepool/cmd/builder.py @@ -12,56 +12,51 @@ # License for the specific language governing permissions and limitations # under the License. -import argparse -import extras import signal import sys -import daemon - from nodepool import builder import nodepool.cmd -# as of python-daemon 1.6 it doesn't bundle pidlockfile anymore -# instead it depends on lockfile-0.9.1 which uses pidfile. -pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) +class NodePoolBuilderApp(nodepool.cmd.NodepoolDaemonApp): -class NodePoolBuilderApp(nodepool.cmd.NodepoolApp): + app_name = 'nodepool-builder' + app_description = 'NodePool Image Builder.' 
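With the new NodepoolApp/NodepoolDaemonApp base classes above, each command only declares app_name and app_description, extends create_parser(), and implements run(); argument parsing, pidfile handling, daemonization and logging setup all live in nodepool/cmd/__init__.py. A minimal sketch of a subclass using that pattern (hypothetical, not part of this change):

    # Hypothetical subclass showing the shape of the new daemon-app pattern;
    # MyDaemonApp and its -c option are examples, not code from this patch.
    import sys

    import nodepool.cmd


    class MyDaemonApp(nodepool.cmd.NodepoolDaemonApp):

        app_name = 'my-daemon'          # used for the default pidfile path
        app_description = 'Example daemon.'

        def create_parser(self):
            # -l/--version come from NodepoolApp, -p/-d from NodepoolDaemonApp
            parser = super(MyDaemonApp, self).create_parser()
            parser.add_argument('-c', dest='config',
                                default='/etc/nodepool/nodepool.yaml',
                                help='path to config file')
            return parser

        def run(self):
            # Called after logging is configured and, unless -d was given,
            # after the process has daemonized behind its pidfile.
            return 0


    if __name__ == "__main__":
        sys.exit(MyDaemonApp.main())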
def sigint_handler(self, signal, frame): self.nb.stop() sys.exit(0) - def parse_arguments(self): - parser = argparse.ArgumentParser(description='NodePool Image Builder.') + def create_parser(self): + parser = super(NodePoolBuilderApp, self).create_parser() + parser.add_argument('-c', dest='config', default='/etc/nodepool/nodepool.yaml', help='path to config file') - parser.add_argument('-l', dest='logconfig', - help='path to log config file') - parser.add_argument('-p', dest='pidfile', - help='path to pid file', - default='/var/run/nodepool-builder/' - 'nodepool-builder.pid') - parser.add_argument('-d', dest='nodaemon', action='store_true', - help='do not run as a daemon') + parser.add_argument('-s', dest='secure', + help='path to secure config file') parser.add_argument('--build-workers', dest='build_workers', default=1, help='number of build workers', type=int) parser.add_argument('--upload-workers', dest='upload_workers', default=4, help='number of upload workers', type=int) - self.args = parser.parse_args() + parser.add_argument('--fake', action='store_true', + help='Do not actually run diskimage-builder ' + '(used for testing)') + return parser - def main(self): - self.setup_logging() + def run(self): self.nb = builder.NodePoolBuilder( - self.args.config, self.args.build_workers, - self.args.upload_workers) + self.args.config, + secure_path=self.args.secure, + num_builders=self.args.build_workers, + num_uploaders=self.args.upload_workers, + fake=self.args.fake) signal.signal(signal.SIGINT, self.sigint_handler) - signal.signal(signal.SIGUSR2, nodepool.cmd.stack_dump_handler) + self.nb.start() while True: @@ -69,15 +64,7 @@ class NodePoolBuilderApp(nodepool.cmd.NodepoolApp): def main(): - app = NodePoolBuilderApp() - app.parse_arguments() - - if app.args.nodaemon: - app.main() - else: - pid = pid_file_module.TimeoutPIDLockFile(app.args.pidfile, 10) - with daemon.DaemonContext(pidfile=pid): - app.main() + return NodePoolBuilderApp.main() if __name__ == "__main__": diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 0409e0a45..cccf611ab 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -14,6 +14,8 @@ import logging import voluptuous as v import yaml +from nodepool.config import get_provider_config + log = logging.getLogger(__name__) @@ -24,88 +26,19 @@ class ConfigValidator: self.config_file = config_file def validate(self): - cron = { - 'check': str, - 'cleanup': str, - } - - images = { - 'name': str, - 'pause': bool, - 'min-ram': int, - 'name-filter': str, - 'key-name': str, - 'diskimage': str, - 'meta': dict, - 'username': str, - 'user-home': str, - 'private-key': str, - 'config-drive': bool, - } - - old_network = { - 'net-id': str, - 'net-label': str, - } - - network = { + provider = { 'name': v.Required(str), - 'public': bool, # Ignored, but kept for backwards compat + 'driver': str, + 'max-concurrency': int, } - providers = { + label = { 'name': str, - 'region-name': str, - 'service-type': str, - 'service-name': str, - 'availability-zones': [str], - 'cloud': str, - 'username': str, - 'password': str, - 'auth-url': str, - 'project-id': str, - 'project-name': str, - 'max-servers': int, - 'pool': str, # Ignored, but kept for backwards compat - 'image-type': str, - 'networks': [v.Any(old_network, network)], - 'ipv6-preferred': bool, - 'boot-timeout': int, - 'api-timeout': int, - 'launch-timeout': int, - 'nodepool-id': str, - 'rate': float, - 'images': [images], - 'template-hostname': str, - 'clean-floating-ips': 
bool, - } - - labels = { - 'name': str, - 'image': str, 'min-ready': int, - 'ready-script': str, - 'subnodes': int, - 'providers': [{ - 'name': str, - }], + 'max-ready-age': int, } - targets = { - 'name': str, - 'hostname': str, - 'subnode-hostname': str, - 'assign-via-gearman': bool, - 'jenkins': { - 'url': str, - 'user': str, - 'apikey': str, - 'credentials-id': str, - 'test-job': str - } - } - - diskimages = { + diskimage = { 'name': str, 'pause': bool, 'elements': [str], @@ -113,27 +46,26 @@ class ConfigValidator: 'release': v.Any(str, int), 'rebuild-age': int, 'env-vars': {str: str}, + 'username': str, + } + + webapp = { + 'port': int, + 'listen_address': str, } top_level = { + 'webapp': webapp, 'elements-dir': str, 'images-dir': str, - 'dburi': str, - 'zmq-publishers': [str], - 'gearman-servers': [{ - 'host': str, - 'port': int, - }], 'zookeeper-servers': [{ 'host': str, 'port': int, 'chroot': str, }], - 'cron': cron, - 'providers': [providers], - 'labels': [labels], - 'targets': [targets], - 'diskimages': [diskimages], + 'providers': list, + 'labels': [label], + 'diskimages': [diskimage], } log.info("validating %s" % self.config_file) @@ -142,12 +74,6 @@ class ConfigValidator: # validate the overall schema schema = v.Schema(top_level) schema(config) - - # labels must list valid providers - all_providers = [p['name'] for p in config['providers']] - for label in config['labels']: - for provider in label['providers']: - if not provider['name'] in all_providers: - raise AssertionError('label %s requests ' - 'non-existent provider %s' - % (label['name'], provider['name'])) + for provider_dict in config.get('providers', []): + provider_schema = get_provider_config(provider_dict).get_schema() + provider_schema.extend(provider)(provider_dict) diff --git a/nodepool/cmd/launcher.py b/nodepool/cmd/launcher.py new file mode 100755 index 000000000..d5594a458 --- /dev/null +++ b/nodepool/cmd/launcher.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# Copyright 2012 Hewlett-Packard Development Company, L.P. +# Copyright 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
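The config_validator.py rewrite above keeps only a generic top-level schema and hands each provider entry to its driver: get_provider_config() selects the driver's config class from the provider dict, get_schema() returns that driver's voluptuous schema, and extend() folds in the common name/driver/max-concurrency keys before validating. A small sketch of that composition with made-up driver keys (the real per-driver schemas live under nodepool/driver/):

    # Sketch of the schema composition used by the new validator; the
    # driver keys below are invented for illustration only.
    import voluptuous as v

    # Stand-in for what a driver's get_schema() might return.
    driver_schema = v.Schema({
        'cloud': str,
        'pools': list,
    })

    # The common keys shared by every provider entry, as in validate() above.
    common = {
        'name': v.Required(str),
        'driver': str,
        'max-concurrency': int,
    }

    # extend() merges the driver schema with the common keys; calling the
    # resulting schema validates a single provider dict from the YAML file.
    provider_entry = {'name': 'cloud1', 'driver': 'openstack', 'cloud': 'mycloud'}
    driver_schema.extend(common)(provider_entry)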
+ +import logging +import os +import sys +import signal + +import nodepool.cmd +import nodepool.launcher +import nodepool.webapp + +log = logging.getLogger(__name__) + + +class NodePoolLauncherApp(nodepool.cmd.NodepoolDaemonApp): + + app_name = 'nodepool' + + def create_parser(self): + parser = super(NodePoolLauncherApp, self).create_parser() + + parser.add_argument('-c', dest='config', + default='/etc/nodepool/nodepool.yaml', + help='path to config file') + parser.add_argument('-s', dest='secure', + help='path to secure file') + parser.add_argument('--no-webapp', action='store_true') + return parser + + def exit_handler(self, signum, frame): + self.pool.stop() + if not self.args.no_webapp: + self.webapp.stop() + sys.exit(0) + + def term_handler(self, signum, frame): + os._exit(0) + + def run(self): + self.pool = nodepool.launcher.NodePool(self.args.secure, + self.args.config) + if not self.args.no_webapp: + config = self.pool.loadConfig() + self.webapp = nodepool.webapp.WebApp(self.pool, + **config.webapp) + + signal.signal(signal.SIGINT, self.exit_handler) + # For back compatibility: + signal.signal(signal.SIGUSR1, self.exit_handler) + + signal.signal(signal.SIGTERM, self.term_handler) + + self.pool.start() + + if not self.args.no_webapp: + self.webapp.start() + + while True: + signal.pause() + + +def main(): + return NodePoolLauncherApp.main() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py old mode 100644 new mode 100755 index 5ffe1918b..a73052380 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -14,37 +14,31 @@ # License for the specific language governing permissions and limitations # under the License. -import argparse import logging.config import sys -from nodepool import nodedb -from nodepool import nodepool +from prettytable import PrettyTable + +from nodepool import launcher +from nodepool import provider_manager from nodepool import status from nodepool import zk from nodepool.cmd import NodepoolApp -from nodepool.version import version_info as npc_version_info -from config_validator import ConfigValidator -from prettytable import PrettyTable +from nodepool.cmd.config_validator import ConfigValidator log = logging.getLogger(__name__) class NodePoolCmd(NodepoolApp): - def parse_arguments(self): - parser = argparse.ArgumentParser(description='Node pool.') + def create_parser(self): + parser = super(NodePoolCmd, self).create_parser() + parser.add_argument('-c', dest='config', default='/etc/nodepool/nodepool.yaml', help='path to config file') parser.add_argument('-s', dest='secure', - default='/etc/nodepool/secure.conf', help='path to secure file') - parser.add_argument('-l', dest='logconfig', - help='path to log config file') - parser.add_argument('--version', action='version', - version=npc_version_info.version_string(), - help='show version') parser.add_argument('--debug', dest='debug', action='store_true', help='show DEBUG level logging') @@ -55,6 +49,9 @@ class NodePoolCmd(NodepoolApp): cmd_list = subparsers.add_parser('list', help='list nodes') cmd_list.set_defaults(func=self.list) + cmd_list.add_argument('--detail', action='store_true', + help='Output detailed node info') + cmd_image_list = subparsers.add_parser( 'image-list', help='list images from providers') cmd_image_list.set_defaults(func=self.image_list) @@ -70,13 +67,6 @@ class NodePoolCmd(NodepoolApp): cmd_image_build.add_argument('image', help='image name') cmd_image_build.set_defaults(func=self.image_build) - 
cmd_alien_list = subparsers.add_parser( - 'alien-list', - help='list nodes not accounted for by nodepool') - cmd_alien_list.set_defaults(func=self.alien_list) - cmd_alien_list.add_argument('provider', help='provider name', - nargs='?') - cmd_alien_image_list = subparsers.add_parser( 'alien-image-list', help='list images not accounted for by nodepool') @@ -90,7 +80,8 @@ class NodePoolCmd(NodepoolApp): cmd_hold.set_defaults(func=self.hold) cmd_hold.add_argument('id', help='node id') cmd_hold.add_argument('--reason', - help='Optional reason this node is held') + help='Reason this node is held', + required=True) cmd_delete = subparsers.add_parser( 'delete', @@ -116,7 +107,8 @@ class NodePoolCmd(NodepoolApp): cmd_dib_image_delete = subparsers.add_parser( 'dib-image-delete', - help='delete image built with diskimage-builder') + help='Delete a dib built image from disk along with all cloud ' + 'uploads of this image') cmd_dib_image_delete.set_defaults(func=self.dib_image_delete) cmd_dib_image_delete.add_argument('id', help='dib image id') @@ -125,47 +117,39 @@ class NodePoolCmd(NodepoolApp): help='Validate configuration file') cmd_config_validate.set_defaults(func=self.config_validate) - cmd_job_list = subparsers.add_parser('job-list', help='list jobs') - cmd_job_list.set_defaults(func=self.job_list) + cmd_request_list = subparsers.add_parser( + 'request-list', + help='list the current node requests') + cmd_request_list.set_defaults(func=self.request_list) - cmd_job_create = subparsers.add_parser('job-create', help='create job') - cmd_job_create.add_argument( - 'name', - help='job name') - cmd_job_create.add_argument('--hold-on-failure', - help='number of nodes to hold when this job fails') - cmd_job_create.set_defaults(func=self.job_create) - - cmd_job_delete = subparsers.add_parser( - 'job-delete', - help='delete job') - cmd_job_delete.set_defaults(func=self.job_delete) - cmd_job_delete.add_argument('id', help='job id') - - self.args = parser.parse_args() + return parser def setup_logging(self): + # NOTE(jamielennox): This should just be the same as other apps if self.args.debug: - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(name)s: ' - '%(message)s') + m = '%(asctime)s %(levelname)s %(name)s: %(message)s' + logging.basicConfig(level=logging.DEBUG, format=m) + elif self.args.logconfig: - NodepoolApp.setup_logging(self) + super(NodePoolCmd, self).setup_logging() + else: - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(name)s: ' - '%(message)s') + m = '%(asctime)s %(levelname)s %(name)s: %(message)s' + logging.basicConfig(level=logging.INFO, format=m) + l = logging.getLogger('kazoo') l.setLevel(logging.WARNING) - def list(self, node_id=None): - print status.node_list(self.pool.getDB(), node_id) + def list(self, node_id=None, detail=False): + if hasattr(self.args, 'detail'): + detail = self.args.detail + print(status.node_list(self.zk, node_id, detail)) def dib_image_list(self): - print status.dib_image_list(self.zk) + print(status.dib_image_list(self.zk)) def image_list(self): - print status.image_list(self.zk) + print(status.image_list(self.zk)) def image_build(self, diskimage=None): diskimage = diskimage or self.args.image @@ -180,31 +164,8 @@ class NodePoolCmd(NodepoolApp): self.zk.submitBuildRequest(diskimage) - def alien_list(self): - self.pool.reconfigureManagers(self.pool.config, False) - - t = PrettyTable(["Provider", "Hostname", "Server ID", "IP"]) - t.align = 'l' - with self.pool.getDB().getSession() as session: - for 
provider in self.pool.config.providers.values(): - if (self.args.provider and - provider.name != self.args.provider): - continue - manager = self.pool.getProviderManager(provider) - - try: - for server in manager.listServers(): - if not session.getNodeByExternalID( - provider.name, server['id']): - t.add_row([provider.name, server['name'], - server['id'], server['public_v4']]) - except Exception as e: - log.warning("Exception listing aliens for %s: %s" - % (provider.name, str(e.message))) - print t - def alien_image_list(self): - self.pool.reconfigureManagers(self.pool.config, False) + self.pool.updateConfig() t = PrettyTable(["Provider", "Name", "Image ID"]) t.align = 'l' @@ -213,7 +174,7 @@ class NodePoolCmd(NodepoolApp): if (self.args.provider and provider.name != self.args.provider): continue - manager = self.pool.getProviderManager(provider) + manager = self.pool.getProviderManager(provider.name) # Build list of provider images as known by the provider provider_images = [] @@ -227,11 +188,11 @@ class NodePoolCmd(NodepoolApp): if 'nodepool_build_id' in image['properties']] except Exception as e: log.warning("Exception listing alien images for %s: %s" - % (provider.name, str(e.message))) + % (provider.name, str(e))) alien_ids = [] uploads = [] - for image in provider.images: + for image in provider.diskimages: # Build list of provider images as recorded in ZK for bnum in self.zk.getBuildNumbers(image): uploads.extend( @@ -249,30 +210,46 @@ class NodePoolCmd(NodepoolApp): if image['id'] in alien_ids: t.add_row([provider.name, image['name'], image['id']]) - print t + print(t) def hold(self): - node_id = None - with self.pool.getDB().getSession() as session: - node = session.getNode(self.args.id) - node.state = nodedb.HOLD - if self.args.reason: - node.comment = self.args.reason - node_id = node.id - self.list(node_id=node_id) + node = self.zk.getNode(self.args.id) + if not node: + print("Node id %s not found" % self.args.id) + return + + node.state = zk.HOLD + node.comment = self.args.reason + print("Waiting for lock...") + self.zk.lockNode(node, blocking=True) + self.zk.storeNode(node) + self.zk.unlockNode(node) + self.list(node_id=self.args.id) def delete(self): + node = self.zk.getNode(self.args.id) + if not node: + print("Node id %s not found" % self.args.id) + return + + self.zk.lockNode(node, blocking=True, timeout=5) + if self.args.now: - self.pool.reconfigureManagers(self.pool.config) - with self.pool.getDB().getSession() as session: - node = session.getNode(self.args.id) - if not node: - print "Node %s not found." 
% self.args.id - elif self.args.now: - self.pool._deleteNode(session, node) - else: - node.state = nodedb.DELETE - self.list(node_id=node.id) + if node.provider not in self.pool.config.providers: + print("Provider %s for node %s not defined on this launcher" % + (node.provider, node.id)) + return + provider = self.pool.config.providers[node.provider] + manager = provider_manager.get_provider(provider, True) + manager.start() + launcher.NodeDeleter.delete(self.zk, manager, node) + manager.stop() + else: + node.state = zk.DELETING + self.zk.storeNode(node) + self.zk.unlockNode(node) + + self.list(node_id=node.id) def dib_image_delete(self): (image, build_num) = self.args.id.rsplit('-', 1) @@ -312,53 +289,38 @@ class NodePoolCmd(NodepoolApp): validator = ConfigValidator(self.args.config) validator.validate() log.info("Configuration validation complete") - #TODO(asselin,yolanda): add validation of secure.conf + # TODO(asselin,yolanda): add validation of secure.conf - def job_list(self): - t = PrettyTable(["ID", "Name", "Hold on Failure"]) - t.align = 'l' - with self.pool.getDB().getSession() as session: - for job in session.getJobs(): - t.add_row([job.id, job.name, job.hold_on_failure]) - print t - - def job_create(self): - with self.pool.getDB().getSession() as session: - session.createJob(self.args.name, - hold_on_failure=self.args.hold_on_failure) - self.job_list() - - def job_delete(self): - with self.pool.getDB().getSession() as session: - job = session.getJob(self.args.id) - if not job: - print "Job %s not found." % self.args.id - else: - job.delete() + def request_list(self): + print(status.request_list(self.zk)) def _wait_for_threads(self, threads): for t in threads: if t: t.join() - def main(self): + def run(self): self.zk = None + # no arguments, print help messaging, then exit with error(1) + if not self.args.command: + self.parser.print_help() + return 1 # commands which do not need to start-up or parse config if self.args.command in ('config-validate'): return self.args.func() - self.pool = nodepool.NodePool(self.args.secure, self.args.config) + self.pool = launcher.NodePool(self.args.secure, self.args.config) config = self.pool.loadConfig() # commands needing ZooKeeper if self.args.command in ('image-build', 'dib-image-list', 'image-list', 'dib-image-delete', - 'image-delete', 'alien-image-list'): + 'image-delete', 'alien-image-list', + 'list', 'hold', 'delete', + 'request-list'): self.zk = zk.ZooKeeper() - self.zk.connect(config.zookeeper_servers.values()) - else: - self.pool.reconfigureDatabase(config) + self.zk.connect(list(config.zookeeper_servers.values())) self.pool.setConfig(config) self.args.func() @@ -366,11 +328,9 @@ class NodePoolCmd(NodepoolApp): if self.zk: self.zk.disconnect() + def main(): - npc = NodePoolCmd() - npc.parse_arguments() - npc.setup_logging() - return npc.main() + return NodePoolCmd.main() if __name__ == "__main__": diff --git a/nodepool/cmd/nodepoold.py b/nodepool/cmd/nodepoold.py deleted file mode 100644 index 625e57584..000000000 --- a/nodepool/cmd/nodepoold.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python -# Copyright 2012 Hewlett-Packard Development Company, L.P. -# Copyright 2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import argparse -import daemon -import errno -import extras - -# as of python-daemon 1.6 it doesn't bundle pidlockfile anymore -# instead it depends on lockfile-0.9.1 which uses pidfile. -pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) - -import logging -import os -import sys -import signal - -import nodepool.cmd -import nodepool.nodepool -import nodepool.webapp - -log = logging.getLogger(__name__) - - -def is_pidfile_stale(pidfile): - """ Determine whether a PID file is stale. - - Return 'True' ("stale") if the contents of the PID file are - valid but do not match the PID of a currently-running process; - otherwise return 'False'. - - """ - result = False - - pidfile_pid = pidfile.read_pid() - if pidfile_pid is not None: - try: - os.kill(pidfile_pid, 0) - except OSError as exc: - if exc.errno == errno.ESRCH: - # The specified PID does not exist - result = True - - return result - - -class NodePoolDaemon(nodepool.cmd.NodepoolApp): - - def parse_arguments(self): - parser = argparse.ArgumentParser(description='Node pool.') - parser.add_argument('-c', dest='config', - default='/etc/nodepool/nodepool.yaml', - help='path to config file') - parser.add_argument('-s', dest='secure', - default='/etc/nodepool/secure.conf', - help='path to secure file') - parser.add_argument('-d', dest='nodaemon', action='store_true', - help='do not run as a daemon') - parser.add_argument('-l', dest='logconfig', - help='path to log config file') - parser.add_argument('-p', dest='pidfile', - help='path to pid file', - default='/var/run/nodepool/nodepool.pid') - # TODO(pabelanger): Deprecated flag, remove in the future. - parser.add_argument('--no-builder', dest='builder', - action='store_false') - # TODO(pabelanger): Deprecated flag, remove in the future. - parser.add_argument('--build-workers', dest='build_workers', - default=1, help='number of build workers', - type=int) - # TODO(pabelanger): Deprecated flag, remove in the future. - parser.add_argument('--upload-workers', dest='upload_workers', - default=4, help='number of upload workers', - type=int) - parser.add_argument('--no-deletes', action='store_true') - parser.add_argument('--no-launches', action='store_true') - parser.add_argument('--no-webapp', action='store_true') - parser.add_argument('--version', dest='version', action='store_true', - help='show version') - self.args = parser.parse_args() - - def exit_handler(self, signum, frame): - self.pool.stop() - if not self.args.no_webapp: - self.webapp.stop() - sys.exit(0) - - def term_handler(self, signum, frame): - os._exit(0) - - def main(self): - self.setup_logging() - self.pool = nodepool.nodepool.NodePool(self.args.secure, - self.args.config, - self.args.no_deletes, - self.args.no_launches) - if self.args.builder: - log.warning( - "Note: nodepool no longer automatically builds images, " - "please ensure the separate nodepool-builder process is " - "running if you haven't already") - else: - log.warning( - "--no-builder is deprecated and will be removed in the near " - "future. 
Update your service scripts to avoid a breakage.") - - if not self.args.no_webapp: - self.webapp = nodepool.webapp.WebApp(self.pool) - - signal.signal(signal.SIGINT, self.exit_handler) - # For back compatibility: - signal.signal(signal.SIGUSR1, self.exit_handler) - - signal.signal(signal.SIGUSR2, nodepool.cmd.stack_dump_handler) - signal.signal(signal.SIGTERM, self.term_handler) - - self.pool.start() - - if not self.args.no_webapp: - self.webapp.start() - - while True: - signal.pause() - - -def main(): - npd = NodePoolDaemon() - npd.parse_arguments() - - if npd.args.version: - from nodepool.version import version_info as npd_version_info - print "Nodepool version: %s" % npd_version_info.version_string() - return(0) - - pid = pid_file_module.TimeoutPIDLockFile(npd.args.pidfile, 10) - if is_pidfile_stale(pid): - pid.break_lock() - - if npd.args.nodaemon: - npd.main() - else: - with daemon.DaemonContext(pidfile=pid): - npd.main() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/nodepool/config.py b/nodepool/config.py old mode 100644 new mode 100755 index c9440135b..d9a710d6d --- a/nodepool/config.py +++ b/nodepool/config.py @@ -16,114 +16,56 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os_client_config -from six.moves import configparser as ConfigParser import time import yaml -import fakeprovider -import zk - - -class ConfigValue(object): - def __eq__(self, other): - if isinstance(other, ConfigValue): - if other.__dict__ == self.__dict__: - return True - return False +from nodepool import zk +from nodepool.driver import ConfigValue +from nodepool.driver.fake.config import FakeProviderConfig +from nodepool.driver.openstack.config import OpenStackProviderConfig class Config(ConfigValue): pass -class Provider(ConfigValue): - def __eq__(self, other): - if (other.cloud_config != self.cloud_config or - other.nodepool_id != self.nodepool_id or - other.max_servers != self.max_servers or - other.pool != self.pool or - other.image_type != self.image_type or - other.rate != self.rate or - other.api_timeout != self.api_timeout or - other.boot_timeout != self.boot_timeout or - other.launch_timeout != self.launch_timeout or - other.networks != self.networks or - other.ipv6_preferred != self.ipv6_preferred or - other.clean_floating_ips != self.clean_floating_ips or - other.azs != self.azs): - return False - new_images = other.images - old_images = self.images - # Check if images have been added or removed - if set(new_images.keys()) != set(old_images.keys()): - return False - # check if existing images have been updated - for k in new_images: - if (new_images[k].min_ram != old_images[k].min_ram or - new_images[k].name_filter != old_images[k].name_filter or - new_images[k].key_name != old_images[k].key_name or - new_images[k].username != old_images[k].username or - new_images[k].user_home != old_images[k].user_home or - new_images[k].private_key != old_images[k].private_key or - new_images[k].meta != old_images[k].meta or - new_images[k].config_drive != old_images[k].config_drive): - return False - return True - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "" % self.name - - -class ProviderImage(ConfigValue): - def __repr__(self): - return "" % self.name - - -class Target(ConfigValue): - def __repr__(self): - return "" % self.name - - class Label(ConfigValue): def __repr__(self): return "