tripleo-ansible/playbooks/update_cloud.yml
Julia Kreger 4aeb966792 Fix bootstrap node service stop
The bootstrap node service stop configuration was previously
hardcoded to work only on Helion.

Changes the configuration and process list for upstream use.

Change-Id: I9d57d391f91967ba8c1daf696a354af49c6a2371
2015-01-27 21:22:57 +00:00

537 lines
24 KiB
YAML

# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Pull in step_ping.yml at play level (presumably a reachability check -
# confirm), then prepare the machine that is running Ansible.
- include: step_ping.yml

- hosts: localhost
  name: "Setup local environment for upgrade processes to run"
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: update_local_ssh_config.yml
    - include: step_check_image_vars.yml
# NOTE(review): presumably a play-level hook include that runs before any
# node is touched - confirm against step_pre_hook.yml.
- include: step_pre_hook.yml
# Quiesce the undercloud ahead of its rebuild: reject online upgrades, stop
# and disable the undercloud services, then disable os-collect-config.
- hosts: undercloud
  name: Disable Undercloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # The text has to be passed as msg; a bare string is not picked up by the
    # fail module, which then reports only its generic default message.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on undercloud instances."
      when: online_upgrade is defined
    # service_facts is not a stock module; presumably it provides the
    # existing_services variable consulted below - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Rename /etc/init/mysql.conf to mysql-boot-control.conf; removes= skips
    # the command when the source file is already absent.
    - command: mv -f /etc/init/mysql.conf /etc/init/mysql-boot-control.conf removes=/etc/init/mysql.conf
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_undercloud_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: undercloud_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    - include: disable_os_collect_config.yml
# Quiesce the compute nodes: disable os-collect-config, stop the VMs for an
# offline upgrade, then stop and disable the compute services.
- hosts: nova-compute
  name: Disable Overcloud Compute
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # The text has to be passed as msg; a bare string is not picked up by the
    # fail module. Mapping form is used because the message contains ": ".
    - fail:
        msg: "FAILURE: Cannot perform an online upgrade on nodes that are not in ACTIVE state"
      when: instance_status != "ACTIVE" and online_upgrade is defined
    # VMs are only stopped for the offline (rebuild) path.
    - include: stop_vms.yml
      when: instance_status == "ACTIVE" and online_upgrade is not defined
    # service_facts is not a stock module; presumably it provides the
    # existing_services variable consulted below - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_compute_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_compute_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    # nova-compute is stopped explicitly, independent of the lists above.
    - service:
        name: nova-compute
        state: stopped
        enabled: false
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# Quiesce the swift storage nodes: disable os-collect-config, reject online
# upgrades, then stop and disable the swift services.
- hosts: swift-storage
  name: swift-storage
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # The text has to be passed as msg; a bare string is not picked up by the
    # fail module, which then reports only its generic default message.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on swift instances."
      when: online_upgrade is defined
    # service_facts is not a stock module; presumably it provides the
    # existing_services variable consulted below - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_swift_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_swift_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
# Quiesce the (non-bootstrap) controllers: reject online upgrades, stop and
# disable the controller services, then stop tgt and the metadata proxy.
- hosts: controller
  name: Disable Overcloud Controller
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # The text has to be passed as msg; a bare string is not picked up by the
    # fail module, which then reports only its generic default message.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on controller instances."
      when: online_upgrade is defined
    # service_facts is not a stock module; presumably it provides the
    # existing_services variable consulted below - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_controller_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_controller_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# Quiesce the bootstrap controller: reject online upgrades, stop and disable
# the controller services, then stop tgt and the metadata proxy. A single
# service list (overcloud_controller_services) is used here regardless of
# whether helion is defined.
- hosts: controller-bootstrap
  name: Disable Overcloud Controller Bootstrap node
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # The text has to be passed as msg; a bare string is not picked up by the
    # fail module, which then reports only its generic default message.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on controllerMgmt instances."
      when: online_upgrade is defined
    # service_facts is not a stock module; presumably it provides the
    # existing_services variable consulted below - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_controller_services
      when: instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# Critically, we need to select a single node of the galera cluster to
# be the 'last'. So controller-bootstrap fits that bill for now. We will have
# to select one to be the "special" node eventually, we can do that with
# host facts and conditionals. The last to go down must have the
# Galera bootstrap run on it, or none of them will come up.
# Take MySQL (Galera) and RabbitMQ down on the non-bootstrap controllers, one
# node at a time (serial: 1), refusing to continue when the cluster state
# makes a shutdown unsafe.
- hosts: controller
  name: Stop MySQL/RabbitMQ on controller nodes
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  serial: 1
  tasks:
    # Presumably provides galera_status and galera_cluster_size, which the
    # conditions below read - confirm against galera_status.yml.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    - name: Stop MySQL under normal circumstances
      include: stop_mysql.yml
      when: instance_status == "ACTIVE" and galera_status == "Synced" and galera_cluster_size != "1"
    - name: Stop MySQL if last node in cluster and single_controller flag has been set.
      include: stop_mysql.yml
      when: instance_status == "ACTIVE" and single_controller is defined and galera_status == "Synced" and galera_cluster_size == "1"
    - fail:
        msg: "Galera Replication is out of sync - cannot safely proceed"
      when: single_controller is not defined and instance_status == "ACTIVE" and galera_status == "Out of Sync"
    - fail:
        msg: "Galera Replication - Node appears to be the last node in a cluster - cannot safely proceed unless overriden via single_controller setting - See README.rst"
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_cluster_size == "1"
    # The two rabbitmqctl commands carry the same ACTIVE guard as the
    # rabbitmq-server stop and wait tasks that follow them, so nodes that are
    # not ACTIVE are skipped consistently instead of failing the play.
    - name: Stop RabbitMQ Application for shutdown
      command: rabbitmqctl stop_app
      when: instance_status == "ACTIVE"
    - name: Remove the node from the RabbitMQ cluster
      command: rabbitmqctl reset
      when: instance_status == "ACTIVE"
    - service:
        name: rabbitmq-server
        state: stopped
      when: instance_status == "ACTIVE"
    # Port 3307 is only waited on when helion is defined.
    - name: "Waiting for MySQL to stop"
      wait_for:
        port: 3307
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE" and helion is defined and single_controller is not defined and galera_status == 'Synced'
    - name: "Waiting for rabbitmq-server to stop"
      wait_for:
        port: 5672
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE"
    - include: disable_os_collect_config.yml
# Take MySQL (Galera) and RabbitMQ down on the bootstrap controller. Unless
# single_controller is defined, the play fails when this node is not synced
# or when the reported cluster size is anything other than "1".
- hosts: controller-bootstrap
  name: Stop MySQL/RabbitMQ on Overcloud Controller Bootstrap node
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # Presumably provides galera_status and galera_cluster_size, which the
    # conditions below read - confirm against galera_status.yml.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    - fail:
        msg: "Galera Replication on controller Management is out of sync - cannot safely proceed"
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_status != "Synced"
    - fail:
        msg: "Galera Replication on controller Management - cannot safely proceed as another MySQL cluster node is active."
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_cluster_size != "1"
    - include: stop_mysql.yml
      when: instance_status == "ACTIVE"
    - service:
        name: rabbitmq-server
        enabled: false
        state: stopped
      when: instance_status == "ACTIVE"
    - name: "Waiting for rabbitmq-server to stop"
      wait_for:
        port: 5672
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE"
    - include: disable_os_collect_config.yml
# On systems which had MySQL started at boot-up via os-collect-config, os-collect-config
# must be stopped after MySQL is stopped else MySQL will prematurely exit.
# Final os-collect-config disable pass across every managed group; hosts in
# the "unknown" group are excluded by the trailing !unknown pattern.
- hosts: undercloud:controller:controller-bootstrap:nova-compute:swift-storage:!unknown
  tags: shutdown-cloud
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
      when: instance_status == "ACTIVE"
# Rebuild the undercloud instance from undercloud_rebuild_image_id, saving
# state first when the node is still ACTIVE, then wait for SSH to return.
- hosts: undercloud
  name: Rebuild and Refresh Undercloud
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # Preservation steps only run against a node that is still ACTIVE.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: step_undercloud_backup_tftpboot.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ undercloud_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait, from the control machine, for port 22 to present the expected
    # banner text; a timeout is tolerated (ignore_errors).
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Bring the rebuilt undercloud back into service: restore state, start MySQL
# and RabbitMQ, run os-collect-config, then enable and start the services.
- hosts: undercloud
  name: Enable Undercloud
  sudo: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # NOTE(review): service_facts presumably provides existing_services; if
    # this task is skipped the lookups further down have nothing to consult -
    # confirm the guard is wanted after a rebuild.
    - service_facts:
      when: instance_status == "ACTIVE"
    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml
    - include: step_undercloud_restore_tftpboot.yml
    - include: start_mysql.yml
    - include: start_rabbitmq.yml
    # Fix Ironic Reservations due to bug:
    # https://bugs.launchpad.net/ironic/+bug/1382698
    - include: step_undercloud_ironic_release_reservations.yml
    - include: step_run_occ.yml
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_undercloud_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: undercloud_services
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Rebuild the bootstrap controller from controller_rebuild_image_id, saving
# state first when the node is still ACTIVE, then wait for SSH to return.
- hosts: controller-bootstrap
  name: Rebuild and Refresh controller-bootstrap
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # Preservation and cleanup steps only run against an ACTIVE node.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait, from the control machine, for port 22 to present the expected
    # banner text; a timeout is tolerated (ignore_errors).
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause: seconds=30 prompt="Allowing controller-bootstrap node to settle"
# Bring the rebuilt bootstrap controller up as the first cluster member:
# write configuration, bootstrap MySQL, start keepalived/haproxy (skipped in
# single_controller mode), create the databases and start RabbitMQ.
- hosts: controller-bootstrap
  name: Start initial cluster node
  # Any failed host aborts the play.
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: activate_cinder_volumes.yml
    - include: mysql_init_fix.yml
    - include: rabbitmq_occ_disable.yml
    - include: refresh_config.yml
    - name: "Work around apache2 starting up at boot w/o config..."
      service:
        name: apache2
        enabled: false
        state: stopped
    - include: step_reset_mnt_state_permissions.yml
    # Directly call os-apply-config to write out configuration files.
    - include: step_os-apply-config.yml
    - include: step_generate_hosts_file.yml
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
    - name: Wait for ovs-vswitchd to config during start-up
      pause: minutes=1
    # Everything guarded by "single_controller is not defined" below is the
    # multi-controller path.
    - name: Bootstrap the MySQL cluster
      shell: /etc/init.d/mysql bootstrap-pxc
      when: single_controller is not defined
    - include: start_mysql.yml
    - name: "Start keepalived if not in single_controller mode"
      service:
        name: keepalived
        state: started
        enabled: true
      when: single_controller is not defined
    - name: "Set sysctl net.ipv4.ip_nonlocal_bind for controller-bootstrap group"
      sysctl:
        name: net.ipv4.ip_nonlocal_bind
        value: "1"
        state: present
      when: single_controller is not defined
    - name: "Start haproxy if not in single_controller mode"
      service:
        name: haproxy
        state: started
        enabled: true
      when: single_controller is not defined
    - name: "Pause for 60 seconds if not in single_controller mode"
      pause: seconds=60 prompt="Pausing for 60 seconds to allow keepalived/haproxy to enter operational states"
      when: single_controller is not defined
    - include: step_create_databases.yml
    - include: start_rabbitmq.yml
    - include: step_run_occ.yml
    - name: Wait for Rabbit to listen on its usual port
      wait_for:
        port: 5672
        state: started
        timeout: 90
        delay: 10
    - include: mysql_access_fix.yml
# Rebuild the remaining controllers from controller_rebuild_image_id, saving
# state and stopping services first when the node is still ACTIVE, then wait
# for SSH to return.
- hosts: controller
  name: Rebuild and Refresh Controller
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    # Exactly one of the two step_stop_services.yml includes applies,
    # depending on whether the helion variable is defined.
    - include: step_stop_services.yml
      vars:
        services_to_stop: "{{ overcloud_controller_services }}"
      when: instance_status == "ACTIVE" and helion is not defined
    - include: step_stop_services.yml
      vars:
        services_to_stop: "{{ helion_overcloud_controller_services }}"
      when: instance_status == "ACTIVE" and helion is defined
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait, from the control machine, for port 22 to present the expected
    # banner text; a timeout is tolerated (ignore_errors).
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause: seconds=30 prompt="Allowing controller node to settle."
# Prepare the rebuilt controllers to rejoin the cluster: open the firewall
# ports listed below for MySQL and RabbitMQ, write configuration, wait for
# the node to finish booting and start MySQL.
- hosts: controller
  name: Stop and setup for controller refresh
  # Any failed host aborts the play.
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: activate_cinder_volumes.yml
    # Each rule is inserted at the head of the INPUT chain (-I).
    - name: "Inject Firewall rules for for MySQL to start - tcp/4444"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4444 -j ACCEPT
    - name: "Inject Firewall rules for for MySQL to start - tcp/4567"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4567 -j ACCEPT
    - name: "Inject Firewall rules for for MySQL to start - tcp/4568"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4568 -j ACCEPT
    - name: "Ensure /var/run/rabbitmq is present"
      file:
        path: /var/run/rabbitmq
        state: directory
        owner: rabbitmq
        group: rabbitmq
        mode: "0755"
    - name: "Inject Firewall rules for for RabbitMQ to start - tcp/5672"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 5672 -j ACCEPT
    - name: "Inject Firewall rules for for RabbitMQ to start - tcp/61000"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 61000 -j ACCEPT
    - name: "Inject Firewall rules for for RabbitMQ to start - tcp/4369"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4369 -j ACCEPT
    # Stopping before starting ensures that, should MySQL have been started
    # at boot, it picks up the new configuration that os-collect-config and
    # os-apply-config put in place.
    - include: mysql_init_fix.yml
    - include: rabbitmq_occ_disable.yml
    - include: step_reset_mnt_state_permissions.yml
    - include: refresh_config.yml
    - name: "Work around apache2 starting up at boot w/o config..."
      service:
        name: apache2
        enabled: false
        state: stopped
    # Directly call os-apply-config to write out configuration files in case
    # os-collect-config has failed to reach that step.
    - include: step_os-apply-config.yml
    - include: step_generate_hosts_file.yml
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
    - name: Wait for ovs-vswitchd to config during start-up
      pause: minutes=1
    - include: start_mysql.yml
    - include: mysql_access_fix.yml
# Run step_create_databases.yml on the controllers, one host at a time
# (serial: 1).
- hosts: controller
  name: Initiate Database Creation
  # Any failed host aborts the play.
  max_fail_percentage: 0
  serial: 1
  sudo: true
  tasks:
    - include: step_create_databases.yml
# Finish the controller refresh: start RabbitMQ, rejoin the RabbitMQ cluster,
# run os-collect-config and wait for the broker port to open.
- hosts: controller
  name: Complete Controller Refresh
  # Any failed host aborts the play.
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: start_rabbitmq.yml
    - include: rabbitmq_rejoin_cluster.yml
    - include: step_run_occ.yml
    - name: Wait for Rabbit to listen on its usual port
      wait_for:
        port: 5672
        state: started
        timeout: 120
        delay: 10
# Give RabbitMQ time to start on every controller, query the cluster status
# (multi-controller mode only) and run the start-up cleanup include.
- hosts: controller:controller-bootstrap
  name: Check RabbitMQ
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - pause: seconds=30 prompt="Giving RabbitMQ time to start-up."
    # This play has no play-level sudo, so it is requested on the task.
    - name: Checking rabbitmq cluster status
      sudo: true
      command: rabbitmqctl cluster_status
      when: single_controller is not defined
    - include: cleanup_rabbitmq_start.yml
# Enable and start the Helion bootstrap-controller services, then re-enable
# os-collect-config.
# NOTE(review): unlike the other Enable plays, there is no
# "helion is not defined" counterpart loop here - confirm this is intended.
- hosts: controller-bootstrap
  name: Enable Overcloud controller-bootstrap
  sudo: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # NOTE(review): service_facts presumably provides existing_services; if
    # this task is skipped the lookup below has nothing to consult - confirm
    # the guard is wanted after a rebuild.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_bootstrap_controller_service
      when: helion is defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Enable and start the controller services on the rebuilt controllers, then
# re-enable os-collect-config.
- hosts: controller
  name: Enable Overcloud Controller
  sudo: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    # NOTE(review): service_facts presumably provides existing_services; if
    # this task is skipped the lookups below have nothing to consult -
    # confirm the guard is wanted after a rebuild.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_controller_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_controller_services
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Rebuild the swift storage nodes from swift_storage_rebuild_image_id, saving
# the SSH host keys first when the node is still ACTIVE, then wait for SSH.
- hosts: swift-storage
  name: Rebuild and Refresh swift-storage
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # Skipped when the instance is already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ swift_storage_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Wait, from the control machine, for port 22 to present the expected
    # banner text; a timeout is tolerated (ignore_errors).
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: wait_for port=22 timeout="{{ ssh_timeout }}" host="{{ inventory_hostname }}" search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Run os-collect-config on the rebuilt swift nodes, enable and start the
# swift services, then re-enable os-collect-config.
- hosts: swift-storage
  name: Enable Swift Storage
  sudo: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: step_run_occ.yml
      sudo: true
    # NOTE(review): service_facts presumably provides existing_services; if
    # this task is skipped the lookups below have nothing to consult -
    # confirm the guard is wanted after a rebuild.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_swift_services
      sudo: true
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_swift_services
      sudo: true
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# For online upgrades only: fetch the compute rebuild image.
- hosts: nova-compute
  name: "Download image from glance if online upgrade is being invoked"
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  # Executed one instance at a time because the include downloads files to
  # the local machine where ansible is executing.
  serial: 1
  tasks:
    - include: step_update_online_download_image.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is defined
# Update the compute nodes to nova_compute_rebuild_image_id, either by
# rebuilding the node (default) or via the online path when online_upgrade is
# defined. Facts are gathered for this play.
- hosts: nova-compute
  name: Rebuild and Refresh Nova Compute
  gather_facts: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: step_preserve_iscsi_initiator.yml
      when: instance_status == "ACTIVE"
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    # Offline path: rebuild the node.
    - include: step_update_rebuild_node.yml
      vars:
        instance_id: "{{ instance_id }}"
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is not defined
    # Online path: update in place.
    - include: step_update_online.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is defined
    # Runs for both paths.
    - include: step_stamp_image_id.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
    - include: step_cleanup_from_online_upgrade.yml
      when: online_upgrade is defined
# Bring the updated compute nodes back into service: write configuration,
# wait for the node to settle, run os-collect-config, enable and start the
# compute services, then re-enable os-collect-config.
- hosts: nova-compute
  name: Enable Overcloud Compute
  sudo: true
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: step_os-apply-config.yml
    - include: step_restore_iscsi_initiator.yml
    - pause: seconds=45 prompt="Giving the compute node forty-five seconds to complete existing processes"
      when: online_upgrade is not defined
    # Write out config files, as we might be getting in while the system is
    # still starting up.
    - include: step_cloud_init.yml
      when: online_upgrade is defined
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    # The Open vSwitch waits only apply to the offline (rebuild) path.
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
      when: online_upgrade is not defined
    - name: Wait for ovs-vswitchd to config during start-up
      pause: minutes=1
      when: online_upgrade is not defined
    - include: step_run_occ.yml
    # pause takes its text via prompt=; msg= is not one of its arguments, so
    # the text was not being used as the pause prompt.
    - pause: seconds=30 prompt="Pausing for 30 seconds to allow services to complete start-up."
    # NOTE(review): service_facts presumably provides existing_services; if
    # this task is skipped the lookups below have nothing to consult -
    # confirm the guard is wanted after a rebuild.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two loops below applies, depending on whether the
    # helion variable is defined.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_compute_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_compute_services
      when: helion is not defined and item in existing_services
    - pause: seconds=60 prompt="Giving Open vSwitch time to reconnect"
    # nova-compute should already be started, however this step explicitly
    # sets the service to start upon boot.
    - service:
        name: nova-compute
        state: started
        enabled: true
    - include: enable_start_os_collect_config.yml
# NOTE(review): presumably a play-level hook include that runs after all
# nodes have been updated - confirm against step_post_hook.yml.
- include: step_post_hook.yml