From 87dd74faf09a4d74579e2bfb2ab4c33ea3bfe1d2 Mon Sep 17 00:00:00 2001
From: Charles Short
Date: Mon, 18 Apr 2022 13:21:50 -0400
Subject: [PATCH] debian: Create Debian version of collect

Debian and CentOS use the same tools, but they are installed in
different places. In order for collect to work on Debian, make sure
that we are not trying to use RPMs on Debian. This is done in the
collect-patching script so that the "smart" program is not run. Also,
kdump uses the /var/lib/kdump path on Debian rather than /var/crash
on CentOS. Also checked for 'rpm -qa' usage and changed it to
'dpkg -l'.

Test Plan

PASS Build package
PASS Build and install ISO
PASS Run collect -v --all

Story: 2009101
Task: 43732

Depends-On: https://review.opendev.org/c/starlingx/tools/+/838327

Signed-off-by: Charles Short
Change-Id: I66cf0615f8cab7fe877b6cb09d605557c9258c43
---
 tools/collector/debian-scripts/LICENSE        |  202 ++
 tools/collector/debian-scripts/collect        | 3101 +++++++++++++++++
 .../collector/debian-scripts/collect_ceph.sh  |   81 +
 .../collect_containerization.sh               |  206 ++
 .../debian-scripts/collect_coredump.sh        |   35 +
 .../collector/debian-scripts/collect_crash.sh |   38 +
 tools/collector/debian-scripts/collect_date   | 1064 ++++++
 tools/collector/debian-scripts/collect_dc.sh  |   97 +
 .../collector/debian-scripts/collect_disk.sh  |   28 +
 tools/collector/debian-scripts/collect_fm.sh  |   43 +
 tools/collector/debian-scripts/collect_host   |  488 +++
 tools/collector/debian-scripts/collect_ima.sh |   59 +
 .../debian-scripts/collect_interfaces.sh      |   34 +
 .../debian-scripts/collect_mariadb.sh         |   61 +
 .../debian-scripts/collect_mask_passwords     |  138 +
 .../debian-scripts/collect_networking.sh      |   58 +
 .../debian-scripts/collect_nfv_vim.sh         |   44 +
 .../debian-scripts/collect_openstack.sh       |  154 +
 tools/collector/debian-scripts/collect_ovs.sh |   35 +
 tools/collector/debian-scripts/collect_parms  |   29 +
 .../debian-scripts/collect_patching.sh        |   46 +
 .../debian-scripts/collect_psqldb.sh          |  117 +
 tools/collector/debian-scripts/collect_sm.sh  |   26 +
 .../debian-scripts/collect_sysinv.sh          |  118 +
 tools/collector/debian-scripts/collect_tc.sh  |   82 +
 tools/collector/debian-scripts/collect_utils  |  318 ++
 tools/collector/debian-scripts/etc.exclude    |   41 +
 tools/collector/debian-scripts/expect_done    |    1 +
 tools/collector/debian-scripts/mariadb-cli.sh |  232 ++
 tools/collector/debian-scripts/run.exclude    |   14 +
 tools/collector/debian-scripts/varlog.exclude |    1 +
 tools/collector/debian/meta_data.yaml         |    2 +-
 32 files changed, 6992 insertions(+), 1 deletion(-)
 create mode 100644 tools/collector/debian-scripts/LICENSE
 create mode 100755 tools/collector/debian-scripts/collect
 create mode 100755 tools/collector/debian-scripts/collect_ceph.sh
 create mode 100755 tools/collector/debian-scripts/collect_containerization.sh
 create mode 100644 tools/collector/debian-scripts/collect_coredump.sh
 create mode 100644 tools/collector/debian-scripts/collect_crash.sh
 create mode 100755 tools/collector/debian-scripts/collect_date
 create mode 100755 tools/collector/debian-scripts/collect_dc.sh
 create mode 100644 tools/collector/debian-scripts/collect_disk.sh
 create mode 100644 tools/collector/debian-scripts/collect_fm.sh
 create mode 100755 tools/collector/debian-scripts/collect_host
 create mode 100755 tools/collector/debian-scripts/collect_ima.sh
 create mode 100644 tools/collector/debian-scripts/collect_interfaces.sh
 create mode 100755 tools/collector/debian-scripts/collect_mariadb.sh
 create mode 100644 tools/collector/debian-scripts/collect_mask_passwords
 create mode 100755
tools/collector/debian-scripts/collect_networking.sh create mode 100644 tools/collector/debian-scripts/collect_nfv_vim.sh create mode 100755 tools/collector/debian-scripts/collect_openstack.sh create mode 100644 tools/collector/debian-scripts/collect_ovs.sh create mode 100644 tools/collector/debian-scripts/collect_parms create mode 100755 tools/collector/debian-scripts/collect_patching.sh create mode 100755 tools/collector/debian-scripts/collect_psqldb.sh create mode 100644 tools/collector/debian-scripts/collect_sm.sh create mode 100755 tools/collector/debian-scripts/collect_sysinv.sh create mode 100755 tools/collector/debian-scripts/collect_tc.sh create mode 100755 tools/collector/debian-scripts/collect_utils create mode 100644 tools/collector/debian-scripts/etc.exclude create mode 100755 tools/collector/debian-scripts/expect_done create mode 100755 tools/collector/debian-scripts/mariadb-cli.sh create mode 100644 tools/collector/debian-scripts/run.exclude create mode 100644 tools/collector/debian-scripts/varlog.exclude diff --git a/tools/collector/debian-scripts/LICENSE b/tools/collector/debian-scripts/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/tools/collector/debian-scripts/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/tools/collector/debian-scripts/collect b/tools/collector/debian-scripts/collect
new file mode 100755
index 00000000..bdae3f35
--- /dev/null
+++ b/tools/collector/debian-scripts/collect
@@ -0,0 +1,3101 @@
+#! /bin/bash
+########################################################################
+#
+# Copyright (c) 2014-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+########################################################################
+#
+# Description: The collect tool is used to gather log, config and state
+#              data from one or more hosts or subclouds for the purpose
+#              of off box analysis.
+#
+# The collect tool is implemented as a bash script that executes inline
+# expect scripts and collection commands, some of which require sudo
+# privilege.
+#
+# The collect tool can be run from any host to collect data for that host.
+#
+# The collect tool must be run from an active controller to collect data
+# from its managed hosts or subclouds.
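+#
+# Note: As a minimal illustrative sketch of the Debian adaptation this
+#       patch describes (the /etc/os-release check and the variable names
+#       here are assumptions for illustration, not code used by this
+#       script), the collector plugins select Debian or CentOS tooling
+#       roughly like this:
+#
+#           if grep -q -i debian /etc/os-release ; then
+#               pkg_list_cmd="dpkg -l"          # Debian package listing
+#               crash_dir="/var/lib/kdump"      # Debian kdump path
+#           else
+#               pkg_list_cmd="rpm -qa"          # CentOS package listing
+#               crash_dir="/var/crash"          # CentOS kdump path
+#           fi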
+#
+# Version 2.2 introduces the following behavioral changes.
+#
+# 1. Default to a 1 month date restricted collect. This only affects what
+#    is collected from /var/log. Only log files that contain logs with a
+#    date less than one month old are collected.
+#    Use date options --start-date YYYYMMDD and/or --end-date YYYYMMDD to
+#    specify a more precise date range if only older logs or only more
+#    recent logs are required.
+#
+# 2. Collect for subclouds is added with the --subcloud or -sc option.
+#    With this option specified, collect will collect from all the hosts
+#    in the specified subcloud(s).
+#    All the typical scope and naming options like --list, --all and
+#    --name also apply to subcloud collections, with the exception that
+#    collection of a subcloud from the system controller includes all the
+#    hosts in that subcloud.
+#
+# 3. Default to collecting from hosts or subclouds in parallel. Parallel
+#    collect reduces the overall collect time for the specified system.
+#    Collect now launches host or subcloud collect requests as background
+#    threads and monitors for completion or error before moving on to
+#    create the final tarball collect bundle.
+#
+#    The previous default one-by-one or one-after-the-other mode remains
+#    supported with the introduction and use of the --inline or -in
+#    command option.
+#
+# Typical Usages:
+#
+#    command line                      collect data for function
+#    ---------------------------      -------------------------------------
+#    > collect                        - collect current host ; any host
+#    > collect host1                  - collect from specified host
+#    > collect --list host1 host2     - collect from a list of hosts
+#    > collect --all                  - collect all hosts in controller context
+#    > collect --all --subcloud       - collect all system controller subclouds
+#    > collect --subcloud --list ...  - collect from a list of subclouds
+#    > collect --all --inline         - collect all hosts one after the other
+#
+# See --help output for a complete list of full and abbreviated
+# command line options.
+#
+# Example Output for some typical usages:
+#
+# Any single host collect
+#
+# compute-0:~$ collect
+# [sudo] password for sysadmin:
+# collecting data from 1 host(s): compute-0
+# collecting compute-0_20210806.145159 ... done (00:02:23 55M)
+# creating single-node tarball /scratch/compute-0_20210806.145159.tar ... done (00:02:23 55M)
+#
+#
+# An AIO-DX system collect
+#
+# controller-0:~$ collect -a
+# [sudo] password for sysadmin:
+# collecting data from 2 host(s): controller-0 controller-1
+# collected controller-1_20210805.193726 ... done (00:01:35 87M)
+# collected controller-0_20210805.193726 ... done (00:02:53 135M)
+# creating all-nodes tarball /scratch/ALL_NODES_20210805.193726.tar ... done (00:02:53 221M)
+#
+#
+# A parallel collect of a storage system
+#
+# controller-0:~$ collect --all
+# [sudo] password for sysadmin:
+# collecting data from 8 host(s): controller-0 compute-0 compute-1 compute-2 compute-3 controller-1 storage-0 storage-1
+# collected compute-1_20210714.195247 ... done (00:00:57 14M)
+# collected compute-2_20210714.195247 ... done (00:00:57 14M)
+# collected controller-1_20210714.195247 ... done (00:01:02 16M)
+# collected storage-1_20210714.195247 ... done (00:01:05 13M)
+# collected storage-0_20210714.195247 ... done (00:01:06 13M)
+# collected compute-3_20210714.195247 ... done (00:02:07 14M)
+# collected controller-0_20210714.195247 ... done (00:02:11 29M)
+# collected compute-0_20210714.195247 ... done (00:03:02 14M)
+# creating all-nodes tarball /scratch/ALL_NODES_20210714.195247.tar ... done (00:03:02 124M)
+#
+#
+# A parallel collect of all (3) subclouds in a system
+#
+# controller-0:~$ collect --all --subcloud
+# [sudo] password for sysadmin:
+# collecting data from 3 subcloud(s): subcloud1 subcloud2 subcloud3
+# collected subcloud3_20210811.120100 ... done (00:01:47 64M)
+# collected subcloud2_20210811.120100 ... done (00:02:50 71M)
+# collected subcloud1_20210811.120100 ... done (00:03:46 75M)
+# creating all-subclouds tarball /scratch/SUBCLOUDS_20210811.120100.tar ... done (00:03:47 209M)
+#
+#
+# An inline collect of all (3) subclouds in a system
+#
+# controller-0:~$ collect --all --subcloud --inline
+# [sudo] password for sysadmin:
+# collecting data from 3 subcloud(s): subcloud1 subcloud2 subcloud3
+# collecting subcloud1_20210811.140525 ... done (00:02:55 79M)
+# collecting subcloud2_20210811.140525 ... done (00:02:59 74M)
+# collecting subcloud3_20210811.140525 ... done (00:01:47 69M)
+# creating all-subclouds tarball /scratch/SUBCLOUDS_20210811.140525.tar ... done (00:07:41 221M)
+#
+#
+# Collect Output:
+#
+# Collect output is a tar file bundle containing compressed tarballs
+# from each host or subcloud. A default named full system collect
+# looks like this:
+#
+#     /scratch/ALL_NODES_20210805.193726.tar
+#
+# or for subcloud(s) collect
+#
+#     /scratch/SUBCLOUDS_20210805.192122.tar
+#
+# scp the tarball bundle off box and extract the bundle to reveal its content.
+#
+# Extract the host tarballs with tar into that bundle's named dir
+#
+# myhost~$ tar -xvf ALL_NODES_20210805.193726.tar
+#     ALL_NODES_20210805.193726/controller-0_20210805.193726.tgz
+#     ALL_NODES_20210805.193726/controller-1_20210805.193726.tgz
+#
+# For a subcloud tar bundle
+#
+# myhost~$ tar -xvf SUBCLOUDS_20210805.192122.tar
+#     SUBCLOUDS_20210805.192122/subcloud1_20210805.192122.tar
+#     SUBCLOUDS_20210805.192122/subcloud2_20210805.192122.tar
+#     SUBCLOUDS_20210805.192122/subcloud3_20210805.192122.tar
+#     SUBCLOUDS_20210805.192122/subcloud4_20210805.192122.tar
+#
+# The subcloud bundles have an additional tar level
+#
+# myhost SUBCLOUDS_20210805.192122 $ sudo tar -xvf subcloud1_20210805.192122.tar
+#     subcloud1_20210805.192122/controller-0_20210805.192122.tgz
+#     subcloud1_20210805.192122/controller-1_20210805.192122.tgz
+#     subcloud1_20210805.192122/compute-1_20210805.192122.tgz
+#
+# Host tarball content structure
+#
+#     - etc  ... config data
+#     - root ... root dir content
+#     - var
+#       |- crash  ... crash bundle summary files
+#       |- lib/sm ... sm flight recorder
+#       |- log    ... the system logs
+#       |- run    ... volatile run dir
+#       |- extra  ... info files produced from /etc/collect.d plugins
+#                 ... area specific configuration and data
+#                 ... all databases in plain text ; except for keystone
+#
+# Exclusions from /etc, /var/run and /var/log are in the /etc/collect exclude files.
+#
+# Behavior   : See print_help below.
+#
+# Collect can be run to collect local hosts or it can be run to collect
+# subclouds using the --subcloud or -sc option. The tool does not support
+# collecting both in one command.
+#
+# The collect tool produces execution summary logs in /var/log/user.log and
+# more detailed logs in /var/log/collect.log
+#
+# Collect cleans up after itself, meaning that collected tarballs on
+# remote hosts are removed after they are fetched by the active controller.
+#
+# The script first collects the process, host, memory, filesystem, interrupt
+# and HA information. It then proceeds to call run-parts against the
+# /etc/collect.d directory (plugins), which contains service level collectors.
+# Additional plugins can be added to that collect.d directory and will be
+# called automatically.
+#
+# The collector scripts must consider nodetype when deciding
+# which commands to execute where.
+#
+##################################################################
+
+
+TOOL_NAME="collect"
+TOOL_VER=2
+TOOL_REV=2
+
+# only supported username
+UN="sysadmin"
+pw=""
+
+# pull in common utils and environment
+source /usr/local/sbin/collect_utils
+
+declare -i RETVAL=${FAIL}
+function collect_exit()
+{
+    # support accepting the exit code as arg1
+    if [ ${#} -ne 0 ] ; then
+        RETVAL=${1}
+    fi
+    exit ${RETVAL}
+}
+
+# collect must be run as sysadmin
+if [ ${UID} -eq 0 ]; then
+    elog "Cannot run collect as 'root' user"
+    collect_exit
+elif [ "${USER}" != "${UN}" ]; then
+    elog "Can only run collect as '${UN}' user"
+    collect_exit
+fi
+
+source_openrc_if_needed
+
+# used to hold the name of the password file used to pass
+# the sudo password to a subcloud
+TEMPFILE=""
+
+###########################################################################
+#
+# Trap Handling
+#
+###########################################################################
+function cleanup()
+{
+    # kill all processes whose parent is this process
+    pkill -P $$
+
+    # remove the tempfile if it somehow still exists
+    if [ "${TEMPFILE}" != "" ]; then
+        rm -f ${TEMPFILE}
+    fi
+    collect_exit
+}
+
+TRAP_RESET_GATE=false
+function cleanup_with_reset()
+{
+    # prevent reset from being called for every trap definition
+    if [ "${TRAP_RESET_GATE}" = false ] ; then
+        $(reset)
+        TRAP_RESET_GATE=true
+    fi
+    cleanup
+    collect_exit
+}
+
+# Handle exit signals
+trap cleanup_with_reset SIGINT   # Control-C
+trap cleanup_with_reset SIGTERM  # administrative process termination
+trap cleanup EXIT                # clean exit
+
+############################################################################
+
+# static expect log level control ;
+#   0 = hide expect output
+#   1 = show expect output
+USER_LOG_MODE=0
+
+# limit scp bandwidth to 10MB/s (increased from the original 1MB/s)
+SCP_CMD="scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PreferredAuthentications=password -o PubkeyAuthentication=no -l $((10*8*1000))"
+SCP_TIMEOUT="600"
+SSH_CMD="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PreferredAuthentications=password -o PubkeyAuthentication=no"
+NOWDATE=$(date +"%Y%m%d.%H%M%S")
+COLLECT_BASE_DIR="/scratch"
+collect_host="/usr/local/sbin/collect_host"
+collect="/usr/local/sbin/collect"
+
+
+# This is set true on the subcloud when doing an orchestrated collect
+ORCHESTRATED_COLLECT=false
+
+CURR_DIR=$(pwd)
+
+
+# common permission error strings
+pw_error="orry, try again"
+ac_error="ermission denied"
+
+function print_help()
+{
+    echo ""
+    echo "StarlingX Log Collection Tool, version ${TOOL_VER}.${TOOL_REV}"
+    echo ""
+    echo "Usage: ${TOOL_NAME} [COMMANDS ...] {options}"
+    echo ""
+    echo "StarlingX 'collect' is used to gather system logs, configuration"
+    echo "and state data for off system analysis."
+    echo ""
+    echo "Running collect will collect logs to /scratch/"
+    echo "on the host collect is run from. Use host names to specify which"
+    echo "hosts or subclouds to collect from."
+    echo ""
+    echo "Host data collection scope can be the current host or subcloud,"
+    echo "any single specified hostname or subcloud, a --list of hosts, or"
+    echo "--all hosts or subclouds in the system, using a single command."
+ echo "" + echo "Hosts or subclouds are collected in parallel unless the --inline" + echo "or -in option is specified forcing a one after the other collect." + echo "" + echo "Collect gathers /var/log files that contain logs that are dated" + echo "less than a month old so as to limited the size of collect bundles." + echo "Optionally specify --start-date and/or --end-date options to refine" + echo "the collected date range. Only logs files in /var/log are affected" + echo "by these date options." + echo "" + echo "Optionally specify a --name prefix to rename the final collected" + echo "dated tar bundle." + echo "" + echo "With the command set specified, simply run collect as sysadmin and when" + echo "prompted provide the sysadmin sudo password and let collect handle the rest." + echo "" + echo "Scope Options:" + echo "" + echo " collect ... collect logs for current host" + echo " collect host1 ... collect logs for single named host" + echo " collect host1 host2 host3 ... collect logs for stacked host list" + echo " collect [--list | -l] host1 host2 host3 ... collect logs for list of named hosts" + echo " collect [--all | -a] ... collect logs for all hosts" + echo " collect -a ... collect logs for all hosts in parallel" + echo " collect -a [--inline | -in] ... collect logs for all hosts one after the other" + echo "" + echo " collect [--subcloud | -sc ] ... collect logs for subcloud" + echo " collect [--subcloud | -sc ] -l subc1 subc2 ... collect logs for subclouds subc1 and subc2" + echo " collect -a [--subcloud | -sc ] ... collect logs for all subclouds in parallel" + echo " collect -a -sc [--inline | -in] ... collect logs for all subclouds one after the other" + echo " collect --subcloud --continue ... continue a suspended subcloud collect" + echo "" + echo "Dated Collect:" + echo "" + echo "collect [--start-date | -s] YYYYMMDD ... collection of logs on and after this date" + echo "collect [--end-date | -e] YYYYMMDD ... collection of logs on and before this date" + echo "" + echo "Tarball Prefix:" + echo "" + echo "collect [--name | -n] name ... specify the name prefix of the collect tarball" + echo "" + echo "Detailed Display:" + echo "" + echo "collect [--verbose | -v] ... print details during collect" + echo "" + echo "collect [--inventory | -i] ... collect inventory by system cli command" + echo "" + echo "Avoid password and security masking:" + echo "" + echo "collect [--skip-mask] ... skip masking of collect data" + echo "" + echo "Examples:" + echo "" + echo "collect ... all logs for current host" + echo "collect --all ... all logs from all hosts in the system" + echo "collect --all --subcloud ... all logs from all hosts in all subclouds" + echo "collect --all --start-date 20150101 ... logs dated on and after Jan 1 2015 from all hosts" + echo "collect --all --start-date 20151101 --end-date 20160201 ... logs dated between Nov 1, 2015 and Feb 1 2016 from all hosts" + echo "collect --list controller-0 worker-0 storage-0 ... all logs from specified host list" + echo "collect --list controller-0 worker-1 --end-date 20160201 ... only logs before Nov 1, 2015 for host list" + echo "collect --list controller-1 storage-0 --start-date 20160101 ... only logs after Jan 1 2016 for controller-1 and storage-0" + echo "collect --start-date 20151101 --end-date 20160201 ... only logs dated between Nov 1, 2015 and Feb 1 2016 for current host" + echo "collect --subcloud subcloud1 subcloud2 subcloud3 ... 
only logs from a list of subclouds" + echo "" + exit 0 +} + +# command line arguement variables ; defaulted +DEBUG=false +CLEAN=false +VERBOSE=false +SKIP_MASK=false +INVENTORY=false +SUBCLOUD_COLLECT=false +SUBCLOUD_LOGIN_PROMPT="controller-" + +# parallel collect mode as default +PARALLEL_COLLECT_MODE=true + +# date variables - default to a 1 month dated collect +DATE_FORMAT="YYYYMMDD" +STARTDATE=$(date +%Y%m%d -d "-1 month") +STARTTIME="any" +ENDDATE="any" +ENDTIME="any" +GETSTARTDATE=false +GETENDDATE=false +DCROLE="" + +# host selection variables +LISTING=false +ALLHOSTS=false + +declare -i HOSTS=1 +declare -a HOSTLIST=(${HOSTNAME}) +declare -i SUBCLOUDS=0 +declare -a SUBCLOUDLIST=() +declare -i DONE_COUNT=0 +declare -i longest_name=0 + +PLEASE_STANDBY=false +COLLECT_CONTINUE_MSG_NEEDED=false +SUBCLOUD_COLLECT_CONTINUE=false +SUBCLOUD_COLLECT_CONTINUE_LIST_FILE="/tmp/collect_continue.lst" + +# overall collect timeout +TIMEOUT=1000 +SECONDS=0 +let UNTIL=${SECONDS}+${TIMEOUT} + +COLLECT_NAME="" + +# clear multi option modes +function clear_variable_args() +{ + LISTING=false + GETSTARTDATE=false + GETENDDATE=false +} + + +############################################################################ +# +# Name : report_error +# +# Purpose : Report error to console and logfile +# +# Assumptions: Handles specific cases of invalid password and permission errors +# by exiting so as to avoid repeated errors during multi-host +# collection. +# +# $1 - status string +# $2 - status code number +# +function report_error() +{ + local string=${1} + local code=${2} + + if [[ "${PARALLEL_COLLECT_MODE}" = true && "${PLEASE_STANDBY}" = true && ${DONE_COUNT} -eq 0 ]] ; then + DONE_COUNT=$((DONE_COUNT+1)) + # send new line to delineate '.' progress + echo "" + PLEASE_STANDBY=false + fi + + if [ ${code} -eq ${FAIL_PASSWORD} ] ; then + elog "Invalid password" + collect_exit ${code} + + elif [ ${code} -eq ${FAIL_CONTINUE} ] ; then + elog "${FAIL_CONTINUE_STR} ; ${string} (reason:${code})" + collect_exit ${code} + + elif [ ${code} -eq ${FAIL_INACTIVE} ] ; then + elog "${FAIL_INACTIVE_STR} ; ${string} (reason:${code})" + collect_exit ${code} + + elif [ ${code} -eq ${FAIL_PERMISSION} ] ; then + elog "Permission error ; exiting (${string})" + + elif [ ${code} -eq ${FAIL_UNREACHABLE} ] ; then + elog "${string} (reason:${code}:unreachable)" + + elif [ ${code} -eq ${FAIL_PERMISSION_SKIP} ] ; then + elog "${string} (reason:${code}:permission error)" + + elif [ ${code} -eq ${FAIL_OUT_OF_SPACE} ] ; then + elog "${string} (reason:${code}:${FAIL_NOT_ENOUGH_SPACE_STR}) ; need to increase available space ${COLLECT_BASE_DIR}" + + elif [ ${code} -eq ${FAIL_INSUFFICIENT_SPACE} ] ; then + elog "${string} (reason:${code}:${FAIL_NOT_ENOUGH_SPACE_STR}) ; ${COLLECT_BASE_DIR} usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%" + + elif [ ${code} -ge ${FAIL_TIMEOUT} -a ${code} -le ${FAIL_TIMEOUT9} ] ; then + elog "${FAIL_TIMEOUT_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_SUBCLOUD_TIMEOUT} ] ; then + elog "${FAIL_SUBCLOUD_TIMEOUT_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_PASSWORD_PROMPT} ] ; then + elog "${string} (reason:${code}:failed to learn password)" + + elif [ ${code} -eq ${FAIL_DATE_FORMAT} ] ; then + elog "${FAIL_DATE_FORMAT_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_NO_FILE_SPECIFIED} ] ; then + elog "${FAIL_NO_FILE_SPECIFIED_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_FILE_NOT_FOUND} ] ; then + elog "${FAIL_FILE_NOT_FOUND_STR} ; 
${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_FILE_EMPTY} ] ; then + elog "${FAIL_FILE_EMPTY_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_NO_HOSTS} ] ; then + elog "${FAIL_NO_HOSTS_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_NO_SUBCLOUDS} ] ; then + elog "${FAIL_NO_SUBCLOUDS_STR} ; ${string} (reason:${code})" + + elif [ ${code} -eq ${FAIL_MISSING_PARAMETER} ] ; then + elog "${FAIL_MISSING_PARAMETER_STR} ; ${string} (reason:${code})" + + else + elog "${string} (reason:${code})" + fi +} + +########################################################################### +# +# Name : is_valid_host +# +# Purpose : Checks to see if the specified hostname is known +# to inventory as a valid provisioned host +# +# Parameters: $1 check_hostname +# +# Return : PASS ... hostname is valid (success path) +# FAIL_HOSTNAME ... hostname is not valid +# FAIL_INACTIVE ... this host is not active +# +########################################################################### + +function is_valid_host() +{ + local check_hostname=${1} + + if [ "${check_hostname}" == "None" ] ; then + return ${FAIL_HOSTNAME} + elif [ "${check_hostname}" == "${HOSTNAME}" ] ; then + return ${PASS} + elif [ "${ACTIVE}" = true ] ; then + system host-show "${check_hostname}" 2>/dev/null 1>/dev/null + if [ ${?} -ne 0 ] ; then + return ${FAIL_HOSTNAME} + else + return ${PASS} + fi + else + report_error "can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} + collect_exit ${FAIL_INACTIVE} + fi +} + +########################################################################### +# +# Name : is_valid_subcloud +# +# Purpose : Checks to see if the specified subcloud name is known +# to dcmanager as a valid provisioned subcloud +# +# Parameters: $1 check_subcloudname +# +# Return : PASS ... subcloudname is valid (success path) +# FAIL_SUBCLOUDNAME ... subcloudname is not valid +# FAIL_INACTIVE ... 
this host is not the active controller +# +########################################################################### + +function is_valid_subcloud() +{ + local check_subcloudname=${1} + + if [ "${check_subcloudname}" == "None" ] ; then + return ${FAIL_SUBCLOUDNAME} + elif [ "${ACTIVE}" = true ] ; then + dcmanager subcloud show "${check_subcloudname}" 2>/dev/null 1>/dev/null + if [ ${?} -ne 0 ] ; then + return ${FAIL_SUBCLOUDNAME} + else + return ${PASS} + fi + else + report_error "can only run collect for subclouds from the active system controller" ${FAIL_INACTIVE} + collect_exit ${FAIL_INACTIVE} + fi +} + +function query_and_update_dcrole () +{ + DCROLE=$(system show | grep distributed_cloud_role | cut -d '|' -f 3 | tr -d ' ') +} + +############################################################################ +# Parse the command line # +############################################################################ + +# echo "`date` Debug: collect ${@}" + +while [[ ${#} -gt 0 ]] ; do + + key="${1}" + + case $key in + + -h|--help) + print_help + collect_exit ${PASS} + ;; + + -n|--name) + if [ "${2}" == "" ] ; then + report_error "need to specify a name with the --name option" ${FAIL_MISSING_PARAMETER} + collect_exit ${FAIL_MISSING_PARAMETER} + fi + COLLECT_NAME="${2}" + clear_variable_args + shift + ;; + + -v|--verbose) + USER_LOG_MODE=1 + VERBOSE=true + ;; + + --clean) + CLEAN=true + ;; + + -c|--continue) + SUBCLOUD_COLLECT_CONTINUE=true + ;; + + -i|--inventory) + INVENTORY=true + ;; + + -l|--list) + if [ "${ALLHOSTS}" = false ] ; then + if [[ ${#} -lt 2 ]] ; then + report_error "collect exit" ${FAIL_NO_HOSTS} + collect_exit ${FAIL_NO_HOSTS} + fi + if [ "${ACTIVE}" = false ] ; then + report_error "can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} + collect_exit ${FAIL_INACTIVE} + fi + HOSTLIST=(${2}) + HOSTS=1 + LISTING=true + GETSTARTDATE=false + GETENDDATE=false + shift + fi + ;; + + -a|--all|all) + if [ "${ACTIVE}" = false ] ; then + wlog "collect with '${key}' option is only supported on an active controller ; defaulting to local collect" + else + ALLHOSTS=true + fi + + HOSTLIST=(${HOSTNAME}) + HOSTS=1 + clear_variable_args + ;; + + -s|--start-date) + if [ "${2}" == "" ] ; then + report_error "need to specify a date with the --start-date option" ${FAIL_MISSING_PARAMETER} + collect_exit ${FAIL_MISSING_PARAMETER} + elif [ "${2}" != "any" -a ${#2} -ne ${#DATE_FORMAT} ] ; then + report_error "start date must be '${DATE_FORMAT}' format" ${FAIL_DATE_FORMAT} + collect_exit ${FAIL_DATE_FORMAT} + fi + STARTDATE="${2}" + LISTING=false + GETSTARTDATE=true + GETENDDATE=false + shift + ;; + + -e|--end-date) + if [ "${2}" == "" ] ; then + report_error "need to specify a date with the --end-date option" ${FAIL_MISSING_PARAMETER} + collect_exit ${FAIL_MISSING_PARAMETER} + elif [ "${2}" != "any" -a ${#2} -ne ${#DATE_FORMAT} ] ; then + report_error "end date must be '${DATE_FORMAT}' format" ${FAIL_DATE_FORMAT} + collect_exit ${FAIL_DATE_FORMAT} + fi + ENDDATE="${2}" + LISTING=false + GETSTARTDATE=false + GETENDDATE=true + shift + ;; + + -sc|--subcloud) + SUBCLOUD_COLLECT=true + ;; + + -d|--debug) + DEBUG=true + clear_variable_args + ;; + + --skip-mask) + SKIP_MASK=true + shift + ;; + + -in|--inline) + # switch to inline ; one-after-the-other (legacy) mode + PARALLEL_COLLECT_MODE=false + ;; + + -f|--file) + TEMPFILE="${2}" + if [ "${TEMPFILE}" == "" ]; then + report_error "need file path/name to follow --file option" ${FAIL_NO_FILE_SPECIFIED} + collect_exit 
${FAIL_NO_FILE_SPECIFIED} + elif [ ! -e "${TEMPFILE}" ]; then + report_error "check path/file: ${TEMPFILE}" ${FAIL_NO_FILE_SPECIFIED} + collect_exit ${FAIL_NO_FILE_SPECIFIED} + elif [ ! -s "${TEMPFILE}" ] ; then + report_error "file:${TEMPFILE}" ${FAIL_FILE_EMPTY} + rm -f ${TEMPFILE} + collect_exit ${FAIL_FILE_EMPTY} + else + # read first line in file + pw=$(head -n 1 ${TEMPFILE}) + dlog "pw:${pw}" + rm -f ${TEMPFILE} + shift + fi + ;; + + *) + if [ "${LISTING}" = true ] ; then + HOSTS=$((HOSTS+1)) + HOSTLIST+=(${key}) + elif [ "${GETSTARTDATE}" = true ] ; then + dlog "accepting but ignoring legacy starttime specification" + elif [ "${GETENDDATE}" = true ] ; then + dlog "accepting but ignoring legacy endtime specification" + else + HOSTLIST=(${key}) + HOSTS=1 + LISTING=true + fi + GETSTARTDATE=false + GETENDDATE=false + ;; + esac + shift # past argument or value +done + + +# startup state debug logs +dlog "${TOOL_NAME} ver ${TOOL_REV}.${TOOL_REV} (pid:$$)" +dlog "USERNAME = ${USER}" +dlog "ACTIVE = ${ACTIVE}" +dlog "HOSTNAME = ${HOSTNAME}" +dlog "PARALLEL = ${PARALLEL_COLLECT_MODE}" +dlog "INVENTORY = ${INVENTORY}" +dlog "STARTDATE = ${STARTDATE}" +dlog "ENDDATE = ${ENDDATE}" +dlog "SKIPMASK = ${SKIP_MASK}" +dlog "ALLHOSTS = ${ALLHOSTS}" +dlog "LISTING = ${LISTING}" +dlog "CLEAN = ${CLEAN}" +dlog "TIMEOUT = ${TIMEOUT}" +dlog "SECONDS = ${SECONDS}" +dlog "UNTIL = ${UNTIL}" + +# the continue option is only supported for subcloud collect +if [[ "${SUBCLOUD_COLLECT_CONTINUE}" = true && "${SUBCLOUD_COLLECT}" = false ]] ; then + report_error "collect continue is only supported for subclouds" ${FAIL_CONTINUE} + collect_exit ${FAIL_CONTINUE} +fi + +# subcloud option only on active SystemController +if [[ "${ACTIVE}" = false && "${SUBCLOUD_COLLECT}" = true ]] ; then + report_error "subcloud collect can only be run from an active systemcontroller" ${FAIL_INACTIVE} + collect_exit ${FAIL_INACTIVE} +fi + +# Don't block the clean operation based on avalable space. +# That would defeat the purpose. +if [ "${CLEAN}" = false ] ; then + space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR} +fi + +# +# If on the active controller load the DCROLE variable and +# handle subcloud collect from non SC +# +if [ "${ACTIVE}" = true ] ; then + query_and_update_dcrole + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + if [ "${DCROLE}" != "${DCROLE_SYSTEMCONTROLLER}" ] ; then + report_error "must run subcloud collect from the systemcontroller" ${FAIL_NOT_SYSTEMCONTROLLER} + collect_exit ${FAIL_NOT_SYSTEMCONTROLLER} + fi + fi +fi + +# +# if the user specified the '--all' option then override +# the current list and add them all from inventory. +# +if [ "${ALLHOSTS}" = true ] ; then + HOSTLIST=() + HOSTS=0 + SUBCLOUDLIST=() + SUBCLOUDS=0 + if [ "${SUBCLOUD_COLLECT}" = false ]; then + HOSTLIST=(${HOSTNAME}) + HOSTS=1 + for foreign_host in $(system host-list | grep '[0-9]' | cut -d '|' -f 3 | tr -d ' ' | grep -v ${HOSTNAME}); do + if [ "${foreign_host}" != "None" ] ; then + HOSTS=$((HOSTS+1)) + HOSTLIST+=(${foreign_host}) + fi + done + + else + for foreign_host in $(dcmanager subcloud list | grep '[0-9]' | cut -d '|' -f 3 | tr -d ' '); do + if [ "${foreign_host}" != "None" ] ; then + SUBCLOUDS=$((SUBCLOUDS+1)) + SUBCLOUDLIST+=(${foreign_host}) + fi + done + fi +else + # This host path + # Filter default or user specified host list through temp_hostlist + # This drops rather than deletes invalid or duplicate hosts. 
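+    #
+    # Illustrative sketch of the validate-and-drop-duplicates pattern the
+    # loops below implement (condensed restatement for clarity only ; the
+    # real code follows and additionally warns on unknown names):
+    #
+    #     for host in "${temp_hostlist[@]}" ; do
+    #         is_valid_host "${host}" || continue        # drop unknown hosts
+    #         drop=false
+    #         for tmp in "${HOSTLIST[@]}" ; do           # drop duplicates
+    #             [ "${host}" == "${tmp}" ] && drop=true && break
+    #         done
+    #         [ "${drop}" = false ] && HOSTLIST+=("${host}")
+    #     done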
+    temp_hostlist=(${HOSTLIST[@]})
+    temp_hosts=${HOSTS}
+    HOSTLIST=()
+    HOSTS=0
+    SUBCLOUDLIST=()
+    SUBCLOUDS=0
+
+    # check for and handle collect --continue
+    if [ "${SUBCLOUD_COLLECT_CONTINUE}" = true ] ; then
+        if [ -f "${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}" ] && \
+           [ -s "${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}" ] ; then
+            SUBCLOUDLIST=($( cat ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}))
+            SUBCLOUDS=${#SUBCLOUDLIST[@]}
+            dlog "continuing collect for remaining ${SUBCLOUDS} subclouds: ${SUBCLOUDLIST[@]}"
+        else
+            report_error "the ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE} file is empty or missing" ${FAIL_CONTINUE}
+        fi
+
+    elif [ "${SUBCLOUD_COLLECT}" = false ] ; then
+        if [ ${temp_hosts} -eq 0 ] ; then
+            report_error "no hosts specified" ${FAIL_NO_HOSTS}
+            collect_exit ${FAIL_NO_HOSTS}
+        else
+            for host in "${temp_hostlist[@]}" ; do
+                is_valid_host ${host}
+                if [ ${?} -eq 0 ] ; then
+                    # don't add duplicates
+                    drop=false
+                    for tmp in "${HOSTLIST[@]}" ; do
+                        if [ "${host}" == "${tmp}" ] ; then
+                            drop=true
+                            break
+                        fi
+                    done
+                    if [ "${drop}" = false ] ; then
+                        # add this host
+                        HOSTS=$((HOSTS+1))
+                        HOSTLIST+=("${host}")
+                    fi
+                else
+                    report_error "cannot collect data from unknown host '${host}'" ${WARN_HOSTNAME}
+                fi
+            done
+        fi
+    else
+        if [ ${temp_hosts} -eq 0 ] ; then
+            report_error "no subclouds specified" ${FAIL_NO_SUBCLOUDS}
+            collect_exit ${FAIL_NO_SUBCLOUDS}
+        # don't query a large number of subclouds individually, as
+        # that can take a long time. Instead, get the full list and
+        # validate the specified list against the full list
+        elif [ ${temp_hosts} -gt 10 ] ; then
+            SUBCLOUDLIST_TEMP=()
+            # build a temporary list of all known subclouds for validation
+            for foreign_host in $(dcmanager subcloud list | grep '[0-9]' | cut -d '|' -f 3 | tr -d ' '); do
+                if [ "${foreign_host}" != "None" ] ; then
+                    SUBCLOUDLIST_TEMP+=(${foreign_host})
+                fi
+            done
+            # validate the subcloud names
+            for subcloud in "${temp_hostlist[@]}" ; do
+                for temp in "${SUBCLOUDLIST_TEMP[@]}" ; do
+                    found=false
+                    if [ "${temp}" == "${subcloud}" ] ; then
+                        # don't add duplicates
+                        drop=false
+                        for tmp in "${SUBCLOUDLIST[@]}" ; do
+                            if [ "${subcloud}" == "${tmp}" ] ; then
+                                drop=true
+                                break
+                            fi
+                        done
+                        if [ "${drop}" = false ] ; then
+                            SUBCLOUDS=$((SUBCLOUDS+1))
+                            SUBCLOUDLIST+=(${subcloud})
+                            found=true
+                            break
+                        fi
+                    fi
+                done
+                if [ "${found}" = false ] ; then
+                    is_valid_subcloud ${subcloud}
+                    if [ ${?} -eq 0 ] ; then
+                        # don't add duplicates
+                        drop=false
+                        for tmp in "${SUBCLOUDLIST[@]}" ; do
+                            if [ "${subcloud}" == "${tmp}" ] ; then
+                                drop=true
+                                break
+                            fi
+                        done
+                        if [ "${drop}" = false ] ; then
+                            # add this subcloud
+                            SUBCLOUDS=$((SUBCLOUDS+1))
+                            SUBCLOUDLIST+=("${subcloud}")
+                        fi
+                    else
+                        report_error "cannot collect data from unknown subcloud '${subcloud}'" ${WARN_SUBCLOUD}
+                    fi
+                fi
+            done
+        else
+            # validate subclouds one by one through dcmanager
+            for subcloud in "${temp_hostlist[@]}" ; do
+                is_valid_subcloud ${subcloud}
+                if [ ${?} -eq 0 ] ; then
+                    # don't add duplicates
+                    drop=false
+                    for tmp in "${SUBCLOUDLIST[@]}" ; do
+                        if [ "${subcloud}" == "${tmp}" ] ; then
+                            drop=true
+                            break
+                        fi
+                    done
+                    if [ "${drop}" = false ] ; then
+                        # add this subcloud
+                        SUBCLOUDS=$((SUBCLOUDS+1))
+                        SUBCLOUDLIST+=("${subcloud}")
+                    fi
+                else
+                    report_error "cannot collect data from unknown subcloud '${subcloud}'" ${WARN_SUBCLOUD}
+                fi
+            done
+        fi
+    fi
+fi
+
+if [ ! -z ${COLLECT_NAME} ] ; then
+
+    # User specified tarname
+    #
+    # This is the only case for system controller initiated subcloud collect
+    COLLECT_TYPE="user-named"
+
+    # Subcloud collect with a password at this point must be orchestrated
+    # ... with collect date specified by the system controller.
+    if [ "${DCROLE}" == "${DCROLE_SUBCLOUD}" -a "${pw}" != "" ] ; then
+        dlog "date override ${NOWDATE} to ${COLLECT_NAME: -15}"
+        NOWDATE=${COLLECT_NAME: -15}
+        ilog "Orchestrated collect"
+        ORCHESTRATED_COLLECT=true
+    elif [ "${DCROLE}" == "" -a "${ACTIVE}" == false -a "${pw}" != "" ]; then
+        wlog "Subcloud has not been properly configured."
+        ERROR_DCROLE=$(cat /etc/platform/platform.conf | grep distributed_cloud_role | cut -d '=' -f 2)
+        if [ "${ERROR_DCROLE}" = "subcloud" ]; then
+            dlog "date override ${NOWDATE} to ${COLLECT_NAME: -15}"
+            NOWDATE=${COLLECT_NAME: -15}
+            ilog "Orchestrated Collect"
+            ORCHESTRATED_COLLECT=true
+        fi
+    fi
+
+elif [ "${ALLHOSTS}" = true ] ; then
+
+    # All hosts/subclouds bundle
+    if [ "${SUBCLOUD_COLLECT}" = true ] ; then
+        COLLECT_NAME="ALL_SUBCLOUDS"
+        COLLECT_TYPE="all-subclouds"
+    else
+        COLLECT_NAME="ALL_NODES"
+        COLLECT_TYPE="all-nodes"
+    fi
+
+elif [ "${SUBCLOUD_COLLECT}" = false -a ${HOSTS} -eq 1 ] ; then
+
+    # Single host bundle
+    COLLECT_NAME="${HOSTLIST[0]}"
+    COLLECT_TYPE="single-node"
+
+elif [ "${SUBCLOUD_COLLECT}" = true -a ${SUBCLOUDS} -eq 1 ] ; then
+
+    # Single subcloud bundle
+    COLLECT_NAME="${SUBCLOUDLIST[0]}"
+    COLLECT_TYPE="single-subcloud"
+
+else
+
+    # Otherwise it's a multi host bundle
+    if [ "${SUBCLOUD_COLLECT}" = true ] ; then
+        COLLECT_NAME="SELECT_SUBCLOUDS"
+        COLLECT_TYPE="selected-subcloud"
+    else
+        COLLECT_NAME="SELECT_NODES"
+        COLLECT_TYPE="selected-node"
+    fi
+
+fi
+
+if [ "${ORCHESTRATED_COLLECT}" = false ] ; then
+    COLLECT_NAME+="_${NOWDATE}"
+fi
+COLLECT_DIR="${COLLECT_BASE_DIR}/${COLLECT_NAME}"
+TARBALL_NAME="${COLLECT_DIR}.tar"
+
+# learned state debug logs
+if [ "${SUBCLOUD_COLLECT}" = true ] ; then
+    dlog "SUBCLOUDLIST = ${SUBCLOUDS}:${SUBCLOUDLIST[@]}"
+else
+    dlog "HOSTLIST = ${HOSTS}:${HOSTLIST[@]}"
+fi
+if [ "${DCROLE}" != "" ] ; then
+    dlog "DCROLE = ${DCROLE}"
+fi
+dlog "COLLECT_TYPE = ${COLLECT_TYPE}"
+dlog "COLLECT_NAME = ${COLLECT_NAME}"
+dlog "COLLECT_DIR = ${COLLECT_DIR}"
+dlog "TARBALL_NAME = ${TARBALL_NAME}"
+
+############################################################################
+#
+# Password handling
+#
+# If the password is not learned by other means by this time
+# then prompt the user to enter it.
+#
+# The password is used for expect driven requests.
+#
+############################################################################
+# dlog "password coming in is:$pw"
+
+if [ -z "${pw}" ] ; then
+    read -s -p "[sudo] password for ${USER}:" pw
+    echo ""
+fi
+
+# When the pw is used locally for expect requests ...
+# +# Although bash 'read' will handle sanitizing the password +# input for the purposes of storing it in ${pw}, expect +# will need certain special characters to be backslash +# delimited +pw=${pw/\\/\\\\} # replace '\' with '\\' +pw=${pw/\]/\\\]} # replace ']' with '\]' +pw=${pw/\[/\\\[} # replace '[' with '\[' +pw=${pw/$/\\$} # replace '$' with '\$' +pw=${pw/\"/\\\"} # replace '"' with '\"' + + +########################################################################### +# +# Name : check_host_reachable +# +# Purpose : Verify a host is reachable before trying to collect from it +# +# Description: ls the content of the scratch dir +# Parameters : $1 - remote hostname +# $2 - dir or file with full path +# +########################################################################### + +function check_host_reachable() +{ + local hostname=${1} + + if [ "${hostname}" == "${HOSTNAME}" ] ; then + return ${PASS} + fi + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + expect -re $ + set timeout 60 + send "${SSH_CMD} ${UN}@${hostname} cat ${cmd_done_file}\n" + expect { + "assword:" { + expect -re $ + send "${pw}\r" + expect { + "assword:" { send -- "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "No such file or directory" { exit ${FAIL_FILE_NOT_FOUND} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT3} } + } + } + "(yes/no)?" { + send "yes\r" + exp_continue + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve hostname" { + exit ${FAIL_UNREACHABLE} + } + "Network is unreachable" { + exit ${FAIL_UNREACHABLE} + } + "Connection refused" { + exit ${FAIL_UNREACHABLE} + } + "Connection timed out" { + exit ${FAIL_UNREACHABLE} + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + return ${?} +} + + +########################################################################### +# +# Name : clean_scratch_dir_local +# +# Purpose : remove contents of the local /scratch directory +# +# Parameters: $1 - this hostname +# $2 - specified directory (always $COLLECT_BASE_DIR) +# +########################################################################### + +function clean_scratch_dir_local () +{ + local this_hostname=${1} + local directory=${2} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 60 + expect -re $ + send -- "sudo rm -rf ${directory}/*_????????.??????* ; cat ${cmd_done_file}\n" + expect { + "assword:" { send "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION} } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "clean_scratch_dir_local ${this_hostname} failed" ${rc} + fi + return ${rc} +} + +########################################################################### +# +# Name : clean_scratch_dir_remote +# +# Purpose : remove contents of the specified host's /scratch directory +# +# Parameters: $1 - host +# $2 - specified directory (always $COLLECT_BASE_DIR) +# +########################################################################### + +function clean_scratch_dir_remote() +{ + local this_hostname=${1} + local directory=${2} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + expect -re $ + set timeout 60 + send "${SSH_CMD} ${UN}@${this_hostname}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "${this_hostname}" { + 
set timeout 30 + expect -re $ + send "sudo rm -rf ${directory}/*_????????.??????* ; cat ${cmd_done_file}\n" + expect { + "assword:" { send -- "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "${cmd_done_file}: No such file or directory" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT3} } + } + } + timeout { exit ${FAIL_TIMEOUT1} } + } + } + "(yes/no)?" { + send "yes\r" + exp_continue + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve hostname" { + exit ${FAIL_UNREACHABLE} + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to clean ${this_hostname}:${directory}" ${rc} + fi + return ${rc} +} + +########################################################################### +# +# Name : delete_remote_dir_or_file +# +# Purpose : Deletes a remote directory or file +# +# Parameters: $1 - remote hostname +# $2 - dir or file with full path +# $3 - expected login prompt +# $4 - alternative login prompt (optional) +# +########################################################################### + +function delete_remote_dir_or_file() +{ + local remote_hostname=${1} + local dir_or_file=${2} + local login_prompt="${3}" + + # alt_login_prompt is optional. Used when the actual prompt does not + # match the expected login_prompt (as contained in $login_prompt) + local alt_login_prompt="${4}" + + # if ${4} is empty, use $login_prompt instead. + if test -z "${4}"; + then + alt_login_prompt=${login_prompt}; + fi + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + expect -re $ + set timeout 60 + send "${SSH_CMD} ${UN}@${remote_hostname}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + timeout { exit ${FAIL_TIMEOUT1} } + "${login_prompt}" {} + "${alt_login_prompt}" {} + } + set timeout 10 + expect -re $ + send "sudo rm -rf ${dir_or_file} ; cat ${cmd_done_file}\n" + expect { + "assword:" { send -- "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "${cmd_done_file}: No such file or directory" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT3} } + } + } + "(yes/no)?" 
{ + send "yes\r" + exp_continue + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve hostname" { + exit ${FAIL_UNREACHABLE} + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + dlog "delete_remote_dir_or_file parms=${remote_hostname}:${login_prompt}:${dir_or_file}" + report_error "failed to delete ${dir_or_file} on ${remote_hostname} (${login_prompt})" ${rc} + fi + return ${rc} +} + +############################################################################ +# +# Name : get_file_from_host +# +# Purpose : Fetch a file from a remote host +# +# Parameters: $1 - remote hostname +# $2 - remote source path/filename +# $3 - local path destination +# +############################################################################ + +function get_file_from_host() +{ + local remote_hostname=${1} + local remote_file=${2} + local local_dest=${3} + + remove_file_local ${HOST_COLLECT_ERROR_LOG} + + dlog "get_file_from_host: ${UN}@${remote_hostname}:${COLLECT_BASE_DIR}/${remote_file} ${local_dest}" + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout ${SCP_TIMEOUT} + expect -re $ + send "${SCP_CMD} ${UN}@${remote_hostname}:${COLLECT_BASE_DIR}/${remote_file} ${local_dest} 2>>${HOST_COLLECT_ERROR_LOG}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "100%" { exit ${PASS} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT1} } + } + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve hostname" { + exit ${FAIL_UNREACHABLE} + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to get file from ${remote_hostname}" ${rc} + else + # Look for "No space left on device" error + grep -q "${FAIL_OUT_OF_SPACE_STR}" ${HOST_COLLECT_ERROR_LOG} + if [ "$?" 
== "0" ] ; then + remove_file_local "${local_dest}/${remote_file}" + rc=${FAIL_OUT_OF_SPACE} + fi + fi + + remove_file_local ${HOST_COLLECT_ERROR_LOG} + + return ${rc} +} + +############################################################################ +# +# Name : copy_file_to_host +# +# Purpose : Copy a file to a remote host +# +# Parameters: $1 - local path/file +# $2 - remote hostname +# $3 - remote destination directory +# +############################################################################ + +function copy_file_to_host() +{ + local local_path_file_name="${1}" + local remote_hostname="${2}" + local remote_dir="${3}" + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout ${SCP_TIMEOUT} + expect -re $ + send "${SCP_CMD} ${local_path_file_name} ${UN}@${remote_hostname}:${remote_dir} 2>>${HOST_COLLECT_ERROR_LOG}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "100%" { exit ${PASS} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT1} } + } + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve hostname" { + exit ${FAIL_UNREACHABLE} + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "${FAIL_FILE_COPY_STR} ${local_path_file_name} to ${remote_hostname}:${remote_dir}" ${rc} + fi + return ${rc} +} + +########################################################################### +# +# Name : create_collect_dir_local +# +# Purpose : Create the local dated collect dir where all +# the tarballs for this collect will be stored. +# +# Assumptions: Permissions are set to make it easy to copy +# tarballs from the remote host into it +# +# Parameters: $1 - the full dir path +# +########################################################################### + +function create_collect_dir_local() +{ + local dir=${1} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 10 + expect -re $ + send "sudo mkdir -m 775 -p ${dir} ; cat ${cmd_done_file}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "${cmd_done_sig}" { exit ${PASS} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION}} + timeout { exit ${FAIL_TIMEOUT1} } + } + } + "${cmd_done_sig}" { exit ${PASS} } + "${ac_error}" { exit ${FAIL_PERMISSION}} + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to create_collect_dir_local for ${dir}" ${rc} + collect_exit ${rc} + fi + return ${rc} +} + +############################################################################ +# +# Create the local collect dir where the tarball(s) will be temporarily stored +# +# Note: Catches the password error case +# +############################################################################ + +create_collect_dir_local "${COLLECT_DIR}" + +########################################################################## +# +# Name : remove_file_local +# +# Purpose : Delete the specified file using sudo +# +# Parameters: $1 - the file to be deleted, with full path specified +# +########################################################################### + +function remove_file_local() +{ + local local_file=${1} + local rc=${PASS} + + if [ -e ${local_file} ] ; then + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 10 + expect -re $ + send -- "sudo rm -f ${local_file} ; cat ${cmd_done_file}\n" + expect { + "assword:" { send -- "${pw}\r" ;
exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION} } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to remove_file_local ${local_file}" ${rc} + fi + fi + return ${rc} +} + +########################################################################## +# +# Name : remove_dir_local +# +# Purpose : Delete the specified directory using sudo +# +# Parameters: $1 - the directory to be removed with full path specified +# +########################################################################### + +function remove_dir_local() +{ + local dir=${1} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 10 + expect -re $ + send -- "sudo rm -rf ${dir} ; cat ${cmd_done_file}\n" + expect { + "assword:" { send -- "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION} } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to remove_dir_local ${dir}" ${rc} + dlog "remove_dir_local failed: ${dir}" + fi + return ${rc} +} + +########################################################################### +# +# Name : move_file_local +# +# Purpose : Move a file and change permissions using sudo +# +# Parameters: $1 - src path/file +# $2 - dest path/file +# +########################################################################### + +function move_file_local() +{ + local src=${1} + local dst=${2} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 10 + expect -re $ + send -- "sudo mv ${src} ${dst} ; cat ${cmd_done_file}\n" + expect { + "assword:" { send -- "${pw}\r" ; exp_continue } + "${cmd_done_sig}" { exit ${PASS} } + "annot remove" { exit ${FAIL_CLEANUP} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION} } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to move_file_local ${src} to ${dst}" ${rc} + fi + return ${rc} +} + + +########################################################################### + + +########################################################################### +function scratch_full() +{ + avail=$(df -k ${COLLECT_BASE_DIR} | grep -v Available | awk '{ print $4 }') + if [ ${avail} -lt ${COLLECT_BASE_DIR_FULL_THRESHOLD} ] ; then + return ${FAIL} + else + return ${PASS} + fi +} + +########################################################################### +# +# Name : echo_stats +# +# Purpose : print collect data and/or stats +# +# Description: Append collect stats to the echoed collect done message. +# Produce a user log that duplicates the console output +# in both parallel and inline collect modes. +# +# Parameters : $1 - seconds +# $2 - label for control flow +# $3 - path/file name to get the size of +# +########################################################################## + +function echo_stats() +{ + local secs=${1} + local label="${2}" + local file="${3}" + local MSG="" + local stats="" + + MSG="collected " + len=${#label} + + for ((i=len;i/dev/null) + if [ $?
-eq 0 ] ; then + if [ "${label}" == "stats-only" ] ; then + printf "%s %5s %3s)\n" "${stats}" "${size}" "${avail}" + log "${MSG} $stats ${size} ${avail})" + else + if [ "${PARALLEL_COLLECT_MODE}" = true ] ; then + printf "%s %s %5s %3s)\n" "${MSG}" "${stats}" "${size}" "${avail}" + else + printf "%s %5s %3s)\n" "${stats}" "${size}" "${avail}" + fi + log "${MSG} $stats ${size} ${avail})" + fi + return + fi + fi + printf "stats error)\n" +} + +############################################################################ +# +# Name : collect_host_run +# +# Purpose : Run collect host in selected mode +# +# Description: Run collect_host as a background task for each host if +# parallel option is specified. Otherwise, run collect in +# forground (legacy mode) for each host one after the other. +# +############################################################################ + +function collect_host_run() +{ + local host="${1}" + local rc=${PASS} + + if [ "${PARALLEL_COLLECT_MODE}" = false ] ; then + local MSG="collecting" + # line up the host names + len=${#host} + for ((i=len;i ${TEMPFILE} + copy_file_to_host "${TEMPFILE}" "${subcloud}" "/tmp" + rc=${?} + remove_file_local ${TEMPFILE} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to copy '${TEMPFILE}' to ${subcloud}/tmp" ${FAIL_FILE_COPY} + collect_exit ${FAIL_FILE_COPY} + fi + + # tell the remote subcloud the name of the password file + collect_cmd+=("-f ${TEMPFILE}") + + # Save current user log level + local save=${USER_LOG_MODE} + if [ "${VERBOSE}" = true ] ; then + USER_LOG_MODE=1 + fi + + # echo "Subcloud Collect: ${subcloud} ${collect_cmd[@]}" +/usr/bin/expect << EOF + trap exit {SIGINT SIGTERM} + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 30 + expect -re $ + send "${SSH_CMD} ${UN}@${subcloud}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT3} } + "${SUBCLOUD_LOGIN_PROMPT}" {} + "${subcloud}:" {} + } + set timeout ${TIMEOUT} + send "${collect} ${collect_cmd[@]}\n" + expect { + "${collect_done}" { + send "exit\r" + exit ${PASS} + } + "${FAIL_INSUFFICIENT_SPACE_STR}" { + send "exit\r" + exit ${FAIL_INSUFFICIENT_SPACE} + } + "${FAIL_OUT_OF_SPACE_STR}" { + send "exit\r" + exit ${FAIL_OUT_OF_SPACE} + } + "${FAIL_PASSWORD_PROMPT_STR}" { + send "exit\r" + exit ${FAIL_PASSWORD_PROMPT} + } + "${COLLECT_ERROR} ${FAIL_FILE_EMPTY_STR}" { + send "exit\r" + exit ${FAIL_FILE_EMPTY} + } + "${COLLECT_ERROR} ${FAIL_FILE_NOT_FOUND_STR}" { + send "exit\r" + exit ${FAIL_FILE_NOT_FOUND} + } + "${COLLECT_ERROR} ${FAIL_DATE_FORMAT_STR}" { + send "exit\r" + exit ${FAIL_DATE_FORMAT} + } + "${COLLECT_ERROR} ${FAIL_INACTIVE_STR}" { + send "exit\r" + exit ${FAIL_INACTIVE} + } + "${COLLECT_ERROR} ${FAIL_NO_HOSTS_STR}" { + send "exit\r" + exit ${FAIL_NO_HOSTS} + } + "${COLLECT_ERROR} ${FAIL_NO_SUBCLOUDS_STR}" { + send "exit\r" + exit ${FAIL_NO_SUBCLOUDS} + } + "${COLLECT_ERROR} ${FAIL_MISSING_PARAMETER_STR}" { + send "exit\r" + exit ${FAIL_MISSING_PARAMETER} + } + "${COLLECT_ERROR} ${FAIL_NO_FILE_SPECIFIED_STR}" { + send "exit\r" + exit ${FAIL_NO_FILE_SPECIFIED} + } + "${COLLECT_ERROR} ${FAIL_SUBCLOUD_TIMEOUT_STR}" { + send "exit\r" + exit ${FAIL_SUBCLOUD_TIMEOUT} + } + "${COLLECT_ERROR}" { + send "exit\r" + exit ${FAIL} + } + timeout { exit ${FAIL_TIMEOUT5} } + } + } + "(yes/no)?" 
{ + send "yes\r" + exp_continue + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve" { + exit ${FAIL_UNREACHABLE} + } + "Host key verification failed" { + send "rm -f /home/${UN}/.ssh/known_hosts\n" + exit ${FAIL} + } + timeout { exit ${FAIL_TIMEOUT} } + } + exit { $FAIL } +EOF + rc=${?} + USER_LOG_MODE=${save} + return ${rc} +} + +############################################################################ +# +# Name : collect_host_complete_local +# +# Description: Perform collect host complete operations for a +# local collect host. +# +# 1. Get the tarball +# 2. Handle errors +# - report +# - cleanup +# +############################################################################ + +function collect_host_complete_local() +{ + local tarname="${1}" + local rc=${PASS} + + # create the dir again just to handle the case where we are + # collecting on ourselves and have removed the collect_dir + # directory in collect_host above. + create_collect_dir_local "${COLLECT_DIR}" + + # move the tarball into the collect dir + # only applies to the local collect since the remote + # collect scp's it directly into the collect dir. + move_file_local "${COLLECT_BASE_DIR}/${tarname}.tgz" "${COLLECT_DIR}" + rc=${?} + if [ ${rc} -eq ${PASS} ] ; then + log "collect ${COLLECT_BASE_DIR}/${tarname}.tgz succeeded" + else + if [ ${rc} -eq ${FAIL_INSUFFICIENT_SPACE} ] ; then + + report_error "${FAIL_INSUFFICIENT_SPACE_STR}" ${rc} + + echo "" + wlog "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation." + echo "" + + remove_dir_local ${COLLECT_DIR} + + collect_exit ${FAIL_INSUFFICIENT_SPACE} + + elif [ ${rc} -eq ${FAIL_OUT_OF_SPACE} ] ; then + + report_error "${FAIL_OUT_OF_SPACE_STR}" ${rc} + + echo "" + wlog "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation." + echo "" + + # Remove the corrupt file and exit + remove_file_local ${COLLECT_ERROR_LOG} + remove_file_local ${COLLECT_BASE_DIR}/${tarname}.tgz + remove_dir_local ${COLLECT_BASE_DIR}/${tarname} + remove_dir_local ${COLLECT_BASE_DIR}/${COLLECT_NAME} + + collect_exit ${FAIL_OUT_OF_SPACE} + + else + report_error "failed to collect from ${HOSTNAME} [host complete]" ${rc} + dlog "collect_host_complete_local failure: ${COLLECT_DIR}:${tarname}:${rc}" + fi + fi + return ${rc} +} + +############################################################################ +# +# Name : collect_host_complete_remote +# +# Description: Perform collect host complete operations for a +# remote host collect. +# +# 1. Fetch the tarball +# 2. Remove tarball from remote host +# 3. Handle errors +# - report +# - cleanup +# +############################################################################ + +function collect_host_complete_remote () +{ + local host="${1}" + local tarname="${2}" + + if [ "${SUBCLOUD_COLLECT}" == true ] ; then + SUFFIX="tar" + else + SUFFIX="tgz" + fi + get_file_from_host "${host}" "${tarname}.${SUFFIX}" "${COLLECT_DIR}" + + local rc=${?} + if [ ${rc} -eq ${PASS} ] ; then + if [ "${SUBCLOUD_COLLECT}" == true ] ; then + # login to subclouds does not show the subcloud name + # in the login prompt. It will always be one of the controllers + # so set login prompt to SUBCLOUD_LOGIN_PROMPT + delete_remote_dir_or_file "${host}" "${COLLECT_BASE_DIR}/${tarname}*" "${SUBCLOUD_LOGIN_PROMPT}" "${host}:" + else + # hosts always log in as the host name, use that hostname as login prompt + delete_remote_dir_or_file "${host}" "${COLLECT_BASE_DIR}/${tarname}*" "${host}" + fi + rc=$?
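For orientation, a minimal sketch (not part of the patch) of what this completion step amounts to for a subcloud; the login user name, the date stamp, and the /scratch value of COLLECT_BASE_DIR below are illustrative assumptions, not values taken from this script:

    # illustrative values only
    UN="sysadmin"                           # assumed collect login user
    tarname="subcloud1_20220418.120000"     # <target>_<NOWDATE> pattern
    # fetch: subcloud collects arrive as .tar, host collects as .tgz,
    # roughly equivalent to:
    #   scp ${UN}@subcloud1:/scratch/${tarname}.tar ${COLLECT_DIR}
    # cleanup: once the fetch succeeds, the remote copy is removed:
    #   ssh ${UN}@subcloud1 sudo rm -rf /scratch/${tarname}*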
+ if [ ${rc} -eq ${PASS} ] ; then + log "collect ${COLLECT_BASE_DIR}/${tarname}.${SUFFIX} succeeded" + else + log "collect ${COLLECT_BASE_DIR}/${tarname}.${SUFFIX} succeeded but failed to cleanup" + rc=${PASS} + fi + else + report_error "failed to collect from ${host} [get file]" ${rc} + dlog "get_file_from_host failure: ${host}:${tarname}.${SUFFIX}:${COLLECT_DIR}" + fi + return ${rc} +} + +############################################################################ +# +# Parallel Collect Support +# +# collect_host_run - run collect_host as a background task +# collect_host_monitor - monitor for collect_host background task status +# collect_host_done - mark collect_host done with status +# collect_host_stats - print collect host stats +# +# collect_host_complete_local - local collect complete operations +# collect_host_complete_remote - remote collect complete operations +# +# collect_host_ctrl_list_index_print - print collect host control list@index +# +# collect_host_ctrl_list is a structured host list used to track the state of +# collect_host run as a background task for each host. +# +# Structure members: +# +# hostname - the name of the host being collected +# stage - the collect stage for this host ; RUN, MON, DONE +# pid - the pid of the background'ed collect host process +# seconds - the time in seconds of when the collect started +# status - the exit status of the remote collect 0..255 +# name - the full path and name of the remote collected tarball +# +############################################################################ +declare collect_host_ctrl_list=() + +# The following index constants are used to access each field. +declare -r INDEX_HOST=0 +declare -r INDEX_STAGE=1 +declare -r INDEX_PID=2 +declare -r INDEX_SECONDS=3 +declare -r INDEX_STATUS=4 +declare -r INDEX_TARBALL=5 + +# The stages each launched collect_host goes through +declare -r STAGE_RUN="run" +declare -r STAGE_MON="monitor" +declare -r STAGE_DONE="done" + +# declare -r INVALID_PID=-1 + +########################################################################### +# +# Name : collect_host_monitor +# +# Purpose : Transition host into the monitor stage +# +############################################################################ + +function collect_host_monitor() +{ + local index=${1} + + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + TARGETS=${SUBCLOUDS} + else + TARGETS=${HOSTS} + fi + if [ ${index} -lt ${TARGETS} ] ; then + TARGET=${collect_host_ctrl_list[${index}]} + info=(${TARGET//:/ }) + + # Update collect host control structure for this host with + # + # collect_host_ctrl_list[index].stage = MONitor + # + collect_host_ctrl_list[${index}]="${info[${INDEX_HOST}]}:\ + ${STAGE_MON}:\ + ${info[${INDEX_PID}]}:\ + ${info[${INDEX_SECONDS}]}:\ + ${info[${INDEX_STATUS}]}:\ + ${info[${INDEX_TARBALL}]}" + collect_host_ctrl_list_index_print ${index} + else + elog "collect_host_monitor ; invalid index:${index} ; must be smaller than ${TARGETS}" + collect_exit ${FAIL_INTERNAL} + fi +} + +########################################################################### +# +# Name : collect_host_done +# +# Purpose : mark a host collect as done +# +############################################################################ + +function collect_host_done() +{ + local index=${1} + local status=${2} + + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + TARGETS=${SUBCLOUDS} + else + TARGETS=${HOSTS} + fi + if [ ${index} -lt ${TARGETS} ] ; then + TARGET=${collect_host_ctrl_list[${index}]} + info=(${TARGET//:/ }) + + # update struct
for this pid/process with + # + # collect_host_ctrl_list[index].stage = DONE + # collect_host_ctrl_list[index].seconds = script run time + # collect_host_ctrl_list[index].status = status + HOST_START_TIME=${info[${INDEX_SECONDS}]} + collect_host_ctrl_list[${index}]="${info[${INDEX_HOST}]}:\ + ${STAGE_DONE}:\ + ${info[${INDEX_PID}]}:\ + $((SECONDS-HOST_START_TIME)):\ + ${status}:\ + ${info[${INDEX_TARBALL}]}" + collect_host_ctrl_list_index_print ${index} + else + elog "collect_host_done ; invalid index:${index} ; must be smaller than ${TARGETS}" + collect_exit ${FAIL_INTERNAL} + fi +} + +########################################################################### +# +# Name : collect_host_stats +# +# Purpose : call echo stats for specified collect_host_ctrl_list index +# +############################################################################ + +function collect_host_stats() +{ + local index=${1} + + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + SUFFIX="tar" + else + SUFFIX="tgz" + fi + + if [[ "${PARALLEL_COLLECT_MODE}" = true && ${DONE_COUNT} -eq 0 ]] ; then + # send new line to delineate '.' progress + echo "" + PLEASE_STANDBY=false + fi + + HOST=${collect_host_ctrl_list[${index}]} + info=(${HOST//:/ }) + echo_stats "${info[${INDEX_SECONDS}]}" \ + "${info[${INDEX_TARBALL}]}" \ + "${COLLECT_DIR}/${info[${INDEX_TARBALL}]}.${SUFFIX}" +} + +########################################################################### +# +# Name : collect_host_ctrl_list_index_print +# +# Purpose : debug +# +# Description: print the structure for a specified index +# +############################################################################ + +collect_host_ctrl_list_index_print() +{ + local index=${1} + + if [ "${DEBUG}" = true ] ; then + HOST=${collect_host_ctrl_list[${index}]} + info=(${HOST//:/ }) + printf "%s Debug: %-12s %7s [%6s] | Secs:%3s | %3s | %s\n" \ + "$(date)" \ + "${info[${INDEX_HOST}]}" \ + "${info[${INDEX_STAGE}]}" \ + "${info[${INDEX_PID}]}" \ + "${info[${INDEX_SECONDS}]}" \ + "${info[${INDEX_STATUS}]}" \ + "${info[${INDEX_TARBALL}]}" + dlog "${info[${INDEX_HOST}]} ${info[${INDEX_STAGE}]} [${info[${INDEX_PID}]}] | Secs:${info[${INDEX_SECONDS}]} | ${info[${INDEX_STATUS}]} | ${info[${INDEX_TARBALL}]}" + fi +} + +############################################################################ +# +# Name : collect_host_clean +# +# Purpose : Clean collect content in /scratch on specified host +# +# Parameters: $1 - hostname +# +############################################################################ + +function collect_host_clean() +{ + local host="${1}" + local rc=${FAIL} + + if [ "${host}" == "None" -o "${host}" == "" ] ; then + report_error "invalid host (${host}) passed to collect_host_clean" ${FAIL_HOSTNAME} + return + fi + + echo -n "cleaning ${host}:${COLLECT_BASE_DIR} ... 
" + if [ "${host}" == "${HOSTNAME}" ] ; then + clean_scratch_dir_local ${host} ${COLLECT_BASE_DIR} + rc=${?} + else + clean_scratch_dir_remote ${host} ${COLLECT_BASE_DIR} + rc=${?} + fi + if [ ${rc} -eq ${PASS} ] ; then + echo "done" + log "user cleaned ${host}:${COLLECT_BASE_DIR} content" + fi +} + +############################################################################ +# +# Name : collect_subcloud_clean +# +# Purpose : Clean collect content in /scratch on specified subcloud +# +# Parameters: $1 - subcloud +# +############################################################################ + +function collect_subcloud_clean() +{ + local subcloud="${1}" + + check_host_reachable "${subcloud}" + if [ ${?} -ne ${PASS} ] ; then + report_error "cannot clean ${subcloud}" ${FAIL_UNREACHABLE} + return ${FAIL_UNREACHABLE} + fi + + echo -n "cleaning subcloud $subcloud:${COLLECT_BASE_DIR} ... " + + # Save current user log level + local save=${USER_LOG_MODE} + if [ "${VERBOSE}" = true ] ; then + USER_LOG_MODE=1 + fi + + # build the command + collect_cmd=("--clean --all --name ${subcloud}") + + # copy the pw file to the subcloud and then cleanup + TEMPFILE=$(mktemp) + echo "${pw}" > ${TEMPFILE} + copy_file_to_host "${TEMPFILE}" "${subcloud}" "/tmp" + rc=${?} + remove_file_local ${TEMPFILE} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to copy '${TEMPFILE}' to ${subcloud}/tmp" ${FAIL_FILE_COPY} + collect_exit ${FAIL_FILE_COPY} + fi + collect_cmd+=("-f ${TEMPFILE}") + + if [ "${DEBUG}" = true ] ; then + collect_cmd+=("-d") + fi + if [ "${VERBOSE}" = true ] ; then + collect_cmd+=("-v") + fi + + # echo "Subcloud Collect Clean: ${subcloud} ${collect_cmd[@]}" + +/usr/bin/expect << EOF + trap exit {SIGINT SIGTERM} + log_user ${USER_LOG_MODE} + spawn bash -i + set timeout 30 + expect -re $ + send "${SSH_CMD} ${UN}@${subcloud}\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION_SKIP}} + timeout { exit ${FAIL_TIMEOUT3} } + "${SUBCLOUD_LOGIN_PROMPT}" {} + "${subcloud}:" {} + } + send "${collect} ${collect_cmd[@]}\n" + expect { + "${collect_done}" { + send "exit\r" + exit ${PASS} + } + "${COLLECT_ERROR} ${FAIL_INACTIVE_STR}" { + send "exit\r" + exit ${FAIL_INACTIVE} + } + "${COLLECT_ERROR} ${FAIL_NO_HOSTS_STR}" { + send "exit\r" + exit ${FAIL_NO_HOSTS} + } + "${COLLECT_ERROR} ${FAIL_MISSING_PARAMETER_STR}" { + send "exit\r" + exit ${FAIL_MISSING_PARAMETER} + } + "${COLLECT_ERROR} ${FAIL_NO_FILE_SPECIFIED_STR}" { + send "exit\r" + exit ${FAIL_NO_FILE_SPECIFIED} + } + "${COLLECT_ERROR}" { + send "exit\r" + exit ${FAIL} + } + timeout { + exit ${FAIL_TIMEOUT5} + } + } + } + "(yes/no)?" 
{ + send "yes\r" + exp_continue + } + "No route to host" { + exit ${FAIL_UNREACHABLE} + } + "Could not resolve" { + exit ${FAIL_UNREACHABLE} + } + "Host key verification failed" { + send "rm -f /home/${UN}/.ssh/known_hosts\n" + exit ${FAIL} + } + timeout { exit ${FAIL_TIMEOUT} } + } + exit { $FAIL } +EOF + rc=${?} + if [ ${rc} -eq ${PASS} ] ; then + log "clean of ${subcloud} hosts successful" + echo "done" + else + echo "failed to clean ${subcloud} rc:${rc}" + fi + + USER_LOG_MODE=${save} + return ${rc} +} + +############################################################################ +# +# Handle clean command option +# +############################################################################ + +if [ "${CLEAN}" = true ] ; then + + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + if [ ${SUBCLOUDS} -eq 0 ] ; then + report_error "no valid subclouds to clean" ${FAIL_NO_HOSTS} + collect_exit ${FAIL_NO_HOSTS} + fi + dlog "cleaning scratch space on ${SUBCLOUDLIST[@]}" + for subcloud in "${SUBCLOUDLIST[@]}" ; do + collect_subcloud_clean "${subcloud}" + done + else + if [ ${HOSTS} -eq 0 ] ; then + report_error "no valid hosts to clean" ${FAIL_NO_HOSTS} + collect_exit ${FAIL_NO_HOSTS} + fi + dlog "cleaning scratch space on ${HOSTLIST[@]}" + for host in "${HOSTLIST[@]}" ; do + collect_host_clean "$host" + done + if [ "${ORCHESTRATED_COLLECT}" = true ] ; then + echo "${collect_done}" + fi + fi + collect_exit ${PASS} +fi + +############################################################################ +# +# Handle collect +# +############################################################################ + +declare COLLECT_START_TIME=${SECONDS} + +if [ "${SUBCLOUD_COLLECT}" = true ] ; then + for subcloud in "${SUBCLOUDLIST[@]}" ; do + len=${#subcloud} + if [ $len -gt ${longest_name} ] ; then + longest_name=$len + fi + done +else + for host in "${HOSTLIST[@]}" ; do + len=${#host} + if [ $len -gt ${longest_name} ] ; then + longest_name=$len + fi + done +fi + +############################################################################ +# +# Name : collect_hosts +# +# Purpose : Run collect for all hosts in HOSTLIST +# +# Description: Loop over all the targeted hosts and +# +# 1. run collect_host +# +# if PARALLEL = true - Collect all hosts in parallel (all at once). +# i.e. launch one background task per host. +# Default behavior. +# +# if PARALLEL = false - Collect all hosts inline, one after the other. +# i.e. run collect for each host one after the other. +# Specify the -in or --inline command line option. +# +# 2.
copy the tarball to $COLLECT_DIR +# +############################################################################ + +function collect_hosts() +{ + dlog "collect_hosts: [${HOSTS}] ${HOSTLIST[@]}" + let NEXT_PROGRESS_TIME=${SECONDS}+${PROGRESS_INTERVAL} + + for host in "${HOSTLIST[@]}" ; do + if [ "${host}" != " " ] ; then + + if [ "${host}" == "None" ] ; then + continue + elif [ "${host}" == "" ] ; then + continue + fi + + check_host_reachable "${host}" + if [ ${?} -ne ${PASS} ] ; then + report_error "cannot collect from ${host}" ${FAIL_UNREACHABLE} + continue + fi + + HOST_START_TIME=${SECONDS} + TARNAME="${host}_${NOWDATE}" + + if [ "${PARALLEL_COLLECT_MODE}" = true ] ; then + + # run collect_host in the background + (collect_host_run "${host}" "${TARNAME}")& + + # save the child process's pid + CHILD_PID=${!} + + ################################################################# + # + # Add this collect_host's background child process info + # to the collect_host_ctrl_list + # + # collect_host_ctrl_list[index].hostname = host + # collect_host_ctrl_list[index].stage = RUN + # collect_host_ctrl_list[index].pid = child process pid + # collect_host_ctrl_list[index].seconds = script time in secs + # collect_host_ctrl_list[index].status = default to FAIL + # collect_host_ctrl_list[index].tarball = host's tarball name + # + ################################################################# + collect_host_ctrl_list[${index}]="${host}:\ + ${STAGE_RUN}:\ + ${CHILD_PID}:\ + ${SECONDS}:\ + ${FAIL}:\ + ${TARNAME}" + collect_host_ctrl_list_index_print ${index} + index=$((index+1)) + + else + + collect_host_run "${host}" "${TARNAME}" + rc=${?} + if [ ${rc} -eq ${PASS} ] ; then + + if [ "${host}" == "${HOSTNAME}" ] ; then + collect_host_complete_local "${TARNAME}" + else + collect_host_complete_remote "${host}" "${TARNAME}" + fi + rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + # handle copy error here + report_error "failed to collect from ${host} [host file get]" ${rc} + else + secs=$((SECONDS-HOST_START_TIME)) + echo -n "done" + echo_stats $secs "${TARNAME}" "${COLLECT_DIR}/${TARNAME}.tgz" + fi + elif [ ${rc} -ge ${FAIL_TIMEOUT} -a ${rc} -le ${FAIL_TIMEOUT9} -a "${DCROLE}" == "${DCROLE_SUBCLOUD}" ] ; then + report_error "failed to collect from ${host} [subcloud host run timeout]" ${FAIL_SUBCLOUD_TIMEOUT} + else + report_error "failed to collect from ${host} [host]" ${rc} + fi + fi + fi + done + + ############################################# + # + # Parallel Collect Mode + # + ############################################# + monitoring=false + if [ "${PARALLEL_COLLECT_MODE}" = true ] ; then + + echo -n "monitoring host collect ; please standby " + PLEASE_STANDBY=true + + # All hosts collected overall timeout + while [ ${UNTIL} -ge ${SECONDS} ] ; do + index=0 + monitoring=false + for HOST in "${collect_host_ctrl_list[@]}" ; do + info=(${HOST//:/ }) + # collect_host_ctrl_list_index_print ${index} + if [ "${info[${INDEX_STAGE}]}" == "${STAGE_MON}" ] ; then + + # check to see if this collect_host process is done collecting + kill -0 "${info[${INDEX_PID}]}" 2>/dev/null + rc=${?} + if [ ${rc} -ne 0 ] ; then + + # the process is done ; get its exit code + wait "${info[${INDEX_PID}]}" + rc=${?} + if [ ${rc} == ${PASS} ] ; then + + # if it passed then fetch that host's tarball + if [ "${info[${INDEX_HOST}]}" == "${HOSTNAME}" ] ; then + collect_host_complete_local "${info[${INDEX_TARBALL}]}" + else + collect_host_complete_remote "${info[${INDEX_HOST}]}" \ + "${info[${INDEX_TARBALL}]}" + fi + rc=${?} + collect_host_done
${index} ${rc} + if [ ${rc} -eq ${PASS} ] ; then + collect_host_stats ${index} ${rc} + fi + DONE_COUNT=$((DONE_COUNT+1)) + else + collect_host_done ${index} ${rc} + report_error "failed to collect from ${info[${INDEX_HOST}]} [target]" ${rc} + fi + else + if [ ${DONE_COUNT} -eq 0 ] ; then + if [ ${SECONDS} -gt ${NEXT_PROGRESS_TIME} ] ; then + echo -n "." + let NEXT_PROGRESS_TIME=${SECONDS}+${PROGRESS_INTERVAL} + fi + fi + + monitoring=true + fi + + elif [ "${info[${INDEX_STAGE}]}" == "${STAGE_RUN}" ] ; then + monitoring=true + # update stage to Monitor + collect_host_monitor ${index} + fi + index=$((index+1)) + done + + if [ "${monitoring}" = false ] ; then + ilog "collected from ${DONE_COUNT} hosts" + break + fi + done + fi + + # Report that the overall collect timed-out + if [ "$monitoring" = true ]; then + # there may be partial collect worth keeping + report_error "collect operation timeout after $TIMEOUT secs" ${FAIL_TIMEOUT} + fi +} + +############################################################################ +# +# Name : collect_subclouds +# +# Purpose : Run collect for all subclouds in SUBCLOUDLIST +# +# Description: Loop over all the specified subclouds and +# +# 1. run collect_subcloud_run +# +# if PARALLEL = true - Collect all subclouds in parallel (all at once). +# i.e. launch one background task per subcloud. +# All hosts in subcloud also collected in parallel +# Default behavior. +# +# if PARALLEL = false - Collect all subclouds inline, one after the other. +# i.e. run collect for each subcloud one after the other. +# All hosts in subcloud also collected inline +# Specify the -in or --inline command line option. +# +# 2. copy the tarball to $COLLECT_DIR +# +############################################################################ + +declare -i PROGRESS_INTERVAL=15 # seconds +collect_subclouds() +{ + dlog "collect_subclouds: [${SUBCLOUDS}] ${SUBCLOUDLIST[@]}" + let NEXT_PROGRESS_TIME=${SECONDS}+${PROGRESS_INTERVAL} + + local -a DONE_LIST=() + for subcloud in "${SUBCLOUDLIST[@]}" ; do + if [ "${subcloud}" != " " ] ; then + + if [ "${subcloud}" == "None" ] ; then + continue + elif [ "${subcloud}" == "" ] ; then + continue + fi + + check_host_reachable "${subcloud}" + if [ ${?} -ne ${PASS} ] ; then + report_error "cannot collect from ${subcloud}" ${FAIL_UNREACHABLE} + continue + fi + + SUBCLOUD_START_TIME=${SECONDS} + if [ "${PARALLEL_COLLECT_MODE}" = true ] ; then + + # Collect subclouds in parallel mode + + # run collect_subcloud_run in the background + (collect_subcloud_run "${subcloud}" "${subcloud}_${NOWDATE}")& + + # save the child process's pid + CHILD_PID=${!} + + ################################################################# + # + # Add this collect_subcloud_run's background child process info + # to the collect_host_ctrl_list + # + # collect_host_ctrl_list[index].hostname = subcloud + # collect_host_ctrl_list[index].stage = RUN + # collect_host_ctrl_list[index].pid = child process pid + # collect_host_ctrl_list[index].seconds = script time in secs + # collect_host_ctrl_list[index].status = default to FAIL + # collect_host_ctrl_list[index].tarball = subcloud's tarball name + # + ################################################################# + collect_host_ctrl_list[${index}]="${subcloud}:\ + ${STAGE_RUN}:\ + ${CHILD_PID}:\ + ${SECONDS}:\ + ${FAIL}:\ + ${subcloud}_${NOWDATE}" + collect_host_ctrl_list_index_print ${index} + index=$((index+1)) + + else + + # Run collect subclouds one after the other (legacy) mode.
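Since the colon-delimited control-list records used in the parallel branch above are easy to misread, here is a small self-contained illustration (all values invented) of how one entry is packed and later unpacked with the INDEX_* constants:

    # pack: six ':' separated fields, hypothetical values
    record="subcloud1:run:12345:42:1:subcloud1_20220418.120000"
    # unpack exactly as the monitor loop does
    info=(${record//:/ })    # replace ':' with ' ' and word-split
    echo "${info[0]}"        # INDEX_HOST     -> subcloud1
    echo "${info[1]}"        # INDEX_STAGE    -> run
    echo "${info[2]}"        # INDEX_PID      -> 12345
    echo "${info[3]}"        # INDEX_SECONDS  -> 42
    echo "${info[4]}"        # INDEX_STATUS   -> 1 (FAIL default; exact value assumed)
    echo "${info[5]}"        # INDEX_TARBALL  -> subcloud1_20220418.120000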
+ + # make the collected filename be the subcloud name it was + # collected from with the date of this overall collect. + collect_subcloud_run "${subcloud}" "${subcloud}_${NOWDATE}" + rc=${?} + if [ ${rc} -eq ${PASS} ] ; then + + collect_host_complete_remote "${subcloud}" "${subcloud}_${NOWDATE}" + rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + # handle copy error here + report_error "failed to collect from ${subcloud} [subcloud get]" ${rc} + else + secs=$((SECONDS-SUBCLOUD_START_TIME)) + echo -n "done" + if [ "${SUBCLOUD_COLLECT}" = true ] ; then + SUFFIX="tar" + else + SUFFIX="tgz" + fi + echo_stats $secs "${COLLECT_NAME}" "${COLLECT_DIR}/${subcloud}_${NOWDATE}.${SUFFIX}" + fi + else + report_error "failed to collect from ${subcloud} [subcloud run]" ${rc} + fi + DONE_COUNT=$((DONE_COUNT+1)) + DONE_LIST+=(${subcloud}) + + ################################################# + # Check available space and stop collecting + # if the scratch_full threshold is reached + ################################################# + if [ ${DONE_COUNT} -lt ${SUBCLOUDS} ] ; then + scratch_full + if [ ${?} -eq ${FAIL} ] ; then + wlog "unable to collect more subclouds ; ${COLLECT_BASE_DIR} is almost full ; suspending subcloud collect" + + TODO_LIST=() + for sc in "${SUBCLOUDLIST[@]}" ; do + local found=false + for done_sc in "${DONE_LIST[@]}" ; do + if [ "${done_sc}" == "${sc}" ] ; then + found=true + break + fi + done + if [ "${found}" = false ] ; then + TODO_LIST+=($sc) + fi + done + if [ ${#TODO_LIST[@]} -ne 0 ] ; then + log "the following ${#TODO_LIST[@]} subclouds were not collected: ${TODO_LIST[@]}" + echo "${TODO_LIST[@]}" > ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE} + COLLECT_CONTINUE_MSG_NEEDED=true + fi + monitoring=false + break + fi + fi + fi + fi + done + + ############################################# + # + # Parallel Collect Mode - Monitoring + # + ############################################# + monitoring=false + + if [ "${PARALLEL_COLLECT_MODE}" = true ] ; then + + echo -n "monitoring subcloud collect ; please standby " + PLEASE_STANDBY=true + + # All hosts collected overall timeout + while [ ${UNTIL} -ge ${SECONDS} ] ; do + index=0 + monitoring=false + for subcloud in "${collect_host_ctrl_list[@]}" ; do + info=(${subcloud//:/ }) + + # collect_host_ctrl_list_index_print ${index} + if [ "${info[${INDEX_STAGE}]}" == "${STAGE_MON}" ] ; then + + # check to see if this collect_host process is done collecting + kill -0 "${info[${INDEX_PID}]}" 2>/dev/null + rc=${?} + if [ ${rc} -ne 0 ] ; then + + # the process is done ; get its exit code + wait "${info[${INDEX_PID}]}" + rc=${?} + if [ ${rc} == ${PASS} ] ; then + + # if it passed then fetch that host's tarball + if [ "${info[${INDEX_HOST}]}" == "${HOSTNAME}" ] ; then + collect_host_complete_local "${info[${INDEX_TARBALL}]}" + else + collect_host_complete_remote "${info[${INDEX_HOST}]}" \ + "${info[${INDEX_TARBALL}]}" + fi + rc=${?} + collect_host_done ${index} ${rc} + if [ ${rc} -eq ${PASS} ] ; then + collect_host_stats ${index} ${rc} + fi + DONE_COUNT=$((DONE_COUNT+1)) + + ################################################# + # Check available space and stop collecting + # if the scratch_full threshold is reached + ################################################# + if [ ${DONE_COUNT} -lt ${SUBCLOUDS} ] ; then + scratch_full + if [ ${?} -eq ${FAIL} ] ; then + wlog "unable to collect more subclouds ; ${COLLECT_BASE_DIR} is almost full ; suspending subcloud collect" + + # search for subclouds in the MONitoring state + # and add them to the TODO_LIST + TODO_LIST=()
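To make the suspend-and-continue bookkeeping concrete, a hedged sketch follows; the list file path is a hypothetical stand-in for ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}, and the --continue usage is the one shown near the end of this script:

    # subclouds still pending when scratch fills up are written to a
    # continue list so a later run can pick them up
    SUBCLOUD_COLLECT_CONTINUE_LIST_FILE="/tmp/continue.list"  # hypothetical path
    TODO_LIST=("subcloud3" "subcloud4")
    echo "${TODO_LIST[@]}" > ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}
    # a follow-up 'collect --subcloud --continue' can then collect
    # only the subclouds named in that file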
+ for sc in "${collect_host_ctrl_list[@]}" ; do + info=(${sc//:/ }) + if [ "${info[${INDEX_STAGE}]}" == "${STAGE_MON}" ] ; then + TODO_LIST+=(${info[${INDEX_HOST}]}) + fi + done + if [ ${#TODO_LIST[@]} -ne 0 ] ; then + log "the following ${#TODO_LIST[@]} subclouds were not collected: ${TODO_LIST[@]}" + echo "${TODO_LIST[@]}" > ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE} + COLLECT_CONTINUE_MSG_NEEDED=true + fi + monitoring=false + break + fi + fi + else + collect_host_done ${index} ${rc} + report_error "failed to collect from ${info[${INDEX_HOST}]} [remote]" ${rc} + fi + else + if [ ${DONE_COUNT} -eq 0 ] ; then + if [ ${SECONDS} -gt ${NEXT_PROGRESS_TIME} ] ; then + echo -n "." + let NEXT_PROGRESS_TIME=${SECONDS}+${PROGRESS_INTERVAL} + fi + fi + + monitoring=true + fi + + elif [ "${info[${INDEX_STAGE}]}" == "${STAGE_RUN}" ] ; then + monitoring=true + # update stage to Monitor + collect_host_monitor ${index} + fi + index=$((index+1)) + done + + if [ "${monitoring}" = false ] ; then + ilog "collected from ${DONE_COUNT} subclouds" + break + fi + done + fi + # Report that the overall collect timed-out + if [ "$monitoring" = true ]; then + if [ "${ORCHESTRATED_COLLECT}" = true ] ; then + report_error "collect operation timeout after $TIMEOUT secs" ${FAIL_SUBCLOUD_TIMEOUT} + else + report_error "collect operation timeout after $TIMEOUT secs" ${FAIL_TIMEOUT} + fi + fi +} + +############################################################################ +# +# Handle subcloud and system hosts batched collect +# +############################################################################ + +declare -i TIMEOUT_THRESHOLD_FACTOR=20 +declare -i SUBCLOUDS_TIMEOUT_BOOST=20 +declare -i HOSTS_TIMEOUT_BOOST=10 +declare -i MAX_LIST_PRINT=6 + +if [ "${SUBCLOUD_COLLECT}" = true ] ; then + if [ ${SUBCLOUDS} -eq 0 ] ; then + report_error "no valid subclouds to collect" ${FAIL_NO_SUBCLOUDS} + collect_exit ${FAIL_NO_SUBCLOUDS} + fi + if [ ${SUBCLOUDS} -gt ${TIMEOUT_THRESHOLD_FACTOR} -a "${PARALLEL_COLLECT_MODE}" = true ] ; then + # adjust overall timeout to account for the large number of subclouds + let UNTIL=$(((SUBCLOUDS*SUBCLOUDS_TIMEOUT_BOOST)+TIMEOUT)) + ilog "adjusted subcloud collect timeout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds" + fi + if [ "${ALLHOSTS}" = true ] ; then + if [ ${SUBCLOUDS} -gt ${MAX_LIST_PRINT} ] ; then + ilog "collecting data from all ${SUBCLOUDS} subcloud(s)" + else + ilog "collecting data from ${SUBCLOUDS} subcloud(s)" + fi + elif [ ${SUBCLOUDS} -gt ${MAX_LIST_PRINT} ] ; then + ilog "collecting data from ${SUBCLOUDS} subcloud(s)" + else + ilog "collecting data from ${SUBCLOUDS} subcloud(s): ${SUBCLOUDLIST[@]}" + fi + collect_subclouds "$@" +else + if [ ${HOSTS} -eq 0 ] ; then + report_error "no valid hosts to collect" ${FAIL_NO_HOSTS} + collect_exit ${FAIL_NO_HOSTS} + fi + if [ ${HOSTS} -gt ${TIMEOUT_THRESHOLD_FACTOR} -a "${PARALLEL_COLLECT_MODE}" = true ] ; then + # adjust overall timeout to account for the large number of hosts + let UNTIL=$(((HOSTS*HOSTS_TIMEOUT_BOOST)+TIMEOUT)) + ilog "adjusted hosts collect timeout from ${TIMEOUT} to ${UNTIL} secs to account for ${HOSTS} hosts" + fi + if [ "${ALLHOSTS}" = true ] ; then + if [ ${HOSTS} -gt ${MAX_LIST_PRINT} ] ; then + ilog "collecting data from all ${HOSTS} host(s)" + else + ilog "collecting data from ${HOSTS} host(s)" + fi + elif [ ${HOSTS} -gt ${MAX_LIST_PRINT} ] ; then + ilog "collecting data from ${HOSTS} host(s)" + else + ilog "collecting data from ${HOSTS} host(s): ${HOSTLIST[@]}" + fi + collect_hosts
"$@" +fi + +############################################################################ +# +# Pre tar check. Don't try to create a tarball from an empty COLLECT_DIR +# +############################################################################ + +if [ -d ${COLLECT_DIR} ] ; then + stat ${COLLECT_DIR}/* 2>/dev/null 1>/dev/null + if [ $? -eq 0 ] ; then + tarballs=(${COLLECT_DIR}/*) + for tarball in "${tarballs[@]}" ; do + dlog "collected $tarball" + done + else + elog "No ${COLLECT_DIR} tarballs found ; refusing to create empty ${TARBALL_NAME}" + collect_exit ${FAIL_NO_TARFILES} + fi +else + elog "${COLLECT_DIR} not present ; refusing to create empty ${TARBALL_NAME}" + collect_exit ${FAIL_NO_TARDIR} +fi + +############################################################################ +# +# Proceed with the tar after cleaning up error files. +# These files are used to search for tar failures due to out-of-space logs +# +############################################################################ + +echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... " + +remove_file_local ${COLLECT_ERROR_LOG} +remove_file_local ${HOST_COLLECT_ERROR_LOG} + +/usr/bin/expect << EOF + log_user ${USER_LOG_MODE} + spawn bash -i + expect -re $ + set timeout 200 + send "(cd ${COLLECT_BASE_DIR} ; sudo ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD_APPEND} ${TARBALL_NAME} --remove-files ${COLLECT_NAME}/* 2>>${COLLECT_ERROR_LOG} ; cat ${cmd_done_file})\n" + expect { + "assword:" { + send "${pw}\r" + expect { + "${cmd_done_sig}" { exit ${PASS} } + "${pw_error}" { exit ${FAIL_PASSWORD} } + "${ac_error}" { exit ${FAIL_PERMISSION} } + timeout { exit ${FAIL_TIMEOUT1} } + } + } + timeout { exit ${FAIL_TIMEOUT} } + } +EOF +rc=${?} +if [ ${rc} -ne ${PASS} ] ; then + collect_errors ${HOSTNAME} + report_error "failed to create ${TARBALL_NAME}" ${rc} +else + collect_errors ${HOSTNAME} + rc=$? + if [ ${rc} -eq ${PASS} ] ; then + secs=$((SECONDS-COLLECT_START_TIME)) + echo -n "done" + echo_stats $secs "stats-only" "${TARBALL_NAME}" + log "created ${COLLECT_TYPE} tarball ${TARBALL_NAME}" + + if [ "${ORCHESTRATED_COLLECT}" = true ] ; then + echo "${collect_done}" + fi + else + echo "removing incomplete collect: ${TARBALL_NAME}" + remove_file_local "${TARBALL_NAME}" + + if [ "${COLLECT_CONTINUE_MSG_NEEDED}" = true ] ; then + # collect continue is not supported if the previous collect fails + remove_file_local "${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}" + COLLECT_CONTINUE_MSG_NEEDED=false + fi + fi +fi +remove_file_local ${COLLECT_ERROR_LOG} +remove_dir_local "${COLLECT_DIR}" + +if [ "${COLLECT_CONTINUE_MSG_NEEDED}" = true ] ; then + echo "------------------------------------------------------------------------------------------" + echo "" + wlog "Unable to gather from all requested subclouds due to limited ${COLLECT_BASE_DIR} space." + echo "... Successful subcloud collects stored in ${TARBALL_NAME}" + echo "... List of uncollected subclouds is saved in ${SUBCLOUD_COLLECT_CONTINUE_LIST_FILE}" + echo "... Copy ${TARBALL_NAME} off-system and then delete it from ${COLLECT_BASE_DIR}." + echo "...
Re-run collect subcloud with the --continue option to collect remaining subclouds:" + echo "" + echo " ${HOSTNAME}:$ collect --subcloud --continue" + echo "" + echo "------------------------------------------------------------------------------------------" +fi + +# return to callers dir +cd ${CURR_DIR} + +collect_exit ${rc} diff --git a/tools/collector/debian-scripts/collect_ceph.sh b/tools/collector/debian-scripts/collect_ceph.sh new file mode 100755 index 00000000..1a5863ed --- /dev/null +++ b/tools/collector/debian-scripts/collect_ceph.sh @@ -0,0 +1,81 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="ceph" +LOGFILE="${extradir}/ceph.info" +echo "${hostname}: Ceph Info .........: ${LOGFILE}" + +function is_service_active { + active=`sm-query service management-ip | grep "enabled-active"` + if [ -z "$active" ] ; then + return 0 + else + return 1 + fi +} + +function exit_if_timeout { + if [ "$?" = "124" ] ; then + echo "Exiting due to ceph command timeout" >> ${LOGFILE} + exit 0 + fi +} + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + # Using timeout with all ceph commands because commands can hang for + # minutes if the ceph cluster is down. If ceph is not configured, the + # commands return immediately. + + delimiter ${LOGFILE} "ceph status" + timeout 30 ceph status >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph mon dump" + timeout 30 ceph mon dump >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph osd dump" + timeout 30 ceph osd dump >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph osd tree" + timeout 30 ceph osd tree >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph osd crush dump" + timeout 30 ceph osd crush dump >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + is_service_active + if [ "$?" = "0" ] ; then + exit 0 + fi + + delimiter ${LOGFILE} "ceph df" + timeout 30 ceph df >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph osd df tree" + timeout 30 ceph osd df tree >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + + delimiter ${LOGFILE} "ceph health detail" + timeout 30 ceph health detail >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + exit_if_timeout + +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_containerization.sh b/tools/collector/debian-scripts/collect_containerization.sh new file mode 100755 index 00000000..845b2432 --- /dev/null +++ b/tools/collector/debian-scripts/collect_containerization.sh @@ -0,0 +1,206 @@ +#! /bin/bash +# +# Copyright (c) 2019-2021 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="containerization" +LOGFILE="${extradir}/${SERVICE}.info" +LOGFILE_EVENT="${extradir}/${SERVICE}_events.info" +LOGFILE_API="${extradir}/${SERVICE}_api.info" +LOGFILE_HOST="${extradir}/${SERVICE}_host.info" +LOGFILE_IMG="${extradir}/${SERVICE}_images.info" +LOGFILE_KUBE="${extradir}/${SERVICE}_kube.info" +LOGFILE_PODS="${extradir}/${SERVICE}_pods.info" +LOGFILE_HELM="${extradir}/${SERVICE}_helm.info" + +HELM_DIR="${extradir}/helm" +ETCD_DB_FILE="${extradir}/etcd_database.dump" +KUBE_CONFIG_FILE="/etc/kubernetes/admin.conf" +KUBE_CONFIG="--kubeconfig ${KUBE_CONFIG_FILE}" +echo "${hostname}: Containerization Info ...: ${LOGFILE}" + +############################################################################### +# All nodes +############################################################################### +mkdir -p ${HELM_DIR} +source_openrc_if_needed + +CMD="docker image ls -a" +delimiter ${LOGFILE_IMG} "${CMD}" +${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_IMG} + +CMD="crictl images" +delimiter ${LOGFILE_IMG} "${CMD}" +${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_IMG} + +CMD="ctr -n k8s.io images list" +delimiter ${LOGFILE_IMG} "${CMD}" +${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_IMG} + +CMD="docker container ps -a" +delimiter ${LOGFILE_IMG} "${CMD}" +${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_IMG} + +CMD="crictl ps -a" +delimiter ${LOGFILE_IMG} "${CMD}" +${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_IMG} + +CMD="cat /var/lib/kubelet/cpu_manager_state | python -m json.tool" +delimiter ${LOGFILE_HOST} "${CMD}" +eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HOST} + +############################################################################### +# Active Controller +############################################################################### +if [ "$nodetype" = "controller" -a "${ACTIVE}" = true ] ; then + + # Environment for kubectl and helm + export KUBECONFIG=${KUBE_CONFIG_FILE} + + declare -a CMDS=() + CMDS+=("kubectl version") + CMDS+=("kubectl get nodes -o wide") + CMDS+=("kubectl get nodes --show-labels") + CMDS+=("kubectl get nodes -o json") + CMDS+=("kubectl describe nodes") + CMDS+=("kubectl describe nodes | grep -e Capacity: -B1 -A40 | grep -e 'System Info:' -B13 | grep -v 'System Info:'") + CMDS+=("kubectl get services --all-namespaces") + CMDS+=("kubectl get configmaps --all-namespaces") + CMDS+=("kubectl get daemonsets --all-namespaces") + CMDS+=("kubectl get pods --all-namespaces -o wide") + CMDS+=("kubectl get pvc --all-namespaces") + CMDS+=("kubectl get pvc --all-namespaces -o yaml") + CMDS+=("kubectl get pv --all-namespaces") + CMDS+=("kubectl get pv --all-namespaces -o yaml") + CMDS+=("kubectl get sc --all-namespaces") + CMDS+=("kubectl get serviceaccounts --all-namespaces") + CMDS+=("kubectl get deployments.apps --all-namespaces") + CMDS+=("kubectl get rolebindings.rbac.authorization.k8s.io --all-namespaces") + CMDS+=("kubectl get roles.rbac.authorization.k8s.io --all-namespaces") + CMDS+=("kubectl get clusterrolebindings.rbac.authorization.k8s.io") + CMDS+=("kubectl get clusterroles.rbac.authorization.k8s.io") + for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE_KUBE} "${CMD}" + eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_KUBE} + echo >>${LOGFILE_KUBE} + done + + # api-resources; verbose, place in separate file + CMDS=() + CMDS+=("kubectl api-resources --verbs=list --namespaced -o name | xargs -n 1 kubectl get
--show-kind --ignore-not-found --all-namespaces") + CMDS+=("kubectl api-resources --verbs=list --namespaced -o name | xargs -n 1 kubectl get --show-kind --ignore-not-found --all-namespaces -o yaml") + for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE_API} "${CMD}" + eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_API} + echo >>${LOGFILE_API} + done + + # describe pods; verbose, place in separate file + CMDS=() + CMDS+=("kubectl describe pods --all-namespaces") + for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE_PODS} "${CMD}" + eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_PODS} + echo >>${LOGFILE_API} + done + + # events; verbose, place in separate file + CMDS=() + CMDS+=("kubectl get events --all-namespaces --sort-by='.metadata.creationTimestamp' -o go-template='{{range .items}}{{printf \"%s %s\t%s\t%s\t%s\t%s\n\" .firstTimestamp .involvedObject.name .involvedObject.kind .message .reason .type}}{{end}}'") + for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE_EVENT} "${CMD}" + eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_EVENT} + echo >>${LOGFILE_EVENT} + done + + # Helm related + CMD="helm version" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + echo >>${LOGFILE_HELM} + + HELM_VERSION=$(helm version --client --short) + if [[ $HELM_VERSION =~ v2 ]]; then + CMD="helm list -a" + delimiter ${LOGFILE_HELM} "${CMD}" + APPLIST=$(${CMD} 2>>${COLLECT_ERROR_LOG} | tee -a ${LOGFILE_HELM}) + APPLIST=$(echo "${APPLIST}" | awk '{if (NR!=1) {print}}') + while read -r app; do + APPNAME=$(echo ${app} | awk '{print $1}') + APPREVISION=$(echo ${app} | awk '{print $2}') + helm status ${APPNAME} > ${HELM_DIR}/${APPNAME}.status + helm get values ${APPNAME} --revision ${APPREVISION} \ + > ${HELM_DIR}/${APPNAME}.v${APPREVISION} + done <<< "${APPLIST}" + elif [[ $HELM_VERSION =~ v3 ]]; then + # NOTE: helm environment not configured for root user + CMD="sudo -u sysadmin KUBECONFIG=${KUBECONFIG} helm list --all --all-namespaces" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + + CMD="sudo -u sysadmin KUBECONFIG=${KUBECONFIG} helm search repo" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + + CMD="sudo -u sysadmin KUBECONFIG=${KUBECONFIG} helm repo list" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + fi + + HELM2CLI=$(which helmv2-cli) + if [ $? 
-eq 0 ]; then + CMD="helmv2-cli -- helm version --short" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + + CMD="helmv2-cli -- helm list -a" + delimiter ${LOGFILE_HELM} "${CMD}" + mapfile -t ARR < <( ${CMD} 2>>${COLLECT_ERROR_LOG} ) + printf "%s\n" "${ARR[@]}" >> ${LOGFILE_HELM} + for((i=1; i < ${#ARR[@]}; i++)) + do + APPNAME=$(echo ${ARR[$i]} | awk '{print $1}') + APPREVISION=$(echo ${ARR[$i]} | awk '{print $2}') + ${HELM2CLI} -- helm status ${APPNAME} > ${HELM_DIR}/${APPNAME}.status + ${HELM2CLI} -- helm get values ${APPNAME} --revision ${APPREVISION} \ + > ${HELM_DIR}/${APPNAME}.v${APPREVISION} + done <<< "${APPLIST}" + + CMD="helmv2-cli -- helm search" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + + CMD="helmv2-cli -- helm repo list" + delimiter ${LOGFILE_HELM} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE_HELM} + fi + + CMD="cp -r /opt/platform/helm_charts ${HELM_DIR}/" + delimiter ${LOGFILE} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} + + export $(grep '^ETCD_LISTEN_CLIENT_URLS=' /etc/etcd/etcd.conf | tr -d '"') + + CMD="sudo ETCDCTL_API=3 etcdctl \ + --endpoints=$ETCD_LISTEN_CLIENT_URLS get / --prefix" + + #Use certificate if secured access is detected + SEC_STR='https' + if [[ "$ETCD_LISTEN_CLIENT_URLS" == *"$SEC_STR"* ]]; then + CMD="$CMD --cert=/etc/etcd/etcd-server.crt \ + --key=/etc/etcd/etcd-server.key --cacert=/etc/etcd/ca.crt" + fi + + delimiter ${LOGFILE} "${CMD}" + ${CMD} 2>>${COLLECT_ERROR_LOG} >> ${ETCD_DB_FILE} +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_coredump.sh b/tools/collector/debian-scripts/collect_coredump.sh new file mode 100644 index 00000000..7614909f --- /dev/null +++ b/tools/collector/debian-scripts/collect_coredump.sh @@ -0,0 +1,35 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="coredump" +LOGFILE="${extradir}/${SERVICE}.info" + + +COREDUMPDIR="/var/lib/systemd/coredump" + +echo "${hostname}: Core Dump Info ....: ${LOGFILE}" + +files=`ls ${COREDUMPDIR} | wc -l` +if [ "${files}" == "0" ] ; then + echo "No core dumps" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +else + COMMAND="ls -lrtd ${COREDUMPDIR}/*" + delimiter ${LOGFILE} "${COMMAND}" + ${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + COMMAND="md5sum ${COREDUMPDIR}/*" + delimiter ${LOGFILE} "${COMMAND}" + ${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_crash.sh b/tools/collector/debian-scripts/collect_crash.sh new file mode 100644 index 00000000..ce35ba8c --- /dev/null +++ b/tools/collector/debian-scripts/collect_crash.sh @@ -0,0 +1,38 @@ +#! /bin/bash +# +# Copyright (c) 2016-2020 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="crash" +LOGFILE="${extradir}/${SERVICE}.info" + + +CRASHDIR="/var/lib/kdump" + +echo "${hostname}: Kernel Crash Info .: ${LOGFILE}" + +COMMAND="find ${CRASHDIR}" +delimiter ${LOGFILE} "${COMMAND}" +${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +COMMAND="rsync -a --include=*.txt --include=*/ --exclude=* ${CRASHDIR} ${basedir}/var/" +delimiter ${LOGFILE} "${COMMAND}" +${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +COMMAND="ls -lrtd ${CRASHDIR}/*" +delimiter ${LOGFILE} "${COMMAND}" +${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +COMMAND="md5sum ${CRASHDIR}/*" +delimiter ${LOGFILE} "${COMMAND}" +${COMMAND} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +exit 0 diff --git a/tools/collector/debian-scripts/collect_date b/tools/collector/debian-scripts/collect_date new file mode 100755 index 00000000..22c62fb1 --- /dev/null +++ b/tools/collector/debian-scripts/collect_date @@ -0,0 +1,1064 @@ +#!/bin/bash +####################################################################### +# +# Copyright (c) 2017 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +######################################################################## +# +# This file is a new member of the Titanium Cloud "Log Collect Utility". +# This file adds date restricted collect support. +# +# This file is invoked by collect_host when a date restricted +# collect is requested using the --start-date and/or --end-date options. +# +# This new date restricted collect service applies to /var/log and its +# subdirectories only. This service determines if a log file is to be +# included in dated collect by looking at the logs at the head and tail +# of the files and subdirectories in /var/log. Those dates are then +# compared to the user specified date range. If a file is determined to +# contain logs within that date range then that file is included in the +# collect log. A valid log date prefix is "YYYY-MM-DD". +# +# Unfortunately, not all log files contain the correct date placement and +# format. This feature has implemented special case handling for many, +# but not all, of such cases. To avoid accidental exclusion of a key file, +# this feature will by default include log files whose log date content +# could not be determined, provided their file date is after the specified +# start date. +# +# Note: local convention, example ${head_date} vs ${HEAD_DATE} +# +# Lower case date variables contain integer values while +# Upper case date variables contain formatted string values of same. +# +# Calling sequence: +# +# /usr/local/sbin/collect_date +# /usr/local/sbin/collect_date 20170701 20170901 /tmp/file.list true +# +######################################################################## + +# +# Import commands, variables and convenience functions available to +# all collectors ; common and user defined. +# +source /usr/local/sbin/collect_utils + +# where to find the logs +declare -r baselogdir="/var/log" + +# include / exclude labels +declare -r INCLUDE_FILE="inc" +declare -r EXCLUDE_FILE="exc" + +# a global reason string that is only valid +# in the context of the file being looked at.
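A hedged sketch of the inclusion test the header above describes, using the year*365 + month*31 + day integer encoding defined by date_to_int below (the file dates here are invented, and the real implementation also special-cases files whose log dates cannot be parsed):

    # a file is included when its [head, tail] log-date span overlaps
    # the requested [start, end] range
    head_date=736423    # first log in file: 2017-07-01
    tail_date=736517    # last log in file:  2017-10-02
    start_date=736454   # --start-date 20170801
    end_date=736485     # --end-date   20170901
    if [ ${head_date} -le ${end_date} ] && [ ${tail_date} -ge ${start_date} ] ; then
        echo "include"  # spans overlap, collect this file
    fi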
+declare __this_reason=""
+
+# setup defaults
+INC_FILE_LIST="/var/run/collect_include.list"
+EXC_FILE_LIST="/var/run/collect_exclude.list"
+NOD_FILE_LIST="/var/run/collect_nodate.list"
+
+BOT_DATE="2000-01-01"  # beginning of time date
+bot_date=730032        # beginning of time date as integer (per date_to_int)
+
+EOT_DATE="9999-12-31"  # end of time date
+eot_date=3650038       # end of time date as integer (per date_to_int)
+
+# manage debug mode
+DEBUG="${4}"
+set_debug_mode "${DEBUG}"
+echo "Debug Mode: ${DEBUG}"
+
+dlog "collect_date args: ${1} ${2} ${3} ${4} ${5}"
+
+#############################################################################
+#
+# 'track' is the main accounting procedure that manages file inclusions and
+#         exclusions as well as the metrics around all the parsed files.
+#
+#         It also reports accounting mismatch logs, if they occur (should
+#         not) and the file that started the mismatch (to assist in debug).
+#
+# $1 - filename
+# $2 - label
+#
+#############################################################################
+
+# accounting defaults
+declare -i file_count=0
+declare -i inc_file_count=0
+declare -i exc_file_count=0
+declare -i empty_file_count=0
+
+function track()
+{
+    local fn="${1}"
+    local label="${2}"
+
+    if [ -z "${fn}" ] ; then
+        elog "Ignoring call with empty filename"
+        return
+
+    elif [ "${label}" == "totals" ] ; then
+        ((file_count++))
+        return
+
+    elif [ "${label}" == "empty" ] ; then
+        ((empty_file_count++))
+        return
+
+    elif [ "${label}" == "${INCLUDE_FILE}" ] ; then
+        manage_file "${fn}" "${label}" "${__this_reason}"
+        ((inc_file_count++))
+
+    elif [ "${label}" == "${EXCLUDE_FILE}" ] ; then
+        manage_file "${fn}" "${label}" "${__this_reason}"
+        ((exc_file_count++))
+
+    else
+        elog "Unknown label '${label}'"
+
+    fi
+
+    sum=$((inc_file_count + exc_file_count))
+    if [ ${file_count} -ne ${sum} ] ; then
+        wlog "MISMATCH: ${file_count} != ${inc_file_count} + ${exc_file_count} - ${fn}"
+    fi
+}
+
+############################################################################
+#
+# 'summary' is an accounting display procedure used to show the
+#           accounting results: the total number of files processed,
+#           the number of empty files and, most importantly, the number
+#           of included and excluded files.
+#
+############################################################################
+
+function summary()
+{
+    dlog "Summary:"
+    dlog "Total Files: ${file_count}"
+    dlog "Empty Files: ${empty_file_count}"
+    dlog "Added Files: ${inc_file_count}"
+    dlog "Omitd Files: ${exc_file_count}"
+}
+
+#############################################################################
+#
+# 'date_to_int' converts a standard formatted YYYY-MM-DD string date
+#               to an integer and stores it in the __this_integer_date
+#               variable to be used in context on demand.
+#
+#############################################################################
+
+# short lived global integer date value updated by the date_to_int utility
+declare -i __this_integer_date=""
+
+function date_to_int()
+{
+    local yy="${1:0:4}"
+    local mm="${1:5:2}"
+    local dd="${1:8:2}"
+
+    # handle leading zeros in month and day
+    if [ "${mm:0:1}" == "0" ] ; then
+        mm=${mm:1:1}
+    fi
+    if [ "${dd:0:1}" == "0" ] ; then
+        dd=${dd:1:1}
+    fi
+
+    # 365 days in a year, 31 days in a month, 1 day in a day
+    __this_integer_date=$((yy*365 + mm*31 + dd))
+}
+
+############################################################################
+#
+# 'create_list_file' removes any old/stale list file and creates a new
+#                    empty one with correct permissions.
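+#
+# Worked example of the date_to_int mapping used throughout this file
+# (illustrative values): "2017-10-02" maps to
+#
+#     2017*365 + 10*31 + 2 = 736205 + 310 + 2 = 736517
+#
+# This is not a true day count and can mis-order a few days across a year
+# boundary (2017-12-31 -> 736608 vs 2018-01-01 -> 736602), a known
+# trade-off of the simple formula that the coarse range checks here accept.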
+#
+############################################################################
+
+function create_list_file()
+{
+    local fn="${1}"
+    if [ -e "${fn}" ] ; then
+        rm -f "${fn}"
+    fi
+    touch "${fn}"
+    chmod 644 "${fn}"
+}
+
+########################################################################
+#
+# Handle the incoming 'start' and 'end' date format defensively.
+#
+# If the date has no dashes, as it would come in from the user's date
+# specification, then rewrite it into the standard '-' delimited form,
+# i.e. 20171002 is updated to 2017-10-02.
+#
+# If it is verified to be in the standard format already, just copy it in.
+#
+# Otherwise assume the start date is the beginning of time or the
+# end date is the end of time.
+
+# load up the start date string and integer representation
+if [ -z "${1}" ] ; then
+    START_DATE="${BOT_DATE}"
+elif [[ "${1}" =~ [0-9]{4}[0-9]{2}[0-9]{2} ]] ; then
+    START_DATE="${1:0:4}-${1:4:2}-${1:6:2}"
+elif [[ "${1}" =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]] ; then
+    START_DATE="${1}"
+else
+    START_DATE="${BOT_DATE}"
+fi
+
+# Convert the correct or corrected 'start' date to an integer value
+date_to_int "${START_DATE}"
+start_date=${__this_integer_date}
+
+
+# load up the end date string and integer representation
+if [ -z "${2}" ] ; then
+    END_DATE="${EOT_DATE}"
+elif [[ "${2}" =~ [0-9]{4}[0-9]{2}[0-9]{2} ]] ; then
+    END_DATE="${2:0:4}-${2:4:2}-${2:6:2}"
+elif [[ "${2}" =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]] ; then
+    END_DATE="${2}"
+else
+    END_DATE="${EOT_DATE}"
+fi
+
+# Convert the correct or corrected 'end' date to an integer value
+date_to_int "${END_DATE}"
+end_date=${__this_integer_date}
+
+# Handle user error of specifying an end date that is before the start date
+if [ ${start_date} -gt ${end_date} ] ; then
+    wlog "invalid date range ; end date (${END_DATE}:${end_date}) is before start (${START_DATE}:${start_date})"
+    wlog "correcting to defaults: from ${BOT_DATE} to ${EOT_DATE}"
+    START_DATE="${BOT_DATE}"
+    END_DATE="${EOT_DATE}"
+    start_date=${bot_date}
+    end_date=${eot_date}
+fi
+
+ilog "collecting log files containing logs dated ${START_DATE} to ${END_DATE} (inclusive)"
+
+
+if [ "${3}" == "" ] ; then
+    elog "dated collect include file list name not specified ... exiting"
+    exit 1
+else
+    VAR_LOG_INCLUDE_LIST=${3}
+fi
+
+create_list_file "${VAR_LOG_INCLUDE_LIST}"
+create_list_file "${INC_FILE_LIST}"
+create_list_file "${EXC_FILE_LIST}"
+create_list_file "${NOD_FILE_LIST}"
+
+# Declare and init the include and exclude debug lists.
+inclist=("")
+exclist=("")
+
+#############################################################################
+#
+# 'filedatelist' is a list of files that are known to not contain dated logs.
+#                Instead these files are included unless their file date is
+#                older than the specified start date.
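+#
+# For these files the decision below reduces to a file modification time
+# check, roughly (illustrative, /var/log/wtmp as the example):
+#
+#     FILE_DATE=$(stat -c %y /var/log/wtmp | cut -b 1-10)   # e.g. "2017-10-02"
+#     date_to_int "${FILE_DATE}"
+#     [ ${__this_integer_date} -ge ${start_date} ]          # true -> include
+#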
+# +############################################################################# + +filedatelist=("") +filedatelist+=("/var/log/wtmp") +filedatelist+=("/var/log/dmesg") +filedatelist+=("/var/log/dmesg.old") +filedatelist+=("/var/log/sm-trap.log") +filedatelist+=("/var/log/sm-customer.log") +filedatelist+=("/var/log/sm-customer.alarm") +filedatelist+=("/var/log/sm-shutdown.log") +filedatelist+=("/var/log/nfv-vim-events.log") +filedatelist+=("/var/log/fm-customer.log") +filedatelist+=("/var/log/fm-alarm.log") +filedatelist+=("/var/log/lighttpd-access.log") +filedatelist+=("/var/log/audit/audit.log") +filedatelist+=("/var/log/rabbitmq/shutdown_log") +filedatelist+=("/var/log/rabbitmq/startup_log") +filedatelist+=("/var/log/rabbitmq/wait_log") +filedatelist+=("/var/log/rabbitmq/rabbit@localhost.log") +filedatelist+=("/var/log/nfv-vim-alarms.log") +filedatelist+=("/var/log/vswitch.cmds.log") + +# This is a list of files to always include +autoaddlist=("") +autoaddlist+=("/var/log/collect.log") + +######################################################################### +# +# 'is_in_range' returns true if the specified log file data range +# is within the bounded date range specified by the caller. +# Otherwise a false is returned. +# +# ${1} is HEAD_DATE and is the date of the first log of the file in contect +# ${2} is TAIL_DATE and is the date of the last log in the file in context +# +# expected date format is ... YYYY-MM-DD +# +# Calling Sequence is ... is_in_range HEAD_DATE TAIL_DATE +# +# There are several cases that aer handled ; +# see case comment inline below. +# +######################################################################### + +function is_in_range() +{ + local HEAD_DATE="${1}" + local TAIL_DATE="${2}" + if [[ ${HEAD_DATE} =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + # Convert the date to an integer value + # to make the compare easier and faster + date_to_int "${HEAD_DATE}" + head_date=${__this_integer_date} + + if [[ ${TAIL_DATE} =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + # Convert the date to an integer value + # to make the compare easier and faster + date_to_int "${TAIL_DATE}" + tail_date=${__this_integer_date} + + in_range=false + + # The last log is before the start date or the first log is after the end date + # if [[ "${TAIL_DATE}" < "${START_DATE}" || "${HEAD_DATE}" > "${END_DATE}" ]] ; then + if [ ${tail_date} -lt ${start_date} -o ${head_date} -gt ${end_date} ] ; then + __this_reason+=":case 0" + in_range=false + + # Case 1: the head after the start but before the end date + # .... S ... head ... E .... + elif [ ${head_date} -ge ${start_date} -a ${head_date} -le ${end_date} ] ; then + __this_reason+=":case 1" + in_range=true + + # Case 2: the tail after the start but before the end date + # .... S ... tail ... E .... + elif [ ${tail_date} -ge ${start_date} -a ${tail_date} -le ${end_date} ] ; then + __this_reason+=":case 2" + in_range=true + + # Case 3: log file date range spans the start and end dates + # head S ... ... E tail + elif [ ${head_date} -le ${start_date} -a ${tail_date} -ge ${end_date} ] ; then + __this_reason+=":case 3" + in_range=true + + else + __this_reason+=":default" + fi + else + __this_reason+=":invalid-tail-date" + # so the tail date is unknown. 
+ # include this file as long as the head date is before end date + if [ ${head_date} -lt ${end_date} ] ; then + in_range=true + else + in_range=false + fi + fi + + if [ "${in_range}" = true ] ; then + __this_reason+=":in-range ${HEAD_DATE} to ${TAIL_DATE}" + true + else + __this_reason+=":out-of-range ${HEAD_DATE} to ${TAIL_DATE}" + false + fi + return + fi + + __this_reason+=":date-format-error ${HEAD_DATE} to ${TAIL_DATE}" + true + return +} + +########################################################################### +# +# Name : want_this_file +# +# Description: This utility first compares the filename to known exception +# cases and handles them accordingly. Exception cases do look +# for the date but with different methods. Once the date info +# is or is not found then the choice to or not to include it +# follows same general logic as others below. +# +# If not an exception case then it determines the file type +# and performs any preprocessing required. i.e. uncompressing +# the file and switching the filename to the uncompressed name. +# Data files or other unknown file types are automatically +# included without further data query by immediately returning +# true. +# +# With an expected supported filename in hand this utility will +# extract the date-only (time not included) portion, the first +# 10 characters of the first and last logs and determin if this +# logfile has logs that fall withing the specified date range. +# +# Returns : If there is no valid date found then true is returned. +# If file contains in range logs then true is returned. +# if file does not contain in range logs then false is returned. +# +# Parameters : $1 is the full pathed log file name. +# +# $1 - the filename of the file to check the date for +# +########################################################################### + +function want_this_file() +{ + local inc=true + local LOGFILE="${1}" + local filetype=$(file "${LOGFILE}") + local HEAD_DATE="" + local TAIL_DATE="" + + for add in "${autoaddlist[@]}" + do + if [ "${add}" == "${LOGFILE}" ] ; then + __this_reason+="autoadd" + true + return + fi + done + + ########################################################################## + # Exception Case: known free formatted log files. + ########################################################################## + # + # Some log files are known to not contain properly dated logs. + # Such files may just contian free format strings of information. + # + # A list of such files is in hard coded in filedatelist. + # TODO: consider making this a file that is loaded. + # + # Check to see if this is an auto add file + # Only exlude such files if its last modified date is before start date. + # + ########################################################################## + for add in "${filedatelist[@]}" + do + if [ "${add}" == "${LOGFILE}" ] ; then + __this_reason+="filedate" + + # Don't include empty files that are in the hard coded filedatelist + filetype=$(file "${LOGFILE}") + if [ ! -z "${filetype}" ] ; then + case ${filetype} in + *empty*) + __this_reason="empty" + track "${LOGFILE}" "empty" + false + return + ;; + *) + ;; + esac + fi + + # get last modified date + FILE_DATE=$(stat -c %y "${LOGFILE}" | cut -b 1-10) + date_to_int "${FILE_DATE}" + if [ ${__this_integer_date} -ge ${start_date} ] ; then + __this_reason+=":in-range ${FILE_DATE}" + true + else + __this_reason+=":out-of-range ${FILE_DATE}" + false + fi + return + fi + done + + # O.K. 
if we get here then this filename is not in the static list + if [ ! -z "${filetype}" ] ; then + + case ${filetype} in + + *directory*) + # Skip over a directory only path. + # No worries, the files in that directory will be handled. + __this_reason+="directory" + false + return + ;; + + *ASCII*|*text*|*compressed*) + + if [[ ${filetype} == *"compressed"* ]] ; then + fileext=${LOGFILE##*.} + case "${fileext}" in + gz) + tmpfile=$(mktemp) + #__this_reason+="gzipped" + zcat "${LOGFILE}" | head -5 > "$tmpfile" + zcat "${LOGFILE}" | tail -5 >> "$tmpfile" + + # save the current compressed log filename + # so that it can be restored after the + # recursion call below + LOGFILE_save="${LOGFILE}" + want_this_file "$tmpfile" + rc=${?} + LOGFILE="${LOGFILE_save}" + + # cleanup ; get rid of the temp file + rm -f "$tmpfile" 2>/dev/null + if [ ${rc} -eq 0 ] ; then + true + else + false + fi + return + ;; + tgz) + __this_reason+="tarball" + true + return + ;; + *) + __this_reason+="compress:[${fileext}]" + true + return + ;; + esac + fi + + # Read the first log in the file + HEAD_DATE=$(head -1 "${LOGFILE}") + + ############################################################## + # Minor Exception Case: empty/short first log + ############################################################## + # + # handle one empty or short first line by fetching second log + # + ############################################################## + + if [ ${#HEAD_DATE} -lt 10 ] ; then + HEAD_DATE=$(head -2 "${LOGFILE}" | sed -n '2p' | cut -b 1-11) + fi + + + ############################################################## + # Typical Case: YYYY-MM-DD + ############################################################## + # + # check for most typical date format. + # + ############################################################## + + if [[ ${HEAD_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + __this_reason+="typical" + TAIL_DATE=$(tail -1 "${LOGFILE}" | cut -b 1-11) + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + # a call to 'is_in_range' returns false (1) if this + # file's logs are all out of range date + is_in_range "${HEAD_DATE:0:10}" "${TAIL_DATE:0:10}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + + else + + ####################################################### + # Exception Case: Unrecognized date format in last log + ####################################################### + # + # try the second last line. This case is typical in + # cron.log in 15.12 MAIL logs which send a purious ')' + # as a second log. Also if the log file has auto blank + # lines between logs leaving a blank line as the last + # log. + # + # this exception ties the second last log instead. + # + ####################################################### + TAIL_DATE=$(tail -2 "${LOGFILE}" | sed -n '1p' | cut -b 1-11) + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + is_in_range "${HEAD_DATE:0:10}" "${TAIL_DATE:0:10}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + + else + # default to true if the dates could not be parsed + __this_reason+=":invalid-tail-date" + + date_to_int "${HEAD_DATE}" + head_date=${__this_integer_date} + + # so the tail date is unknown. 
+ # include this file as long as the head date is before end date + if [ ${head_date} -lt ${end_date} ] ; then + true + else + false + fi + return + fi + fi + + else + + ########################################################### + # Exception Case 1: logs date prefix starts with '[' + ########################################################### + # + # logdate starts with a '[' ... [2017-10-02 + # + # In this case we just recognize it and increment past it + # and then assume the last log will have the same format + # + ########################################################### + + if [ "${HEAD_DATE:0:1}" == "[" ] ; then + __this_reason+="exception1" + HEAD_DATE=${HEAD_DATE:1:11} + if [[ ${HEAD_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + TAIL_DATE=$(tail -1 "${LOGFILE}" | cut -b 2-11) + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + __this_reason+=".1" + is_in_range "${HEAD_DATE:0:10}" "${TAIL_DATE:0:10}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + else + TAIL_DATE=$(tail -1 "${LOGFILE}" | cut -b 1-10) + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + __this_reason+=".2" + is_in_range "${HEAD_DATE:0:10}" "${TAIL_DATE:0:10}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + + else + + if [ "${TAIL_DATE:0:1}" == "[" ] ; then + __this_reason+=".3" + TAIL_DATE=${TAIL_DATE:1:11} + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + is_in_range "${HEAD_DATE}" "${TAIL_DATE}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + else + __this_reason+=":invalid-tail-date" + true + return + fi + else + __this_reason+=":tail-date-not-found" + is_in_range "${HEAD_DATE}" "${EOT_DATE}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + fi + fi + fi + else + # /var/log/dmesg is typical of this case + # no log date and many logs start with [ uptime] + __this_reason+=":invalid-head-date" + true + return + fi + + ########################################################### + # Exception Case 2: journel.log handling + ########################################################### + # + # first log in file contains start and stop date + # + # "-- Logs begin at Thu 2017-07-06 12:28:35 UTC, end at Thu 2017-07-06 12:33:31 UTC. --" + # ^^^^^^^^^^ ^^^^^^^^^^ + # + # This exception case gets the head and tail log date from + # this first log. + ########################################################### + + elif [ "${HEAD_DATE:0:13}" == "-- Logs begin" ] ; then + __this_reason+="exception2" + + # need to get more of the line + HEAD_DATE=$(head -1 "${LOGFILE}") + + is_in_range "${HEAD_DATE:21:10}" "${HEAD_DATE:57:10}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + + ########################################################### + # Exception Case 3: journel.log handling + ########################################################### + # + # some logs like openstack.log have some logs that are + # prefixed by keystone:log. This case handles that + # + ########################################################### + elif [ "${HEAD_DATE:0:13}" == "keystone:log " ] ; then + __this_reason+="exception3" + + # need to get more of the line + HEAD_DATE="${HEAD_DATE:13:10}" + TAIL_DATE=$(tail -1 "${LOGFILE}") + + if [ "${TAIL_DATE:0:13}" == "keystone:log " ] ; then + TAIL_DATE="${TAIL_DATE:13:10}" + else + TAIL_DATE="${TAIL_DATE:0:10}" + fi + + is_in_range "${HEAD_DATE}" "${TAIL_DATE}" + if [ $? 
-eq 0 ] ; then + true + else + false + fi + return + + else + + ####################################################### + # Exception Case 4: horizon.log + ####################################################### + # + # Search the first and last 30 logs for a valid date. + # This should handle seeing a traceback at the head or + # tail of the log file. + # + ####################################################### + __this_reason+="exception4" + temp_head=$(head -30 "${LOGFILE}") + for ((loop_head=1;loop_head<31;loop_head++)) + do + HEAD_DATE=$(echo "${temp_head}" | sed -n "${loop_head}"p | cut -b 1-10) + if [[ ${HEAD_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + temp_tail=$(tail -30 "${LOGFILE}") + for ((loop_tail=1;loop_tail<31;loop_tail++)) + do + TAIL_DATE=$(echo "${temp_tail}" | sed -n ${loop_tail}p | cut -b 1-10) + if [[ ${TAIL_DATE} =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then + + is_in_range "${HEAD_DATE}" "${TAIL_DATE}" + if [ $? -eq 0 ] ; then + true + else + false + fi + return + + fi + done + + # default to including it if no date at + # the end of the file is found + true + return + fi + done + + ###################################################### + # Exception Case 5: + ###################################################### + # + # Otherwise the file has no date or the date + # format is unrecognized so just include the file + # regardless of its date. + # + ###################################################### + __this_reason="nodate" + true + return + fi + fi + ;; + + *archive*) + + # Archive files like .tar are not extracted. + # Instead it is only collected if its last modified date is + # after the start date + + __this_reason+="archive" + FILE_DATE=$(stat -c %y "${LOGFILE}" | cut -b 1-10) + date_to_int "${FILE_DATE}" + if [ ${__this_integer_date} -ge ${start_date} ] ; then + __this_reason+=":in-range ${FILE_DATE}" + true + else + __this_reason+=":out-of-range ${FILE_DATE}" + false + fi + return + ;; + + *empty*) + __this_reason="empty" + track "${LOGFILE}" "empty" + false + return + ;; + + *data*) + __this_reason="data" + true + return + ;; + + *executable*) + __this_reason="executable" + true + return + ;; + + # very short file (no magic) + *"very short file"*) + __this_reason="small" + true + return + ;; + + *link*) + __this_reason="link" + false + return + ;; + + *swap*) + + __this_reason="swap" + false + return + ;; + + *fifo*) + + __this_reason="fifo" + false + return + ;; + + *socket*) + + __this_reason="socket" + false + return + ;; + + *) + __this_reason="other" + true + return + ;; + esac + else + __this_reason="unknown" + wlog "Adding ${logfile} ; unknown filetype" + true + return + fi + + # catch all default + true + return +} + +############################################################################# +# +# 'manage_file' adds the specified file to either the 'include' or exclude' +# reason lists. In the include case the most important part of +# this function appends the filename to the file specified by +# "VAR_LOG_INCLUDE_LIST" which is the file that collect_host +# uses to know what files in /var/log need to be included in +# the collect tarball. 
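+#
+# The include list carries bare file paths, one per line, while the
+# inc/exc reason lists carry annotated entries ; e.g. (hypothetical):
+#
+#     /var/log/daemon.log
+#     /var/log/daemon.log included (typical:case 1:in-range 2017-07-02 to 2017-07-30)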
+#
+#############################################################################
+
+function manage_file()
+{
+    local filename="${1}"
+    local action="${2}"
+    local reason="${3}"
+
+    if [ "${action}" == "${EXCLUDE_FILE}" ] ; then
+        echo "${filename} excluded (${reason})" >> "${EXC_FILE_LIST}"
+    else
+        echo "${filename} included (${reason})" >> "${INC_FILE_LIST}"
+
+        # add the file to the list of files to be collected
+        echo "${filename}" >> ${VAR_LOG_INCLUDE_LIST}
+    fi
+
+    dlog "${action}: ${filename} (${reason})"
+}
+
+#############################################################################
+#
+# 'handle_response' includes or excludes the specified file based on
+#                   argument $2 being 0 (true : include) or
+#                                    !0 (false : exclude)
+#
+# $1 - file
+# $2 - include control ( true or false )
+#
+#############################################################################
+
+function handle_response()
+{
+    local logfile="${1}"
+    local include="${2}"
+
+    if [ "${include}" -eq 0 ] ; then
+        inclist=("${inclist[@]}" ${logfile})
+        track "${logfile}" "${INCLUDE_FILE}"
+
+    else
+        exclist=("${exclist[@]}" ${logfile})
+        track "${logfile}" "${EXCLUDE_FILE}"
+    fi
+
+    # record any that have been tagged as 'nodate' as
+    # candidates for special handling.
+    if [[ "${__this_reason}" == *"nodate"* ]] ; then
+        echo "${logfile}" >> "${NOD_FILE_LIST}"
+    fi
+}
+
+###########################################################################
+###########################################################################
+#
+# Let's start looking at the files now ...
+#
+# Get all the files in the /var/log base dir (not the subdirectories)
+#
+###########################################################################
+###########################################################################
+
+# get a list of the files in "baselogdir" ; aka /var/log
+# will look at the sub directories later.
+dirlist+=$(find ${baselogdir} -mindepth 1 -maxdepth 1 -type f)
+
+#
+# Debug:
+#
+# To debug handling a specific file as a filelist override.
+# This clears the list in favor of the specific file specified as
+# argument 5 on the command line.
+#
+if [ "${5}" != "" ] ; then
+    dlog "Overriding dirlist with specified file:${5}"
+    dirlist=("${5}")
+fi
+
+# echo "${baselogdir} filelist: ... ${dirlist}..."
+for logfile in ${dirlist}
+do
+    # echo "File: ${logfile}"
+    __this_reason=""
+    track "${logfile}" "totals"
+    want_this_file "${logfile}"
+    handle_response "${logfile}" "${?}"
+done
+
+###########################################################################
+# Get all the files in baselogdir subdirectories                          #
+###########################################################################
+
+subdirlist=$(find ${baselogdir} -mindepth 1 -maxdepth 20 -type d)
+
+#
+# Debug:
+#
+# To debug handling a specific file that is in a /var/log subdirectory as a
+# filelist override.
+#
+if [ "${5}" != "" ] ; then
+    dlog "Overriding subdirlist with specified file:${5}"
+    subdirlist=("")
+fi
+
+# echo "${baselogdir} subdirlist ${subdirlist}..."
+for logdir in ${subdirlist}
+do
+    __this_reason=""
+
+    # this find must find more than just its own dir
+    # so we compare to greater than one
+    if [ $(find "${logdir}" | wc -l) -gt 1 ]; then
+        for logfile in ${logdir}/*
+        do
+            __this_reason=""
+            track "$logfile" "totals"
+            want_this_file "$logfile"
+            handle_response "$logfile" "$?"
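+            # want_this_file's exit status (0 = include, non-zero = exclude)
+            # is what feeds handle_response above, just as in the base
+            # /var/log loop earlier.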
+ done + else + __this_reason="empty" + manage_file "${logdir}" "${EXCLUDE_FILE}" "empty directory" + fi +done + + +dlog "Include List: ${INC_FILE_LIST}" +for inc in "${inclist[@]}" +do + if [ ${#inc} -gt 2 ] ; then + dlog "including ${inc}" + # echo "${inc}" >> "${INC_FILE_LIST}.summary" + fi +done + + +dlog "Exclude List: ${EXC_FILE_LIST}" +for exc in "${exclist[@]}" +do + if [ ${#exc} -gt 2 ] ; then + dlog "excluding ${exc}" + # echo "${exc}" >> "${EXC_FILE_LIST}.summary" + fi +done + +summary + +exit 0 diff --git a/tools/collector/debian-scripts/collect_dc.sh b/tools/collector/debian-scripts/collect_dc.sh new file mode 100755 index 00000000..da44a935 --- /dev/null +++ b/tools/collector/debian-scripts/collect_dc.sh @@ -0,0 +1,97 @@ +#! /bin/bash +# +# Copyright (c) 2020-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="distributed_cloud" +LOGFILE="${extradir}/${SERVICE}.info" +RPMLOG="${extradir}/rpm.info" + +function is_active_controller { + active_controller=`sm-query service management-ip | grep "enabled-active"` + if [ -z "$active_controller" ] ; then + return 0 + else + return 1 + fi +} + +function is_distributed_cloud_env { + distributed_cloud=`sm-query service-group distributed-cloud-services | grep "active"` + if [ -z "$distributed_cloud" ] ; then + return 0 + else + return 1 + fi +} + +function is_subcloud { + subcloud=`cat /etc/platform/platform.conf | grep "distributed_cloud_role" | grep "subcloud"` + if [ -z "$subcloud" ] ; then + return 0 + else + return 1 + fi +} + +# Must be a distributed cloud environment +is_distributed_cloud_env +if [ "$?" = "0" ] ; then + exit 0 +fi + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + # Must be an active controller + is_active_controller + if [ "$?" = "0" ] ; then + exit 0 + fi + + echo "${hostname}: Distributed Cloud ..: ${LOGFILE}" + + is_subcloud + if [ "$?" = "1" ] ; then + # Subcloud + echo "Distributed Cloud Role: Subcloud" >> ${LOGFILE} + + delimiter ${LOGFILE} "Address Pool of System Controller" + # Prints the column names of the table + system addrpool-list --nowrap | head -3 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + # Prints the System Controller's address pool + system addrpool-list --nowrap | grep "system-controller-subnet" 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + else + # System Controller + echo "Distributed Cloud Role: System Controller" >> ${LOGFILE} + + delimiter ${LOGFILE} "dcmanager alarm summary" + dcmanager alarm summary 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "dcmanager subcloud list" + dcmanager subcloud list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "dcmanager subcloud-group list" + dcmanager subcloud-group list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + # copy the /opt/dc/ansible dir but exclude any iso files + rsync -a --exclude '*.iso' /opt/dc/ansible ${extradir} + + delimiter ${LOGFILE} "find /opt/dc-vault -ls" + find /opt/dc-vault -ls 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + fi + +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_disk.sh b/tools/collector/debian-scripts/collect_disk.sh new file mode 100644 index 00000000..e2495c1e --- /dev/null +++ b/tools/collector/debian-scripts/collect_disk.sh @@ -0,0 +1,28 @@ +#! 
/bin/bash +# +# Copyright (c) 2020 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="disk" +LOGFILE="${extradir}/${SERVICE}.info" + +############################################################################### +# Disk Info +############################################################################### + +echo "${hostname}: Disk Info .: ${LOGFILE}" + +for device in $(lsblk -l -o NAME,TYPE,TRAN | grep -v usb | grep -e disk | cut -d ' ' -f1); do + delimiter ${LOGFILE} "smartctl -a ${device}" + smartctl -a "/dev/${device}" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +exit 0 diff --git a/tools/collector/debian-scripts/collect_fm.sh b/tools/collector/debian-scripts/collect_fm.sh new file mode 100644 index 00000000..6f9e45e2 --- /dev/null +++ b/tools/collector/debian-scripts/collect_fm.sh @@ -0,0 +1,43 @@ +#! /bin/bash +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="alarms" +LOGFILE="${extradir}/${SERVICE}.info" + +function is_service_active { + active=`sm-query service management-ip | grep "enabled-active"` + if [ -z "$active" ] ; then + return 0 + else + return 1 + fi +} + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + is_service_active + if [ "$?" = "0" ] ; then + exit 0 + fi + + echo "${hostname}: System Alarm List .: ${LOGFILE}" + + # These go into the SERVICE.info file + delimiter ${LOGFILE} "fm alarm-list" + fm alarm-list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + delimiter ${LOGFILE} "fm event-list --nopaging" + fm event-list --nopaging 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_host b/tools/collector/debian-scripts/collect_host new file mode 100755 index 00000000..f552fa18 --- /dev/null +++ b/tools/collector/debian-scripts/collect_host @@ -0,0 +1,488 @@ +#! /bin/bash +######################################################################## +# +# Copyright (c) 2016-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +######################################################################## + +# make these platform.conf variables global. +# values are loaded in source_openrc_if_needed. +export nodetype="" +export subfunction="" +export system_type="" +export security_profile="" +export sdn_enabled="" +export region_config="" +export vswitch_type="" +export system_mode="" +export sw_version="" + +# assume this is not the active controller until learned +export ACTIVE=false + +# +# Import commands, variables and convenience functions available to +# all collectors ; common and user defined. 
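+# (Among these are the delimiter, ilog and dlog helpers and many of the
+#  *_CMD command variables referenced throughout this script.)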
+# +source /usr/local/sbin/collect_utils +source_openrc_if_needed + +# +# parse input parameters +# +COLLECT_NAME="${1}" +DEBUG=${8} +INVENTORY=${9} +set_debug_mode ${DEBUG} + +# Calling parms +# +# 1 = collect name +# 2 = start date option +# 3 = start date +# 4 = "any" (ignored - no longer used ; kept to support upgrades/downgrades) +# 5 = end date option +# 6 = end date +# 7 = "any" (ignored - no longer used ; kept to support upgrades/downgrades) +# 8 = debug mode +# 9 = inventory +logger -t ${COLLECT_TAG} "${0} ${1} ${2} ${3} ${4} ${5} ${6} ${7} ${8} ${9}" + +# parse out the start data/time data if it is present +STARTDATE_RANGE=false +STARTDATE="any" +if [ "${2}" == "${STARTDATE_OPTION}" ] ; then + if [ "${3}" != "any" -a ${#3} -gt 7 ] ; then + STARTDATE_RANGE=true + STARTDATE="${3}" + fi +fi + +# parse out the end date/time if it is present +ENDDATE_RANGE=false +ENDDATE="any" +if [ "${5}" == "${ENDDATE_OPTION}" ] ; then + if [ "${6}" != "any" -a ${#6} -gt 7 ] ; then + ENDDATE_RANGE=true + ENDDATE="${6}" + fi +fi + +COLLECT_BASE_DIR="/scratch" +EXTRA="var/extra" +hostname="${HOSTNAME}" +COLLECT_NAME_DIR="${COLLECT_BASE_DIR}/${COLLECT_NAME}" +EXTRA_DIR="${COLLECT_NAME_DIR}/${EXTRA}" +TARBALL="${COLLECT_NAME_DIR}.tgz" +COLLECT_PATH="/etc/collect.d" +RUN_EXCLUDE="/etc/collect/run.exclude" +ETC_EXCLUDE="/etc/collect/etc.exclude" +VAR_LOG_EXCLUDE="/etc/collect/varlog.exclude" +COLLECT_INCLUDE="/var/run /etc /root" +FLIGHT_RECORDER_PATH="var/lib/sm/" +FLIGHT_RECORDER_FILE="sm.eru.v1" +VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst" +COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}" +COLLECT_DATE="/usr/local/sbin/collect_date" +COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv" + +function log_space() +{ + local msg=${1} + + space="`${COLLECT_DIR_USAGE_CMD}`" + space1=`echo "${space}" | grep -v Filesystem` + ilog "${COLLECT_BASE_DIR} ${msg} ${space1}" +} + +space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR} + +CURR_DIR=`pwd` +mkdir -p ${COLLECT_NAME_DIR} +cd ${COLLECT_NAME_DIR} + +# create dump target extra-stuff directory +mkdir -p ${EXTRA_DIR} + +RETVAL=0 + +# Remove any previous collect error log. +# Start this collect with an empty file. +# +# stderr is directed to this log during the collect process. +# By searching this log after collect_host is run we can find +# errors that occured during collect. +# The only real error that we care about right now is the +# +# "No space left on device" error +# +rm -f ${COLLECT_ERROR_LOG} +touch ${COLLECT_ERROR_LOG} +chmod 644 ${COLLECT_ERROR_LOG} +echo "`date '+%F %T'` :${COLLECT_NAME_DIR}" > ${COLLECT_ERROR_LOG} + +ilog "creating local collect tarball ${COLLECT_NAME_DIR}.tgz" + +################################################################################ +# Run collect scripts to check system status +################################################################################ +function collect_parts() +{ + if [ -d ${COLLECT_PATH} ]; then + for i in ${COLLECT_PATH}/*; do + if [ -f $i ]; then + if [ ${i} = ${COLLECT_SYSINV} ]; then + $i ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname} ${INVENTORY} + else + $i ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname} + fi + fi + done + fi +} + + +function collect_extra() +{ + # dump process lists + LOGFILE="${EXTRA_DIR}/process.info" + echo "${hostname}: Process Info ......: ${LOGFILE}" + + delimiter ${LOGFILE} "ps -e -H -o ..." 
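+    # PROCESS_DETAIL_CMD is one of the imported command variables ;
+    # per the delimiter above it is a detailed "ps -e -H -o ..." listing.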
+ ${PROCESS_DETAIL_CMD} >> ${LOGFILE} + + # Collect process and thread info (tree view) + delimiter ${LOGFILE} "pstree --arguments --ascii --long --show-pids" + pstree --arguments --ascii --long --show-pids >> ${LOGFILE} + + # Collect process, thread and scheduling info (worker subfunction only) + # (also gets process 'affinity' which is useful on workers; + which ps-sched.sh >/dev/null 2>&1 + if [ $? -eq 0 ]; then + delimiter ${LOGFILE} "ps-sched.sh" + ps-sched.sh >> ${LOGFILE} + fi + + # Collect process, thread and scheduling, and elapsed time + # This has everything that ps-sched.sh does, except for cpu affinity mask, + # adds: stime,etime,time,wchan,tty). + delimiter ${LOGFILE} "ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command" + ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command >> ${LOGFILE} + + # Collect per kubernetes container name, QoS, and cpusets per numa node + delimiter ${LOGFILE} "kube-cpusets" + kube-cpusets >> ${LOGFILE} + + # Various host attributes + LOGFILE="${EXTRA_DIR}/host.info" + echo "${hostname}: Host Info .........: ${LOGFILE}" + + # CGCS build info + delimiter ${LOGFILE} "${BUILD_INFO_CMD}" + ${BUILD_INFO_CMD} >> ${LOGFILE} + + delimiter ${LOGFILE} "uptime" + uptime >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/cmdline" + cat /proc/cmdline >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/version" + cat /proc/version >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "lscpu" + lscpu >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "lscpu -e" + lscpu -e >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/cpuinfo" + cat /proc/cpuinfo >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /sys/devices/system/cpu/isolated" + cat /sys/devices/system/cpu/isolated >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "ip addr show" + ip addr show >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "lspci -nn" + lspci -nn >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "find /sys/kernel/iommu_groups/ -type l" + find /sys/kernel/iommu_groups/ -type l >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # networking totals + delimiter ${LOGFILE} "cat /proc/net/dev" + cat /proc/net/dev >> ${LOGFILE} + + delimiter ${LOGFILE} "dmidecode" + dmidecode >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # summary of scheduler tunable settings + delimiter ${LOGFILE} "cat /proc/sched_debug | head -15" + cat /proc/sched_debug | head -15 >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + if [ "${SKIP_MASK}" = "true" ]; then + delimiter ${LOGFILE} "facter (excluding ssh info)" + facter | grep -iv '^ssh' >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + else + delimiter ${LOGFILE} "facter" + facter >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + fi + + if [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then + delimiter ${LOGFILE} "topology" + topology >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + fi + + LOGFILE="${EXTRA_DIR}/memory.info" + echo "${hostname}: Memory Info .......: ${LOGFILE}" + + delimiter ${LOGFILE} "cat /proc/meminfo" + cat /proc/meminfo >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /sys/devices/system/node/node?/meminfo" + cat /sys/devices/system/node/node?/meminfo >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/slabinfo" + log_slabinfo ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "ps -e -o 
ppid,pid,nlwp,rss:10,vsz:10,cmd --sort=-rss" + ps -e -o ppid,pid,nlwp,rss:10,vsz:10,cmd --sort=-rss >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # list open files + delimiter ${LOGFILE} "lsof -lwX" + lsof -lwX >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # hugepages numa mapping + delimiter ${LOGFILE} "grep huge /proc/*/numa_maps" + grep -e " huge " /proc/*/numa_maps >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # rootfs and tmpfs usage + delimiter ${LOGFILE} "df -h -H -T --local -t rootfs -t tmpfs" + df -h -H -T --local -t rootfs -t tmpfs >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + LOGFILE="${EXTRA_DIR}/filesystem.info" + echo "${hostname}: Filesystem Info ...: ${LOGFILE}" + + # disk inodes usage + delimiter ${LOGFILE} "df -h -H -T --local -t rootfs -t tmpfs" + df -h -H -T --local -t rootfs -t tmpfs >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # disk space usage + delimiter ${LOGFILE} "df -h -H -T --local -t ext2 -t ext3 -t ext4 -t xfs --total" + df -h -H -T --local -t ext2 -t ext3 -t ext4 -t xfs --total >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # disk inodes usage + delimiter ${LOGFILE} "df -h -H -T --local -i -t ext2 -t ext3 -t ext4 -t xfs --total" + df -h -H -T --local -i -t ext2 -t ext3 -t ext4 -t xfs --total >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # disks by-path values + delimiter ${LOGFILE} "ls -lR /dev/disk" + ls -lR /dev/disk >> ${LOGFILE} + + # disk summary (requires sudo/root) + delimiter ${LOGFILE} "fdisk -l" + fdisk -l >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/scsi/scsi" + cat /proc/scsi/scsi >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Controller specific stuff + if [ "$nodetype" = "controller" ] ; then + + delimiter ${LOGFILE} "cat /proc/drbd" + cat /proc/drbd >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "/sbin/drbdadm dump" + /sbin/drbdadm dump >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + fi + + # LVM summary + delimiter ${LOGFILE} "/usr/sbin/vgs --version ; /usr/sbin/pvs --version ; /usr/sbin/lvs --version" + /usr/sbin/vgs --version >> ${LOGFILE} + /usr/sbin/pvs --version >> ${LOGFILE} + /usr/sbin/lvs --version >> ${LOGFILE} + + delimiter ${LOGFILE} "/usr/sbin/vgs --all --options all" + /usr/sbin/vgs --all --options all >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "/usr/sbin/pvs --all --options all" + /usr/sbin/pvs --all --options all >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "/usr/sbin/lvs --all --options all" + /usr/sbin/lvs --all --options all >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # iSCSI Information + LOGFILE="${EXTRA_DIR}/iscsi.info" + echo "${hostname}: iSCSI Information ......: ${LOGFILE}" + + if [ "$nodetype" = "controller" ] ; then + # Controller- LIO exported initiators summary + delimiter ${LOGFILE} "targetcli ls" + targetcli ls >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Controller - LIO sessions + delimiter ${LOGFILE} "targetcli sessions detail" + targetcli sessions detail >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + elif [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then + # Worker - iSCSI initiator information + collect_dir=${EXTRA_DIR}/iscsi_initiator_info + mkdir -p ${collect_dir} + cp -rf /run/iscsi-cache/nodes/* ${collect_dir} + find ${collect_dir} -type d -exec chmod 750 {} \; + + # Worker - iSCSI initiator active sessions + delimiter ${LOGFILE} "iscsiadm -m session" + iscsiadm -m session >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Worker - iSCSI udev created nodes + delimiter ${LOGFILE} "ls -la /dev/disk/by-path | grep \"iqn\"" + ls -la /dev/disk/by-path | grep 
"iqn" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + fi + + LOGFILE="${EXTRA_DIR}/history.info" + echo "${hostname}: Bash History ......: ${LOGFILE}" + + # history + delimiter ${LOGFILE} "cat /home/sysadmin/.bash_history" + cat /home/sysadmin/.bash_history >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + LOGFILE="${EXTRA_DIR}/interrupt.info" + echo "${hostname}: Interrupt Info ....: ${LOGFILE}" + + # interrupts + delimiter ${LOGFILE} "cat /proc/interrupts" + cat /proc/interrupts >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /proc/softirqs" + cat /proc/softirqs >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Controller specific stuff + if [ "$nodetype" = "controller" ] ; then + netstat -pan > ${EXTRA_DIR}/netstat.info + fi + + LOGFILE="${EXTRA_DIR}/blockdev.info" + echo "${hostname}: Block Devices Info : ${LOGFILE}" + + # Collect block devices - show all sda and cinder devices, and size + delimiter ${LOGFILE} "lsblk" + lsblk >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Collect block device topology - show devices and which io-scheduler + delimiter ${LOGFILE} "lsblk --topology" + lsblk --topology >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + # Collect SCSI devices - show devices and cinder attaches, etc + delimiter ${LOGFILE} "lsblk --scsi" + lsblk --scsi >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +} + +log_space "before collect ......:" + +collect_extra +collect_parts + +# +# handle collect collect-after and collect-range and then +# in elif clause collect-before +# +VAR_LOG="/var/log" +if [ -e /www/var/log ]; then + VAR_LOG="$VAR_LOG /www/var/log" +fi + +rm -f ${VAR_LOG_INCLUDE_LIST} + +if [ "${STARTDATE_RANGE}" == true ] ; then + if [ "${ENDDATE_RANGE}" == false ] ; then + ilog "collecting $VAR_LOG files containing logs after ${STARTDATE}" + ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} "" + else + ilog "collecting $VAR_LOG files containing logs between ${STARTDATE} and ${ENDDATE}" + ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} "" + fi +elif [ "${ENDDATE_RANGE}" == true ] ; then + STARTDATE="20130101" + ilog "collecting $VAR_LOG files containing logs before ${ENDDATE}" + ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} "" +else + ilog "collecting all of $VAR_LOG" + find $VAR_LOG ! 
-empty > ${VAR_LOG_INCLUDE_LIST} +fi + +# Add VM console.log +for i in /var/lib/nova/instances/*/console.log; do + if [ -e "$i" ]; then + tmp=`dirname $i` + mkdir -p ${COLLECT_NAME_DIR}/$tmp + cp $i ${COLLECT_NAME_DIR}/$tmp + fi +done + +log_space "before first tar ....:" + +(cd ${COLLECT_NAME_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar -T ${VAR_LOG_INCLUDE_LIST} -X ${RUN_EXCLUDE} -X ${ETC_EXCLUDE} -X ${VAR_LOG_EXCLUDE} ${COLLECT_INCLUDE} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} ) + +log_space "after first tar .....:" + +(cd ${COLLECT_NAME_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${UNTAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} ) + +log_space "after first untar ...:" + +rm -f ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar + +log_space "after delete tar ....:" + +if [ "${SKIP_MASK}" != "true" ]; then + # Run password masking before final tar + dlog "running /usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR}" + /usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR} + log_space "after passwd masking :" +fi + +(cd ${COLLECT_BASE_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}.tgz ${COLLECT_NAME} 2>/dev/null 1>/dev/null ) + +log_space "after first tarball .:" + +mkdir -p ${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH} + +(cd /${FLIGHT_RECORDER_PATH} ; ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH}/${FLIGHT_RECORDER_FILE}.tgz ./${FLIGHT_RECORDER_FILE} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG}) + +# Pull in an updated user.log which contains the most recent collect logs +# ... be sure to exclude any out of space logs +tail -30 /var/log/user.log | grep "COLLECT:" | grep -v "${FAIL_OUT_OF_SPACE_STR}" >> ${COLLECT_ERROR_LOG} +cp -a ${COLLECT_LOG} ${COLLECT_LOG}.last +cp -a ${COLLECT_ERROR_LOG} ${COLLECT_LOG} +cp -a ${COLLECT_LOG} ${COLLECT_NAME_DIR}/var/log + +log_space "with flight data ....:" + +(cd ${COLLECT_BASE_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}.tgz ${COLLECT_NAME} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} ) + +log_space "after collect .......:" + +rm -rf ${COLLECT_NAME_DIR} +rm -f ${VAR_LOG_INCLUDE_LIST} + +log_space "after cleanup .......:" + +# Check for collect errors +# Only out of space error is enough to fail this hosts's collect +collect_errors ${HOSTNAME} +RC=${?} + +rm -f ${COLLECT_ERROR_LOG} + +if [ ${RC} -ne 0 ] ; then + rm -f ${COLLECT_NAME_DIR}.tgz + ilog "${FAIL_OUT_OF_SPACE_STR} ${COLLECT_BASE_DIR}" +else + ilog "collect of ${COLLECT_NAME_DIR}.tgz succeeded" + echo "${collect_done}" +fi diff --git a/tools/collector/debian-scripts/collect_ima.sh b/tools/collector/debian-scripts/collect_ima.sh new file mode 100755 index 00000000..14c751e4 --- /dev/null +++ b/tools/collector/debian-scripts/collect_ima.sh @@ -0,0 +1,59 @@ +#! /bin/bash +# +# Copyright (c) 2017 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +function is_extended_profile { + if [ ! -n "${security_profile}" ] || [ "${security_profile}" != "extended" ]; then + return 0 + else + return 1 + fi +} + +SERVICE="ima" +LOGFILE="${extradir}/${SERVICE}.info" + +############################################################################### +# All Node Types +############################################################################### + +is_extended_profile +if [ "$?" 
= "0" ] ; then + exit 0 +fi + +echo "${hostname}: IMA Info ..........: ${LOGFILE}" + +delimiter ${LOGFILE} "IMA Kernel Modules" +lsmod | grep ima >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +delimiter ${LOGFILE} "Auditd status" +service auditd status >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +ps -aux | grep audit >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +mkdir -p ${extradir}/integrity 2>>${COLLECT_ERROR_LOG} + +delimiter ${LOGFILE} "IMA Runtime Measurement and Violations cache" +if [ -d "/sys/kernel/security/ima" ]; then + ls /sys/kernel/security/ima >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + cp -rf /sys/kernel/security/ima ${extradir}/integrity 2>>${COLLECT_ERROR_LOG} +else + echo "ERROR: IMA Securityfs directory does not exist!" >> ${LOGFILE} +fi + +cp -rf /etc/modprobe.d/ima.conf ${extradir}/integrity 2>>${COLLECT_ERROR_LOG} +cp -rf /etc/modprobe.d/integrity.conf ${extradir}/integrity 2>>${COLLECT_ERROR_LOG} +cp -rf /etc/ima.policy ${extradir}/integrity 2>>${COLLECT_ERROR_LOG} + +# make sure all these collected files are world readible +chmod -R 755 ${extradir}/integrity + +exit 0 diff --git a/tools/collector/debian-scripts/collect_interfaces.sh b/tools/collector/debian-scripts/collect_interfaces.sh new file mode 100644 index 00000000..574d4894 --- /dev/null +++ b/tools/collector/debian-scripts/collect_interfaces.sh @@ -0,0 +1,34 @@ +#! /bin/bash +# +# Copyright (c) 2020 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="interface" +LOGFILE="${extradir}/${SERVICE}.info" + +############################################################################### +# Interface Info +############################################################################### + +echo "${hostname}: Interface Info .: ${LOGFILE}" + +delimiter ${LOGFILE} "ip link" +ip link >> ${LOGFILE} + +for i in $(ls /sys/class/net/); do + delimiter ${LOGFILE} "ethtool -i ${i}" + ethtool -i ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "ethtool -S ${i} | grep -v ': 0'" + ethtool -S ${i} | grep -v ": 0" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +exit 0 diff --git a/tools/collector/debian-scripts/collect_mariadb.sh b/tools/collector/debian-scripts/collect_mariadb.sh new file mode 100755 index 00000000..0ed902d7 --- /dev/null +++ b/tools/collector/debian-scripts/collect_mariadb.sh @@ -0,0 +1,61 @@ +#! /bin/bash +# +# Copyright (c) 2020 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Gather containerized MariaDB information from active controller. + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="mariadb" +DB_DIR="${extradir}/${SERVICE}" +LOGFILE="${extradir}/${SERVICE}.info" +echo "${hostname}: MariaDB Info .....: ${LOGFILE}" + +function is_service_active { + active=$(sm-query service postgres | grep "enabled-active") + if [ -z "${active}" ] ; then + return 0 + else + return 1 + fi +} + +if [ "${nodetype}" = "controller" ] ; then + is_service_active + if [ "$?" 
= "0" ] ; then + exit 0 + fi + + # MariaDB databases + delimiter ${LOGFILE} "MariaDB databases:" + mariadb-cli --command 'show databases' >> ${LOGFILE} + + # MariaDB database sizes + delimiter ${LOGFILE} "MariaDB database sizes:" + mariadb-cli --command ' +SELECT table_schema AS "database", + ROUND(SUM(DATA_LENGTH + INDEX_LENGTH)/1024/1024, 3) AS "Size (MiB)", + SUM(TABLE_ROWS) AS "rowCount" +FROM information_schema.TABLES +GROUP BY table_schema' >> ${LOGFILE} + + delimiter ${LOGFILE} "MariaDB database table sizes:" + mariadb-cli --command ' +SELECT + table_schema AS "database", TABLE_NAME AS "table", + ROUND((DATA_LENGTH + INDEX_LENGTH)/1024/1024, 6) AS "Size (MiB)", + TABLE_ROWS AS "rowCount" +FROM information_schema.TABLES +ORDER BY table_schema, TABLE_NAME' >> ${LOGFILE} + + # MariaDB dump all databases + delimiter ${LOGFILE} "Dumping MariaDB databases: ${DB_DIR}" + mkdir -p ${DB_DIR} + (cd ${DB_DIR}; mariadb-cli --dump --exclude keystone,ceilometer) +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_mask_passwords b/tools/collector/debian-scripts/collect_mask_passwords new file mode 100644 index 00000000..a168564d --- /dev/null +++ b/tools/collector/debian-scripts/collect_mask_passwords @@ -0,0 +1,138 @@ +#! /bin/bash +# +# Copyright (c) 2017 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +COLLECT_NAME_DIR=$1 +EXTRA_DIR=$2 + +# Strip the passwords from assorted config files +for conffile in \ + ${COLLECT_NAME_DIR}/etc/aodh/aodh.conf \ + ${COLLECT_NAME_DIR}/etc/barbican/barbican.conf \ + ${COLLECT_NAME_DIR}/etc/ceilometer/ceilometer.conf \ + ${COLLECT_NAME_DIR}/etc/cinder/cinder.conf \ + ${COLLECT_NAME_DIR}/etc/fm/fm.conf \ + ${COLLECT_NAME_DIR}/etc/glance/glance-api.conf \ + ${COLLECT_NAME_DIR}/etc/glance/glance-registry.conf \ + ${COLLECT_NAME_DIR}/etc/heat/heat.conf \ + ${COLLECT_NAME_DIR}/etc/ironic/ironic.conf \ + ${COLLECT_NAME_DIR}/etc/keystone/keystone.conf \ + ${COLLECT_NAME_DIR}/etc/magnum/magnum.conf \ + ${COLLECT_NAME_DIR}/etc/murano/murano.conf \ + ${COLLECT_NAME_DIR}/etc/neutron/metadata_agent.ini \ + ${COLLECT_NAME_DIR}/etc/neutron/neutron.conf \ + ${COLLECT_NAME_DIR}/etc/nfv/nfv_plugins/nfvi_plugins/config.ini \ + ${COLLECT_NAME_DIR}/etc/nova/nova.conf \ + ${COLLECT_NAME_DIR}/etc/nslcd.conf \ + ${COLLECT_NAME_DIR}/etc/openldap/slapd.conf.backup \ + ${COLLECT_NAME_DIR}/etc/openstack-dashboard/local_settings \ + ${COLLECT_NAME_DIR}/etc/panko/panko.conf \ + ${COLLECT_NAME_DIR}/etc/patching/patching.conf \ + ${COLLECT_NAME_DIR}/etc/proxy/nova-api-proxy.conf \ + ${COLLECT_NAME_DIR}/etc/rabbitmq/murano-rabbitmq.config \ + ${COLLECT_NAME_DIR}/etc/rabbitmq/rabbitmq.config \ + ${COLLECT_NAME_DIR}/etc/sysinv/api-paste.ini \ + ${COLLECT_NAME_DIR}/etc/sysinv/sysinv.conf \ + ${COLLECT_NAME_DIR}/var/extra/platform/sysinv/*/sysinv.conf.default \ + ${COLLECT_NAME_DIR}/etc/mtc.ini + +do + if [ ! 
-f $conffile ]; then + continue + fi + + sed -i -r 's/^(admin_password) *=.*/\1 = xxxxxx/; + s/^(auth_encryption_key) *=.*/\1 = xxxxxx/; + s/^(bindpw) .*/\1 xxxxxx/; + s/^(rootpw) .*/\1 xxxxxx/; + s/^(connection) *=.*/\1 = xxxxxx/; + s/^( *credentials) *=.*/\1 = xxxxxx/; + s/^(metadata_proxy_shared_secret) *=.*/\1 = xxxxxx/; + s/^(password) *=.*/\1 = xxxxxx/; + s/^(rabbit_password) *=.*/\1 = xxxxxx/; + s/^(sql_connection) *=.*/\1 = xxxxxx/; + s/^(stack_domain_admin_password) *=.*/\1 = xxxxxx/; + s/^(transport_url) *=.*/\1 = xxxxxx/; + s/^(SECRET_KEY) *=.*/\1 = xxxxxx/; + s/^(keystone_auth_pw) *=.*/\1 = xxxxxx/; + s/\{default_pass, <<\".*\">>\}/\{default_pass, <<\"xxxxxx\">>\}/' $conffile +done + +find ${COLLECT_NAME_DIR} -name server-cert.pem | xargs --no-run-if-empty rm -f +rm -rf ${COLLECT_NAME_DIR}/var/extra/platform/config/*/ssh_config +rm -f ${COLLECT_NAME_DIR}/var/extra/platform/puppet/*/hieradata/secure*.yaml +rm -f ${COLLECT_NAME_DIR}/etc/puppet/cache/hieradata/secure*.yaml + +# dir /etc/kubernetes/pki was etc.excluded +if [ -d "/etc/kubernetes/pki" ] ; then + # grab the public certificates if /etc/kubernetes/pki exists + mkdir -p ${COLLECT_NAME_DIR}/etc/kubernetes/pki + cp -a /etc/kubernetes/pki/*.crt ${COLLECT_NAME_DIR}/etc/kubernetes/pki 2>/dev/null 1>/dev/null +fi + +# Mask user passwords in sysinv db dump +if [ -f ${COLLECT_NAME_DIR}/var/extra/database/sysinv.db.sql.txt ]; then + sed -i -r '/COPY i_user/, /^--/ s/^(([^\t]*\t){10})[^\t]*(\t.*)/\1xxxxxx\3/; + /COPY i_community/, /^--/ s/^(([^\t]*\t){5})[^\t]*(\t.*)/\1xxxxxx\3/; + /COPY i_trap_destination/, /^--/ s/^(([^\t]*\t){6})[^\t]*(\t.*)/\1xxxxxx\3/; + s/(identity\t[^\t]*\tpassword\t)[^\t]*/\1xxxxxx/' \ + ${COLLECT_NAME_DIR}/var/extra/database/sysinv.db.sql.txt +fi + +# Mask passwords in host profiles +grep -rl '\"name\": \"password\"' ${COLLECT_NAME_DIR}/var/extra/platform/sysinv/ \ + | xargs --no-run-if-empty perl -i -e ' + $prev=""; + while (<>) + { + if (/\"name\": \"password\"/) + { + $prev =~ s/\"value\": \".*\"/\"value\": \"xxxxxx\"/; + } + print $prev; + $prev=$_; + } + print $prev;' + +# Cleanup snmp +sed -i -r 's/(rocommunity[^ ]*).*/\1 xxxxxx/' ${COLLECT_NAME_DIR}/var/extra/platform/config/*/snmp/* +sed -i -r 's/(trap2sink *[^ ]*).*/\1 xxxxxx/' ${COLLECT_NAME_DIR}/var/extra/platform/config/*/snmp/* + +# Mask passwords in bash.log and history logs +USER_HISTORY_FILES=$(find ${COLLECT_NAME_DIR} -type f -name .bash_history 2>/dev/null) +sed -i -r 's/(snmp-comm-(delete|show)) *((\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*) *){1,}/\1 xxxxxx/; + s/(snmp.*) *(--community|-c) *(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 \2 xxxxxx/; + s/(-password)=(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1=xxxxxx/; + s/(-password) (\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 xxxxxx/g; + s/(password)'\'': (\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1'\':' xxxxxx/g; + s/(password):(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)'\''/\1:xxxxxx'\''/g; + s/(openstack.*) *(--password) *(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 \2 xxxxxx/; + s/(ldapmodifyuser.*userPassword *)(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 xxxxxx/' \ + ${USER_HISTORY_FILES} \ + ${COLLECT_NAME_DIR}/var/extra/history.info \ + ${COLLECT_NAME_DIR}/var/log/bash.log \ + ${COLLECT_NAME_DIR}/var/log/auth.log \ + ${COLLECT_NAME_DIR}/var/log/user.log \ + ${COLLECT_NAME_DIR}/var/log/ldapscripts.log + +for f in ${COLLECT_NAME_DIR}/var/log/bash.log.*.gz \ + ${COLLECT_NAME_DIR}/var/log/auth.log.*.gz \ + ${COLLECT_NAME_DIR}/var/log/user.log.*.gz \ + ${COLLECT_NAME_DIR}/var/log/ldapscripts.log.*.gz +do + zgrep -q 
'snmp|password' $f || continue + gunzip $f + unzipped=${f%%.gz} + sed -i -r 's/(snmp-comm-(delete|show)) *((\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*) *){1,}/\1 xxxxxx/; + s/(snmp.*) *(--community|-c) *(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 \2 xxxxxx/; + s/(-password)=(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1=xxxxxx/; + s/(-password) (\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 xxxxxx/g; + s/(password)'\'': (\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1'\':' xxxxxx/g; + s/(password):(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)'\''/\1:xxxxxx'\''/g; + s/(openstack.*) *(--password) *(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 \2 xxxxxx/; + s/(ldapmodifyuser.*userPassword *)(\"[^\"]*\"|'\''[^'"'"']*'"'"'|[^ ]*)/\1 xxxxxx/' $unzipped + gzip $unzipped +done diff --git a/tools/collector/debian-scripts/collect_networking.sh b/tools/collector/debian-scripts/collect_networking.sh new file mode 100755 index 00000000..98f4136e --- /dev/null +++ b/tools/collector/debian-scripts/collect_networking.sh @@ -0,0 +1,58 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="networking" +LOGFILE="${extradir}/${SERVICE}.info" +echo "${hostname}: Networking Info ...: ${LOGFILE}" + +############################################################################### +# All nodes +############################################################################### +declare -a CMDS=("ip -s link" +"ip -4 -s addr" +"ip -6 -s addr" +"ip -4 -s neigh" +"ip -6 -s neigh" +"ip -4 rule" +"ip -6 rule" +"ip -4 route" +"ip -6 route" +) + +for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE} "${CMD}" + ${CMD} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +CMD="iptables-save" +delimiter ${LOGFILE} "${CMD}" +${CMD} > ${extradir}/iptables.dump 2>>${COLLECT_ERROR_LOG} + +CMD="ip6tables-save" +delimiter ${LOGFILE} "${CMD}" +${CMD} > ${extradir}/ip6tables.dump 2>>${COLLECT_ERROR_LOG} + +############################################################################### +# Only Worker +############################################################################### +if [[ "$nodetype" = "worker" || "$subfunction" == *"worker"* ]] ; then + NAMESPACES=($(ip netns)) + for NS in ${NAMESPACES[@]}; do + delimiter ${LOGFILE} "${NS}" + for CMD in "${CMDS[@]}" ; do + ip netns exec ${NS} ${CMD} + done + done >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_nfv_vim.sh b/tools/collector/debian-scripts/collect_nfv_vim.sh new file mode 100644 index 00000000..c5ccbc7f --- /dev/null +++ b/tools/collector/debian-scripts/collect_nfv_vim.sh @@ -0,0 +1,44 @@ +#! /bin/bash +# +# Copyright (c) 2013-2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +LOGFILE="${extradir}/nfv-vim.info" +echo "${hostname}: NFV-Vim Info ......: ${LOGFILE}" + +function is_service_active { + active=`sm-query service vim | grep "enabled-active"` + if [ -z "$active" ] ; then + return 0 + else + return 1 + fi +} + +############################################################################### +# Only Controller +############################################################################### + +if [ "$nodetype" = "controller" ] ; then + is_service_active + if [ "$?" 
= "0" ] ; then + exit 0 + fi + + # Assumes that database_dir is unique in /etc/nfv/vim/config.ini + DATABASE_DIR=$(awk -F "=" '/database_dir/ {print $2}' /etc/nfv/vim/config.ini) + + SQLITE_DUMP="/usr/bin/sqlite3 ${DATABASE_DIR}/vim_db_v1 .dump" + + delimiter ${LOGFILE} "dump database" + timeout 30 ${SQLITE_DUMP} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +fi + +exit 0 + diff --git a/tools/collector/debian-scripts/collect_openstack.sh b/tools/collector/debian-scripts/collect_openstack.sh new file mode 100755 index 00000000..62fce1e0 --- /dev/null +++ b/tools/collector/debian-scripts/collect_openstack.sh @@ -0,0 +1,154 @@ +#! /bin/bash +# +# Copyright (c) 2013-2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +# Environment for kubectl +export KUBECONFIG=/etc/kubernetes/admin.conf + +SERVICE="openstack" +LOGFILE="${extradir}/${SERVICE}.info" +echo "${hostname}: Openstack Info ....: ${LOGFILE}" + +function is_service_active { + active=$(sm-query service rabbit-fs | grep "enabled-active") + if [ -z "${active}" ] ; then + return 0 + else + return 1 + fi +} + +function is_openstack_node { + local PASS=0 + local FAIL=1 + # NOTE: hostname changes during first configuration + local this_node=$(cat /proc/sys/kernel/hostname) + + labels=$(kubectl get node ${this_node} \ + --no-headers --show-labels 2>/dev/null | awk '{print $NF}') + if [[ $labels =~ openstack-control-plane=enabled ]]; then + return ${PASS} + else + return ${FAIL} + fi +} + +function openstack_credentials { + # Setup openstack admin tenant credentials using environment variables + unset OS_SERVICE_TOKEN + export OS_ENDPOINT_TYPE=internalURL + export CINDER_ENDPOINT_TYPE=internalURL + export OS_USERNAME=admin + export OS_PASSWORD=$(TERM=linux /opt/platform/.keyring/*/.CREDENTIAL 2>/dev/null) + export OS_AUTH_TYPE=password + export OS_AUTH_URL=http://keystone.openstack.svc.cluster.local/v3 + export OS_PROJECT_NAME=admin + export OS_USER_DOMAIN_NAME=Default + export OS_PROJECT_DOMAIN_NAME=Default + export OS_IDENTITY_API_VERSION=3 + export OS_REGION_NAME=RegionOne + export OS_INTERFACE=internal +} + +function openstack_commands { + declare -a CMDS=() + CMDS+=("openstack project list --long") + CMDS+=("openstack user list --long") + CMDS+=("openstack service list --long") + CMDS+=("openstack router list --long") + CMDS+=("openstack network list --long") + CMDS+=("openstack subnet list --long") + CMDS+=("openstack image list --long") + CMDS+=("openstack volume list --all-projects --long") + CMDS+=("openstack availability zone list --long") + CMDS+=("openstack server group list --all-projects --long") + CMDS+=('openstack server list --all-projects --long -c ID -c Name -c Status -c "Task State" -c "Power State" -c Networks -c "Image Name" -c "Image ID" -c "Flavor Name" -c "Flavor ID" -c "Availability Zone" -c Host -c Properties') + CMDS+=("openstack stack list --long --all-projects") + CMDS+=("openstack security group list --all-projects") + CMDS+=("openstack security group rule list --all-projects --long") + CMDS+=("openstack keypair list") + CMDS+=("openstack configuration show") + CMDS+=("openstack quota list --compute") + CMDS+=("openstack quota list --volume") + CMDS+=("openstack quota list --network") + CMDS+=("openstack host list") + CMDS+=("openstack hypervisor list --long") + CMDS+=("openstack hypervisor stats show") + HOSTS=( $(openstack hypervisor list -f value -c "Hypervisor Hostname" 
2>/dev/null) ) + for host in "${HOSTS[@]}" ; do + CMDS+=("openstack hypervisor show -f yaml ${host}") + done + + # nova commands + CMDS+=("nova service-list") + + for CMD in "${CMDS[@]}" ; do + delimiter ${LOGFILE} "${CMD}" + eval ${CMD} 2>>${COLLECT_ERROR_LOG} >>${LOGFILE} + echo >>${LOGFILE} + done +} + +function rabbitmq_usage_stats { + # RabbitMQ usage stats + MQ_STATUS="rabbitmqctl status" + delimiter ${LOGFILE} "${MQ_STATUS} | grep -e '{memory' -A30" + ${MQ_STATUS} 2>/dev/null | grep -e '{memory' -A30 >> ${LOGFILE} + echo >>${LOGFILE} + + delimiter ${LOGFILE} "RabbitMQ Queue Info" + num_queues=$(rabbitmqctl list_queues | wc -l); ((num_queues-=2)) + num_bindings=$(rabbitmqctl list_bindings | wc -l); ((num_bindings-=2)) + num_exchanges=$(rabbitmqctl list_exchanges | wc -l); ((num_exchanges-=2)) + num_connections=$(rabbitmqctl list_connections | wc -l); ((num_connections-=2)) + num_channels=$(rabbitmqctl list_channels | wc -l); ((num_channels-=2)) + arr=($(rabbitmqctl list_queues messages consumers memory | \ + awk '/^[0-9]/ {a+=$1; b+=$2; c+=$3} END {print a, b, c}')) + messages=${arr[0]}; consumers=${arr[1]}; memory=${arr[2]} + printf "%6s %8s %9s %11s %8s %8s %9s %10s\n" "queues" "bindings" "exchanges" "connections" "channels" "messages" "consumers" "memory" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + printf "%6d %8d %9d %11d %8d %8d %9d %10d\n" $num_queues $num_bindings $num_exchanges $num_connections $num_channels $messages $consumers $memory >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +} + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + is_service_active + if [ "$?" = "0" ] ; then + exit 0 + fi + + # host rabbitmq usage + rabbitmq_usage_stats + + # Check for openstack label on this node + if ! is_openstack_node; then + exit 0 + fi + + # Run as subshell so we don't contaminate environment + (openstack_credentials; openstack_commands) + + # TODO(jgauld): Should also get containerized rabbitmq usage, + # need wrapper script rabbitmq-cli +fi + +############################################################################### +# collect does not retrieve /etc/keystone dir +# Additional logic included to copy /etc/keystone directory +############################################################################### + +mkdir -p ${extradir}/../../etc/ +cp -R /etc/keystone/ ${extradir}/../../etc +chmod -R 755 ${extradir}/../../etc/keystone + +exit 0 diff --git a/tools/collector/debian-scripts/collect_ovs.sh b/tools/collector/debian-scripts/collect_ovs.sh new file mode 100644 index 00000000..94e98e69 --- /dev/null +++ b/tools/collector/debian-scripts/collect_ovs.sh @@ -0,0 +1,35 @@ +#! /bin/bash +######################################################################## +# +# Copyright (c) 2018 Wind River Systems, Inc. 
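+# +# Gathers Open vSwitch state (ovsdb-client dump, ovs-vsctl show) from +# worker nodes when the configured vswitch type includes "ovs".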
+# +# SPDX-License-Identifier: Apache-2.0 +# +######################################################################## + +# Loads Up Utilities and Commands Variables + +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="ovs" +LOGFILE="${extradir}/${SERVICE}.info" + + +############################################################################### +# Only Worker Nodes +############################################################################### +if [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then + + if [[ "$vswitch_type" == *ovs* ]]; then + echo "${hostname}: OVS Info ..........: ${LOGFILE}" + + delimiter ${LOGFILE} "ovsdb-client dump" + ovsdb-client dump >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "ovs-vsctl show" + ovs-vsctl --timeout 10 show >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + fi +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_parms b/tools/collector/debian-scripts/collect_parms new file mode 100644 index 00000000..66001504 --- /dev/null +++ b/tools/collector/debian-scripts/collect_parms @@ -0,0 +1,29 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +#echo "defaults: $1-$2-$3-$4" + +if [ -z ${1} ] ; then + basedir=/scratch +else + basedir=$1 +fi + +if [ -z ${2} ] ; then + extradir=$basedir/var/extra +else + extradir=$2 +fi + +if [ -z ${3} ] ; then + hostname=$HOSTNAME +else + hostname=$3 +fi + +mkdir -p ${extradir} diff --git a/tools/collector/debian-scripts/collect_patching.sh b/tools/collector/debian-scripts/collect_patching.sh new file mode 100755 index 00000000..32ac8ba6 --- /dev/null +++ b/tools/collector/debian-scripts/collect_patching.sh @@ -0,0 +1,46 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="patching" +LOGFILE="${extradir}/${SERVICE}.info" +echo "${hostname}: Patching Info .....: ${LOGFILE}" + +############################################################################### +# All nodes +############################################################################### +# FIXME: Debian doesn't support the smart channel +#delimiter ${LOGFILE} "smart channel --show" +#smart channel --show 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + delimiter ${LOGFILE} "sw-patch query" + sw-patch query 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "sw-patch query-hosts" + sw-patch query-hosts 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "sw-patch query-hosts --debug" + sw-patch query-hosts --debug 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "find /opt/patching" + find /opt/patching 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "find /var/www/pages/updates" + find /var/www/pages/updates 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_psqldb.sh b/tools/collector/debian-scripts/collect_psqldb.sh new file mode 100755 index 00000000..d223b1b7 --- /dev/null +++ b/tools/collector/debian-scripts/collect_psqldb.sh @@ -0,0 +1,117 @@ +#!
/bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +# postgres database commands +PSQL_CMD="sudo -u postgres psql --pset pager=off -q" +PG_DUMP_CMD="sudo -u postgres pg_dump" + +SERVICE="database" +DB_DIR="${extradir}/database" +LOGFILE="${extradir}/database.info" +echo "${hostname}: Database Info .....: ${LOGFILE}" + +function is_service_active { + active=`sm-query service postgres | grep "enabled-active"` + if [ -z "$active" ] ; then + return 0 + else + return 1 + fi +} + +############################################################################### +# All node types +############################################################################### +mkdir -p ${DB_DIR} + +function log_database { + db_list=( $(${PSQL_CMD} -t -c "SELECT datname FROM pg_database WHERE datistemplate = false;") ) + for db in "${db_list[@]}"; do + echo "postgres database: ${db}" + ${PSQL_CMD} -d ${db} -c " + SELECT + table_schema, + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size, + live_tuples, + dead_tuples + FROM ( + SELECT + table_schema, + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size, + pg_stat_get_live_tuples(table_name::regclass) AS live_tuples, + pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples + FROM ( + SELECT + table_schema, + table_name + FROM information_schema.tables + WHERE table_schema='public' + AND table_type='BASE TABLE' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + " + done >> ${1} +} + + + +DB_EXT=db.sql.txt +function database_dump { + mkdir -p ${DB_DIR} + db_list=( $(${PSQL_CMD} -t -c "SELECT datname FROM pg_database WHERE datistemplate = false;") ) + for DB in "${db_list[@]}"; do + if [ "$DB" != "keystone" -a "$DB" != "ceilometer" ] ; then + echo "${hostname}: Dumping Database ..: ${DB_DIR}/$DB.$DB_EXT" + (cd ${DB_DIR} ; sudo -u postgres pg_dump $DB > $DB.$DB_EXT) + fi + done +} + +############################################################################### +# Only Controller +############################################################################### + +if [ "$nodetype" = "controller" ] ; then + is_service_active + if [ "$?" = "0" ] ; then + exit 0 + fi + + # postgres DB sizes + delimiter ${LOGFILE} "formatted ${PSQL_CMD} -c" + ${PSQL_CMD} -c " + SELECT + pg_database.datname, + pg_database_size(pg_database.datname), + pg_size_pretty(pg_database_size(pg_database.datname)) + FROM pg_database + ORDER BY pg_database_size DESC; + " >> ${LOGFILE} + + # Number of postgres connections + delimiter ${LOGFILE} "ps -C postgres -o cmd=" + ps -C postgres -o cmd= >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "call to log_database" + log_database ${LOGFILE} + + database_dump +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_sm.sh b/tools/collector/debian-scripts/collect_sm.sh new file mode 100644 index 00000000..5f0f3c9b --- /dev/null +++ b/tools/collector/debian-scripts/collect_sm.sh @@ -0,0 +1,26 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. 
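+# +# Signals the service manager (sm) daemon, via its pid file, to dump its +# internal state; see the SIGUSR1 below.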
+# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="sm" +LOGFILE="${extradir}/sm.info" +echo "${hostname}: Service Management : ${LOGFILE}" + +############################################################################### +# Only Controller +############################################################################### + +if [ "$nodetype" = "controller" ] ; then + kill -SIGUSR1 $(</var/run/sm.pid) 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} +fi + +exit 0 diff --git a/tools/collector/debian-scripts/collect_sysinv.sh b/tools/collector/debian-scripts/collect_sysinv.sh new file mode 100755 index 00000000..3c00ed0a --- /dev/null +++ b/tools/collector/debian-scripts/collect_sysinv.sh @@ -0,0 +1,118 @@ +#! /bin/bash +# +# Copyright (c) 2013-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="inventory" +LOGFILE="${extradir}/${SERVICE}.info" +RPMLOG="${extradir}/rpm.info" +INVENTORY=${4} + +function is_service_active { + active=`sm-query service management-ip | grep "enabled-active"` + if [ -z "$active" ] ; then + return 0 + else + return 1 + fi +} + +function collect_inventory { + is_service_active + if [ "$?" = "0" ] ; then + exit 0 + fi + echo "${hostname}: System Inventory ..: ${LOGFILE}" + + HOSTNAMES=$(system host-list --nowrap | grep '[0-9]' | cut -d '|' -f 3 | tr -d ' ') + if [[ -z ${HOSTNAMES} || ${HOSTNAMES} != *"controller"* ]]; then + echo "Failed to get system host-list" > $LOGFILE + exit 0 + fi + + # These go into the SERVICE.info file + delimiter ${LOGFILE} "system show" + system show 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-list" + system host-list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system datanetwork-list" + system datanetwork-list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system service-list" + system service-list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + # delimiter ${LOGFILE} "vm-topology" + # timeout 60 vm-topology --show all 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system network-list" + system network-list 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + for host in ${HOSTNAMES}; do + delimiter ${LOGFILE} "system host-show ${host}" + system host-show ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-port-list ${host}" + system host-port-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-if-list ${host}" + system host-if-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system interface-network-list ${host}" + system interface-network-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-ethernet-port-list ${host}" + system host-ethernet-port-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-cpu-list ${host}" + system host-cpu-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-memory-list ${host}" + system host-memory-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-label-list ${host}" + system host-label-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-disk-list ${host}" + system host-disk-list ${host} 2>>${COLLECT_ERROR_LOG} >>
${LOGFILE} + + delimiter ${LOGFILE} "system host-stor-list ${host}" + system host-stor-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-lvg-list ${host}" + system host-lvg-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + + delimiter ${LOGFILE} "system host-pv-list ${host}" + system host-pv-list ${host} 2>>${COLLECT_ERROR_LOG} >> ${LOGFILE} + done +} + +############################################################################### +# Only Controller +############################################################################### +if [ "$nodetype" = "controller" ] ; then + + echo "${hostname}: Software Config ...: ${RPMLOG}" + # These go into the rpm.info file + delimiter ${RPMLOG} "dpkg -l" + dpkg -l >> ${RPMLOG} + + if [ "${INVENTORY}" = true ] ; then + collect_inventory + fi + + # copy /opt/platform to extra dir while filtering out the + # iso and lost+found dirs + rsync -a --exclude 'iso' --exclude 'lost+found' /opt/platform ${extradir} +fi + + +exit 0 diff --git a/tools/collector/debian-scripts/collect_tc.sh b/tools/collector/debian-scripts/collect_tc.sh new file mode 100755 index 00000000..95cd0fef --- /dev/null +++ b/tools/collector/debian-scripts/collect_tc.sh @@ -0,0 +1,82 @@ +#! /bin/bash +# +# Copyright (c) 2013-2014 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +# Loads Up Utilities and Commands Variables +source /usr/local/sbin/collect_parms +source /usr/local/sbin/collect_utils + +SERVICE="tc" +LOGFILE="${extradir}/tc.info" +echo "${hostname}: Traffic Controls . : ${LOGFILE}" + +############################################################################### +# Interface Info +############################################################################### +delimiter ${LOGFILE} "cat /etc/network/interfaces" +if [ -f /etc/network/interfaces ]; then + cat /etc/network/interfaces >> ${LOGFILE} +else + echo "/etc/network/interfaces NOT FOUND" >> ${LOGFILE} +fi + +delimiter ${LOGFILE} "ip link" +ip link >> ${LOGFILE} + +for i in $(ip link | grep mtu | grep eth | awk '{print $2}' | sed 's#:##g'); do + + delimiter ${LOGFILE} "ethtool ${i}" + ethtool ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "cat /sys/class/net/${i}/speed" + cat /sys/class/net/${i}/speed >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "ethtool -S ${i}" + ethtool -S ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +############################################################################### +# TC Configuration Script (/usr/local/bin/tc_setup.sh) +############################################################################### +delimiter ${LOGFILE} "cat /usr/local/bin/tc_setup.sh" +if [ -f /usr/local/bin/tc_setup.sh ]; then + cat /usr/local/bin/tc_setup.sh >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +else + echo "/usr/local/bin/tc_setup.sh NOT FOUND" >> ${LOGFILE} +fi + +############################################################################### +# TC Configuration +############################################################################### +delimiter ${LOGFILE} "tc qdisc show" +tc qdisc show >> ${LOGFILE} + +for i in $(ip link | grep htb | awk '{print $2}' | sed 's#:##g'); do + + delimiter ${LOGFILE} "tc class show dev ${i}" + tc class show dev ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "tc filter show dev ${i}" + tc filter show dev ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +############################################################################### +# TC
Statistics +############################################################################### +delimiter ${LOGFILE} "tc -s qdisc show" +tc -s qdisc show >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + +for i in $(ip link | grep htb | awk '{print $2}' | sed 's#:##g'); do + + delimiter ${LOGFILE} "tc -s class show dev ${i}" + tc -s class show dev ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} + + delimiter ${LOGFILE} "tc -s filter show dev ${i}" + tc -s filter show dev ${i} >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +done + +exit 0 diff --git a/tools/collector/debian-scripts/collect_utils b/tools/collector/debian-scripts/collect_utils new file mode 100755 index 00000000..95f634e6 --- /dev/null +++ b/tools/collector/debian-scripts/collect_utils @@ -0,0 +1,318 @@ +#! /bin/bash +# +# Copyright (c) 2013-2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +########################################################################################## + +DEBUG=false + +# Fail Codes +PASS=0 +FAIL=1 +RETRY=2 + +FAIL_NODETYPE=3 + +FAIL_TIMEOUT=10 +FAIL_TIMEOUT1=11 +FAIL_TIMEOUT2=12 +FAIL_TIMEOUT3=13 +FAIL_TIMEOUT4=14 +FAIL_TIMEOUT5=15 +FAIL_TIMEOUT6=16 +FAIL_TIMEOUT7=17 +FAIL_TIMEOUT8=18 +FAIL_TIMEOUT9=19 + +FAIL_SUBCLOUD_TIMEOUT=20 + +FAIL_PASSWORD=30 +FAIL_PERMISSION=31 +FAIL_CLEANUP=32 +FAIL_UNREACHABLE=33 +FAIL_HOSTNAME=34 +FAIL_INACTIVE=35 +FAIL_PERMISSION_SKIP=36 +FAIL_OUT_OF_SPACE=37 +FAIL_INSUFFICIENT_SPACE=38 +FAIL_INTERNAL=39 +FAIL_NO_TARDIR=40 +FAIL_NO_TARBALLS=41 +FAIL_NO_FILE_SPECIFIED=42 +FAIL_FILE_NOT_FOUND=43 +FAIL_FILE_EMPTY=44 +FAIL_PASSWORD_PROMPT=45 +FAIL_MISSING_PARAMETER=46 +FAIL_DATE_FORMAT=47 +FAIL_NO_HOSTS=48 +FAIL_FILE_COPY=49 +FAIL_SUBCLOUD=50 +FAIL_CONTINUE=51 +FAIL_SUBCLOUDNAME=52 +FAIL_NO_SUBCLOUDS=53 +FAIL_NOT_SYSTEMCONTROLLER=54 + + +# Warnings are above 200 +WARN_WARNING=200 +WARN_HOSTNAME=201 +WARN_SUBCLOUD=202 + +COLLECT_ERROR="Error:" +COLLECT_DEBUG="Debug:" +COLLECT_WARN="Warning:" + +# Failure Strings +FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space" +FAIL_OUT_OF_SPACE_STR="No space left on device" +FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable" +FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device" +FAIL_UNREACHABLE_STR="Unreachable" + +FAIL_TIMEOUT_STR="operation timeout" +FAIL_SUBCLOUD_TIMEOUT_STR="subcloud collect timeout" + +FAIL_NO_FILE_SPECIFIED_STR="no file specified" +FAIL_FILE_NOT_FOUND_STR="no such file or directory" +FAIL_FILE_EMPTY_STR="file is empty" +FAIL_PASSWORD_PROMPT_STR="password for" + +FAIL_DATE_FORMAT_STR="date format" +FAIL_INACTIVE_STR="not active" +FAIL_NO_HOSTS_STR="empty host list" +FAIL_NO_SUBCLOUDS_STR="empty subcloud list" +FAIL_MISSING_PARAMETER_STR="missing parameter" +FAIL_FILE_COPY_STR="failed to copy" +FAIL_CONTINUE_STR="cannot continue" + +# Collect aborts when /scratch usage reaches this percent full; at least +# 25% of /scratch must be free for collect to proceed +MIN_PERCENT_SPACE_REQUIRED=75 + +# Subcloud collect stops when available scratch drops below this threshold. +# Use collect -sc --continue to tell collect to continue collecting subclouds +# from where it left off.
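+# +# Worked example: 2 GiB = 2 * 1024^3 = 2147483648 bytes; in 1000-byte (1K) +# blocks that is 2147483.648, which rounds up to the 2147484 declared below.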
+# 2Gib in K blocks rounded up +declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2Gib in K blocks rounded up + +# Log file path/names +COLLECT_LOG=/var/log/collect.log +COLLECT_ERROR_LOG=/tmp/collect_error.log +HOST_COLLECT_ERROR_LOG="/tmp/host_collect_error.log" + +DCROLE_SYSTEMCONTROLLER="systemcontroller" +DCROLE_SUBCLOUD="subcloud" + +function source_openrc_if_needed +{ + # get the node and subfunction types + nodetype="" + subfunction="" + PLATFORM_CONF=/etc/platform/platform.conf + if [ -e ${PLATFORM_CONF} ] ; then + source ${PLATFORM_CONF} + fi + + if [ "${nodetype}" != "controller" -a "${nodetype}" != "worker" -a "${nodetype}" != "storage" ] ; then + logger -t ${COLLECT_TAG} "could not identify nodetype ($nodetype)" + exit $FAIL_NODETYPE + fi + + ACTIVE=false + if [ "$nodetype" == "controller" ] ; then + # get local host activity state + OPENRC="/etc/platform/openrc" + if [ -e "${OPENRC}" ] ; then + OS_PASSWORD="" + source ${OPENRC} 2>/dev/null 1>/dev/null + if [ "${OS_PASSWORD}" != "" ] ; then + ACTIVE=true + fi + fi + fi +} + + +# Setup an expect command completion file. +# This is used to force serialization of expect +# sequences and highlight command completion +collect_done="collect done" +cmd_done_sig="expect done" +cmd_done_file="/usr/local/sbin/expect_done" + +# Compression Commands +TAR_ZIP_CMD="tar -cvzf" +TAR_UZIP_CMD="tar -xvzf" +TAR_CMD="tar -cvhf" +TAR_CMD_APPEND="tar -rvhf" +UNTAR_CMD="tar -xvf" +ZIP_CMD="gzip" +NICE_CMD="/usr/bin/nice -n19" +IONICE_CMD="/usr/bin/ionice -c2 -n7" +COLLECT_TAG="COLLECT" + +STARTDATE_OPTION="--start-date" +ENDDATE_OPTION="--end-date" + + +PROCESS_DETAIL_CMD="ps -e -H -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,tty,cputime,wchan:14,cmd" +BUILD_INFO_CMD="cat /etc/build.info" + +################################################################################ +# Log Debug, Info or Error log message to syslog +################################################################################ +function log +{ + logger -t ${COLLECT_TAG} $@ +} + +function ilog +{ + echo "$@" + logger -t ${COLLECT_TAG} $@ +} + +function elog +{ + echo "${COLLECT_ERROR} $@" + logger -t ${COLLECT_TAG} "${COLLECT_ERROR} $@" +} + +function wlog +{ + echo "${COLLECT_WARN} $@" + logger -t ${COLLECT_TAG} "${COLLECT_WARN} $@" +} + +function set_debug_mode() +{ + DEBUG=${1} +} + +function dlog() +{ + if [ "$DEBUG" == true ] ; then + logger -t ${COLLECT_TAG} "${COLLECT_DEBUG} $@" + echo "$(date) ${COLLECT_DEBUG} $@" + fi +} + + +function delimiter() +{ + echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG} + echo "`date` : ${myhostname} : ${2}" >> ${1} 2>>${COLLECT_ERROR_LOG} + echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG} +} + +function log_slabinfo() +{ + PAGE_SIZE=$(getconf PAGE_SIZE) + cat /proc/slabinfo | awk -v page_size_B=${PAGE_SIZE} ' + BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;} + (NF == 17) { + gsub(/[<>]/, ""); + printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n", + $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB"); + } + (NF == 16) { + num_objs=$3; obj_per_slab=$5; pages_per_slab=$6; + KiB = (obj_per_slab > 0) ? 
page_KiB*num_objs/obj_per_slab*pages_per_slab : 0; + TOT_KiB += KiB; + printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n", + $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB); + } + END { + printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n", + "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB); + } + ' >> ${1} 2>>${COLLECT_ERROR_LOG} +} +########################################################################### +# +# Name : collect_errors +# +# Description: search COLLECT_ERROR_LOG for "No space left on device" logs +# Return 0 if no such logs are found. +# Return 1 if such logs are found +# +# Assumptions: Caller should assume a non-zero return as an indication of +# a corrupt or incomplete collect log +# +# Create logs and screen echoes that record the error for the user. +# +# May look for other errors in the future +# +########################################################################### + +listOfOutOfSpaceErrors=( +"${FAIL_OUT_OF_SPACE_STR}" +"${FAIL_TAR_OUT_OF_SPACE_STR}" +"${FAIL_INSUFFICIENT_SPACE_STR}" +) + +function collect_errors() +{ + local host=${1} + local RC=0 + + if [ -e "${COLLECT_ERROR_LOG}" ] ; then + + ## now loop through known space related error strings + index=0 + while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ] ; do + grep -q "${listOfOutOfSpaceErrors[index]}" ${COLLECT_ERROR_LOG} + if [ "$?" == "0" ] ; then + + string="failed to collect from ${host} (reason:${FAIL_OUT_OF_SPACE}:${FAIL_OUT_OF_SPACE_STR})" + + # /var/log/user.log it + logger -t ${COLLECT_TAG} "${string}" + + # logs that show up in the foreground + echo "${string}" + echo "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation." + + # return error code + RC=1 + break + fi + index=$(($index+1)) + done + fi + return ${RC} +} + +############################################################################ +# +# Name : space_precheck +# +# Description: abort collect when COLLECT_BASE_DIR usage exceeds the +# allowed percentage +# +############################################################################ + +function space_precheck() +{ + HOSTNAME=${1} + COLLECT_BASE_DIR=${2} + COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}" + + space="`${COLLECT_DIR_PCENT_CMD}`" + space1=`echo "${space}" | grep -v Use` + size=`echo ${space1} | cut -f 1 -d '%'` + if [ ${size} -ge 0 -a ${size} -le 100 ] ; then + if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then + ilog "${COLLECT_BASE_DIR} is $size% full" + echo "${FAIL_INSUFFICIENT_SPACE_STR}" + wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space to perform collect" + wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect" + wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
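+ # fail hard; callers see exit code FAIL_INSUFFICIENT_SPACE (38)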
+ exit ${FAIL_INSUFFICIENT_SPACE} + fi + else + wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output" + fi +} + diff --git a/tools/collector/debian-scripts/etc.exclude b/tools/collector/debian-scripts/etc.exclude new file mode 100644 index 00000000..85987308 --- /dev/null +++ b/tools/collector/debian-scripts/etc.exclude @@ -0,0 +1,41 @@ +/etc/postgresql +/etc/alternatives +/etc/terminfo +/etc/tempest +/etc/security +/etc/yum +/etc/collect +/etc/collect.d +/etc/logrotate.d +/etc/logrotate* +/etc/keystone +/etc/pam.d +/etc/environment +/etc/sudoers.d +/etc/sudoers +/etc/passwd +/etc/passwd- +/etc/shadow +/etc/shadow- +/etc/gshadow +/etc/gshadow- +/etc/group +/etc/group- +/etc/ssh +/etc/X11 +/etc/bluetooth +/etc/chatscripts +/etc/cron* +/etc/rc5.d +/etc/rc4.d +/etc/rc1.d +/etc/rc2.d +/etc/bash_completion.d +/etc/pm +/etc/systemd/system/*.mount +/etc/systemd/system/*.socket +/etc/systemd/system/lvm2-lvmetad.service +/etc/systemd/system/ctrl-alt-del.target +/etc/ssl +/etc/mtc/tmp +/etc/kubernetes/pki diff --git a/tools/collector/debian-scripts/expect_done b/tools/collector/debian-scripts/expect_done new file mode 100755 index 00000000..a846adb7 --- /dev/null +++ b/tools/collector/debian-scripts/expect_done @@ -0,0 +1 @@ +expect done diff --git a/tools/collector/debian-scripts/mariadb-cli.sh b/tools/collector/debian-scripts/mariadb-cli.sh new file mode 100755 index 00000000..f5244e02 --- /dev/null +++ b/tools/collector/debian-scripts/mariadb-cli.sh @@ -0,0 +1,232 @@ +#!/bin/bash + +# Copyright (c) 2020 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# This script is a wrapper for the containerized mariadb-server mysql client. +# This provides access to MariaDB databases. +# +# There are three modes of operation: +# - no command specified gives an interactive mysql shell +# - command specified executes a single mysql command +# - dump option to dump database contents to sql text file +# +set -euo pipefail + +# Define minimal path +PATH=/bin:/usr/bin:/usr/local/bin + +# Environment for kubectl +export KUBECONFIG=/etc/kubernetes/admin.conf + +# Process input options +SCRIPT=$(basename $0) +OPTS=$(getopt -o dh --long debug,help,command:,database:,exclude:,dump -n ${SCRIPT} -- "$@") +if [ $? != 0 ]; then + echo "Failed parsing options."
>&2 + exit 1 +fi +eval set -- "$OPTS" + +DEBUG=false +HELP=false +DUMP=false +COMMAND="" +DATABASE="" +EXCLUDE="" +while true +do + case "$1" in + -d | --debug ) DEBUG=true; shift ;; + -h | --help ) HELP=true; shift ;; + --command ) + COMMAND="$2" + shift 2 + ;; + --database ) + DATABASE="$2" + shift 2 + ;; + --exclude ) + EXCLUDE="$2" + shift 2 + ;; + --dump ) + DUMP=true + shift + ;; + -- ) + shift + break + ;; + * ) + break + ;; + esac +done + +# Treat remaining arguments as commands + options +shift $((OPTIND-1)) +OTHERARGS="$@" + +if [ ${HELP} == 'true' ]; then + echo "Usage: ${SCRIPT} [-d|--debug] [-h|--help] [--database <db>] [--exclude <db,...>] [--command <cmd>] [--dump]" + echo "Options:" + echo " -d | --debug : display debug information" + echo " -h | --help : this help" + echo " --database <db> : connect to database <db>" + echo " --exclude <db,...> : list of databases to exclude" + echo " --command <cmd> : execute mysql command <cmd>" + echo " --dump : dump database(s) to sql file in current directory" + echo + echo "Command option examples:" + echo + echo "Interactive mysql shell:" + echo " mariadb-cli" + echo " mariadb-cli --database nova" + echo " mariadb-cli --command 'show databases'" + echo " mariadb-cli --database nova --command 'select * from compute_nodes'" + echo + echo "Dump MariaDB databases to sql file:" + echo " mariadb-cli --dump" + echo " mariadb-cli --dump --database nova" + echo " mariadb-cli --dump --exclude keystone" + exit 0 +fi + + +# Logger setup +LOG_FACILITY=user +LOG_PRIORITY=info +function LOG { + logger -t "${0##*/}[$$]" -p ${LOG_FACILITY}.${LOG_PRIORITY} "$@" + echo "${0##*/}[$$]" "$@" +} +function ERROR { + MSG="ERROR" + LOG "${MSG} $@" +} + +function is_openstack_node { + local PASS=0 + local FAIL=1 + # NOTE: hostname changes during first configuration + local this_node=$(cat /proc/sys/kernel/hostname) + + labels=$(kubectl get node ${this_node} \ + --no-headers --show-labels 2>/dev/null | awk '{print $NF}') + if [[ $labels =~ openstack-control-plane=enabled ]]; then + return ${PASS} + else + return ${FAIL} + fi +} + +# Selected options +if [ ${DEBUG} == 'true' ]; then + LOG "Options: DUMP=${DUMP} OTHERARGS: ${OTHERARGS}" + if [ ! -z "${DATABASE}" ]; then + LOG "Options: DATABASE:${DATABASE}" + fi + if [ ! -z "${EXCLUDE}" ]; then + LOG "Options: EXCLUDE:${EXCLUDE}" + fi + if [ ! -z "${COMMAND}" ]; then + LOG "Options: COMMAND:${COMMAND}" + fi +fi + +# Check for openstack label on this node +if ! is_openstack_node; then + ERROR "This node is not configured for openstack." + exit 1 +fi + +# Determine running mariadb pods +MARIADB_PODS=( $(kubectl get pods -n openstack \ + --selector=application=mariadb,component=server \ + --field-selector status.phase=Running \ + --output=jsonpath={.items..metadata.name}) ) +if [ ${DEBUG} == 'true' ]; then + LOG "Found mariadb-server pods: ${MARIADB_PODS[@]}" +fi + +# Get first available mariadb pod with container we can exec +DBPOD="" +for POD in "${MARIADB_PODS[@]}" +do + kubectl exec -it -n openstack ${POD} -c mariadb -- pwd 1>/dev/null 2>/dev/null + RC=$? + if [ ${RC} -eq 0 ]; then + DBPOD=${POD} + break + fi +done +if [ -z "${DBPOD}" ]; then + ERROR "Could not find mariadb-server pod." + exit 1 +fi +if [ ${DEBUG} == 'true' ]; then + LOG "Found mariadb-server pod: ${DBPOD}" +fi + +EVAL='eval env 1>/dev/null' +DBOPTS='--password=$MYSQL_DBADMIN_PASSWORD --user=$MYSQL_DBADMIN_USERNAME' + +if [ ${DUMP} == 'true' ]; then + # Dump database contents to sql text file + DB_EXT=sql + + DATABASES=() + if [ !
-z "${DATABASE}" ]; then + DATABASES+=( $DATABASE ) + else + # Get list of databases + MYSQL_CMD="${EVAL}; mysql ${DBOPTS} -e 'show databases' -sN --disable-pager" + if [ ${DEBUG} == 'true' ]; then + LOG "MYSQL_CMD: ${MYSQL_CMD}" + fi + + # Suppress error: line from stdout, eg., + # error: Found option without preceding group in config file: /etc/mysql/conf.d/20-override.cnf at line: 1 + # Exclude databases: mysql, information_schema, performance_schema + # Remove linefeed control character. + DATABASES=( $(kubectl exec -it -n openstack ${DBPOD} -c mariadb -- bash -c "${MYSQL_CMD}" | \ + grep -v -e error: -e mysql -e information_schema -e performance_schema | tr -d '\r') ) + fi + + for dbname in "${DATABASES[@]}" + do + re=\\b"${dbname}"\\b + if [[ "${EXCLUDE}" =~ ${re} ]]; then + LOG "excluding: ${dbname}" + continue + fi + + # NOTE: --skip-opt will show an INSERT for each record + DUMP_CMD="${EVAL}; mysqldump ${DBOPTS} --skip-opt --skip-comments --skip-set-charset ${dbname}" + dbfile=${dbname}.${DB_EXT} + LOG "Dump database: $dbname to file: ${dbfile}" + if [ ${DEBUG} == 'true' ]; then + LOG "DUMP_CMD: ${DUMP_CMD}" + fi + kubectl exec -it -n openstack ${DBPOD} -c mariadb -- bash -c "${DUMP_CMD}" > ${dbfile} + done + +else + # Interactive mariadb mysql client + LOG "Interactive MariaDB mysql shell" + MYSQL_CMD="${EVAL}; mysql ${DBOPTS} ${DATABASE}" + if [ ! -z "${COMMAND}" ]; then + MYSQL_CMD="${MYSQL_CMD} -e '${COMMAND}'" + fi + + if [ ${DEBUG} == 'true' ]; then + LOG "MYSQL_CMD: ${MYSQL_CMD}" + fi + kubectl exec -it -n openstack ${DBPOD} -c mariadb -- bash -c "${MYSQL_CMD}" +fi + +exit 0 diff --git a/tools/collector/debian-scripts/run.exclude b/tools/collector/debian-scripts/run.exclude new file mode 100644 index 00000000..15e9ba06 --- /dev/null +++ b/tools/collector/debian-scripts/run.exclude @@ -0,0 +1,14 @@ +/var/run/sanlock/sanlock.sock +/var/run/tgtd.ipc_abstract_namespace.0 +/var/run/wdmd/wdmd.sock +/var/run/acpid.socket +/var/run/rpcbind.sock +/var/run/libvirt/libvirt-sock-ro +/var/run/libvirt/libvirt-sock +/var/run/dbus/system_bus_socket +/var/run/named-chroot +/var/run/avahi-daemon +/var/run/neutron/metadata_proxy +/var/run/.vswitch +/var/run/containerd +/var/run/nvidia diff --git a/tools/collector/debian-scripts/varlog.exclude b/tools/collector/debian-scripts/varlog.exclude new file mode 100644 index 00000000..8b035de3 --- /dev/null +++ b/tools/collector/debian-scripts/varlog.exclude @@ -0,0 +1 @@ +/var/log/crash diff --git a/tools/collector/debian/meta_data.yaml b/tools/collector/debian/meta_data.yaml index 57d83649..4be9bdef 100644 --- a/tools/collector/debian/meta_data.yaml +++ b/tools/collector/debian/meta_data.yaml @@ -1,7 +1,7 @@ --- debname: collector debver: 1.0-1 -src_path: scripts +src_path: debian-scripts revision: dist: $STX_DIST PKG_GITREVCOUNT: true