Eric MacDonald 0b079b4804 Add --timeout option to collect tool
This update adds a new --timeout command line option to the collect
tool so that users can extend collect's global timeout.

Prior to this update the collect tool had a fixed 1000 second
or 16.6 minute timeout. Collect of hosts in large busy systems can
take an unpredictably long time. Sometimes longer than 1000 seconds.
This can be particularly true when collecting from the active
controller deploying and managing lots of pods across many hosts.

This new timeout option allows the user to specify a specific timeout
in minutes, between 10 and 120, while defaulting to 20 minutes.
The default or user specified global timeout is passed to subclouds
for subcloud collect as well.

Test Plan:

PASS: Verify new --timeout or -t options at command line arg level
PASS: Verify --timeout <minutes> parse; error, in and out of bounds
PASS: Verify timeout option is described in collect help
PASS: Verify 110 minute collect with --timeout 120
PASS: Verify 45 minute collect times out with --timeout 40
PASS: Verify 2 minute collect with --timeout 10
PASS: Verify default timeout is 20 minutes
PASS: Verify default or specified timeout is displayed
PASS: Verify default or specified timeout is shared with the subcloud
PASS: Verify timeout error handling.
PASS: Verify collect error handling behavior if --timeout or -t is
      specified but the number of minutes is missing.

Regression:

PASS: Verify collect system and subcloud handling
PASS: Verify system and subcloud dated collects ; verified content
PASS: Verify collect with a variety of options

Closes-Bug: 2004666
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: Ib68b78f7c810f43fc8d13cbf291ac00f08c3c4f4
2023-02-14 13:14:14 -05:00

323 lines
8.9 KiB
Bash
Executable File

#! /bin/bash
#
# Copyright (c) 2013-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
##########################################################################################
DEBUG=false
# Fail Codes
PASS=0
FAIL=1
RETRY=2
FAIL_NODETYPE=3
FAIL_TIMEOUT=10
FAIL_TIMEOUT1=11
FAIL_TIMEOUT2=12
FAIL_TIMEOUT3=13
FAIL_TIMEOUT4=14
FAIL_TIMEOUT5=15
FAIL_TIMEOUT6=16
FAIL_TIMEOUT7=17
FAIL_TIMEOUT8=18
FAIL_TIMEOUT9=19
FAIL_SUBCLOUD_TIMEOUT=20
FAIL_PASSWORD=30
FAIL_PERMISSION=31
FAIL_CLEANUP=32
FAIL_UNREACHABLE=33
FAIL_HOSTNAME=34
FAIL_INACTIVE=35
FAIL_PERMISSION_SKIP=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
FAIL_INTERNAL=39
FAIL_NO_TARDIR=40
FAIL_NO_TARBALLS=41
FAIL_NO_FILE_SPECIFIED=42
FAIL_FILE_NOT_FOUND=43
FAIL_FILE_EMPTY=44
FAIL_PASSWORD_PROMPT=45
FAIL_MISSING_PARAMETER=46
FAIL_DATE_FORMAT=47
FAIL_NO_HOSTS=48
FAIL_FILE_COPY=49
FAIL_SUBCLOUD=50
FAIL_CONTINUE=51
FAIL_SUBCLOUDNAME=52
FAIL_NO_SUBCLOUDS=53
FAIL_NOT_SYSTEMCONTROLLER=54
FAIL_NAME_TOO_LONG=55
FAIL_INVALID_START_DATE=56
FAIL_INVALID_END_DATE=57
FAIL_INVALID_DATE_RANGE=58
FAIL_TIMEOUT_ARG=59
# Warnings are above 200
WARN_WARNING=200
WARN_HOSTNAME=201
WARN_SUBCLOUD=202
COLLECT_ERROR="Error:"
COLLECT_DEBUG="Debug:"
COLLECT_WARN="Warning:"
# Failure Strings
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
FAIL_UNREACHABLE_STR="Unreachable"
FAIL_TIMEOUT_STR="operation timeout"
FAIL_SUBCLOUD_TIMEOUT_STR="subcloud collect timeout"
FAIL_NO_FILE_SPECIFIED_STR="no file specified"
FAIL_FILE_NOT_FOUND_STR="no such file or directory"
FAIL_FILE_EMPTY_STR="file is empty"
FAIL_PASSWORD_PROMPT_STR="password for"
FAIL_DATE_FORMAT_STR="date format"
FAIL_INACTIVE_STR="not active"
FAIL_NO_HOSTS_STR="empty host list"
FAIL_NO_SUBCLOUDS_STR="empty subcloud list"
FAIL_MISSING_PARAMETER_STR="missing parameter"
FAIL_FILE_COPY_STR="failed to copy"
FAIL_CONTINUE_STR="cannot continue"
# The minimum amount of % free space on /scratch to allow collect to proceed
MIN_PERCENT_SPACE_REQUIRED=75
# Subcloud collect stops when avail scratch drops below this threshold.
# Use collect -sc --continue to tell collect to continue collecting subclouds
# from where it left off.
# 2Gib in K blocks rounded up
declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2Gib in K blocks rounded up
# Log file path/names
COLLECT_LOG=/var/log/collect.log
COLLECT_ERROR_LOG=/tmp/collect_error.log
HOST_COLLECT_ERROR_LOG="/tmp/host_collect_error.log"
DCROLE_SYSTEMCONTROLLER="systemcontroller"
DCROLE_SUBCLOUD="subcloud"
function source_openrc_if_needed
{
# get the node and subfunction types
nodetype=""
subfunction=""
PLATFORM_CONF=/etc/platform/platform.conf
if [ -e ${PLATFORM_CONF} ] ; then
source ${PLATFORM_CONF}
fi
if [ "${nodetype}" != "controller" -a "${nodetype}" != "worker" -a "${nodetype}" != "storage" ] ; then
logger -t ${COLLECT_TAG} "could not identify nodetype ($nodetype)"
exit $FAIL_NODETYPE
fi
ACTIVE=false
if [ "$nodetype" == "controller" ] ; then
# get local host activity state
OPENRC="/etc/platform/openrc"
if [ -e "${OPENRC}" ] ; then
OS_PASSWORD=""
source ${OPENRC} 2>/dev/null 1>/dev/null
if [ "${OS_PASSWORD}" != "" ] ; then
ACTIVE=true
fi
fi
fi
}
# Setup an expect command completion file.
# This is used to force serialization of expect
# sequences and highlight command completion
collect_done="collect done"
cmd_done_sig="expect done"
cmd_done_file="/usr/local/sbin/expect_done"
# Compression Commands
TAR_ZIP_CMD="tar -cvzf"
TAR_UZIP_CMD="tar -xvzf"
TAR_CMD="tar -cvhf"
TAR_CMD_APPEND="tar -rvhf"
UNTAR_CMD="tar -xvf"
ZIP_CMD="gzip"
NICE_CMD="/usr/bin/nice -n19"
IONICE_CMD="/usr/bin/ionice -c2 -n7"
COLLECT_TAG="COLLECT"
STARTDATE_OPTION="--start-date"
ENDDATE_OPTION="--end-date"
PROCESS_DETAIL_CMD="ps -e -H -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,tty,cputime,wchan:14,cmd"
BUILD_INFO_CMD="cat /etc/build.info"
################################################################################
# Log Debug, Info or Error log message to syslog
################################################################################
function log
{
logger -t ${COLLECT_TAG} $@
}
function ilog
{
echo "$@"
logger -t ${COLLECT_TAG} $@
}
function elog
{
echo "${COLLECT_ERROR} $@"
logger -t ${COLLECT_TAG} "${COLLECT_ERROR} $@"
}
function wlog
{
echo "${COLLECT_WARN} $@"
logger -t ${COLLECT_TAG} "${COLLECT_WARN} $@"
}
function set_debug_mode()
{
DEBUG=${1}
}
function dlog()
{
if [ "$DEBUG" == true ] ; then
logger -t ${COLLECT_TAG} "${COLLECT_DEBUG} $@"
echo "$(date) ${COLLECT_DEBUG} $@"
fi
}
function delimiter()
{
echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
echo "`date` : ${myhostname} : ${2}" >> ${1} 2>>${COLLECT_ERROR_LOG}
echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
}
function log_slabinfo()
{
PAGE_SIZE=$(getconf PAGE_SIZE)
cat /proc/slabinfo | awk -v page_size_B=${PAGE_SIZE} '
BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
(NF == 17) {
gsub(/[<>]/, "");
printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
$2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
}
(NF == 16) {
num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
TOT_KiB += KiB;
printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
$1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
}
END {
printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
"TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
}
' >> ${1} 2>>${COLLECT_ERROR_LOG}
}
###########################################################################
#
# Name : collect_errors
#
# Description: search COLLECT_ERROR_LOG for "No space left on device" logs
# Return 0 if no such logs are found.
# Return 1 if such logs are found
#
# Assumptions: Caller should assume a non-zero return as an indication of
# a corrupt or incomplete collect log
#
# Create logs and screen echos that record the error for the user.
#
# May look for other errors in the future
#
###########################################################################
listOfOutOfSpaceErrors=(
"${FAIL_OUT_OF_SPACE_STR}"
"${FAIL_TAR_OUT_OF_SPACE_STR}"
"${FAIL_INSUFFICIENT_SPACE_STR}"
)
function collect_errors()
{
local host=${1}
local RC=0
if [ -e "${COLLECT_ERROR_LOG}" ] ; then
## now loop through known space related error strings
index=0
while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ] ; do
grep -q "${listOfOutOfSpaceErrors[index]}" ${COLLECT_ERROR_LOG}
if [ "$?" == "0" ] ; then
string="failed to collect from ${host} (reason:${FAIL_OUT_OF_SPACE}:${FAIL_OUT_OF_SPACE_STR})"
# /var/log/user.log it
logger -t ${COLLECT_TAG} "${string}"
# logs that show up in the foreground
echo "${string}"
echo "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation."
# return error code
RC=1
break
fi
index=$(($index+1))
done
fi
return ${RC}
}
############################################################################
#
# Name : space_precheck
#
# Description:
#
############################################################################
function space_precheck()
{
HOSTNAME=${1}
COLLECT_BASE_DIR=${2}
COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"
space="`${COLLECT_DIR_PCENT_CMD}`"
space1=`echo "${space}" | grep -v Use`
size=`echo ${space1} | cut -f 1 -d '%'`
if [ ${size} -ge 0 -a ${size} -le 100 ] ; then
if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then
ilog "${COLLECT_BASE_DIR} is $size% full"
echo "${FAIL_INSUFFICIENT_SPACE_STR}"
wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
exit ${FAIL_INSUFFICIENT_SPACE}
fi
else
wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
fi
}