
This update adds a new --timeout command line option to the collect tool so that users can extend collect's global timeout. Prior to this update the collect tool had a fixed 1000 second (about 16.7 minute) timeout. Collect of hosts in large busy systems can take an unpredictably long time, sometimes longer than 1000 seconds. This can be particularly true when collecting from the active controller while it is deploying and managing lots of pods across many hosts. This new timeout option allows the user to specify a timeout in minutes, between 10 and 120, while defaulting to 20 minutes. The default or user specified global timeout is passed to subclouds for subcloud collect as well.
Test Plan:
PASS: Verify new --timeout or -t options at command line arg level
PASS: Verify --timeout <minutes> parse; error, in and out of bounds
PASS: Verify timeout option is described in collect help
PASS: Verify 110 minute collect with --timeout 120
PASS: Verify 45 minute collect times out with --timeout 40
PASS: Verify 2 minute collect with --timeout 10
PASS: Verify default timeout is 20 minutes
PASS: Verify default or specified timeout is displayed
PASS: Verify default or specified timeout is shared with the subcloud
PASS: Verify timeout error handling.
PASS: Verify collect error handling behavior if --timeout or -t is specified but the number of minutes is missing.
Regression:
PASS: Verify collect system and subcloud handling
PASS: Verify system and subcloud dated collects ; verified content
PASS: Verify collect with a variety of options
Closes-Bug: 2004666
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: Ib68b78f7c810f43fc8d13cbf291ac00f08c3c4f4
323 lines
8.9 KiB
Bash
Executable File
323 lines
8.9 KiB
Bash
Executable File
#! /bin/bash
|
|
#
|
|
# Copyright (c) 2013-2019 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
##########################################################################################
|
|
|
|
# Debug logging is off by default; dlog only emits when this is "true"
# and set_debug_mode overwrites it.
DEBUG=false

# Fail Codes
PASS=0
FAIL=1
RETRY=2

FAIL_NODETYPE=3

FAIL_TIMEOUT=10
FAIL_TIMEOUT1=11
FAIL_TIMEOUT2=12
FAIL_TIMEOUT3=13
FAIL_TIMEOUT4=14
FAIL_TIMEOUT5=15
FAIL_TIMEOUT6=16
FAIL_TIMEOUT7=17
FAIL_TIMEOUT8=18
FAIL_TIMEOUT9=19

FAIL_SUBCLOUD_TIMEOUT=20

FAIL_PASSWORD=30
FAIL_PERMISSION=31
FAIL_CLEANUP=32
FAIL_UNREACHABLE=33
FAIL_HOSTNAME=34
FAIL_INACTIVE=35
FAIL_PERMISSION_SKIP=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
FAIL_INTERNAL=39
FAIL_NO_TARDIR=40
FAIL_NO_TARBALLS=41
FAIL_NO_FILE_SPECIFIED=42
FAIL_FILE_NOT_FOUND=43
FAIL_FILE_EMPTY=44
FAIL_PASSWORD_PROMPT=45
FAIL_MISSING_PARAMETER=46
FAIL_DATE_FORMAT=47
FAIL_NO_HOSTS=48
FAIL_FILE_COPY=49
FAIL_SUBCLOUD=50
FAIL_CONTINUE=51
FAIL_SUBCLOUDNAME=52
FAIL_NO_SUBCLOUDS=53
FAIL_NOT_SYSTEMCONTROLLER=54
FAIL_NAME_TOO_LONG=55
FAIL_INVALID_START_DATE=56
FAIL_INVALID_END_DATE=57
FAIL_INVALID_DATE_RANGE=58
FAIL_TIMEOUT_ARG=59

# Warning codes start at 200
WARN_WARNING=200
WARN_HOSTNAME=201
WARN_SUBCLOUD=202
|
|
|
|
# Prefixes that elog, dlog and wlog put in front of their messages
COLLECT_ERROR="Error:"
COLLECT_DEBUG="Debug:"
COLLECT_WARN="Warning:"

# Failure Strings
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
FAIL_UNREACHABLE_STR="Unreachable"

FAIL_TIMEOUT_STR="operation timeout"
FAIL_SUBCLOUD_TIMEOUT_STR="subcloud collect timeout"

FAIL_NO_FILE_SPECIFIED_STR="no file specified"
FAIL_FILE_NOT_FOUND_STR="no such file or directory"
FAIL_FILE_EMPTY_STR="file is empty"
FAIL_PASSWORD_PROMPT_STR="password for"

FAIL_DATE_FORMAT_STR="date format"
FAIL_INACTIVE_STR="not active"
FAIL_NO_HOSTS_STR="empty host list"
FAIL_NO_SUBCLOUDS_STR="empty subcloud list"
FAIL_MISSING_PARAMETER_STR="missing parameter"
FAIL_FILE_COPY_STR="failed to copy"
FAIL_CONTINUE_STR="cannot continue"
|
|
|
|
# Usage limit, in percent used, for the collect base directory.
# space_precheck refuses to start a collect when the directory is at or
# above this percentage full.
MIN_PERCENT_SPACE_REQUIRED=75

# Subcloud collect stops when avail scratch drops below this threshold.
# Use collect -sc --continue to tell collect to continue collecting subclouds
# from where it left off.
# Value is 2 GiB (2^31 bytes) divided by 1000 and rounded up.
# NOTE(review): the original comment calls these "K blocks"; confirm the unit
# the consumer of this threshold compares against.
declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2Gib in K blocks rounded up

# Log file path/names
COLLECT_LOG=/var/log/collect.log
COLLECT_ERROR_LOG=/tmp/collect_error.log
HOST_COLLECT_ERROR_LOG="/tmp/host_collect_error.log"

# Distributed cloud role names
DCROLE_SYSTEMCONTROLLER="systemcontroller"
DCROLE_SUBCLOUD="subcloud"
|
|
|
|
# Source the platform configuration and, on controllers, the openrc file.
#
# Globals written:
#   nodetype, subfunction - reset to "" before /etc/platform/platform.conf
#                           is sourced; that file is expected to assign them
#   PLATFORM_CONF, OPENRC - paths of the files that get sourced
#   OS_PASSWORD           - reset to "" before openrc is sourced
#   ACTIVE                - "true" only on a controller where sourcing openrc
#                           leaves OS_PASSWORD non-empty, otherwise "false"
# Globals read: COLLECT_TAG, FAIL_NODETYPE
# Exits: with FAIL_NODETYPE when nodetype is not controller, worker or storage.
function source_openrc_if_needed
{
    # get the node and subfunction types
    nodetype=""
    subfunction=""
    PLATFORM_CONF=/etc/platform/platform.conf
    if [ -e "${PLATFORM_CONF}" ] ; then
        source "${PLATFORM_CONF}"
    fi

    case "${nodetype}" in
        controller|worker|storage)
            ;;
        *)
            logger -t "${COLLECT_TAG}" "could not identify nodetype ($nodetype)"
            exit "${FAIL_NODETYPE}"
            ;;
    esac

    ACTIVE=false
    if [ "${nodetype}" == "controller" ] ; then
        # get local host activity state
        OPENRC="/etc/platform/openrc"
        if [ -e "${OPENRC}" ] ; then
            # Cleared first so a stale value cannot be mistaken for one
            # that openrc just provided.
            OS_PASSWORD=""
            source "${OPENRC}" 2>/dev/null 1>/dev/null
            if [ "${OS_PASSWORD}" != "" ] ; then
                ACTIVE=true
            fi
        fi
    fi
}
|
|
|
|
|
|
# Setup an expect command completion file.
# This is used to force serialization of expect
# sequences and highlight command completion
collect_done="collect done"
cmd_done_sig="expect done"
cmd_done_file="/usr/local/sbin/expect_done"

# Compression Commands
TAR_ZIP_CMD="tar -cvzf"
TAR_UZIP_CMD="tar -xvzf"
TAR_CMD="tar -cvhf"
TAR_CMD_APPEND="tar -rvhf"
UNTAR_CMD="tar -xvf"
ZIP_CMD="gzip"
NICE_CMD="/usr/bin/nice -n19"
IONICE_CMD="/usr/bin/ionice -c2 -n7"

# Tag passed to logger -t by the logging helpers
COLLECT_TAG="COLLECT"

# Date range option names
STARTDATE_OPTION="--start-date"
ENDDATE_OPTION="--end-date"

# Information gathering commands
PROCESS_DETAIL_CMD="ps -e -H -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,tty,cputime,wchan:14,cmd"
BUILD_INFO_CMD="cat /etc/build.info"
|
|
|
|
################################################################################
|
|
# Log Debug, Info or Error log message to syslog
|
|
################################################################################
|
|
# Send a message to syslog under COLLECT_TAG without echoing it.
# Arguments: the message words
function log
{
    # Quoted so the shell neither re-splits the message on whitespace nor
    # expands glob characters in it before logger sees it.
    logger -t "${COLLECT_TAG}" "$@"
}
|
|
|
|
# Print a message on stdout and send the same message to syslog
# under COLLECT_TAG.
# Arguments: the message words
function ilog
{
    echo "$@"
    # Quoted so syslog receives exactly what was echoed; unquoted, the
    # message would be re-split and glob-expanded.
    logger -t "${COLLECT_TAG}" "$@"
}
|
|
|
|
# Print an error message on stdout, prefixed with COLLECT_ERROR, and send
# the same text to syslog under COLLECT_TAG.
# Arguments: the message words, joined with single spaces
function elog
{
    # "$*" keeps prefix and message as one argument; "$@" inside a longer
    # string would split the message over several arguments.
    local message="${COLLECT_ERROR} $*"

    echo "${message}"
    logger -t "${COLLECT_TAG}" "${message}"
}
|
|
|
|
# Print a warning message on stdout, prefixed with COLLECT_WARN, and send
# the same text to syslog under COLLECT_TAG.
# Arguments: the message words, joined with single spaces
function wlog
{
    # "$*" keeps prefix and message as one argument; "$@" inside a longer
    # string would split the message over several arguments.
    local message="${COLLECT_WARN} $*"

    echo "${message}"
    logger -t "${COLLECT_TAG}" "${message}"
}
|
|
|
|
# Set the global DEBUG flag that dlog checks.
# Arguments: $1 - new value; dlog only emits when it is exactly "true"
function set_debug_mode()
{
    DEBUG=${1}
}
|
|
|
|
# Emit a debug message, but only while DEBUG is "true".
# The message goes to syslog under COLLECT_TAG and, prefixed with the
# current date, to stdout; both carry the COLLECT_DEBUG prefix.
# Arguments: the message words, joined with single spaces
function dlog()
{
    if [[ "${DEBUG}" == true ]] ; then
        # "$*" keeps prefix and message as one argument; "$@" inside a
        # longer string would split the message over several arguments.
        local message="${COLLECT_DEBUG} $*"

        logger -t "${COLLECT_TAG}" "${message}"
        echo "$(date) ${message}"
    fi
}
|
|
|
|
|
|
# Append a three-line banner to a file: a dashed rule, a line of the form
# "<date> : <myhostname> : <title>", and a second dashed rule.
# Arguments: $1 - file to append to
#            $2 - title text for the banner
# Globals read: myhostname, COLLECT_ERROR_LOG (receives stderr of the writes)
function delimiter()
{
    local rule="--------------------------------------------------------------------"

    # Redirect targets are quoted so a path containing spaces does not
    # fail with "ambiguous redirect".
    {
        echo "${rule}"
        echo "$(date) : ${myhostname} : ${2}"
        echo "${rule}"
    } >> "${1}" 2>> "${COLLECT_ERROR_LOG}"
}
|
|
|
|
# Append a formatted copy of /proc/slabinfo to a file, adding a KiB column
# per cache and a final TOTAL row.
# Arguments: $1 - file to append to
# Globals written: PAGE_SIZE (from getconf)
# Globals read:    COLLECT_ERROR_LOG (receives awk's stderr)
function log_slabinfo()
{
    PAGE_SIZE=$(getconf PAGE_SIZE)

    # cat feeds awk through a pipe on purpose: if /proc/slabinfo cannot be
    # read, awk still runs on empty input and writes the TOTAL row.
    cat /proc/slabinfo | awk -v page_size_B="${PAGE_SIZE}" '
    BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}

    # Column header line: drop the <> around the names and label the
    # extra column.
    (NF == 17) {
        gsub(/[<>]/, "");
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
            $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
    }

    # Cache line: KiB = page size in KiB * slabs (num_objs / objperslab)
    # * pages per slab.
    (NF == 16) {
        num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
        KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
        TOT_KiB += KiB;
        printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
            $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
    }

    END {
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
            "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
    }
    ' >> "${1}" 2>> "${COLLECT_ERROR_LOG}"
}
|
|
###########################################################################
|
|
#
|
|
# Name : collect_errors
|
|
#
|
|
# Description: search COLLECT_ERROR_LOG for "No space left on device" logs
|
|
# Return 0 if no such logs are found.
|
|
# Return 1 if such logs are found
|
|
#
|
|
# Assumptions: Caller should assume a non-zero return as an indication of
|
|
# a corrupt or incomplete collect log
|
|
#
|
|
# Create logs and screen echos that record the error for the user.
|
|
#
|
|
# May look for other errors in the future
|
|
#
|
|
###########################################################################
|
|
|
|
# Space related error strings that collect_errors searches for, in order,
# in COLLECT_ERROR_LOG.
listOfOutOfSpaceErrors=(
    "${FAIL_OUT_OF_SPACE_STR}"
    "${FAIL_TAR_OUT_OF_SPACE_STR}"
    "${FAIL_INSUFFICIENT_SPACE_STR}"
)
|
|
|
|
# Search COLLECT_ERROR_LOG for the strings in listOfOutOfSpaceErrors.
# On the first match, report the failure to syslog and stdout; the report
# always uses FAIL_OUT_OF_SPACE and FAIL_OUT_OF_SPACE_STR, whichever
# string matched.
# Arguments: $1 - host name used in the report
# Returns:   0 when the log does not exist or holds none of the strings,
#            1 when one of the strings was found
function collect_errors()
{
    local host=${1}
    local RC=0
    # Scratch variables are local so the caller's globals are not clobbered.
    local pattern
    local message

    if [ -e "${COLLECT_ERROR_LOG}" ] ; then

        ## now loop through known space related error strings
        for pattern in "${listOfOutOfSpaceErrors[@]}" ; do
            # An empty entry ends the list; grepping for "" would match
            # every line.
            [ -n "${pattern}" ] || break

            if grep -q -- "${pattern}" "${COLLECT_ERROR_LOG}" ; then

                message="failed to collect from ${host} (reason:${FAIL_OUT_OF_SPACE}:${FAIL_OUT_OF_SPACE_STR})"

                # /var/log/user.log it
                logger -t "${COLLECT_TAG}" "${message}"

                # logs that show up in the foreground
                echo "${message}"
                echo "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation."

                # return error code
                RC=1
                break
            fi
        done
    fi
    return ${RC}
}
|
|
|
|
############################################################################
#
# Name       : space_precheck
#
# Description: Check how full the collect base directory is before a
#              collect starts. Usage is read from 'df --output=pcent'.
#              When usage is at or above MIN_PERCENT_SPACE_REQUIRED the
#              failure is reported and the shell exits.
#              When the usage cannot be parsed a warning is logged and
#              the collect is allowed to proceed.
#
# Parameters : $1 - host name used in the messages
#              $2 - collect base directory to check
#
# Globals    : writes HOSTNAME, COLLECT_BASE_DIR, COLLECT_DIR_PCENT_CMD
#              reads  MIN_PERCENT_SPACE_REQUIRED, FAIL_INSUFFICIENT_SPACE,
#                     FAIL_INSUFFICIENT_SPACE_STR
#
# Exits      : with FAIL_INSUFFICIENT_SPACE when the directory is too full
#
############################################################################

function space_precheck()
{
    HOSTNAME=${1}
    COLLECT_BASE_DIR=${2}
    # Kept only for the diagnostic message; df itself is invoked with a
    # quoted path so a directory name containing spaces is not split.
    COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"

    # First data line of the df output with the '%' and blanks removed.
    local size
    size=$(df --output=pcent -- "${COLLECT_BASE_DIR}" \
        | awk 'NR > 1 { sub(/%.*/, ""); gsub(/[[:space:]]/, ""); print; exit }')

    # Validate before comparing: an empty or non-numeric value would make
    # the numeric test print an operator error instead of the warning.
    if [[ "${size}" =~ ^[0-9]+$ ]] && [ "${size}" -le 100 ] ; then
        if [ "${size}" -ge "${MIN_PERCENT_SPACE_REQUIRED}" ] ; then
            ilog "${COLLECT_BASE_DIR} is $size% full"
            echo "${FAIL_INSUFFICIENT_SPACE_STR}"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
            exit "${FAIL_INSUFFICIENT_SPACE}"
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}
|
|
|