wguy 744b435648 Add kubelet config state files and Kubernetes PDB to collect
Update collect to save kubelet information, including state files
and config data from /var/lib/kubelet, and to query pods that have a
Pod Disruption Budget.

Test plan:

AIO-SX:
PASS: Verify collect has var/lib/kubelet contents
PASS: Verify collect containerization_kube.info contains PDB

Closes-Bug: #2091674
Implements: stx utilities tools

Change-Id: Ib025696f0832d557614b2d20f735fa102460bde8
Signed-off-by: wguy <wey-yi.guy@windriver.com>
2024-12-12 23:07:15 +00:00
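Below is a minimal sketch of the kind of Pod Disruption Budget query the
containerization plugin could use to populate containerization_kube.info;
the kubeconfig path and exact commands are assumptions for illustration,
not the literal plugin change:

    export KUBECONFIG=/etc/kubernetes/admin.conf   # assumed admin kubeconfig path
    # list all Pod Disruption Budgets across namespaces, with status columns
    kubectl get poddisruptionbudgets --all-namespaces -o wide
    # full PDB specs/status for deeper debugging
    kubectl get poddisruptionbudgets --all-namespaces -o yaml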


#! /bin/bash
########################################################################
#
# Copyright (c) 2016-2022, 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
# make these platform.conf variables global.
# values are loaded in source_openrc_if_needed.
export nodetype=""
export subfunction=""
export system_type=""
export security_profile=""
export sdn_enabled=""
export region_config=""
export vswitch_type=""
export system_mode=""
export sw_version=""
# assume this is not the active controller until learned
export ACTIVE=false
#
# Import commands, variables and convenience functions available to
# all collectors ; common and user defined.
#
source /usr/local/sbin/collect_utils
source_openrc_if_needed
#
# parse input parameters
#
COLLECT_NAME="${1}"
DEBUG=${8}
INVENTORY=${9}
set_debug_mode ${DEBUG}
expect_debug=$([ "${DEBUG}" = true ] && echo "-d" || echo "")
# Calling parms
#
# 1 = collect name
# 2 = start date option
# 3 = start date
# 4 = "any" (ignored - no longer used ; kept to support upgrades/downgrades)
# 5 = end date option
# 6 = end date
# 7 = "any" (ignored - no longer used ; kept to support upgrades/downgrades)
# 8 = debug mode
# 9 = inventory
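#
# Illustrative invocation only ; the start/end option keywords come from
# ${STARTDATE_OPTION} and ${ENDDATE_OPTION} in collect_utils and the values
# shown here are placeholders, not defaults:
#
#   collect_host <collect-name> <start-option> 20240101 any <end-option> 20240131 any true true
#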
logger -t ${COLLECT_TAG} "${0} ${1} ${2} ${3} ${4} ${5} ${6} ${7} ${8} ${9}"
# parse out the start date/time if it is present
STARTDATE_RANGE=false
STARTDATE="any"
if [ "${2}" == "${STARTDATE_OPTION}" ] ; then
if [ "${3}" != "any" -a ${#3} -gt 7 ] ; then
STARTDATE_RANGE=true
STARTDATE="${3}"
fi
fi
# parse out the end date/time if it is present
ENDDATE_RANGE=false
ENDDATE="any"
if [ "${5}" == "${ENDDATE_OPTION}" ] ; then
if [ "${6}" != "any" -a ${#6} -gt 7 ] ; then
ENDDATE_RANGE=true
ENDDATE="${6}"
fi
fi
COLLECT_BASE_DIR="/scratch"
EXTRA="var/extra"
COLLECT_NAME_DIR="${COLLECT_BASE_DIR}/${COLLECT_NAME}"
EXTRA_DIR="${COLLECT_NAME_DIR}/${EXTRA}"
TARBALL="${COLLECT_NAME_DIR}.tgz"
COLLECT_PATH="/etc/collect.d"
RUN_EXCLUDE="/etc/collect/run.exclude"
ETC_EXCLUDE="/etc/collect/etc.exclude"
VAR_LOG_EXCLUDE="/etc/collect/varlog.exclude"
COLLECT_INCLUDE="/var/run /etc /root"
FLIGHT_RECORDER_PATH="var/lib/sm/"
FLIGHT_RECORDER_FILE="sm.eru.v1"
VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst"
COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}"
COLLECT_DATE="/usr/local/sbin/collect_date"
COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv"
rm -f ${COLLECT_CMD_TIMING_LOG}
function log_space()
{
    local msg=${1}

    space="`${COLLECT_DIR_USAGE_CMD}`"
    space1=`echo "${space}" | grep -v Filesystem`
    ilog "${COLLECT_BASE_DIR} ${msg} ${space1}"
}
space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR}
CURR_DIR=`pwd`
mkdir -p ${COLLECT_NAME_DIR}
cd ${COLLECT_NAME_DIR}
# create dump target extra-stuff directory
mkdir -p ${EXTRA_DIR}
RETVAL=0
# Remove any previous collect error log.
# Start this collect with an empty file.
#
# stderr is directed to this log during the collect process.
# By searching this log after collect_host is run we can find
# errors that occurred during collect.
# The only real error that we care about right now is the
#
# "No space left on device" error
#
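# Illustrative only - the actual scan is done by the collect_errors helper
# called near the end of this script - but the check amounts to something
# like:
#
#   grep -q "No space left on device" ${COLLECT_ERROR_LOG}
#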
rm -f ${COLLECT_ERROR_LOG}
touch ${COLLECT_ERROR_LOG}
chmod 644 ${COLLECT_ERROR_LOG}
echo "`date '+%F %T'` :${COLLECT_NAME_DIR}" > ${COLLECT_ERROR_LOG}
ilog "creating local collect tarball ${COLLECT_NAME_DIR}.tgz"
################################################################################
# Run collect scripts to check system status
################################################################################
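#
# Each regular file under ${COLLECT_PATH} is run as a collect plugin with
# the following argument convention (taken from the invocations below):
#
#   <plugin> <collect name dir> <extra dir> <hostname>
#
# ... with the inventory option appended for the sysinv plugin only.
#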
function collect_parts()
{
    if [ -d ${COLLECT_PATH} ]; then
        for i in ${COLLECT_PATH}/*; do
            if [ -f $i ]; then
                local start=$(date ${DATE_FORMAT})
                if [ ${i} = ${COLLECT_SYSINV} ]; then
                    ${IONICE_CMD} ${NICE_CMD} $i ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname} ${INVENTORY}
                else
                    ${IONICE_CMD} ${NICE_CMD} $i ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname}
                fi
                local stop=$(date ${DATE_FORMAT})
                duration=$(get_time_delta "${start}" "${stop}")
                echo "${stop}: ${duration} Plugin $i" >> ${COLLECT_CMD_TIMING_LOG}

                # Add delay between parts ; i.e. collect plugins
                sleep ${COLLECT_RUNPARTS_DELAY}
            fi
        done
    fi
}
function collect_extra()
{
    # dump process lists
    LOGFILE="${EXTRA_DIR}/process.info"
    echo "${hostname}: Process Info ......: ${LOGFILE}"

    run_command "${IONICE_CMD} ${NICE_CMD} ${PROCESS_DETAIL_CMD}" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Collect process and thread info (tree view)
    run_command "${IONICE_CMD} ${NICE_CMD} pstree --arguments --ascii --long --show-pids" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Collect process, thread and scheduling info (worker subfunction only)
    # (also gets process 'affinity', which is useful on workers)
    which ps-sched.sh >/dev/null 2>&1
    if [ $? -eq 0 ]; then
        run_command "${IONICE_CMD} ${NICE_CMD} ps-sched.sh" "${LOGFILE}"
        sleep ${COLLECT_RUNEXTRA_DELAY}
    fi

    # Collect per kubernetes container name, QoS, and cpusets per numa node
    run_command "${IONICE_CMD} ${NICE_CMD} kube-cpusets" "${LOGFILE}"

    # For debugging pods and cgroups, etc
    run_command "sudo LANG=POSIX systemd-cgls cpu -k -l" "${LOGFILE}"

    # Various host attributes
    LOGFILE="${EXTRA_DIR}/host.info"
    echo "${hostname}: Host Info .........: ${LOGFILE}"

    # StarlingX build info
    run_command "${BUILD_INFO_CMD}" "${LOGFILE}"

    run_command "timedatectl" "${LOGFILE}"
    run_command "uptime" "${LOGFILE}"
    run_command "cat /proc/cmdline" "${LOGFILE}"
    run_command "cat /proc/version" "${LOGFILE}"
    run_command "lscpu" "${LOGFILE}"
    run_command "lscpu -e" "${LOGFILE}"
    run_command "cat /proc/cpuinfo" "${LOGFILE}"
    run_command "cat /sys/devices/system/cpu/isolated" "${LOGFILE}"
    run_command "ip addr show" "${LOGFILE}"
    run_command "lspci -nn" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    run_command "find /sys/kernel/iommu_groups/ -type l" "${LOGFILE}"

    # networking totals
    run_command "cat /proc/net/dev" "${LOGFILE}"

    run_command "dmidecode" "${LOGFILE}"

    # summary of scheduler tunable settings
    run_command "cat /proc/sched_debug | head -15" "${LOGFILE}"

    if [ "${SKIP_MASK}" = "true" ]; then
        run_command "facter | grep -iv '^ssh'" "${LOGFILE}"
    else
        run_command "facter" "${LOGFILE}"
    fi

    if [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then
        run_command "topology" "${LOGFILE}"
    fi

    # CPU C-state power info
    run_command "${IONICE_CMD} ${NICE_CMD} cpupower monitor" "${LOGFILE}"

    LOGFILE="${EXTRA_DIR}/memory.info"
    echo "${hostname}: Memory Info .......: ${LOGFILE}"

    run_command "cat /proc/meminfo" "${LOGFILE}"
    run_command "cat /sys/devices/system/node/node?/meminfo" "${LOGFILE}"
    run_command "log_slabinfo" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} ps -e -o ppid,pid,nlwp,rss:10,vsz:10,cmd --sort=-rss" "${LOGFILE}"

    # list open files
    run_command "timeout 60 ${IONICE_CMD} ${NICE_CMD} lsof -lwX" "${LOGFILE}"

    # hugepages numa mapping
    delimiter ${LOGFILE} "grep huge /proc/*/numa_maps"
    grep -e " huge " /proc/*/numa_maps >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG}

    LOGFILE="${EXTRA_DIR}/filesystem.info"
    echo "${hostname}: Filesystem Info ...: ${LOGFILE}"

    # rootfs and tmpfs usage
    run_command "df -h -H -T --local -t rootfs -t tmpfs" "${LOGFILE}"

    # disk space usage
    run_command "df -h -H -T --local -t ext2 -t ext3 -t ext4 -t xfs --total" "${LOGFILE}"

    # disk inodes usage
    run_command "df -h -H -T --local -i -t ext2 -t ext3 -t ext4 -t xfs --total" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # disks by-path values
    run_command "ls -lR /dev/disk" "${LOGFILE}"

    # disk summary (requires sudo/root)
    run_command "fdisk -l" "${LOGFILE}"
    run_command "cat /proc/scsi/scsi" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Controller specific stuff
    if [ "$nodetype" = "controller" ] ; then
        run_command "cat /proc/drbd" "${LOGFILE}"
        run_command "${IONICE_CMD} ${NICE_CMD} /sbin/drbdadm dump" "${LOGFILE}"
    fi

    # LVM summary
    run_command "/usr/sbin/vgs --version" "${LOGFILE}"
    run_command "/usr/sbin/pvs --version" "${LOGFILE}"
    run_command "/usr/sbin/lvs --version" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/vgs --all --options all" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/pvs --all --options all" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/lvs --all --options all" "${LOGFILE}"

    # iSCSI Information
    LOGFILE="${EXTRA_DIR}/iscsi.info"
    echo "${hostname}: iSCSI Info ........: ${LOGFILE}"

    if [ "$nodetype" = "controller" ] ; then
        # Controller - LIO exported initiators summary
        run_command "${IONICE_CMD} ${NICE_CMD} targetcli ls" "${LOGFILE}"
        # Controller - LIO sessions
        run_command "${IONICE_CMD} ${NICE_CMD} targetcli sessions detail" "${LOGFILE}"
    elif [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then
        # Worker - iSCSI initiator information
        collect_dir=${EXTRA_DIR}/iscsi_initiator_info
        mkdir -p ${collect_dir}
        cp -rf /run/iscsi-cache/nodes/* ${collect_dir}
        find ${collect_dir} -type d -exec chmod 750 {} \;
        # Worker - iSCSI initiator active sessions
        run_command "iscsiadm -m session" "${LOGFILE}"
        # Worker - iSCSI udev created nodes
        delimiter ${LOGFILE} "ls -la /dev/disk/by-path | grep \"iqn\""
        ls -la /dev/disk/by-path | grep "iqn" >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG}
    fi
    sleep ${COLLECT_RUNEXTRA_DELAY}

    LOGFILE="${EXTRA_DIR}/history.info"
    echo "${hostname}: Bash History ......: ${LOGFILE}"

    # history
    run_command "cat /home/sysadmin/.bash_history" "${LOGFILE}"

    LOGFILE="${EXTRA_DIR}/interrupt.info"
    echo "${hostname}: Interrupt Info ....: ${LOGFILE}"

    # interrupts
    run_command "${IONICE_CMD} ${NICE_CMD} cat /proc/interrupts" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}
    run_command "${IONICE_CMD} ${NICE_CMD} cat /proc/softirqs" "${LOGFILE}"

    # Controller specific stuff
    if [ "$nodetype" = "controller" ] ; then
        LOGFILE="${EXTRA_DIR}/netstat.info"
        run_command "${IONICE_CMD} ${NICE_CMD} netstat -pan" "${LOGFILE}"
    fi

    LOGFILE="${EXTRA_DIR}/blockdev.info"
    echo "${hostname}: Block Devices Info : ${LOGFILE}"

    # Collect block devices - show all sda and cinder devices, and size
    run_command "lsblk" "${LOGFILE}"

    # Collect block device topology - show devices and which io-scheduler
    run_command "${IONICE_CMD} ${NICE_CMD} lsblk --topology" "${LOGFILE}"

    # Collect SCSI devices - show devices and cinder attaches, etc
    run_command "${IONICE_CMD} ${NICE_CMD} lsblk --scsi" "${LOGFILE}"
}
log_space "before collect ......:"
collect_extra_start=$(date ${DATE_FORMAT})
echo "${collect_extra_start}: collect_extra start" >> ${COLLECT_CMD_TIMING_LOG}
collect_extra
collect_extra_stop=$(date ${DATE_FORMAT})
duration=$(get_time_delta "${collect_extra_start}" "${collect_extra_stop}")
echo "${collect_extra_stop}: ${duration} collect_extra done" >> ${COLLECT_CMD_TIMING_LOG}
collect_parts_start=$(date ${DATE_FORMAT})
echo "$(date ${DATE_FORMAT}): collect_parts start" >> ${COLLECT_CMD_TIMING_LOG}
collect_parts
collect_parts_stop=$(date ${DATE_FORMAT})
duration=$(get_time_delta "${collect_parts_start}" "${collect_parts_stop}")
echo "$(date ${DATE_FORMAT}): ${duration} collect_parts done" >> ${COLLECT_CMD_TIMING_LOG}
#
# Handle the collect-after and collect-range cases first,
# then handle collect-before in the elif clause.
#
VAR_LOG="/var/log"
rm -f ${VAR_LOG_INCLUDE_LIST}
# Dated collect defaults to 1 month of logs.
# Consider adding a check for how long the system has been provisioned
# and avoid running dated collect, which causes a CPU spike that can
# take up to 10 seconds or more even on newly provisioned systems.
echo "$(date ${DATE_FORMAT}): Date range start" >> ${COLLECT_CMD_TIMING_LOG}
if [ "${STARTDATE_RANGE}" == true ] ; then
if [ "${ENDDATE_RANGE}" == false ] ; then
ilog "collecting $VAR_LOG files containing logs after ${STARTDATE}"
${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
else
ilog "collecting $VAR_LOG files containing logs between ${STARTDATE} and ${ENDDATE}"
${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
fi
elif [ "${ENDDATE_RANGE}" == true ] ; then
STARTDATE="20130101"
ilog "collecting $VAR_LOG files containing logs before ${ENDDATE}"
${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
else
ilog "collecting all of $VAR_LOG"
${IONICE_CMD} ${NICE_CMD} find $VAR_LOG ! -empty > ${VAR_LOG_INCLUDE_LIST}
fi
echo "$(date ${DATE_FORMAT}): Date range stop" >> ${COLLECT_CMD_TIMING_LOG}
sleep ${COLLECT_RUNEXTRA_DELAY}
# collect the www lighttpd logs if they exist and are not empty.
# note: The lighttpd logs don't have the date in the right place.
# So there is no point in passing those potentially large
# files through the dated collect; it would just waste time.
# Add that path in the raw form after.
WWW_LOG="www/var/log"
if [ -e "/var/${WWW_LOG}" ] ; then
find "/var/${WWW_LOG}" ! -empty >> ${VAR_LOG_INCLUDE_LIST}
elif [ -e "/${WWW_LOG}" ] ; then
find "/${WWW_LOG}" ! -empty >> ${VAR_LOG_INCLUDE_LIST}
fi
# Filter any dirs or files in the exclude list from the include list
remove_common_paths "${VAR_LOG_INCLUDE_LIST}" "${VAR_LOG_EXCLUDE}"
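# ${VAR_LOG_INCLUDE_LIST} is now final: one pathname per line, consumed by
# tar below through its -T option when the first host tarball is built.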
# Add VM console.log
for i in /var/lib/nova/instances/*/console.log; do
    if [ -e "$i" ]; then
        tmp=`dirname $i`
        mkdir -p ${COLLECT_NAME_DIR}/$tmp
        cp $i ${COLLECT_NAME_DIR}/$tmp
    fi
done
# Add Rook-ceph logs
for i in /var/lib/ceph/data/rook-ceph/log/*; do
    if [ -e "$i" ]; then
        rook_ceph_log_dir="var/log/rook-ceph"
        mkdir -p ${COLLECT_NAME_DIR}/$rook_ceph_log_dir
        cp $i ${COLLECT_NAME_DIR}/$rook_ceph_log_dir
    fi
done
# Add kubelet config state files
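# Note: the copy below is flat (no -r), so only the top-level regular files
# in /var/lib/kubelet are captured - typically state and config files such
# as cpu_manager_state, memory_manager_state, config.yaml and
# kubeadm-flags.env - while subdirectories (pods/, plugins/, etc.) are
# skipped.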
for i in /var/lib/kubelet/*; do
    if [ -e "$i" ]; then
        kube_config_dir="var/lib/kubelet"
        mkdir -p ${COLLECT_NAME_DIR}/$kube_config_dir
        cp $i ${COLLECT_NAME_DIR}/$kube_config_dir
    fi
done
sleep ${COLLECT_RUNEXTRA_DELAY}
echo "$(date +'%H:%M:%S.%3N'): Running host tars" >> ${COLLECT_CMD_TIMING_LOG}
log_space "before first tar ....:"
(cd ${COLLECT_NAME_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar -T ${VAR_LOG_INCLUDE_LIST} -X ${RUN_EXCLUDE} -X ${ETC_EXCLUDE} -X ${VAR_LOG_EXCLUDE} ${COLLECT_INCLUDE} ${CHECKPOINT_CMD} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )
log_space "after first tar .....:"
(cd ${COLLECT_NAME_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${UNTAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar ${CHECKPOINT_CMD} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )
log_space "after first untar ...:"
rm -f ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar
log_space "after delete tar ....:"
if [ "${SKIP_MASK}" != "true" ]; then
# Run password masking before final tar
dlog "running /usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR}"
/usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR}
log_space "after passwd masking :"
fi
if [ "${OMIT_CERTS}" != "true" ]; then
# Collect certificates from the host
dlog "running /usr/local/sbin/collect_certificates ${EXTRA_DIR}"
COLLECT_ERROR_LOG="$COLLECT_ERROR_LOG" \
/usr/local/sbin/collect_certificates ${EXTRA_DIR}
log_space "after certificates ..:"
fi
mkdir -p ${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH}
(cd /${FLIGHT_RECORDER_PATH} ; ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH}/${FLIGHT_RECORDER_FILE}.tgz ./${FLIGHT_RECORDER_FILE} ${CHECKPOINT_CMD} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG})
# save the collect.log file to this host's tarball
cp -a ${COLLECT_ERROR_LOG} ${COLLECT_NAME_DIR}/${COLLECT_LOG}
cp -a ${COLLECT_CMD_TIMING_LOG} "${COLLECT_NAME_DIR}/${HOST_COLLECT_CMD_TIMING_LOG}"
log_space "with flight data ....:"
(cd ${COLLECT_BASE_DIR} ; ${IONICE_CMD} ${NICE_CMD} ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}.tgz ${COLLECT_NAME} ${CHECKPOINT_CMD} 2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )
log_space "after collect .......:"
echo "$(date +'%H:%M:%S.%3N'): Finished host tars" >> ${COLLECT_CMD_TIMING_LOG}
rm -rf ${COLLECT_NAME_DIR}
rm -f ${VAR_LOG_INCLUDE_LIST}
log_space "after cleanup .......:"
# Check for collect errors.
# Only the out-of-space error is enough to fail this host's collect.
collect_errors "${HOSTNAME}"
RC=${?}
rm -f ${COLLECT_ERROR_LOG}
if [ ${RC} -ne 0 ] ; then
    rm -f ${COLLECT_NAME_DIR}.tgz
    if [ "${REMOTE_HOST}" = true ] ; then
        ilog "${FAIL_OUT_OF_SPACE_REMOTE_STR} ${COLLECT_BASE_DIR}"
    else
        ilog "${FAIL_OUT_OF_SPACE_STR} ${COLLECT_BASE_DIR}"
    fi
else
    ilog "collect of ${COLLECT_NAME_DIR}.tgz succeeded"
    echo "${collect_done}"
fi

dlog "collect_host exit code: ${RC}"
exit ${RC}