#! /bin/bash
########################################################################
#
# Copyright (c) 2016-2022, 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################

# make these platform.conf variables global.
# values are loaded in source_openrc_if_needed.
export nodetype=""
export subfunction=""
export system_type=""
export security_profile=""
export sdn_enabled=""
export region_config=""
export vswitch_type=""
export system_mode=""
export sw_version=""

# assume this is not the active controller until learned
export ACTIVE=false

#
# Import commands, variables and convenience functions available to
# all collectors ; common and user defined.
#
source /usr/local/sbin/collect_utils
source_openrc_if_needed

#
# parse input parameters
#
# Calling parms
#
# 1 = collect name
# 2 = start date option
# 3 = start date
# 4 = "any" (ignored - no longer used ; kept to support upgrades/downgrades)
# 5 = end date option
# 6 = end date
# 7 = "any" (ignored - no longer used ; kept to support upgrades/downgrades)
# 8 = debug mode
# 9 = inventory
#
COLLECT_NAME="${1}"
DEBUG=${8}
INVENTORY=${9}

# DEBUG is deliberately left unquoted so that an empty value passes no
# argument at all to set_debug_mode.
set_debug_mode ${DEBUG}

# '-d' when debug mode is on, otherwise empty.
expect_debug=""
if [[ "${DEBUG}" == true ]] ; then
    expect_debug="-d"
fi

logger -t ${COLLECT_TAG} "${0} ${1} ${2} ${3} ${4} ${5} ${6} ${7} ${8} ${9}"

# parse out the start date/time if it is present
# A date is accepted only when it follows the expected option string,
# is not the literal "any" and is longer than 7 characters.
# [[ ]] with && is used instead of '[ ... -a ... ]' so that a date
# operand that looks like a test operator cannot be mis-parsed.
STARTDATE_RANGE=false
STARTDATE="any"
if [[ "${2}" == "${STARTDATE_OPTION}" && "${3}" != "any" && ${#3} -gt 7 ]] ; then
    STARTDATE_RANGE=true
    STARTDATE="${3}"
fi

# parse out the end date/time if it is present (same rules as above)
ENDDATE_RANGE=false
ENDDATE="any"
if [[ "${5}" == "${ENDDATE_OPTION}" && "${6}" != "any" && ${#6} -gt 7 ]] ; then
    ENDDATE_RANGE=true
    ENDDATE="${6}"
fi

# Work area layout ; everything is staged under COLLECT_NAME_DIR
COLLECT_BASE_DIR="/scratch"
EXTRA="var/extra"
COLLECT_NAME_DIR="${COLLECT_BASE_DIR}/${COLLECT_NAME}"
EXTRA_DIR="${COLLECT_NAME_DIR}/${EXTRA}"
TARBALL="${COLLECT_NAME_DIR}.tgz"
COLLECT_PATH="/etc/collect.d"
# Exclude lists handed to tar, and the paths that are always collected
RUN_EXCLUDE="/etc/collect/run.exclude"
ETC_EXCLUDE="/etc/collect/etc.exclude"
VAR_LOG_EXCLUDE="/etc/collect/varlog.exclude"
COLLECT_INCLUDE="/var/run /etc /root"
FLIGHT_RECORDER_PATH="var/lib/sm/"
FLIGHT_RECORDER_FILE="sm.eru.v1"
VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst"
COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}"
COLLECT_DATE="/usr/local/sbin/collect_date"
COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv"

rm -f ${COLLECT_CMD_TIMING_LOG}

#######################################
# Log the current usage of the collect base directory.
#
# Runs COLLECT_DIR_USAGE_CMD, drops the line containing "Filesystem"
# (the df header) and passes what is left to ilog, prefixed with
# COLLECT_BASE_DIR and the caller's label.
#
# Globals:   COLLECT_DIR_USAGE_CMD, COLLECT_BASE_DIR (read)
# Arguments: $1 - label describing the point in the collect sequence
#######################################
function log_space()
{
    local msg="${1}"
    local usage

    # COLLECT_DIR_USAGE_CMD is intentionally unquoted ; it holds a
    # command plus its arguments and must be word-split.
    usage=$(${COLLECT_DIR_USAGE_CMD} | grep -v Filesystem)
    ilog "${COLLECT_BASE_DIR} ${msg} ${usage}"
}

space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR}

CURR_DIR=$(pwd)
mkdir -p "${COLLECT_NAME_DIR}"
cd "${COLLECT_NAME_DIR}"

# create dump target extra-stuff directory
mkdir -p "${EXTRA_DIR}"

RETVAL=0

# Remove any previous collect error log.
# Start this collect with an empty file.
#
# stderr is directed to this log during the collect process.
# By searching this log after collect_host is run we can find
# errors that occurred during collect.
# The only real error that we care about right now is the
#
#    "No space left on device" error
#
rm -f ${COLLECT_ERROR_LOG}
touch ${COLLECT_ERROR_LOG}
chmod 644 ${COLLECT_ERROR_LOG}
echo "$(date '+%F %T') :${COLLECT_NAME_DIR}" > ${COLLECT_ERROR_LOG}

ilog "creating local collect tarball ${COLLECT_NAME_DIR}.tgz"

################################################################################
# Run collect scripts to check system status
################################################################################

#######################################
# Run every regular file found directly under COLLECT_PATH as a plugin.
#
# Each plugin is invoked through IONICE_CMD / NICE_CMD with the collect
# directory, the extra directory and the hostname ; the plugin whose path
# equals COLLECT_SYSINV additionally receives INVENTORY.  The stop time
# and duration of each plugin are appended to COLLECT_CMD_TIMING_LOG and
# the loop sleeps COLLECT_RUNPARTS_DELAY between plugins.
#
# Globals:   COLLECT_PATH, COLLECT_SYSINV, COLLECT_NAME_DIR, EXTRA_DIR,
#            hostname, INVENTORY, DATE_FORMAT, COLLECT_CMD_TIMING_LOG,
#            COLLECT_RUNPARTS_DELAY, IONICE_CMD, NICE_CMD (read)
# Arguments: none
#######################################
function collect_parts()
{
    local plugin start stop duration

    [ -d "${COLLECT_PATH}" ] || return 0

    for plugin in "${COLLECT_PATH}"/*; do
        # skips sub-directories and the literal pattern left by an
        # unmatched glob
        [ -f "${plugin}" ] || continue

        start=$(date ${DATE_FORMAT})
        # The plugin arguments stay unquoted on purpose so that an empty
        # value (e.g. INVENTORY) is dropped rather than passed as "".
        if [ "${plugin}" = "${COLLECT_SYSINV}" ]; then
            ${IONICE_CMD} ${NICE_CMD} "${plugin}" ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname} ${INVENTORY}
        else
            ${IONICE_CMD} ${NICE_CMD} "${plugin}" ${COLLECT_NAME_DIR} ${EXTRA_DIR} ${hostname}
        fi
        stop=$(date ${DATE_FORMAT})
        duration=$(get_time_delta "${start}" "${stop}")
        echo "${stop}: ${duration} Plugin ${plugin}" >> "${COLLECT_CMD_TIMING_LOG}"

        # Add delay between parts ; i.e. collect plugins
        sleep ${COLLECT_RUNPARTS_DELAY}
    done
}

#######################################
# Collect host level diagnostics into a set of *.info files under
# EXTRA_DIR: process, host, memory, filesystem, iscsi, history,
# interrupt, netstat (controller only) and blockdev.
#
# Most commands go through run_command with the current LOGFILE ;
# presumably run_command appends the command output to that file
# (defined outside this file - confirm in collect_utils).
# LOGFILE is left as a global, as in the original implementation.
#
# Globals:   EXTRA_DIR, hostname, nodetype, subfunction, SKIP_MASK,
#            COLLECT_RUNEXTRA_DELAY, COLLECT_ERROR_LOG, IONICE_CMD,
#            NICE_CMD, PROCESS_DETAIL_CMD, BUILD_INFO_CMD (read)
#            LOGFILE (written)
# Arguments: none
#######################################
function collect_extra()
{
    local collect_dir

    # dump process lists
    LOGFILE="${EXTRA_DIR}/process.info"
    echo "${hostname}: Process Info ......: ${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} ${PROCESS_DETAIL_CMD}" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Collect process and thread info (tree view)
    run_command "${IONICE_CMD} ${NICE_CMD} pstree --arguments --ascii --long --show-pids" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Collect process, thread and scheduling info.
    # (also gets process 'affinity' which is useful on workers)
    # Only run when ps-sched.sh can be found on this host.
    if command -v ps-sched.sh >/dev/null 2>&1; then
        run_command "${IONICE_CMD} ${NICE_CMD} ps-sched.sh" "${LOGFILE}"
        sleep ${COLLECT_RUNEXTRA_DELAY}
    fi

    # Collect per kubernetes container name, QoS, and cpusets per numa node
    run_command "${IONICE_CMD} ${NICE_CMD} kube-cpusets" "${LOGFILE}"

    # For debugging pods and cgroups, etc
    run_command "sudo LANG=POSIX systemd-cgls cpu -k -l" "${LOGFILE}"

    # Various host attributes
    LOGFILE="${EXTRA_DIR}/host.info"
    echo "${hostname}: Host Info .........: ${LOGFILE}"

    # StarlingX build info
    run_command "${BUILD_INFO_CMD}" "${LOGFILE}"

    run_command "timedatectl" "${LOGFILE}"
    run_command "uptime" "${LOGFILE}"
    run_command "cat /proc/cmdline" "${LOGFILE}"
    run_command "cat /proc/version" "${LOGFILE}"
    run_command "lscpu" "${LOGFILE}"
    run_command "lscpu -e" "${LOGFILE}"
    run_command "cat /proc/cpuinfo" "${LOGFILE}"
    run_command "cat /sys/devices/system/cpu/isolated" "${LOGFILE}"
    run_command "ip addr show" "${LOGFILE}"
    run_command "lspci -nn" "${LOGFILE}"

    sleep ${COLLECT_RUNEXTRA_DELAY}

    run_command "find /sys/kernel/iommu_groups/ -type l" "${LOGFILE}"

    # networking totals
    run_command "cat /proc/net/dev" "${LOGFILE}"

    run_command "dmidecode" "${LOGFILE}"

    # summary of scheduler tunable settings
    run_command "cat /proc/sched_debug | head -15" "${LOGFILE}"

    # when SKIP_MASK is "true" the facter lines starting with 'ssh'
    # are filtered out here
    if [ "${SKIP_MASK}" = "true" ]; then
        run_command "facter | grep -iv '^ssh'" "${LOGFILE}"
    else
        run_command "facter" "${LOGFILE}"
    fi

    if [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then
        run_command "topology" "${LOGFILE}"
    fi

    # CPU C-state power info
    run_command "${IONICE_CMD} ${NICE_CMD} cpupower monitor" "${LOGFILE}"

    LOGFILE="${EXTRA_DIR}/memory.info"
    echo "${hostname}: Memory Info .......: ${LOGFILE}"
    run_command "cat /proc/meminfo" "${LOGFILE}"
    run_command "cat /sys/devices/system/node/node?/meminfo" "${LOGFILE}"
    run_command "log_slabinfo" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} ps -e -o ppid,pid,nlwp,rss:10,vsz:10,cmd --sort=-rss" "${LOGFILE}"

    # list open files
    run_command "timeout 60 ${IONICE_CMD} ${NICE_CMD} lsof -lwX" "${LOGFILE}"

    # hugepages numa mapping
    # run directly rather than through run_command ; the delimiter call
    # is passed the command text as the label for this section
    delimiter "${LOGFILE}" "grep huge /proc/*/numa_maps"
    grep -e " huge " /proc/*/numa_maps >> "${LOGFILE}" 2>>${COLLECT_ERROR_LOG}

    LOGFILE="${EXTRA_DIR}/filesystem.info"
    echo "${hostname}: Filesystem Info ...: ${LOGFILE}"

    # rootfs and tmpfs usage
    run_command "df -h -H -T --local -t rootfs -t tmpfs" "${LOGFILE}"

    # disk space usage
    run_command "df -h -H -T --local -t ext2 -t ext3 -t ext4 -t xfs --total" "${LOGFILE}"

    # disk inodes usage
    run_command "df -h -H -T --local -i -t ext2 -t ext3 -t ext4 -t xfs --total" "${LOGFILE}"

    sleep ${COLLECT_RUNEXTRA_DELAY}

    # disks by-path values
    run_command "ls -lR /dev/disk" "${LOGFILE}"

    # disk summary (requires sudo/root)
    run_command "fdisk -l" "${LOGFILE}"

    run_command "cat /proc/scsi/scsi" "${LOGFILE}"

    sleep ${COLLECT_RUNEXTRA_DELAY}

    # Controller specific stuff
    if [ "$nodetype" = "controller" ] ; then
        run_command "cat /proc/drbd" "${LOGFILE}"
        run_command "${IONICE_CMD} ${NICE_CMD} /sbin/drbdadm dump" "${LOGFILE}"
    fi

    # LVM summary
    run_command "/usr/sbin/vgs --version" "${LOGFILE}"
    run_command "/usr/sbin/pvs --version" "${LOGFILE}"
    run_command "/usr/sbin/lvs --version" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/vgs --all --options all" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/pvs --all --options all" "${LOGFILE}"
    run_command "${IONICE_CMD} ${NICE_CMD} /usr/sbin/lvs --all --options all" "${LOGFILE}"

    # iSCSI Information
    LOGFILE="${EXTRA_DIR}/iscsi.info"
    echo "${hostname}: iSCSI Info ........: ${LOGFILE}"

    if [ "$nodetype" = "controller" ] ; then
        # Controller- LIO exported initiators summary
        run_command "${IONICE_CMD} ${NICE_CMD} targetcli ls" "${LOGFILE}"

        # Controller - LIO sessions
        run_command "${IONICE_CMD} ${NICE_CMD} targetcli sessions detail" "${LOGFILE}"

    elif [[ "$nodetype" == "worker" || "$subfunction" == *"worker"* ]] ; then
        # Worker - iSCSI initiator information
        collect_dir="${EXTRA_DIR}/iscsi_initiator_info"
        mkdir -p "${collect_dir}"
        cp -rf /run/iscsi-cache/nodes/* "${collect_dir}"
        find "${collect_dir}" -type d -exec chmod 750 {} \;

        # Worker - iSCSI initiator active sessions
        run_command "iscsiadm -m session" "${LOGFILE}"

        # Worker - iSCSI udev created nodes
        delimiter "${LOGFILE}" "ls -la /dev/disk/by-path | grep \"iqn\""
        ls -la /dev/disk/by-path | grep "iqn" >> "${LOGFILE}" 2>>${COLLECT_ERROR_LOG}
    fi

    sleep ${COLLECT_RUNEXTRA_DELAY}

    LOGFILE="${EXTRA_DIR}/history.info"
    echo "${hostname}: Bash History ......: ${LOGFILE}"

    # history
    run_command "cat /home/sysadmin/.bash_history" "${LOGFILE}"

    LOGFILE="${EXTRA_DIR}/interrupt.info"
    echo "${hostname}: Interrupt Info ....: ${LOGFILE}"

    # interrupts
    run_command "${IONICE_CMD} ${NICE_CMD} cat /proc/interrupts" "${LOGFILE}"
    sleep ${COLLECT_RUNEXTRA_DELAY}
    run_command "${IONICE_CMD} ${NICE_CMD} cat /proc/softirqs" "${LOGFILE}"

    # Controller specific stuff
    if [ "$nodetype" = "controller" ] ; then
        LOGFILE="${EXTRA_DIR}/netstat.info"
        run_command "${IONICE_CMD} ${NICE_CMD} netstat -pan" "${LOGFILE}"
    fi

    LOGFILE="${EXTRA_DIR}/blockdev.info"
    echo "${hostname}: Block Devices Info : ${LOGFILE}"

    # Collect block devices - show all sda and cinder devices, and size
    run_command "lsblk" "${LOGFILE}"

    # Collect block device topology - show devices and which io-scheduler
    run_command "${IONICE_CMD} ${NICE_CMD} lsblk --topology" "${LOGFILE}"

    # Collect SCSI devices - show devices and cinder attaches, etc
    run_command "${IONICE_CMD} ${NICE_CMD} lsblk --scsi" "${LOGFILE}"
}

log_space "before collect ......:"

# Time the host info collection as a whole
collect_extra_start=$(date ${DATE_FORMAT})
echo "${collect_extra_start}: collect_extra start" >> ${COLLECT_CMD_TIMING_LOG}
collect_extra
collect_extra_stop=$(date ${DATE_FORMAT})
duration=$(get_time_delta "${collect_extra_start}" "${collect_extra_stop}")
echo "${collect_extra_stop}: ${duration} collect_extra done" >> ${COLLECT_CMD_TIMING_LOG}

# Time the plugin run as a whole
collect_parts_start=$(date ${DATE_FORMAT})
echo "$(date ${DATE_FORMAT}): collect_parts start" >> ${COLLECT_CMD_TIMING_LOG}
collect_parts
collect_parts_stop=$(date ${DATE_FORMAT})
duration=$(get_time_delta "${collect_parts_start}" "${collect_parts_stop}")
echo "$(date ${DATE_FORMAT}): ${duration} collect_parts done" >> ${COLLECT_CMD_TIMING_LOG}

#
# handle collect collect-after and collect-range and then
# in elif clause collect-before
#

VAR_LOG="/var/log"
rm -f "${VAR_LOG_INCLUDE_LIST}"

# Dated collect defaults to 1 month of logs.
# Consider adding a check for how long the system has been provisioned
# and avoid running dated collect, which causes a CPU spike that can
# take up to 10 seconds or more even on newly provisioned systems.
#
# In the dated cases COLLECT_DATE is handed the include list path and
# presumably fills it in ; otherwise every non-empty path under VAR_LOG
# is listed. STARTDATE, ENDDATE and DEBUG stay unquoted so the argument
# list matches what COLLECT_DATE has always been given.
echo "$(date ${DATE_FORMAT}): Date range start" >> ${COLLECT_CMD_TIMING_LOG}
if [ "${STARTDATE_RANGE}" == true ] ; then
    if [ "${ENDDATE_RANGE}" == false ] ; then
        ilog "collecting $VAR_LOG files containing logs after ${STARTDATE}"
        ${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
    else
        ilog "collecting $VAR_LOG files containing logs between ${STARTDATE} and ${ENDDATE}"
        ${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
    fi
elif [ "${ENDDATE_RANGE}" == true ] ; then
    # no start date given ; use a fixed early start date
    STARTDATE="20130101"
    ilog "collecting $VAR_LOG files containing logs before ${ENDDATE}"
    ${IONICE_CMD} ${NICE_CMD} ${COLLECT_DATE} ${STARTDATE} ${ENDDATE} ${VAR_LOG_INCLUDE_LIST} ${DEBUG} ""
else
    ilog "collecting all of $VAR_LOG"
    ${IONICE_CMD} ${NICE_CMD} find $VAR_LOG ! -empty > "${VAR_LOG_INCLUDE_LIST}"
fi
echo "$(date ${DATE_FORMAT}): Date range stop" >> ${COLLECT_CMD_TIMING_LOG}

sleep ${COLLECT_RUNEXTRA_DELAY}

# collect the www lighttpd logs if they exist and are not empty.
# note: The lighttpd logs don't have the date in the right place.
#       So there is no point in passing those potentially large
#       files through the dated collect; it would just waste time.
#       Add that path in the raw form after.
WWW_LOG="www/var/log"
if [ -e "/var/${WWW_LOG}" ] ; then
    find "/var/${WWW_LOG}" ! -empty >> "${VAR_LOG_INCLUDE_LIST}"
elif [ -e "/${WWW_LOG}" ] ; then
    find "/${WWW_LOG}" ! -empty >> "${VAR_LOG_INCLUDE_LIST}"
fi

# Filter any dirs or files in the exclude list from the include list
remove_common_paths "${VAR_LOG_INCLUDE_LIST}" "${VAR_LOG_EXCLUDE}"

# Add VM console.log
# The source directory structure is recreated under COLLECT_NAME_DIR.
for i in /var/lib/nova/instances/*/console.log; do
    if [ -e "$i" ]; then
        tmp=$(dirname "$i")
        mkdir -p "${COLLECT_NAME_DIR}/$tmp"
        cp "$i" "${COLLECT_NAME_DIR}/$tmp"
    fi
done

# Add Rook-ceph logs
for i in /var/lib/ceph/data/rook-ceph/log/*; do
    if [ -e "$i" ]; then
        rook_ceph_log_dir="var/log/rook-ceph"
        mkdir -p "${COLLECT_NAME_DIR}/$rook_ceph_log_dir"
        cp "$i" "${COLLECT_NAME_DIR}/$rook_ceph_log_dir"
    fi
done

# Add kubelet config state files
# NOTE(review): cp is used without -r, so sub-directories of
# /var/lib/kubelet are not copied ; presumably intentional - confirm.
for i in /var/lib/kubelet/*; do
    if [ -e "$i" ]; then
        kube_config_dir="var/lib/kubelet"
        mkdir -p "${COLLECT_NAME_DIR}/$kube_config_dir"
        cp "$i" "${COLLECT_NAME_DIR}/$kube_config_dir"
    fi
done

sleep ${COLLECT_RUNEXTRA_DELAY}

echo "$(date +'%H:%M:%S.%3N'): Running host tars" >> ${COLLECT_CMD_TIMING_LOG}

log_space "before first tar ....:"

# Stage the included paths into a temporary tar and unpack it inside
# COLLECT_NAME_DIR. stdout and stderr of both steps go to the collect
# error log. COLLECT_INCLUDE is unquoted on purpose ; it is a list.
(cd ${COLLECT_NAME_DIR} ; \
    ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar \
        -T ${VAR_LOG_INCLUDE_LIST} \
        -X ${RUN_EXCLUDE} -X ${ETC_EXCLUDE} -X ${VAR_LOG_EXCLUDE} \
        ${COLLECT_INCLUDE} ${CHECKPOINT_CMD} \
        2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )

log_space "after first tar .....:"

(cd ${COLLECT_NAME_DIR} ; \
    ${IONICE_CMD} ${NICE_CMD} ${UNTAR_CMD} ${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar \
        ${CHECKPOINT_CMD} \
        2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )

log_space "after first untar ...:"

rm -f "${COLLECT_NAME_DIR}/${COLLECT_NAME}.tar"

log_space "after delete tar ....:"

if [ "${SKIP_MASK}" != "true" ]; then
    # Run password masking before final tar
    dlog "running /usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR}"
    /usr/local/sbin/collect_mask_passwords ${COLLECT_NAME_DIR} ${EXTRA_DIR}
    log_space "after passwd masking :"
fi

if [ "${OMIT_CERTS}" != "true" ]; then
    # Collect certificates from the host
    dlog "running /usr/local/sbin/collect_certificates ${EXTRA_DIR}"
    # hand the error log path to the child through its environment
    COLLECT_ERROR_LOG="$COLLECT_ERROR_LOG" \
        /usr/local/sbin/collect_certificates ${EXTRA_DIR}
    log_space "after certificates ..:"
fi

# Archive the flight recorder file into the collect tree
mkdir -p "${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH}"

(cd /${FLIGHT_RECORDER_PATH} ; \
    ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}/${FLIGHT_RECORDER_PATH}/${FLIGHT_RECORDER_FILE}.tgz \
        ./${FLIGHT_RECORDER_FILE} ${CHECKPOINT_CMD} \
        2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG})

# save the collect.log file to this host's tarball
cp -a ${COLLECT_ERROR_LOG} ${COLLECT_NAME_DIR}/${COLLECT_LOG}
cp -a ${COLLECT_CMD_TIMING_LOG} "${COLLECT_NAME_DIR}/${HOST_COLLECT_CMD_TIMING_LOG}"

log_space "with flight data ....:"

# Build the final compressed tarball from the staged directory
(cd ${COLLECT_BASE_DIR} ; \
    ${IONICE_CMD} ${NICE_CMD} ${TAR_ZIP_CMD} ${COLLECT_NAME_DIR}.tgz ${COLLECT_NAME} \
        ${CHECKPOINT_CMD} \
        2>>${COLLECT_ERROR_LOG} 1>>${COLLECT_ERROR_LOG} )

log_space "after collect .......:"

echo "$(date +'%H:%M:%S.%3N'): Finished host tars" >> ${COLLECT_CMD_TIMING_LOG}

rm -rf "${COLLECT_NAME_DIR}"
rm -f "${VAR_LOG_INCLUDE_LIST}"

log_space "after cleanup .......:"

# Check for collect errors
# Only out of space error is enough to fail this host's collect
collect_errors "${HOSTNAME}"
RC=${?}

rm -f ${COLLECT_ERROR_LOG}

if [ ${RC} -ne 0 ] ; then
    # a failed collect must not leave a tarball behind
    rm -f "${COLLECT_NAME_DIR}.tgz"
    if [ "${REMOTE_HOST}" = true ] ; then
        ilog "${FAIL_OUT_OF_SPACE_REMOTE_STR} ${COLLECT_BASE_DIR}"
    else
        ilog "${FAIL_OUT_OF_SPACE_STR} ${COLLECT_BASE_DIR}"
    fi
else
    ilog "collect of ${COLLECT_NAME_DIR}.tgz succeeded"
    echo "${collect_done}"
fi

# Exit with the status captured from collect_errors. The previous code
# used ${rc}, which is never assigned in this file ; 'exit' with an
# empty operand returns the status of the preceding command, so the
# failure was not propagated to the caller.
dlog "collect_host exit code: ${RC}"
exit ${RC}