Eric MacDonald 8841bceb80 Make collectd plugins use FM API V2
Using FM API V2 allows the collectd plugins to distinguish
between FM connection failures and alarm queries that simply
return no existing alarm on process startup, as well as to
detect failures to clear or assert alarms during runtime, so
that such actions can be retried on the next audit interval.

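The following is a minimal, illustrative sketch of that retry
pattern, not the actual plugin code. It assumes an
fm_api.FaultAPIsV2 style client whose calls raise an exception on
FM connection failure rather than returning an ambiguous None; the
method name, alarm id and helper below are assumptions made for
this example only.

  import collectd
  from fm_api import fm_api

  fm = fm_api.FaultAPIsV2()  # assumed V2 client ; raises instead of masking
  pending_clears = []        # entity ids whose clear failed ; retried later

  def try_clear(alarm_id, eid):
      """Attempt one alarm clear ; queue it for a retry on failure."""
      try:
          fm.clear_fault(alarm_id, eid)
          return True
      except Exception as e:
          collectd.error("clear of %s failed ; retry next audit (%s)" %
                         (eid, str(e)))
          if eid not in pending_clears:
              pending_clears.append(eid)
          return False

  def read_func():
      """Audit callback ; retry previously failed clears before new work."""
      for eid in list(pending_clears):
          if try_clear('100.103', eid):  # '100.103' is a placeholder id
              pending_clears.remove(eid)
      # ... normal sampling and threshold handling would follow ...
      return 0
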
Using the V2 API makes the plugins more robust in their alarm
management and avoids leaving stuck alarms, which fixes the
following three reported stuck-alarm bugs.

Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1802535
Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1813974
Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1814944

Additional improvements were made to each plugin to handle
failure paths better with the V2 API.

Additional changes made by this update include:

1. Fixed stale unmounted filesystem alarm handling
2. Percent usage alarm actual readings are now updated on change
3. Fixed threshold values
4. Added 2 decimal point resolution to % usage alarm text
   (see the snippet following this list)
5. Added commented-out FIT code to the mem, cpu and df plugins
6. Reversed the True/False return polarity in interface plugin functions

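For items 2 and 4, the effect is that the alarm reason text is
regenerated from the current reading, formatted to two decimal
places, whenever the value changes, roughly as in this hypothetical
snippet (the 'resource', 'threshold' and 'value' names are
illustrative, not the plugin's actual variables):

  # hypothetical refresh of the alarm reason text from the latest reading
  reason = ("%s threshold exceeded ; threshold %2.2f%%, actual %2.2f%%" %
            (resource, threshold, value))
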
Test Plan:

Regression:
PASS: normal alarm handling with FM V2 API ; process startup
PASS: normal alarm handling with FM V2 API ; runtime alarm assert
PASS: normal alarm handling with FM V2 API ; runtime alarm clear

PASS: Verify alarms for unmounted filesystems get automatically cleared
PASS: Verify interface alarm/clear operation

Robustness:
PASS: Verify general startup behavior of all plugins while FM is not
      running, with FM only starting at some later time.
PASS: Verify alarm handling over process startup with existing
      cpu alarms while FM not running.
PASS: Verify alarm handling over process startup with existing
      mem alarms while FM not running.
PASS: Verify alarm handling over process startup with existing
      df alarms while FM not running.

PASS: Verify runtime cpu plugin alarm assertion retry handling
PASS: Verify runtime cpu plugin alarm clear retry handling
PASS: Verify runtime cpu plugin handling over process restart
PASS: Verify alarm handling over process startup with existing
      cpu alarms while FM initially not running and then started.

PASS: Verify runtime mem plugin alarm assertion retry handling
PASS: Verify runtime mem plugin alarm clear retry handling
PASS: Verify runtime mem plugin handling over process restart
PASS: Verify alarm handling over process startup with existing
      mem alarms while FM initially not running and then started.

PASS: Verify runtime df plugin alarm assertion retry handling
PASS: Verify runtime df plugin alarm clear retry handling
PASS: Verify runtime df plugin handling over process restart
PASS: Verify alarm handling over process startup with existing
      df alarms while FM initially not running and then started.

PASS: Verify alarm set/clear threshold boundaries for cpu plugin
PASS: Verify alarm set/clear threshold boundaries for memory plugin
PASS: Verify alarm set/clear threshold boundaries for df plugin

New Features:
PASS: Verify percent usage alarms are refreshed with current value
PASS: Verify percent usage alarms show two decimal points
      e.g. ... threshold exceeded ; threshold 80.00%, actual 80.33%

Change-Id: Ibe173617d11c17bdc4b41115e25bd8c18b49807e
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

#
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This file is the collectd 'Platform Memory Usage' Monitor.
#
# The Platform Memory Usage is calculated as the percentage of platform
# memory used (AnonPages) relative to platform available memory or, when
# the kernel is in strict overcommit mode, as Committed_AS relative to
# CommitLimit.
#
# Init Function:
#  - query/store the hostname and the kernel memory overcommit ('strict')
#    setting
#
############################################################################
import os
import collectd

debug = False

PLUGIN = 'platform memory usage'
PLUGIN_NUMA = 'numa memory usage'
PLUGIN_HUGE = 'hugepage memory usage'


# Memory usage control class
class MEM:
    hostname = ""            # hostname for sample notification message
    cmd = '/proc/meminfo'    # the query command
    value = float(0.0)       # float value of memory usage

    # meminfo values we care about
    memTotal_kB = 0
    memFree_kB = 0
    buffers = 0
    cached = 0
    SReclaimable = 0
    CommitLimit = 0
    Committed_AS = 0
    HugePages_Total = 0
    HugePages_Free = 0
    Hugepagesize = 0
    AnonPages = 0
    FilePages = 0

    # derived values
    avail = 0
    total = 0
    strict = 0


# Instantiate the class
obj = MEM()


def log_meminfo(plugin, name, meminfo):
    """Log the supplied meminfo"""

    if debug is False:
        return

    collectd.info("%s %s" % (plugin, name))
    collectd.info("%s ---------------------------" % plugin)
    collectd.info("%s memTotal_kB    : %f" % (plugin, meminfo.memTotal_kB))
    collectd.info("%s memFree_kB     : %f" % (plugin, meminfo.memFree_kB))
    collectd.info("%s Buffers        : %f" % (plugin, meminfo.buffers))
    collectd.info("%s Cached         : %f" % (plugin, meminfo.cached))
    collectd.info("%s SReclaimable   : %f" % (plugin, meminfo.SReclaimable))
    collectd.info("%s CommitLimit    : %f" % (plugin, meminfo.CommitLimit))
    collectd.info("%s Committed_AS   : %f" % (plugin, meminfo.Committed_AS))
    collectd.info("%s HugePages_Total: %f" % (plugin, meminfo.HugePages_Total))
    collectd.info("%s HugePages_Free : %f" % (plugin, meminfo.HugePages_Free))
    collectd.info("%s Hugepagesize   : %f" % (plugin, meminfo.Hugepagesize))
    collectd.info("%s AnonPages      : %f" % (plugin, meminfo.AnonPages))


def config_func(config):
    """Configure the memory usage plugin"""

    for node in config.children:
        key = node.key.lower()
        val = node.values[0]

        if key == 'path':
            obj.cmd = str(val)
            collectd.info("%s configured query command: '%s'" %
                          (PLUGIN, obj.cmd))
            return 0

    collectd.info("%s no config command provided ; "
                  "defaulting to '%s'" %
                  (PLUGIN, obj.cmd))


# Load the hostname and kernel memory 'overcommit' setting.
def init_func():
    # get current hostname
    obj.hostname = os.uname()[1]

    # get strict setting
    #
    # a value of 0 means "heuristic overcommit"
    # a value of 1 means "always overcommit"
    # a value of 2 means "don't overcommit".
    #
    # set strict true (strict=1) if the value is 2 ;
    # otherwise strict is false (strict=0, the default)
    fn = '/proc/sys/vm/overcommit_memory'
    if os.path.exists(fn):
        with open(fn, 'r') as infile:
            for line in infile:
                if int(line) == 2:
                    obj.strict = 1
                break

    collectd.info("%s strict:%d" % (PLUGIN, obj.strict))


# Calculate the memory usage sample
def read_func():

    meminfo = {}
    try:
        with open(obj.cmd) as fd:
            for line in fd:
                meminfo[line.split(':')[0]] = line.split(':')[1].strip()

    except EnvironmentError as e:
        collectd.error("%s unable to read from %s ; %s" %
                       (PLUGIN, obj.cmd, str(e)))
        return 0

    # setup the sample structure
    val = collectd.Values(host=obj.hostname)
    val.type = 'percent'
    val.type_instance = 'used'

    # fit_value = 0
    # if os.path.exists('/var/run/fit/mem_data'):
    #     with open('/var/run/fit/mem_data', 'r') as infile:
    #         for line in infile:
    #             fit_value = float(line)
    #             collectd.info("%s using FIT data:%.2f" %
    #                           (PLUGIN, fit_value))
    #             break

    # remove the 'unit' (kB) suffix that might be on some of the lines
    for line in meminfo:
        # remove the units from the value read
        value_unit = [u.strip() for u in meminfo[line].split(' ', 1)]
        if len(value_unit) == 2:
            value, unit = value_unit
            meminfo[line] = float(value)
        else:
            meminfo[line] = float(meminfo[line])

    obj.memTotal_kB = float(meminfo['MemTotal'])
    obj.memFree_kB = float(meminfo['MemFree'])
    obj.buffers = float(meminfo['Buffers'])
    obj.cached = float(meminfo['Cached'])
    obj.SReclaimable = float(meminfo['SReclaimable'])
    obj.CommitLimit = float(meminfo['CommitLimit'])
    obj.Committed_AS = float(meminfo['Committed_AS'])
    obj.HugePages_Total = float(meminfo['HugePages_Total'])
    obj.HugePages_Free = float(meminfo['HugePages_Free'])
    obj.Hugepagesize = float(meminfo['Hugepagesize'])
    obj.AnonPages = float(meminfo['AnonPages'])

    log_meminfo(PLUGIN, "/proc/meminfo", obj)

    obj.avail = float(float(obj.memFree_kB) +
                      float(obj.buffers) +
                      float(obj.cached) +
                      float(obj.SReclaimable))
    obj.total = float(float(obj.avail) +
                      float(obj.AnonPages))

    if obj.strict == 1:
        obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
    else:
        obj.value = float(float(obj.AnonPages) / float(obj.total))
    obj.value = float(float(obj.value) * 100)

    # if fit_value != 0:
    #     obj.value = fit_value

    if debug is True:
        collectd.info("%s ---------------------------" % PLUGIN)
        collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
        collectd.info("%s memTotal: %d" % (PLUGIN, obj.total))
        collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value))

    # Dispatch usage value to collectd
    val.plugin = 'memory'
    val.plugin_instance = 'platform'
    val.dispatch(values=[obj.value])

    #####################################################################
    # Now get the Numa Node Memory Usage
    #####################################################################
    numa_node_files = []
    fn = "/sys/devices/system/node/"
    files = os.listdir(fn)
    for file in files:
        if 'node' in file:
            numa_node_files.append(fn + file + '/meminfo')

    for numa_node in numa_node_files:
        meminfo = {}
        try:
            with open(numa_node) as fd:
                for line in fd:
                    # per-node meminfo lines look like
                    # 'Node 0 MemFree: 12345 kB' ; the key is field 2
                    # with its trailing ':' stripped, the value is field 3
                    meminfo[line.split()[2][0:-1]] = line.split()[3].strip()

            obj.memFree_kB = float(meminfo['MemFree'])
            obj.FilePages = float(meminfo['FilePages'])
            obj.SReclaimable = float(meminfo['SReclaimable'])
            obj.AnonPages = float(meminfo['AnonPages'])
            obj.HugePages_Total = float(meminfo['HugePages_Total'])
            obj.HugePages_Free = float(meminfo['HugePages_Free'])

            log_meminfo(PLUGIN, numa_node, obj)

            avail = float(float(obj.memFree_kB) +
                          float(obj.FilePages) +
                          float(obj.SReclaimable))
            total = float(float(avail) +
                          float(obj.AnonPages))

            obj.value = float(float(obj.AnonPages)) / float(total)
            obj.value = float(float(obj.value) * 100)

            # if fit_value != 0:
            #     obj.value = fit_value

            # Dispatch usage value to collectd for this numa node
            val.plugin_instance = numa_node.split('/')[5]
            val.dispatch(values=[obj.value])

            collectd.debug('%s reports %s at %.2f %% usage (%s)' %
                           (PLUGIN_NUMA,
                            val.plugin,
                            obj.value,
                            val.plugin_instance))

            # Numa Node Huge Page Memory Monitoring
            #
            # Only monitor if there is Huge Page Memory
            if obj.HugePages_Total > 0:
                obj.value = \
                    float(float(obj.HugePages_Total -
                                obj.HugePages_Free)) / \
                    float(obj.HugePages_Total)
                obj.value = float(float(obj.value) * 100)

                # if fit_value != 0:
                #     obj.value = fit_value

                # Dispatch huge page memory usage value
                # to collectd for this numa node.
                val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
                val.dispatch(values=[obj.value])

                collectd.debug('%s reports %s at %.2f %% usage (%s)' %
                               (PLUGIN_HUGE,
                                val.plugin,
                                obj.value,
                                val.plugin_instance))
        except EnvironmentError as e:
            collectd.error("%s unable to read from %s ; %s" %
                           (PLUGIN_NUMA, numa_node, str(e)))

    return 0


collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)