Aggregate by fqdn label instead of instance in host cpu metrics
While in the regular case a specific metric for a specific host will be provided by a single instance (exporter), so aggregating by label and by instance should give the same result, it is more correct to aggregate by the same label as the one we use to filter the metrics. This is a follow-up to https://review.opendev.org/c/openstack/watcher/+/944795 Related-Bug: #2103451 Change-Id: Ia61f051547ddc51e0d1ccd5a56485ab49ce84c2e
This commit is contained in:
parent
035e6584c7
commit
c7158b08d1
@ -264,8 +264,8 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
This function builds and returns the string query that will be sent
|
||||
to the Prometheus server /query endpoint. For host cpu usage we use:
|
||||
|
||||
100 - (avg by (instance)(rate(node_cpu_seconds_total{mode='idle',
|
||||
instance='some_host'}[300s])) * 100)
|
||||
100 - (avg by (fqdn)(rate(node_cpu_seconds_total{mode='idle',
|
||||
fqdn='some_host'}[300s])) * 100)
|
||||
|
||||
so using prometheus rate function over the specified period, we average
|
||||
per instance (all cpus) idle time and then 'everything else' is cpu
|
||||
@ -307,7 +307,7 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
|
||||
if meter == 'node_cpu_seconds_total':
|
||||
query_args = (
|
||||
"100 - (%(agg)s by (instance)(rate(%(meter)s"
|
||||
"100 - (%(agg)s by (%(label)s)(rate(%(meter)s"
|
||||
"{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
|
||||
"* 100)"
|
||||
% {'label': self.prometheus_fqdn_label,
|
||||
@ -464,8 +464,8 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
This calculates the host cpu usage and returns it as a percentage
|
||||
The calculation is made by using the cpu 'idle' time, per
|
||||
instance (so all CPUs are included). For example the query looks like
|
||||
(100 - (avg by (instance)(rate(node_cpu_seconds_total
|
||||
{mode='idle',instance='localhost:9100'}[300s])) * 100))
|
||||
(100 - (avg by (fqdn)(rate(node_cpu_seconds_total
|
||||
{mode='idle',fqdn='compute1.example.com'}[300s])) * 100))
|
||||
"""
|
||||
aggregate = self._invert_max_min_aggregate(aggregate)
|
||||
cpu_usage = self.statistic_aggregation(
|
||||
|
@ -146,7 +146,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
)
|
||||
self.assertEqual(expected_cpu_usage, result)
|
||||
mock_prometheus_query.assert_called_once_with(
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
|
||||
" * 100)")
|
||||
|
||||
@ -575,7 +575,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
|
||||
def test_build_prometheus_query_node_cpu_avg_agg(self):
|
||||
expected_query = (
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',fqdn='a_host'}[111s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'avg', 'node_cpu_seconds_total', 'a_host', '111')
|
||||
@ -583,7 +583,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
|
||||
def test_build_prometheus_query_node_cpu_max_agg(self):
|
||||
expected_query = (
|
||||
"100 - (max by (instance)(rate(node_cpu_seconds_total"
|
||||
"100 - (max by (fqdn)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',fqdn='b_host'}[444s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'max', 'node_cpu_seconds_total', 'b_host', '444')
|
||||
@ -610,7 +610,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
|
||||
self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
|
||||
expected_query = (
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"100 - (avg by (custom_fqdn_label)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'avg', 'node_cpu_seconds_total', 'a_host', '111')
|
||||
|
Loading…
x
Reference in New Issue
Block a user