diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py index 9b803b6be..7b7572f3b 100644 --- a/watcher/decision_engine/datasources/prometheus.py +++ b/watcher/decision_engine/datasources/prometheus.py @@ -264,8 +264,8 @@ class PrometheusHelper(base.DataSourceBase): This function builds and returns the string query that will be sent to the Prometheus server /query endpoint. For host cpu usage we use: - 100 - (avg by (instance)(rate(node_cpu_seconds_total{mode='idle', - instance='some_host'}[300s])) * 100) + 100 - (avg by (fqdn)(rate(node_cpu_seconds_total{mode='idle', + fqdn='some_host'}[300s])) * 100) so using prometheus rate function over the specified period, we average per instance (all cpus) idle time and then 'everything else' is cpu @@ -307,7 +307,7 @@ class PrometheusHelper(base.DataSourceBase): if meter == 'node_cpu_seconds_total': query_args = ( - "100 - (%(agg)s by (instance)(rate(%(meter)s" + "100 - (%(agg)s by (%(label)s)(rate(%(meter)s" "{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) " "* 100)" % {'label': self.prometheus_fqdn_label, @@ -464,8 +464,8 @@ class PrometheusHelper(base.DataSourceBase): This calculates the host cpu usage and returns it as a percentage The calculation is made by using the cpu 'idle' time, per instance (so all CPUs are included). For example the query looks like - (100 - (avg by (instance)(rate(node_cpu_seconds_total - {mode='idle',instance='localhost:9100'}[300s])) * 100)) + (100 - (avg by (fqdn)(rate(node_cpu_seconds_total + {mode='idle',fqdn='compute1.example.com'}[300s])) * 100)) """ aggregate = self._invert_max_min_aggregate(aggregate) cpu_usage = self.statistic_aggregation( diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py index 1c4b0d662..5aaccb0ec 100644 --- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py +++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py @@ -146,7 +146,7 @@ class TestPrometheusHelper(base.BaseTestCase): ) self.assertEqual(expected_cpu_usage, result) mock_prometheus_query.assert_called_once_with( - "100 - (avg by (instance)(rate(node_cpu_seconds_total" + "100 - (avg by (fqdn)(rate(node_cpu_seconds_total" "{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))" " * 100)") @@ -575,7 +575,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_cpu_avg_agg(self): expected_query = ( - "100 - (avg by (instance)(rate(node_cpu_seconds_total" + "100 - (avg by (fqdn)(rate(node_cpu_seconds_total" "{mode='idle',fqdn='a_host'}[111s])) * 100)") result = self.helper._build_prometheus_query( 'avg', 'node_cpu_seconds_total', 'a_host', '111') @@ -583,7 +583,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_cpu_max_agg(self): expected_query = ( - "100 - (max by (instance)(rate(node_cpu_seconds_total" + "100 - (max by (fqdn)(rate(node_cpu_seconds_total" "{mode='idle',fqdn='b_host'}[444s])) * 100)") result = self.helper._build_prometheus_query( 'max', 'node_cpu_seconds_total', 'b_host', '444') @@ -610,7 +610,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self): self.helper.prometheus_fqdn_label = 'custom_fqdn_label' expected_query = ( - "100 - (avg by (instance)(rate(node_cpu_seconds_total" + "100 - (avg by (custom_fqdn_label)(rate(node_cpu_seconds_total" "{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)") result = self.helper._build_prometheus_query( 'avg', 'node_cpu_seconds_total', 'a_host', '111')