From c7158b08d1a89336a920d9208969ce6a913d9587 Mon Sep 17 00:00:00 2001
From: Alfredo Moralejo <amoralej@redhat.com>
Date: Tue, 1 Apr 2025 16:51:12 +0200
Subject: [PATCH] Aggregate by fqdn label instead of instance in host cpu metrics

In the regular case, a specific metric for a specific host is provided
by a single instance (exporter), so aggregating by the fqdn label and by
instance should give the same result. It is nevertheless more correct to
aggregate by the same label that we use to filter the metrics.

This is a follow-up to https://review.opendev.org/c/openstack/watcher/+/944795

Related-Bug: #2103451

Change-Id: Ia61f051547ddc51e0d1ccd5a56485ab49ce84c2e
---
 watcher/decision_engine/datasources/prometheus.py      | 10 +++++-----
 .../datasources/test_prometheus_helper.py              |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py
index 9b803b6be..7b7572f3b 100644
--- a/watcher/decision_engine/datasources/prometheus.py
+++ b/watcher/decision_engine/datasources/prometheus.py
@@ -264,8 +264,8 @@ class PrometheusHelper(base.DataSourceBase):
         This function builds and returns the string query that will be sent
         to the Prometheus server /query endpoint. For host cpu usage we use:
 
-        100 - (avg by (instance)(rate(node_cpu_seconds_total{mode='idle',
-                                       instance='some_host'}[300s])) * 100)
+        100 - (avg by (fqdn)(rate(node_cpu_seconds_total{mode='idle',
+                                       fqdn='some_host'}[300s])) * 100)
 
         so using prometheus rate function over the specified period, we average
         per instance (all cpus) idle time and then 'everything else' is cpu
@@ -307,7 +307,7 @@ class PrometheusHelper(base.DataSourceBase):
 
         if meter == 'node_cpu_seconds_total':
             query_args = (
-                "100 - (%(agg)s by (instance)(rate(%(meter)s"
+                "100 - (%(agg)s by (%(label)s)(rate(%(meter)s"
                 "{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
                 "* 100)"
                 % {'label': self.prometheus_fqdn_label,
@@ -464,8 +464,8 @@ class PrometheusHelper(base.DataSourceBase):
         This calculates the host cpu usage and returns it as a percentage
         The calculation is made by using the cpu 'idle' time, per
         instance (so all CPUs are included). For example the query looks like
-        (100 - (avg by (instance)(rate(node_cpu_seconds_total
-            {mode='idle',instance='localhost:9100'}[300s])) * 100))
+        (100 - (avg by (fqdn)(rate(node_cpu_seconds_total
+            {mode='idle',fqdn='compute1.example.com'}[300s])) * 100))
         """
         aggregate = self._invert_max_min_aggregate(aggregate)
         cpu_usage = self.statistic_aggregation(
diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
index 1c4b0d662..5aaccb0ec 100644
--- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
@@ -146,7 +146,7 @@ class TestPrometheusHelper(base.BaseTestCase):
         )
         self.assertEqual(expected_cpu_usage, result)
         mock_prometheus_query.assert_called_once_with(
-            "100 - (avg by (instance)(rate(node_cpu_seconds_total"
+            "100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
             "{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
             " * 100)")
 
@@ -575,7 +575,7 @@ class TestPrometheusHelper(base.BaseTestCase):
 
     def test_build_prometheus_query_node_cpu_avg_agg(self):
         expected_query = (
-            "100 - (avg by (instance)(rate(node_cpu_seconds_total"
+            "100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
             "{mode='idle',fqdn='a_host'}[111s])) * 100)")
         result = self.helper._build_prometheus_query(
             'avg', 'node_cpu_seconds_total', 'a_host', '111')
@@ -583,7 +583,7 @@ class TestPrometheusHelper(base.BaseTestCase):
 
     def test_build_prometheus_query_node_cpu_max_agg(self):
         expected_query = (
-            "100 - (max by (instance)(rate(node_cpu_seconds_total"
+            "100 - (max by (fqdn)(rate(node_cpu_seconds_total"
             "{mode='idle',fqdn='b_host'}[444s])) * 100)")
         result = self.helper._build_prometheus_query(
             'max', 'node_cpu_seconds_total', 'b_host', '444')
@@ -610,7 +610,7 @@ class TestPrometheusHelper(base.BaseTestCase):
     def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
         self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
         expected_query = (
-            "100 - (avg by (instance)(rate(node_cpu_seconds_total"
+            "100 - (avg by (custom_fqdn_label)(rate(node_cpu_seconds_total"
             "{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
         result = self.helper._build_prometheus_query(
             'avg', 'node_cpu_seconds_total', 'a_host', '111')