Merge "Query by fqdn_label instead of instance for host metrics"

This commit is contained in:
Zuul 2025-03-20 12:50:28 +00:00 committed by Gerrit Code Review
commit 035e6584c7
4 changed files with 173 additions and 90 deletions

View File

@ -29,19 +29,19 @@ This default can be overridden when a deployer uses a different label to
identify the exporter host (for example ``hostname`` or ``host``, or any other
label, as long as it identifies the host).
Internally this label is used in creating a ``fqdn_instance_map``, mapping
the fqdn with the Prometheus instance label associated with each exporter.
The keys of the resulting fqdn_instance_map are expected to match the
Internally this label is used in creating ``fqdn_instance_labels``, containing
the list of values assigned to the the label in the Prometheus targets.
The elements of the resulting fqdn_instance_labels are expected to match the
``ComputeNode.hostname`` used in the Watcher decision engine cluster model.
An example ``fqdn_instance_map`` is the following:
An example ``fqdn_instance_labels`` is the following:
.. code-block::
{
'ena.controlplane.domain': '10.1.2.1:9100',
'dio.controlplane.domain': '10.1.2.2:9100',
'tria.controlplane.domain': '10.1.2.3:9100'
}
[
'ena.controlplane.domain',
'dio.controlplane.domain',
'tria.controlplane.domain',
]
For instance metrics, it is required that Prometheus contains a label
with the uuid of the OpenStack instance in each relevant metric. By default,

View File

@ -0,0 +1,9 @@
---
fixes:
- |
When using prometheus datasource and more that one target has the same value
for the `fqdn_label`, the driver used the wrong instance label to query for host
metrics. The `instance` label is no longer used in the queries but the `fqdn_label`
which identifies all the metrics for a specific compute node.
.. _Bug 2103451: https://bugs.launchpad.net/watcher/+bug/2103451

View File

@ -51,10 +51,10 @@ class PrometheusHelper(base.DataSourceBase):
The prometheus helper uses the PrometheusAPIClient provided by
python-observabilityclient.
The prometheus_fqdn_instance_map maps the fqdn of each node to the
Prometheus instance label added to all metrics on that node. When
making queries to Prometheus we use the instance label to specify
the node for which metrics are to be retrieved.
The prometheus_fqdn_labels contains a list the values contained in
the fqdn_label in the Prometheus instance. When making queries to
Prometheus we use the fqdn_label to specify the node for which
metrics are to be retrieved.
host, port and fqdn_label come from watcher_client
config. The prometheus_fqdn_label allows override of the required label
in Prometheus scrape configs that specifies each target's fqdn.
@ -63,8 +63,8 @@ class PrometheusHelper(base.DataSourceBase):
self.prometheus_fqdn_label = (
CONF.prometheus_client.fqdn_label
)
self.prometheus_fqdn_instance_map = (
self._build_prometheus_fqdn_instance_map()
self.prometheus_fqdn_labels = (
self._build_prometheus_fqdn_labels()
)
self.prometheus_host_instance_map = (
self._build_prometheus_host_instance_map()
@ -136,73 +136,71 @@ class PrometheusHelper(base.DataSourceBase):
return the_client
def _build_prometheus_fqdn_instance_map(self):
"""Build the fqdn<-->instance_label mapping needed for queries
def _build_prometheus_fqdn_labels(self):
"""Build the list of fqdn_label values to be used in host queries
Watcher knows nodes by their hostname. In Prometheus however the
scrape targets (also known as 'instances') are specified by I.P.
(or hostname) and port number. This function creates a mapping between
the fully qualified domain name of each node and the corresponding
instance label used in the scrape config. This relies on a custom
'fqdn' label added to Prometheus scrape_configs. Operators can use
a different custom label instead by setting the prometheus_fqdn_label
(or hostname) and port number and fqdn is stored in a custom 'fqdn'
label added to Prometheus scrape_configs. Operators can use a
different custom label instead by setting the prometheus_fqdn_label
config option under the prometheus_client section of watcher config.
The built prometheus_fqdn_instance_map is used to match watcher
node.hostname if watcher stores fqdn and otherwise the
host_instance_map is used instead.
:return a dict mapping fqdn to instance label. For example:
{'marios-env-again.controlplane.domain': '10.1.2.3:9100'}
The built prometheus_fqdn_labels is created with the full list
of values of the prometheus_fqdn_label label in Prometheus. This will
be used to create a map of hostname<-->fqdn and to identify if a target
exist in prometheus for the compute nodes before sending the query.
:return a set of values of the fqdn label. For example:
{'foo.example.com', 'bar.example.com'}
{'foo', 'bar'}
"""
prometheus_targets = self.prometheus._get(
"targets?state=active")['data']['activeTargets']
# >>> prometheus_targets[0]['labels']
# {'fqdn': 'marios-env-again.controlplane.domain',
# 'instance': 'localhost:9100', 'job': 'node'}
fqdn_instance_map = {
fqdn: instance for (fqdn, instance) in (
(target['labels'].get(self.prometheus_fqdn_label),
target['labels'].get('instance'))
for target in prometheus_targets
if target.get('labels', {}).get(self.prometheus_fqdn_label)
)
}
if not fqdn_instance_map:
fqdn_instance_labels = set()
for target in prometheus_targets:
if target.get('labels', {}).get(self.prometheus_fqdn_label):
fqdn_instance_labels.add(
target['labels'].get(self.prometheus_fqdn_label))
if not fqdn_instance_labels:
LOG.error(
"Could not create fqdn instance map from Prometheus "
"Could not create fqdn labels list from Prometheus "
"targets config. Prometheus returned the following: %s",
prometheus_targets
)
return {}
return fqdn_instance_map
return set()
return fqdn_instance_labels
def _build_prometheus_host_instance_map(self):
"""Build the hostname<-->instance_label mapping needed for queries
The prometheus_fqdn_instance_map has the fully qualified domain name
The prometheus_fqdn_labels has the fully qualified domain name
for hosts. This will create a duplicate map containing only the host
name part. Depending on the watcher node.hostname either the
fqdn_instance_map or the host_instance_map will be used to resolve
the correct prometheus instance label for queries. In the event the
fqdn_instance_map keys are not valid fqdn (for example it contains
fqdn_instance_labels or the host_instance_map will be used to resolve
the correct prometheus fqdn_label for queries. In the event the
fqdn_instance_labels elements are not valid fqdn (for example it has
hostnames, not fqdn) the host_instance_map cannot be created and
an empty dictionary is returned with a warning logged.
:return a dict mapping hostname to instance label. For example:
{'marios-env-again': 'localhost:9100'}
{'foo': 'foo.example.com', 'bar': 'bar.example.com'}
"""
if not self.prometheus_fqdn_instance_map:
if not self.prometheus_fqdn_labels:
LOG.error("Cannot build host_instance_map without "
"fqdn_instance_map")
"fqdn_instance_labels")
return {}
host_instance_map = {
host: instance for (host, instance) in (
(fqdn.split('.')[0], inst)
for fqdn, inst in self.prometheus_fqdn_instance_map.items()
host: fqdn for (host, fqdn) in (
(fqdn.split('.')[0], fqdn)
for fqdn in self.prometheus_fqdn_labels
if '.' in fqdn
)
}
if not host_instance_map:
LOG.warning("Creating empty host instance map. Are the keys "
"in prometheus_fqdn_instance_map valid fqdn?")
"in prometheus_fqdn_labels valid fqdn?")
return {}
return host_instance_map
@ -210,23 +208,25 @@ class PrometheusHelper(base.DataSourceBase):
"""Resolve the prometheus instance label to use in queries
Given the watcher node.hostname, resolve the prometheus instance
label for use in queries, first trying the fqdn_instance_map and
label for use in queries, first trying the fqdn_instance_labels and
then the host_instance_map (watcher.node_name can be fqdn or hostname).
If the name is not resolved after the first attempt, rebuild the fqdn
and host instance maps and try again. This allows for new hosts added
after the initialisation of the fqdn_instance_map.
after the initialisation of the fqdn_instance_labels.
:param node_name: the watcher node.hostname
:return String for the prometheus instance label and None if not found
"""
def _query_maps(node):
return self.prometheus_fqdn_instance_map.get(
node, self.prometheus_host_instance_map.get(node, None))
if node in self.prometheus_fqdn_labels:
return node
else:
return self.prometheus_host_instance_map.get(node, None)
instance_label = _query_maps(node_name)
# refresh the fqdn and host instance maps and retry
if not instance_label:
self.prometheus_fqdn_instance_map = (
self._build_prometheus_fqdn_instance_map()
self.prometheus_fqdn_labels = (
self._build_prometheus_fqdn_labels()
)
self.prometheus_host_instance_map = (
self._build_prometheus_host_instance_map()
@ -307,17 +307,21 @@ class PrometheusHelper(base.DataSourceBase):
if meter == 'node_cpu_seconds_total':
query_args = (
"100 - (%s by (instance)(rate(%s"
"{mode='idle',instance='%s'}[%ss])) * 100)" %
(aggregate, meter, instance_label, period)
"100 - (%(agg)s by (instance)(rate(%(meter)s"
"{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
"* 100)"
% {'label': self.prometheus_fqdn_label,
'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}
)
elif meter == 'node_memory_MemAvailable_bytes':
query_args = (
"(node_memory_MemTotal_bytes{instance='%s'} "
"- %s_over_time(%s{instance='%s'}[%ss])) "
"/ 1024 / 1024" %
(instance_label, aggregate, meter,
instance_label, period)
"(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
"- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
"[%(period)ss])) / 1024 / 1024"
% {'label': self.prometheus_fqdn_label,
'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}
)
elif meter == 'ceilometer_memory_usage':
query_args = (

View File

@ -147,7 +147,8 @@ class TestPrometheusHelper(base.BaseTestCase):
self.assertEqual(expected_cpu_usage, result)
mock_prometheus_query.assert_called_once_with(
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='10.0.1.2:9100'}[300s])) * 100)")
"{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
" * 100)")
@mock.patch.object(prometheus_client.PrometheusAPIClient, 'query')
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@ -437,15 +438,48 @@ class TestPrometheusHelper(base.BaseTestCase):
'instance': '10.1.2.3:9100', 'job': 'node',
}},
]}}
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
'bar.controlplane.domain': '10.1.2.2:9100',
'baz.controlplane.domain': '10.1.2.3:9100'}
expected_host_map = {'foo': '10.1.2.1:9100',
'bar': '10.1.2.2:9100',
'baz': '10.1.2.3:9100'}
expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain',
'baz.controlplane.domain'}
expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': 'bar.controlplane.domain',
'baz': 'baz.controlplane.domain'}
helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_instance_map,
expected_fqdn_map)
self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_build_prometheus_fqdn_host_instance_map_dupl_fqdn(
self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [
{'labels': {
'fqdn': 'foo.controlplane.domain',
'instance': '10.1.2.1:9100', 'job': 'node',
}},
{'labels': {
'fqdn': 'foo.controlplane.domain',
'instance': '10.1.2.1:9229', 'job': 'podman',
}},
{'labels': {
'fqdn': 'bar.controlplane.domain',
'instance': '10.1.2.2:9100', 'job': 'node',
}},
{'labels': {
'fqdn': 'baz.controlplane.domain',
'instance': '10.1.2.3:9100', 'job': 'node',
}},
]}}
expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain',
'baz.controlplane.domain'}
expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': 'bar.controlplane.domain',
'baz': 'baz.controlplane.domain'}
helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map)
@ -460,7 +494,7 @@ class TestPrometheusHelper(base.BaseTestCase):
}},
]}}
helper = prometheus_helper.PrometheusHelper()
self.assertEqual({}, helper.prometheus_fqdn_instance_map)
self.assertEqual(set(), helper.prometheus_fqdn_labels)
self.assertEqual({}, helper.prometheus_host_instance_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@ -476,12 +510,29 @@ class TestPrometheusHelper(base.BaseTestCase):
}},
]}}
helper = prometheus_helper.PrometheusHelper()
expected_fqdn_map = {'ena': '10.1.2.1:9100',
'dyo': '10.1.2.2:9100'}
expected_fqdn_list = {'ena', 'dyo'}
self.assertEqual(
helper.prometheus_fqdn_instance_map, expected_fqdn_map)
helper.prometheus_fqdn_labels, expected_fqdn_list)
self.assertEqual({}, helper.prometheus_host_instance_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_using_ips_not_fqdn(self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [
{'labels': {
'ip_label': '10.1.2.1',
'instance': '10.1.2.1:9100', 'job': 'node',
}},
{'labels': {
'ip_label': '10.1.2.2',
'instance': '10.1.2.2:9100', 'job': 'node',
}},
]}}
cfg.CONF.prometheus_client.fqdn_label = 'ip_label'
helper = prometheus_helper.PrometheusHelper()
expected_fqdn_list = {'10.1.2.1', '10.1.2.2'}
self.assertEqual(
helper.prometheus_fqdn_labels, expected_fqdn_list)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_override_prometheus_fqdn_label(self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [
@ -494,19 +545,19 @@ class TestPrometheusHelper(base.BaseTestCase):
'instance': '10.1.2.2:9100', 'job': 'node',
}},
]}}
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
'bar.controlplane.domain': '10.1.2.2:9100'}
expected_host_map = {'foo': '10.1.2.1:9100',
'bar': '10.1.2.2:9100'}
expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain'}
expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': 'bar.controlplane.domain'}
cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label'
helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_instance_map,
expected_fqdn_map)
self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map)
def test_resolve_prometheus_instance_label(self):
expected_instance_label = '10.0.1.2:9100'
expected_instance_label = 'marios-env.controlplane.domain'
result = self.helper._resolve_prometheus_instance_label(
'marios-env.controlplane.domain')
self.assertEqual(result, expected_instance_label)
@ -525,7 +576,7 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_cpu_avg_agg(self):
expected_query = (
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='a_host'}[111s])) * 100)")
"{mode='idle',fqdn='a_host'}[111s])) * 100)")
result = self.helper._build_prometheus_query(
'avg', 'node_cpu_seconds_total', 'a_host', '111')
self.assertEqual(result, expected_query)
@ -533,15 +584,15 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_cpu_max_agg(self):
expected_query = (
"100 - (max by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='b_host'}[444s])) * 100)")
"{mode='idle',fqdn='b_host'}[444s])) * 100)")
result = self.helper._build_prometheus_query(
'max', 'node_cpu_seconds_total', 'b_host', '444')
self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_memory_avg_agg(self):
expected_query = (
"(node_memory_MemTotal_bytes{instance='c_host'} - avg_over_time"
"(node_memory_MemAvailable_bytes{instance='c_host'}[555s])) "
"(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
"(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
"/ 1024 / 1024")
result = self.helper._build_prometheus_query(
'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
@ -549,8 +600,27 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_memory_min_agg(self):
expected_query = (
"(node_memory_MemTotal_bytes{instance='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{instance='d_host'}[222s])) "
"(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
expected_query = (
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
result = self.helper._build_prometheus_query(
'avg', 'node_cpu_seconds_total', 'a_host', '111')
self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_memory_min_agg_custom_label(self):
self.helper.prometheus_fqdn_label = 'custom_fqdn'
expected_query = (
"(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')