Merge "Query by fqdn_label instead of instance for host metrics"
This commit is contained in:
commit
035e6584c7
@ -29,19 +29,19 @@ This default can be overridden when a deployer uses a different label to
|
||||
identify the exporter host (for example ``hostname`` or ``host``, or any other
|
||||
label, as long as it identifies the host).
|
||||
|
||||
Internally this label is used in creating a ``fqdn_instance_map``, mapping
|
||||
the fqdn with the Prometheus instance label associated with each exporter.
|
||||
The keys of the resulting fqdn_instance_map are expected to match the
|
||||
Internally this label is used in creating ``fqdn_instance_labels``, containing
|
||||
the list of values assigned to the the label in the Prometheus targets.
|
||||
The elements of the resulting fqdn_instance_labels are expected to match the
|
||||
``ComputeNode.hostname`` used in the Watcher decision engine cluster model.
|
||||
An example ``fqdn_instance_map`` is the following:
|
||||
An example ``fqdn_instance_labels`` is the following:
|
||||
|
||||
.. code-block::
|
||||
|
||||
{
|
||||
'ena.controlplane.domain': '10.1.2.1:9100',
|
||||
'dio.controlplane.domain': '10.1.2.2:9100',
|
||||
'tria.controlplane.domain': '10.1.2.3:9100'
|
||||
}
|
||||
[
|
||||
'ena.controlplane.domain',
|
||||
'dio.controlplane.domain',
|
||||
'tria.controlplane.domain',
|
||||
]
|
||||
|
||||
For instance metrics, it is required that Prometheus contains a label
|
||||
with the uuid of the OpenStack instance in each relevant metric. By default,
|
||||
|
@ -0,0 +1,9 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
When using prometheus datasource and more that one target has the same value
|
||||
for the `fqdn_label`, the driver used the wrong instance label to query for host
|
||||
metrics. The `instance` label is no longer used in the queries but the `fqdn_label`
|
||||
which identifies all the metrics for a specific compute node.
|
||||
|
||||
.. _Bug 2103451: https://bugs.launchpad.net/watcher/+bug/2103451
|
@ -51,10 +51,10 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
|
||||
The prometheus helper uses the PrometheusAPIClient provided by
|
||||
python-observabilityclient.
|
||||
The prometheus_fqdn_instance_map maps the fqdn of each node to the
|
||||
Prometheus instance label added to all metrics on that node. When
|
||||
making queries to Prometheus we use the instance label to specify
|
||||
the node for which metrics are to be retrieved.
|
||||
The prometheus_fqdn_labels contains a list the values contained in
|
||||
the fqdn_label in the Prometheus instance. When making queries to
|
||||
Prometheus we use the fqdn_label to specify the node for which
|
||||
metrics are to be retrieved.
|
||||
host, port and fqdn_label come from watcher_client
|
||||
config. The prometheus_fqdn_label allows override of the required label
|
||||
in Prometheus scrape configs that specifies each target's fqdn.
|
||||
@ -63,8 +63,8 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
self.prometheus_fqdn_label = (
|
||||
CONF.prometheus_client.fqdn_label
|
||||
)
|
||||
self.prometheus_fqdn_instance_map = (
|
||||
self._build_prometheus_fqdn_instance_map()
|
||||
self.prometheus_fqdn_labels = (
|
||||
self._build_prometheus_fqdn_labels()
|
||||
)
|
||||
self.prometheus_host_instance_map = (
|
||||
self._build_prometheus_host_instance_map()
|
||||
@ -136,73 +136,71 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
|
||||
return the_client
|
||||
|
||||
def _build_prometheus_fqdn_instance_map(self):
|
||||
"""Build the fqdn<-->instance_label mapping needed for queries
|
||||
def _build_prometheus_fqdn_labels(self):
|
||||
"""Build the list of fqdn_label values to be used in host queries
|
||||
|
||||
Watcher knows nodes by their hostname. In Prometheus however the
|
||||
scrape targets (also known as 'instances') are specified by I.P.
|
||||
(or hostname) and port number. This function creates a mapping between
|
||||
the fully qualified domain name of each node and the corresponding
|
||||
instance label used in the scrape config. This relies on a custom
|
||||
'fqdn' label added to Prometheus scrape_configs. Operators can use
|
||||
a different custom label instead by setting the prometheus_fqdn_label
|
||||
(or hostname) and port number and fqdn is stored in a custom 'fqdn'
|
||||
label added to Prometheus scrape_configs. Operators can use a
|
||||
different custom label instead by setting the prometheus_fqdn_label
|
||||
config option under the prometheus_client section of watcher config.
|
||||
The built prometheus_fqdn_instance_map is used to match watcher
|
||||
node.hostname if watcher stores fqdn and otherwise the
|
||||
host_instance_map is used instead.
|
||||
:return a dict mapping fqdn to instance label. For example:
|
||||
{'marios-env-again.controlplane.domain': '10.1.2.3:9100'}
|
||||
The built prometheus_fqdn_labels is created with the full list
|
||||
of values of the prometheus_fqdn_label label in Prometheus. This will
|
||||
be used to create a map of hostname<-->fqdn and to identify if a target
|
||||
exist in prometheus for the compute nodes before sending the query.
|
||||
:return a set of values of the fqdn label. For example:
|
||||
{'foo.example.com', 'bar.example.com'}
|
||||
{'foo', 'bar'}
|
||||
"""
|
||||
prometheus_targets = self.prometheus._get(
|
||||
"targets?state=active")['data']['activeTargets']
|
||||
# >>> prometheus_targets[0]['labels']
|
||||
# {'fqdn': 'marios-env-again.controlplane.domain',
|
||||
# 'instance': 'localhost:9100', 'job': 'node'}
|
||||
fqdn_instance_map = {
|
||||
fqdn: instance for (fqdn, instance) in (
|
||||
(target['labels'].get(self.prometheus_fqdn_label),
|
||||
target['labels'].get('instance'))
|
||||
for target in prometheus_targets
|
||||
if target.get('labels', {}).get(self.prometheus_fqdn_label)
|
||||
)
|
||||
}
|
||||
if not fqdn_instance_map:
|
||||
fqdn_instance_labels = set()
|
||||
for target in prometheus_targets:
|
||||
if target.get('labels', {}).get(self.prometheus_fqdn_label):
|
||||
fqdn_instance_labels.add(
|
||||
target['labels'].get(self.prometheus_fqdn_label))
|
||||
|
||||
if not fqdn_instance_labels:
|
||||
LOG.error(
|
||||
"Could not create fqdn instance map from Prometheus "
|
||||
"Could not create fqdn labels list from Prometheus "
|
||||
"targets config. Prometheus returned the following: %s",
|
||||
prometheus_targets
|
||||
)
|
||||
return {}
|
||||
return fqdn_instance_map
|
||||
return set()
|
||||
return fqdn_instance_labels
|
||||
|
||||
def _build_prometheus_host_instance_map(self):
|
||||
"""Build the hostname<-->instance_label mapping needed for queries
|
||||
|
||||
The prometheus_fqdn_instance_map has the fully qualified domain name
|
||||
The prometheus_fqdn_labels has the fully qualified domain name
|
||||
for hosts. This will create a duplicate map containing only the host
|
||||
name part. Depending on the watcher node.hostname either the
|
||||
fqdn_instance_map or the host_instance_map will be used to resolve
|
||||
the correct prometheus instance label for queries. In the event the
|
||||
fqdn_instance_map keys are not valid fqdn (for example it contains
|
||||
fqdn_instance_labels or the host_instance_map will be used to resolve
|
||||
the correct prometheus fqdn_label for queries. In the event the
|
||||
fqdn_instance_labels elements are not valid fqdn (for example it has
|
||||
hostnames, not fqdn) the host_instance_map cannot be created and
|
||||
an empty dictionary is returned with a warning logged.
|
||||
:return a dict mapping hostname to instance label. For example:
|
||||
{'marios-env-again': 'localhost:9100'}
|
||||
{'foo': 'foo.example.com', 'bar': 'bar.example.com'}
|
||||
"""
|
||||
if not self.prometheus_fqdn_instance_map:
|
||||
if not self.prometheus_fqdn_labels:
|
||||
LOG.error("Cannot build host_instance_map without "
|
||||
"fqdn_instance_map")
|
||||
"fqdn_instance_labels")
|
||||
return {}
|
||||
host_instance_map = {
|
||||
host: instance for (host, instance) in (
|
||||
(fqdn.split('.')[0], inst)
|
||||
for fqdn, inst in self.prometheus_fqdn_instance_map.items()
|
||||
host: fqdn for (host, fqdn) in (
|
||||
(fqdn.split('.')[0], fqdn)
|
||||
for fqdn in self.prometheus_fqdn_labels
|
||||
if '.' in fqdn
|
||||
)
|
||||
}
|
||||
if not host_instance_map:
|
||||
LOG.warning("Creating empty host instance map. Are the keys "
|
||||
"in prometheus_fqdn_instance_map valid fqdn?")
|
||||
"in prometheus_fqdn_labels valid fqdn?")
|
||||
return {}
|
||||
return host_instance_map
|
||||
|
||||
@ -210,23 +208,25 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
"""Resolve the prometheus instance label to use in queries
|
||||
|
||||
Given the watcher node.hostname, resolve the prometheus instance
|
||||
label for use in queries, first trying the fqdn_instance_map and
|
||||
label for use in queries, first trying the fqdn_instance_labels and
|
||||
then the host_instance_map (watcher.node_name can be fqdn or hostname).
|
||||
If the name is not resolved after the first attempt, rebuild the fqdn
|
||||
and host instance maps and try again. This allows for new hosts added
|
||||
after the initialisation of the fqdn_instance_map.
|
||||
after the initialisation of the fqdn_instance_labels.
|
||||
:param node_name: the watcher node.hostname
|
||||
:return String for the prometheus instance label and None if not found
|
||||
"""
|
||||
def _query_maps(node):
|
||||
return self.prometheus_fqdn_instance_map.get(
|
||||
node, self.prometheus_host_instance_map.get(node, None))
|
||||
if node in self.prometheus_fqdn_labels:
|
||||
return node
|
||||
else:
|
||||
return self.prometheus_host_instance_map.get(node, None)
|
||||
|
||||
instance_label = _query_maps(node_name)
|
||||
# refresh the fqdn and host instance maps and retry
|
||||
if not instance_label:
|
||||
self.prometheus_fqdn_instance_map = (
|
||||
self._build_prometheus_fqdn_instance_map()
|
||||
self.prometheus_fqdn_labels = (
|
||||
self._build_prometheus_fqdn_labels()
|
||||
)
|
||||
self.prometheus_host_instance_map = (
|
||||
self._build_prometheus_host_instance_map()
|
||||
@ -307,17 +307,21 @@ class PrometheusHelper(base.DataSourceBase):
|
||||
|
||||
if meter == 'node_cpu_seconds_total':
|
||||
query_args = (
|
||||
"100 - (%s by (instance)(rate(%s"
|
||||
"{mode='idle',instance='%s'}[%ss])) * 100)" %
|
||||
(aggregate, meter, instance_label, period)
|
||||
"100 - (%(agg)s by (instance)(rate(%(meter)s"
|
||||
"{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
|
||||
"* 100)"
|
||||
% {'label': self.prometheus_fqdn_label,
|
||||
'label_value': instance_label, 'agg': aggregate,
|
||||
'meter': meter, 'period': period}
|
||||
)
|
||||
elif meter == 'node_memory_MemAvailable_bytes':
|
||||
query_args = (
|
||||
"(node_memory_MemTotal_bytes{instance='%s'} "
|
||||
"- %s_over_time(%s{instance='%s'}[%ss])) "
|
||||
"/ 1024 / 1024" %
|
||||
(instance_label, aggregate, meter,
|
||||
instance_label, period)
|
||||
"(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
|
||||
"- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
|
||||
"[%(period)ss])) / 1024 / 1024"
|
||||
% {'label': self.prometheus_fqdn_label,
|
||||
'label_value': instance_label, 'agg': aggregate,
|
||||
'meter': meter, 'period': period}
|
||||
)
|
||||
elif meter == 'ceilometer_memory_usage':
|
||||
query_args = (
|
||||
|
@ -147,7 +147,8 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
self.assertEqual(expected_cpu_usage, result)
|
||||
mock_prometheus_query.assert_called_once_with(
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',instance='10.0.1.2:9100'}[300s])) * 100)")
|
||||
"{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
|
||||
" * 100)")
|
||||
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, 'query')
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
|
||||
@ -437,15 +438,48 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
'instance': '10.1.2.3:9100', 'job': 'node',
|
||||
}},
|
||||
]}}
|
||||
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
|
||||
'bar.controlplane.domain': '10.1.2.2:9100',
|
||||
'baz.controlplane.domain': '10.1.2.3:9100'}
|
||||
expected_host_map = {'foo': '10.1.2.1:9100',
|
||||
'bar': '10.1.2.2:9100',
|
||||
'baz': '10.1.2.3:9100'}
|
||||
expected_fqdn_list = {'foo.controlplane.domain',
|
||||
'bar.controlplane.domain',
|
||||
'baz.controlplane.domain'}
|
||||
expected_host_map = {'foo': 'foo.controlplane.domain',
|
||||
'bar': 'bar.controlplane.domain',
|
||||
'baz': 'baz.controlplane.domain'}
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
self.assertEqual(helper.prometheus_fqdn_instance_map,
|
||||
expected_fqdn_map)
|
||||
self.assertEqual(helper.prometheus_fqdn_labels,
|
||||
expected_fqdn_list)
|
||||
self.assertEqual(helper.prometheus_host_instance_map,
|
||||
expected_host_map)
|
||||
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
|
||||
def test_build_prometheus_fqdn_host_instance_map_dupl_fqdn(
|
||||
self, mock_prometheus_get):
|
||||
mock_prometheus_get.return_value = {'data': {'activeTargets': [
|
||||
{'labels': {
|
||||
'fqdn': 'foo.controlplane.domain',
|
||||
'instance': '10.1.2.1:9100', 'job': 'node',
|
||||
}},
|
||||
{'labels': {
|
||||
'fqdn': 'foo.controlplane.domain',
|
||||
'instance': '10.1.2.1:9229', 'job': 'podman',
|
||||
}},
|
||||
{'labels': {
|
||||
'fqdn': 'bar.controlplane.domain',
|
||||
'instance': '10.1.2.2:9100', 'job': 'node',
|
||||
}},
|
||||
{'labels': {
|
||||
'fqdn': 'baz.controlplane.domain',
|
||||
'instance': '10.1.2.3:9100', 'job': 'node',
|
||||
}},
|
||||
]}}
|
||||
expected_fqdn_list = {'foo.controlplane.domain',
|
||||
'bar.controlplane.domain',
|
||||
'baz.controlplane.domain'}
|
||||
expected_host_map = {'foo': 'foo.controlplane.domain',
|
||||
'bar': 'bar.controlplane.domain',
|
||||
'baz': 'baz.controlplane.domain'}
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
self.assertEqual(helper.prometheus_fqdn_labels,
|
||||
expected_fqdn_list)
|
||||
self.assertEqual(helper.prometheus_host_instance_map,
|
||||
expected_host_map)
|
||||
|
||||
@ -460,7 +494,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
}},
|
||||
]}}
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
self.assertEqual({}, helper.prometheus_fqdn_instance_map)
|
||||
self.assertEqual(set(), helper.prometheus_fqdn_labels)
|
||||
self.assertEqual({}, helper.prometheus_host_instance_map)
|
||||
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
|
||||
@ -476,12 +510,29 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
}},
|
||||
]}}
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
expected_fqdn_map = {'ena': '10.1.2.1:9100',
|
||||
'dyo': '10.1.2.2:9100'}
|
||||
expected_fqdn_list = {'ena', 'dyo'}
|
||||
self.assertEqual(
|
||||
helper.prometheus_fqdn_instance_map, expected_fqdn_map)
|
||||
helper.prometheus_fqdn_labels, expected_fqdn_list)
|
||||
self.assertEqual({}, helper.prometheus_host_instance_map)
|
||||
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
|
||||
def test_using_ips_not_fqdn(self, mock_prometheus_get):
|
||||
mock_prometheus_get.return_value = {'data': {'activeTargets': [
|
||||
{'labels': {
|
||||
'ip_label': '10.1.2.1',
|
||||
'instance': '10.1.2.1:9100', 'job': 'node',
|
||||
}},
|
||||
{'labels': {
|
||||
'ip_label': '10.1.2.2',
|
||||
'instance': '10.1.2.2:9100', 'job': 'node',
|
||||
}},
|
||||
]}}
|
||||
cfg.CONF.prometheus_client.fqdn_label = 'ip_label'
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
expected_fqdn_list = {'10.1.2.1', '10.1.2.2'}
|
||||
self.assertEqual(
|
||||
helper.prometheus_fqdn_labels, expected_fqdn_list)
|
||||
|
||||
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
|
||||
def test_override_prometheus_fqdn_label(self, mock_prometheus_get):
|
||||
mock_prometheus_get.return_value = {'data': {'activeTargets': [
|
||||
@ -494,19 +545,19 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
'instance': '10.1.2.2:9100', 'job': 'node',
|
||||
}},
|
||||
]}}
|
||||
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
|
||||
'bar.controlplane.domain': '10.1.2.2:9100'}
|
||||
expected_host_map = {'foo': '10.1.2.1:9100',
|
||||
'bar': '10.1.2.2:9100'}
|
||||
expected_fqdn_list = {'foo.controlplane.domain',
|
||||
'bar.controlplane.domain'}
|
||||
expected_host_map = {'foo': 'foo.controlplane.domain',
|
||||
'bar': 'bar.controlplane.domain'}
|
||||
cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label'
|
||||
helper = prometheus_helper.PrometheusHelper()
|
||||
self.assertEqual(helper.prometheus_fqdn_instance_map,
|
||||
expected_fqdn_map)
|
||||
self.assertEqual(helper.prometheus_fqdn_labels,
|
||||
expected_fqdn_list)
|
||||
self.assertEqual(helper.prometheus_host_instance_map,
|
||||
expected_host_map)
|
||||
|
||||
def test_resolve_prometheus_instance_label(self):
|
||||
expected_instance_label = '10.0.1.2:9100'
|
||||
expected_instance_label = 'marios-env.controlplane.domain'
|
||||
result = self.helper._resolve_prometheus_instance_label(
|
||||
'marios-env.controlplane.domain')
|
||||
self.assertEqual(result, expected_instance_label)
|
||||
@ -525,7 +576,7 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
def test_build_prometheus_query_node_cpu_avg_agg(self):
|
||||
expected_query = (
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',instance='a_host'}[111s])) * 100)")
|
||||
"{mode='idle',fqdn='a_host'}[111s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'avg', 'node_cpu_seconds_total', 'a_host', '111')
|
||||
self.assertEqual(result, expected_query)
|
||||
@ -533,15 +584,15 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
def test_build_prometheus_query_node_cpu_max_agg(self):
|
||||
expected_query = (
|
||||
"100 - (max by (instance)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',instance='b_host'}[444s])) * 100)")
|
||||
"{mode='idle',fqdn='b_host'}[444s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'max', 'node_cpu_seconds_total', 'b_host', '444')
|
||||
self.assertEqual(result, expected_query)
|
||||
|
||||
def test_build_prometheus_query_node_memory_avg_agg(self):
|
||||
expected_query = (
|
||||
"(node_memory_MemTotal_bytes{instance='c_host'} - avg_over_time"
|
||||
"(node_memory_MemAvailable_bytes{instance='c_host'}[555s])) "
|
||||
"(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
|
||||
"(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
|
||||
"/ 1024 / 1024")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
|
||||
@ -549,8 +600,27 @@ class TestPrometheusHelper(base.BaseTestCase):
|
||||
|
||||
def test_build_prometheus_query_node_memory_min_agg(self):
|
||||
expected_query = (
|
||||
"(node_memory_MemTotal_bytes{instance='d_host'} - min_over_time"
|
||||
"(node_memory_MemAvailable_bytes{instance='d_host'}[222s])) "
|
||||
"(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
|
||||
"(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
|
||||
"/ 1024 / 1024")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
|
||||
self.assertEqual(result, expected_query)
|
||||
|
||||
def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
|
||||
self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
|
||||
expected_query = (
|
||||
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
|
||||
"{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'avg', 'node_cpu_seconds_total', 'a_host', '111')
|
||||
self.assertEqual(result, expected_query)
|
||||
|
||||
def test_build_prometheus_query_node_memory_min_agg_custom_label(self):
|
||||
self.helper.prometheus_fqdn_label = 'custom_fqdn'
|
||||
expected_query = (
|
||||
"(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
|
||||
"(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
|
||||
"/ 1024 / 1024")
|
||||
result = self.helper._build_prometheus_query(
|
||||
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
|
||||
|
Loading…
x
Reference in New Issue
Block a user