From 2caaa989cea93e6fb8c6b16f5c8685f30a6e6cc0 Mon Sep 17 00:00:00 2001
From: Artem Goncharov <Artem.goncharov@gmail.com>
Date: Tue, 12 May 2020 09:59:04 +0200
Subject: [PATCH] Extend statistics reporting

In some cases it is desired to extend stat metrics with additional tags
based on the selected connection (i.e. "environment"). Statsd does not
support tags, so add this only to influxdb for now.
Fix establishing connection from parameters when passing influx_config.
When exception happens, we need to also generate metric to be able to
see errors (i.e. timeout happening from Ansible)

I know there are no tests for that area so far at all, this will come later.

Change-Id: Ie0862f04eb224345559f9092cd0a9d8ffa43bef3
---
 doc/source/user/guides/stats.rst |  12 +++
 openstack/config/loader.py       |  41 +++++-----
 openstack/connection.py          |   8 ++
 openstack/proxy.py               | 133 ++++++++++++++++++++-----------
 4 files changed, 129 insertions(+), 65 deletions(-)

diff --git a/doc/source/user/guides/stats.rst b/doc/source/user/guides/stats.rst
index d01ef8b4f..537a12707 100644
--- a/doc/source/user/guides/stats.rst
+++ b/doc/source/user/guides/stats.rst
@@ -57,3 +57,15 @@ Metrics will be reported only when corresponding client libraries (
 `statsd` for 'statsd' reporting, `influxdb` for influxdb reporting
 correspondingly). When those libraries are not available reporting will be
 silently ignored.
+
+InfluxDB reporting allows setting additional tags into the metrics based on the
+selected cloud.
+
+.. code-block:: yaml
+
+  clouds:
+    my_cloud:
+      profile: some_profile
+      ...
+      additional_metric_tags:
+        environment: production
diff --git a/openstack/config/loader.py b/openstack/config/loader.py
index 50bc22adf..6e931d663 100644
--- a/openstack/config/loader.py
+++ b/openstack/config/loader.py
@@ -289,25 +289,28 @@ class OpenStackConfig:
 
             influxdb_cfg = metrics_config.get('influxdb', {})
             # Parse InfluxDB configuration
-            if influxdb_config:
-                influxdb_cfg.update(influxdb_config)
-            if influxdb_cfg:
-                config = {}
-                if 'use_udp' in influxdb_cfg:
-                    use_udp = influxdb_cfg['use_udp']
-                    if isinstance(use_udp, str):
-                        use_udp = use_udp.lower() in ('true', 'yes', '1')
-                    elif not isinstance(use_udp, bool):
-                        use_udp = False
-                        self.log.warning('InfluxDB.use_udp value type is not '
-                                         'supported. Use one of '
-                                         '[true|false|yes|no|1|0]')
-                    config['use_udp'] = use_udp
-                for key in ['host', 'port', 'username', 'password', 'database',
-                            'measurement', 'timeout']:
-                    if key in influxdb_cfg:
-                        config[key] = influxdb_cfg[key]
-                self._influxdb_config = config
+            if not influxdb_config:
+                influxdb_config = influxdb_cfg
+            else:
+                influxdb_config.update(influxdb_cfg)
+
+        if influxdb_config:
+            config = {}
+            if 'use_udp' in influxdb_config:
+                use_udp = influxdb_config['use_udp']
+                if isinstance(use_udp, str):
+                    use_udp = use_udp.lower() in ('true', 'yes', '1')
+                elif not isinstance(use_udp, bool):
+                    use_udp = False
+                    self.log.warning('InfluxDB.use_udp value type is not '
+                                     'supported. Use one of '
+                                     '[true|false|yes|no|1|0]')
+                config['use_udp'] = use_udp
+            for key in ['host', 'port', 'username', 'password', 'database',
+                        'measurement', 'timeout']:
+                if key in influxdb_config:
+                    config[key] = influxdb_config[key]
+            self._influxdb_config = config
 
         if load_envvars:
             statsd_host = statsd_host or os.environ.get('STATSD_HOST')
diff --git a/openstack/connection.py b/openstack/connection.py
index 3d39440d4..eaade0c9c 100644
--- a/openstack/connection.py
+++ b/openstack/connection.py
@@ -435,6 +435,14 @@ class Connection(
                 self.log.warning('Configured hook %s cannot be executed: %s',
                                  vendor_hook, e)
 
+        # Add additional metrics into the configuration according to the
+        # selected connection. We don't want to deal with overall config in the
+        # proxy, just pass required part.
+        if (self.config._influxdb_config
+                and 'additional_metric_tags' in self.config.config):
+            self.config._influxdb_config['additional_metric_tags'] = \
+                self.config.config['additional_metric_tags']
+
     @property
     def session(self):
         if not self._session:
diff --git a/openstack/proxy.py b/openstack/proxy.py
index dbb246399..a8812de70 100644
--- a/openstack/proxy.py
+++ b/openstack/proxy.py
@@ -91,15 +91,23 @@ class Proxy(adapter.Adapter):
             if conn:
                 # Per-request setting should take precedence
                 global_request_id = conn._global_request_id
-        response = super(Proxy, self).request(
-            url, method,
-            connect_retries=connect_retries, raise_exc=raise_exc,
-            global_request_id=global_request_id,
-            **kwargs)
-        for h in response.history:
-            self._report_stats(h)
-        self._report_stats(response)
-        return response
+        try:
+            response = super(Proxy, self).request(
+                url, method,
+                connect_retries=connect_retries, raise_exc=raise_exc,
+                global_request_id=global_request_id,
+                **kwargs)
+            for h in response.history:
+                self._report_stats(h)
+            self._report_stats(response)
+            return response
+        except Exception as e:
+            # If we want metrics to be generated we also need to generate some
+            # in case of exceptions as well, so that timeouts and connection
+            # problems (especially when called from ansible) are being
+            # generated as well.
+            self._report_stats(None, url, method, e)
+            raise
 
     def _extract_name(self, url, service_type=None, project_id=None):
         '''Produce a key name to use in logging/metrics from the URL path.
@@ -185,58 +193,91 @@ class Proxy(adapter.Adapter):
 
         return name_parts
 
-    def _report_stats(self, response):
+    def _report_stats(self, response, url=None, method=None, exc=None):
         if self._statsd_client:
-            self._report_stats_statsd(response)
+            self._report_stats_statsd(response, url, method, exc)
         if self._prometheus_counter and self._prometheus_histogram:
-            self._report_stats_prometheus(response)
+            self._report_stats_prometheus(response, url, method, exc)
         if self._influxdb_client:
-            self._report_stats_influxdb(response)
+            self._report_stats_influxdb(response, url, method, exc)
 
-    def _report_stats_statsd(self, response):
-        name_parts = self._extract_name(response.request.url,
+    def _report_stats_statsd(self, response, url=None, method=None, exc=None):
+        if response is not None and not url:
+            url = response.request.url
+        if response is not None and not method:
+            method = response.request.method
+        name_parts = self._extract_name(url,
                                         self.service_type,
                                         self.session.get_project_id())
         key = '.'.join(
-            [self._statsd_prefix, self.service_type, response.request.method]
+            [self._statsd_prefix, self.service_type, method]
             + name_parts)
-        self._statsd_client.timing(key, int(
-            response.elapsed.microseconds / 1000))
-        self._statsd_client.incr(key)
+        if response is not None:
+            duration = int(response.elapsed.microseconds / 1000)
+            self._statsd_client.timing(key, duration)
+            self._statsd_client.incr(key)
+        elif exc is not None:
+            self._statsd_client.incr('%s.failed' % key)
 
-    def _report_stats_prometheus(self, response):
-        labels = dict(
-            method=response.request.method,
-            endpoint=response.request.url,
-            service_type=self.service_type,
-            status_code=response.status_code,
-        )
-        self._prometheus_counter.labels(**labels).inc()
-        self._prometheus_histogram.labels(**labels).observe(
-            response.elapsed.microseconds / 1000)
+    def _report_stats_prometheus(self, response, url=None, method=None,
+                                 exc=None):
+        if response is not None and not url:
+            url = response.request.url
+        if response is not None and not method:
+            method = response.request.method
+        if response is not None:
+            labels = dict(
+                method=method,
+                endpoint=url,
+                service_type=self.service_type,
+                status_code=response.status_code,
+            )
+            self._prometheus_counter.labels(**labels).inc()
+            self._prometheus_histogram.labels(**labels).observe(
+                response.elapsed.microseconds / 1000)
 
-    def _report_stats_influxdb(self, response):
+    def _report_stats_influxdb(self, response, url=None, method=None,
+                               exc=None):
         # NOTE(gtema): status_code is saved both as tag and field to give
         # ability showing it as a value and not only as a legend.
         # However Influx is not ok with having same name in tags and fields,
         # therefore use different names.
+        if response is not None and not url:
+            url = response.request.url
+        if response is not None and not method:
+            method = response.request.method
+        tags = dict(
+            method=method,
+            name='_'.join(self._extract_name(
+                url, self.service_type,
+                self.session.get_project_id()))
+        )
+        fields = dict(
+            attempted=1
+        )
+        if response is not None:
+            fields['duration'] = int(response.elapsed.microseconds / 1000)
+            tags['status_code'] = str(response.status_code)
+            # Note(gtema): emit also status_code as a value (counter)
+            fields[str(response.status_code)] = 1
+            fields['%s.%s' % (method, response.status_code)] = 1
+            # Note(gtema): status_code field itself is also very helpful on the
+            # graphs to show what was the code, instead of counting its
+            # occurences
+            fields['status_code_val'] = response.status_code
+        elif exc:
+            fields['failed'] = 1
+        if 'additional_metric_tags' in self._influxdb_config:
+            tags.update(self._influxdb_config['additional_metric_tags'])
+        measurement = self._influxdb_config.get(
+            'measurement', 'openstack_api') \
+            if self._influxdb_config else 'openstack_api'
+        # Note(gtema) append service name into the measurement name
+        measurement = '%s.%s' % (measurement, self.service_type)
         data = [dict(
-            measurement=(self._influxdb_config.get('measurement',
-                                                   'openstack_api')
-                         if self._influxdb_config else 'openstack_api'),
-            tags=dict(
-                method=response.request.method,
-                service_type=self.service_type,
-                status_code=response.status_code,
-                name='_'.join(self._extract_name(
-                    response.request.url, self.service_type,
-                    self.session.get_project_id())
-                )
-            ),
-            fields=dict(
-                duration=int(response.elapsed.microseconds / 1000),
-                status_code_val=int(response.status_code)
-            )
+            measurement=measurement,
+            tags=tags,
+            fields=fields
         )]
         try:
             self._influxdb_client.write_points(data)