From d3872cfcdd26f6c46d262588e0d79eeab5d4be1c Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Fri, 25 Sep 2020 14:09:43 +0200
Subject: [PATCH] Fix a race condition in the hash ring code

We're handling hash rings and updated_at differently: one
is stored on the class level, the other - on instance. Apparently,
there is a race there, resulting in updated_at never updated.
Store hash rings and updated_at in one tuple, so that they're
always loaded and stored together.

Also remove double loading of the hash ring in _get_ring that
could contribute to the problem.

Change-Id: Ib659014e07549ae3d5ec7e69da318301f5994ca8
---
 ironic/common/hash_ring.py                    | 28 +++++++++----------
 ironic/tests/unit/common/test_hash_ring.py    |  7 +++--
 .../notes/hash-ring-6ce212ab86c2592d.yaml     |  5 ++++
 3 files changed, 24 insertions(+), 16 deletions(-)
 create mode 100644 releasenotes/notes/hash-ring-6ce212ab86c2592d.yaml

diff --git a/ironic/common/hash_ring.py b/ironic/common/hash_ring.py
index 0c2c534a95..1f11070080 100644
--- a/ironic/common/hash_ring.py
+++ b/ironic/common/hash_ring.py
@@ -29,12 +29,11 @@ LOG = log.getLogger(__name__)
 
 
 class HashRingManager(object):
-    _hash_rings = None
+    _hash_rings = (None, 0)
     _lock = threading.Lock()
 
     def __init__(self, use_groups=True, cache=True):
         self.dbapi = dbapi.get_instance()
-        self.updated_at = time.time()
         self.use_groups = use_groups
         self.cache = cache
 
@@ -48,19 +47,19 @@ class HashRingManager(object):
 
         # Hot path, no lock. Using a local variable to avoid races with code
         # changing the class variable.
-        hash_rings = self.__class__._hash_rings
-        if hash_rings is not None and self.updated_at >= limit:
+        hash_rings, updated_at = self.__class__._hash_rings
+        if hash_rings is not None and updated_at >= limit:
             return hash_rings
 
         with self._lock:
-            if self.__class__._hash_rings is None or self.updated_at < limit:
+            hash_rings, updated_at = self.__class__._hash_rings
+            if hash_rings is None or updated_at < limit:
                 LOG.debug('Rebuilding cached hash rings')
-                rings = self._load_hash_rings()
-                self.__class__._hash_rings = rings
-                self.updated_at = time.time()
+                hash_rings = self._load_hash_rings()
+                self.__class__._hash_rings = hash_rings, time.time()
                 LOG.debug('Finished rebuilding hash rings, available drivers '
-                          'are %s', ', '.join(rings))
-            return self.__class__._hash_rings
+                          'are %s', ', '.join(hash_rings))
+            return hash_rings
 
     def _load_hash_rings(self):
         rings = {}
@@ -78,7 +77,7 @@ class HashRingManager(object):
     def reset(cls):
         with cls._lock:
             LOG.debug('Resetting cached hash rings')
-            cls._hash_rings = None
+            cls._hash_rings = (None, 0)
 
     def get_ring(self, driver_name, conductor_group):
         try:
@@ -96,13 +95,14 @@ class HashRingManager(object):
 
     def _get_ring(self, driver_name, conductor_group):
         # There are no conductors, temporary failure - 503 Service Unavailable
-        if not self.ring:
+        ring = self.ring  # a property, don't load twice
+        if not ring:
             raise exception.TemporaryFailure()
 
         try:
             if self.use_groups:
-                return self.ring['%s:%s' % (conductor_group, driver_name)]
-            return self.ring[driver_name]
+                return ring['%s:%s' % (conductor_group, driver_name)]
+            return ring[driver_name]
         except KeyError:
             raise exception.DriverNotFound(
                 _("The driver '%s' is unknown.") % driver_name)
diff --git a/ironic/tests/unit/common/test_hash_ring.py b/ironic/tests/unit/common/test_hash_ring.py
index 5ad5ee8701..882216a8b6 100644
--- a/ironic/tests/unit/common/test_hash_ring.py
+++ b/ironic/tests/unit/common/test_hash_ring.py
@@ -112,7 +112,10 @@ class HashRingManagerTestCase(db_base.DbTestCase):
         # since there is an active conductor for the requested hardware type.
         self.assertEqual(1, len(ring))
 
-        self.ring_manager.updated_at = time.time() - 31
+        self.ring_manager.__class__._hash_rings = (
+            self.ring_manager.__class__._hash_rings[0],
+            time.time() - 31
+        )
         ring = self.ring_manager.get_ring('hardware-type', '')
         self.assertEqual(2, len(ring))
 
@@ -121,7 +124,7 @@ class HashRingManagerTestCase(db_base.DbTestCase):
                                              use_groups=self.use_groups)
         ring = ring_mgr.ring
         self.assertIsNotNone(ring)
-        self.assertIsNone(hash_ring.HashRingManager._hash_rings)
+        self.assertEqual((None, 0), hash_ring.HashRingManager._hash_rings)
 
 
 class HashRingManagerWithGroupsTestCase(HashRingManagerTestCase):
diff --git a/releasenotes/notes/hash-ring-6ce212ab86c2592d.yaml b/releasenotes/notes/hash-ring-6ce212ab86c2592d.yaml
new file mode 100644
index 0000000000..ee75d93cc1
--- /dev/null
+++ b/releasenotes/notes/hash-ring-6ce212ab86c2592d.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes a potential race in the hash ring code that could result in the
+    hash rings never updated after their initial load.