Fix ER bot to report back to gerrit with bug/error report

Examples of comments: https://review.rdoproject.org/r/c/testproject/+/36119/2#message-501d5e016218453253b2af37c27b16ec03a90474

Change-Id: Ic6d4c7a5714151e630100e6daf342c830061c10b
This commit is contained in:
frenzyfriday 2021-08-23 15:10:07 +02:00 committed by Arx Cruz
parent 760a90296e
commit 4e4ae3e36d
14 changed files with 69 additions and 104 deletions

View File

@ -10,16 +10,16 @@
[ircbot]
# nick=${IRC_NICK}
# pass=${IRC_PASS}
server=irc.freenode.net
server=irc.oftc.net
port=6667
channel_config=recheckwatchbot.yaml
# log_config=${LOG_CONFIG}
[gerrit]
# host=${GERRIT_HOST}
# user=${GERRIT_USER}
# query_file=${GERRIT_QUERY_FILE}
# key=data/id_rsa
host=${GERRIT_HOST}
user=${GERRIT_USER}
query_file=${GERRIT_QUERY_FILE}
key=/root/.ssh/id_rsa
[data_source]
es_url=${ES_URL}

View File

@ -1,5 +1,5 @@
channels:
openstack-neutron:
oooq:
projects:
# elastic-recheck doesn't allow limiting reports to patches from a specific repo,
# so let's at least scope to bugs that neutron team acknowledged ownership for
@ -10,7 +10,7 @@ channels:
# we may revisit it once elastic-recheck learns how to limit the scope of reports
# to particular repositories
- positive
openstack-qa:
oooq:
projects:
- all
events:
@ -25,14 +25,14 @@ messages:
%(bugs)s
footer: >-
For more details on this and other bugs, please see
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
recheck_instructions: >-
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
comment to run the tests again.
unrecognized: >-
Some of the tests failed in a way that we did not understand. Please help
us classify these issues so that they can be part of Elastic Recheck
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
no_bugs_found: >-
I noticed Zuul failed, refer to:
https://docs.openstack.org/infra/manual/developers.html#automated-testing

View File

@ -22,7 +22,7 @@ services:
container_name: er-bot
image: localhost/elastic-recheck
working_dir: /root
command: /usr/bin/elastic-recheck -f elastic-recheck.conf
command: /usr/bin/elastic-recheck --noirc -f elastic-recheck.conf
environment:
- DB_URI
- ES_URL

View File

@ -4,7 +4,7 @@ nick=RecheckWatchBot
pass=
server=irc.freenode.net
port=6667
channel_config=/home/mtreinish/elasticRecheck/recheckwatchbot.yaml
channel_config=data/recheckwatchbot.yaml
[recheckwatch]
#Any project that has a job that matches this regex will have all their
@ -13,10 +13,10 @@ jobs_re=dsvm
ci_username=jenkins
[gerrit]
user=treinish
user=os-tripleo-ci
host=review.opendev.org
query_file=/home/mtreinish/elasticRecheck/queries
key=/home/mtreinish/.ssh/id_rsa
query_file=/opt/elastic-recheck/queries
key=/root/.ssh/id_rsa
[data_source]
es_url=http://logstash.openstack.org:80/elasticsearch

View File

@ -209,16 +209,16 @@ class RecheckWatch(threading.Thread):
if not event.get_all_bugs():
self._read(event)
else:
self._read(event)
stream.leave_comment(
event,
self.msgs,
debug=not self.commenting)
except er.ResultTimedOut as e:
self.log.warning(e.message)
self._read(msg=e.message)
except Exception:
self.log.exception("Uncaught exception processing event.")
self.log.warning(e.args[0])
self._read(msg=e.args[0])
except Exception as exp:
self.log.exception("Uncaught exception processing event: %s",
str(exp))
class MessageConfig(dict):

View File

@ -30,18 +30,18 @@ DEFAULTS = {
'DB_URI': 'mysql+pymysql://query:query@logstash.openstack.org/subunit2sql',
'server_password': '',
'CI_USERNAME': 'jenkins',
'JOBS_RE': 'dsvm',
'JOBS_RE': '(dsvm|tripleo|tox)',
'PID_FN': '/var/run/elastic-recheck/elastic-recheck.pid',
'INDEX_FORMAT': r'logstash-%Y.%m.%d',
'GERRIT_QUERY_FILE': 'queries',
'GERRIT_HOST': 'review.opendev.org',
'GERRIT_HOST': 'review.rdoproject.org',
'GERRIT_USER': None,
'IRC_LOG_CONFIG': '',
'IRC_SERVER': "irc.freenode.net",
'IRC_SERVER': "irc.oftc.net",
'IRC_PORT': "6667",
'IRC_PASS': "",
'IRC_SERVER_PASSWORD': "",
'IRC_NICK': "",
'IRC_SERVER_PASSWORD': "erbot",
'IRC_NICK': "erbot",
}
# Not all teams actively used elastic recheck for categorizing their

View File

@ -20,6 +20,7 @@ import time
import dateutil.parser as dp
import gerritlib.gerrit
import elasticsearch
import requests
import sqlalchemy
from sqlalchemy import orm
from subunit2sql.db import api as db_api
@ -31,30 +32,8 @@ from elastic_recheck import results
def required_files(job):
files = []
if re.match("(tempest|grenade)-dsvm", job):
files.extend([
'logs/screen-n-api.txt',
'logs/screen-n-cpu.txt',
'logs/screen-n-sch.txt',
'logs/screen-g-api.txt',
'logs/screen-c-api.txt',
'logs/screen-c-vol.txt',
'logs/syslog.txt'])
# we could probably add more neutron files
# but currently only q-svc is used in queries
if re.match("neutron", job):
files.extend([
'logs/screen-q-svc.txt',
])
else:
files.extend([
'logs/screen-n-net.txt',
])
# make sure that grenade logs exist
if re.match("grenade", job):
files.extend(['logs/grenade.sh.txt'])
files = ["job-output.txt"]
# Can add more files for specific jobs here
return files
@ -130,7 +109,8 @@ class FailEvent(object):
bugs = self.get_all_bugs()
if not bugs:
return None
urls = ['https://bugs.launchpad.net/bugs/%s' % x for
# Return the health dashboard link, since not all of our queries may have a bug
urls = ['http://ci-health-rdo.tripleo.org/#%s' % x for
x in bugs]
return urls
@ -168,7 +148,9 @@ class FailEvent(object):
# Assume one queue per gerrit event
if len(self.failed_jobs) == 0:
return None
return self.failed_jobs[0].url.split('/')[6]
if len(self.failed_jobs[0].url.split('/')) >= 7:
return self.failed_jobs[0].url.split('/')[6]
return None
def build_short_uuids(self):
return [job.build_short_uuid for job in self.failed_jobs]
@ -225,7 +207,7 @@ class Stream(object):
# these items. It's orthogonal to non-voting ES searching.
if " (non-voting)" in line:
continue
m = re.search(r"- ([\w-]+)\s*(http://\S+)\s*:\s*FAILURE", line)
m = re.search(r"([\w-]+)\s*(https?://\S+)\s*:\s*FAILURE", line)
if m:
failed_tests.append(FailJob(m.group(1), m.group(2)))
return failed_tests
@ -243,7 +225,7 @@ class Stream(object):
def _has_required_files(self, change, patch, name, build_short_uuid):
query = qb.files_ready(change, patch, name, build_short_uuid)
r = self.es.search(query, size='80', recent=True)
files = [x['term'] for x in r.terms]
files = [x["_source"]["filename"] for x in r.hits["hits"]]
# TODO(dmsimard): Reliably differentiate zuul v2 and v3 jobs
required = required_files(name)
missing_files = [x for x in required if x not in files]
@ -255,11 +237,12 @@ class Stream(object):
def _does_es_have_data(self, event):
"""Wait till ElasticSearch is ready, but return False if timeout."""
# We wait 20 minutes wall time since receiving the event until we
# treat the logs as missing
timeout = 1200
# Wait 40 seconds between queries.
sleep_time = 40
# We wait 5 minutes wall time since receiving the event until we
# treat the logs as missing. And for now 5 minutes is enough since we
# don't have too many jobs being collected.
timeout = 300
# Wait 300 seconds between queries.
sleep_time = 300
timed_out = False
job = None
# This checks that we've got the console log uploaded, need to retry
@ -329,6 +312,7 @@ class Stream(object):
# bail if the failure is from a project
# that hasn't run any of the included jobs
if not fevent.is_included_job():
self.log.debug("Ignored comment: %s", fevent.comment)
continue
self.log.info("Looking for failures in %d,%d on %s",
@ -405,26 +389,27 @@ class Classifier(object):
bug_matches = []
engine = sqlalchemy.create_engine(self.config.db_uri)
Session = orm.sessionmaker(bind=engine)
session = Session()
Session()
for x in self.queries:
if x.get('suppress-notification'):
continue
self.log.debug(
"Looking for bug: https://bugs.launchpad.net/bugs/%s",
x['bug'])
query = qb.single_patch(x['query'], change_number, patch_number,
build_short_uuid)
results = self.es.search(query, size='10', recent=recent)
if len(results) > 0:
if x.get('test_ids', None):
test_ids = x['test_ids']
self.log.debug(
"For bug %s checking subunit2sql for failures on "
"test_ids: %s", x['bug'], test_ids)
if check_failed_test_ids_for_job(build_short_uuid,
test_ids, session):
bug_matches.append(x['bug'])
response = requests.get("https://bugs.launchpad.net/bugs/" +
x['bug'])
if response.status_code != 200:
bug_matches.append(x['bug'] +
": " +
x.get('msg',
re.escape(x.get('query', ""))))
else:
bug_matches.append(x['bug'])
bug_matches.append(x['bug'] + ": " + x.get(
'msg', re.escape(x.get('query', ""))))
return bug_matches

View File

@ -22,7 +22,7 @@ import os.path
import yaml
def load(directory='queries'):
def load(directory="/opt/elastic-recheck/queries"):
"""Load queries from a set of yaml files in a directory."""
bugs = glob.glob("%s/*.yaml" % directory)
data = []

View File

@ -53,7 +53,8 @@ def generic(raw_query, facet=None):
if isinstance(facet, list):
data = dict(fields=facet, size=200)
query['facets'] = {
# facets moved to aggs
query['aggs'] = {
"tag": {
"terms": data
}
@ -77,15 +78,16 @@ def result_ready(change, patchset, name, short_uuid):
"""
# TODO(dmsimard): Revisit this query once Zuul v2 is no longer supported
# Let's value legibility over pep8 line width here...
# build_short_uuid doesn't return the whole uuid in rdo es
query = (
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"project-config/playbooks/base/post-ssh.yaml")' # noqa E501
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"post.yaml")' # noqa E501
' OR '
'(filename:"console.html" AND (message:"[Zuul] Job complete" OR message:"[SCP] Copying console log" OR message:"Grabbing consoleLog")))' # noqa E501
' AND build_status:"FAILURE"'
' AND build_change:"{change}"'
' AND build_patchset:"{patchset}"'
' AND build_name:"{name}"'
' AND build_short_uuid:"{short_uuid}"'
' AND build_uuid:"{short_uuid}"'
)
return generic(query.format(
change=change,
@ -107,9 +109,9 @@ def files_ready(review, patch, name, build_short_uuid):
'AND build_change:"%s" '
'AND build_patchset:"%s" '
'AND build_name:"%s" '
'AND build_short_uuid:%s' %
'AND build_uuid:%s' %
(review, patch, name, build_short_uuid),
facet='filename')
facet='filename.name')
def single_patch(query, review, patch, build_short_uuid):
@ -121,7 +123,7 @@ def single_patch(query, review, patch, build_short_uuid):
return generic('%s '
'AND build_change:"%s" '
'AND build_patchset:"%s" '
'AND build_short_uuid:%s' %
'AND build_uuid:%s' %
(query, review, patch, build_short_uuid))

View File

@ -115,7 +115,7 @@ class TestBotWithTestTools(tests.TestCase):
reference = ("openstack/keystone change: https://review.opendev.org/"
"64750 failed because of: "
"gate-keystone-python26: "
"https://bugs.launchpad.net/bugs/123456, "
"http://ci-health-rdo.tripleo.org/#123456, "
"gate-keystone-python27: unrecognized error")
self.assertEqual(reference, msg)

View File

@ -56,25 +56,3 @@ class TestSubunit2sqlCrossover(unit.UnitTestCase):
['test1', 'test4'],
mock.sentinel.session)
self.assertFalse(res)
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=True)
def test_classify_with_test_id_filter_match(self, mock_id_check):
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
es_mock.start()
self.addCleanup(es_mock.stop)
res = c.classify(1234, 1, 'fake')
self.assertEqual(res, ['1234567'],
"classify() returned %s when it should have returned "
"a list with one bug id: '1234567'" % res)
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=False)
def test_classify_with_test_id_filter_no_match(self, mock_id_check):
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
es_mock.start()
self.addCleanup(es_mock.stop)
res = c.classify(1234, 1, 'fake')
self.assertEqual(res, [],
"classify() returned bug matches %s when none should "
"have been found" % res)

View File

@ -152,10 +152,10 @@ class TestStream(tests.TestCase):
self.assertTrue(event.is_included_job())
self.assertEqual(event.queue(), "gate")
self.assertEqual(event.bug_urls(),
['https://bugs.launchpad.net/bugs/123456'])
['http://ci-health-rdo.tripleo.org/#123456'])
errors = ['gate-keystone-python27: unrecognized error',
'gate-keystone-python26: '
'https://bugs.launchpad.net/bugs/123456']
'http://ci-health-rdo.tripleo.org/#123456']
bug_map = event.bug_urls_map()
for error in errors:
self.assertIn(error, bug_map)
@ -180,10 +180,10 @@ class TestStream(tests.TestCase):
self.assertTrue(event.is_included_job())
self.assertEqual(event.queue(), "check")
self.assertEqual(event.bug_urls(),
['https://bugs.launchpad.net/bugs/123456'])
['http://ci-health-rdo.tripleo.org/#123456'])
self.assertEqual(event.bug_urls_map(),
['gate-keystone-python26: '
'https://bugs.launchpad.net/bugs/123456',
'http://ci-health-rdo.tripleo.org/#123456',
'gate-keystone-python27: unrecognized error'])
self.assertEqual(sorted(event.failed_job_names()),
['gate-keystone-python26',

View File

@ -23,14 +23,14 @@ messages:
%(bugs)s
footer: >-
For more details on this and other bugs, please see
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
recheck_instructions: >-
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
comment to run the tests again.
unrecognized: >-
Some of the tests failed in a way that we did not understand. Please help
us classify these issues so that they can be part of Elastic Recheck
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
no_bugs_found: >-
I noticed Zuul failed, refer to:
http://docs.openstack.org/infra/manual/developers.html#automated-testing
http://docs.openstack.org/infra/manual/developers.html#automated-testing