Fix ER bot to report back to gerrit with bug/error report

Examples of comments: https://review.rdoproject.org/r/c/testproject/+/36119/2#message-501d5e016218453253b2af37c27b16ec03a90474

Change-Id: Ic6d4c7a5714151e630100e6daf342c830061c10b
This commit is contained in:
frenzyfriday 2021-08-23 15:10:07 +02:00 committed by Arx Cruz
parent 760a90296e
commit 4e4ae3e36d
14 changed files with 69 additions and 104 deletions

View File

@ -10,16 +10,16 @@
[ircbot]
# nick=${IRC_NICK}
# pass=${IRC_PASS}
server=irc.freenode.net
server=irc.oftc.net
port=6667
channel_config=recheckwatchbot.yaml
# log_config=${LOG_CONFIG}
[gerrit]
# host=${GERRIT_HOST}
# user=${GERRIT_USER}
# query_file=${GERRIT_QUERY_FILE}
# key=data/id_rsa
host=${GERRIT_HOST}
user=${GERRIT_USER}
query_file=${GERRIT_QUERY_FILE}
key=/root/.ssh/id_rsa
[data_source]
es_url=${ES_URL}

View File

@ -1,5 +1,5 @@
channels:
openstack-neutron:
oooq:
projects:
# elastic-recheck doesn't allow limiting reports to patches from a specific repo,
# so let's at least scope to bugs that neutron team acknowledged ownership for
@ -10,7 +10,7 @@ channels:
# we may revisit it once elastic-recheck learns how to limit the scope of reports
# to particular repositories
- positive
openstack-qa:
oooq:
projects:
- all
events:
@ -25,14 +25,14 @@ messages:
%(bugs)s
footer: >-
For more details on this and other bugs, please see
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
recheck_instructions: >-
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
comment to run the tests again.
unrecognized: >-
Some of the tests failed in a way that we did not understand. Please help
us classify these issues so that they can be part of Elastic Recheck
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
no_bugs_found: >-
I noticed Zuul failed, refer to:
https://docs.openstack.org/infra/manual/developers.html#automated-testing

View File

@ -22,7 +22,7 @@ services:
container_name: er-bot
image: localhost/elastic-recheck
working_dir: /root
command: /usr/bin/elastic-recheck -f elastic-recheck.conf
command: /usr/bin/elastic-recheck --noirc -f elastic-recheck.conf
environment:
- DB_URI
- ES_URL

View File

@ -4,7 +4,7 @@ nick=RecheckWatchBot
pass=
server=irc.freenode.net
port=6667
channel_config=/home/mtreinish/elasticRecheck/recheckwatchbot.yaml
channel_config=data/recheckwatchbot.yaml
[recheckwatch]
#Any project that has a job that matches this regex will have all their
@ -13,10 +13,10 @@ jobs_re=dsvm
ci_username=jenkins
[gerrit]
user=treinish
user=os-tripleo-ci
host=review.opendev.org
query_file=/home/mtreinish/elasticRecheck/queries
key=/home/mtreinish/.ssh/id_rsa
query_file=/opt/elastic-recheck/queries
key=/root/.ssh/id_rsa
[data_source]
es_url=http://logstash.openstack.org:80/elasticsearch

View File

@ -209,16 +209,16 @@ class RecheckWatch(threading.Thread):
if not event.get_all_bugs():
self._read(event)
else:
self._read(event)
stream.leave_comment(
event,
self.msgs,
debug=not self.commenting)
except er.ResultTimedOut as e:
self.log.warning(e.message)
self._read(msg=e.message)
except Exception:
self.log.exception("Uncaught exception processing event.")
self.log.warning(e.args[0])
self._read(msg=e.args[0])
except Exception as exp:
self.log.exception("Uncaught exception processing event: %s",
str(exp))
class MessageConfig(dict):

View File

@ -30,18 +30,18 @@ DEFAULTS = {
'DB_URI': 'mysql+pymysql://query:query@logstash.openstack.org/subunit2sql',
'server_password': '',
'CI_USERNAME': 'jenkins',
'JOBS_RE': 'dsvm',
'JOBS_RE': '(dsvm|tripleo|tox)',
'PID_FN': '/var/run/elastic-recheck/elastic-recheck.pid',
'INDEX_FORMAT': r'logstash-%Y.%m.%d',
'GERRIT_QUERY_FILE': 'queries',
'GERRIT_HOST': 'review.opendev.org',
'GERRIT_HOST': 'review.rdoproject.org',
'GERRIT_USER': None,
'IRC_LOG_CONFIG': '',
'IRC_SERVER': "irc.freenode.net",
'IRC_SERVER': "irc.oftc.net",
'IRC_PORT': "6667",
'IRC_PASS': "",
'IRC_SERVER_PASSWORD': "",
'IRC_NICK': "",
'IRC_SERVER_PASSWORD': "erbot",
'IRC_NICK': "erbot",
}
# Not all teams actively used elastic recheck for categorizing their

View File

@ -20,6 +20,7 @@ import time
import dateutil.parser as dp
import gerritlib.gerrit
import elasticsearch
import requests
import sqlalchemy
from sqlalchemy import orm
from subunit2sql.db import api as db_api
@ -31,30 +32,8 @@ from elastic_recheck import results
def required_files(job):
files = []
if re.match("(tempest|grenade)-dsvm", job):
files.extend([
'logs/screen-n-api.txt',
'logs/screen-n-cpu.txt',
'logs/screen-n-sch.txt',
'logs/screen-g-api.txt',
'logs/screen-c-api.txt',
'logs/screen-c-vol.txt',
'logs/syslog.txt'])
# we could probably add more neutron files
# but currently only q-svc is used in queries
if re.match("neutron", job):
files.extend([
'logs/screen-q-svc.txt',
])
else:
files.extend([
'logs/screen-n-net.txt',
])
# make sure that grenade logs exist
if re.match("grenade", job):
files.extend(['logs/grenade.sh.txt'])
files = ["job-output.txt"]
# Can add more files for specific jobs here
return files
@ -130,7 +109,8 @@ class FailEvent(object):
bugs = self.get_all_bugs()
if not bugs:
return None
urls = ['https://bugs.launchpad.net/bugs/%s' % x for
# Return the health dashboard link, since not all of our queries may have a bug
urls = ['http://ci-health-rdo.tripleo.org/#%s' % x for
x in bugs]
return urls
@ -168,7 +148,9 @@ class FailEvent(object):
# Assume one queue per gerrit event
if len(self.failed_jobs) == 0:
return None
return self.failed_jobs[0].url.split('/')[6]
if len(self.failed_jobs[0].url.split('/')) >= 7:
return self.failed_jobs[0].url.split('/')[6]
return None
def build_short_uuids(self):
return [job.build_short_uuid for job in self.failed_jobs]
@ -225,7 +207,7 @@ class Stream(object):
# these items. It's orthogonal to non-voting ES searching.
if " (non-voting)" in line:
continue
m = re.search(r"- ([\w-]+)\s*(http://\S+)\s*:\s*FAILURE", line)
m = re.search(r"([\w-]+)\s*(https?://\S+)\s*:\s*FAILURE", line)
if m:
failed_tests.append(FailJob(m.group(1), m.group(2)))
return failed_tests
@ -243,7 +225,7 @@ class Stream(object):
def _has_required_files(self, change, patch, name, build_short_uuid):
query = qb.files_ready(change, patch, name, build_short_uuid)
r = self.es.search(query, size='80', recent=True)
files = [x['term'] for x in r.terms]
files = [x["_source"]["filename"] for x in r.hits["hits"]]
# TODO(dmsimard): Reliably differentiate zuul v2 and v3 jobs
required = required_files(name)
missing_files = [x for x in required if x not in files]
@ -255,11 +237,12 @@ class Stream(object):
def _does_es_have_data(self, event):
"""Wait till ElasticSearch is ready, but return False if timeout."""
# We wait 20 minutes wall time since receiving the event until we
# treat the logs as missing
timeout = 1200
# Wait 40 seconds between queries.
sleep_time = 40
# We wait 5 minutes wall time since receiving the event until we
# treat the logs as missing. And for now 5 minutes is enough since we
# don't have too many jobs being collected.
timeout = 300
# Wait 300 seconds between queries.
sleep_time = 300
timed_out = False
job = None
# This checks that we've got the console log uploaded, need to retry
@ -329,6 +312,7 @@ class Stream(object):
# bail if the failure is from a project
# that hasn't run any of the included jobs
if not fevent.is_included_job():
self.log.debug("Ignored comment: %s", fevent.comment)
continue
self.log.info("Looking for failures in %d,%d on %s",
@ -405,26 +389,27 @@ class Classifier(object):
bug_matches = []
engine = sqlalchemy.create_engine(self.config.db_uri)
Session = orm.sessionmaker(bind=engine)
session = Session()
Session()
for x in self.queries:
if x.get('suppress-notification'):
continue
self.log.debug(
"Looking for bug: https://bugs.launchpad.net/bugs/%s",
x['bug'])
query = qb.single_patch(x['query'], change_number, patch_number,
build_short_uuid)
results = self.es.search(query, size='10', recent=recent)
if len(results) > 0:
if x.get('test_ids', None):
test_ids = x['test_ids']
self.log.debug(
"For bug %s checking subunit2sql for failures on "
"test_ids: %s", x['bug'], test_ids)
if check_failed_test_ids_for_job(build_short_uuid,
test_ids, session):
bug_matches.append(x['bug'])
response = requests.get("https://bugs.launchpad.net/bugs/" +
x['bug'])
if response.status_code != 200:
bug_matches.append(x['bug'] +
": " +
x.get('msg',
re.escape(x.get('query', ""))))
else:
bug_matches.append(x['bug'])
bug_matches.append(x['bug'] + ": " + x.get(
'msg', re.escape(x.get('query', ""))))
return bug_matches

View File

@ -22,7 +22,7 @@ import os.path
import yaml
def load(directory='queries'):
def load(directory="/opt/elastic-recheck/queries"):
"""Load queries from a set of yaml files in a directory."""
bugs = glob.glob("%s/*.yaml" % directory)
data = []

View File

@ -53,7 +53,8 @@ def generic(raw_query, facet=None):
if isinstance(facet, list):
data = dict(fields=facet, size=200)
query['facets'] = {
# facets moved to aggs
query['aggs'] = {
"tag": {
"terms": data
}
@ -77,15 +78,16 @@ def result_ready(change, patchset, name, short_uuid):
"""
# TODO(dmsimard): Revisit this query once Zuul v2 is no longer supported
# Let's value legibility over pep8 line width here...
# build_short_uuid doesn't return the whole uuid in rdo es
query = (
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"project-config/playbooks/base/post-ssh.yaml")' # noqa E501
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"post.yaml")' # noqa E501
' OR '
'(filename:"console.html" AND (message:"[Zuul] Job complete" OR message:"[SCP] Copying console log" OR message:"Grabbing consoleLog")))' # noqa E501
' AND build_status:"FAILURE"'
' AND build_change:"{change}"'
' AND build_patchset:"{patchset}"'
' AND build_name:"{name}"'
' AND build_short_uuid:"{short_uuid}"'
' AND build_uuid:"{short_uuid}"'
)
return generic(query.format(
change=change,
@ -107,9 +109,9 @@ def files_ready(review, patch, name, build_short_uuid):
'AND build_change:"%s" '
'AND build_patchset:"%s" '
'AND build_name:"%s" '
'AND build_short_uuid:%s' %
'AND build_uuid:%s' %
(review, patch, name, build_short_uuid),
facet='filename')
facet='filename.name')
def single_patch(query, review, patch, build_short_uuid):
@ -121,7 +123,7 @@ def single_patch(query, review, patch, build_short_uuid):
return generic('%s '
'AND build_change:"%s" '
'AND build_patchset:"%s" '
'AND build_short_uuid:%s' %
'AND build_uuid:%s' %
(query, review, patch, build_short_uuid))

View File

@ -115,7 +115,7 @@ class TestBotWithTestTools(tests.TestCase):
reference = ("openstack/keystone change: https://review.opendev.org/"
"64750 failed because of: "
"gate-keystone-python26: "
"https://bugs.launchpad.net/bugs/123456, "
"http://ci-health-rdo.tripleo.org/#123456, "
"gate-keystone-python27: unrecognized error")
self.assertEqual(reference, msg)

View File

@ -56,25 +56,3 @@ class TestSubunit2sqlCrossover(unit.UnitTestCase):
['test1', 'test4'],
mock.sentinel.session)
self.assertFalse(res)
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=True)
def test_classify_with_test_id_filter_match(self, mock_id_check):
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
es_mock.start()
self.addCleanup(es_mock.stop)
res = c.classify(1234, 1, 'fake')
self.assertEqual(res, ['1234567'],
"classify() returned %s when it should have returned "
"a list with one bug id: '1234567'" % res)
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=False)
def test_classify_with_test_id_filter_no_match(self, mock_id_check):
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
es_mock.start()
self.addCleanup(es_mock.stop)
res = c.classify(1234, 1, 'fake')
self.assertEqual(res, [],
"classify() returned bug matches %s when none should "
"have been found" % res)

View File

@ -152,10 +152,10 @@ class TestStream(tests.TestCase):
self.assertTrue(event.is_included_job())
self.assertEqual(event.queue(), "gate")
self.assertEqual(event.bug_urls(),
['https://bugs.launchpad.net/bugs/123456'])
['http://ci-health-rdo.tripleo.org/#123456'])
errors = ['gate-keystone-python27: unrecognized error',
'gate-keystone-python26: '
'https://bugs.launchpad.net/bugs/123456']
'http://ci-health-rdo.tripleo.org/#123456']
bug_map = event.bug_urls_map()
for error in errors:
self.assertIn(error, bug_map)
@ -180,10 +180,10 @@ class TestStream(tests.TestCase):
self.assertTrue(event.is_included_job())
self.assertEqual(event.queue(), "check")
self.assertEqual(event.bug_urls(),
['https://bugs.launchpad.net/bugs/123456'])
['http://ci-health-rdo.tripleo.org/#123456'])
self.assertEqual(event.bug_urls_map(),
['gate-keystone-python26: '
'https://bugs.launchpad.net/bugs/123456',
'http://ci-health-rdo.tripleo.org/#123456',
'gate-keystone-python27: unrecognized error'])
self.assertEqual(sorted(event.failed_job_names()),
['gate-keystone-python26',

View File

@ -23,14 +23,14 @@ messages:
%(bugs)s
footer: >-
For more details on this and other bugs, please see
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
recheck_instructions: >-
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
comment to run the tests again.
unrecognized: >-
Some of the tests failed in a way that we did not understand. Please help
us classify these issues so that they can be part of Elastic Recheck
http://status.openstack.org/elastic-recheck/
http://ci-health-rdo.tripleo.org/
no_bugs_found: >-
I noticed Zuul failed, refer to:
http://docs.openstack.org/infra/manual/developers.html#automated-testing
http://docs.openstack.org/infra/manual/developers.html#automated-testing