Fix ER bot to report back to gerrit with bug/error report
Examples of comments: https://review.rdoproject.org/r/c/testproject/+/36119/2#message-501d5e016218453253b2af37c27b16ec03a90474

Change-Id: Ic6d4c7a5714151e630100e6daf342c830061c10b
This commit is contained in:
parent
760a90296e
commit
4e4ae3e36d
@ -10,16 +10,16 @@
|
||||
[ircbot]
|
||||
# nick=${IRC_NICK}
|
||||
# pass=${IRC_PASS}
|
||||
server=irc.freenode.net
|
||||
server=irc.oftc.net
|
||||
port=6667
|
||||
channel_config=recheckwatchbot.yaml
|
||||
# log_config=${LOG_CONFIG}
|
||||
|
||||
[gerrit]
|
||||
# host=${GERRIT_HOST}
|
||||
# user=${GERRIT_USER}
|
||||
# query_file=${GERRIT_QUERY_FILE}
|
||||
# key=data/id_rsa
|
||||
host=${GERRIT_HOST}
|
||||
user=${GERRIT_USER}
|
||||
query_file=${GERRIT_QUERY_FILE}
|
||||
key=/root/.ssh/id_rsa
|
||||
|
||||
[data_source]
|
||||
es_url=${ES_URL}
|
||||
|
@ -1,5 +1,5 @@
|
||||
channels:
|
||||
openstack-neutron:
|
||||
oooq:
|
||||
projects:
|
||||
# elastic-recheck doesn't allow limiting reports to patches from a specific repo,
|
||||
# so let's at least scope to bugs that neutron team acknowledged ownership for
|
||||
@ -10,7 +10,7 @@ channels:
|
||||
# we may revisit it once elastic-recheck learns how to limit the scope of reports
|
||||
# to particular repositories
|
||||
- positive
|
||||
openstack-qa:
|
||||
oooq:
|
||||
projects:
|
||||
- all
|
||||
events:
|
||||
@ -25,14 +25,14 @@ messages:
|
||||
%(bugs)s
|
||||
footer: >-
|
||||
For more details on this and other bugs, please see
|
||||
http://status.openstack.org/elastic-recheck/
|
||||
http://ci-health-rdo.tripleo.org/
|
||||
recheck_instructions: >-
|
||||
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
|
||||
comment to run the tests again.
|
||||
unrecognized: >-
|
||||
Some of the tests failed in a way that we did not understand. Please help
|
||||
us classify these issues so that they can be part of Elastic Recheck
|
||||
http://status.openstack.org/elastic-recheck/
|
||||
http://ci-health-rdo.tripleo.org/
|
||||
no_bugs_found: >-
|
||||
I noticed Zuul failed, refer to:
|
||||
https://docs.openstack.org/infra/manual/developers.html#automated-testing
|
||||
|
@ -22,7 +22,7 @@ services:
|
||||
container_name: er-bot
|
||||
image: localhost/elastic-recheck
|
||||
working_dir: /root
|
||||
command: /usr/bin/elastic-recheck -f elastic-recheck.conf
|
||||
command: /usr/bin/elastic-recheck --noirc -f elastic-recheck.conf
|
||||
environment:
|
||||
- DB_URI
|
||||
- ES_URL
|
||||
|
@ -4,7 +4,7 @@ nick=RecheckWatchBot
|
||||
pass=
|
||||
server=irc.freenode.net
|
||||
port=6667
|
||||
channel_config=/home/mtreinish/elasticRecheck/recheckwatchbot.yaml
|
||||
channel_config=data/recheckwatchbot.yaml
|
||||
|
||||
[recheckwatch]
|
||||
#Any project that has a job that matches this regex will have all their
|
||||
@ -13,10 +13,10 @@ jobs_re=dsvm
|
||||
ci_username=jenkins
|
||||
|
||||
[gerrit]
|
||||
user=treinish
|
||||
user=os-tripleo-ci
|
||||
host=review.opendev.org
|
||||
query_file=/home/mtreinish/elasticRecheck/queries
|
||||
key=/home/mtreinish/.ssh/id_rsa
|
||||
query_file=/opt/elastic-recheck/queries
|
||||
key=/root/.ssh/id_rsa
|
||||
|
||||
[data_source]
|
||||
es_url=http://logstash.openstack.org:80/elasticsearch
|
||||
|
@ -209,16 +209,16 @@ class RecheckWatch(threading.Thread):
|
||||
if not event.get_all_bugs():
|
||||
self._read(event)
|
||||
else:
|
||||
self._read(event)
|
||||
stream.leave_comment(
|
||||
event,
|
||||
self.msgs,
|
||||
debug=not self.commenting)
|
||||
except er.ResultTimedOut as e:
|
||||
self.log.warning(e.message)
|
||||
self._read(msg=e.message)
|
||||
except Exception:
|
||||
self.log.exception("Uncaught exception processing event.")
|
||||
self.log.warning(e.args[0])
|
||||
self._read(msg=e.args[0])
|
||||
except Exception as exp:
|
||||
self.log.exception("Uncaught exception processing event: %s",
|
||||
str(exp))
|
||||
|
||||
|
||||
class MessageConfig(dict):
|
||||
|
@ -30,18 +30,18 @@ DEFAULTS = {
|
||||
'DB_URI': 'mysql+pymysql://query:query@logstash.openstack.org/subunit2sql',
|
||||
'server_password': '',
|
||||
'CI_USERNAME': 'jenkins',
|
||||
'JOBS_RE': 'dsvm',
|
||||
'JOBS_RE': '(dsvm|tripleo|tox)',
|
||||
'PID_FN': '/var/run/elastic-recheck/elastic-recheck.pid',
|
||||
'INDEX_FORMAT': r'logstash-%Y.%m.%d',
|
||||
'GERRIT_QUERY_FILE': 'queries',
|
||||
'GERRIT_HOST': 'review.opendev.org',
|
||||
'GERRIT_HOST': 'review.rdoproject.org',
|
||||
'GERRIT_USER': None,
|
||||
'IRC_LOG_CONFIG': '',
|
||||
'IRC_SERVER': "irc.freenode.net",
|
||||
'IRC_SERVER': "irc.oftc.net",
|
||||
'IRC_PORT': "6667",
|
||||
'IRC_PASS': "",
|
||||
'IRC_SERVER_PASSWORD': "",
|
||||
'IRC_NICK': "",
|
||||
'IRC_SERVER_PASSWORD': "erbot",
|
||||
'IRC_NICK': "erbot",
|
||||
}
|
||||
|
||||
# Not all teams actively used elastic recheck for categorizing their
|
||||
|
@ -20,6 +20,7 @@ import time
|
||||
import dateutil.parser as dp
|
||||
import gerritlib.gerrit
|
||||
import elasticsearch
|
||||
import requests
|
||||
import sqlalchemy
|
||||
from sqlalchemy import orm
|
||||
from subunit2sql.db import api as db_api
|
||||
@ -31,30 +32,8 @@ from elastic_recheck import results
|
||||
|
||||
|
||||
def required_files(job):
|
||||
files = []
|
||||
if re.match("(tempest|grenade)-dsvm", job):
|
||||
files.extend([
|
||||
'logs/screen-n-api.txt',
|
||||
'logs/screen-n-cpu.txt',
|
||||
'logs/screen-n-sch.txt',
|
||||
'logs/screen-g-api.txt',
|
||||
'logs/screen-c-api.txt',
|
||||
'logs/screen-c-vol.txt',
|
||||
'logs/syslog.txt'])
|
||||
# we could probably add more neutron files
|
||||
# but currently only q-svc is used in queries
|
||||
if re.match("neutron", job):
|
||||
files.extend([
|
||||
'logs/screen-q-svc.txt',
|
||||
])
|
||||
else:
|
||||
files.extend([
|
||||
'logs/screen-n-net.txt',
|
||||
])
|
||||
# make sure that grenade logs exist
|
||||
if re.match("grenade", job):
|
||||
files.extend(['logs/grenade.sh.txt'])
|
||||
|
||||
files = ["job-output.txt"]
|
||||
# Can add more files for specific jobs here
|
||||
return files
|
||||
|
||||
|
||||
@ -130,7 +109,8 @@ class FailEvent(object):
|
||||
bugs = self.get_all_bugs()
|
||||
if not bugs:
|
||||
return None
|
||||
urls = ['https://bugs.launchpad.net/bugs/%s' % x for
|
||||
# Return health dashboard links, since not all of our queries have a bug
|
||||
urls = ['http://ci-health-rdo.tripleo.org/#%s' % x for
|
||||
x in bugs]
|
||||
return urls
|
||||
|
||||
@ -168,7 +148,9 @@ class FailEvent(object):
|
||||
# Assume one queue per gerrit event
|
||||
if len(self.failed_jobs) == 0:
|
||||
return None
|
||||
return self.failed_jobs[0].url.split('/')[6]
|
||||
if len(self.failed_jobs[0].url.split('/')) >= 7:
|
||||
return self.failed_jobs[0].url.split('/')[6]
|
||||
return None
|
||||
|
||||
def build_short_uuids(self):
|
||||
return [job.build_short_uuid for job in self.failed_jobs]
|
||||
@ -225,7 +207,7 @@ class Stream(object):
|
||||
# these items. It's orthogonal to non-voting ES searching.
|
||||
if " (non-voting)" in line:
|
||||
continue
|
||||
m = re.search(r"- ([\w-]+)\s*(http://\S+)\s*:\s*FAILURE", line)
|
||||
m = re.search(r"([\w-]+)\s*(https?://\S+)\s*:\s*FAILURE", line)
|
||||
if m:
|
||||
failed_tests.append(FailJob(m.group(1), m.group(2)))
|
||||
return failed_tests
|
||||
@ -243,7 +225,7 @@ class Stream(object):
|
||||
def _has_required_files(self, change, patch, name, build_short_uuid):
|
||||
query = qb.files_ready(change, patch, name, build_short_uuid)
|
||||
r = self.es.search(query, size='80', recent=True)
|
||||
files = [x['term'] for x in r.terms]
|
||||
files = [x["_source"]["filename"] for x in r.hits["hits"]]
|
||||
# TODO(dmsimard): Reliably differentiate zuul v2 and v3 jobs
|
||||
required = required_files(name)
|
||||
missing_files = [x for x in required if x not in files]
|
||||
@ -255,11 +237,12 @@ class Stream(object):
|
||||
|
||||
def _does_es_have_data(self, event):
|
||||
"""Wait till ElasticSearch is ready, but return False if timeout."""
|
||||
# We wait 20 minutes wall time since receiving the event until we
|
||||
# treat the logs as missing
|
||||
timeout = 1200
|
||||
# Wait 40 seconds between queries.
|
||||
sleep_time = 40
|
||||
# We wait 5 minutes wall time since receiving the event until we
|
||||
# treat the logs as missing. And for now 5 minutes is enough since we
|
||||
# don't have too many jobs being collected.
|
||||
timeout = 300
|
||||
# Wait 300 seconds between queries.
|
||||
sleep_time = 300
|
||||
timed_out = False
|
||||
job = None
|
||||
# This checks that we've got the console log uploaded, need to retry
|
||||
@ -329,6 +312,7 @@ class Stream(object):
|
||||
# bail if the failure is from a project
|
||||
# that hasn't run any of the included jobs
|
||||
if not fevent.is_included_job():
|
||||
self.log.debug("Ignored comment: %s", fevent.comment)
|
||||
continue
|
||||
|
||||
self.log.info("Looking for failures in %d,%d on %s",
|
||||
@ -405,26 +389,27 @@ class Classifier(object):
|
||||
bug_matches = []
|
||||
engine = sqlalchemy.create_engine(self.config.db_uri)
|
||||
Session = orm.sessionmaker(bind=engine)
|
||||
session = Session()
|
||||
Session()
|
||||
for x in self.queries:
|
||||
if x.get('suppress-notification'):
|
||||
continue
|
||||
self.log.debug(
|
||||
"Looking for bug: https://bugs.launchpad.net/bugs/%s",
|
||||
x['bug'])
|
||||
|
||||
query = qb.single_patch(x['query'], change_number, patch_number,
|
||||
build_short_uuid)
|
||||
results = self.es.search(query, size='10', recent=recent)
|
||||
if len(results) > 0:
|
||||
if x.get('test_ids', None):
|
||||
test_ids = x['test_ids']
|
||||
self.log.debug(
|
||||
"For bug %s checking subunit2sql for failures on "
|
||||
"test_ids: %s", x['bug'], test_ids)
|
||||
if check_failed_test_ids_for_job(build_short_uuid,
|
||||
test_ids, session):
|
||||
bug_matches.append(x['bug'])
|
||||
response = requests.get("https://bugs.launchpad.net/bugs/" +
|
||||
x['bug'])
|
||||
if response.status_code != 200:
|
||||
bug_matches.append(x['bug'] +
|
||||
": " +
|
||||
x.get('msg',
|
||||
re.escape(x.get('query', ""))))
|
||||
else:
|
||||
bug_matches.append(x['bug'])
|
||||
bug_matches.append(x['bug'] + ": " + x.get(
|
||||
'msg', re.escape(x.get('query', ""))))
|
||||
|
||||
return bug_matches
|
||||
|
@ -22,7 +22,7 @@ import os.path
|
||||
import yaml
|
||||
|
||||
|
||||
def load(directory='queries'):
|
||||
def load(directory="/opt/elastic-recheck/queries"):
|
||||
"""Load queries from a set of yaml files in a directory."""
|
||||
bugs = glob.glob("%s/*.yaml" % directory)
|
||||
data = []
|
||||
|
@ -53,7 +53,8 @@ def generic(raw_query, facet=None):
|
||||
if isinstance(facet, list):
|
||||
data = dict(fields=facet, size=200)
|
||||
|
||||
query['facets'] = {
|
||||
# facets moved to aggs
|
||||
query['aggs'] = {
|
||||
"tag": {
|
||||
"terms": data
|
||||
}
|
||||
@ -77,15 +78,16 @@ def result_ready(change, patchset, name, short_uuid):
|
||||
"""
|
||||
# TODO(dmsimard): Revisit this query once Zuul v2 is no longer supported
|
||||
# Let's value legibility over pep8 line width here...
|
||||
# build_short_uuid doesn't return the whole uuid in rdo es
|
||||
query = (
|
||||
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"project-config/playbooks/base/post-ssh.yaml")' # noqa E501
|
||||
'((filename:"job-output.txt" AND message:"POST-RUN END" AND message:"post.yaml")' # noqa E501
|
||||
' OR '
|
||||
'(filename:"console.html" AND (message:"[Zuul] Job complete" OR message:"[SCP] Copying console log" OR message:"Grabbing consoleLog")))' # noqa E501
|
||||
' AND build_status:"FAILURE"'
|
||||
' AND build_change:"{change}"'
|
||||
' AND build_patchset:"{patchset}"'
|
||||
' AND build_name:"{name}"'
|
||||
' AND build_short_uuid:"{short_uuid}"'
|
||||
' AND build_uuid:"{short_uuid}"'
|
||||
)
|
||||
return generic(query.format(
|
||||
change=change,
|
||||
@ -107,9 +109,9 @@ def files_ready(review, patch, name, build_short_uuid):
|
||||
'AND build_change:"%s" '
|
||||
'AND build_patchset:"%s" '
|
||||
'AND build_name:"%s" '
|
||||
'AND build_short_uuid:%s' %
|
||||
'AND build_uuid:%s' %
|
||||
(review, patch, name, build_short_uuid),
|
||||
facet='filename')
|
||||
facet='filename.name')
|
||||
|
||||
|
||||
def single_patch(query, review, patch, build_short_uuid):
|
||||
@ -121,7 +123,7 @@ def single_patch(query, review, patch, build_short_uuid):
|
||||
return generic('%s '
|
||||
'AND build_change:"%s" '
|
||||
'AND build_patchset:"%s" '
|
||||
'AND build_short_uuid:%s' %
|
||||
'AND build_uuid:%s' %
|
||||
(query, review, patch, build_short_uuid))
|
||||
|
||||
|
||||
|
@ -115,7 +115,7 @@ class TestBotWithTestTools(tests.TestCase):
|
||||
reference = ("openstack/keystone change: https://review.opendev.org/"
|
||||
"64750 failed because of: "
|
||||
"gate-keystone-python26: "
|
||||
"https://bugs.launchpad.net/bugs/123456, "
|
||||
"http://ci-health-rdo.tripleo.org/#123456, "
|
||||
"gate-keystone-python27: unrecognized error")
|
||||
self.assertEqual(reference, msg)
|
||||
|
||||
|
@ -56,25 +56,3 @@ class TestSubunit2sqlCrossover(unit.UnitTestCase):
|
||||
['test1', 'test4'],
|
||||
mock.sentinel.session)
|
||||
self.assertFalse(res)
|
||||
|
||||
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=True)
|
||||
def test_classify_with_test_id_filter_match(self, mock_id_check):
|
||||
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
|
||||
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
|
||||
es_mock.start()
|
||||
self.addCleanup(es_mock.stop)
|
||||
res = c.classify(1234, 1, 'fake')
|
||||
self.assertEqual(res, ['1234567'],
|
||||
"classify() returned %s when it should have returned "
|
||||
"a list with one bug id: '1234567'" % res)
|
||||
|
||||
@mock.patch.object(er, 'check_failed_test_ids_for_job', return_value=False)
|
||||
def test_classify_with_test_id_filter_no_match(self, mock_id_check):
|
||||
c = er.Classifier('./elastic_recheck/tests/unit/queries_with_filters')
|
||||
es_mock = mock.patch.object(c.es, 'search', return_value=[1, 2, 3])
|
||||
es_mock.start()
|
||||
self.addCleanup(es_mock.stop)
|
||||
res = c.classify(1234, 1, 'fake')
|
||||
self.assertEqual(res, [],
|
||||
"classify() returned bug matches %s when none should "
|
||||
"have been found" % res)
|
||||
|
@ -152,10 +152,10 @@ class TestStream(tests.TestCase):
|
||||
self.assertTrue(event.is_included_job())
|
||||
self.assertEqual(event.queue(), "gate")
|
||||
self.assertEqual(event.bug_urls(),
|
||||
['https://bugs.launchpad.net/bugs/123456'])
|
||||
['http://ci-health-rdo.tripleo.org/#123456'])
|
||||
errors = ['gate-keystone-python27: unrecognized error',
|
||||
'gate-keystone-python26: '
|
||||
'https://bugs.launchpad.net/bugs/123456']
|
||||
'http://ci-health-rdo.tripleo.org/#123456']
|
||||
bug_map = event.bug_urls_map()
|
||||
for error in errors:
|
||||
self.assertIn(error, bug_map)
|
||||
@ -180,10 +180,10 @@ class TestStream(tests.TestCase):
|
||||
self.assertTrue(event.is_included_job())
|
||||
self.assertEqual(event.queue(), "check")
|
||||
self.assertEqual(event.bug_urls(),
|
||||
['https://bugs.launchpad.net/bugs/123456'])
|
||||
['http://ci-health-rdo.tripleo.org/#123456'])
|
||||
self.assertEqual(event.bug_urls_map(),
|
||||
['gate-keystone-python26: '
|
||||
'https://bugs.launchpad.net/bugs/123456',
|
||||
'http://ci-health-rdo.tripleo.org/#123456',
|
||||
'gate-keystone-python27: unrecognized error'])
|
||||
self.assertEqual(sorted(event.failed_job_names()),
|
||||
['gate-keystone-python26',
|
||||
|
@ -23,14 +23,14 @@ messages:
|
||||
%(bugs)s
|
||||
footer: >-
|
||||
For more details on this and other bugs, please see
|
||||
http://status.openstack.org/elastic-recheck/
|
||||
http://ci-health-rdo.tripleo.org/
|
||||
recheck_instructions: >-
|
||||
If you believe we've correctly identified the failure, feel free to leave a 'recheck'
|
||||
comment to run the tests again.
|
||||
unrecognized: >-
|
||||
Some of the tests failed in a way that we did not understand. Please help
|
||||
us classify these issues so that they can be part of Elastic Recheck
|
||||
http://status.openstack.org/elastic-recheck/
|
||||
http://ci-health-rdo.tripleo.org/
|
||||
no_bugs_found: >-
|
||||
I noticed Zuul failed, refer to:
|
||||
http://docs.openstack.org/infra/manual/developers.html#automated-testing
|
||||
http://docs.openstack.org/infra/manual/developers.html#automated-testing
|
||||
|
Loading…
x
Reference in New Issue
Block a user