diff --git a/engagement/maintainers.py b/engagement/maintainers.py index 5974b0d..297c503 100755 --- a/engagement/maintainers.py +++ b/engagement/maintainers.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python - -# Copyright (c) 2015 OpenStack Foundation +# Copyright OpenDev Contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -8,154 +6,169 @@ # # http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS +# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language +# governing permissions and limitations under the License. -# Description: When run using OpenStack's Gerrit server, this builds -# JSON and YAML representations of repos with information on the -# official owning project team if any, deliverable tags, and groups -# with approve rights listing the members of each along with their -# Gerrit preferred E-mail addresses and usernames when available. - -# Rationale: It was done as a demonstration to a representative of a -# foundation member company who requested a list of the "core -# reviewers" for official projects, optionally broken down by -# integrated vs. other. I'm attempting to show that this data is -# already publicly available and can be extracted/analyzed by anyone -# without needing to request it. 
- -# Use: This needs your Gerrit username passed as the command-line -# parameter, found at https://review.opendev.org/#/settings/ when -# authenticated in the WebUI. It also prompts for an HTTP password -# which https://review.opendev.org/#/settings/http-password will -# allow you to generate. The results end up in files named -# approvers.json and approvers.yaml. At the time of writing, it -# takes approximately 6.5 minutes to run on a well-connected machine -# with 70-80ms round-trip latency to review.opendev.org. - -# Example: -# -# $ virtualenv approvers -# [...] -# $ ./approvers/bin/pip install pyyaml requests -# [...] -# $ ./approvers/bin/python tools/who-approves.py fungi -# Password: -# [wait for completion] -# $ ./approvers/bin/python -# >>> import yaml -# >>> -# >>> def get_approvers(repos): -# ... approvers = set() -# ... for repo in repos: -# ... for group in repos[repo]['approvers']: -# ... for approver in repos[repo]['approvers'][group]: -# ... approvers.add(approver) -# ... return(approvers) -# ... -# >>> p = yaml.safe_load(open('approvers.yaml')) -# >>> print('Total repos: %s' % len(p)) -# Total repos: 751 -# >>> print('Total approvers: %s' % len(get_approvers(p))) -# Total approvers: 849 -# >>> -# >>> o = {k: v for k, v in p.iteritems() if 'team' in v} -# >>> print('Repos for official teams: %s' % len(o)) -# Repos for official teams: 380 -# >>> print('OpenStack repo approvers: %s' % len(get_approvers(o))) -# OpenStack repo approvers: 456 -# >>> -# >>> i = {k: v for k, v in p.iteritems() if 'tags' in v -# ... 
and 'release:managed' in v['tags']} -# >>> print('Repos under release management: %s' % len(i)) -# Repos under release management: 77 -# >>> print('Managed release repo approvers: %s' % len(get_approvers(i))) -# Managed release repo approvers: 245 - -import getpass -import json +import datetime +import os import re import sys -import requests +from engagement.stats import ( + from_gerrit_time, + get_projects, + query_gerrit, + report_times, + to_gerrit_time, + ) + import yaml -def main(): - gerrit_url = 'https://review.opendev.org/' - try: - gerrit_auth = requests.auth.HTTPDigestAuth( - sys.argv[1], getpass.getpass()) - except IndexError: - sys.stderr.write("Usage: %s USERNAME\n" % sys.argv[0]) - sys.exit(1) - acl_path = ( - 'gitweb?p=%s.git;a=blob_plain;f=project.config;hb=refs/meta/config') - group_path = 'a/groups/%s/members/?recursive&pp=0' - projects_file = ('gitweb?p=openstack/governance.git;a=blob_plain;' - 'f=reference/projects.yaml;hb=%s') - ref_name = 'refs/heads/master' - aprv_pattern = r'label-Workflow = .*\.\.\+1 group (.*)' - projects = requests.get(gerrit_url + projects_file % ref_name) - projects.encoding = 'utf-8' # Workaround for Gitweb encoding - projects = yaml.safe_load(projects.text) - repos_dump = json.loads(requests.get( - gerrit_url + 'projects/?pp=0').text[4:]) - all_groups = json.loads(requests.get(gerrit_url + 'a/groups/', - auth=gerrit_auth).text[4:]) - repos = {} - aprv_groups = {} - for repo in repos_dump: - repos[repo.encode('utf-8')] = {'approvers': {}} - acl_ini = requests.get(gerrit_url + acl_path % repo).text - for aprv_group in [str(x) for x in re.findall(aprv_pattern, acl_ini)]: - if aprv_group not in repos[repo]['approvers']: - repos[repo]['approvers'][aprv_group] = [] - if aprv_group not in aprv_groups: - aprv_groups[aprv_group] = [] - for team in projects: - if 'deliverables' in projects[team]: - for deli in projects[team]['deliverables']: - if 'repos' in projects[team]['deliverables'][deli]: - drepos = 
import datetime
import os
import re
import sys

from engagement.stats import (
    from_gerrit_time,
    get_projects,
    query_gerrit,
    report_times,
    to_gerrit_time,
    )

import yaml


def usage_error():
    """Write a generic usage message to stderr and exit nonzero"""

    sys.stderr.write(
        'ERROR: specify report period like YEAR, YEAR-H[1-2], YEAR-Q[1-4],\n'
        '       YEAR-[01-12], or YYYY-MM-DD..YYYY-MM-DD for a date range\n'
        '       (start date is inclusive, end date is exclusive)\n')
    sys.exit(1)


def parse_report_period(when):
    """Parse a supplied report period string, returning a tuple of
    after and before datetime objects (after is inclusive, before is
    exclusive); any unparseable or calendar-invalid period exits via
    usage_error() rather than raising"""

    daterange = re.compile(
        r'^(\d{4})-(\d{2})-(\d{2})\.\.(\d{4})-(\d{2})-(\d{2})$')
    monthly = re.compile(r'^(\d{4})-(\d{2})$')
    quarterly = re.compile(r'^(\d{4})-q([1-4])$', re.IGNORECASE)
    # A year only has two halves; the earlier h[1-4] pattern let h3/h4
    # through, which then crashed datetime() with months 13 and 19
    halfyearly = re.compile(r'^(\d{4})-h([1-2])$', re.IGNORECASE)
    yearly = re.compile(r'^\d{4}$')
    # TODO: merge this functionality into engagement.stats.parse_report_period
    # Match each pattern once and reuse the match objects, rather than
    # re-running the same regex for every captured group
    m_range = daterange.match(when)
    m_month = monthly.match(when)
    m_quarter = quarterly.match(when)
    m_half = halfyearly.match(when)
    m_year = yearly.match(when)
    if m_range:
        # Explicit start and end dates; treat impossible calendar dates
        # (e.g. a day of 32) as a usage error instead of a traceback
        try:
            after = datetime.datetime(
                int(m_range.group(1)),
                int(m_range.group(2)),
                int(m_range.group(3)))
            before = datetime.datetime(
                int(m_range.group(4)),
                int(m_range.group(5)),
                int(m_range.group(6)))
        except ValueError:
            usage_error()
        return after, before
    if m_month:
        start_year = int(m_month.group(1))
        start_month = int(m_month.group(2))
        # Roll the end boundary over into January of the next year when
        # the period ends in December
        end_year = start_year + start_month // 12
        end_month = 1 + start_month % 12
    elif m_quarter:
        start_year = int(m_quarter.group(1))
        start_month = 1 + 3 * (int(m_quarter.group(2)) - 1)
        end_year = start_year + (start_month + 2) // 12
        end_month = 1 + (start_month + 2) % 12
    elif m_half:
        start_year = int(m_half.group(1))
        start_month = 1 + 6 * (int(m_half.group(2)) - 1)
        end_year = start_year + (start_month + 5) // 12
        end_month = 1 + (start_month + 5) % 12
    elif m_year:
        start_year = int(m_year.group())
        start_month = 1
        end_year = start_year + 1
        end_month = 1
    else:
        usage_error()
    try:
        after = datetime.datetime(start_year, start_month, 1)
        before = datetime.datetime(end_year, end_month, 1)
    except ValueError:
        # e.g. a monthly period like 2020-13 with no such month
        usage_error()
    return after, before


def parse_command_line():
    """Parse the command line to obtain the report period, then return it"""

    if len(sys.argv) == 2:
        return sys.argv[1]
    else:
        usage_error()


def main(verbose=0):
    """Utility entry point: query Gerrit for every change updated since
    the start of the requested report period, collect the accounts which
    cast maintainer-style votes (Code-Review -2/+2 or Workflow +1)
    during the period, and write YAML and per-namespace text reports
    into a maintainers/ directory"""

    argument = parse_command_line()
    after, before = parse_report_period(argument)
    changes = dict()

    # TODO: deduplicate this and the similar version in stats.main
    # Shard querying by project, to help with the inherent instability of
    # result pagination from the Gerrit API
    for project in get_projects(verbose=verbose):
        if verbose >= 1:
            print("Checking project: %s" % project)
        offset = 0
        # Loop due to unavoidable query result pagination
        while offset >= 0:
            # We only constrain the query by the after date, as changes created
            # between the before and after date may have been updated more
            # recently with a new revision or comment
            new_changes = query_gerrit("changes/", params={
                "q": "project:%s after:{%s}" % (
                    project, to_gerrit_time(after)),
                "no-limit": "1",
                "start": offset,
                "o": ["DETAILED_ACCOUNTS", "DETAILED_LABELS", "SKIP_DIFFSTAT"],
                }, verbose=verbose)
            # Since we redundantly query ranges with offsets to help combat
            # pagination instability, we must deduplicate results
            for change in new_changes:
                if change["id"] not in changes:
                    changes[change["id"]] = change
            # Offset additional pages by half the returned entry count to help
            # avoid missing changes due to pagination instability
            if new_changes and new_changes[-1].get("_more_changes", False):
                offset += int(len(new_changes) / 2)
            else:
                offset = -1

    report = {"namespaces": dict()}
    report_times(report, after, before)
    maintainers = dict()
    for change in changes.values():
        # The leading path component of the project name groups related
        # repositories into a namespace
        namespace = change["project"].split("/")[0]
        if namespace not in report["namespaces"]:
            report["namespaces"][namespace] = set()
        if "labels" in change:
            # Votes only a maintainer can cast: any Code-Review -2 or +2,
            # or a Workflow +1 (change approval)
            for label, maintvotes in {
                    "Code-Review": (-2, 2), "Workflow": (1,)}.items():
                if label in change["labels"]:
                    for vote in change["labels"][label].get("all", []):
                        when = vote.get("date")
                        # Only count identifiable accounts (name and email
                        # present) whose qualifying vote was cast within
                        # the report period
                        if ("name" in vote and "email" in vote
                                and vote.get("value", 0) in maintvotes and when
                                and after < from_gerrit_time(when) < before):
                            if namespace not in maintainers:
                                maintainers[namespace] = set()
                            maintainers[namespace].add('"%s" <%s>' % (
                                vote["name"], vote["email"]))
    for namespace in maintainers:
        report["namespaces"][namespace] = sorted(
            list(maintainers[namespace]))

    # Operate on a copy of the keys since we'll be altering the dict
    for namespace in list(report["namespaces"].keys()):
        # Cull inactive namespaces from the report
        if not report["namespaces"][namespace]:
            del report["namespaces"][namespace]

    # Write the full YAML structured data report; use a context manager
    # so the file is flushed and closed deterministically instead of
    # being left to the garbage collector
    os.makedirs("maintainers", exist_ok=True)
    with open("maintainers/%s.yaml" % argument, "w",
              encoding="utf-8") as reportfile:
        reportfile.write(yaml.dump(report))

    # Write per-namespace text dumps of names/addresses
    for namespace, maintlist in list(report["namespaces"].items()):
        with open("maintainers/%s_%s.txt" % (
                argument, namespace), "w", encoding="utf-8") as dumpfile:
            for maintainer in maintlist:
                dumpfile.write(maintainer + "\n")