dev/create-release/translate-contributors.py

0001 #!/usr/bin/env python
0002 #
0003 # Licensed to the Apache Software Foundation (ASF) under one or more
0004 # contributor license agreements.  See the NOTICE file distributed with
0005 # this work for additional information regarding copyright ownership.
0006 # The ASF licenses this file to You under the Apache License, Version 2.0
0007 # (the "License"); you may not use this file except in compliance with
0008 # the License.  You may obtain a copy of the License at
0009 #
0010 #    http://www.apache.org/licenses/LICENSE-2.0
0011 #
0012 # Unless required by applicable law or agreed to in writing, software
0013 # distributed under the License is distributed on an "AS IS" BASIS,
0014 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0015 # See the License for the specific language governing permissions and
0016 # limitations under the License.
0017
0018 # This script translates invalid authors in the contributors list generated
0019 # by generate-contributors.py. When the script encounters an author name that
# is considered invalid, it queries Github and JIRA in an attempt to find
# replacements. This tool runs in two modes:
0022 #
0023 # (1) Interactive mode: For each invalid author name, this script presents
0024 # all candidate replacements to the user and awaits user response. In this
0025 # mode, the user may also input a custom name. This is the default.
0026 #
0027 # (2) Non-interactive mode: For each invalid author name, this script replaces
0028 # the name with the first valid candidate it can find. If there is none, it
0029 # uses the original name. This can be enabled through the --non-interactive flag.
0030
0031 import os
0032 import sys
0033
0034 from releaseutils import *
0035
# Connection settings come from the environment and must be set before use!
# JIRA_API_BASE falls back to the Apache JIRA instance; the JIRA credentials and the
# Github token have no fallback, and the script exits right away if any of them is
# missing or empty.
JIRA_API_BASE = os.getenv("JIRA_API_BASE", "https://issues.apache.org/jira")
JIRA_USERNAME = os.getenv("JIRA_USERNAME")
JIRA_PASSWORD = os.getenv("JIRA_PASSWORD")
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
if not (JIRA_USERNAME and JIRA_PASSWORD):
    sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
if not GITHUB_API_TOKEN:
    sys.exit("GITHUB_API_TOKEN must be set")
0045
# The contributors list is the input of this script (the translated result is later
# written to <old_file_name>.final). Bail out early if the input has not been generated.
if os.path.isfile(contributors_file_name):
    contributors_file = open(contributors_file_name, "r")
else:
    print("Contributors file %s does not exist!" % contributors_file_name)
    print("Have you run ./generate-contributors.py yet?")
    sys.exit(1)
# Problems collected while translating; they are printed together at the end of the run.
warnings = []
0053
# Interactive mode is the default. Passing --non-interactive anywhere on the command line
# makes this script choose the first valid replacement on its own instead of prompting.
options = set(sys.argv[1:])
INTERACTIVE_MODE = "--non-interactive" not in options
if INTERACTIVE_MODE:
    print("Running in interactive mode. To disable this, provide the --non-interactive flag.")
0062
# Create the API clients used for the name lookups: the JIRA client is given the server
# URL and the username/password pair, the Github client is given the API token.
jira_options = dict(server=JIRA_API_BASE)
jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD))
github_client = Github(GITHUB_API_TOKEN)
0067
# Load known author translations that are cached locally.
# Each entry has the form "<old name> - <new name>"; lines starting with "#" are comments.
known_translations = {}
known_translations_file_name = "known_translations"
with open(known_translations_file_name, "r") as known_translations_file:
    for line in known_translations_file:
        # Skip comments and blank lines: a stray empty line (e.g. at the end of a
        # hand-edited file) must not abort the whole run with an unpacking error.
        if line.startswith("#") or not line.strip():
            continue
        # Split on the first separator only, so that a new name which itself contains
        # " - " is kept intact. A line without any separator is still rejected with a
        # ValueError, since there is no sensible mapping to derive from it.
        old_name, new_name = line.strip("\n").split(" - ", 1)
        known_translations[old_name] = new_name

# Open again (in append mode) in case the user adds new mappings
known_translations_file = open(known_translations_file_name, "a")
0081
# Placeholder used as the candidate name when a lookup did not yield a full name.
NOT_FOUND = "Not found"


def generate_candidates(author, issues):
    """
    Generate replacement candidates for the given author.

    This should only be called if the given author name does not represent a full name,
    as this operation is somewhat expensive. Under the hood, it makes several calls to
    the Github and JIRA API servers to find the candidates.

    :param author: the author name to find replacements for; it is looked up both as a
        Github user and as a JIRA user
    :param issues: the JIRA issue keys associated with the author; the assignee of each
        issue is used as a further candidate
    :return: a list of (candidate name, source) 2-tuples, in lookup order. A lookup that
        found nothing contributes an entry whose name is NOT_FOUND. E.g.
        [
          (NOT_FOUND, "No full name found for Github user andrewor14"),
          ("Andrew Or", "Full name of JIRA user andrewor14"),
          ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"),
          ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"),
          (NOT_FOUND, "No assignee found for SPARK-1763")
        ]
    :raises JIRAError: if fetching an issue fails for any reason other than the issue
        not being found (a 404 is only recorded in the global warnings list)
    """
    candidates = []
    # First check for full name of Github user
    github_name = get_github_name(author, github_client)
    if github_name:
        candidates.append((github_name, "Full name of Github user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for Github user %s" % author))
    # Then do the same for JIRA user
    jira_name = get_jira_name(author, jira_client)
    if jira_name:
        candidates.append((jira_name, "Full name of JIRA user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author))
    # Then do the same for the assignee of each of the associated JIRAs
    # Note that a given issue may not have an assignee, or the assignee may not have a full name
    for issue in issues:
        try:
            jira_issue = jira_client.issue(issue)
        except JIRAError as e:
            # Do not exit just because an issue is not found!
            if e.status_code == 404:
                warnings.append("Issue %s not found!" % issue)
                continue
            # Any other failure is unexpected; re-raise it with its original traceback
            raise
        jira_assignee = jira_issue.fields.assignee
        if jira_assignee:
            user_name = jira_assignee.name
            display_name = jira_assignee.displayName
            if display_name:
                candidates.append(
                    (display_name, "Full name of %s assignee %s" % (issue, user_name)))
            else:
                candidates.append(
                    (NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name)))
        else:
            candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
    # Guard against special characters in candidate names
    # Note that the candidate name may already be a text string (JIRA returns this), so only
    # byte strings are decoded. `bytes` is an alias of `str` on Python 2, which keeps the
    # old behavior there while avoiding the Python 2-only `unicode` builtin.
    for i, (candidate, source) in enumerate(candidates):
        if isinstance(candidate, bytes):
            candidate = candidate.decode("UTF-8")
        candidate = unidecode.unidecode(candidate).strip()
        candidates[i] = (candidate, source)
    return candidates
0145
# Translate each invalid author by searching for possible candidates from Github and JIRA.
# In interactive mode, this script presents the user with a list of choices and has the user
# select from this list. Additionally, the user may also choose to enter a custom name.
# In non-interactive mode, this script picks the first valid author name from the candidates.
# If no such name exists, the original name is used (without the JIRA numbers).

# Pick the prompt function that returns the typed text verbatim on either Python version
try:
    read_input = raw_input  # Python 2
except NameError:
    read_input = input  # Python 3

print("\n========================== Translating contributor list ==========================")
lines = contributors_file.readlines()
contributions = []
for i, line in enumerate(lines):
    # It is possible that a line in the contributor file only has the github name, e.g. yhuai.
    # So, we need a strip() to remove the newline.
    temp_author = line.strip(" * ").split(" -- ")[0].strip()
    print("Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)))
    if not temp_author:
        # Keep the malformed line untouched, but make sure it is reported at the end
        error_msg = "    ERROR: Expected the following format \" * <author> -- <contributions>\"\n"
        error_msg += "    ERROR: Actual = %s" % line
        print(error_msg)
        warnings.append(error_msg)
        contributions.append(line)
        continue
    # The author field has the form <name>[/<issue>/<issue>...]; the part before the first
    # slash is the name to translate, the remaining parts are the associated JIRA issues.
    author = temp_author.split("/")[0]
    # Use the local copy of known translations where possible
    if author in known_translations:
        # Only the first occurrence is the author field; anything after it belongs to the
        # contributions text and must not be rewritten.
        line = line.replace(temp_author, known_translations[author], 1)
    elif not is_valid_author(author):
        new_author = author
        issues = temp_author.split("/")[1:]
        candidates = generate_candidates(author, issues)
        # Print out potential replacement candidates along with the sources, e.g.
        #   [X] No full name found for Github user andrewor14
        #   [X] No assignee found for SPARK-1763
        #   [0] Andrew Or - Full name of JIRA user andrewor14
        #   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
        #   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
        #   [3] andrewor14 - Raw Github username
        #   [4] Custom
        candidate_names = []
        bad_prompts = []  # Prompts that can't actually be selected; print these first.
        good_prompts = []  # Prompts that contain valid choices
        for candidate, source in candidates:
            if candidate == NOT_FOUND:
                bad_prompts.append("    [X] %s" % source)
            else:
                index = len(candidate_names)
                candidate_names.append(candidate)
                good_prompts.append("    [%d] %s - %s" % (index, candidate, source))
        # The two extra choices come right after the selectable candidates
        raw_index = len(candidate_names)
        custom_index = len(candidate_names) + 1
        for p in bad_prompts:
            print(p)
        if bad_prompts:
            print("    ---")
        for p in good_prompts:
            print(p)
        # In interactive mode, additionally provide "custom" option and await user response
        if INTERACTIVE_MODE:
            print("    [%d] %s - Raw Github username" % (raw_index, author))
            print("    [%d] Custom" % custom_index)
            response = read_input("    Your choice: ")
            last_index = custom_index
            # Keep asking until the answer is a non-negative integer within range
            while not response.isdigit() or int(response) > last_index:
                response = read_input("    Please enter an integer between 0 and %d: " % last_index)
            response = int(response)
            if response == custom_index:
                new_author = read_input("    Please type a custom name for this author: ")
            elif response != raw_index:
                new_author = candidate_names[response]
            # Otherwise the raw Github username was chosen and new_author stays as is
        # In non-interactive mode, just pick the first candidate
        else:
            valid_candidate_names = [name for name, _ in candidates
                                     if is_valid_author(name) and name != NOT_FOUND]
            if valid_candidate_names:
                new_author = valid_candidate_names[0]
        # Finally, capitalize the author and replace the original one with it
        # If the final replacement is still invalid, log a warning
        if is_valid_author(new_author):
            new_author = capitalize_author(new_author)
        else:
            warnings.append(
                "Unable to find a valid name %s for author %s" % (author, temp_author))
        print("    * Replacing %s with %s" % (author, new_author))
        # If we are in interactive mode, prompt the user whether we want to remember this new
        # mapping
        if INTERACTIVE_MODE and \
            author not in known_translations and \
                yesOrNoPrompt(
                    "    Add mapping %s -> %s to known translations file?" % (author, new_author)):
            known_translations_file.write("%s - %s\n" % (author, new_author))
            known_translations_file.flush()
        # Write the chosen name (not the original one) into the line. If nothing better was
        # found, new_author is still the original name, now without the JIRA numbers.
        # Only the first occurrence is the author field, so limit the replacement to it.
        line = line.replace(temp_author, new_author, 1)
    contributions.append(line)
print("==================================================================================\n")
contributors_file.close()
known_translations_file.close()
0240
# Sort the contributions before writing them to the new file.
# Additionally, check if there are any duplicate author rows.
# This could happen if the same user has both a valid full
# name (e.g. Andrew Or) and an invalid one (andrewor14).
# If so, warn the user about this at the end.
contributions.sort()
all_authors = set()
new_contributors_file_name = contributors_file_name + ".final"
with open(new_contributors_file_name, "w") as new_contributors_file:
    for line in contributions:
        # A line may consist of the author name only (no " -- " separator), in which case
        # the first field still carries the trailing newline. Strip it so that such a row
        # compares equal to a regular row of the same author and the warning stays on one
        # line.
        author = line.strip(" * ").split(" -- ")[0].strip()
        if author in all_authors:
            warnings.append(
                "Detected duplicate author name %s. Please merge these manually." % author)
        all_authors.add(author)
        new_contributors_file.write(line)
0257
print("Translated contributors list successfully written to %s!" % new_contributors_file_name)

# Report everything that was flagged during the run, so that the user can fix the
# affected rows in the final contributors list by hand.
if warnings:
    print("\n========== Warnings encountered while translating the contributor list ===========")
    for warning in warnings:
        print(warning)
    print(
        "Please manually correct these in the final contributors list at %s."
        % new_contributors_file_name
    )
    print("==================================================================================\n")