#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import itertools
from argparse import ArgumentParser
import os
import random
import re
import sys
import subprocess
import glob
import shutil
from collections import namedtuple

from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
from sparktestsupport.toposort import toposort_flatten
import sparktestsupport.modules as modules


# -------------------------------------------------------------------------------------------------
# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------


def determine_modules_for_files(filenames):
    """
    Given a list of filenames, return the set of modules that contain those files.
    If a file is not associated with a more specific submodule, then this method will consider that
    file to belong to the 'root' module. GitHub Actions and AppVeyor files are ignored.

    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
    ['pyspark-core', 'sql']
    >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
    ['root']
    >>> [x.name for x in determine_modules_for_files( \
            [".github/workflows/master.yml", "appveyor.yml"])]
    []
    """
    changed_modules = set()
    for filename in filenames:
        if filename in (".github/workflows/master.yml", "appveyor.yml"):
            continue
        matched_at_least_one_module = False
        for module in modules.all_modules:
            if module.contains_file(filename):
                changed_modules.add(module)
                matched_at_least_one_module = True
        if not matched_at_least_one_module:
            changed_modules.add(modules.root)
    return changed_modules


def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
    """
    Given a git commit and target ref, use the set of files changed in the diff in order to
    determine which modules' tests should be run.

    >>> [x.name for x in determine_modules_for_files( \
            identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
    ['graphx']
    >>> 'root' in [x.name for x in determine_modules_for_files( \
         identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
    True
    """
    if target_branch is None and target_ref is None:
        raise AttributeError("must specify either target_branch or target_ref")
    elif target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        diff_target = target_branch
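        # Fetch the target branch from origin into a local branch of the same name so
        # that the `git diff` below compares the patch against an up-to-date target.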
        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
    else:
        diff_target = target_ref
    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
                                         universal_newlines=True)
    # Remove any empty strings
    return [f for f in raw_output.split('\n') if f]


def setup_test_environ(environ):
    print("[info] Setting up the following environment variables for tests:")
    for (k, v) in environ.items():
        print("%s=%s" % (k, v))
        os.environ[k] = v


def determine_modules_to_test(changed_modules):
    """
    Given a set of modules that have changed, compute the transitive closure of those modules'
    dependent modules in order to determine the set of modules that should be tested.

    Returns a topologically-sorted list of modules (ties are broken by sorting on module names).

    >>> [x.name for x in determine_modules_to_test([modules.root])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.build])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
    ['graphx', 'examples']
    >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
    >>> x # doctest: +NORMALIZE_WHITESPACE
    ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
     'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
    """
    modules_to_test = set()
    for module in changed_modules:
        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
    modules_to_test = modules_to_test.union(set(changed_modules))
    # If we need to run all of the tests, then we should short-circuit and return 'root'
    if modules.root in modules_to_test:
        return [modules.root]
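    # Otherwise, return the modules in topological order so that upstream modules appear
    # before the modules that depend on them (ties broken by module name).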
    return toposort_flatten(
        {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True)


def determine_tags_to_exclude(changed_modules):
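    # Collect the test tags owned by modules that are not affected by this change;
    # the corresponding tagged tests will be excluded from the run.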
    tags = []
    for m in modules.all_modules:
        if m not in changed_modules:
            tags += m.test_tags
    return tags


# -------------------------------------------------------------------------------------------------
# Functions for working with subprocesses and shell tools
# -------------------------------------------------------------------------------------------------


def determine_java_executable():
    """Return the path of the java executable that will be used by Spark's
    tests, or `None` if none can be found."""

    # Any changes in the way that Spark's build detects java must be reflected
    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
    # the `java` executable on the path.

    java_home = os.environ.get("JAVA_HOME")

    # check if there is an executable at $JAVA_HOME/bin/java
    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    # if the java_exe wasn't set, check for a `java` version on the $PATH
    return java_exe if java_exe else which("java")


# -------------------------------------------------------------------------------------------------
# Functions for running the other build and test scripts
# -------------------------------------------------------------------------------------------------


def set_title_and_block(title, err_block):
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block])
    line_str = '=' * 72

    print('')
    print(line_str)
    print(title)
    print(line_str)


def run_apache_rat_checks():
    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
    run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])


def run_scala_style_checks(extra_profiles):
    build_profiles = extra_profiles + modules.root.build_profile_flags
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    profiles = " ".join(build_profiles)
    print("[info] Checking Scala style using SBT with these profiles: ", profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala"), profiles])


def run_java_style_checks(build_profiles):
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    # The same profiles used for building are used to run Checkstyle by SBT as well because
    # the previous build appears to be reused by Checkstyle, which affects the Checkstyle
    # results. See SPARK-27130.
    profiles = " ".join(build_profiles)
    print("[info] Checking Java style using SBT with these profiles: ", profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "sbt-checkstyle"), profiles])


def run_python_style_checks():
    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])


def run_sparkr_style_checks():
    set_title_and_block("Running R style checks", "BLOCK_R_STYLE")

    if which("R"):
        # The R style check should be executed after `install-dev.sh`, since warnings
        # about `no visible global function definition` appear without the installation.
        # SEE ALSO: SPARK-9121.
        run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")])
    else:
        print("Ignoring SparkR style check as R was not found in PATH")


def build_spark_documentation():
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    os.environ["PRODUCTION"] = "1 jekyll build"

    os.chdir(os.path.join(SPARK_HOME, "docs"))

    jekyll_bin = which("jekyll")

    if not jekyll_bin:
        print("[error] Cannot find a version of `jekyll` on the system; please",
              "install one and retry to build the documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    else:
        run_cmd([jekyll_bin, "build"])

    os.chdir(SPARK_HOME)


def get_zinc_port():
    """
    Get a randomized port on which to start Zinc
    """
    return random.randrange(3030, 4030)


def exec_maven(mvn_args=()):
    """Call Maven in the current directory with the list of mvn_args passed in."""

    zinc_port = get_zinc_port()
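    # Pass a randomized Zinc port to the Maven build (via ZINC_PORT / -DzincPort), presumably
    # so that concurrent builds on the same machine do not clash on a single Zinc server port.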
    os.environ["ZINC_PORT"] = "%s" % zinc_port
    zinc_flag = "-DzincPort=%s" % zinc_port
    flags = [os.path.join(SPARK_HOME, "build", "mvn"), zinc_flag]
    run_cmd(flags + mvn_args)


def exec_sbt(sbt_args=()):
    """Call SBT in the current directory with the list of sbt_args passed in."""

    sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args

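    # Filter out noisy SBT output lines (dependency resolution, assembly merging, and
    # jar-inclusion messages) so the test log stays readable.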
    sbt_output_filter = re.compile(b"^.*[info].*Resolving" + b"|" +
                                   b"^.*[warn].*Merging" + b"|" +
                                   b"^.*[info].*Including")

    # NOTE: echo "q" is needed because, when SBT hits a build failure (either resolution
    # or compilation), it prompts the user for input (q, r, etc.) to quit or retry.
    # This echo keeps the build from blocking on that prompt.
    echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE)
    sbt_proc = subprocess.Popen(sbt_cmd,
                                stdin=echo_proc.stdout,
                                stdout=subprocess.PIPE)
    echo_proc.wait()
    for line in iter(sbt_proc.stdout.readline, b''):
        if not sbt_output_filter.match(line):
            print(line.decode('utf-8'), end='')
    retcode = sbt_proc.wait()

    if retcode != 0:
        exit_from_command_with_retcode(sbt_cmd, retcode)


def get_hadoop_profiles(hadoop_version):
    """
    For the given Hadoop version tag, return a list of Maven/SBT profile flags for
    building and testing against that Hadoop version.
    """

    sbt_maven_hadoop_profiles = {
        "hadoop2.7": ["-Phadoop-2.7"],
        "hadoop3.2": ["-Phadoop-3.2"],
    }

    if hadoop_version in sbt_maven_hadoop_profiles:
        return sbt_maven_hadoop_profiles[hadoop_version]
    else:
        print("[error] Could not find", hadoop_version, "in the list. Valid options",
              "are", sbt_maven_hadoop_profiles.keys())
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


def get_hive_profiles(hive_version):
    """
    For the given Hive version tag, return a list of Maven/SBT profile flags for
    building and testing against that Hive version.
    """

    sbt_maven_hive_profiles = {
        "hive1.2": ["-Phive-1.2"],
        "hive2.3": ["-Phive-2.3"],
    }

    if hive_version in sbt_maven_hive_profiles:
        return sbt_maven_hive_profiles[hive_version]
    else:
        print("[error] Could not find", hive_version, "in the list. Valid options",
              "are", sbt_maven_hive_profiles.keys())
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))

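# For example (values taken from the tables above), get_hadoop_profiles("hadoop3.2") returns
# ["-Phadoop-3.2"] and get_hive_profiles("hive2.3") returns ["-Phive-2.3"]; any other version
# tag prints an error and exits with the current error-block code.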

def build_spark_maven(extra_profiles):
    # Enable all of the profiles for the build:
    build_profiles = extra_profiles + modules.root.build_profile_flags
    mvn_goals = ["clean", "package", "-DskipTests"]
    profiles_and_goals = build_profiles + mvn_goals

    print("[info] Building Spark using Maven with these arguments: ", " ".join(profiles_and_goals))

    exec_maven(profiles_and_goals)


def build_spark_sbt(extra_profiles):
    # Enable all of the profiles for the build:
    build_profiles = extra_profiles + modules.root.build_profile_flags
    sbt_goals = ["test:package",  # Build test jars as some tests depend on them
                 "streaming-kinesis-asl-assembly/assembly"]
    profiles_and_goals = build_profiles + sbt_goals

    print("[info] Building Spark using SBT with these arguments: ", " ".join(profiles_and_goals))

    exec_sbt(profiles_and_goals)


def build_spark_unidoc_sbt(extra_profiles):
    set_title_and_block("Building Unidoc API Documentation", "BLOCK_DOCUMENTATION")
    # Enable all of the profiles for the build:
    build_profiles = extra_profiles + modules.root.build_profile_flags
    sbt_goals = ["unidoc"]
    profiles_and_goals = build_profiles + sbt_goals

    print("[info] Building Spark unidoc using SBT with these arguments: ",
          " ".join(profiles_and_goals))

    exec_sbt(profiles_and_goals)


def build_spark_assembly_sbt(extra_profiles, checkstyle=False):
    # Enable all of the profiles for the build:
    build_profiles = extra_profiles + modules.root.build_profile_flags
    sbt_goals = ["assembly/package"]
    profiles_and_goals = build_profiles + sbt_goals
    print("[info] Building Spark assembly using SBT with these arguments: ",
          " ".join(profiles_and_goals))
    exec_sbt(profiles_and_goals)

    if checkstyle:
        run_java_style_checks(build_profiles)

    build_spark_unidoc_sbt(extra_profiles)


def build_apache_spark(build_tool, extra_profiles):
    """Build Spark with the extra profiles and the given build tool
    (either `sbt` or `maven`). Defaults to using `sbt`."""

    set_title_and_block("Building Spark", "BLOCK_BUILD")

    rm_r("lib_managed")

    if build_tool == "maven":
        build_spark_maven(extra_profiles)
    else:
        build_spark_sbt(extra_profiles)


def detect_binary_inop_with_mima(extra_profiles):
    build_profiles = extra_profiles + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    profiles = " ".join(build_profiles)
    print("[info] Detecting binary incompatibilities with MiMa using SBT with these profiles: ",
          profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima"), profiles])


def run_scala_tests_maven(test_profiles):
    mvn_test_goals = ["test", "--fail-at-end"]

    profiles_and_goals = test_profiles + mvn_test_goals

    print("[info] Running Spark tests using Maven with these arguments: ",
          " ".join(profiles_and_goals))

    exec_maven(profiles_and_goals)


def run_scala_tests_sbt(test_modules, test_profiles):

    sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))

    if not sbt_test_goals:
        return

    profiles_and_goals = test_profiles + sbt_test_goals

    print("[info] Running Spark tests using SBT with these arguments: ",
          " ".join(profiles_and_goals))

    exec_sbt(profiles_and_goals)


def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags):
    """Execute the tests for all modules selected by `determine_modules_to_test`,
    using the given build tool."""
    set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")

    test_modules = set(test_modules)

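    # Collect the union of the build profile flags required by the modules under test and
    # append them to the extra profiles for this run.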
    test_profiles = extra_profiles + \
        list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules)))

    if excluded_tags:
        test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)]

    # set up java11 env if this is a pull request build with 'test-java11' in the title
    if "ghprbPullTitle" in os.environ:
        if "test-java11" in os.environ["ghprbPullTitle"].lower():
            os.environ["JAVA_HOME"] = "/usr/java/jdk-11.0.1"
            os.environ["PATH"] = "%s/bin:%s" % (os.environ["JAVA_HOME"], os.environ["PATH"])
            test_profiles += ['-Djava.version=11']

    if build_tool == "maven":
        run_scala_tests_maven(test_profiles)
    else:
        run_scala_tests_sbt(test_modules, test_profiles)


def run_python_tests(test_modules, parallelism, with_coverage=False):
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")

    if with_coverage:
        # Coverage makes the PySpark tests flaky under heavy parallelism, so the
        # parallelism is capped at 4 for now as a workaround.
        parallelism = 4
        script = "run-tests-with-coverage"
    else:
        script = "run-tests"
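    # Build the python/run-tests command line; when only a subset of modules changed,
    # restrict the run to those modules and pass through the requested parallelism.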
    command = [os.path.join(SPARK_HOME, "python", script)]
    if test_modules != [modules.root]:
        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
    command.append("--parallelism=%i" % parallelism)
    run_cmd(command)

    if with_coverage:
        post_python_tests_results()


def post_python_tests_results():
    if "SPARK_TEST_KEY" not in os.environ:
        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
              "PySpark coverage results.")
        sys.exit(1)
    spark_test_key = os.environ.get("SPARK_TEST_KEY")
    # The steps below upload the HTML files to 'github.com/spark-test/pyspark-coverage-site'.
    # 1. Clone the PySpark coverage site.
    run_cmd([
        "git",
        "clone",
        "https://spark-test:%s@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
    # 2. Remove the existing HTML files.
    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
    # 3. Copy the generated coverage HTML files.
    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
        shutil.copy(f, "pyspark-coverage-site/")
    os.chdir("pyspark-coverage-site")
    try:
        # 4. Check out a temporary branch.
        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
        # 5. Add all the files.
        run_cmd(["git", "add", "-A"])
        # 6. Commit the current HTML files.
        run_cmd([
            "git",
            "commit",
            "-am",
            "Coverage report at latest commit in Apache Spark",
            '--author="Apache Spark Test Account <sparktestacc@gmail.com>"'])
        # 7. Delete the old branch.
        run_cmd(["git", "branch", "-D", "gh-pages"])
        # 8. Rename the temporary branch to gh-pages.
        run_cmd(["git", "branch", "-m", "gh-pages"])
        # 9. Finally, force push to our repository.
        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
    finally:
        os.chdir("..")
        # 10. Remove the cloned repository.
        shutil.rmtree("pyspark-coverage-site")


def run_python_packaging_tests():
    set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
    command = [os.path.join(SPARK_HOME, "dev", "run-pip-tests")]
    run_cmd(command)


def run_build_tests():
    set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS")
    run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")])


def run_sparkr_tests():
    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")

    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
    else:
        print("Ignoring SparkR tests as R was not found in PATH")


def parse_opts():
    parser = ArgumentParser(
        prog="run-tests"
    )
    parser.add_argument(
        "-p", "--parallelism", type=int, default=8,
        help="The number of suites to test in parallel (default %(default)d)"
    )

    args, unknown = parser.parse_known_args()
    if unknown:
        parser.error("Unsupported arguments: %s" % ' '.join(unknown))
    if args.parallelism < 1:
        parser.error("Parallelism cannot be less than 1")
    return args

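# Example invocation (illustrative only): running `./dev/run-tests -p 4` limits the run to
# 4 test suites in parallel; with no arguments the default parallelism of 8 is used.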

def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute path
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              "ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

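    # CURRENT_BLOCK records which phase of the run is executing so that, on failure,
    # the exit code can be mapped back to a specific error block (see ERROR_CODES).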
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              "install one and retry.")
        sys.exit(2)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
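    # Combine the Hadoop and Hive profile flags; these are passed to every Maven/SBT
    # invocation (builds, style checks, MiMa, and tests) below.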
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables from all modules,
    # because a module's environment variables should not be set unless that module has
    # actually changed, even when running the 'root' module. So here we use
    # changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

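    # Expand the changed modules to everything that (transitively) depends on them; this is
    # the actual set of modules whose test suites will be run.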
    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks(extra_profiles)
    should_run_java_style_checks = False
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # Run SBT Checkstyle after the build to avoid side effects on the build.
        should_run_java_style_checks = True
    if not changed_files or any(f.endswith("lint-python")
                                or f.endswith("tox.ini")
                                or f.endswith(".py")
                                for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R")
                                or f.endswith("lint-r")
                                or f.endswith(".lintr")
                                for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below is commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()


def _test():
    import doctest
    failure_count = doctest.testmod()[0]
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
    main()