Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 from functools import total_ordering
0019 import itertools
0020 import re
0021 
# Global registry of every Module ever constructed; Module.__init__ appends to it,
# so declaration order below determines iteration order here.
all_modules = []


@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set
    of source files, a set of test commands, and a set of dependencies on other modules. We use
    modules to define a dependency graph that let us determine which tests to run based on which
    files have changed.
    """

    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                 environ=None, sbt_test_goals=(), python_test_goals=(),
                 blacklisted_python_implementations=(), test_tags=(), should_run_r_tests=False,
                 should_run_build_tests=False):
        """
        Define a new module.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: a set of regexes that match source files belonging to this
            module. These regexes are applied by attempting to match at the beginning of the
            filename strings.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed. Defaults to a fresh empty dict per instance.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param blacklisted_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the module
            is not explicitly changed.
        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
        :param should_run_build_tests: If true, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        # Avoid the mutable-default-argument pitfall: a literal `environ={}` default would be
        # one dict object shared by every instance constructed without an explicit environ.
        self.environ = environ if environ is not None else {}
        self.python_test_goals = python_test_goals
        self.blacklisted_python_implementations = blacklisted_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

        # Maintain reverse edges of the dependency graph, so a change in this module can be
        # propagated to the modules that (directly) depend on it.
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
        """Return True if `filename` starts with a match of any of this module's regexes."""
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        # @total_ordering derives <=, >, >= from __lt__ + __eq__.
        return self.name < other.name

    def __eq__(self, other):
        # Identity is the module name only; other attributes are ignored.
        return self.name == other.name

    def __ne__(self, other):
        return not (self.name == other.name)

    def __hash__(self):
        return hash(self.name)
# --- Scala/JVM module declarations. Order matters: a module must be declared before it is
# --- referenced in another module's `dependencies` list.

# Custom test-tag annotations; most other modules depend on it.
tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=[
        "common/tags/",
    ]
)

catalyst = Module(
    name="catalyst",
    dependencies=[tags],
    source_file_regexes=[
        "sql/catalyst/",
    ],
    sbt_test_goals=[
        "catalyst/test",
    ],
)


sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=[
        "sql/core/",
    ],
    sbt_test_goals=[
        "sql/test",
    ],
)


hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=[
        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
        "hive/test",
    ],
    test_tags=[
        # Excluded unless the hive module itself is changed.
        "org.apache.spark.tags.ExtendedHiveTest"
    ]
)


repl = Module(
    name="repl",
    dependencies=[hive],
    source_file_regexes=[
        "repl/",
    ],
    sbt_test_goals=[
        "repl/test",
    ],
)
0156 
0157 
hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
    ],
    build_profile_flags=[
        "-Phive-thriftserver",
    ],
    sbt_test_goals=[
        "hive-thriftserver/test",
    ]
)

avro = Module(
    name="avro",
    dependencies=[sql],
    source_file_regexes=[
        "external/avro",
    ],
    sbt_test_goals=[
        "avro/test",
    ]
)

sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=[
        "external/kafka-0-10-sql",
    ],
    sbt_test_goals=[
        "sql-kafka-0-10/test",
    ]
)
0194 
0195 
sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=[
        "common/sketch/",
    ],
    sbt_test_goals=[
        "sketch/test"
    ]
)


graphx = Module(
    name="graphx",
    dependencies=[tags],
    source_file_regexes=[
        "graphx/",
    ],
    sbt_test_goals=[
        "graphx/test"
    ]
)


streaming = Module(
    name="streaming",
    dependencies=[tags],
    source_file_regexes=[
        # NOTE(review): no trailing "/", so this prefix-matches any path starting with
        # "streaming" — confirm no sibling top-level path shares that prefix.
        "streaming",
    ],
    sbt_test_goals=[
        "streaming/test",
    ]
)
0230 
0231 
# Don't set the dependencies because changes in other modules should not trigger Kinesis tests.
# Kinesis tests depend on the external Amazon Kinesis service. We should run these tests only when
# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
# fail other PRs.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags],
    source_file_regexes=[
        "external/kinesis-asl/",
        "external/kinesis-asl-assembly/",
    ],
    build_profile_flags=[
        "-Pkinesis-asl",
    ],
    # Exported to the test environment only when this module's files change.
    environ={
        "ENABLE_KINESIS_TESTS": "1"
    },
    sbt_test_goals=[
        "streaming-kinesis-asl/test",
    ]
)


streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming],
    source_file_regexes=[
        # The ending "/" is necessary otherwise it will include "sql-kafka" codes
        "external/kafka-0-10/",
        "external/kafka-0-10-assembly",
    ],
    sbt_test_goals=[
        "streaming-kafka-0-10/test",
    ]
)
0267 
0268 
mllib_local = Module(
    name="mllib-local",
    dependencies=[tags],
    source_file_regexes=[
        "mllib-local",
    ],
    sbt_test_goals=[
        "mllib-local/test",
    ]
)


mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=[
        "data/mllib/",
        "mllib/",
    ],
    sbt_test_goals=[
        "mllib/test",
    ]
)


# Examples depend on (and exercise) most of the JVM stack, so changes in those modules
# re-run the example tests too.
examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
    sbt_test_goals=[
        "examples/test",
    ]
)
0304 
0305 
pyspark_core = Module(
    name="pyspark-core",
    dependencies=[],
    source_file_regexes=[
        # Negative lookahead: match anything under python/ EXCEPT the subpackages that
        # have their own dedicated modules below (ml, mllib, sql, streaming).
        "python/(?!pyspark/(ml|mllib|sql|streaming))"
    ],
    python_test_goals=[
        # doctests
        "pyspark.rdd",
        "pyspark.context",
        "pyspark.conf",
        "pyspark.broadcast",
        "pyspark.accumulators",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.util",
        # unittests
        "pyspark.tests.test_appsubmit",
        "pyspark.tests.test_broadcast",
        "pyspark.tests.test_conf",
        "pyspark.tests.test_context",
        "pyspark.tests.test_daemon",
        "pyspark.tests.test_join",
        "pyspark.tests.test_profiler",
        "pyspark.tests.test_rdd",
        "pyspark.tests.test_rddbarrier",
        "pyspark.tests.test_readwrite",
        "pyspark.tests.test_serializers",
        "pyspark.tests.test_shuffle",
        "pyspark.tests.test_taskcontext",
        "pyspark.tests.test_util",
        "pyspark.tests.test_worker",
    ]
)
0341 
0342 
pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive, avro],
    source_file_regexes=[
        "python/pyspark/sql"
    ],
    python_test_goals=[
        # doctests
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.column",
        "pyspark.sql.dataframe",
        "pyspark.sql.group",
        "pyspark.sql.functions",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming",
        "pyspark.sql.udf",
        "pyspark.sql.window",
        "pyspark.sql.avro.functions",
        "pyspark.sql.pandas.conversion",
        "pyspark.sql.pandas.map_ops",
        "pyspark.sql.pandas.group_ops",
        "pyspark.sql.pandas.types",
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
        # unittests
        "pyspark.sql.tests.test_arrow",
        "pyspark.sql.tests.test_catalog",
        "pyspark.sql.tests.test_column",
        "pyspark.sql.tests.test_conf",
        "pyspark.sql.tests.test_context",
        "pyspark.sql.tests.test_dataframe",
        "pyspark.sql.tests.test_datasources",
        "pyspark.sql.tests.test_functions",
        "pyspark.sql.tests.test_group",
        "pyspark.sql.tests.test_pandas_cogrouped_map",
        "pyspark.sql.tests.test_pandas_grouped_map",
        "pyspark.sql.tests.test_pandas_map",
        "pyspark.sql.tests.test_pandas_udf",
        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
        "pyspark.sql.tests.test_pandas_udf_scalar",
        "pyspark.sql.tests.test_pandas_udf_typehints",
        "pyspark.sql.tests.test_pandas_udf_window",
        "pyspark.sql.tests.test_readwriter",
        "pyspark.sql.tests.test_serde",
        "pyspark.sql.tests.test_session",
        "pyspark.sql.tests.test_streaming",
        "pyspark.sql.tests.test_types",
        "pyspark.sql.tests.test_udf",
        "pyspark.sql.tests.test_utils",
    ]
)
0399 
0400 
pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[
        pyspark_core,
        streaming,
        # Depends on the kinesis module so the Kinesis python tests run when it changes.
        streaming_kinesis_asl
    ],
    source_file_regexes=[
        "python/pyspark/streaming"
    ],
    python_test_goals=[
        # doctests
        "pyspark.streaming.util",
        # unittests
        "pyspark.streaming.tests.test_context",
        "pyspark.streaming.tests.test_dstream",
        "pyspark.streaming.tests.test_kinesis",
        "pyspark.streaming.tests.test_listener",
    ]
)
0421 
0422 
pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=[
        "python/pyspark/mllib"
    ],
    python_test_goals=[
        # doctests
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        # unittests
        "pyspark.mllib.tests.test_algorithms",
        "pyspark.mllib.tests.test_feature",
        "pyspark.mllib.tests.test_linalg",
        "pyspark.mllib.tests.test_stat",
        "pyspark.mllib.tests.test_streaming_algorithms",
        "pyspark.mllib.tests.test_util",
    ],
    blacklisted_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ]
)
0457 
0458 
pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    source_file_regexes=[
        "python/pyspark/ml/"
    ],
    python_test_goals=[
        # doctests
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.evaluation",
        "pyspark.ml.feature",
        "pyspark.ml.fpm",
        "pyspark.ml.functions",
        "pyspark.ml.image",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
        # unittests
        "pyspark.ml.tests.test_algorithms",
        "pyspark.ml.tests.test_base",
        "pyspark.ml.tests.test_evaluation",
        "pyspark.ml.tests.test_feature",
        "pyspark.ml.tests.test_image",
        "pyspark.ml.tests.test_linalg",
        "pyspark.ml.tests.test_param",
        "pyspark.ml.tests.test_persistence",
        "pyspark.ml.tests.test_pipeline",
        "pyspark.ml.tests.test_stat",
        "pyspark.ml.tests.test_training_summary",
        "pyspark.ml.tests.test_tuning",
        "pyspark.ml.tests.test_wrapper",
    ],
    blacklisted_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ]
)
0498 
sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
    should_run_r_tests=True
)


docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=[
        "docs/",
    ]
)

build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[
        # Any pom.xml anywhere in the tree (".*" prefix), plus the dependency-check script.
        ".*pom.xml",
        "dev/test-dependencies.sh",
    ],
    should_run_build_tests=True
)
0526 
# --- Resource managers and optional-profile modules. None declare dependencies; they are
# --- built/tested only behind their respective Maven/SBT profile flags.

yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/yarn/",
        "common/network-yarn/",
    ],
    build_profile_flags=["-Pyarn"],
    sbt_test_goals=[
        "yarn/test",
        "network-yarn/test",
    ],
    test_tags=[
        # Excluded unless the yarn module itself is changed.
        "org.apache.spark.tags.ExtendedYarnTest"
    ]
)

mesos = Module(
    name="mesos",
    dependencies=[],
    source_file_regexes=["resource-managers/mesos/"],
    build_profile_flags=["-Pmesos"],
    sbt_test_goals=["mesos/test"]
)

kubernetes = Module(
    name="kubernetes",
    dependencies=[],
    source_file_regexes=["resource-managers/kubernetes"],
    build_profile_flags=["-Pkubernetes"],
    sbt_test_goals=["kubernetes/test"]
)

hadoop_cloud = Module(
    name="hadoop-cloud",
    dependencies=[],
    source_file_regexes=["hadoop-cloud"],
    build_profile_flags=["-Phadoop-cloud"],
    sbt_test_goals=["hadoop-cloud/test"]
)

spark_ganglia_lgpl = Module(
    name="spark-ganglia-lgpl",
    dependencies=[],
    build_profile_flags=["-Pspark-ganglia-lgpl"],
    source_file_regexes=[
        "external/spark-ganglia-lgpl",
    ]
)
0576 
# The root module is a dummy module which is used to run all of the tests.
# No other modules should directly depend on this module.
# It must be declared LAST: the aggregations below iterate over all_modules, which only
# contains the modules constructed so far.
root = Module(
    name="root",
    dependencies=[build],  # Changes to build should trigger all tests.
    source_file_regexes=[],
    # In order to run all of the tests, enable every test profile:
    # NOTE(review): list(set(...)) de-duplicates but makes flag order nondeterministic across
    # runs — presumably harmless for profile flags, but confirm nothing depends on ordering.
    build_profile_flags=list(set(
        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
    sbt_test_goals=[
        "test",
    ],
    # Union of every python test goal declared above (duplicates, if any, are kept).
    python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
    should_run_r_tests=True,
    should_run_build_tests=True
)