0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 from functools import total_ordering
0019 import itertools
0020 import re
0021
# Registry of every Module instance, in definition order. Module.__init__ appends to it.
all_modules = []


@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set
    of source files, a set of test commands, and a set of dependencies on other modules. We use
    modules to define a dependency graph that lets us determine which tests to run based on which
    files have changed.

    Modules are identified by name: equality, ordering and hashing all use ``name`` only.
    """

    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                 environ=None, sbt_test_goals=(), python_test_goals=(),
                 blacklisted_python_implementations=(), test_tags=(), should_run_r_tests=False,
                 should_run_build_tests=False):
        """
        Define a new module and register it in ``all_modules``.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: a set of regexes that match source files belonging to this
            module. These regexes are applied by attempting to match at the beginning of the
            filename strings.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed. Defaults to a new empty dict for each module.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param blacklisted_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the module
            is not explicitly changed.
        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
        :param should_run_build_tests: If true, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        # Despite the attribute name, these are regexes applied with re.match (see contains_file).
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        # A fresh dict per instance: a literal `{}` default would be one object shared by every
        # module that does not pass `environ`, so mutating one would silently change them all.
        self.environ = {} if environ is None else environ
        self.python_test_goals = python_test_goals
        self.blacklisted_python_implementations = blacklisted_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

        # Reverse edges of the dependency graph: each direct dependency records this module as
        # one of its dependents. `self.name` is already set, so hashing `self` is safe here.
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
        """Return True if any of this module's regexes matches at the start of `filename`."""
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        # Returning NotImplemented lets Python try the reflected operation or raise TypeError,
        # instead of failing with AttributeError on `other.name`.
        if not isinstance(other, Module):
            return NotImplemented
        return self.name < other.name

    def __eq__(self, other):
        if not isinstance(other, Module):
            return NotImplemented
        return self.name == other.name

    def __ne__(self, other):
        # Defined explicitly so that `!=` stays the inverse of `==` on interpreters that do not
        # derive it from __eq__ automatically.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        return hash(self.name)
0094
# Declares no test goals of its own; other modules list it as a dependency.
tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=["common/tags/"],
)
0102
# Sources under sql/catalyst/, tested through the "catalyst/test" SBT goal.
catalyst = Module(
    name="catalyst",
    dependencies=[tags],
    source_file_regexes=["sql/catalyst/"],
    sbt_test_goals=["catalyst/test"],
)
0113
0114
# Sources under sql/core/; depends directly on catalyst.
sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=["sql/core/"],
    sbt_test_goals=["sql/test"],
)
0125
0126
# Hive integration: sql/hive/ sources plus the bin/spark-sql launcher. Needs the -Phive profile.
hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=["sql/hive/", "bin/spark-sql"],
    build_profile_flags=["-Phive"],
    sbt_test_goals=["hive/test"],
    # Per Module's contract, tests carrying this tag are excluded unless this module itself
    # is explicitly changed.
    test_tags=["org.apache.spark.tags.ExtendedHiveTest"],
)
0144
0145
# Sources under repl/; depends directly on hive.
repl = Module(
    name="repl",
    dependencies=[hive],
    source_file_regexes=["repl/"],
    sbt_test_goals=["repl/test"],
)
0156
0157
# Hive Thrift server: its source tree plus the start script. Needs the -Phive-thriftserver
# profile.
hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        # These entries are regexes, so the dot is escaped to match a literal "." only
        # (an unescaped "." would match any character).
        r"sbin/start-thriftserver\.sh",
    ],
    build_profile_flags=[
        "-Phive-thriftserver",
    ],
    sbt_test_goals=[
        "hive-thriftserver/test",
    ],
)
0172
# Avro data source under external/avro; depends directly on sql.
avro = Module(
    name="avro",
    dependencies=[sql],
    source_file_regexes=["external/avro"],
    sbt_test_goals=["avro/test"],
)
0183
# Kafka 0.10 source for SQL, under external/kafka-0-10-sql; depends directly on sql.
sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=["external/kafka-0-10-sql"],
    sbt_test_goals=["sql-kafka-0-10/test"],
)
0194
0195
# Sources under common/sketch/, tested through the "sketch/test" SBT goal.
sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=["common/sketch/"],
    sbt_test_goals=["sketch/test"],
)
0206
0207
# Sources under graphx/, tested through the "graphx/test" SBT goal.
graphx = Module(
    name="graphx",
    dependencies=[tags],
    source_file_regexes=["graphx/"],
    sbt_test_goals=["graphx/test"],
)
0218
0219
# The regex has no trailing slash, so it matches any path that begins with "streaming".
streaming = Module(
    name="streaming",
    dependencies=[tags],
    source_file_regexes=["streaming"],
    sbt_test_goals=["streaming/test"],
)
0230
0231
0232
0233
0234
0235
# Kinesis connector and its assembly. Needs the -Pkinesis-asl profile.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags],
    source_file_regexes=["external/kinesis-asl/", "external/kinesis-asl-assembly/"],
    build_profile_flags=["-Pkinesis-asl"],
    # Per Module's contract, this variable is set when files in this module are changed.
    environ={"ENABLE_KINESIS_TESTS": "1"},
    sbt_test_goals=["streaming-kinesis-asl/test"],
)
0253
0254
# Kafka 0.10 connector for streaming, plus its assembly.
streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming],
    source_file_regexes=[
        # The trailing "/" matters: without it this prefix would also match
        # "external/kafka-0-10-sql", which belongs to the sql-kafka-0-10 module.
        "external/kafka-0-10/",
        "external/kafka-0-10-assembly",
    ],
    sbt_test_goals=["streaming-kafka-0-10/test"],
)
0267
0268
# Paths beginning with "mllib-local", tested through the "mllib-local/test" SBT goal.
mllib_local = Module(
    name="mllib-local",
    dependencies=[tags],
    source_file_regexes=["mllib-local"],
    sbt_test_goals=["mllib-local/test"],
)
0279
0280
# Sources under mllib/ and the data files under data/mllib/.
mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=["data/mllib/", "mllib/"],
    sbt_test_goals=["mllib/test"],
)
0292
0293
# Sources under examples/; depends directly on graphx, mllib, streaming and hive.
examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=["examples/"],
    sbt_test_goals=["examples/test"],
)
0304
0305
pyspark_core = Module(
    name="pyspark-core",
    dependencies=[],
    # Everything under python/ except paths whose remainder starts with pyspark/ml,
    # pyspark/mllib, pyspark/sql or pyspark/streaming (negative lookahead).
    source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming))"],
    python_test_goals=[
        # Library modules listed as test goals (presumably run for their doctests).
        "pyspark.rdd",
        "pyspark.context",
        "pyspark.conf",
        "pyspark.broadcast",
        "pyspark.accumulators",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.util",
        # Dedicated test modules under pyspark.tests.
        "pyspark.tests.test_appsubmit",
        "pyspark.tests.test_broadcast",
        "pyspark.tests.test_conf",
        "pyspark.tests.test_context",
        "pyspark.tests.test_daemon",
        "pyspark.tests.test_join",
        "pyspark.tests.test_profiler",
        "pyspark.tests.test_rdd",
        "pyspark.tests.test_rddbarrier",
        "pyspark.tests.test_readwrite",
        "pyspark.tests.test_serializers",
        "pyspark.tests.test_shuffle",
        "pyspark.tests.test_taskcontext",
        "pyspark.tests.test_util",
        "pyspark.tests.test_worker",
    ],
)
0341
0342
pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive, avro],
    source_file_regexes=["python/pyspark/sql"],
    python_test_goals=[
        # Library modules listed as test goals (presumably run for their doctests).
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.column",
        "pyspark.sql.dataframe",
        "pyspark.sql.group",
        "pyspark.sql.functions",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming",
        "pyspark.sql.udf",
        "pyspark.sql.window",
        "pyspark.sql.avro.functions",
        "pyspark.sql.pandas.conversion",
        "pyspark.sql.pandas.map_ops",
        "pyspark.sql.pandas.group_ops",
        "pyspark.sql.pandas.types",
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
        # Dedicated test modules under pyspark.sql.tests.
        "pyspark.sql.tests.test_arrow",
        "pyspark.sql.tests.test_catalog",
        "pyspark.sql.tests.test_column",
        "pyspark.sql.tests.test_conf",
        "pyspark.sql.tests.test_context",
        "pyspark.sql.tests.test_dataframe",
        "pyspark.sql.tests.test_datasources",
        "pyspark.sql.tests.test_functions",
        "pyspark.sql.tests.test_group",
        "pyspark.sql.tests.test_pandas_cogrouped_map",
        "pyspark.sql.tests.test_pandas_grouped_map",
        "pyspark.sql.tests.test_pandas_map",
        "pyspark.sql.tests.test_pandas_udf",
        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
        "pyspark.sql.tests.test_pandas_udf_scalar",
        "pyspark.sql.tests.test_pandas_udf_typehints",
        "pyspark.sql.tests.test_pandas_udf_window",
        "pyspark.sql.tests.test_readwriter",
        "pyspark.sql.tests.test_serde",
        "pyspark.sql.tests.test_session",
        "pyspark.sql.tests.test_streaming",
        "pyspark.sql.tests.test_types",
        "pyspark.sql.tests.test_udf",
        "pyspark.sql.tests.test_utils",
    ],
)
0399
0400
pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[pyspark_core, streaming, streaming_kinesis_asl],
    source_file_regexes=["python/pyspark/streaming"],
    python_test_goals=[
        # Library module listed as a test goal (presumably run for its doctests).
        "pyspark.streaming.util",
        # Dedicated test modules under pyspark.streaming.tests.
        "pyspark.streaming.tests.test_context",
        "pyspark.streaming.tests.test_dstream",
        "pyspark.streaming.tests.test_kinesis",
        "pyspark.streaming.tests.test_listener",
    ],
)
0421
0422
pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=["python/pyspark/mllib"],
    # Values are compared against platform.python_implementation() (see Module's docstring).
    blacklisted_python_implementations=["PyPy"],
    python_test_goals=[
        # Library modules listed as test goals (presumably run for their doctests).
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        # Dedicated test modules under pyspark.mllib.tests.
        "pyspark.mllib.tests.test_algorithms",
        "pyspark.mllib.tests.test_feature",
        "pyspark.mllib.tests.test_linalg",
        "pyspark.mllib.tests.test_stat",
        "pyspark.mllib.tests.test_streaming_algorithms",
        "pyspark.mllib.tests.test_util",
    ],
)
0457
0458
pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    # The trailing "/" keeps this prefix from also matching python/pyspark/mllib.
    source_file_regexes=["python/pyspark/ml/"],
    # Values are compared against platform.python_implementation() (see Module's docstring).
    blacklisted_python_implementations=["PyPy"],
    python_test_goals=[
        # Library modules listed as test goals (presumably run for their doctests).
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.evaluation",
        "pyspark.ml.feature",
        "pyspark.ml.fpm",
        "pyspark.ml.functions",
        "pyspark.ml.image",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
        # Dedicated test modules under pyspark.ml.tests.
        "pyspark.ml.tests.test_algorithms",
        "pyspark.ml.tests.test_base",
        "pyspark.ml.tests.test_evaluation",
        "pyspark.ml.tests.test_feature",
        "pyspark.ml.tests.test_image",
        "pyspark.ml.tests.test_linalg",
        "pyspark.ml.tests.test_param",
        "pyspark.ml.tests.test_persistence",
        "pyspark.ml.tests.test_pipeline",
        "pyspark.ml.tests.test_stat",
        "pyspark.ml.tests.test_training_summary",
        "pyspark.ml.tests.test_tuning",
        "pyspark.ml.tests.test_wrapper",
    ],
)
0498
# Sources under R/. It has no SBT or Python goals; a change here triggers the R tests instead.
sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=["R/"],
    should_run_r_tests=True,
)
0507
0508
# Sources under docs/. Declares no test goals and no dependencies.
docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=["docs/"],
)
0516
# Build definition files. A change to any of them triggers the build tests.
build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[
        # These entries are regexes, so literal dots are escaped; an unescaped "." would match
        # any character (e.g. "pom_xml"). The leading ".*" lets a pom.xml in any directory match.
        r".*pom\.xml",
        r"dev/test-dependencies\.sh",
    ],
    should_run_build_tests=True,
)
0526
# YARN resource manager and its network shuffle component. Needs the -Pyarn profile.
yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=["resource-managers/yarn/", "common/network-yarn/"],
    build_profile_flags=[
        "-Pyarn",
    ],
    sbt_test_goals=["yarn/test", "network-yarn/test"],
    # Per Module's contract, tests carrying this tag are excluded unless this module itself
    # is explicitly changed.
    test_tags=["org.apache.spark.tags.ExtendedYarnTest"],
)
0543
# Mesos resource manager. Needs the -Pmesos profile.
mesos = Module(
    name="mesos",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/mesos/",
    ],
    build_profile_flags=[
        "-Pmesos",
    ],
    sbt_test_goals=[
        "mesos/test",
    ],
)
0551
# Kubernetes resource manager. Needs the -Pkubernetes profile.
kubernetes = Module(
    name="kubernetes",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/kubernetes",
    ],
    build_profile_flags=[
        "-Pkubernetes",
    ],
    sbt_test_goals=[
        "kubernetes/test",
    ],
)
0559
# Paths beginning with "hadoop-cloud". Needs the -Phadoop-cloud profile.
hadoop_cloud = Module(
    name="hadoop-cloud",
    dependencies=[],
    source_file_regexes=[
        "hadoop-cloud",
    ],
    build_profile_flags=[
        "-Phadoop-cloud",
    ],
    sbt_test_goals=[
        "hadoop-cloud/test",
    ],
)
0567
# Declares a build profile but no test goals.
spark_ganglia_lgpl = Module(
    name="spark-ganglia-lgpl",
    dependencies=[],
    source_file_regexes=["external/spark-ganglia-lgpl"],
    build_profile_flags=["-Pspark-ganglia-lgpl"],
)
0576
0577
0578
# Aggregate module: it matches no source files itself. Its profile flags and Python test goals
# are collected from every module registered in `all_modules` before this point, and its SBT
# goal is the plain "test" goal. It must therefore stay the last definition in this file.
root = Module(
    name="root",
    dependencies=[build],
    source_file_regexes=[],
    # Union of every module's profile flags, de-duplicated through a set. The result is sorted
    # so the flag order is identical on every run; iterating a set of strings directly can
    # differ between interpreter runs because string hashing is randomized.
    build_profile_flags=sorted(set(
        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
    sbt_test_goals=[
        "test",
    ],
    # Every module's Python test goals, concatenated in module definition order.
    python_test_goals=list(
        itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
    should_run_r_tests=True,
    should_run_build_tests=True,
)