#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import glob
import os
import sys
from setuptools import setup
from shutil import copyfile, copytree, rmtree

if sys.version_info < (2, 7):
    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
          file=sys.stderr)
    sys.exit(-1)

try:
    exec(open('pyspark/version.py').read())
except IOError:
    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
          file=sys.stderr)
    sys.exit(-1)
VERSION = __version__  # noqa
# A temporary path so we can reach above the Python project root and fetch the scripts and jars we need
TEMP_PATH = "deps"
SPARK_HOME = os.path.abspath("../")

# Provide guidance about how to use setup.py
incorrect_invocation_message = """
If you are installing PySpark from the Spark source, you must first build Spark and
run sdist.

    To build Spark with Maven you can run:
      ./build/mvn -DskipTests clean package
    Building the source dist is done in the Python directory:
      cd python
      python setup.py sdist
      pip install dist/*.tar.gz"""

# Figure out where the jars we need to package with PySpark are located.
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))

if len(JARS_PATH) == 1:
    JARS_PATH = JARS_PATH[0]
elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1):
    # Release mode puts the jars in a jars directory
    JARS_PATH = os.path.join(SPARK_HOME, "jars")
elif len(JARS_PATH) > 1:
    print("Assembly jars exist for multiple Scala versions ({0}); please clean up assembly/target".format(
        JARS_PATH), file=sys.stderr)
    sys.exit(-1)
elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
    print(incorrect_invocation_message, file=sys.stderr)
    sys.exit(-1)

EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin")
DATA_PATH = os.path.join(SPARK_HOME, "data")
LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")

SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin")
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
DATA_TARGET = os.path.join(TEMP_PATH, "data")
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")

# Check whether we are under the Spark path, in which case we need to build the symlink farm.
# We only want to build the symlink farm while under Spark; otherwise we want to use the
# existing symlink farm. If the symlink farm already exists while we are under Spark (e.g. a
# partially built sdist), we should error out and have the user sort it out.
in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or
            (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1))
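# The first check identifies a source checkout via SparkContext.scala; the second identifies
# an unpacked binary release via the RELEASE marker file and a single spark-core jar.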


def _supports_symlinks():
    """Check if the system supports symlinks (e.g. *nix) or not."""
    return getattr(os, "symlink", None) is not None


if (in_spark):
    # Construct links for setup
    try:
        os.mkdir(TEMP_PATH)
    except OSError:
        print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
              file=sys.stderr)
        sys.exit(-1)

# If you are changing the versions here, please also change ./python/pyspark/sql/utils.py
# For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
# binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
_minimum_pandas_version = "0.23.2"
_minimum_pyarrow_version = "0.15.1"
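# The minimums above feed the optional 'sql' extra declared in extras_require below.
# Illustrative usage only (standard pip extras syntax):
#   pip install pyspark[sql]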

try:
    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
    # find it where expected. The rest of the files aren't copied because they are accessed
    # using Python imports instead which will be resolved correctly.
    try:
        os.makedirs("pyspark/python/pyspark")
    except OSError:
        # Don't worry if the directory already exists.
        pass
    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

    if (in_spark):
        # Construct the symlink farm - this is necessary since we can't refer to the path above the
        # package root and we need to copy the jars and scripts which are up above the python root.
        if _supports_symlinks():
            os.symlink(JARS_PATH, JARS_TARGET)
            os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
            os.symlink(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
            os.symlink(DATA_PATH, DATA_TARGET)
            os.symlink(LICENSES_PATH, LICENSES_TARGET)
        else:
            # For Windows, fall back to the slower copytree
            copytree(JARS_PATH, JARS_TARGET)
            copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
            copytree(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
            copytree(DATA_PATH, DATA_TARGET)
            copytree(LICENSES_PATH, LICENSES_TARGET)
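            # Since copytree duplicates these directories on disk, the cleanup in the
            # finally block below removes them with rmtree rather than os.remove.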
    else:
        # If we are not inside SPARK_HOME, verify we have the required symlink farm.
        if not os.path.exists(JARS_TARGET):
            print("To build the packaging, you must be in the python directory under SPARK_HOME.",
                  file=sys.stderr)

    if not os.path.isdir(SCRIPTS_TARGET):
        print(incorrect_invocation_message, file=sys.stderr)
        sys.exit(-1)

    # The scripts directive requires a list of each script path and does not take wildcards.
    script_names = os.listdir(SCRIPTS_TARGET)
    scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names))
    # We add find_spark_home.py to the bin directory we install so that pip installed PySpark
    # will search for SPARK_HOME with Python.
    scripts.append("pyspark/find_spark_home.py")
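    # Illustrative only: the resulting list holds entries such as "deps/bin/pyspark" and
    # "deps/bin/spark-submit", plus "pyspark/find_spark_home.py"; the exact contents depend
    # on what ships in SPARK_HOME/bin.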

    with open('README.md') as f:
        long_description = f.read()

    setup(
        name='pyspark',
        version=VERSION,
        description='Apache Spark Python API',
        long_description=long_description,
        long_description_content_type="text/markdown",
        author='Spark Developers',
        author_email='dev@spark.apache.org',
        url='https://github.com/apache/spark/tree/master/python',
        packages=['pyspark',
                  'pyspark.mllib',
                  'pyspark.mllib.linalg',
                  'pyspark.mllib.stat',
                  'pyspark.ml',
                  'pyspark.ml.linalg',
                  'pyspark.ml.param',
                  'pyspark.sql',
                  'pyspark.sql.avro',
                  'pyspark.sql.pandas',
                  'pyspark.streaming',
                  'pyspark.bin',
                  'pyspark.sbin',
                  'pyspark.jars',
                  'pyspark.python.pyspark',
                  'pyspark.python.lib',
                  'pyspark.data',
                  'pyspark.licenses',
                  'pyspark.examples.src.main.python'],
        include_package_data=True,
        package_dir={
            'pyspark.jars': 'deps/jars',
            'pyspark.bin': 'deps/bin',
            'pyspark.sbin': 'deps/sbin',
            'pyspark.python.lib': 'lib',
            'pyspark.data': 'deps/data',
            'pyspark.licenses': 'deps/licenses',
            'pyspark.examples.src.main.python': 'deps/examples',
        },
        package_data={
            'pyspark.jars': ['*.jar'],
            'pyspark.bin': ['*'],
            'pyspark.sbin': ['spark-config.sh', 'spark-daemon.sh',
                             'start-history-server.sh',
                             'stop-history-server.sh', ],
            'pyspark.python.lib': ['*.zip'],
            'pyspark.data': ['*.txt', '*.data'],
            'pyspark.licenses': ['*.txt'],
            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
        scripts=scripts,
        license='http://www.apache.org/licenses/LICENSE-2.0',
        install_requires=['py4j==0.10.9'],
        extras_require={
            'ml': ['numpy>=1.7'],
            'mllib': ['numpy>=1.7'],
            'sql': [
                'pandas>=%s' % _minimum_pandas_version,
                'pyarrow>=%s' % _minimum_pyarrow_version,
            ]
        },
        classifiers=[
            'Development Status :: 5 - Production/Stable',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python :: 2.7',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: Implementation :: CPython',
            'Programming Language :: Python :: Implementation :: PyPy']
    )
finally:
    # We only clean up the symlink farm if we were in Spark; otherwise we are installing
    # rather than packaging.
    if (in_spark):
        # Remove either the symlinks or the copied trees, depending on which were created.
        if _supports_symlinks():
            os.remove(os.path.join(TEMP_PATH, "jars"))
            os.remove(os.path.join(TEMP_PATH, "bin"))
            os.remove(os.path.join(TEMP_PATH, "sbin"))
            os.remove(os.path.join(TEMP_PATH, "examples"))
            os.remove(os.path.join(TEMP_PATH, "data"))
            os.remove(os.path.join(TEMP_PATH, "licenses"))
        else:
            rmtree(os.path.join(TEMP_PATH, "jars"))
            rmtree(os.path.join(TEMP_PATH, "bin"))
            rmtree(os.path.join(TEMP_PATH, "sbin"))
            rmtree(os.path.join(TEMP_PATH, "examples"))
            rmtree(os.path.join(TEMP_PATH, "data"))
            rmtree(os.path.join(TEMP_PATH, "licenses"))
        os.rmdir(TEMP_PATH)
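
# Rough sketch (illustrative) of the transient layout this script builds while packaging
# inside a Spark checkout, and which the finally block above removes again:
#
#   deps/
#     bin/       -> SPARK_HOME/bin (launcher scripts)
#     sbin/      -> SPARK_HOME/sbin (selected daemon scripts)
#     jars/      -> the assembly jars, or SPARK_HOME/jars for a binary release
#     examples/  -> SPARK_HOME/examples/src/main/python
#     data/      -> SPARK_HOME/data
#     licenses/  -> SPARK_HOME/licenses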