#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
import glob
import os
import sys
from setuptools import setup
from shutil import copyfile, copytree, rmtree

if sys.version_info < (2, 7):
    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
          file=sys.stderr)
    sys.exit(-1)

try:
    exec(open('pyspark/version.py').read())
except IOError:
    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
          file=sys.stderr)
    sys.exit(-1)
VERSION = __version__

# A temporary path so we can reach above the Python project root and pull in the
# scripts and jars that need to ship with the package.
TEMP_PATH = "deps"
SPARK_HOME = os.path.abspath("../")

# Provide guidance about how to use setup.py when it is invoked incorrectly.
incorrect_invocation_message = """
If you are installing pyspark from spark source, you must first build Spark and
run sdist.

    To build Spark with maven you can run:
      ./build/mvn -DskipTests clean package
    Building the source dist is done in the Python directory:
      cd python
      python setup.py sdist
      pip install dist/*.tar.gz"""

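# For reference, a typical end-user install from a built distribution looks like
# this (illustrative commands, not run by this script):
#
#   pip install pyspark         # core package only
#   pip install pyspark[sql]    # also pulls in pandas and pyarrow via extras_require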

# Figure out where the jars we need to package with PySpark live.
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))

if len(JARS_PATH) == 1:
    JARS_PATH = JARS_PATH[0]
elif os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1:
    # Release mode puts the jars in a flat "jars" directory under SPARK_HOME.
    JARS_PATH = os.path.join(SPARK_HOME, "jars")
elif len(JARS_PATH) > 1:
    print("Assembly jars exist for multiple Scala versions ({0}), please clean up "
          "assembly/target".format(JARS_PATH), file=sys.stderr)
    sys.exit(-1)
elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
    print(incorrect_invocation_message, file=sys.stderr)
    sys.exit(-1)
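
# At this point JARS_PATH is either a single assembly directory (source build),
# SPARK_HOME/jars (release layout), or an empty list when only the pre-built
# farm in TEMP_PATH exists (pip installing from an sdist) -- in that last case
# JARS_PATH is never used, because in_spark below evaluates to False.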

EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin")
DATA_PATH = os.path.join(SPARK_HOME, "data")
LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")

SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin")
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
DATA_TARGET = os.path.join(TEMP_PATH, "data")
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
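
# The *_TARGET paths live under TEMP_PATH so that the package_dir mapping in
# setup() below (e.g. 'pyspark.jars': 'deps/jars') can resolve them relative to
# this directory when building the sdist.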

# Check whether we are under the Spark source tree, in which case we need to build
# the symlink farm below. Outside of Spark (e.g. a pip install from an sdist) we
# consume an already-built farm instead. If a farm already exists while we are
# under Spark (e.g. a partially built sdist), we error out and let the user clean up.
in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or
            (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1))


def _supports_symlinks():
    """Check if the system supports symlinks (e.g. *nix) or not."""
    return getattr(os, "symlink", None) is not None
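
# Note: os.symlink is absent on Python 2 for Windows, which is what the getattr
# check above detects; on platforms without it we fall back to copying the trees.
# (On Python 3 for Windows the attribute exists, but creating links may still
# require elevated privileges.)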


if in_spark:
    # Construct the temp directory that will hold the symlink farm built below.
    try:
        os.mkdir(TEMP_PATH)
    except OSError:
        print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
              file=sys.stderr)
        sys.exit(-1)

# Minimum versions of pandas and PyArrow required by the optional "sql" extra
# declared in extras_require below.
_minimum_pandas_version = "0.23.2"
_minimum_pyarrow_version = "0.15.1"
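
# A quick way to check the optional dependencies in an environment (illustrative,
# not part of the build):
#
#   python -c "import pandas, pyarrow; print(pandas.__version__, pyarrow.__version__)"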

try:
    # Copy pyspark/shell.py under pyspark/python/pyspark so the launcher scripts
    # can find it at the path they expect. The remaining files are reached through
    # normal Python imports, so they do not need to be copied.
    try:
        os.makedirs("pyspark/python/pyspark")
    except OSError:
        # Don't worry if the directory already exists.
        pass
    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

    if in_spark:
        # Construct the symlink farm: an sdist or pip install cannot reference
        # paths above the package root, so we link (or copy) what we need into
        # the package tree.
        if _supports_symlinks():
            os.symlink(JARS_PATH, JARS_TARGET)
            os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
            os.symlink(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
            os.symlink(DATA_PATH, DATA_TARGET)
            os.symlink(LICENSES_PATH, LICENSES_TARGET)
        else:
            # Fall back to the slower copytree on platforms without symlink support.
            copytree(JARS_PATH, JARS_TARGET)
            copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
            copytree(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
            copytree(DATA_PATH, DATA_TARGET)
            copytree(LICENSES_PATH, LICENSES_TARGET)
    else:
        # If we are not inside SPARK_HOME, verify we have the required symlink farm.
        if not os.path.exists(JARS_TARGET):
            print("To build the packaging you must be in the python directory under SPARK_HOME.",
                  file=sys.stderr)

    if not os.path.isdir(SCRIPTS_TARGET):
        print(incorrect_invocation_message, file=sys.stderr)
        sys.exit(-1)

    # The scripts directive requires a list of individual script paths and does
    # not accept wildcards, so enumerate bin/ explicitly.
    script_names = os.listdir(SCRIPTS_TARGET)
    scripts = [os.path.join(SCRIPTS_TARGET, script) for script in script_names]
    # Ship find_spark_home.py alongside the bin scripts so a pip-installed
    # PySpark can locate SPARK_HOME from Python.
    scripts.append("pyspark/find_spark_home.py")

    with open('README.md') as f:
        long_description = f.read()

    setup(
        name='pyspark',
        version=VERSION,
        description='Apache Spark Python API',
        long_description=long_description,
        long_description_content_type="text/markdown",
        author='Spark Developers',
        author_email='dev@spark.apache.org',
        url='https://github.com/apache/spark/tree/master/python',
        packages=['pyspark',
                  'pyspark.mllib',
                  'pyspark.mllib.linalg',
                  'pyspark.mllib.stat',
                  'pyspark.ml',
                  'pyspark.ml.linalg',
                  'pyspark.ml.param',
                  'pyspark.sql',
                  'pyspark.sql.avro',
                  'pyspark.sql.pandas',
                  'pyspark.streaming',
                  'pyspark.bin',
                  'pyspark.sbin',
                  'pyspark.jars',
                  'pyspark.python.pyspark',
                  'pyspark.python.lib',
                  'pyspark.data',
                  'pyspark.licenses',
                  'pyspark.examples.src.main.python'],
        include_package_data=True,
        package_dir={
            'pyspark.jars': 'deps/jars',
            'pyspark.bin': 'deps/bin',
            'pyspark.sbin': 'deps/sbin',
            'pyspark.python.lib': 'lib',
            'pyspark.data': 'deps/data',
            'pyspark.licenses': 'deps/licenses',
            'pyspark.examples.src.main.python': 'deps/examples',
        },
        package_data={
            'pyspark.jars': ['*.jar'],
            'pyspark.bin': ['*'],
            'pyspark.sbin': ['spark-config.sh', 'spark-daemon.sh',
                             'start-history-server.sh',
                             'stop-history-server.sh', ],
            'pyspark.python.lib': ['*.zip'],
            'pyspark.data': ['*.txt', '*.data'],
            'pyspark.licenses': ['*.txt'],
            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
        scripts=scripts,
        license='http://www.apache.org/licenses/LICENSE-2.0',
        install_requires=['py4j==0.10.9'],
        extras_require={
            'ml': ['numpy>=1.7'],
            'mllib': ['numpy>=1.7'],
            'sql': [
                'pandas>=%s' % _minimum_pandas_version,
                'pyarrow>=%s' % _minimum_pyarrow_version,
            ]
        },
        classifiers=[
            'Development Status :: 5 - Production/Stable',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python :: 2.7',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: Implementation :: CPython',
            'Programming Language :: Python :: Implementation :: PyPy']
    )
finally:
    # Only clean up the symlink farm if we were under Spark; otherwise we are
    # installing rather than packaging.
    if in_spark:
        # Remove the symlinks or, on platforms without them, the copied trees.
        if _supports_symlinks():
            os.remove(os.path.join(TEMP_PATH, "jars"))
            os.remove(os.path.join(TEMP_PATH, "bin"))
            os.remove(os.path.join(TEMP_PATH, "sbin"))
            os.remove(os.path.join(TEMP_PATH, "examples"))
            os.remove(os.path.join(TEMP_PATH, "data"))
            os.remove(os.path.join(TEMP_PATH, "licenses"))
        else:
            rmtree(os.path.join(TEMP_PATH, "jars"))
            rmtree(os.path.join(TEMP_PATH, "bin"))
            rmtree(os.path.join(TEMP_PATH, "sbin"))
            rmtree(os.path.join(TEMP_PATH, "examples"))
            rmtree(os.path.join(TEMP_PATH, "data"))
            rmtree(os.path.join(TEMP_PATH, "licenses"))
        os.rmdir(TEMP_PATH)
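
# A quick sanity check after "pip install dist/*.tar.gz" (illustrative, not part
# of the build):
#
#   python -c "import pyspark; print(pyspark.__version__)"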