"""
PySpark is the Python API for Spark.

Public classes:

- :class:`SparkContext`:
    Main entry point for Spark functionality.
- :class:`RDD`:
    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
- :class:`Broadcast`:
    A broadcast variable that gets reused across tasks.
- :class:`Accumulator`:
    An "add-only" shared variable that tasks can only add values to.
- :class:`SparkConf`:
    For configuring Spark.
- :class:`SparkFiles`:
    Access files shipped with jobs.
- :class:`StorageLevel`:
    Finer-grained cache persistence levels.
- :class:`TaskContext`:
    Information about the currently running task, available on the workers; experimental.
- :class:`RDDBarrier`:
    Wraps an RDD under a barrier stage for barrier execution.
- :class:`BarrierTaskContext`:
    A :class:`TaskContext` that provides extra info and tooling for barrier execution.
- :class:`BarrierTaskInfo`:
    Information about a barrier task.
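
A minimal usage sketch of the core entry points (illustrative only; the app name,
master URL, and data below are arbitrary examples, not defaults):

>>> conf = SparkConf().setAppName("example").setMaster("local[2]")
>>> sc = SparkContext.getOrCreate(conf)
>>> sc.parallelize([1, 2, 3, 4]).map(lambda x: x * x).sum()
30
>>> sc.stop()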
"""

from functools import wraps
import types

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD, RDDBarrier
from pyspark.files import SparkFiles
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
from pyspark.profiler import Profiler, BasicProfiler
from pyspark.version import __version__
from pyspark._globals import _NoValue


def since(version):
    """
    A decorator that annotates a function to append the version of Spark in which it was added.
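
    Illustrative example (the version string and function below are made up for
    demonstration and are not part of Spark):

    >>> @since("1.3.0")
    ... def f():
    ...     '''Do something.'''
    >>> print(f.__doc__)
    Do something.
    <BLANKLINE>
    .. versionadded:: 1.3.0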
    """
    import re
    indent_p = re.compile(r'\n( +)')

    def deco(f):
        indents = indent_p.findall(f.__doc__)
        indent = ' ' * (min(len(m) for m in indents) if indents else 0)
        f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f
    return deco


def copy_func(f, name=None, sinceversion=None, doc=None):
    """
    Returns a function with the same code, globals, defaults, closure, and
    name (or a new name, if one is provided).
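
    Illustrative example (the functions `g` and `h` below are hypothetical,
    not part of Spark):

    >>> def g(x):
    ...     return x + 1
    >>> h = copy_func(g, name='h', doc='Adds one.')
    >>> h(1)
    2
    >>> (h.__name__, h.__doc__)
    ('h', 'Adds one.')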
    """
    # Make a shallow copy of the function object: same code, globals, defaults,
    # and closure, optionally under a new name.
    fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__, f.__defaults__,
                            f.__closure__)
    # Copy any attributes set on f (note: this is a shallow copy).
    fn.__dict__.update(f.__dict__)
    if doc is not None:
        fn.__doc__ = doc
    if sinceversion is not None:
        fn = since(sinceversion)(fn)
    return fn


def keyword_only(func):
    """
    A decorator that forces keyword arguments in the wrapped method
    and saves actual input keyword arguments in `_input_kwargs`.

    .. note:: Should only be used to wrap a method whose first argument is `self`.
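
    Illustrative example (`Model` below is a hypothetical class, not part of Spark):

    >>> class Model(object):
    ...     @keyword_only
    ...     def __init__(self, a=1, b=2):
    ...         print(self._input_kwargs)
    >>> m = Model(b=3)
    {'b': 3}
    >>> Model(3)
    Traceback (most recent call last):
        ...
    TypeError: Method __init__ forces keyword arguments.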
    """
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        if len(args) > 0:
            raise TypeError("Method %s forces keyword arguments." % func.__name__)
        self._input_kwargs = kwargs
        return func(self, **kwargs)
    return wrapper


# for backwards compatibility
from pyspark.sql import SQLContext, HiveContext, Row

__all__ = [
    "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",
    "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer",
    "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", "TaskContext",
    "RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo",
]