"""
PySpark is the Python API for Spark.

Public classes:

- :class:`SparkContext`:
    Main entry point for Spark functionality.
- :class:`RDD`:
    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
- :class:`Broadcast`:
    A broadcast variable that gets reused across tasks.
- :class:`Accumulator`:
    An "add-only" shared variable that tasks can only add values to.
- :class:`SparkConf`:
    For configuring Spark.
- :class:`SparkFiles`:
    Access files shipped with jobs.
- :class:`StorageLevel`:
    Finer-grained cache persistence levels.
- :class:`TaskContext`:
    Information about the currently running task, available on the workers; experimental.
- :class:`RDDBarrier`:
    Wraps an RDD under a barrier stage for barrier execution.
- :class:`BarrierTaskContext`:
    A :class:`TaskContext` that provides extra info and tooling for barrier execution.
- :class:`BarrierTaskInfo`:
    Information about a barrier task.
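
A minimal usage sketch of the core entry points (illustrative only; the app name,
master URL, and data below are arbitrary examples, not defaults):

>>> conf = SparkConf().setAppName("example").setMaster("local[2]")
>>> sc = SparkContext.getOrCreate(conf)
>>> sc.parallelize([1, 2, 3, 4]).map(lambda x: x * x).sum()
30
>>> sc.stop()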
"""

from functools import wraps
import types

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD, RDDBarrier
from pyspark.files import SparkFiles
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
from pyspark.profiler import Profiler, BasicProfiler
from pyspark.version import __version__
from pyspark._globals import _NoValue


def since(version):
    """
    A decorator that annotates a function to append the version of Spark in which it was added.
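
    Illustrative example (the version string and function below are made up for
    demonstration and are not part of Spark):

    >>> @since("1.3.0")
    ... def f():
    ...     '''Do something.'''
    >>> print(f.__doc__)
    Do something.
    <BLANKLINE>
    .. versionadded:: 1.3.0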
    """
    import re
    indent_p = re.compile(r'\n( +)')

    def deco(f):
        indents = indent_p.findall(f.__doc__)
        indent = ' ' * (min(len(m) for m in indents) if indents else 0)
        f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f
    return deco


def copy_func(f, name=None, sinceversion=None, doc=None):
    """
    Returns a function with the same code, globals, defaults, closure, and
    name (or a new name, if one is provided).
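
    Illustrative example (the functions `g` and `h` below are hypothetical,
    not part of Spark):

    >>> def g(x):
    ...     return x + 1
    >>> h = copy_func(g, name='h', doc='Adds one.')
    >>> h(1)
    2
    >>> (h.__name__, h.__doc__)
    ('h', 'Adds one.')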
    """
    # Make a shallow copy of the function object: same code, globals, defaults,
    # and closure, optionally under a new name.
    fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__, f.__defaults__,
                            f.__closure__)
    # Copy any attributes set on f (note: this is a shallow copy).
    fn.__dict__.update(f.__dict__)
    if doc is not None:
        fn.__doc__ = doc
    if sinceversion is not None:
        fn = since(sinceversion)(fn)
    return fn


def keyword_only(func):
    """
    A decorator that forces keyword arguments in the wrapped method
    and saves actual input keyword arguments in `_input_kwargs`.

    .. note:: Should only be used to wrap a method whose first argument is `self`.
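
    Illustrative example (`Model` below is a hypothetical class, not part of Spark):

    >>> class Model(object):
    ...     @keyword_only
    ...     def __init__(self, a=1, b=2):
    ...         print(self._input_kwargs)
    >>> m = Model(b=3)
    {'b': 3}
    >>> Model(3)
    Traceback (most recent call last):
        ...
    TypeError: Method __init__ forces keyword arguments.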
    """
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        if len(args) > 0:
            raise TypeError("Method %s forces keyword arguments." % func.__name__)
        self._input_kwargs = kwargs
        return func(self, **kwargs)
    return wrapper


# for backwards compatibility
from pyspark.sql import SQLContext, HiveContext, Row

__all__ = [
    "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",
    "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer",
    "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", "TaskContext",
    "RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo",
]