0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import sys
0019
0020 if sys.version > '3':
0021 xrange = range
0022
0023 import numpy as np
0024
0025 from pyspark.mllib.common import callMLlibFunc
0026 from pyspark.rdd import RDD
0027
0028
class KernelDensity(object):
    """
    Estimate probability density at required points given an RDD of samples
    from the population.

    >>> kd = KernelDensity()
    >>> sample = sc.parallelize([0.0, 1.0])
    >>> kd.setSample(sample)
    >>> kd.estimate([0.0, 1.0])
    array([ 0.12938758, 0.12938758])
    """
    def __init__(self):
        # Smoothing bandwidth; callers may override via setBandwidth().
        self._bandwidth = 1.0
        # RDD of sample points; must be supplied via setSample() before
        # estimate() can be called.
        self._sample = None

    def setBandwidth(self, bandwidth):
        """Set bandwidth of each sample. Defaults to 1.0"""
        self._bandwidth = bandwidth

    def setSample(self, sample):
        """Set sample points from the population. Should be a RDD"""
        if not isinstance(sample, RDD):
            raise TypeError("samples should be a RDD, received %s" % type(sample))
        self._sample = sample

    def estimate(self, points):
        """Estimate the probability density at points.

        :param points: iterable of points at which to evaluate the density;
            materialized to a list before being shipped to the JVM.
        :return: numpy array of density estimates, one per point.
        :raises ValueError: if no sample has been set via setSample().
        """
        if self._sample is None:
            # Fail fast with a clear message instead of passing None to the
            # JVM-side helper, which would surface as an opaque Py4J error.
            raise ValueError("Must call setSample() before estimate()")
        points = list(points)
        densities = callMLlibFunc(
            "estimateKernelDensity", self._sample, self._bandwidth, points)
        return np.asarray(densities)