0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import sys
0019
0020 if sys.version > '3':
0021 xrange = range
0022
0023 import numpy as np
0024
0025 from pyspark.mllib.common import callMLlibFunc
0026 from pyspark.rdd import RDD
0027
0028
class KernelDensity(object):
    """
    Estimate probability density at required points given an RDD of samples
    from the population.

    >>> kd = KernelDensity()
    >>> sample = sc.parallelize([0.0, 1.0])
    >>> kd.setSample(sample)
    >>> kd.estimate([0.0, 1.0])
    array([ 0.12938758, 0.12938758])
    """
    def __init__(self):
        # Smoothing bandwidth; callers may override via setBandwidth().
        self._bandwidth = 1.0
        # RDD of sample points; must be supplied via setSample() before
        # estimate() can be called.
        self._sample = None

    def setBandwidth(self, bandwidth):
        """Set bandwidth of each sample. Defaults to 1.0"""
        self._bandwidth = bandwidth

    def setSample(self, sample):
        """Set sample points from the population. Should be a RDD"""
        if not isinstance(sample, RDD):
            raise TypeError("samples should be a RDD, received %s" % type(sample))
        self._sample = sample

    def estimate(self, points):
        """Estimate the probability density at points.

        :param points: iterable of points at which to evaluate the density;
            materialized to a list before being shipped to the JVM.
        :return: numpy array of density estimates, one per point.
        :raises ValueError: if no sample has been set via setSample().
        """
        if self._sample is None:
            # Fail fast with a clear message instead of passing None to the
            # JVM-side helper, which would surface as an opaque Py4J error.
            raise ValueError("Must call setSample() before estimate()")
        points = list(points)
        densities = callMLlibFunc(
            "estimateKernelDensity", self._sample, self._bandwidth, points)
        return np.asarray(densities)