pyspark/ml/evaluation.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 import sys
0019 from abc import abstractmethod, ABCMeta
0020
0021 from pyspark import since, keyword_only
0022 from pyspark.ml.wrapper import JavaParams
0023 from pyspark.ml.param import Param, Params, TypeConverters
0024 from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasProbabilityCol, \
0025     HasRawPredictionCol, HasFeaturesCol, HasWeightCol
0026 from pyspark.ml.common import inherit_doc
0027 from pyspark.ml.util import JavaMLReadable, JavaMLWritable
0028
# Public names of this module (what ``from pyspark.ml.evaluation import *`` exposes).
# JavaEvaluator is defined in this module but is not listed here.
__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator',
           'MulticlassClassificationEvaluator', 'MultilabelClassificationEvaluator',
           'ClusteringEvaluator', 'RankingEvaluator']
0032
0033
@inherit_doc
class Evaluator(Params):
    """
    Abstract base for evaluators, i.e. objects that reduce a dataset of
    labels/observations and predictions to a single metric.

    Subclasses provide :py:meth:`_evaluate`; callers go through
    :py:meth:`evaluate`.

    .. versionadded:: 1.4.0
    """

    # Python 2 style metaclass declaration (Python 3 ignores this attribute).
    __metaclass__ = ABCMeta

    @abstractmethod
    def _evaluate(self, dataset):
        """
        Computes the metric for ``dataset``. Concrete evaluators override this.

        :param dataset: a dataset that contains labels/observations and
               predictions
        :return: metric
        """
        raise NotImplementedError()

    @since("1.4.0")
    def evaluate(self, dataset, params=None):
        """
        Computes the metric for ``dataset``, optionally overriding params.

        :param dataset: a dataset that contains labels/observations and
                        predictions
        :param params: an optional param map (a ``dict``) that overrides
                       embedded params; ``None`` means no overrides
        :return: metric
        :raises ValueError: if ``params`` is neither ``None`` nor a ``dict``
        """
        overrides = dict() if params is None else params
        if not isinstance(overrides, dict):
            raise ValueError("Params must be a param map but got %s." % type(params))
        if overrides:
            # Non-empty overrides: evaluate via self.copy(...) instead of self.
            return self.copy(overrides)._evaluate(dataset)
        return self._evaluate(dataset)

    @since("1.5.0")
    def isLargerBetter(self):
        """
        Tells whether the metric returned by :py:meth:`evaluate` should be maximized
        (True, default) or minimized (False).
        A given evaluator may support multiple metrics which may be maximized or minimized.
        """
        return True
0084
0085
@inherit_doc
class JavaEvaluator(JavaParams, Evaluator):
    """
    Common base for :py:class:`Evaluator`s whose work is delegated to a
    Java/Scala implementation held in ``_java_obj``.
    """

    __metaclass__ = ABCMeta

    def _evaluate(self, dataset):
        """
        Evaluates the output by calling the wrapped Java object.
        :param dataset: a dataset that contains labels/observations and predictions.
        :return: evaluation metric
        """
        # Push the Python-side params to the Java object before using it.
        self._transfer_params_to_java()
        java_evaluator = self._java_obj
        return java_evaluator.evaluate(dataset._jdf)

    # No docstring on purpose: the file applies @inherit_doc to this class.
    def isLargerBetter(self):
        self._transfer_params_to_java()
        java_evaluator = self._java_obj
        return java_evaluator.isLargerBetter()
0107
0108
@inherit_doc
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
                                    JavaMLReadable, JavaMLWritable):
    """
    Evaluator for binary classification. It reads the rawPrediction and label
    input columns and, optionally, a weight column.
    The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
    1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).

    >>> from pyspark.ml.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator()
    >>> evaluator.setRawPredictionCol("raw")
    BinaryClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...
    >>> bce_path = temp_path + "/bce"
    >>> evaluator.save(bce_path)
    >>> evaluator2 = BinaryClassificationEvaluator.load(bce_path)
    >>> str(evaluator2.getRawPredictionCol())
    'raw'
    >>> scoreAndLabelsAndWeight = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1], x[2]),
    ...    [(0.1, 0.0, 1.0), (0.1, 1.0, 0.9), (0.4, 0.0, 0.7), (0.6, 0.0, 0.9),
    ...     (0.6, 1.0, 1.0), (0.6, 1.0, 0.3), (0.8, 1.0, 1.0)])
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw", weightCol="weight")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.82...
    >>> evaluator.getNumBins()
    1000

    .. versionadded:: 1.4.0
    """

    # Which area metric to report.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)",
                       typeConverter=TypeConverters.toString)

    # Curve down-sampling granularity; 0 disables down-sampling.
    numBins = Param(Params._dummy(), "numBins", "Number of bins to down-sample the curves "
                    "(ROC curve, PR curve) in area computation. If 0, no down-sampling will "
                    "occur. Must be >= 0.",
                    typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                 metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC", weightCol=None, numBins=1000)
        """
        super(BinaryClassificationEvaluator, self).__init__()
        # Instantiate the wrapped Java/Scala evaluator class.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
        self._setDefault(metricName="areaUnderROC", numBins=1000)
        # Apply the keyword arguments captured in _input_kwargs.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
                  metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC", weightCol=None, numBins=1000)
        Sets several params of this binary classification evaluator at once.
        """
        return self._set(**self._input_kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Assigns ``value`` to :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Returns the value of metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setNumBins(self, value):
        """
        Assigns ``value`` to :py:attr:`numBins`.
        """
        return self._set(numBins=value)

    @since("3.0.0")
    def getNumBins(self):
        """
        Returns the value of numBins, falling back to its default.
        """
        return self.getOrDefault(self.numBins)

    def setLabelCol(self, value):
        """
        Assigns ``value`` to :py:attr:`labelCol`.
        """
        return self._set(labelCol=value)

    def setRawPredictionCol(self, value):
        """
        Assigns ``value`` to :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Assigns ``value`` to :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)
0232
0233
@inherit_doc
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
                          JavaMLReadable, JavaMLWritable):
    """
    Evaluator for Regression. It reads the prediction and label input
    columns and, optionally, a weight column.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator()
    >>> evaluator.setPredictionCol("raw")
    RegressionEvaluator...
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...
    >>> re_path = temp_path + "/re"
    >>> evaluator.save(re_path)
    >>> evaluator2 = RegressionEvaluator.load(re_path)
    >>> str(evaluator2.getPredictionCol())
    'raw'
    >>> scoreAndLabelsAndWeight = [(-28.98343821, -27.0, 1.0), (20.21491975, 21.5, 0.8),
    ...   (-25.98418959, -22.0, 1.0), (30.69731842, 33.0, 0.6), (74.69283752, 71.0, 0.2)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
    >>> evaluator.evaluate(dataset)
    2.740...
    >>> evaluator.getThroughOrigin()
    False

    .. versionadded:: 1.4.0
    """
    # The help text below is a runtime string; its embedded line breaks and
    # indentation are part of the value.
    metricName = Param(Params._dummy(), "metricName",
                       """metric name in evaluation - one of:
                       rmse - root mean squared error (default)
                       mse - mean squared error
                       r2 - r^2 metric
                       mae - mean absolute error
                       var - explained variance.""",
                       typeConverter=TypeConverters.toString)

    throughOrigin = Param(Params._dummy(), "throughOrigin",
                          "whether the regression is through the origin.",
                          typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="rmse", weightCol=None, throughOrigin=False):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse", weightCol=None, throughOrigin=False)
        """
        super(RegressionEvaluator, self).__init__()
        # Instantiate the wrapped Java/Scala evaluator class.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        self._setDefault(metricName="rmse", throughOrigin=False)
        # Apply the keyword arguments captured in _input_kwargs.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="rmse", weightCol=None, throughOrigin=False):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="rmse", weightCol=None, throughOrigin=False)
        Sets several params of this regression evaluator at once.
        """
        return self._set(**self._input_kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Assigns ``value`` to :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Returns the value of metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setThroughOrigin(self, value):
        """
        Assigns ``value`` to :py:attr:`throughOrigin`.
        """
        return self._set(throughOrigin=value)

    @since("3.0.0")
    def getThroughOrigin(self):
        """
        Returns the value of throughOrigin, falling back to its default.
        """
        return self.getOrDefault(self.throughOrigin)

    def setLabelCol(self, value):
        """
        Assigns ``value`` to :py:attr:`labelCol`.
        """
        return self._set(labelCol=value)

    def setPredictionCol(self, value):
        """
        Assigns ``value`` to :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Assigns ``value`` to :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)
0356
0357
@inherit_doc
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
                                        HasProbabilityCol, JavaMLReadable, JavaMLWritable):
    """
    Evaluator for Multiclass Classification. Input columns are
    prediction, label, weight (optional) and probabilityCol (only for logLoss).

    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    >>> evaluator = MulticlassClassificationEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
    ...     evaluator.metricLabel: 1.0})
    0.75...
    >>> evaluator.setMetricName("hammingLoss")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.33...
    >>> mce_path = temp_path + "/mce"
    >>> evaluator.save(mce_path)
    >>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'
    >>> scoreAndLabelsAndWeight = [(0.0, 0.0, 1.0), (0.0, 1.0, 1.0), (0.0, 0.0, 1.0),
    ...     (1.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0),
    ...     (2.0, 2.0, 1.0), (2.0, 0.0, 1.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["prediction", "label", "weight"])
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
    ...     weightCol="weight")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...
    >>> predictionAndLabelsWithProbabilities = [
    ...      (1.0, 1.0, 1.0, [0.1, 0.8, 0.1]), (0.0, 2.0, 1.0, [0.9, 0.05, 0.05]),
    ...      (0.0, 0.0, 1.0, [0.8, 0.2, 0.0]), (1.0, 1.0, 1.0, [0.3, 0.65, 0.05])]
    >>> dataset = spark.createDataFrame(predictionAndLabelsWithProbabilities, ["prediction",
    ...     "label", "weight", "probability"])
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
    ...     probabilityCol="probability")
    >>> evaluator.setMetricName("logLoss")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.9682...

    .. versionadded:: 1.5.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation "
                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
                       "logLoss|hammingLoss)",
                       typeConverter=TypeConverters.toString)
    # Class selected for the per-label ("...ByLabel") metrics.
    metricLabel = Param(Params._dummy(), "metricLabel",
                        "The class whose metric will be computed in truePositiveRateByLabel|"
                        "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel."
                        " Must be >= 0. The default value is 0.",
                        typeConverter=TypeConverters.toFloat)
    # Beta of the F-measure based metrics.
    beta = Param(Params._dummy(), "beta",
                 "The beta value used in weightedFMeasure|fMeasureByLabel."
                 " Must be > 0. The default value is 1.",
                 typeConverter=TypeConverters.toFloat)
    # Clipping bound applied to probabilities for logLoss.
    eps = Param(Params._dummy(), "eps",
                "log-loss is undefined for p=0 or p=1, so probabilities are clipped to "
                "max(eps, min(1 - eps, p)). "
                "Must be in range (0, 0.5). The default value is 1e-15.",
                typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
                 probabilityCol="probability", eps=1e-15):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
                 probabilityCol="probability", eps=1e-15)
        """
        super(MulticlassClassificationEvaluator, self).__init__()
        # Instantiate the wrapped Java/Scala evaluator class.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
        self._setDefault(metricName="f1", metricLabel=0.0, beta=1.0, eps=1e-15)
        # Apply the keyword arguments captured in _input_kwargs.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("1.5.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
                  probabilityCol="probability", eps=1e-15):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
                  probabilityCol="probability", eps=1e-15)
        Sets several params of this multiclass classification evaluator at once.
        """
        return self._set(**self._input_kwargs)

    @since("1.5.0")
    def setMetricName(self, value):
        """
        Assigns ``value`` to :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.5.0")
    def getMetricName(self):
        """
        Returns the value of metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Assigns ``value`` to :py:attr:`metricLabel`.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Returns the value of metricLabel, falling back to its default.
        """
        return self.getOrDefault(self.metricLabel)

    @since("3.0.0")
    def setBeta(self, value):
        """
        Assigns ``value`` to :py:attr:`beta`.
        """
        return self._set(beta=value)

    @since("3.0.0")
    def getBeta(self):
        """
        Returns the value of beta, falling back to its default.
        """
        return self.getOrDefault(self.beta)

    @since("3.0.0")
    def setEps(self, value):
        """
        Assigns ``value`` to :py:attr:`eps`.
        """
        return self._set(eps=value)

    @since("3.0.0")
    def getEps(self):
        """
        Returns the value of eps, falling back to its default.
        """
        return self.getOrDefault(self.eps)

    def setLabelCol(self, value):
        """
        Assigns ``value`` to :py:attr:`labelCol`.
        """
        return self._set(labelCol=value)

    def setPredictionCol(self, value):
        """
        Assigns ``value`` to :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Assigns ``value`` to :py:attr:`probabilityCol`.
        """
        return self._set(probabilityCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Assigns ``value`` to :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)
0544
0545
@inherit_doc
class MultilabelClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                                        JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Multilabel Classification. It reads two input
    columns: prediction and label.

    >>> scoreAndLabels = [([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
    ...     ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
    ...     ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MultilabelClassificationEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    MultilabelClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.63...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.54...
    >>> mlce_path = temp_path + "/mlce"
    >>> evaluator.save(mlce_path)
    >>> evaluator2 = MultilabelClassificationEvaluator.load(mlce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation "
                       "(subsetAccuracy|accuracy|hammingLoss|precision|recall|f1Measure|"
                       "precisionByLabel|recallByLabel|f1MeasureByLabel|microPrecision|"
                       "microRecall|microF1Measure)",
                       typeConverter=TypeConverters.toString)
    # Class selected for the per-label ("...ByLabel") metrics.
    metricLabel = Param(Params._dummy(), "metricLabel",
                        "The class whose metric will be computed in precisionByLabel|"
                        "recallByLabel|f1MeasureByLabel. "
                        "Must be >= 0. The default value is 0.",
                        typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="f1Measure", metricLabel=0.0):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1Measure", metricLabel=0.0)
        """
        super(MultilabelClassificationEvaluator, self).__init__()
        # Instantiate the wrapped Java/Scala evaluator class.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MultilabelClassificationEvaluator", self.uid)
        self._setDefault(metricName="f1Measure", metricLabel=0.0)
        # Apply the keyword arguments captured in _input_kwargs.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("3.0.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="f1Measure", metricLabel=0.0):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1Measure", metricLabel=0.0)
        Sets several params of this multilabel classification evaluator at once.
        """
        return self._set(**self._input_kwargs)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Assigns ``value`` to :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("3.0.0")
    def getMetricName(self):
        """
        Returns the value of metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Assigns ``value`` to :py:attr:`metricLabel`.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Returns the value of metricLabel, falling back to its default.
        """
        return self.getOrDefault(self.metricLabel)

    @since("3.0.0")
    def setLabelCol(self, value):
        """
        Assigns ``value`` to :py:attr:`labelCol`.
        """
        return self._set(labelCol=value)

    @since("3.0.0")
    def setPredictionCol(self, value):
        """
        Assigns ``value`` to :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)
0654
0655
@inherit_doc
class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol,
                          JavaMLReadable, JavaMLWritable):
    """
    Evaluator for Clustering results. It reads two input
    columns: prediction and features. The metric computes the Silhouette
    measure using the distance measure selected by :py:attr:`distanceMeasure`
    (``squaredEuclidean`` by default).

    The Silhouette is a measure for the validation of the consistency
    within clusters. It ranges between 1 and -1, where a value close to
    1 means that the points in a cluster are close to the other points
    in the same cluster and far from the points of the other clusters.

    >>> from pyspark.ml.linalg import Vectors
    >>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
    ...     [([0.0, 0.5], 0.0), ([0.5, 0.0], 0.0), ([10.0, 11.0], 1.0),
    ...     ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)])
    >>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
    ...
    >>> evaluator = ClusteringEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    ClusteringEvaluator...
    >>> evaluator.evaluate(dataset)
    0.9079...
    >>> ce_path = temp_path + "/ce"
    >>> evaluator.save(ce_path)
    >>> evaluator2 = ClusteringEvaluator.load(ce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 2.3.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (silhouette)",
                       typeConverter=TypeConverters.toString)
    # Distance used when computing the metric.
    distanceMeasure = Param(Params._dummy(), "distanceMeasure",
                            "The distance measure. "
                            "Supported options: 'squaredEuclidean' and 'cosine'.",
                            typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, predictionCol="prediction", featuresCol="features",
                 metricName="silhouette", distanceMeasure="squaredEuclidean"):
        """
        __init__(self, predictionCol="prediction", featuresCol="features", \
                 metricName="silhouette", distanceMeasure="squaredEuclidean")
        """
        super(ClusteringEvaluator, self).__init__()
        # Instantiate the wrapped Java/Scala evaluator class.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid)
        self._setDefault(metricName="silhouette", distanceMeasure="squaredEuclidean")
        # Apply the keyword arguments captured in _input_kwargs.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("2.3.0")
    def setParams(self, predictionCol="prediction", featuresCol="features",
                  metricName="silhouette", distanceMeasure="squaredEuclidean"):
        """
        setParams(self, predictionCol="prediction", featuresCol="features", \
                  metricName="silhouette", distanceMeasure="squaredEuclidean")
        Sets several params of this clustering evaluator at once.
        """
        return self._set(**self._input_kwargs)

    @since("2.3.0")
    def setMetricName(self, value):
        """
        Assigns ``value`` to :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("2.3.0")
    def getMetricName(self):
        """
        Returns the value of metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("2.4.0")
    def setDistanceMeasure(self, value):
        """
        Assigns ``value`` to :py:attr:`distanceMeasure`.
        """
        return self._set(distanceMeasure=value)

    @since("2.4.0")
    def getDistanceMeasure(self):
        """
        Returns the value of distanceMeasure, falling back to its default.
        """
        return self.getOrDefault(self.distanceMeasure)

    def setFeaturesCol(self, value):
        """
        Assigns ``value`` to :py:attr:`featuresCol`.
        """
        return self._set(featuresCol=value)

    def setPredictionCol(self, value):
        """
        Assigns ``value`` to :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)
0760
0761
@inherit_doc
class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                       JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for ranking. It reads two input columns, prediction and
    label; in the example below each row holds a list of values in both
    columns. The metric is chosen through :py:attr:`metricName`, and the
    ``...AtK`` metrics additionally use the ranking position :py:attr:`k`.

    >>> scoreAndLabels = [([1.0, 6.0, 2.0, 7.0, 8.0, 3.0, 9.0, 10.0, 4.0, 5.0],
    ...     [1.0, 2.0, 3.0, 4.0, 5.0]),
    ...     ([4.0, 1.0, 5.0, 6.0, 2.0, 7.0, 3.0, 8.0, 9.0, 10.0], [1.0, 2.0, 3.0]),
    ...     ([1.0, 2.0, 3.0, 4.0, 5.0], [])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = RankingEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    RankingEvaluator...
    >>> evaluator.evaluate(dataset)
    0.35...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precisionAtK", evaluator.k: 2})
    0.33...
    >>> ranke_path = temp_path + "/ranke"
    >>> evaluator.save(ranke_path)
    >>> evaluator2 = RankingEvaluator.load(ranke_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """

    # Name of the ranking metric to compute.
    metricName = Param(
        Params._dummy(), "metricName",
        "metric name in evaluation "
        "(meanAveragePrecision|meanAveragePrecisionAtK|"
        "precisionAtK|ndcgAtK|recallAtK)",
        typeConverter=TypeConverters.toString)

    # Ranking position used by the "...AtK" metrics.
    k = Param(
        Params._dummy(), "k",
        "The ranking position value used in meanAveragePrecisionAtK|precisionAtK|"
        "ndcgAtK|recallAtK. Must be > 0. The default value is 10.",
        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="meanAveragePrecision", k=10):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="meanAveragePrecision", k=10)
        """
        super(RankingEvaluator, self).__init__()
        # Create the JVM-side evaluator under the same uid as this wrapper.
        java_class = "org.apache.spark.ml.evaluation.RankingEvaluator"
        self._java_obj = self._new_java_obj(java_class, self.uid)
        # Register the defaults first, then apply the caller's arguments.
        self._setDefault(metricName="meanAveragePrecision", k=10)
        self._set(**self._input_kwargs)

    @keyword_only
    @since("3.0.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="meanAveragePrecision", k=10):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="meanAveragePrecision", k=10)
        Sets params for ranking evaluator.
        """
        return self._set(**self._input_kwargs)

    @since("3.0.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        metric_param = self.metricName
        return self.getOrDefault(metric_param)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.

        :param value: new value for the ``metricName`` param
        """
        outcome = self._set(metricName=value)
        return outcome

    @since("3.0.0")
    def getK(self):
        """
        Gets the value of k or its default value.
        """
        k_param = self.k
        return self.getOrDefault(k_param)

    @since("3.0.0")
    def setK(self, value):
        """
        Sets the value of :py:attr:`k`.

        :param value: new value for the ``k`` param
        """
        outcome = self._set(k=value)
        return outcome

    @since("3.0.0")
    def setLabelCol(self, value):
        """
        Sets the value of :py:attr:`labelCol`.

        :param value: new value for the ``labelCol`` param
        """
        outcome = self._set(labelCol=value)
        return outcome

    @since("3.0.0")
    def setPredictionCol(self, value):
        """
        Sets the value of :py:attr:`predictionCol`.

        :param value: new value for the ``predictionCol`` param
        """
        outcome = self._set(predictionCol=value)
        return outcome
0869
0870
if __name__ == "__main__":
    # Doctest driver: runs every docstring example in this module against a
    # local SparkSession and exits with a non-zero status on any failure.
    import doctest
    import tempfile
    from shutil import rmtree
    import pyspark.ml.evaluation
    from pyspark.sql import SparkSession
    # Work on a copy of the module namespace so the fixtures injected below
    # (``spark``, ``temp_path``) do not leak into the real module.
    globs = pyspark.ml.evaluation.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("ml.evaluation tests")\
        .getOrCreate()
    globs['spark'] = spark
    # Scratch directory used by the save/load examples in the doctests.
    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Stop the session and remove the scratch directory even when
        # testmod itself raises, so neither is left behind.
        try:
            spark.stop()
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp dir must not mask the
                # test outcome.
                pass
    if failure_count:
        sys.exit(-1)