0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import sys
0019 from abc import abstractmethod, ABCMeta
0020
0021 from pyspark import since, keyword_only
0022 from pyspark.ml.wrapper import JavaParams
0023 from pyspark.ml.param import Param, Params, TypeConverters
0024 from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasProbabilityCol, \
0025 HasRawPredictionCol, HasFeaturesCol, HasWeightCol
0026 from pyspark.ml.common import inherit_doc
0027 from pyspark.ml.util import JavaMLReadable, JavaMLWritable
0028
# Public API of this module: the names exported by a star-import.
# NOTE: the JavaEvaluator base class defined in this file is not listed here.
__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator',
           'MulticlassClassificationEvaluator', 'MultilabelClassificationEvaluator',
           'ClusteringEvaluator', 'RankingEvaluator']
0032
0033
@inherit_doc
class Evaluator(Params):
    """
    Abstract base for evaluators, i.e. objects that reduce a dataset of
    predictions to a single metric value.

    .. versionadded:: 1.4.0
    """

    __metaclass__ = ABCMeta

    @abstractmethod
    def _evaluate(self, dataset):
        """
        Computes the metric for ``dataset``. Concrete evaluators override this.

        :param dataset: a dataset that contains labels/observations and
                        predictions
        :return: metric
        """
        raise NotImplementedError()

    @since("1.4.0")
    def evaluate(self, dataset, params=None):
        """
        Computes the metric for ``dataset``, optionally overriding params.

        :param dataset: a dataset that contains labels/observations and
                        predictions
        :param params: an optional param map (a ``dict``) whose entries
                       override the params embedded in this evaluator
        :return: metric
        :raises ValueError: if ``params`` is neither ``None`` nor a ``dict``
        """
        overrides = dict() if params is None else params
        if not isinstance(overrides, dict):
            raise ValueError("Params must be a param map but got %s." % type(params))
        # Only evaluate on a copy when there is actually something to override.
        target = self.copy(overrides) if overrides else self
        return target._evaluate(dataset)

    @since("1.5.0")
    def isLargerBetter(self):
        """
        Tells whether a larger value of the metric returned by
        :py:meth:`evaluate` is better (True, the default here) or a smaller
        one is (False).
        An evaluator may support several metrics, some to be maximized and
        some to be minimized.
        """
        return True
0084
0085
@inherit_doc
class JavaEvaluator(JavaParams, Evaluator):
    """
    Base class for :py:class:`Evaluator` implementations that delegate to a
    wrapped Java/Scala evaluator object.
    """

    __metaclass__ = ABCMeta

    def _evaluate(self, dataset):
        """
        Pushes the current params to the wrapped Java object and evaluates there.

        :param dataset: a dataset that contains labels/observations and predictions.
        :return: evaluation metric
        """
        # Params must be transferred first so the Java side sees current values.
        self._transfer_params_to_java()
        java_evaluator = self._java_obj
        return java_evaluator.evaluate(dataset._jdf)

    def isLargerBetter(self):
        # The answer is taken from the Java object, after syncing params to it.
        self._transfer_params_to_java()
        java_evaluator = self._java_obj
        return java_evaluator.isLargerBetter()
0107
0108
@inherit_doc
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
                                    JavaMLReadable, JavaMLWritable):
    """
    Evaluator for binary classification. It reads the rawPrediction and label
    columns, plus a weight column when one is configured.
    The rawPrediction column may hold doubles (binary 0/1 prediction, or probability of label 1)
    or vectors (length-2 vector of raw predictions, scores, or label probabilities).

    >>> from pyspark.ml.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator()
    >>> evaluator.setRawPredictionCol("raw")
    BinaryClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...
    >>> bce_path = temp_path + "/bce"
    >>> evaluator.save(bce_path)
    >>> evaluator2 = BinaryClassificationEvaluator.load(bce_path)
    >>> str(evaluator2.getRawPredictionCol())
    'raw'
    >>> scoreAndLabelsAndWeight = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1], x[2]),
    ...    [(0.1, 0.0, 1.0), (0.1, 1.0, 0.9), (0.4, 0.0, 0.7), (0.6, 0.0, 0.9),
    ...     (0.6, 1.0, 1.0), (0.6, 1.0, 0.3), (0.8, 1.0, 1.0)])
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw", weightCol="weight")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.82...
    >>> evaluator.getNumBins()
    1000

    .. versionadded:: 1.4.0
    """

    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)",
                       typeConverter=TypeConverters.toString)

    # Down-sampling resolution for the curves used in the area computation.
    numBins = Param(Params._dummy(), "numBins", "Number of bins to down-sample the curves "
                    "(ROC curve, PR curve) in area computation. If 0, no down-sampling will "
                    "occur. Must be >= 0.",
                    typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                 metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC", weightCol=None, numBins=1000)
        """
        super(BinaryClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
        self._setDefault(metricName="areaUnderROC", numBins=1000)
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setNumBins(self, value):
        """
        Sets :py:attr:`numBins` to ``value``.
        """
        return self._set(numBins=value)

    @since("3.0.0")
    def getNumBins(self):
        """
        Returns the current numBins, falling back to its default.
        """
        return self.getOrDefault(self.numBins)

    def setLabelCol(self, value):
        """
        Sets :py:attr:`labelCol` to ``value``.
        """
        return self._set(labelCol=value)

    def setRawPredictionCol(self, value):
        """
        Sets :py:attr:`rawPredictionCol` to ``value``.
        """
        return self._set(rawPredictionCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Sets :py:attr:`weightCol` to ``value``.
        """
        return self._set(weightCol=value)

    @keyword_only
    @since("1.4.0")
    def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
                  metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC", weightCol=None, numBins=1000)
        Sets params for binary classification evaluator.
        """
        return self._set(**self._input_kwargs)
0232
0233
@inherit_doc
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
                          JavaMLReadable, JavaMLWritable):
    """
    Evaluator for regression. It reads the prediction and label columns,
    plus a weight column when one is configured.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator()
    >>> evaluator.setPredictionCol("raw")
    RegressionEvaluator...
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...
    >>> re_path = temp_path + "/re"
    >>> evaluator.save(re_path)
    >>> evaluator2 = RegressionEvaluator.load(re_path)
    >>> str(evaluator2.getPredictionCol())
    'raw'
    >>> scoreAndLabelsAndWeight = [(-28.98343821, -27.0, 1.0), (20.21491975, 21.5, 0.8),
    ...   (-25.98418959, -22.0, 1.0), (30.69731842, 33.0, 0.6), (74.69283752, 71.0, 0.2)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
    >>> evaluator.evaluate(dataset)
    2.740...
    >>> evaluator.getThroughOrigin()
    False

    .. versionadded:: 1.4.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       """metric name in evaluation - one of:
                       rmse - root mean squared error (default)
                       mse - mean squared error
                       r2 - r^2 metric
                       mae - mean absolute error
                       var - explained variance.""",
                       typeConverter=TypeConverters.toString)

    # Flag describing whether the regression goes through the origin.
    throughOrigin = Param(Params._dummy(), "throughOrigin",
                          "whether the regression is through the origin.",
                          typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="rmse", weightCol=None, throughOrigin=False):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse", weightCol=None, throughOrigin=False)
        """
        super(RegressionEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        self._setDefault(metricName="rmse", throughOrigin=False)
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setThroughOrigin(self, value):
        """
        Sets :py:attr:`throughOrigin` to ``value``.
        """
        return self._set(throughOrigin=value)

    @since("3.0.0")
    def getThroughOrigin(self):
        """
        Returns the current throughOrigin, falling back to its default.
        """
        return self.getOrDefault(self.throughOrigin)

    def setLabelCol(self, value):
        """
        Sets :py:attr:`labelCol` to ``value``.
        """
        return self._set(labelCol=value)

    def setPredictionCol(self, value):
        """
        Sets :py:attr:`predictionCol` to ``value``.
        """
        return self._set(predictionCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Sets :py:attr:`weightCol` to ``value``.
        """
        return self._set(weightCol=value)

    @keyword_only
    @since("1.4.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="rmse", weightCol=None, throughOrigin=False):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="rmse", weightCol=None, throughOrigin=False)
        Sets params for regression evaluator.
        """
        return self._set(**self._input_kwargs)
0356
0357
@inherit_doc
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
                                        HasProbabilityCol, JavaMLReadable, JavaMLWritable):
    """
    Evaluator for multiclass classification. Input columns are
    prediction, label, weight (optional) and probabilityCol (only for logLoss).

    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    >>> evaluator = MulticlassClassificationEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
    ...     evaluator.metricLabel: 1.0})
    0.75...
    >>> evaluator.setMetricName("hammingLoss")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.33...
    >>> mce_path = temp_path + "/mce"
    >>> evaluator.save(mce_path)
    >>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'
    >>> scoreAndLabelsAndWeight = [(0.0, 0.0, 1.0), (0.0, 1.0, 1.0), (0.0, 0.0, 1.0),
    ...     (1.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0),
    ...     (2.0, 2.0, 1.0), (2.0, 0.0, 1.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["prediction", "label", "weight"])
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
    ...     weightCol="weight")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...
    >>> predictionAndLabelsWithProbabilities = [
    ...      (1.0, 1.0, 1.0, [0.1, 0.8, 0.1]), (0.0, 2.0, 1.0, [0.9, 0.05, 0.05]),
    ...      (0.0, 0.0, 1.0, [0.8, 0.2, 0.0]), (1.0, 1.0, 1.0, [0.3, 0.65, 0.05])]
    >>> dataset = spark.createDataFrame(predictionAndLabelsWithProbabilities, ["prediction",
    ...     "label", "weight", "probability"])
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
    ...     probabilityCol="probability")
    >>> evaluator.setMetricName("logLoss")
    MulticlassClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.9682...

    .. versionadded:: 1.5.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation "
                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
                       "logLoss|hammingLoss)",
                       typeConverter=TypeConverters.toString)
    # Class label used by the per-label ("...ByLabel") metrics.
    metricLabel = Param(Params._dummy(), "metricLabel",
                        "The class whose metric will be computed in truePositiveRateByLabel|"
                        "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel."
                        " Must be >= 0. The default value is 0.",
                        typeConverter=TypeConverters.toFloat)
    # Beta of the F-measure metrics.
    beta = Param(Params._dummy(), "beta",
                 "The beta value used in weightedFMeasure|fMeasureByLabel."
                 " Must be > 0. The default value is 1.",
                 typeConverter=TypeConverters.toFloat)
    # Clipping bound applied to probabilities for log-loss.
    eps = Param(Params._dummy(), "eps",
                "log-loss is undefined for p=0 or p=1, so probabilities are clipped to "
                "max(eps, min(1 - eps, p)). "
                "Must be in range (0, 0.5). The default value is 1e-15.",
                typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
                 probabilityCol="probability", eps=1e-15):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
                 probabilityCol="probability", eps=1e-15)
        """
        super(MulticlassClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
        self._setDefault(metricName="f1", metricLabel=0.0, beta=1.0, eps=1e-15)
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @since("1.5.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("1.5.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Sets :py:attr:`metricLabel` to ``value``.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Returns the current metricLabel, falling back to its default.
        """
        return self.getOrDefault(self.metricLabel)

    @since("3.0.0")
    def setBeta(self, value):
        """
        Sets :py:attr:`beta` to ``value``.
        """
        return self._set(beta=value)

    @since("3.0.0")
    def getBeta(self):
        """
        Returns the current beta, falling back to its default.
        """
        return self.getOrDefault(self.beta)

    @since("3.0.0")
    def setEps(self, value):
        """
        Sets :py:attr:`eps` to ``value``.
        """
        return self._set(eps=value)

    @since("3.0.0")
    def getEps(self):
        """
        Returns the current eps, falling back to its default.
        """
        return self.getOrDefault(self.eps)

    def setLabelCol(self, value):
        """
        Sets :py:attr:`labelCol` to ``value``.
        """
        return self._set(labelCol=value)

    def setPredictionCol(self, value):
        """
        Sets :py:attr:`predictionCol` to ``value``.
        """
        return self._set(predictionCol=value)

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Sets :py:attr:`probabilityCol` to ``value``.
        """
        return self._set(probabilityCol=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Sets :py:attr:`weightCol` to ``value``.
        """
        return self._set(weightCol=value)

    @keyword_only
    @since("1.5.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
                  probabilityCol="probability", eps=1e-15):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
                  probabilityCol="probability", eps=1e-15)
        Sets params for multiclass classification evaluator.
        """
        return self._set(**self._input_kwargs)
0544
0545
@inherit_doc
class MultilabelClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                                        JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for multilabel classification. It reads two input columns:
    prediction and label.

    >>> scoreAndLabels = [([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
    ...     ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
    ...     ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MultilabelClassificationEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    MultilabelClassificationEvaluator...
    >>> evaluator.evaluate(dataset)
    0.63...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.54...
    >>> mlce_path = temp_path + "/mlce"
    >>> evaluator.save(mlce_path)
    >>> evaluator2 = MultilabelClassificationEvaluator.load(mlce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation "
                       "(subsetAccuracy|accuracy|hammingLoss|precision|recall|f1Measure|"
                       "precisionByLabel|recallByLabel|f1MeasureByLabel|microPrecision|"
                       "microRecall|microF1Measure)",
                       typeConverter=TypeConverters.toString)
    # Class label used by the per-label ("...ByLabel") metrics.
    metricLabel = Param(Params._dummy(), "metricLabel",
                        "The class whose metric will be computed in precisionByLabel|"
                        "recallByLabel|f1MeasureByLabel. "
                        "Must be >= 0. The default value is 0.",
                        typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="f1Measure", metricLabel=0.0):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1Measure", metricLabel=0.0)
        """
        super(MultilabelClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MultilabelClassificationEvaluator", self.uid)
        self._setDefault(metricName="f1Measure", metricLabel=0.0)
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("3.0.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Sets :py:attr:`metricLabel` to ``value``.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Returns the current metricLabel, falling back to its default.
        """
        return self.getOrDefault(self.metricLabel)

    @since("3.0.0")
    def setLabelCol(self, value):
        """
        Sets :py:attr:`labelCol` to ``value``.
        """
        return self._set(labelCol=value)

    @since("3.0.0")
    def setPredictionCol(self, value):
        """
        Sets :py:attr:`predictionCol` to ``value``.
        """
        return self._set(predictionCol=value)

    @keyword_only
    @since("3.0.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="f1Measure", metricLabel=0.0):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1Measure", metricLabel=0.0)
        Sets params for multilabel classification evaluator.
        """
        return self._set(**self._input_kwargs)
0654
0655
@inherit_doc
class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol,
                          JavaMLReadable, JavaMLWritable):
    """
    Evaluator for clustering results. It reads two input columns:
    prediction and features. The metric is the Silhouette measure,
    computed with the configured distance measure (squared Euclidean
    by default).

    The Silhouette is a measure for the validation of the consistency
    within clusters. It ranges between 1 and -1, where a value close to
    1 means that the points in a cluster are close to the other points
    in the same cluster and far from the points of the other clusters.

    >>> from pyspark.ml.linalg import Vectors
    >>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
    ...     [([0.0, 0.5], 0.0), ([0.5, 0.0], 0.0), ([10.0, 11.0], 1.0),
    ...     ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)])
    >>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
    ...
    >>> evaluator = ClusteringEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    ClusteringEvaluator...
    >>> evaluator.evaluate(dataset)
    0.9079...
    >>> ce_path = temp_path + "/ce"
    >>> evaluator.save(ce_path)
    >>> evaluator2 = ClusteringEvaluator.load(ce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 2.3.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (silhouette)",
                       typeConverter=TypeConverters.toString)
    # Distance measure used by the metric.
    distanceMeasure = Param(Params._dummy(), "distanceMeasure",
                            "The distance measure. "
                            "Supported options: 'squaredEuclidean' and 'cosine'.",
                            typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, predictionCol="prediction", featuresCol="features",
                 metricName="silhouette", distanceMeasure="squaredEuclidean"):
        """
        __init__(self, predictionCol="prediction", featuresCol="features", \
                 metricName="silhouette", distanceMeasure="squaredEuclidean")
        """
        super(ClusteringEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid)
        self._setDefault(metricName="silhouette", distanceMeasure="squaredEuclidean")
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @keyword_only
    @since("2.3.0")
    def setParams(self, predictionCol="prediction", featuresCol="features",
                  metricName="silhouette", distanceMeasure="squaredEuclidean"):
        """
        setParams(self, predictionCol="prediction", featuresCol="features", \
                  metricName="silhouette", distanceMeasure="squaredEuclidean")
        Sets params for clustering evaluator.
        """
        return self._set(**self._input_kwargs)

    @since("2.3.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("2.3.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("2.4.0")
    def setDistanceMeasure(self, value):
        """
        Sets :py:attr:`distanceMeasure` to ``value``.
        """
        return self._set(distanceMeasure=value)

    @since("2.4.0")
    def getDistanceMeasure(self):
        """
        Returns the current distanceMeasure, falling back to its default.
        """
        return self.getOrDefault(self.distanceMeasure)

    def setFeaturesCol(self, value):
        """
        Sets :py:attr:`featuresCol` to ``value``.
        """
        return self._set(featuresCol=value)

    def setPredictionCol(self, value):
        """
        Sets :py:attr:`predictionCol` to ``value``.
        """
        return self._set(predictionCol=value)
0760
0761
@inherit_doc
class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                       JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for ranking. It reads two input columns:
    prediction and label.

    >>> scoreAndLabels = [([1.0, 6.0, 2.0, 7.0, 8.0, 3.0, 9.0, 10.0, 4.0, 5.0],
    ...     [1.0, 2.0, 3.0, 4.0, 5.0]),
    ...     ([4.0, 1.0, 5.0, 6.0, 2.0, 7.0, 3.0, 8.0, 9.0, 10.0], [1.0, 2.0, 3.0]),
    ...     ([1.0, 2.0, 3.0, 4.0, 5.0], [])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = RankingEvaluator()
    >>> evaluator.setPredictionCol("prediction")
    RankingEvaluator...
    >>> evaluator.evaluate(dataset)
    0.35...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precisionAtK", evaluator.k: 2})
    0.33...
    >>> ranke_path = temp_path + "/ranke"
    >>> evaluator.save(ranke_path)
    >>> evaluator2 = RankingEvaluator.load(ranke_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """
    # Name of the metric to compute.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation "
                       "(meanAveragePrecision|meanAveragePrecisionAtK|"
                       "precisionAtK|ndcgAtK|recallAtK)",
                       typeConverter=TypeConverters.toString)
    # Ranking cut-off position used by the "...AtK" metrics.
    k = Param(Params._dummy(), "k",
              "The ranking position value used in meanAveragePrecisionAtK|precisionAtK|"
              "ndcgAtK|recallAtK. Must be > 0. The default value is 10.",
              typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self, predictionCol="prediction", labelCol="label",
                 metricName="meanAveragePrecision", k=10):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="meanAveragePrecision", k=10)
        """
        super(RankingEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RankingEvaluator", self.uid)
        self._setDefault(metricName="meanAveragePrecision", k=10)
        # Apply the keyword arguments captured by @keyword_only.
        self._set(**self._input_kwargs)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Sets :py:attr:`metricName` to ``value``.
        """
        return self._set(metricName=value)

    @since("3.0.0")
    def getMetricName(self):
        """
        Returns the current metricName, falling back to its default.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setK(self, value):
        """
        Sets :py:attr:`k` to ``value``.
        """
        return self._set(k=value)

    @since("3.0.0")
    def getK(self):
        """
        Returns the current k, falling back to its default.
        """
        return self.getOrDefault(self.k)

    @since("3.0.0")
    def setLabelCol(self, value):
        """
        Sets :py:attr:`labelCol` to ``value``.
        """
        return self._set(labelCol=value)

    @since("3.0.0")
    def setPredictionCol(self, value):
        """
        Sets :py:attr:`predictionCol` to ``value``.
        """
        return self._set(predictionCol=value)

    @keyword_only
    @since("3.0.0")
    def setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="meanAveragePrecision", k=10):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="meanAveragePrecision", k=10)
        Sets params for ranking evaluator.
        """
        return self._set(**self._input_kwargs)
0869
0870
if __name__ == "__main__":
    # Script entry point: run this module's doctests against a local SparkSession.
    import doctest
    import tempfile
    from shutil import rmtree

    import pyspark.ml.evaluation
    from pyspark.sql import SparkSession

    # The doctests run in a copy of the module namespace, extended with the
    # `spark` and `temp_path` names that the examples refer to.
    globs = pyspark.ml.evaluation.__dict__.copy()

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("ml.evaluation tests")\
        .getOrCreate()
    globs['spark'] = spark
    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always release the session and the scratch directory, including when
        # testmod raises; previously the session was only stopped on success.
        try:
            spark.stop()
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp dir must not fail the run.
                pass
    if failure_count:
        sys.exit(-1)