0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import unittest
0019
0020 import numpy as np
0021
0022 from pyspark.ml.evaluation import ClusteringEvaluator, RegressionEvaluator
0023 from pyspark.ml.linalg import Vectors
0024 from pyspark.sql import Row
0025 from pyspark.testing.mlutils import SparkSessionTestCase
0026
0027
class EvaluatorTests(SparkSessionTestCase):
    """Tests exercising RegressionEvaluator and ClusteringEvaluator."""

    def test_java_params(self):
        """
        Regression test for the bug fixed by SPARK-18274, where multiple
        copies of a Params instance in Python were linked to the same Java
        instance.
        """
        original = RegressionEvaluator(metricName="r2")
        frame = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
        original.evaluate(frame)
        self.assertEqual(original._java_obj.getMetricName(), "r2")

        # Copy with an overridden metric name, then evaluate both objects:
        # each one must still report its own metric on the Java side.
        duplicate = original.copy({original.metricName: "mae"})
        original.evaluate(frame)
        duplicate.evaluate(frame)
        self.assertEqual(original._java_obj.getMetricName(), "r2")
        self.assertEqual(duplicate._java_obj.getMetricName(), "mae")

    def test_clustering_evaluator_with_cosine_distance(self):
        """
        Evaluate a small fixed dataset with ``distanceMeasure="cosine"`` and
        compare the result against a known reference value.
        """
        # (feature values, predicted cluster) pairs.
        raw_points = [
            ([1.0, 1.0], 1.0),
            ([10.0, 10.0], 1.0),
            ([1.0, 0.5], 2.0),
            ([10.0, 4.4], 2.0),
            ([-1.0, 1.0], 3.0),
            ([-100.0, 90.0], 3.0),
        ]
        rows = [(Vectors.dense(values), cluster) for values, cluster in raw_points]
        dataset = self.spark.createDataFrame(rows, ["features", "prediction"])

        evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
        self.assertEqual(evaluator.getDistanceMeasure(), "cosine")

        score = evaluator.evaluate(dataset)
        self.assertTrue(np.isclose(score, 0.992671213, atol=1e-5))
0053
0054
if __name__ == "__main__":
    # Script entry point: run this module's tests with verbose output.
    from pyspark.ml.tests.test_evaluation import *

    # Default to unittest's built-in runner; switch to an XML-reporting
    # runner only when the optional ``xmlrunner`` package can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is unavailable; keep the default runner.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)