ml/tests/test_evaluation.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 import unittest
0019
0020 import numpy as np
0021
0022 from pyspark.ml.evaluation import ClusteringEvaluator, RegressionEvaluator
0023 from pyspark.ml.linalg import Vectors
0024 from pyspark.sql import Row
0025 from pyspark.testing.mlutils import SparkSessionTestCase
0026
0027
class EvaluatorTests(SparkSessionTestCase):
    """Tests for the RegressionEvaluator and ClusteringEvaluator Python wrappers."""

    def test_java_params(self):
        """
        Regression test for SPARK-18274, where multiple copies of a Params
        instance in Python ended up linked to the same Java instance.
        """
        base = RegressionEvaluator(metricName="r2")
        frame = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
        base.evaluate(frame)
        self.assertEqual(base._java_obj.getMetricName(), "r2")

        # Copy with a different metric; each Python object must keep its own
        # Java-side metric name after both have been evaluated.
        clone = base.copy({base.metricName: "mae"})
        expectations = ((base, "r2"), (clone, "mae"))
        for instance, _ in expectations:
            instance.evaluate(frame)
        for instance, metric in expectations:
            self.assertEqual(instance._java_obj.getMetricName(), metric)

    def test_clustering_evaluator_with_cosine_distance(self):
        """
        Evaluate a small fixed dataset with distanceMeasure="cosine" and
        compare the result against the expected value.
        """
        raw_points = [
            ([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0),
            ([1.0, 0.5], 2.0), ([10.0, 4.4], 2.0),
            ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0),
        ]
        # Pair each dense feature vector with its predicted cluster id.
        rows = [(Vectors.dense(coords), cluster) for coords, cluster in raw_points]
        dataset = self.spark.createDataFrame(rows, ["features", "prediction"])

        evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
        self.assertEqual(evaluator.getDistanceMeasure(), "cosine")

        score = evaluator.evaluate(dataset)
        self.assertTrue(np.isclose(score, 0.992671213, atol=1e-5))
0053
0054
if __name__ == "__main__":
    # Bring in everything this test module exposes under its package path.
    from pyspark.ml.tests.test_evaluation import *

    # Fall back to unittest's default runner unless xmlrunner is available,
    # in which case XML reports are written under target/test-reports.
    runner = None
    try:
        import xmlrunner
        runner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=runner, verbosity=2)
0062         testRunner = None
0063     unittest.main(testRunner=testRunner, verbosity=2)