0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import sys
0019 import unittest
0020
# Python 3 has no ``basestring`` builtin; alias it to ``str`` so isinstance
# checks against ``basestring`` work on both major versions.  The numeric
# ``sys.version_info`` tuple is compared instead of the ``sys.version``
# string, whose lexicographic ordering would misclassify a major version
# of 10 or above ('10.0' sorts before '3').
if sys.version_info[0] >= 3:
    basestring = str
0023
0024 from pyspark.ml.classification import BinaryLogisticRegressionSummary, LogisticRegression, \
0025 LogisticRegressionSummary
0026 from pyspark.ml.clustering import BisectingKMeans, GaussianMixture, KMeans
0027 from pyspark.ml.linalg import Vectors
0028 from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
0029 from pyspark.sql import DataFrame
0030 from pyspark.testing.mlutils import SparkSessionTestCase
0031
0032
class TrainingSummaryTest(SparkSessionTestCase):
    """Checks the summaries exposed by fitted regression, classification and clustering models.

    Each test fits a model on a tiny in-memory DataFrame, verifies that the
    model reports a summary, and then checks the types and values of the
    summary's attributes.
    """

    def test_linear_regression_summary(self):
        """Fit a weighted, intercept-free LinearRegression and inspect its summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary

        # Check that every summary attribute is reachable and has the expected type/value.
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        objHist = s.objectiveHistory
        self.assertIsInstance(objHist, list)
        self.assertIsInstance(objHist[0], float)
        self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
        self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
        self.assertAlmostEqual(s.meanSquaredError, 0.0)
        self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
        self.assertAlmostEqual(s.r2, 1.0, 2)
        self.assertAlmostEqual(s.r2adj, 1.0, 2)
        self.assertIsInstance(s.residuals, DataFrame)
        self.assertEqual(s.numInstances, 2)
        self.assertEqual(s.degreesOfFreedom, 1)
        devResiduals = s.devianceResiduals
        self.assertIsInstance(devResiduals, list)
        self.assertIsInstance(devResiduals[0], float)
        coefStdErr = s.coefficientStandardErrors
        self.assertIsInstance(coefStdErr, list)
        self.assertIsInstance(coefStdErr[0], float)
        tValues = s.tValues
        self.assertIsInstance(tValues, list)
        self.assertIsInstance(tValues[0], float)
        pValues = s.pValues
        self.assertIsInstance(pValues, list)
        self.assertIsInstance(pValues[0], float)

        # Evaluating on the training data should reproduce the training summary;
        # comparing a single metric is enough to show a summary is returned.
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)

    def test_glr_summary(self):
        """Fit a weighted gaussian/identity GeneralizedLinearRegression and inspect its summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                          fitIntercept=False)
        model = glr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary

        # Check that every summary attribute is reachable and has the expected type/value.
        self.assertEqual(s.numIterations, 1)
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.numInstances, 2)
        # residuals is a method here: called with no argument and with an explicit type.
        self.assertIsInstance(s.residuals(), DataFrame)
        self.assertIsInstance(s.residuals("pearson"), DataFrame)
        coefStdErr = s.coefficientStandardErrors
        self.assertIsInstance(coefStdErr, list)
        self.assertIsInstance(coefStdErr[0], float)
        tValues = s.tValues
        self.assertIsInstance(tValues, list)
        self.assertIsInstance(tValues[0], float)
        pValues = s.pValues
        self.assertIsInstance(pValues, list)
        self.assertIsInstance(pValues[0], float)
        self.assertEqual(s.degreesOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedomNull, 2)
        self.assertEqual(s.rank, 1)
        self.assertIsInstance(s.solver, basestring)
        self.assertIsInstance(s.aic, float)
        self.assertIsInstance(s.deviance, float)
        self.assertIsInstance(s.nullDeviance, float)
        self.assertIsInstance(s.dispersion, float)

        # Evaluating on the training data should reproduce the training summary;
        # comparing a single metric is enough to show a summary is returned.
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.deviance, s.deviance)

    def test_binary_logistic_regression_summary(self):
        """Fit a two-label LogisticRegression and inspect its binary summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary

        # Check that every summary attribute is reachable and has the expected type/value.
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        objHist = s.objectiveHistory
        self.assertIsInstance(objHist, list)
        self.assertIsInstance(objHist[0], float)
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.labels, list)
        self.assertIsInstance(s.truePositiveRateByLabel, list)
        self.assertIsInstance(s.falsePositiveRateByLabel, list)
        self.assertIsInstance(s.precisionByLabel, list)
        self.assertIsInstance(s.recallByLabel, list)
        self.assertIsInstance(s.fMeasureByLabel(), list)
        self.assertIsInstance(s.fMeasureByLabel(1.0), list)
        self.assertIsInstance(s.roc, DataFrame)
        self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
        self.assertIsInstance(s.pr, DataFrame)
        self.assertIsInstance(s.fMeasureByThreshold, DataFrame)
        self.assertIsInstance(s.precisionByThreshold, DataFrame)
        self.assertIsInstance(s.recallByThreshold, DataFrame)
        self.assertAlmostEqual(s.accuracy, 1.0, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
        self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
        self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)

        # Evaluating on the training data should yield a binary summary with the same values;
        # comparing a single metric is enough to show a summary is returned.
        sameSummary = model.evaluate(df)
        self.assertIsInstance(sameSummary, BinaryLogisticRegressionSummary)
        self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)

    def test_multiclass_logistic_regression_summary(self):
        """Fit a three-label LogisticRegression and inspect its (non-binary) summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], [])),
                                         (2.0, 2.0, Vectors.dense(2.0)),
                                         (2.0, 2.0, Vectors.dense(1.9))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary

        # Check that every summary attribute is reachable and has the expected type/value.
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        objHist = s.objectiveHistory
        self.assertIsInstance(objHist, list)
        self.assertIsInstance(objHist[0], float)
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.labels, list)
        self.assertIsInstance(s.truePositiveRateByLabel, list)
        self.assertIsInstance(s.falsePositiveRateByLabel, list)
        self.assertIsInstance(s.precisionByLabel, list)
        self.assertIsInstance(s.recallByLabel, list)
        self.assertIsInstance(s.fMeasureByLabel(), list)
        self.assertIsInstance(s.fMeasureByLabel(1.0), list)
        self.assertAlmostEqual(s.accuracy, 0.75, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
        self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
        self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)

        # Evaluating on the training data should yield a general (not binary) summary
        # with the same values; one metric comparison is enough.
        sameSummary = model.evaluate(df)
        self.assertIsInstance(sameSummary, LogisticRegressionSummary)
        self.assertNotIsInstance(sameSummary, BinaryLogisticRegressionSummary)
        self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)

    def test_gaussian_mixture_summary(self):
        """Fit a two-component GaussianMixture and inspect its summary."""
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertIsInstance(s.probability, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3)

    def test_bisecting_kmeans_summary(self):
        """Fit a two-cluster BisectingKMeans and inspect its summary."""
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        bkm = BisectingKMeans(k=2)
        model = bkm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 20)

    def test_kmeans_summary(self):
        """Fit a seeded two-cluster KMeans on 2-D points and inspect its summary."""
        data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
                (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
        df = self.spark.createDataFrame(data, ["features"])
        kmeans = KMeans(k=2, seed=1)
        model = kmeans.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 1)
0245
0246
if __name__ == "__main__":
    # Pull in the tests through the package's own module path before running them.
    from pyspark.ml.tests.test_training_summary import *

    # xmlrunner is optional: when it cannot be imported the runner stays None
    # and unittest.main is called with testRunner=None.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)