0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import array as pyarray
0019 import unittest
0020
0021 from numpy import array
0022
0023 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
0024 DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
0025 from pyspark.mllib.random import RandomRDDs
0026 from pyspark.mllib.regression import LabeledPoint
0027 from pyspark.mllib.stat import Statistics
0028 from pyspark.sql.utils import IllegalArgumentException
0029 from pyspark.testing.mllibutils import MLlibTestCase
0030
0031
class StatTests(MLlibTestCase):
    """Tests for the column summaries returned by ``Statistics.colStats``."""

    def test_col_with_different_rdds(self):
        """``colStats`` reports the row count for several kinds of row objects."""
        # Rows produced by the random vector generator.
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(1000, summary.count())

        # Rows given as plain Python ``range`` objects.
        data = self.sc.parallelize([range(10)] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())

        # Rows given as standard-library typed arrays of doubles.
        data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())

    def test_col_norms(self):
        """``normL1`` / ``normL2`` have one entry per column and the expected values."""
        import math

        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        # A single column holding the values 0..9.
        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)

        # Check the length and the scalar separately instead of handing two
        # arrays to assertEqual: that form depends on the truth value of an
        # element-wise comparison, which is only defined for one-element arrays.
        norm_l1 = summary2.normL1()
        self.assertEqual(1, len(norm_l1))
        self.assertEqual(45.0, norm_l1[0])

        expected_norm_l2 = math.sqrt(sum(x * x for x in range(10)))
        # ``delta`` keeps the original 1e-14 absolute tolerance while letting
        # unittest print both values when the check fails.
        self.assertAlmostEqual(summary2.normL2()[0], expected_norm_l2, delta=1e-14)
0060
0061
class ChiSqTestTests(MLlibTestCase):
    """Tests for ``Statistics.chiSqTest`` on vectors, matrices and labeled RDDs."""

    def test_goodness_of_fit(self):
        """Vector inputs: expected results plus rejection of invalid inputs."""
        from numpy import inf

        # Observed counts only, no expected vector supplied.
        counts = Vectors.dense([4, 6, 5])
        result = Statistics.chiSqTest(counts)
        self.assertEqual(result.statistic, 0.4)
        self.assertEqual(result.degreesOfFreedom, 2)
        self.assertAlmostEqual(result.pValue, 0.8187, 4)

        # Observed counts together with an explicit expected vector.
        counts_with_expected = Vectors.dense([21, 38, 43, 80])
        expected_counts = Vectors.dense([3, 5, 7, 20])
        result_with_expected = Statistics.chiSqTest(counts_with_expected, expected_counts)
        self.assertAlmostEqual(result_with_expected.statistic, 14.1429, 4)
        self.assertEqual(result_with_expected.degreesOfFreedom, 3)
        self.assertAlmostEqual(result_with_expected.pValue, 0.002717, 4)

        # Observed and expected vectors of different lengths.
        short_observed = Vectors.dense([1.0, 2.0, 3.0])
        long_expected = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        with self.assertRaises(ValueError):
            Statistics.chiSqTest(short_observed, long_expected)

        # A negative entry in the observed vector.
        negative_observed = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(negative_observed, expected_counts)

        # A zero in the expected vector only: infinite statistic, zero p-value.
        expected_with_zero = Vectors.dense([1.0, 0.0, 3.0])
        infinite_result = Statistics.chiSqTest(counts, expected_with_zero)
        self.assertEqual(infinite_result.statistic, inf)
        self.assertEqual(infinite_result.degreesOfFreedom, 2)
        self.assertEqual(infinite_result.pValue, 0.0)

        # A zero at the same index in both observed and expected vectors.
        observed_with_zero = Vectors.dense([2.0, 0.0, 1.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(observed_with_zero, expected_with_zero)

    def test_matrix_independence(self):
        """Matrix input: expected result for a 3x4 matrix, errors for bad ones."""
        cell_values = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        result = Statistics.chiSqTest(Matrices.dense(3, 4, cell_values))
        self.assertAlmostEqual(result.statistic, 21.9958, 4)
        self.assertEqual(result.degreesOfFreedom, 6)
        self.assertAlmostEqual(result.pValue, 0.001213, 4)

        # Each 2x2 matrix below must be rejected; they are built and checked
        # one at a time, in this order.
        rejected_values = (
            [4.0, 5.0, 3.0, -3.0],  # contains a negative entry
            [0.0, 1.0, 0.0, 2.0],   # zeros at flat positions 0 and 2
            [0.0, 0.0, 2.0, 2.0],   # zeros at flat positions 0 and 1
        )
        for values in rejected_values:
            bad_matrix = Matrices.dense(2, 2, values)
            with self.assertRaises(IllegalArgumentException):
                Statistics.chiSqTest(bad_matrix)

    def test_chi_sq_pearson(self):
        """RDD of labeled points: one result per feature, for any partitioning."""
        points = [
            LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
            LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
            LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
            LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
            LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),
        ]

        # (statistic, degrees of freedom, p-value) expected for features 0 and 1.
        expected_per_feature = [
            (0.75, 2, 0.6873),
            (1.5, 3, 0.6823),
        ]

        for num_slices in [2, 4, 6, 8]:
            results = Statistics.chiSqTest(self.sc.parallelize(points, num_slices))
            for index, (statistic, dof, p_value) in enumerate(expected_per_feature):
                feature_result = results[index]
                self.assertEqual(feature_result.statistic, statistic)
                self.assertEqual(feature_result.degreesOfFreedom, dof)
                self.assertAlmostEqual(feature_result.pValue, p_value, 4)

    def test_right_number_of_results(self):
        """Sparse features of size 1001 produce 1001 results, the last one set."""
        feature_count = 1001
        points = [
            LabeledPoint(0.0, Vectors.sparse(feature_count, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(feature_count, [(200, 1.0)])),
        ]
        results = Statistics.chiSqTest(self.sc.parallelize(points))
        self.assertEqual(len(results), feature_count)
        self.assertIsNotNone(results[1000])
0159
0160
class KolmogorovSmirnovTest(MLlibTestCase):
    """Tests for ``Statistics.kolmogorovSmirnovTest`` against fixed reference values."""

    def test_R_implementation_equivalence(self):
        """The "norm" test gives the same result with and without explicit parameters.

        The reference statistic and p-value are presumably taken from R, as
        the test name suggests; confirm against the upstream test if needed.
        """
        samples = self.sc.parallelize([
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ])

        # First call relies on the default distribution parameters, the
        # second passes 0 and 1 explicitly; both must match the reference.
        for dist_params in ((), (0, 1)):
            result = Statistics.kolmogorovSmirnovTest(samples, "norm", *dist_params)
            self.assertAlmostEqual(result.statistic, 0.189, 3)
            self.assertAlmostEqual(result.pValue, 0.422, 3)
0178
0179
if __name__ == "__main__":
    # NOTE(review): presumably this is this very module, imported under its
    # package-qualified name - confirm before changing or removing.
    from pyspark.mllib.tests.test_stat import *

    # Fall back to unittest's default runner unless xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)