mllib/tests/test_stat.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 import array as pyarray
0019 import unittest
0020
0021 from numpy import array
0022
0023 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
0024     DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
0025 from pyspark.mllib.random import RandomRDDs
0026 from pyspark.mllib.regression import LabeledPoint
0027 from pyspark.mllib.stat import Statistics
0028 from pyspark.sql.utils import IllegalArgumentException
0029 from pyspark.testing.mllibutils import MLlibTestCase
0030
0031
class StatTests(MLlibTestCase):
    """Tests for ``Statistics.colStats`` summaries (row count and column norms)."""

    # SPARK-4023
    def test_col_with_different_rdds(self):
        """colStats must report the row count for several kinds of row containers."""
        # numpy: rows generated by RandomRDDs.normalVectorRDD
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(1000, summary.count())
        # plain Python sequences: each row is a range object
        data = self.sc.parallelize([range(10)] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())
        # array.array of doubles (typecode "d")
        data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())

    def test_col_norms(self):
        """normL1/normL2 have one entry per column and match hand-computed values."""
        import math

        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        # Single-column data holding 0..9: the L1 norm is 0 + 1 + ... + 9 = 45.
        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        # NOTE: this comparison relies on the truth value of a one-element numpy array.
        self.assertEqual(array([45.0]), summary2.normL1())
        # The L2 norm is the square root of the sum of squares of 0..9.
        expectedNormL2 = math.sqrt(sum(x * x for x in range(10)))
        # assertLess keeps the strict "<" check but reports both operands on failure,
        # which assertTrue(a < b) does not.
        self.assertLess(math.fabs(summary2.normL2()[0] - expectedNormL2), 1e-14)
0060
0061
class ChiSqTestTests(MLlibTestCase):
    """Tests for ``Statistics.chiSqTest`` on vectors, matrices and LabeledPoint RDDs."""

    def test_goodness_of_fit(self):
        """Goodness-of-fit results and input validation for vector arguments."""
        from numpy import inf

        counts = Vectors.dense([4, 6, 5])
        uniform_fit = Statistics.chiSqTest(counts)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(uniform_fit.statistic, 0.4)
        self.assertEqual(uniform_fit.degreesOfFreedom, 2)
        self.assertAlmostEqual(uniform_fit.pValue, 0.8187, 4)

        # Different expected and observed sum
        counts_b = Vectors.dense([21, 38, 43, 80])
        weights_b = Vectors.dense([3, 5, 7, 20])
        weighted_fit = Statistics.chiSqTest(counts_b, weights_b)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(weighted_fit.statistic, 14.1429, 4)
        self.assertEqual(weighted_fit.degreesOfFreedom, 3)
        self.assertAlmostEqual(weighted_fit.pValue, 0.002717, 4)

        # Vectors with different sizes
        short_counts = Vectors.dense([1.0, 2.0, 3.0])
        long_weights = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        with self.assertRaises(ValueError):
            Statistics.chiSqTest(short_counts, long_weights)

        # Negative counts in observed
        negative_counts = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(negative_counts, weights_b)

        # Count = 0.0 in expected but not observed
        weights_with_zero = Vectors.dense([1.0, 0.0, 3.0])
        infinite_fit = Statistics.chiSqTest(counts, weights_with_zero)
        self.assertEqual(infinite_fit.statistic, inf)
        self.assertEqual(infinite_fit.degreesOfFreedom, 2)
        self.assertEqual(infinite_fit.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        counts_with_zero = Vectors.dense([2.0, 0.0, 1.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(counts_with_zero, weights_with_zero)

    def test_matrix_independence(self):
        """Independence test on a 3x4 contingency matrix, plus rejected matrices."""
        cells = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        result = Statistics.chiSqTest(Matrices.dense(3, 4, cells))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(result.statistic, 21.9958, 4)
        self.assertEqual(result.degreesOfFreedom, 6)
        self.assertAlmostEqual(result.pValue, 0.001213, 4)

        # Each 2x2 matrix below must be rejected, checked in this order.
        rejected_cells = (
            [4.0, 5.0, 3.0, -3.0],  # Negative counts
            [0.0, 1.0, 0.0, 2.0],  # Row sum = 0.0
            [0.0, 0.0, 2.0, 2.0],  # Column sum = 0.0
        )
        for values in rejected_cells:
            # Build the matrix outside the context so only chiSqTest is guarded.
            bad_matrix = Matrices.dense(2, 2, values)
            with self.assertRaises(IllegalArgumentException):
                Statistics.chiSqTest(bad_matrix)

    def test_chi_sq_pearson(self):
        """Per-feature results on a LabeledPoint RDD for several partition counts."""
        rows = [
            (0.0, [0.5, 10.0]),
            (0.0, [1.5, 20.0]),
            (1.0, [1.5, 30.0]),
            (0.0, [3.5, 30.0]),
            (0.0, [3.5, 40.0]),
            (1.0, [3.5, 40.0]),
        ]
        points = [LabeledPoint(label, Vectors.dense(features)) for label, features in rows]

        # Expected (statistic, degreesOfFreedom, pValue), indexed by feature position.
        expected_by_feature = [
            (0.75, 2, 0.6873),
            (1.5, 3, 0.6823),
        ]

        for partitions in [2, 4, 6, 8]:
            results = Statistics.chiSqTest(self.sc.parallelize(points, partitions))
            for position, (statistic, dof, p_value) in enumerate(expected_by_feature):
                feature_result = results[position]
                self.assertEqual(feature_result.statistic, statistic)
                self.assertEqual(feature_result.degreesOfFreedom, dof)
                self.assertAlmostEqual(feature_result.pValue, p_value, 4)

    def test_right_number_of_results(self):
        """One result is returned per column, including the last sparse column."""
        width = 1001
        sparse_points = [
            LabeledPoint(0.0, Vectors.sparse(width, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(width, [(200, 1.0)])),
        ]
        results = Statistics.chiSqTest(self.sc.parallelize(sparse_points))
        self.assertEqual(len(results), width)
        self.assertIsNotNone(results[1000])
0159
0160
class KolmogorovSmirnovTest(MLlibTestCase):
    """Tests for ``Statistics.kolmogorovSmirnovTest`` against fixed reference values."""

    def test_R_implementation_equivalence(self):
        """The "norm" test gives the same result with and without the extra (0, 1) args."""
        sample = self.sc.parallelize([
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ])

        # First the bare distribution name, then the same name with 0 and 1 appended.
        for extra_args in (("norm",), ("norm", 0, 1)):
            outcome = Statistics.kolmogorovSmirnovTest(sample, *extra_args)
            self.assertAlmostEqual(outcome.statistic, 0.189, 3)
            self.assertAlmostEqual(outcome.pValue, 0.422, 3)
0178
0179
if __name__ == "__main__":
    from pyspark.mllib.tests.test_stat import *

    # Start from None so unittest chooses its own runner when xmlrunner is unavailable.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the None fallback set above.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)