Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 import array as pyarray
0019 import unittest
0020 
0021 from numpy import array
0022 
0023 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
0024     DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
0025 from pyspark.mllib.random import RandomRDDs
0026 from pyspark.mllib.regression import LabeledPoint
0027 from pyspark.mllib.stat import Statistics
0028 from pyspark.sql.utils import IllegalArgumentException
0029 from pyspark.testing.mllibutils import MLlibTestCase
0030 
0031 
class StatTests(MLlibTestCase):
    """Tests for the column summaries returned by ``Statistics.colStats``."""

    # SPARK-4023
    # colStats must accept RDDs whose rows are different vector-like containers.
    def test_col_with_different_rdds(self):
        # numpy
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(1000, summary.count())
        # built-in range
        data = self.sc.parallelize([range(10)] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())
        # array.array
        data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())

    # The L1/L2 norm vectors have one entry per column and match hand-computed values.
    def test_col_norms(self):
        import math

        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        # A single column holding 0..9: L1 norm is 0+1+...+9 = 45.
        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        norm_l1 = summary2.normL1()
        # Compare length and scalar separately: asserting equality of two numpy
        # arrays only works by accident when the result has exactly one element.
        self.assertEqual(1, len(norm_l1))
        self.assertEqual(45.0, norm_l1[0])

        # L2 norm is sqrt(0^2 + 1^2 + ... + 9^2); assertAlmostEqual reports both
        # values on failure, unlike a bare assertTrue on the difference.
        expected_norm_l2 = math.sqrt(sum(x * x for x in range(10)))
        self.assertAlmostEqual(summary2.normL2()[0], expected_norm_l2, delta=1e-14)
0060 
0061 
class ChiSqTestTests(MLlibTestCase):
    """Tests for ``Statistics.chiSqTest`` on vectors, matrices and LabeledPoint RDDs."""

    # Goodness-of-fit on dense vectors, with and without an explicit expected vector.
    def test_goodness_of_fit(self):
        from numpy import inf

        observed = Vectors.dense([4, 6, 5])
        fit = Statistics.chiSqTest(observed)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(fit.statistic, 0.4)
        self.assertEqual(fit.degreesOfFreedom, 2)
        self.assertAlmostEqual(fit.pValue, 0.8187, places=4)

        # Different expected and observed sum
        unscaled_observed = Vectors.dense([21, 38, 43, 80])
        unscaled_expected = Vectors.dense([3, 5, 7, 20])
        unscaled_fit = Statistics.chiSqTest(unscaled_observed, unscaled_expected)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(unscaled_fit.statistic, 14.1429, places=4)
        self.assertEqual(unscaled_fit.degreesOfFreedom, 3)
        self.assertAlmostEqual(unscaled_fit.pValue, 0.002717, places=4)

        # Vectors with different sizes
        short_observed = Vectors.dense([1.0, 2.0, 3.0])
        long_expected = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        with self.assertRaises(ValueError):
            Statistics.chiSqTest(short_observed, long_expected)

        # Negative counts in observed
        negative_observed = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(negative_observed, unscaled_expected)

        # Count = 0.0 in expected but not observed
        zero_expected = Vectors.dense([1.0, 0.0, 3.0])
        infinite_fit = Statistics.chiSqTest(observed, zero_expected)
        self.assertEqual(infinite_fit.statistic, inf)
        self.assertEqual(infinite_fit.degreesOfFreedom, 2)
        self.assertEqual(infinite_fit.pValue, 0.0)

        # 0.0 in expected and observed simultaneously
        zero_observed = Vectors.dense([2.0, 0.0, 1.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(zero_observed, zero_expected)

    # Independence test on a dense contingency matrix, plus invalid matrices.
    def test_matrix_independence(self):
        counts = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        contingency = Matrices.dense(3, 4, counts)
        result = Statistics.chiSqTest(contingency)

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(result.statistic, 21.9958, places=4)
        self.assertEqual(result.degreesOfFreedom, 6)
        self.assertAlmostEqual(result.pValue, 0.001213, places=4)

        # Each 2x2 matrix below must be rejected, in this order:
        invalid_values = (
            [4.0, 5.0, 3.0, -3.0],  # Negative counts
            [0.0, 1.0, 0.0, 2.0],   # Row sum = 0.0
            [0.0, 0.0, 2.0, 2.0],   # Column sum = 0.0
        )
        for values in invalid_values:
            invalid = Matrices.dense(2, 2, values)
            with self.assertRaises(IllegalArgumentException):
                Statistics.chiSqTest(invalid)

    # Per-feature test on an RDD of LabeledPoint, repeated for several partition counts.
    def test_chi_sq_pearson(self):
        rows = (
            (0.0, [0.5, 10.0]),
            (0.0, [1.5, 20.0]),
            (1.0, [1.5, 30.0]),
            (0.0, [3.5, 30.0]),
            (0.0, [3.5, 40.0]),
            (1.0, [3.5, 40.0]),
        )
        points = [LabeledPoint(label, Vectors.dense(features)) for label, features in rows]

        # (statistic, degrees of freedom, p-value) expected for feature 0 and feature 1
        expected_per_feature = ((0.75, 2, 0.6873), (1.5, 3, 0.6823))

        for num_parts in (2, 4, 6, 8):
            results = Statistics.chiSqTest(self.sc.parallelize(points, num_parts))
            for index, (statistic, dof, p_value) in enumerate(expected_per_feature):
                feature = results[index]
                self.assertEqual(feature.statistic, statistic)
                self.assertEqual(feature.degreesOfFreedom, dof)
                self.assertAlmostEqual(feature.pValue, p_value, places=4)

    # One result is returned per feature column, including the last column of sparse input.
    def test_right_number_of_results(self):
        width = 1001
        first = LabeledPoint(0.0, Vectors.sparse(width, [(100, 2.0)]))
        second = LabeledPoint(0.1, Vectors.sparse(width, [(200, 1.0)]))
        results = Statistics.chiSqTest(self.sc.parallelize([first, second]))
        self.assertEqual(len(results), width)
        self.assertIsNotNone(results[1000])
0159 
0160 
class KolmogorovSmirnovTest(MLlibTestCase):
    """Tests for ``Statistics.kolmogorovSmirnovTest`` against the "norm" distribution."""

    # The statistic and p-value must be the same whether or not the trailing
    # (0, 1) arguments are passed after "norm".
    def test_R_implementation_equivalence(self):
        samples = [
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ]
        data = self.sc.parallelize(samples)

        # First without extra arguments, then with the explicit trailing (0, 1).
        for extra_args in ((), (0, 1)):
            outcome = Statistics.kolmogorovSmirnovTest(data, "norm", *extra_args)
            self.assertAlmostEqual(outcome.statistic, 0.189, places=3)
            self.assertAlmostEqual(outcome.pValue, 0.422, places=3)
0178 
0179 
# Script entry point: run this module's tests, writing XML reports when possible.
if __name__ == "__main__":
    from pyspark.mllib.tests.test_stat import *

    # Default to unittest's own runner; switch to the XML-reporting runner only
    # when the optional xmlrunner package can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)