Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 import unittest
0019 import array as pyarray
0020 
0021 from numpy import arange, array, array_equal, inf, ones, tile, zeros
0022 
0023 from pyspark.serializers import PickleSerializer
0024 from pyspark.ml.linalg import DenseMatrix, DenseVector, MatrixUDT, SparseMatrix, SparseVector, \
0025     Vector, VectorUDT, Vectors
0026 from pyspark.testing.mllibutils import MLlibTestCase
0027 from pyspark.sql import Row
0028 
0029 
class VectorTests(MLlibTestCase):
    """Tests for the local vector and matrix types of ``pyspark.ml.linalg``.

    Covers serialization, dot products, squared distances, hashing, equality,
    indexing, iteration, ``repr``/``str`` output and norms.

    Value comparisons use ``assertEqual``; ``assertTrue(a, b)`` would treat
    ``b`` as the failure message only and never compare it with ``a``.
    """

    def _test_serialize(self, v):
        # Round-trip ``v`` through the Python pickler alone, then through the
        # JVM-side MLSerDe, once as a single object and once as a list of 100.
        ser = PickleSerializer()
        serde = self.sc._jvm.org.apache.spark.ml.python.MLSerDe
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        jvec = serde.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(serde.dumps(jvec)))
        self.assertEqual(v, nv)
        vs = [v] * 100
        jvecs = serde.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(serde.dumps(jvecs)))
        self.assertEqual(vs, nvs)

    def test_serialize(self):
        # Dense vectors built from a range, a numpy array and a typed array;
        # sparse vectors (including an empty one); dense and sparse matrices.
        self._test_serialize(DenseVector(range(10)))
        self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
        self._test_serialize(DenseVector(pyarray.array('d', range(10))))
        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
        self._test_serialize(SparseVector(3, {}))
        self._test_serialize(DenseMatrix(2, 3, range(6)))
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self._test_serialize(sm1)

    def test_dot(self):
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([1, 2, 3, 4])
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        arr = pyarray.array('d', [0, 1, 2, 3])
        self.assertEqual(10.0, sv.dot(dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
        self.assertEqual(30.0, dv.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
        self.assertEqual(30.0, lst.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
        self.assertEqual(7.0, sv.dot(arr))

    def test_squared_distance(self):
        def squared_distance(a, b):
            # Dispatch on whichever operand is a Vector so that plain lists
            # and arrays can appear on either side.
            if isinstance(a, Vector):
                return a.squared_distance(b)
            else:
                return b.squared_distance(a)

        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([4, 3, 2, 1])
        lst1 = [4, 3, 2, 1]
        arr = pyarray.array('d', [0, 2, 1, 3])
        narr = array([0, 2, 1, 3])
        self.assertEqual(15.0, squared_distance(sv, dv))
        self.assertEqual(25.0, squared_distance(sv, lst))
        self.assertEqual(20.0, squared_distance(dv, lst))
        self.assertEqual(15.0, squared_distance(dv, sv))
        self.assertEqual(25.0, squared_distance(lst, sv))
        self.assertEqual(20.0, squared_distance(lst, dv))
        self.assertEqual(0.0, squared_distance(sv, sv))
        self.assertEqual(0.0, squared_distance(dv, dv))
        self.assertEqual(0.0, squared_distance(lst, lst))
        self.assertEqual(25.0, squared_distance(sv, lst1))
        self.assertEqual(3.0, squared_distance(sv, arr))
        self.assertEqual(3.0, squared_distance(sv, narr))

    def test_hash(self):
        # Dense and sparse vectors holding the same values must hash alike.
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        self.assertEqual(hash(v1), hash(v2))
        self.assertEqual(hash(v1), hash(v3))
        self.assertEqual(hash(v2), hash(v3))
        self.assertNotEqual(hash(v1), hash(v4))
        self.assertNotEqual(hash(v2), hash(v4))

    def test_eq(self):
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
        v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
        v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
        sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
        self.assertEqual(v1, v2)
        self.assertEqual(v1, v3)
        # ``==`` is spelled out so that __eq__ (not __ne__) is what is tested.
        self.assertFalse(v2 == v4)
        self.assertFalse(v1 == v5)
        self.assertFalse(v1 == v6)
        # this is done as Dense and Sparse matrices can be semantically
        # equal while still implementing a different __eq__ method
        self.assertEqual(dm1, sm1)
        self.assertEqual(sm1, dm1)

    def test_equals(self):
        indices = [1, 2, 4]
        values = [1., 3., 2.]
        dense_indices = list(range(5))
        self.assertTrue(Vectors._equals(indices, values, dense_indices, [0., 1., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, dense_indices, [0., 3., 1., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, dense_indices, [0., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, dense_indices, [0., 1., 3., 2., 2.]))

    def test_conversion(self):
        # numpy arrays should be automatically upcast to float64
        # tests for fix of [SPARK-5089]
        for dtype in ('float64', 'float32'):
            dv = DenseVector(array([1, 2, 3, 4], dtype=dtype))
            self.assertEqual(dv.array.dtype, 'float64')

    def test_sparse_vector_indexing(self):
        sv = SparseVector(5, {1: 1, 3: 2})
        self.assertEqual(sv[0], 0.)
        self.assertEqual(sv[3], 2.)
        self.assertEqual(sv[1], 1.)
        self.assertEqual(sv[2], 0.)
        self.assertEqual(sv[4], 0.)
        # Negative indices count from the end.
        self.assertEqual(sv[-1], 0.)
        self.assertEqual(sv[-2], 2.)
        self.assertEqual(sv[-3], 0.)
        self.assertEqual(sv[-5], 0.)
        for ind in [5, -6]:
            self.assertRaises(IndexError, sv.__getitem__, ind)
        for ind in [7.8, '1']:
            self.assertRaises(TypeError, sv.__getitem__, ind)

        # Named ``all_zero`` rather than ``zeros`` to avoid shadowing the
        # numpy ``zeros`` imported at module level.
        all_zero = SparseVector(4, {})
        self.assertEqual(all_zero[0], 0.0)
        self.assertEqual(all_zero[3], 0.0)
        for ind in [4, -5]:
            self.assertRaises(IndexError, all_zero.__getitem__, ind)

        empty = SparseVector(0, {})
        for ind in [-1, 0, 1]:
            self.assertRaises(IndexError, empty.__getitem__, ind)

    def test_sparse_vector_iteration(self):
        self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
        self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])

    def test_matrix_indexing(self):
        # Values are given in column-major order for a 3 x 2 matrix.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))

    def test_repr_dense_matrix(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')

        # Constructed with isTransposed=True, so the trailing flag is True.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)')

        # Long value lists are abbreviated to the first and last eight entries.
        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            'DenseMatrix(6, 3, '
            '[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., '
            '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')

    def test_repr_sparse_matrix(self):
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(
            repr(sm1t),
            'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')

        indices = tile(arange(6), 3)
        values = ones(18)
        sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
        # Expected strings are assembled by implicit concatenation so that no
        # source indentation leaks into them.
        self.assertEqual(
            repr(sm),
            "SparseMatrix(6, 3, [0, 6, 12, 18], "
            "[0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], "
            "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., "
            "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")

        self.assertEqual(
            str(sm),
            "6 X 3 CSCMatrix\n"
            "(0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n"
            "(0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n"
            "(0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")

        sm = SparseMatrix(1, 18, zeros(19), [], [])
        self.assertEqual(
            repr(sm),
            'SparseMatrix(1, 18, '
            '[0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')

    def test_sparse_matrix(self):
        # Test sparse matrix creation.
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(sm1.numRows, 3)
        self.assertEqual(sm1.numCols, 4)
        self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
        self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
        self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(
            repr(sm1),
            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')

        # Test indexing
        expected = [
            [0, 0, 0, 0],
            [1, 0, 4, 0],
            [2, 0, 5, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1[i, j])
        self.assertTrue(array_equal(sm1.toArray(), expected))

        for i, j in [(-1, 1), (4, 3), (3, 5)]:
            self.assertRaises(IndexError, sm1.__getitem__, (i, j))

        # Test conversion to dense and sparse.
        smnew = sm1.toDense().toSparse()
        self.assertEqual(sm1.numRows, smnew.numRows)
        self.assertEqual(sm1.numCols, smnew.numCols)
        self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
        self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
        self.assertTrue(array_equal(sm1.values, smnew.values))

        # Same checks for a matrix supplied in transposed layout.
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(sm1t.numRows, 3)
        self.assertEqual(sm1t.numCols, 4)
        self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
        self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
        self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])

        expected = [
            [3, 2, 0, 0],
            [0, 0, 4, 0],
            [9, 0, 8, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1t[i, j])
        self.assertTrue(array_equal(sm1t.toArray(), expected))

    def test_dense_matrix_is_transposed(self):
        # The same 3 x 2 matrix, once in transposed and once in default layout.
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        # An explicitly stored zero must not be counted as a non-zero.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
0314 
0315 
class VectorUDTTests(MLlibTestCase):
    """Tests for ``VectorUDT``: JSON schema, serialization and schema inference."""

    # Shared fixtures: an empty and a populated vector of each kind.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        # The UDT must survive a trip through its JSON representation.
        restored = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(restored, self.udt)

    def test_serialization(self):
        # serialize() followed by deserialize() must give back an equal vector.
        for vec in (self.dv0, self.dv1, self.sv0, self.sv1):
            round_tripped = self.udt.deserialize(self.udt.serialize(vec))
            self.assertEqual(vec, round_tripped)

    def test_infer_schema(self):
        rows = [Row(label=1.0, features=self.dv1),
                Row(label=0.0, features=self.sv1)]
        df = self.sc.parallelize(rows).toDF()

        # The "features" column must be inferred as a VectorUDT.
        features_field = [f for f in df.schema.fields if f.name == "features"][0]
        self.assertEqual(features_field.dataType, self.udt)

        collected = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            # Pick the fixture matching the concrete type that came back.
            if isinstance(vec, SparseVector):
                expected = self.sv1
            elif isinstance(vec, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (vec, type(vec)))
            self.assertEqual(vec, expected)
0347 
0348 
class MatrixUDTTests(MLlibTestCase):
    """Tests for ``MatrixUDT``: JSON schema, serialization and schema inference."""

    # Shared fixtures: dense and sparse matrices in both storage layouts.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        # The UDT must survive a trip through its JSON representation.
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        # serialize() followed by deserialize() must give back an equal matrix.
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual, not assertTrue: assertTrue(a, b) only checks that ``a``
        # is truthy and uses ``b`` as the failure message.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
0378 
0379 
if __name__ == "__main__":
    # Entry point for running this test module directly as a script.
    from pyspark.ml.tests.test_linalg import *

    # Use the XML-reporting runner when xmlrunner is installed; otherwise
    # leave the runner as None so unittest picks its default.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)