Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 import sys
0019 import array as pyarray
0020 import unittest
0021 
0022 from numpy import array, array_equal, zeros, arange, tile, ones, inf
0023 
0024 import pyspark.ml.linalg as newlinalg
0025 from pyspark.serializers import PickleSerializer
0026 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
0027     DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
0028 from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix
0029 from pyspark.mllib.regression import LabeledPoint
0030 from pyspark.sql import Row
0031 from pyspark.testing.mllibutils import MLlibTestCase
0032 from pyspark.testing.utils import have_scipy
0033 
# Python 3 has no separate ``long`` type, so alias it to ``int``; tests below
# call ``long(...)`` when building rows.
# The structured version tuple is compared instead of the ``sys.version``
# string: string comparison is lexicographic and would misorder a major
# version such as "10" against "3".
if sys.version_info[0] >= 3:
    long = int
0036 
0037 
class VectorTests(MLlibTestCase):
    """Tests for the local vector and matrix types imported from ``pyspark.mllib.linalg``.

    Covers serialization, arithmetic helpers (dot product, squared distance,
    norms), hashing/equality, indexing, string representations, parsing, and
    conversion to and from the ``pyspark.ml.linalg`` types.
    """

    def _test_serialize(self, v):
        """Assert that ``v`` survives pickling, both locally and via the JVM SerDe.

        :param v: the vector or matrix to round-trip. It is checked once on
                  its own and once as a list of 100 references to itself.
        """
        ser = PickleSerializer()
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        # Look the JVM-side SerDe object up once instead of on every call.
        serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
        jvec = serde.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(serde.dumps(jvec)))
        self.assertEqual(v, nv)
        vs = [v] * 100
        jvecs = serde.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(serde.dumps(jvecs)))
        self.assertEqual(vs, nvs)

    def test_serialize(self):
        """Round-trip dense/sparse vectors and dense/sparse matrices."""
        self._test_serialize(DenseVector(range(10)))
        self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
        self._test_serialize(DenseVector(pyarray.array('d', range(10))))
        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
        self._test_serialize(SparseVector(3, {}))
        self._test_serialize(DenseMatrix(2, 3, range(6)))
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self._test_serialize(sm1)

    def test_dot(self):
        """Check ``dot`` against vectors, a 2-D numpy array and a Python array."""
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([1, 2, 3, 4])
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        arr = pyarray.array('d', [0, 1, 2, 3])
        self.assertEqual(10.0, sv.dot(dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
        self.assertEqual(30.0, dv.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
        self.assertEqual(30.0, lst.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
        self.assertEqual(7.0, sv.dot(arr))

    def test_squared_distance(self):
        """Check ``squared_distance`` for every pairing of the supported operand types."""
        def squared_distance(a, b):
            # Dispatch on whichever operand is a Vector so that plain lists
            # and arrays can appear on either side of the call.
            if isinstance(a, Vector):
                return a.squared_distance(b)
            else:
                return b.squared_distance(a)

        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([4, 3, 2, 1])
        lst1 = [4, 3, 2, 1]
        arr = pyarray.array('d', [0, 2, 1, 3])
        narr = array([0, 2, 1, 3])
        self.assertEqual(15.0, squared_distance(sv, dv))
        self.assertEqual(25.0, squared_distance(sv, lst))
        self.assertEqual(20.0, squared_distance(dv, lst))
        self.assertEqual(15.0, squared_distance(dv, sv))
        self.assertEqual(25.0, squared_distance(lst, sv))
        self.assertEqual(20.0, squared_distance(lst, dv))
        self.assertEqual(0.0, squared_distance(sv, sv))
        self.assertEqual(0.0, squared_distance(dv, dv))
        self.assertEqual(0.0, squared_distance(lst, lst))
        self.assertEqual(25.0, squared_distance(sv, lst1))
        self.assertEqual(3.0, squared_distance(sv, arr))
        self.assertEqual(3.0, squared_distance(sv, narr))

    def test_hash(self):
        """Equal vectors hash equally regardless of dense/sparse storage."""
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        self.assertEqual(hash(v1), hash(v2))
        self.assertEqual(hash(v1), hash(v3))
        self.assertEqual(hash(v2), hash(v3))
        self.assertNotEqual(hash(v1), hash(v4))
        self.assertNotEqual(hash(v2), hash(v4))

    def test_eq(self):
        """Check ``==`` across dense and sparse vectors and matrices."""
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
        v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
        v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
        sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
        self.assertEqual(v1, v2)
        self.assertEqual(v1, v3)
        # assertFalse(a == b) is used on purpose: it exercises __eq__ directly.
        self.assertFalse(v2 == v4)
        self.assertFalse(v1 == v5)
        self.assertFalse(v1 == v6)
        # this is done as Dense and Sparse matrices can be semantically
        # equal while still implementing a different __eq__ method
        self.assertEqual(dm1, sm1)
        self.assertEqual(sm1, dm1)

    def test_equals(self):
        """Check the ``Vectors._equals`` helper on (indices, values) pairs."""
        indices = [1, 2, 4]
        values = [1., 3., 2.]
        self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))

    def test_conversion(self):
        """DenseVector stores float64 data for float64 and float32 inputs."""
        # numpy arrays should be automatically upcast to float64
        # tests for fix of [SPARK-5089]
        v = array([1, 2, 3, 4], dtype='float64')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')
        v = array([1, 2, 3, 4], dtype='float32')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')

    def test_sparse_vector_indexing(self):
        """Check positive, negative, out-of-range and wrongly typed indices."""
        sv = SparseVector(5, {1: 1, 3: 2})
        self.assertEqual(sv[0], 0.)
        self.assertEqual(sv[3], 2.)
        self.assertEqual(sv[1], 1.)
        self.assertEqual(sv[2], 0.)
        self.assertEqual(sv[4], 0.)
        self.assertEqual(sv[-1], 0.)
        self.assertEqual(sv[-2], 2.)
        self.assertEqual(sv[-3], 0.)
        self.assertEqual(sv[-5], 0.)
        for ind in [5, -6]:
            self.assertRaises(IndexError, sv.__getitem__, ind)
        for ind in [7.8, '1']:
            self.assertRaises(TypeError, sv.__getitem__, ind)

        # Named ``zero_vec`` rather than ``zeros`` so the numpy ``zeros``
        # imported at module level is not shadowed inside this test.
        zero_vec = SparseVector(4, {})
        self.assertEqual(zero_vec[0], 0.0)
        self.assertEqual(zero_vec[3], 0.0)
        for ind in [4, -5]:
            self.assertRaises(IndexError, zero_vec.__getitem__, ind)

        empty = SparseVector(0, {})
        for ind in [-1, 0, 1]:
            self.assertRaises(IndexError, empty.__getitem__, ind)

    def test_sparse_vector_iteration(self):
        """Iterating a SparseVector yields every entry, including implicit zeros."""
        self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
        self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])

    def test_matrix_indexing(self):
        """Check ``mat[i, j]`` on a DenseMatrix, including out-of-range pairs."""
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))

    def test_repr_dense_matrix(self):
        """Check ``repr`` of DenseMatrix, including the elided form for long value lists."""
        # NOTE: these checks use assertEqual. assertTrue(actual, expected)
        # treats ``expected`` as the failure message and passes for any
        # non-empty repr, so it would verify nothing.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')

        # The transposed flag is part of the repr, so it must read True here.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)')

        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., '
            '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')

    def test_repr_sparse_matrix(self):
        """Check ``repr`` and ``str`` of SparseMatrix, including elided output."""
        # Expected strings are built with adjacent-literal concatenation so
        # that no source indentation leaks into them (a backslash line
        # continuation inside a literal would embed the leading spaces).
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(
            repr(sm1t),
            'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')

        indices = tile(arange(6), 3)
        values = ones(18)
        sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
        self.assertEqual(
            repr(sm),
            "SparseMatrix(6, 3, [0, 6, 12, 18], "
            "[0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], "
            "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., "
            "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")

        self.assertEqual(
            str(sm),
            "6 X 3 CSCMatrix\n"
            "(0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n"
            "(0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n"
            "(0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")

        sm = SparseMatrix(1, 18, zeros(19), [], [])
        self.assertEqual(
            repr(sm),
            'SparseMatrix(1, 18, '
            '[0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')

    def test_sparse_matrix(self):
        """Check SparseMatrix construction, indexing and dense/sparse conversion."""
        # Test sparse matrix creation.
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(sm1.numRows, 3)
        self.assertEqual(sm1.numCols, 4)
        self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
        self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
        self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
        # assertEqual, not assertTrue: the latter would take the expected
        # string as a message and always pass.
        self.assertEqual(
            repr(sm1),
            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')

        # Test indexing
        expected = [
            [0, 0, 0, 0],
            [1, 0, 4, 0],
            [2, 0, 5, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1[i, j])
        self.assertTrue(array_equal(sm1.toArray(), expected))

        for i, j in [(-1, 1), (4, 3), (3, 5)]:
            self.assertRaises(IndexError, sm1.__getitem__, (i, j))

        # Test conversion to dense and sparse.
        smnew = sm1.toDense().toSparse()
        self.assertEqual(sm1.numRows, smnew.numRows)
        self.assertEqual(sm1.numCols, smnew.numCols)
        self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
        self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
        self.assertTrue(array_equal(sm1.values, smnew.values))

        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(sm1t.numRows, 3)
        self.assertEqual(sm1t.numCols, 4)
        self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
        self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
        self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])

        expected = [
            [3, 2, 0, 0],
            [0, 0, 4, 0],
            [9, 0, 8, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1t[i, j])
        self.assertTrue(array_equal(sm1t.toArray(), expected))

    def test_dense_matrix_is_transposed(self):
        """A transposed DenseMatrix equals its non-transposed counterpart."""
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

    def test_parse_vector(self):
        """``str`` output of vectors can be parsed back into equal vectors."""
        a = DenseVector([])
        self.assertEqual(str(a), '[]')
        self.assertEqual(Vectors.parse(str(a)), a)
        a = DenseVector([3, 4, 6, 7])
        self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
        self.assertEqual(Vectors.parse(str(a)), a)
        a = SparseVector(4, [], [])
        self.assertEqual(str(a), '(4,[],[])')
        self.assertEqual(SparseVector.parse(str(a)), a)
        a = SparseVector(4, [0, 2], [3, 4])
        self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
        self.assertEqual(Vectors.parse(str(a)), a)
        a = SparseVector(10, [0, 1], [4, 5])
        # Extra whitespace inside the textual form must be tolerated.
        self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)

    def test_norms(self):
        """Check 1-, 2- and infinity-norms, plus ``numNonzeros``."""
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        # assertEqual, not assertTrue: assertTrue(x, y) only checks that x is
        # truthy and uses y as the failure message.
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        # An explicitly stored zero must not be counted as a non-zero.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)

    def test_ml_mllib_vector_conversion(self):
        """Convert vectors between ``pyspark.mllib.linalg`` and ``pyspark.ml.linalg``."""
        # to ml
        # dense
        mllibDV = Vectors.dense([1, 2, 3])
        mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
        mlDV2 = mllibDV.asML()
        self.assertEqual(mlDV2, mlDV1)
        # sparse
        mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV2 = mllibSV.asML()
        self.assertEqual(mlSV2, mlSV1)
        # from ml
        # dense
        mllibDV1 = Vectors.dense([1, 2, 3])
        mlDV = newlinalg.Vectors.dense([1, 2, 3])
        mllibDV2 = Vectors.fromML(mlDV)
        self.assertEqual(mllibDV1, mllibDV2)
        # sparse
        mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mllibSV2 = Vectors.fromML(mlSV)
        self.assertEqual(mllibSV1, mllibSV2)

    def test_ml_mllib_matrix_conversion(self):
        """Convert matrices between ``pyspark.mllib.linalg`` and ``pyspark.ml.linalg``."""
        # to ml
        # dense
        mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
        mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
        mlDM2 = mllibDM.asML()
        self.assertEqual(mlDM2, mlDM1)
        # transposed
        mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
        mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
        mlDMt2 = mllibDMt.asML()
        self.assertEqual(mlDMt2, mlDMt1)
        # sparse
        mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM2 = mllibSM.asML()
        self.assertEqual(mlSM2, mlSM1)
        # transposed
        mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt2 = mllibSMt.asML()
        self.assertEqual(mlSMt2, mlSMt1)
        # from ml
        # dense
        mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
        mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
        mllibDM2 = Matrices.fromML(mlDM)
        self.assertEqual(mllibDM1, mllibDM2)
        # transposed
        mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
        mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
        mllibDMt2 = Matrices.fromML(mlDMt)
        self.assertEqual(mllibDMt1, mllibDMt2)
        # sparse
        mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mllibSM2 = Matrices.fromML(mlSM)
        self.assertEqual(mllibSM1, mllibSM2)
        # transposed
        mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mllibSMt2 = Matrices.fromML(mlSMt)
        self.assertEqual(mllibSMt1, mllibSMt2)
0406 
0407 
class VectorUDTTests(MLlibTestCase):
    """Tests for ``VectorUDT`` and for building distributed matrices from DataFrames."""

    # Shared fixtures: an empty and a populated vector of each storage kind.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """The UDT's JSON value reconstructs an equal UDT."""
        rebuilt = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(rebuilt, self.udt)

    def test_serialization(self):
        """Every fixture vector survives a serialize/deserialize cycle."""
        for vec in [self.dv0, self.dv1, self.sv0, self.sv1]:
            restored = self.udt.deserialize(self.udt.serialize(vec))
            self.assertEqual(vec, restored)

    def test_infer_schema(self):
        """A DataFrame built from LabeledPoints types ``features`` as the vector UDT."""
        points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
        df = self.sc.parallelize(points).toDF()
        feature_fields = [fld for fld in df.schema.fields if fld.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)
        collected = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            # Pick the fixture matching the concrete vector type that came back.
            if isinstance(vec, SparseVector):
                expected = self.sv1
            elif isinstance(vec, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (vec, type(vec)))
            self.assertEqual(vec, expected)

    def test_row_matrix_from_dataframe(self):
        """RowMatrix accepts a vector-column DataFrame and rejects a string column."""
        from pyspark.sql.utils import IllegalArgumentException
        vector_df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        mat = RowMatrix(vector_df)
        self.assertEqual(mat.numRows(), 1)
        self.assertEqual(mat.numCols(), 1)
        self.assertRaises(
            IllegalArgumentException,
            lambda: RowMatrix(vector_df.selectExpr("'monkey'")))

    def test_indexed_row_matrix_from_dataframe(self):
        """IndexedRowMatrix accepts (index, vector) rows and rejects a frame without ``_1``."""
        from pyspark.sql.utils import IllegalArgumentException
        indexed_df = self.spark.createDataFrame([Row(long(0), Vectors.dense(1))])
        mat = IndexedRowMatrix(indexed_df)
        self.assertEqual(mat.numRows(), 1)
        self.assertEqual(mat.numCols(), 1)
        self.assertRaises(
            IllegalArgumentException,
            lambda: IndexedRowMatrix(indexed_df.drop("_1")))
0456 
0457 
class MatrixUDTTests(MLlibTestCase):
    """Tests for ``MatrixUDT``: JSON schema, serialization and schema inference."""

    # Shared fixtures: dense and sparse matrices, each with and without the
    # transposed flag.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        """The UDT's JSON value reconstructs an equal UDT."""
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        """Every fixture matrix survives a serialize/deserialize cycle."""
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        """A DataFrame built from (name, matrix) pairs types column 2 as the matrix UDT."""
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual is required here: assertTrue(a, b) only checks that ``a``
        # is truthy and uses ``b`` as the failure message, so it never
        # compared the inferred type (or the matrices below) with anything.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
0487 
0488 
@unittest.skipIf(not have_scipy, "SciPy not installed")
class SciPyTests(MLlibTestCase):

    """
    Exercise vector operations and MLlib algorithms using SciPy sparse
    matrices as input. The whole class is skipped when SciPy is missing.
    """

    def test_serialize(self):
        """Each SciPy sparse format converts to, and pickles as, the same SparseVector."""
        from scipy.sparse import lil_matrix

        pickler = PickleSerializer()
        column = lil_matrix((4, 1))
        column[1, 0] = 1
        column[3, 0] = 2
        expected = SparseVector(4, {1: 1, 3: 2})

        # Direct conversion, first from the LIL matrix itself and then from
        # each alternative sparse format in turn.
        self.assertEqual(expected, _convert_to_vector(column))
        for to_format in ("tocsc", "tocoo", "tocsr", "todok"):
            converted = getattr(column, to_format)()
            self.assertEqual(expected, _convert_to_vector(converted))

        def round_trip(matrix):
            as_vector = _convert_to_vector(matrix)
            return pickler.loads(pickler.dumps(as_vector))

        # Conversion followed by a pickle round trip.
        self.assertEqual(expected, round_trip(column))
        for to_format in ("tocsc", "tocsr", "todok"):
            converted = getattr(column, to_format)()
            self.assertEqual(expected, round_trip(converted))

    def test_convert_to_vector(self):
        """A CSC matrix whose indices are not sorted still converts correctly."""
        from scipy.sparse import csc_matrix
        # Build a single-column CSC matrix with its row indices out of order.
        col_pointers = array([0, 2])
        row_indices = array([3, 1])
        entries = array([2.0, 1.0])
        unsorted_csc = csc_matrix((entries, row_indices, col_pointers))
        self.assertFalse(unsorted_csc.has_sorted_indices)
        expected = SparseVector(4, {1: 1, 3: 2})
        self.assertEqual(expected, _convert_to_vector(unsorted_csc))

    def test_dot(self):
        """DenseVector.dot accepts a SciPy column matrix."""
        from scipy.sparse import lil_matrix
        column = lil_matrix((4, 1))
        column[1, 0] = 1
        column[3, 0] = 2
        dense = DenseVector(array([1., 2., 3., 4.]))
        self.assertEqual(10.0, dense.dot(column))

    def test_squared_distance(self):
        """Dense and sparse vectors compute squared distance to a SciPy column matrix."""
        from scipy.sparse import lil_matrix
        column = lil_matrix((4, 1))
        column[1, 0] = 3
        column[3, 0] = 2
        dense = DenseVector(array([1., 2., 3., 4.]))
        sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
        self.assertEqual(15.0, dense.squared_distance(column))
        self.assertEqual(15.0, sparse.squared_distance(column))

    def scipy_matrix(self, size, values):
        """Build a ``size`` x 1 SciPy LIL matrix from a {row: value} dictionary"""
        from scipy.sparse import lil_matrix
        column = lil_matrix((size, 1))
        for row, entry in values.items():
            column[row, 0] = entry
        return column

    def test_clustering(self):
        """KMeans trained on SciPy columns groups the two nearby pairs together."""
        from pyspark.mllib.clustering import KMeans
        entries = [{1: 1.0}, {1: 1.1}, {2: 1.0}, {2: 1.1}]
        data = [self.scipy_matrix(3, entry) for entry in entries]
        model = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        for left, right in ((0, 1), (2, 3)):
            self.assertEqual(model.predict(data[left]), model.predict(data[right]))

    def test_classification(self):
        """Classifiers trained on SciPy features predict <= 0 / > 0 alternately."""
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        labelled = [(0.0, {0: 1.0}), (1.0, {1: 1.0}), (0.0, {0: 2.0}), (1.0, {1: 2.0})]
        data = [LabeledPoint(label, self.scipy_matrix(2, entry)) for label, entry in labelled]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]

        # Each model is trained lazily, right before its own assertions run.
        trainers = [
            lambda: LogisticRegressionWithSGD.train(rdd),
            lambda: SVMWithSGD.train(rdd),
            lambda: NaiveBayes.train(rdd),
            # feature 0 has 3 categories
            lambda: DecisionTree.trainClassifier(rdd, numClasses=2,
                                                 categoricalFeaturesInfo={0: 3}),
        ]
        for train in trainers:
            model = train()
            for position, feature in enumerate(features):
                prediction = model.predict(feature)
                # Even positions carry label 0.0, odd positions label 1.0.
                if position % 2 == 0:
                    self.assertTrue(prediction <= 0)
                else:
                    self.assertTrue(prediction > 0)

    def test_regression(self):
        """Regressors trained on SciPy features predict <= 0 / > 0 alternately."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        labelled = [(-1.0, {1: -1.0}), (1.0, {1: 1.0}), (-1.0, {1: -2.0}), (1.0, {1: 2.0})]
        data = [LabeledPoint(label, self.scipy_matrix(2, entry)) for label, entry in labelled]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]

        # Each model is trained lazily, right before its own assertions run.
        trainers = [
            lambda: LinearRegressionWithSGD.train(rdd),
            lambda: LassoWithSGD.train(rdd),
            lambda: RidgeRegressionWithSGD.train(rdd),
            # feature 0 has 2 categories
            lambda: DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo={0: 2}),
        ]
        for train in trainers:
            model = train()
            for position, feature in enumerate(features):
                prediction = model.predict(feature)
                # Even positions carry label -1.0, odd positions label 1.0.
                if position % 2 == 0:
                    self.assertTrue(prediction <= 0)
                else:
                    self.assertTrue(prediction > 0)
0642 
0643 
if __name__ == "__main__":
    from pyspark.mllib.tests.test_linalg import *

    # Use the XML-reporting runner when xmlrunner is installed; otherwise
    # leave the runner as None and let unittest fall back to its default.
    runner = None
    try:
        import xmlrunner
        runner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=runner, verbosity=2)