0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import sys
0019 import array as pyarray
0020 import unittest
0021
0022 from numpy import array, array_equal, zeros, arange, tile, ones, inf
0023
0024 import pyspark.ml.linalg as newlinalg
0025 from pyspark.serializers import PickleSerializer
0026 from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
0027 DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
0028 from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix
0029 from pyspark.mllib.regression import LabeledPoint
0030 from pyspark.sql import Row
0031 from pyspark.testing.mllibutils import MLlibTestCase
0032 from pyspark.testing.utils import have_scipy
0033
# Python 3 has no separate `long` type, so alias it to `int`; the tests below
# call long(...) and must work on both major versions.
# sys.version_info is compared instead of the free-form sys.version string,
# which is only meant for display.
if sys.version_info[0] >= 3:
    long = int
0036
0037
class VectorTests(MLlibTestCase):
    """
    Tests for the local MLlib vector and matrix types: serialization, dot
    product, squared distance, hashing/equality, indexing, string
    representations, parsing, norms, and conversion to and from the
    pyspark.ml.linalg types.
    """

    def _test_serialize(self, v):
        """
        Assert that `v` survives a Python pickle round trip and a
        Python -> JVM SerDe -> Python round trip, both on its own and as a
        list of 100 copies.
        """
        ser = PickleSerializer()
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
        jvec = serde.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(serde.dumps(jvec)))
        self.assertEqual(v, nv)
        vs = [v] * 100
        jvecs = serde.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(serde.dumps(jvecs)))
        self.assertEqual(vs, nvs)

    def test_serialize(self):
        # Dense vectors built from a range, a numpy array and a Python array,
        # sparse vectors (including an all-zero one), and both matrix kinds.
        self._test_serialize(DenseVector(range(10)))
        self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
        self._test_serialize(DenseVector(pyarray.array('d', range(10))))
        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
        self._test_serialize(SparseVector(3, {}))
        self._test_serialize(DenseMatrix(2, 3, range(6)))
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self._test_serialize(sm1)

    def test_dot(self):
        # Dot products between sparse/dense vectors, a 2-D numpy array and a
        # Python array.
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([1, 2, 3, 4])
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        arr = pyarray.array('d', [0, 1, 2, 3])
        self.assertEqual(10.0, sv.dot(dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
        self.assertEqual(30.0, dv.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
        self.assertEqual(30.0, lst.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
        self.assertEqual(7.0, sv.dot(arr))

    def test_squared_distance(self):
        def squared_distance(a, b):
            # Dispatch on whichever argument is a Vector so that plain lists
            # and arrays can be passed on either side.
            if isinstance(a, Vector):
                return a.squared_distance(b)
            else:
                return b.squared_distance(a)

        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([4, 3, 2, 1])
        lst1 = [4, 3, 2, 1]
        arr = pyarray.array('d', [0, 2, 1, 3])
        narr = array([0, 2, 1, 3])
        self.assertEqual(15.0, squared_distance(sv, dv))
        self.assertEqual(25.0, squared_distance(sv, lst))
        self.assertEqual(20.0, squared_distance(dv, lst))
        self.assertEqual(15.0, squared_distance(dv, sv))
        self.assertEqual(25.0, squared_distance(lst, sv))
        self.assertEqual(20.0, squared_distance(lst, dv))
        self.assertEqual(0.0, squared_distance(sv, sv))
        self.assertEqual(0.0, squared_distance(dv, dv))
        self.assertEqual(0.0, squared_distance(lst, lst))
        self.assertEqual(25.0, squared_distance(sv, lst1))
        self.assertEqual(3.0, squared_distance(sv, arr))
        self.assertEqual(3.0, squared_distance(sv, narr))

    def test_hash(self):
        # Vectors holding the same values must hash alike whether they are
        # dense or sparse; v4 differs in one value.
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        self.assertEqual(hash(v1), hash(v2))
        self.assertEqual(hash(v1), hash(v3))
        self.assertEqual(hash(v2), hash(v3))
        self.assertNotEqual(hash(v1), hash(v4))
        self.assertNotEqual(hash(v2), hash(v4))

    def test_eq(self):
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
        v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
        v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
        sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
        self.assertEqual(v1, v2)
        self.assertEqual(v1, v3)
        # `==` is spelled out so that __eq__ itself is exercised.
        self.assertFalse(v2 == v4)
        self.assertFalse(v1 == v5)
        self.assertFalse(v1 == v6)
        # Equality between a dense and a sparse matrix is checked in both
        # directions, so both classes' comparison code is covered.
        self.assertEqual(dm1, sm1)
        self.assertEqual(sm1, dm1)

    def test_equals(self):
        # Vectors._equals compares two (indices, values) pairs.
        indices = [1, 2, 4]
        values = [1., 3., 2.]
        self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))

    def test_conversion(self):
        # The backing array of a DenseVector is expected to be float64 for
        # both float64 and float32 input.
        v = array([1, 2, 3, 4], dtype='float64')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')
        v = array([1, 2, 3, 4], dtype='float32')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')

    def test_sparse_vector_indexing(self):
        sv = SparseVector(5, {1: 1, 3: 2})
        self.assertEqual(sv[0], 0.)
        self.assertEqual(sv[3], 2.)
        self.assertEqual(sv[1], 1.)
        self.assertEqual(sv[2], 0.)
        self.assertEqual(sv[4], 0.)
        # Negative indices count from the end.
        self.assertEqual(sv[-1], 0.)
        self.assertEqual(sv[-2], 2.)
        self.assertEqual(sv[-3], 0.)
        self.assertEqual(sv[-5], 0.)
        for ind in [5, -6]:
            self.assertRaises(IndexError, sv.__getitem__, ind)
        for ind in [7.8, '1']:
            self.assertRaises(TypeError, sv.__getitem__, ind)

        # Named zero_vec rather than `zeros` to avoid shadowing numpy's zeros.
        zero_vec = SparseVector(4, {})
        self.assertEqual(zero_vec[0], 0.0)
        self.assertEqual(zero_vec[3], 0.0)
        for ind in [4, -5]:
            self.assertRaises(IndexError, zero_vec.__getitem__, ind)

        empty = SparseVector(0, {})
        for ind in [-1, 0, 1]:
            self.assertRaises(IndexError, empty.__getitem__, ind)

    def test_sparse_vector_iteration(self):
        # Iterating a sparse vector yields every position, zeros included.
        self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
        self.assertListEqual(
            list(SparseVector(5, [0, 3], [1.0, 2.0])),
            [1.0, 0.0, 0.0, 2.0, 0.0])

    def test_matrix_indexing(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))

    def test_repr_dense_matrix(self):
        # These checks use assertEqual: assertTrue(actual, expected) treats
        # `expected` as the failure message and passes for any non-empty repr.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')

        # NOTE(review): the expected flag used to read False here even though
        # the matrix is built with True as its fourth argument; the value was
        # never actually compared. Confirm against DenseMatrix.__repr__.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)')

        # Adjacent literals are used instead of a backslash continuation so
        # that source indentation cannot leak into the expected string.
        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., '
            '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')

    def test_repr_sparse_matrix(self):
        # NOTE(review): these expected strings were previously passed to
        # assertTrue and therefore never compared; confirm them against
        # SparseMatrix.__repr__ / __str__ when first run.
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(
            repr(sm1t),
            'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')

        indices = tile(arange(6), 3)
        values = ones(18)
        sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
        self.assertEqual(
            repr(sm),
            "SparseMatrix(6, 3, [0, 6, 12, 18], "
            "[0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], "
            "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., "
            "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")

        self.assertEqual(
            str(sm),
            "6 X 3 CSCMatrix\n"
            "(0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n"
            "(0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n"
            "(0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")

        sm = SparseMatrix(1, 18, zeros(19), [], [])
        self.assertEqual(
            repr(sm),
            'SparseMatrix(1, 18, '
            '[0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')

    def test_sparse_matrix(self):
        # Attributes and repr of a sparse matrix built without isTransposed.
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(sm1.numRows, 3)
        self.assertEqual(sm1.numCols, 4)
        self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
        self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
        self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
        # assertEqual, not assertTrue: the latter would accept any repr.
        self.assertEqual(
            repr(sm1),
            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')

        # Element access and conversion to a dense numpy array.
        expected = [
            [0, 0, 0, 0],
            [1, 0, 4, 0],
            [2, 0, 5, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1[i, j])
        self.assertTrue(array_equal(sm1.toArray(), expected))

        for i, j in [(-1, 1), (4, 3), (3, 5)]:
            self.assertRaises(IndexError, sm1.__getitem__, (i, j))

        # Sparse -> dense -> sparse must give back the same structure.
        smnew = sm1.toDense().toSparse()
        self.assertEqual(sm1.numRows, smnew.numRows)
        self.assertEqual(sm1.numCols, smnew.numCols)
        self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
        self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
        self.assertTrue(array_equal(sm1.values, smnew.values))

        # Same checks for a matrix built with isTransposed=True.
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(sm1t.numRows, 3)
        self.assertEqual(sm1t.numCols, 4)
        self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
        self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
        self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])

        expected = [
            [3, 2, 0, 0],
            [0, 0, 4, 0],
            [9, 0, 8, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1t[i, j])
        self.assertTrue(array_equal(sm1t.toArray(), expected))

    def test_dense_matrix_is_transposed(self):
        # The same logical matrix given once with isTransposed=True and once
        # with the default value ordering.
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

    def test_parse_vector(self):
        # str() output must parse back to an equal vector.
        a = DenseVector([])
        self.assertEqual(str(a), '[]')
        self.assertEqual(Vectors.parse(str(a)), a)
        a = DenseVector([3, 4, 6, 7])
        self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
        self.assertEqual(Vectors.parse(str(a)), a)
        a = SparseVector(4, [], [])
        self.assertEqual(str(a), '(4,[],[])')
        self.assertEqual(SparseVector.parse(str(a)), a)
        a = SparseVector(4, [0, 2], [3, 4])
        self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
        self.assertEqual(Vectors.parse(str(a)), a)
        # Extra whitespace in the input is accepted.
        a = SparseVector(10, [0, 1], [4, 5])
        self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)

    def test_norms(self):
        # The 1- and inf-norm checks use assertEqual; with assertTrue the
        # expected value was only a failure message and nothing was compared.
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        # An explicitly stored zero is not counted as a non-zero.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)

    def test_ml_mllib_vector_conversion(self):
        # mllib -> ml
        mllibDV = Vectors.dense([1, 2, 3])
        mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
        mlDV2 = mllibDV.asML()
        self.assertEqual(mlDV2, mlDV1)

        mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV2 = mllibSV.asML()
        self.assertEqual(mlSV2, mlSV1)

        # ml -> mllib
        mllibDV1 = Vectors.dense([1, 2, 3])
        mlDV = newlinalg.Vectors.dense([1, 2, 3])
        mllibDV2 = Vectors.fromML(mlDV)
        self.assertEqual(mllibDV1, mllibDV2)

        mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
        mllibSV2 = Vectors.fromML(mlSV)
        self.assertEqual(mllibSV1, mllibSV2)

    def test_ml_mllib_matrix_conversion(self):
        # mllib -> ml
        mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
        mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
        mlDM2 = mllibDM.asML()
        self.assertEqual(mlDM2, mlDM1)

        mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
        mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
        mlDMt2 = mllibDMt.asML()
        self.assertEqual(mlDMt2, mlDMt1)

        mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM2 = mllibSM.asML()
        self.assertEqual(mlSM2, mlSM1)

        mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt2 = mllibSMt.asML()
        self.assertEqual(mlSMt2, mlSMt1)

        # ml -> mllib
        mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
        mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
        mllibDM2 = Matrices.fromML(mlDM)
        self.assertEqual(mllibDM1, mllibDM2)

        mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
        mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
        mllibDMt2 = Matrices.fromML(mlDMt)
        self.assertEqual(mllibDMt1, mllibDMt2)

        mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        mllibSM2 = Matrices.fromML(mlSM)
        self.assertEqual(mllibSM1, mllibSM2)

        mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        mllibSMt2 = Matrices.fromML(mlSMt)
        self.assertEqual(mllibSMt1, mllibSMt2)
0406
0407
class VectorUDTTests(MLlibTestCase):
    """
    Tests for VectorUDT: JSON schema round trip, serialize/deserialize,
    schema inference from an RDD of LabeledPoint, and building distributed
    matrices from DataFrames.
    """

    # Fixtures shared by the tests below: an empty and a populated vector of
    # each kind, plus the UDT instance under test.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        # The UDT rebuilt from its own JSON value must equal the original.
        rebuilt = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(rebuilt, self.udt)

    def test_serialization(self):
        # serialize followed by deserialize must give back an equal vector.
        for vec in (self.dv0, self.dv1, self.sv0, self.sv1):
            restored = self.udt.deserialize(self.udt.serialize(vec))
            self.assertEqual(vec, restored)

    def test_infer_schema(self):
        points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
        df = self.sc.parallelize(points).toDF()
        feature_fields = [f for f in df.schema.fields if f.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)
        collected = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            # Choose the fixture matching the concrete vector type that came back.
            if isinstance(vec, SparseVector):
                expected = self.sv1
            elif isinstance(vec, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (vec, type(vec)))
            self.assertEqual(vec, expected)

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        vector_df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        mat = RowMatrix(vector_df)
        self.assertEqual(mat.numRows(), 1)
        self.assertEqual(mat.numCols(), 1)
        # A DataFrame whose only column is a string literal must be rejected.
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(vector_df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        indexed_df = self.spark.createDataFrame([Row(long(0), Vectors.dense(1))])
        mat = IndexedRowMatrix(indexed_df)
        self.assertEqual(mat.numRows(), 1)
        self.assertEqual(mat.numCols(), 1)
        # Without the first column ("_1") construction must fail.
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(indexed_df.drop("_1"))
0456
0457
class MatrixUDTTests(MLlibTestCase):
    """
    Tests for MatrixUDT: JSON schema round trip, serialize/deserialize, and
    schema inference from an RDD containing matrices.
    """

    # Fixtures: a dense and a sparse matrix, each with and without
    # isTransposed=True, plus the UDT instance under test.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        # The UDT rebuilt from its own JSON value must equal the original.
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        # serialize followed by deserialize must give back an equal matrix.
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual is used throughout: assertTrue(actual, expected) treats
        # `expected` as the failure message and passes for any truthy value,
        # so neither the inferred type nor the matrices were being compared.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
0487
0488
@unittest.skipIf(not have_scipy, "SciPy not installed")
class SciPyTests(MLlibTestCase):

    """
    Exercise vector operations and MLlib algorithms with SciPy sparse
    matrices as input. The whole class is skipped when SciPy is missing.
    """

    def scipy_matrix(self, size, values):
        """Build a `size` x 1 SciPy lil_matrix from a {row: value} dictionary"""
        from scipy.sparse import lil_matrix
        column = lil_matrix((size, 1))
        for row, entry in values.items():
            column[row, 0] = entry
        return column

    def _assert_alternating_signs(self, model, features):
        """
        Assert that `model` predicts <= 0 for the features at even positions
        and > 0 for the features at odd positions, checked in order.
        """
        for position, feature in enumerate(features):
            if position % 2 == 0:
                self.assertTrue(model.predict(feature) <= 0)
            else:
                self.assertTrue(model.predict(feature) > 0)

    def test_serialize(self):
        ser = PickleSerializer()
        lil = self.scipy_matrix(4, {1: 1, 3: 2})
        sv = SparseVector(4, {1: 1, 3: 2})

        # Direct conversion from every SciPy sparse format.
        self.assertEqual(sv, _convert_to_vector(lil))
        for to_format in (lil.tocsc, lil.tocoo, lil.tocsr, lil.todok):
            self.assertEqual(sv, _convert_to_vector(to_format()))

        def round_trip(matrix):
            # Convert to an MLlib vector, then pickle and unpickle it.
            return ser.loads(ser.dumps(_convert_to_vector(matrix)))

        self.assertEqual(sv, round_trip(lil))
        for to_format in (lil.tocsc, lil.tocsr, lil.todok):
            self.assertEqual(sv, round_trip(to_format()))

    def test_convert_to_vector(self):
        from scipy.sparse import csc_matrix

        # Row indices are deliberately given out of order (3 before 1).
        unsorted_csc = csc_matrix((array([2.0, 1.0]), array([3, 1]), array([0, 2])))
        self.assertFalse(unsorted_csc.has_sorted_indices)
        expected = SparseVector(4, {1: 1, 3: 2})
        self.assertEqual(expected, _convert_to_vector(unsorted_csc))

    def test_dot(self):
        column = self.scipy_matrix(4, {1: 1, 3: 2})
        dense = DenseVector(array([1., 2., 3., 4.]))
        self.assertEqual(10.0, dense.dot(column))

    def test_squared_distance(self):
        column = self.scipy_matrix(4, {1: 3, 3: 2})
        dense = DenseVector(array([1., 2., 3., 4.]))
        sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
        self.assertEqual(15.0, dense.squared_distance(column))
        self.assertEqual(15.0, sparse.squared_distance(column))

    def test_clustering(self):
        from pyspark.mllib.clustering import KMeans
        entries = [{1: 1.0}, {1: 1.1}, {2: 1.0}, {2: 1.1}]
        data = [self.scipy_matrix(3, entry) for entry in entries]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        # Points sharing a non-zero coordinate must land in the same cluster.
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]

        # Each model is trained and then checked before the next is trained.
        self._assert_alternating_signs(LogisticRegressionWithSGD.train(rdd), features)
        self._assert_alternating_signs(SVMWithSGD.train(rdd), features)
        self._assert_alternating_signs(NaiveBayes.train(rdd), features)

        tree = DecisionTree.trainClassifier(rdd, numClasses=2,
                                            categoricalFeaturesInfo={0: 3})
        self._assert_alternating_signs(tree, features)

    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]

        # Each model is trained and then checked before the next is trained.
        self._assert_alternating_signs(LinearRegressionWithSGD.train(rdd), features)
        self._assert_alternating_signs(LassoWithSGD.train(rdd), features)
        self._assert_alternating_signs(RidgeRegressionWithSGD.train(rdd), features)

        tree = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo={0: 2})
        self._assert_alternating_signs(tree, features)
0642
0643
if __name__ == "__main__":
    from pyspark.mllib.tests.test_linalg import *

    # Use the XML-reporting runner when xmlrunner is importable; otherwise
    # leave testRunner as None and let unittest.main choose its own runner.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)