0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 import unittest
0019 import array as pyarray
0020
0021 from numpy import arange, array, array_equal, inf, ones, tile, zeros
0022
0023 from pyspark.serializers import PickleSerializer
0024 from pyspark.ml.linalg import DenseMatrix, DenseVector, MatrixUDT, SparseMatrix, SparseVector, \
0025 Vector, VectorUDT, Vectors
0026 from pyspark.testing.mllibutils import MLlibTestCase
0027 from pyspark.sql import Row
0028
0029
class VectorTests(MLlibTestCase):
    """Tests for the local vector and matrix types imported from ``pyspark.ml.linalg``."""

    def _test_serialize(self, v):
        """Round-trip ``v`` through pickle and through the JVM-side ``MLSerDe``.

        The value is checked on its own and as a list of 100 references to it.
        """
        ser = PickleSerializer()
        # Resolve the py4j handle once instead of on every call below.
        serde = self.sc._jvm.org.apache.spark.ml.python.MLSerDe

        self.assertEqual(v, ser.loads(ser.dumps(v)))
        jvec = serde.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(serde.dumps(jvec)))
        self.assertEqual(v, nv)

        vs = [v] * 100
        jvecs = serde.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(serde.dumps(jvecs)))
        self.assertEqual(vs, nvs)

    def test_serialize(self):
        """Dense/sparse vectors and matrices survive the serialization round trip."""
        self._test_serialize(DenseVector(range(10)))
        self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
        self._test_serialize(DenseVector(pyarray.array('d', range(10))))
        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
        self._test_serialize(SparseVector(3, {}))
        self._test_serialize(DenseMatrix(2, 3, range(6)))
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self._test_serialize(sm1)

    def test_dot(self):
        """``dot`` works against vectors, 2-D numpy arrays and ``array.array``."""
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([1, 2, 3, 4])
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        arr = pyarray.array('d', [0, 1, 2, 3])
        self.assertEqual(10.0, sv.dot(dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
        self.assertEqual(30.0, dv.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
        self.assertEqual(30.0, lst.dot(dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
        self.assertEqual(7.0, sv.dot(arr))

    def test_squared_distance(self):
        """``squared_distance`` agrees across vector, list, array and ndarray operands."""
        def squared_distance(a, b):
            # Dispatch on whichever operand is a Vector so either order works.
            if isinstance(a, Vector):
                return a.squared_distance(b)
            else:
                return b.squared_distance(a)

        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([4, 3, 2, 1])
        lst1 = [4, 3, 2, 1]
        arr = pyarray.array('d', [0, 2, 1, 3])
        narr = array([0, 2, 1, 3])
        self.assertEqual(15.0, squared_distance(sv, dv))
        self.assertEqual(25.0, squared_distance(sv, lst))
        self.assertEqual(20.0, squared_distance(dv, lst))
        self.assertEqual(15.0, squared_distance(dv, sv))
        self.assertEqual(25.0, squared_distance(lst, sv))
        self.assertEqual(20.0, squared_distance(lst, dv))
        self.assertEqual(0.0, squared_distance(sv, sv))
        self.assertEqual(0.0, squared_distance(dv, dv))
        self.assertEqual(0.0, squared_distance(lst, lst))
        self.assertEqual(25.0, squared_distance(sv, lst1))
        self.assertEqual(3.0, squared_distance(sv, arr))
        self.assertEqual(3.0, squared_distance(sv, narr))

    def test_hash(self):
        """Equal dense/sparse vectors hash alike; these differing ones do not."""
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        self.assertEqual(hash(v1), hash(v2))
        self.assertEqual(hash(v1), hash(v3))
        self.assertEqual(hash(v2), hash(v3))
        self.assertNotEqual(hash(v1), hash(v4))
        self.assertNotEqual(hash(v2), hash(v4))

    def test_eq(self):
        """Equality compares contents across dense and sparse representations."""
        v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
        v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
        v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
        v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
        v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
        dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
        sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
        self.assertEqual(v1, v2)
        self.assertEqual(v1, v3)
        # ``==`` is spelled out so that ``__eq__`` itself is what gets exercised.
        self.assertFalse(v2 == v4)
        self.assertFalse(v1 == v5)
        self.assertFalse(v1 == v6)

        # Matrix equality is checked in both operand orders.
        self.assertEqual(dm1, sm1)
        self.assertEqual(sm1, dm1)

    def test_equals(self):
        """``Vectors._equals`` compares two (indices, values) pairs."""
        indices = [1, 2, 4]
        values = [1., 3., 2.]
        self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
        self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))

    def test_conversion(self):
        """``DenseVector`` holds float64 data for both float64 and float32 input."""
        v = array([1, 2, 3, 4], dtype='float64')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')
        v = array([1, 2, 3, 4], dtype='float32')
        dv = DenseVector(v)
        self.assertTrue(dv.array.dtype == 'float64')

    def test_sparse_vector_indexing(self):
        """Positive/negative indexing, out-of-range and wrong-type indices."""
        sv = SparseVector(5, {1: 1, 3: 2})
        self.assertEqual(sv[0], 0.)
        self.assertEqual(sv[3], 2.)
        self.assertEqual(sv[1], 1.)
        self.assertEqual(sv[2], 0.)
        self.assertEqual(sv[4], 0.)
        self.assertEqual(sv[-1], 0.)
        self.assertEqual(sv[-2], 2.)
        self.assertEqual(sv[-3], 0.)
        self.assertEqual(sv[-5], 0.)
        for ind in [5, -6]:
            self.assertRaises(IndexError, sv.__getitem__, ind)
        for ind in [7.8, '1']:
            self.assertRaises(TypeError, sv.__getitem__, ind)

        # Named ``zero_vec`` so the module-level numpy ``zeros`` import is not shadowed.
        zero_vec = SparseVector(4, {})
        self.assertEqual(zero_vec[0], 0.0)
        self.assertEqual(zero_vec[3], 0.0)
        for ind in [4, -5]:
            self.assertRaises(IndexError, zero_vec.__getitem__, ind)

        empty = SparseVector(0, {})
        for ind in [-1, 0, 1]:
            self.assertRaises(IndexError, empty.__getitem__, ind)

    def test_sparse_vector_iteration(self):
        """Iterating a ``SparseVector`` yields every entry, zeros included."""
        self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
        self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])

    def test_matrix_indexing(self):
        """``DenseMatrix[i, j]`` returns the expected entry and rejects bad indices."""
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))

    def test_repr_dense_matrix(self):
        """``repr(DenseMatrix)`` matches the expected text, including elision."""
        # assertEqual is required here: assertTrue(x, y) treats ``y`` as the
        # failure message and passes for any non-empty repr.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')

        # NOTE(review): the previous expectation ended in ``False`` but was never
        # actually compared. The last field is presumably the isTransposed flag
        # (the transposed SparseMatrix repr expectation ends in ``True``) -- confirm.
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(
            repr(mat),
            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)')

        # Implicit concatenation keeps source indentation out of the literal.
        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., '
            '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')

    def test_repr_sparse_matrix(self):
        """``repr``/``str`` of ``SparseMatrix`` match the expected text."""
        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(
            repr(sm1t),
            'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')

        indices = tile(arange(6), 3)
        values = ones(18)
        sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
        self.assertEqual(
            repr(sm),
            "SparseMatrix(6, 3, [0, 6, 12, 18], "
            "[0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], "
            "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., "
            "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")

        self.assertEqual(
            str(sm),
            "6 X 3 CSCMatrix\n"
            "(0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n"
            "(0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n"
            "(0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")

        sm = SparseMatrix(1, 18, zeros(19), [], [])
        self.assertEqual(
            repr(sm),
            'SparseMatrix(1, 18, '
            '[0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')

    def test_sparse_matrix(self):
        """Construction, indexing, ``toArray`` and dense/sparse conversion."""
        sm1 = SparseMatrix(
            3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(sm1.numRows, 3)
        self.assertEqual(sm1.numCols, 4)
        self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
        self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
        self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
        self.assertEqual(
            repr(sm1),
            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')

        # Element access must agree with the dense layout below.
        expected = [
            [0, 0, 0, 0],
            [1, 0, 4, 0],
            [2, 0, 5, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1[i, j])
        self.assertTrue(array_equal(sm1.toArray(), expected))

        for i, j in [(-1, 1), (4, 3), (3, 5)]:
            self.assertRaises(IndexError, sm1.__getitem__, (i, j))

        # Converting to dense and back must reproduce the same sparse structure.
        smnew = sm1.toDense().toSparse()
        self.assertEqual(sm1.numRows, smnew.numRows)
        self.assertEqual(sm1.numCols, smnew.numCols)
        self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
        self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
        self.assertTrue(array_equal(sm1.values, smnew.values))

        sm1t = SparseMatrix(
            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
            isTransposed=True)
        self.assertEqual(sm1t.numRows, 3)
        self.assertEqual(sm1t.numCols, 4)
        self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
        self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
        self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])

        expected = [
            [3, 2, 0, 0],
            [0, 0, 4, 0],
            [9, 0, 8, 0]]

        for i in range(3):
            for j in range(4):
                self.assertEqual(expected[i][j], sm1t[i, j])
        self.assertTrue(array_equal(sm1t.toArray(), expected))

    def test_dense_matrix_is_transposed(self):
        """A transposed ``DenseMatrix`` equals its non-transposed counterpart."""
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

    def test_norms(self):
        """1-, 2- and inf-norms for dense and sparse vectors, plus ``numNonzeros``."""
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        # These used assertTrue(value, expected), which compared nothing.
        self.assertAlmostEqual(a.norm(1), 6)
        self.assertAlmostEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertAlmostEqual(a.norm(1), 7)
        self.assertAlmostEqual(a.norm(inf), 4)

        # An explicitly stored zero must not be counted as a non-zero.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
0314
0315
class VectorUDTTests(MLlibTestCase):
    """Tests for ``VectorUDT``: JSON schema, serialization and schema inference."""

    # Fixtures shared by the tests below.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """The UDT rebuilt from its own JSON value equals the original."""
        rebuilt = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(rebuilt, self.udt)

    def test_serialization(self):
        """``deserialize(serialize(v))`` returns a vector equal to ``v``."""
        for vec in (self.dv0, self.dv1, self.sv0, self.sv1):
            restored = self.udt.deserialize(self.udt.serialize(vec))
            self.assertEqual(vec, restored)

    def test_infer_schema(self):
        """The ``features`` column of a DataFrame built from Rows is typed as the UDT."""
        rows = [
            Row(label=1.0, features=self.dv1),
            Row(label=0.0, features=self.sv1),
        ]
        df = self.sc.parallelize(rows).toDF()

        features_fields = [f for f in df.schema.fields if f.name == "features"]
        self.assertEqual(features_fields[0].dataType, self.udt)

        collected = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            # Choose the fixture matching the concrete type that came back.
            if isinstance(vec, SparseVector):
                reference = self.sv1
            elif isinstance(vec, DenseVector):
                reference = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (vec, type(vec)))
            self.assertEqual(vec, reference)
0347
0348
class MatrixUDTTests(MLlibTestCase):
    """Tests for ``MatrixUDT``: JSON schema, serialization and schema inference."""

    # Fixtures: dense and sparse matrices, each with and without isTransposed.
    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        """The UDT rebuilt from its own JSON value equals the original."""
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        """``deserialize(serialize(m))`` returns a matrix equal to ``m``."""
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        """The second column of a DataFrame built from (str, matrix) tuples is the UDT."""
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertEqual, not assertTrue: assertTrue(x, y) treats ``y`` as the
        # failure message and would pass for any truthy ``x``.
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
0378
0379
if __name__ == "__main__":
    # Re-import this module's tests by their package path before running them.
    from pyspark.ml.tests.test_linalg import *

    # Default to unittest's own runner; switch to the XML runner only when
    # the optional ``xmlrunner`` package can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)