0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 MLlib utilities for linear algebra. For dense vectors, MLlib
0020 uses the NumPy `array` type, so you can simply pass NumPy arrays
0021 around. For sparse vectors, users can construct a :class:`SparseVector`
0022 object from MLlib or pass SciPy `scipy.sparse` column vectors if
0023 SciPy is available in their environment.
0024 """
0025
0026 import sys
0027 import array
0028 import struct
0029
0030 if sys.version >= '3':
0031 basestring = str
0032 xrange = range
0033 import copyreg as copy_reg
0034 long = int
0035 else:
0036 from itertools import izip as zip
0037 import copy_reg
0038
0039 import numpy as np
0040
0041 from pyspark import since
0042 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
0043 IntegerType, ByteType, BooleanType
0044
0045
0046 __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
0047 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices']
0048
0049
0050 if sys.version_info[:2] == (2, 7):
0051
0052 def fast_pickle_array(ar):
0053 return array.array, (ar.typecode, ar.tostring())
0054 copy_reg.pickle(array.array, fast_pickle_array)
0055
0056
0057
0058
0059
0060 try:
0061 import scipy.sparse
0062 _have_scipy = True
0063 except:
0064
0065 _have_scipy = False
0066
0067
0068 def _convert_to_vector(l):
0069 if isinstance(l, Vector):
0070 return l
0071 elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
0072 return DenseVector(l)
0073 elif _have_scipy and scipy.sparse.issparse(l):
0074 assert l.shape[1] == 1, "Expected column vector"
0075
0076 csc = l.tocsc()
0077 if not csc.has_sorted_indices:
0078 csc.sort_indices()
0079 return SparseVector(l.shape[0], csc.indices, csc.data)
0080 else:
0081 raise TypeError("Cannot convert type %s into Vector" % type(l))
0082
0083
0084 def _vector_size(v):
0085 """
0086 Returns the size of the vector.
0087
0088 >>> _vector_size([1., 2., 3.])
0089 3
0090 >>> _vector_size((1., 2., 3.))
0091 3
0092 >>> _vector_size(array.array('d', [1., 2., 3.]))
0093 3
0094 >>> _vector_size(np.zeros(3))
0095 3
0096 >>> _vector_size(np.zeros((3, 1)))
0097 3
0098 >>> _vector_size(np.zeros((1, 3)))
0099 Traceback (most recent call last):
0100 ...
0101 ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
0102 """
0103 if isinstance(v, Vector):
0104 return len(v)
0105 elif type(v) in (array.array, list, tuple, xrange):
0106 return len(v)
0107 elif type(v) == np.ndarray:
0108 if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
0109 return len(v)
0110 else:
0111 raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
0112 elif _have_scipy and scipy.sparse.issparse(v):
0113 assert v.shape[1] == 1, "Expected column vector"
0114 return v.shape[0]
0115 else:
0116 raise TypeError("Cannot treat type %s as a vector" % type(v))
0117
0118
0119 def _format_float(f, digits=4):
0120 s = str(round(f, digits))
0121 if '.' in s:
0122 s = s[:s.index('.') + 1 + digits]
0123 return s
0124
0125
0126 def _format_float_list(l):
0127 return [_format_float(x) for x in l]
0128
0129
0130 def _double_to_long_bits(value):
0131 if np.isnan(value):
0132 value = float('nan')
0133
0134 return struct.unpack('Q', struct.pack('d', value))[0]
0135
0136
0137 class VectorUDT(UserDefinedType):
0138 """
0139 SQL user-defined type (UDT) for Vector.
0140 """
0141
0142 @classmethod
0143 def sqlType(cls):
0144 return StructType([
0145 StructField("type", ByteType(), False),
0146 StructField("size", IntegerType(), True),
0147 StructField("indices", ArrayType(IntegerType(), False), True),
0148 StructField("values", ArrayType(DoubleType(), False), True)])
0149
0150 @classmethod
0151 def module(cls):
0152 return "pyspark.ml.linalg"
0153
0154 @classmethod
0155 def scalaUDT(cls):
0156 return "org.apache.spark.ml.linalg.VectorUDT"
0157
0158 def serialize(self, obj):
0159 if isinstance(obj, SparseVector):
0160 indices = [int(i) for i in obj.indices]
0161 values = [float(v) for v in obj.values]
0162 return (0, obj.size, indices, values)
0163 elif isinstance(obj, DenseVector):
0164 values = [float(v) for v in obj]
0165 return (1, None, None, values)
0166 else:
0167 raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
0168
0169 def deserialize(self, datum):
0170 assert len(datum) == 4, \
0171 "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
0172 tpe = datum[0]
0173 if tpe == 0:
0174 return SparseVector(datum[1], datum[2], datum[3])
0175 elif tpe == 1:
0176 return DenseVector(datum[3])
0177 else:
0178 raise ValueError("do not recognize type %r" % tpe)
0179
0180 def simpleString(self):
0181 return "vector"
0182
0183
0184 class MatrixUDT(UserDefinedType):
0185 """
0186 SQL user-defined type (UDT) for Matrix.
0187 """
0188
0189 @classmethod
0190 def sqlType(cls):
0191 return StructType([
0192 StructField("type", ByteType(), False),
0193 StructField("numRows", IntegerType(), False),
0194 StructField("numCols", IntegerType(), False),
0195 StructField("colPtrs", ArrayType(IntegerType(), False), True),
0196 StructField("rowIndices", ArrayType(IntegerType(), False), True),
0197 StructField("values", ArrayType(DoubleType(), False), True),
0198 StructField("isTransposed", BooleanType(), False)])
0199
0200 @classmethod
0201 def module(cls):
0202 return "pyspark.ml.linalg"
0203
0204 @classmethod
0205 def scalaUDT(cls):
0206 return "org.apache.spark.ml.linalg.MatrixUDT"
0207
0208 def serialize(self, obj):
0209 if isinstance(obj, SparseMatrix):
0210 colPtrs = [int(i) for i in obj.colPtrs]
0211 rowIndices = [int(i) for i in obj.rowIndices]
0212 values = [float(v) for v in obj.values]
0213 return (0, obj.numRows, obj.numCols, colPtrs,
0214 rowIndices, values, bool(obj.isTransposed))
0215 elif isinstance(obj, DenseMatrix):
0216 values = [float(v) for v in obj.values]
0217 return (1, obj.numRows, obj.numCols, None, None, values,
0218 bool(obj.isTransposed))
0219 else:
0220 raise TypeError("cannot serialize type %r" % (type(obj)))
0221
0222 def deserialize(self, datum):
0223 assert len(datum) == 7, \
0224 "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
0225 tpe = datum[0]
0226 if tpe == 0:
0227 return SparseMatrix(*datum[1:])
0228 elif tpe == 1:
0229 return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
0230 else:
0231 raise ValueError("do not recognize type %r" % tpe)
0232
0233 def simpleString(self):
0234 return "matrix"
0235
0236
0237 class Vector(object):
0238
0239 __UDT__ = VectorUDT()
0240
0241 """
0242 Abstract class for DenseVector and SparseVector
0243 """
0244 def toArray(self):
0245 """
0246 Convert the vector into an numpy.ndarray
0247
0248 :return: numpy.ndarray
0249 """
0250 raise NotImplementedError
0251
0252
0253 class DenseVector(Vector):
0254 """
0255 A dense vector represented by a value array. We use numpy array for
0256 storage and arithmetics will be delegated to the underlying numpy
0257 array.
0258
0259 >>> v = Vectors.dense([1.0, 2.0])
0260 >>> u = Vectors.dense([3.0, 4.0])
0261 >>> v + u
0262 DenseVector([4.0, 6.0])
0263 >>> 2 - v
0264 DenseVector([1.0, 0.0])
0265 >>> v / 2
0266 DenseVector([0.5, 1.0])
0267 >>> v * u
0268 DenseVector([3.0, 8.0])
0269 >>> u / v
0270 DenseVector([3.0, 2.0])
0271 >>> u % 2
0272 DenseVector([1.0, 0.0])
0273 >>> -v
0274 DenseVector([-1.0, -2.0])
0275 """
0276 def __init__(self, ar):
0277 if isinstance(ar, bytes):
0278 ar = np.frombuffer(ar, dtype=np.float64)
0279 elif not isinstance(ar, np.ndarray):
0280 ar = np.array(ar, dtype=np.float64)
0281 if ar.dtype != np.float64:
0282 ar = ar.astype(np.float64)
0283 self.array = ar
0284
0285 def __reduce__(self):
0286 return DenseVector, (self.array.tostring(),)
0287
0288 def numNonzeros(self):
0289 """
0290 Number of nonzero elements. This scans all active values and count non zeros
0291 """
0292 return np.count_nonzero(self.array)
0293
0294 def norm(self, p):
0295 """
0296 Calculates the norm of a DenseVector.
0297
0298 >>> a = DenseVector([0, -1, 2, -3])
0299 >>> a.norm(2)
0300 3.7...
0301 >>> a.norm(1)
0302 6.0
0303 """
0304 return np.linalg.norm(self.array, p)
0305
0306 def dot(self, other):
0307 """
0308 Compute the dot product of two Vectors. We support
0309 (Numpy array, list, SparseVector, or SciPy sparse)
0310 and a target NumPy array that is either 1- or 2-dimensional.
0311 Equivalent to calling numpy.dot of the two vectors.
0312
0313 >>> dense = DenseVector(array.array('d', [1., 2.]))
0314 >>> dense.dot(dense)
0315 5.0
0316 >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
0317 4.0
0318 >>> dense.dot(range(1, 3))
0319 5.0
0320 >>> dense.dot(np.array(range(1, 3)))
0321 5.0
0322 >>> dense.dot([1.,])
0323 Traceback (most recent call last):
0324 ...
0325 AssertionError: dimension mismatch
0326 >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
0327 array([ 5., 11.])
0328 >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
0329 Traceback (most recent call last):
0330 ...
0331 AssertionError: dimension mismatch
0332 """
0333 if type(other) == np.ndarray:
0334 if other.ndim > 1:
0335 assert len(self) == other.shape[0], "dimension mismatch"
0336 return np.dot(self.array, other)
0337 elif _have_scipy and scipy.sparse.issparse(other):
0338 assert len(self) == other.shape[0], "dimension mismatch"
0339 return other.transpose().dot(self.toArray())
0340 else:
0341 assert len(self) == _vector_size(other), "dimension mismatch"
0342 if isinstance(other, SparseVector):
0343 return other.dot(self)
0344 elif isinstance(other, Vector):
0345 return np.dot(self.toArray(), other.toArray())
0346 else:
0347 return np.dot(self.toArray(), other)
0348
0349 def squared_distance(self, other):
0350 """
0351 Squared distance of two Vectors.
0352
0353 >>> dense1 = DenseVector(array.array('d', [1., 2.]))
0354 >>> dense1.squared_distance(dense1)
0355 0.0
0356 >>> dense2 = np.array([2., 1.])
0357 >>> dense1.squared_distance(dense2)
0358 2.0
0359 >>> dense3 = [2., 1.]
0360 >>> dense1.squared_distance(dense3)
0361 2.0
0362 >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
0363 >>> dense1.squared_distance(sparse1)
0364 2.0
0365 >>> dense1.squared_distance([1.,])
0366 Traceback (most recent call last):
0367 ...
0368 AssertionError: dimension mismatch
0369 >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
0370 Traceback (most recent call last):
0371 ...
0372 AssertionError: dimension mismatch
0373 """
0374 assert len(self) == _vector_size(other), "dimension mismatch"
0375 if isinstance(other, SparseVector):
0376 return other.squared_distance(self)
0377 elif _have_scipy and scipy.sparse.issparse(other):
0378 return _convert_to_vector(other).squared_distance(self)
0379
0380 if isinstance(other, Vector):
0381 other = other.toArray()
0382 elif not isinstance(other, np.ndarray):
0383 other = np.array(other)
0384 diff = self.toArray() - other
0385 return np.dot(diff, diff)
0386
0387 def toArray(self):
0388 """
0389 Returns the underlying numpy.ndarray
0390 """
0391 return self.array
0392
0393 @property
0394 def values(self):
0395 """
0396 Returns the underlying numpy.ndarray
0397 """
0398 return self.array
0399
0400 def __getitem__(self, item):
0401 return self.array[item]
0402
0403 def __len__(self):
0404 return len(self.array)
0405
0406 def __str__(self):
0407 return "[" + ",".join([str(v) for v in self.array]) + "]"
0408
0409 def __repr__(self):
0410 return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
0411
0412 def __eq__(self, other):
0413 if isinstance(other, DenseVector):
0414 return np.array_equal(self.array, other.array)
0415 elif isinstance(other, SparseVector):
0416 if len(self) != other.size:
0417 return False
0418 return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
0419 return False
0420
0421 def __ne__(self, other):
0422 return not self == other
0423
0424 def __hash__(self):
0425 size = len(self)
0426 result = 31 + size
0427 nnz = 0
0428 i = 0
0429 while i < size and nnz < 128:
0430 if self.array[i] != 0:
0431 result = 31 * result + i
0432 bits = _double_to_long_bits(self.array[i])
0433 result = 31 * result + (bits ^ (bits >> 32))
0434 nnz += 1
0435 i += 1
0436 return result
0437
0438 def __getattr__(self, item):
0439 return getattr(self.array, item)
0440
0441 def __neg__(self):
0442 return DenseVector(-self.array)
0443
0444 def _delegate(op):
0445 def func(self, other):
0446 if isinstance(other, DenseVector):
0447 other = other.array
0448 return DenseVector(getattr(self.array, op)(other))
0449 return func
0450
0451 __add__ = _delegate("__add__")
0452 __sub__ = _delegate("__sub__")
0453 __mul__ = _delegate("__mul__")
0454 __div__ = _delegate("__div__")
0455 __truediv__ = _delegate("__truediv__")
0456 __mod__ = _delegate("__mod__")
0457 __radd__ = _delegate("__radd__")
0458 __rsub__ = _delegate("__rsub__")
0459 __rmul__ = _delegate("__rmul__")
0460 __rdiv__ = _delegate("__rdiv__")
0461 __rtruediv__ = _delegate("__rtruediv__")
0462 __rmod__ = _delegate("__rmod__")
0463
0464
0465 class SparseVector(Vector):
0466 """
0467 A simple sparse vector class for passing data to MLlib. Users may
0468 alternatively pass SciPy's {scipy.sparse} data types.
0469 """
0470 def __init__(self, size, *args):
0471 """
0472 Create a sparse vector, using either a dictionary, a list of
0473 (index, value) pairs, or two separate arrays of indices and
0474 values (sorted by index).
0475
0476 :param size: Size of the vector.
0477 :param args: Active entries, as a dictionary {index: value, ...},
0478 a list of tuples [(index, value), ...], or a list of strictly
0479 increasing indices and a list of corresponding values [index, ...],
0480 [value, ...]. Inactive entries are treated as zeros.
0481
0482 >>> SparseVector(4, {1: 1.0, 3: 5.5})
0483 SparseVector(4, {1: 1.0, 3: 5.5})
0484 >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
0485 SparseVector(4, {1: 1.0, 3: 5.5})
0486 >>> SparseVector(4, [1, 3], [1.0, 5.5])
0487 SparseVector(4, {1: 1.0, 3: 5.5})
0488 >>> SparseVector(4, {1:1.0, 6:2.0})
0489 Traceback (most recent call last):
0490 ...
0491 AssertionError: Index 6 is out of the size of vector with size=4
0492 >>> SparseVector(4, {-1:1.0})
0493 Traceback (most recent call last):
0494 ...
0495 AssertionError: Contains negative index -1
0496 """
0497 self.size = int(size)
0498 """ Size of the vector. """
0499 assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
0500 if len(args) == 1:
0501 pairs = args[0]
0502 if type(pairs) == dict:
0503 pairs = pairs.items()
0504 pairs = sorted(pairs)
0505 self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
0506 """ A list of indices corresponding to active entries. """
0507 self.values = np.array([p[1] for p in pairs], dtype=np.float64)
0508 """ A list of values corresponding to active entries. """
0509 else:
0510 if isinstance(args[0], bytes):
0511 assert isinstance(args[1], bytes), "values should be string too"
0512 if args[0]:
0513 self.indices = np.frombuffer(args[0], np.int32)
0514 self.values = np.frombuffer(args[1], np.float64)
0515 else:
0516
0517 self.indices = np.array([], dtype=np.int32)
0518 self.values = np.array([], dtype=np.float64)
0519 else:
0520 self.indices = np.array(args[0], dtype=np.int32)
0521 self.values = np.array(args[1], dtype=np.float64)
0522 assert len(self.indices) == len(self.values), "index and value arrays not same length"
0523 for i in xrange(len(self.indices) - 1):
0524 if self.indices[i] >= self.indices[i + 1]:
0525 raise TypeError(
0526 "Indices %s and %s are not strictly increasing"
0527 % (self.indices[i], self.indices[i + 1]))
0528
0529 if self.indices.size > 0:
0530 assert np.max(self.indices) < self.size, \
0531 "Index %d is out of the size of vector with size=%d" \
0532 % (np.max(self.indices), self.size)
0533 assert np.min(self.indices) >= 0, \
0534 "Contains negative index %d" % (np.min(self.indices))
0535
0536 def numNonzeros(self):
0537 """
0538 Number of nonzero elements. This scans all active values and count non zeros.
0539 """
0540 return np.count_nonzero(self.values)
0541
0542 def norm(self, p):
0543 """
0544 Calculates the norm of a SparseVector.
0545
0546 >>> a = SparseVector(4, [0, 1], [3., -4.])
0547 >>> a.norm(1)
0548 7.0
0549 >>> a.norm(2)
0550 5.0
0551 """
0552 return np.linalg.norm(self.values, p)
0553
0554 def __reduce__(self):
0555 return (
0556 SparseVector,
0557 (self.size, self.indices.tostring(), self.values.tostring()))
0558
0559 def dot(self, other):
0560 """
0561 Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
0562
0563 >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0564 >>> a.dot(a)
0565 25.0
0566 >>> a.dot(array.array('d', [1., 2., 3., 4.]))
0567 22.0
0568 >>> b = SparseVector(4, [2], [1.0])
0569 >>> a.dot(b)
0570 0.0
0571 >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
0572 array([ 22., 22.])
0573 >>> a.dot([1., 2., 3.])
0574 Traceback (most recent call last):
0575 ...
0576 AssertionError: dimension mismatch
0577 >>> a.dot(np.array([1., 2.]))
0578 Traceback (most recent call last):
0579 ...
0580 AssertionError: dimension mismatch
0581 >>> a.dot(DenseVector([1., 2.]))
0582 Traceback (most recent call last):
0583 ...
0584 AssertionError: dimension mismatch
0585 >>> a.dot(np.zeros((3, 2)))
0586 Traceback (most recent call last):
0587 ...
0588 AssertionError: dimension mismatch
0589 """
0590
0591 if isinstance(other, np.ndarray):
0592 if other.ndim not in [2, 1]:
0593 raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
0594 assert len(self) == other.shape[0], "dimension mismatch"
0595 return np.dot(self.values, other[self.indices])
0596
0597 assert len(self) == _vector_size(other), "dimension mismatch"
0598
0599 if isinstance(other, DenseVector):
0600 return np.dot(other.array[self.indices], self.values)
0601
0602 elif isinstance(other, SparseVector):
0603
0604 self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
0605 self_values = self.values[self_cmind]
0606 if self_values.size == 0:
0607 return 0.0
0608 else:
0609 other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
0610 return np.dot(self_values, other.values[other_cmind])
0611
0612 else:
0613 return self.dot(_convert_to_vector(other))
0614
0615 def squared_distance(self, other):
0616 """
0617 Squared distance from a SparseVector or 1-dimensional NumPy array.
0618
0619 >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0620 >>> a.squared_distance(a)
0621 0.0
0622 >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
0623 11.0
0624 >>> a.squared_distance(np.array([1., 2., 3., 4.]))
0625 11.0
0626 >>> b = SparseVector(4, [2], [1.0])
0627 >>> a.squared_distance(b)
0628 26.0
0629 >>> b.squared_distance(a)
0630 26.0
0631 >>> b.squared_distance([1., 2.])
0632 Traceback (most recent call last):
0633 ...
0634 AssertionError: dimension mismatch
0635 >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
0636 Traceback (most recent call last):
0637 ...
0638 AssertionError: dimension mismatch
0639 """
0640 assert len(self) == _vector_size(other), "dimension mismatch"
0641
0642 if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
0643 if isinstance(other, np.ndarray) and other.ndim != 1:
0644 raise Exception("Cannot call squared_distance with %d-dimensional array" %
0645 other.ndim)
0646 if isinstance(other, DenseVector):
0647 other = other.array
0648 sparse_ind = np.zeros(other.size, dtype=bool)
0649 sparse_ind[self.indices] = True
0650 dist = other[sparse_ind] - self.values
0651 result = np.dot(dist, dist)
0652
0653 other_ind = other[~sparse_ind]
0654 result += np.dot(other_ind, other_ind)
0655 return result
0656
0657 elif isinstance(other, SparseVector):
0658 result = 0.0
0659 i, j = 0, 0
0660 while i < len(self.indices) and j < len(other.indices):
0661 if self.indices[i] == other.indices[j]:
0662 diff = self.values[i] - other.values[j]
0663 result += diff * diff
0664 i += 1
0665 j += 1
0666 elif self.indices[i] < other.indices[j]:
0667 result += self.values[i] * self.values[i]
0668 i += 1
0669 else:
0670 result += other.values[j] * other.values[j]
0671 j += 1
0672 while i < len(self.indices):
0673 result += self.values[i] * self.values[i]
0674 i += 1
0675 while j < len(other.indices):
0676 result += other.values[j] * other.values[j]
0677 j += 1
0678 return result
0679 else:
0680 return self.squared_distance(_convert_to_vector(other))
0681
0682 def toArray(self):
0683 """
0684 Returns a copy of this SparseVector as a 1-dimensional numpy.ndarray.
0685 """
0686 arr = np.zeros((self.size,), dtype=np.float64)
0687 arr[self.indices] = self.values
0688 return arr
0689
0690 def __len__(self):
0691 return self.size
0692
0693 def __str__(self):
0694 inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
0695 vals = "[" + ",".join([str(v) for v in self.values]) + "]"
0696 return "(" + ",".join((str(self.size), inds, vals)) + ")"
0697
0698 def __repr__(self):
0699 inds = self.indices
0700 vals = self.values
0701 entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
0702 for i in xrange(len(inds))])
0703 return "SparseVector({0}, {{{1}}})".format(self.size, entries)
0704
0705 def __eq__(self, other):
0706 if isinstance(other, SparseVector):
0707 return other.size == self.size and np.array_equal(other.indices, self.indices) \
0708 and np.array_equal(other.values, self.values)
0709 elif isinstance(other, DenseVector):
0710 if self.size != len(other):
0711 return False
0712 return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
0713 return False
0714
0715 def __getitem__(self, index):
0716 inds = self.indices
0717 vals = self.values
0718 if not isinstance(index, int):
0719 raise TypeError(
0720 "Indices must be of type integer, got type %s" % type(index))
0721
0722 if index >= self.size or index < -self.size:
0723 raise IndexError("Index %d out of bounds." % index)
0724 if index < 0:
0725 index += self.size
0726
0727 if (inds.size == 0) or (index > inds.item(-1)):
0728 return 0.
0729
0730 insert_index = np.searchsorted(inds, index)
0731 row_ind = inds[insert_index]
0732 if row_ind == index:
0733 return vals[insert_index]
0734 return 0.
0735
0736 def __ne__(self, other):
0737 return not self.__eq__(other)
0738
0739 def __hash__(self):
0740 result = 31 + self.size
0741 nnz = 0
0742 i = 0
0743 while i < len(self.values) and nnz < 128:
0744 if self.values[i] != 0:
0745 result = 31 * result + int(self.indices[i])
0746 bits = _double_to_long_bits(self.values[i])
0747 result = 31 * result + (bits ^ (bits >> 32))
0748 nnz += 1
0749 i += 1
0750 return result
0751
0752
0753 class Vectors(object):
0754
0755 """
0756 Factory methods for working with vectors.
0757
0758 .. note:: Dense vectors are simply represented as NumPy array objects,
0759 so there is no need to covert them for use in MLlib. For sparse vectors,
0760 the factory methods in this class create an MLlib-compatible type, or users
0761 can pass in SciPy's `scipy.sparse` column vectors.
0762 """
0763
0764 @staticmethod
0765 def sparse(size, *args):
0766 """
0767 Create a sparse vector, using either a dictionary, a list of
0768 (index, value) pairs, or two separate arrays of indices and
0769 values (sorted by index).
0770
0771 :param size: Size of the vector.
0772 :param args: Non-zero entries, as a dictionary, list of tuples,
0773 or two sorted lists containing indices and values.
0774
0775 >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
0776 SparseVector(4, {1: 1.0, 3: 5.5})
0777 >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
0778 SparseVector(4, {1: 1.0, 3: 5.5})
0779 >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
0780 SparseVector(4, {1: 1.0, 3: 5.5})
0781 """
0782 return SparseVector(size, *args)
0783
0784 @staticmethod
0785 def dense(*elements):
0786 """
0787 Create a dense vector of 64-bit floats from a Python list or numbers.
0788
0789 >>> Vectors.dense([1, 2, 3])
0790 DenseVector([1.0, 2.0, 3.0])
0791 >>> Vectors.dense(1.0, 2.0)
0792 DenseVector([1.0, 2.0])
0793 """
0794 if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
0795
0796 elements = elements[0]
0797 return DenseVector(elements)
0798
0799 @staticmethod
0800 def squared_distance(v1, v2):
0801 """
0802 Squared distance between two vectors.
0803 a and b can be of type SparseVector, DenseVector, np.ndarray
0804 or array.array.
0805
0806 >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
0807 >>> b = Vectors.dense([2, 5, 4, 1])
0808 >>> a.squared_distance(b)
0809 51.0
0810 """
0811 v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
0812 return v1.squared_distance(v2)
0813
0814 @staticmethod
0815 def norm(vector, p):
0816 """
0817 Find norm of the given vector.
0818 """
0819 return _convert_to_vector(vector).norm(p)
0820
0821 @staticmethod
0822 def zeros(size):
0823 return DenseVector(np.zeros(size))
0824
0825 @staticmethod
0826 def _equals(v1_indices, v1_values, v2_indices, v2_values):
0827 """
0828 Check equality between sparse/dense vectors,
0829 v1_indices and v2_indices assume to be strictly increasing.
0830 """
0831 v1_size = len(v1_values)
0832 v2_size = len(v2_values)
0833 k1 = 0
0834 k2 = 0
0835 all_equal = True
0836 while all_equal:
0837 while k1 < v1_size and v1_values[k1] == 0:
0838 k1 += 1
0839 while k2 < v2_size and v2_values[k2] == 0:
0840 k2 += 1
0841
0842 if k1 >= v1_size or k2 >= v2_size:
0843 return k1 >= v1_size and k2 >= v2_size
0844
0845 all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
0846 k1 += 1
0847 k2 += 1
0848 return all_equal
0849
0850
0851 class Matrix(object):
0852
0853 __UDT__ = MatrixUDT()
0854
0855 """
0856 Represents a local matrix.
0857 """
0858 def __init__(self, numRows, numCols, isTransposed=False):
0859 self.numRows = numRows
0860 self.numCols = numCols
0861 self.isTransposed = isTransposed
0862
0863 def toArray(self):
0864 """
0865 Returns its elements in a numpy.ndarray.
0866 """
0867 raise NotImplementedError
0868
0869 @staticmethod
0870 def _convert_to_array(array_like, dtype):
0871 """
0872 Convert Matrix attributes which are array-like or buffer to array.
0873 """
0874 if isinstance(array_like, bytes):
0875 return np.frombuffer(array_like, dtype=dtype)
0876 return np.asarray(array_like, dtype=dtype)
0877
0878
0879 class DenseMatrix(Matrix):
0880 """
0881 Column-major dense matrix.
0882 """
0883 def __init__(self, numRows, numCols, values, isTransposed=False):
0884 Matrix.__init__(self, numRows, numCols, isTransposed)
0885 values = self._convert_to_array(values, np.float64)
0886 assert len(values) == numRows * numCols
0887 self.values = values
0888
0889 def __reduce__(self):
0890 return DenseMatrix, (
0891 self.numRows, self.numCols, self.values.tostring(),
0892 int(self.isTransposed))
0893
0894 def __str__(self):
0895 """
0896 Pretty printing of a DenseMatrix
0897
0898 >>> dm = DenseMatrix(2, 2, range(4))
0899 >>> print(dm)
0900 DenseMatrix([[ 0., 2.],
0901 [ 1., 3.]])
0902 >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
0903 >>> print(dm)
0904 DenseMatrix([[ 0., 1.],
0905 [ 2., 3.]])
0906 """
0907
0908 array_lines = repr(self.toArray()).splitlines()
0909
0910
0911
0912 x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
0913 return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
0914
0915 def __repr__(self):
0916 """
0917 Representation of a DenseMatrix
0918
0919 >>> dm = DenseMatrix(2, 2, range(4))
0920 >>> dm
0921 DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
0922 """
0923
0924
0925 if len(self.values) < 17:
0926 entries = _format_float_list(self.values)
0927 else:
0928 entries = (
0929 _format_float_list(self.values[:8]) +
0930 ["..."] +
0931 _format_float_list(self.values[-8:])
0932 )
0933
0934 entries = ", ".join(entries)
0935 return "DenseMatrix({0}, {1}, [{2}], {3})".format(
0936 self.numRows, self.numCols, entries, self.isTransposed)
0937
0938 def toArray(self):
0939 """
0940 Return a numpy.ndarray
0941
0942 >>> m = DenseMatrix(2, 2, range(4))
0943 >>> m.toArray()
0944 array([[ 0., 2.],
0945 [ 1., 3.]])
0946 """
0947 if self.isTransposed:
0948 return np.asfortranarray(
0949 self.values.reshape((self.numRows, self.numCols)))
0950 else:
0951 return self.values.reshape((self.numRows, self.numCols), order='F')
0952
0953 def toSparse(self):
0954 """Convert to SparseMatrix"""
0955 if self.isTransposed:
0956 values = np.ravel(self.toArray(), order='F')
0957 else:
0958 values = self.values
0959 indices = np.nonzero(values)[0]
0960 colCounts = np.bincount(indices // self.numRows)
0961 colPtrs = np.cumsum(np.hstack(
0962 (0, colCounts, np.zeros(self.numCols - colCounts.size))))
0963 values = values[indices]
0964 rowIndices = indices % self.numRows
0965
0966 return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
0967
0968 def __getitem__(self, indices):
0969 i, j = indices
0970 if i < 0 or i >= self.numRows:
0971 raise IndexError("Row index %d is out of range [0, %d)"
0972 % (i, self.numRows))
0973 if j >= self.numCols or j < 0:
0974 raise IndexError("Column index %d is out of range [0, %d)"
0975 % (j, self.numCols))
0976
0977 if self.isTransposed:
0978 return self.values[i * self.numCols + j]
0979 else:
0980 return self.values[i + j * self.numRows]
0981
0982 def __eq__(self, other):
0983 if (self.numRows != other.numRows or self.numCols != other.numCols):
0984 return False
0985 if isinstance(other, SparseMatrix):
0986 return np.all(self.toArray() == other.toArray())
0987
0988 self_values = np.ravel(self.toArray(), order='F')
0989 other_values = np.ravel(other.toArray(), order='F')
0990 return np.all(self_values == other_values)
0991
0992
0993 class SparseMatrix(Matrix):
0994 """Sparse Matrix stored in CSC format."""
0995 def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
0996 isTransposed=False):
0997 Matrix.__init__(self, numRows, numCols, isTransposed)
0998 self.colPtrs = self._convert_to_array(colPtrs, np.int32)
0999 self.rowIndices = self._convert_to_array(rowIndices, np.int32)
1000 self.values = self._convert_to_array(values, np.float64)
1001
1002 if self.isTransposed:
1003 if self.colPtrs.size != numRows + 1:
1004 raise ValueError("Expected colPtrs of size %d, got %d."
1005 % (numRows + 1, self.colPtrs.size))
1006 else:
1007 if self.colPtrs.size != numCols + 1:
1008 raise ValueError("Expected colPtrs of size %d, got %d."
1009 % (numCols + 1, self.colPtrs.size))
1010 if self.rowIndices.size != self.values.size:
1011 raise ValueError("Expected rowIndices of length %d, got %d."
1012 % (self.rowIndices.size, self.values.size))
1013
1014 def __str__(self):
1015 """
1016 Pretty printing of a SparseMatrix
1017
1018 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1019 >>> print(sm1)
1020 2 X 2 CSCMatrix
1021 (0,0) 2.0
1022 (1,0) 3.0
1023 (1,1) 4.0
1024 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
1025 >>> print(sm1)
1026 2 X 2 CSRMatrix
1027 (0,0) 2.0
1028 (0,1) 3.0
1029 (1,1) 4.0
1030 """
1031 spstr = "{0} X {1} ".format(self.numRows, self.numCols)
1032 if self.isTransposed:
1033 spstr += "CSRMatrix\n"
1034 else:
1035 spstr += "CSCMatrix\n"
1036
1037 cur_col = 0
1038 smlist = []
1039
1040
1041 if len(self.values) <= 16:
1042 zipindval = zip(self.rowIndices, self.values)
1043 else:
1044 zipindval = zip(self.rowIndices[:16], self.values[:16])
1045 for i, (rowInd, value) in enumerate(zipindval):
1046 if self.colPtrs[cur_col + 1] <= i:
1047 cur_col += 1
1048 if self.isTransposed:
1049 smlist.append('({0},{1}) {2}'.format(
1050 cur_col, rowInd, _format_float(value)))
1051 else:
1052 smlist.append('({0},{1}) {2}'.format(
1053 rowInd, cur_col, _format_float(value)))
1054 spstr += "\n".join(smlist)
1055
1056 if len(self.values) > 16:
1057 spstr += "\n.." * 2
1058 return spstr
1059
1060 def __repr__(self):
1061 """
1062 Representation of a SparseMatrix
1063
1064 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1065 >>> sm1
1066 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
1067 """
1068 rowIndices = list(self.rowIndices)
1069 colPtrs = list(self.colPtrs)
1070
1071 if len(self.values) <= 16:
1072 values = _format_float_list(self.values)
1073
1074 else:
1075 values = (
1076 _format_float_list(self.values[:8]) +
1077 ["..."] +
1078 _format_float_list(self.values[-8:])
1079 )
1080 rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
1081
1082 if len(self.colPtrs) > 16:
1083 colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
1084
1085 values = ", ".join(values)
1086 rowIndices = ", ".join([str(ind) for ind in rowIndices])
1087 colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
1088 return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
1089 self.numRows, self.numCols, colPtrs, rowIndices,
1090 values, self.isTransposed)
1091
1092 def __reduce__(self):
1093 return SparseMatrix, (
1094 self.numRows, self.numCols, self.colPtrs.tostring(),
1095 self.rowIndices.tostring(), self.values.tostring(),
1096 int(self.isTransposed))
1097
1098 def __getitem__(self, indices):
1099 i, j = indices
1100 if i < 0 or i >= self.numRows:
1101 raise IndexError("Row index %d is out of range [0, %d)"
1102 % (i, self.numRows))
1103 if j < 0 or j >= self.numCols:
1104 raise IndexError("Column index %d is out of range [0, %d)"
1105 % (j, self.numCols))
1106
1107
1108
1109
1110 if self.isTransposed:
1111 j, i = i, j
1112
1113 colStart = self.colPtrs[j]
1114 colEnd = self.colPtrs[j + 1]
1115 nz = self.rowIndices[colStart: colEnd]
1116 ind = np.searchsorted(nz, i) + colStart
1117 if ind < colEnd and self.rowIndices[ind] == i:
1118 return self.values[ind]
1119 else:
1120 return 0.0
1121
1122 def toArray(self):
1123 """
1124 Return a numpy.ndarray
1125 """
1126 A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
1127 for k in xrange(self.colPtrs.size - 1):
1128 startptr = self.colPtrs[k]
1129 endptr = self.colPtrs[k + 1]
1130 if self.isTransposed:
1131 A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
1132 else:
1133 A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
1134 return A
1135
1136 def toDense(self):
1137 densevals = np.ravel(self.toArray(), order='F')
1138 return DenseMatrix(self.numRows, self.numCols, densevals)
1139
1140
1141 def __eq__(self, other):
1142 return np.all(self.toArray() == other.toArray())
1143
1144
1145 class Matrices(object):
1146 @staticmethod
1147 def dense(numRows, numCols, values):
1148 """
1149 Create a DenseMatrix
1150 """
1151 return DenseMatrix(numRows, numCols, values)
1152
1153 @staticmethod
1154 def sparse(numRows, numCols, colPtrs, rowIndices, values):
1155 """
1156 Create a SparseMatrix
1157 """
1158 return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
1159
1160
1161 def _test():
1162 import doctest
1163 try:
1164
1165 np.set_printoptions(legacy='1.13')
1166 except TypeError:
1167 pass
1168 (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
1169 if failure_count:
1170 sys.exit(-1)
1171
1172 if __name__ == "__main__":
1173 _test()