0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 MLlib utilities for linear algebra. For dense vectors, MLlib
0020 uses the NumPy `array` type, so you can simply pass NumPy arrays
0021 around. For sparse vectors, users can construct a :class:`SparseVector`
0022 object from MLlib or pass SciPy `scipy.sparse` column vectors if
0023 SciPy is available in their environment.
0024 """
0025
0026 import sys
0027 import array
0028 import struct
0029
# Python 2/3 compatibility aliases used throughout this module.
# The check uses the numeric major version: comparing the ``sys.version``
# string lexicographically (``sys.version >= '3'``) misorders two-digit
# major versions, because '10...' sorts before '3'.
if sys.version_info[0] >= 3:
    basestring = str
    xrange = range
    import copyreg as copy_reg
    long = int
else:
    from itertools import izip as zip
    import copy_reg
0038
0039 import numpy as np
0040
0041 from pyspark import since
0042 from pyspark.ml import linalg as newlinalg
0043 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
0044 IntegerType, ByteType, BooleanType
0045
0046
0047 __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
0048 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices',
0049 'QRDecomposition']
0050
0051
if sys.version_info[0] == 2 and sys.version_info[1] == 7:

    def fast_pickle_array(ar):
        """
        Reduce an ``array.array`` to its constructor plus the arguments
        (typecode, raw string contents) needed to rebuild it.
        """
        rebuild_args = (ar.typecode, ar.tostring())
        return array.array, rebuild_args

    # Register the reducer so that pickling array.array goes through it.
    copy_reg.pickle(array.array, fast_pickle_array)
0057
0058
0059
0060
0061
# SciPy is optional: remember whether it could be imported so the helpers
# in this module only touch ``scipy.sparse`` when it is available.
try:
    import scipy.sparse
    _have_scipy = True
except Exception:
    # Any import-time failure means SciPy is unusable here; carry on without
    # it.  ``Exception`` (rather than a bare ``except:``) keeps this
    # best-effort while letting KeyboardInterrupt/SystemExit propagate.
    _have_scipy = False
0068
0069
def _convert_to_vector(l):
    """
    Coerce a vector-like object into a :class:`Vector`.

    :param l: a Vector (returned unchanged), an ``array.array``, NumPy
              array, list, tuple or xrange (wrapped in a DenseVector), or,
              when SciPy is importable, a scipy.sparse matrix with a
              single column (converted to a SparseVector).
    :raises TypeError: if the type of ``l`` is none of the above.
    """
    if isinstance(l, Vector):
        return l

    dense_types = (array.array, np.array, np.ndarray, list, tuple, xrange)
    if type(l) in dense_types:
        return DenseVector(l)

    if not (_have_scipy and scipy.sparse.issparse(l)):
        raise TypeError("Cannot convert type %s into Vector" % type(l))

    assert l.shape[1] == 1, "Expected column vector"
    # SparseVector requires increasing indices, so sort the CSC indices
    # before handing them over.
    column = l.tocsc()
    if not column.has_sorted_indices:
        column.sort_indices()
    return SparseVector(l.shape[0], column.indices, column.data)
0084
0085
def _vector_size(v):
    """
    Return the number of elements of a vector-like object.

    >>> _vector_size([1., 2., 3.])
    3
    >>> _vector_size((1., 2., 3.))
    3
    >>> _vector_size(array.array('d', [1., 2., 3.]))
    3
    >>> _vector_size(np.zeros(3))
    3
    >>> _vector_size(np.zeros((3, 1)))
    3
    >>> _vector_size(np.zeros((1, 3)))
    Traceback (most recent call last):
        ...
    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
    """
    if isinstance(v, Vector) or type(v) in (array.array, list, tuple, xrange):
        return len(v)

    if type(v) == np.ndarray:
        # Only 1-d arrays and (n, 1) column arrays count as vectors.
        is_column = v.ndim == 2 and v.shape[1] == 1
        if v.ndim == 1 or is_column:
            return len(v)
        raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))

    if _have_scipy and scipy.sparse.issparse(v):
        assert v.shape[1] == 1, "Expected column vector"
        return v.shape[0]

    raise TypeError("Cannot treat type %s as a vector" % type(v))
0119
0120
def _format_float(f, digits=4):
    """
    Format a float for display with at most ``digits`` decimal places.

    The value is rounded, converted with ``str`` and the fractional part is
    cut after ``digits`` characters.  When ``str`` produces scientific
    notation (e.g. ``1.2345678e+20``) only the mantissa is shortened and the
    exponent is kept; cutting the whole string would silently drop the
    exponent and display a different number.

    :param f: the number to format.
    :param digits: maximum number of digits after the decimal point.
    :return: the formatted string.
    """
    s = str(round(f, digits))
    # Split off an exponent suffix, if any, so it survives the truncation.
    mantissa, marker, exponent = s.partition('e')
    if '.' in mantissa:
        mantissa = mantissa[:mantissa.index('.') + 1 + digits]
    return mantissa + marker + exponent
0126
0127
def _format_float_list(l):
    """Apply :func:`_format_float` to every element of ``l``; return a list of strings."""
    return list(map(_format_float, l))
0130
0131
def _double_to_long_bits(value):
    """
    Return the bit pattern of ``value``, packed as a C double, reinterpreted
    as an unsigned 64-bit integer.

    Every NaN input is first replaced by ``float('nan')``, so all NaNs map
    to the same bits.
    """
    canonical = float('nan') if np.isnan(value) else value
    packed = struct.pack('d', canonical)
    (bits,) = struct.unpack('Q', packed)
    return bits
0137
0138
class VectorUDT(UserDefinedType):
    """
    SQL user-defined type (UDT) for Vector.
    """

    @classmethod
    def sqlType(cls):
        """Struct schema used to store a vector: type tag, size, indices, values."""
        fields = [
            StructField("type", ByteType(), False),
            StructField("size", IntegerType(), True),
            StructField("indices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True),
        ]
        return StructType(fields)

    @classmethod
    def module(cls):
        """Name of the Python module that defines this UDT."""
        return "pyspark.mllib.linalg"

    @classmethod
    def scalaUDT(cls):
        """Fully qualified name of the Scala counterpart of this UDT."""
        return "org.apache.spark.mllib.linalg.VectorUDT"

    def serialize(self, obj):
        """
        Turn a vector into a ``(type, size, indices, values)`` tuple.

        Sparse vectors use type tag 0; dense vectors use type tag 1 and
        carry ``None`` for size and indices.

        :raises TypeError: if ``obj`` is neither a SparseVector nor a DenseVector.
        """
        if isinstance(obj, SparseVector):
            index_list = list(map(int, obj.indices))
            value_list = list(map(float, obj.values))
            return (0, obj.size, index_list, value_list)
        if isinstance(obj, DenseVector):
            return (1, None, None, list(map(float, obj)))
        raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))

    def deserialize(self, datum):
        """
        Rebuild a vector from a 4-element row produced by :meth:`serialize`.

        :raises ValueError: if the type tag is neither 0 nor 1.
        """
        assert len(datum) == 4, \
            "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
        tpe = datum[0]
        if tpe == 0:
            return SparseVector(datum[1], datum[2], datum[3])
        if tpe == 1:
            return DenseVector(datum[3])
        raise ValueError("do not recognize type %r" % tpe)

    def simpleString(self):
        """Short name of this type."""
        return "vector"
0184
0185
class MatrixUDT(UserDefinedType):
    """
    SQL user-defined type (UDT) for Matrix.
    """

    @classmethod
    def sqlType(cls):
        """Struct schema used to store a matrix (dense or sparse)."""
        fields = [
            StructField("type", ByteType(), False),
            StructField("numRows", IntegerType(), False),
            StructField("numCols", IntegerType(), False),
            StructField("colPtrs", ArrayType(IntegerType(), False), True),
            StructField("rowIndices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True),
            StructField("isTransposed", BooleanType(), False),
        ]
        return StructType(fields)

    @classmethod
    def module(cls):
        """Name of the Python module that defines this UDT."""
        return "pyspark.mllib.linalg"

    @classmethod
    def scalaUDT(cls):
        """Fully qualified name of the Scala counterpart of this UDT."""
        return "org.apache.spark.mllib.linalg.MatrixUDT"

    def serialize(self, obj):
        """
        Turn a matrix into a 7-element tuple matching :meth:`sqlType`.

        Sparse matrices use type tag 0; dense matrices use type tag 1 and
        carry ``None`` for colPtrs and rowIndices.

        :raises TypeError: if ``obj`` is neither a SparseMatrix nor a DenseMatrix.
        """
        if isinstance(obj, SparseMatrix):
            col_ptrs = list(map(int, obj.colPtrs))
            row_indices = list(map(int, obj.rowIndices))
            value_list = list(map(float, obj.values))
            return (0, obj.numRows, obj.numCols, col_ptrs,
                    row_indices, value_list, bool(obj.isTransposed))
        if isinstance(obj, DenseMatrix):
            value_list = list(map(float, obj.values))
            return (1, obj.numRows, obj.numCols, None, None, value_list,
                    bool(obj.isTransposed))
        raise TypeError("cannot serialize type %r" % (type(obj)))

    def deserialize(self, datum):
        """
        Rebuild a matrix from a 7-element row produced by :meth:`serialize`.

        :raises ValueError: if the type tag is neither 0 nor 1.
        """
        assert len(datum) == 7, \
            "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
        tpe = datum[0]
        if tpe == 0:
            return SparseMatrix(*datum[1:])
        if tpe == 1:
            return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
        raise ValueError("do not recognize type %r" % tpe)

    def simpleString(self):
        """Short name of this type."""
        return "matrix"
0237
0238
class Vector(object):
    """
    Abstract class for DenseVector and SparseVector
    """
    # The docstring must be the first statement of the class body; placed
    # after the assignment below it was just a discarded string expression
    # and ``Vector.__doc__`` stayed None.

    __UDT__ = VectorUDT()

    def toArray(self):
        """
        Convert the vector into an numpy.ndarray

        :return: numpy.ndarray
        """
        raise NotImplementedError

    def asML(self):
        """
        Convert this vector to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.Vector`
        """
        raise NotImplementedError
0262
0263
class DenseVector(Vector):
    """
    A dense vector represented by a value array. We use numpy array for
    storage and arithmetics will be delegated to the underlying numpy
    array.

    >>> v = Vectors.dense([1.0, 2.0])
    >>> u = Vectors.dense([3.0, 4.0])
    >>> v + u
    DenseVector([4.0, 6.0])
    >>> 2 - v
    DenseVector([1.0, 0.0])
    >>> v / 2
    DenseVector([0.5, 1.0])
    >>> v * u
    DenseVector([3.0, 8.0])
    >>> u / v
    DenseVector([3.0, 2.0])
    >>> u % 2
    DenseVector([1.0, 0.0])
    >>> -v
    DenseVector([-1.0, -2.0])
    """
    def __init__(self, ar):
        """
        Build the vector from raw bytes, a NumPy array or any array-like.

        :param ar: ``bytes`` (read as a float64 buffer), a ``numpy.ndarray``,
                   or anything ``numpy.array`` accepts. Data that is not
                   already float64 is converted to float64.
        """
        if isinstance(ar, bytes):
            ar = np.frombuffer(ar, dtype=np.float64)
        elif not isinstance(ar, np.ndarray):
            ar = np.array(ar, dtype=np.float64)
        if ar.dtype != np.float64:
            ar = ar.astype(np.float64)
        self.array = ar

    @staticmethod
    def parse(s):
        """
        Parse string representation back into the DenseVector.

        >>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]')
        DenseVector([0.0, 1.0, 2.0, 3.0])

        :raises ValueError: if the brackets are missing or a value is not a float.
        """
        start = s.find('[')
        if start == -1:
            raise ValueError("Array should start with '['.")
        end = s.find(']')
        if end == -1:
            raise ValueError("Array should end with ']'.")
        s = s[start + 1: end]

        try:
            values = [float(val) for val in s.split(',') if val]
        except ValueError:
            raise ValueError("Unable to parse values from %s" % s)
        return DenseVector(values)

    def __reduce__(self):
        """
        Pickle as the constructor plus the raw float64 bytes, which
        ``__init__`` reads back with ``np.frombuffer``.
        """
        # tobytes() is the supported spelling of the deprecated tostring().
        return DenseVector, (self.array.tobytes(),)

    def numNonzeros(self):
        """
        Number of nonzero elements. This scans all active values and counts non zeros.
        """
        return np.count_nonzero(self.array)

    def norm(self, p):
        """
        Calculates the norm of a DenseVector.

        >>> a = DenseVector([0, -1, 2, -3])
        >>> a.norm(2)
        3.7...
        >>> a.norm(1)
        6.0
        """
        return np.linalg.norm(self.array, p)

    def dot(self, other):
        """
        Compute the dot product of two Vectors. We support
        (Numpy array, list, SparseVector, or SciPy sparse)
        and a target NumPy array that is either 1- or 2-dimensional.
        Equivalent to calling numpy.dot of the two vectors.

        >>> dense = DenseVector(array.array('d', [1., 2.]))
        >>> dense.dot(dense)
        5.0
        >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
        4.0
        >>> dense.dot(range(1, 3))
        5.0
        >>> dense.dot(np.array(range(1, 3)))
        5.0
        >>> dense.dot([1.,])
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
        array([ 5., 11.])
        >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        """
        if type(other) == np.ndarray:
            # Only multi-dimensional arrays are length-checked here;
            # 1-d operands are passed straight to np.dot.
            if other.ndim > 1:
                assert len(self) == other.shape[0], "dimension mismatch"
            return np.dot(self.array, other)
        elif _have_scipy and scipy.sparse.issparse(other):
            assert len(self) == other.shape[0], "dimension mismatch"
            return other.transpose().dot(self.toArray())
        else:
            assert len(self) == _vector_size(other), "dimension mismatch"
            if isinstance(other, SparseVector):
                return other.dot(self)
            elif isinstance(other, Vector):
                return np.dot(self.toArray(), other.toArray())
            else:
                return np.dot(self.toArray(), other)

    def squared_distance(self, other):
        """
        Squared distance of two Vectors.

        >>> dense1 = DenseVector(array.array('d', [1., 2.]))
        >>> dense1.squared_distance(dense1)
        0.0
        >>> dense2 = np.array([2., 1.])
        >>> dense1.squared_distance(dense2)
        2.0
        >>> dense3 = [2., 1.]
        >>> dense1.squared_distance(dense3)
        2.0
        >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
        >>> dense1.squared_distance(sparse1)
        2.0
        >>> dense1.squared_distance([1.,])
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        """
        assert len(self) == _vector_size(other), "dimension mismatch"
        # Sparse operands are handled by SparseVector.squared_distance.
        if isinstance(other, SparseVector):
            return other.squared_distance(self)
        elif _have_scipy and scipy.sparse.issparse(other):
            return _convert_to_vector(other).squared_distance(self)

        if isinstance(other, Vector):
            other = other.toArray()
        elif not isinstance(other, np.ndarray):
            other = np.array(other)
        diff = self.toArray() - other
        return np.dot(diff, diff)

    def toArray(self):
        """
        Returns the underlying numpy.ndarray (not a copy).
        """
        return self.array

    def asML(self):
        """
        Convert this vector to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.DenseVector`

        .. versionadded:: 2.0.0
        """
        return newlinalg.DenseVector(self.array)

    @property
    def values(self):
        """
        Returns the underlying NumPy array of values.
        """
        return self.array

    def __getitem__(self, item):
        return self.array[item]

    def __len__(self):
        return len(self.array)

    def __str__(self):
        return "[" + ",".join([str(v) for v in self.array]) + "]"

    def __repr__(self):
        return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))

    def __eq__(self, other):
        """
        Equal to another DenseVector with the same array contents, or to a
        SparseVector of the same size with the same non-zero entries.
        """
        if isinstance(other, DenseVector):
            return np.array_equal(self.array, other.array)
        elif isinstance(other, SparseVector):
            if len(self) != other.size:
                return False
            return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
        return False

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        """
        Hash built from the size and from the index and value bits of at
        most the first 128 non-zero entries.
        """
        size = len(self)
        result = 31 + size
        nnz = 0
        i = 0
        while i < size and nnz < 128:
            if self.array[i] != 0:
                result = 31 * result + i
                bits = _double_to_long_bits(self.array[i])
                result = 31 * result + (bits ^ (bits >> 32))
                nnz += 1
            i += 1
        return result

    def __getattr__(self, item):
        # Attributes not found on the vector are looked up on the ndarray.
        return getattr(self.array, item)

    def __neg__(self):
        return DenseVector(-self.array)

    def _delegate(op):
        # Build a binary-operator method that calls the ndarray method named
        # ``op`` (unwrapping a DenseVector operand first) and wraps the
        # result in a DenseVector.
        def func(self, other):
            if isinstance(other, DenseVector):
                other = other.array
            return DenseVector(getattr(self.array, op)(other))
        return func

    __add__ = _delegate("__add__")
    __sub__ = _delegate("__sub__")
    __mul__ = _delegate("__mul__")
    __div__ = _delegate("__div__")
    __truediv__ = _delegate("__truediv__")
    __mod__ = _delegate("__mod__")
    __radd__ = _delegate("__radd__")
    __rsub__ = _delegate("__rsub__")
    __rmul__ = _delegate("__rmul__")
    __rdiv__ = _delegate("__rdiv__")
    __rtruediv__ = _delegate("__rtruediv__")
    __rmod__ = _delegate("__rmod__")
0507
0508
class SparseVector(Vector):
    """
    A simple sparse vector class for passing data to MLlib. Users may
    alternatively pass SciPy's {scipy.sparse} data types.
    """
    def __init__(self, size, *args):
        """
        Create a sparse vector, using either a dictionary, a list of
        (index, value) pairs, or two separate arrays of indices and
        values (sorted by index).

        :param size: Size of the vector.
        :param args: Active entries, as a dictionary {index: value, ...},
          a list of tuples [(index, value), ...], or a list of strictly
          increasing indices and a list of corresponding values [index, ...],
          [value, ...]. Inactive entries are treated as zeros.

        >>> SparseVector(4, {1: 1.0, 3: 5.5})
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> SparseVector(4, [1, 3], [1.0, 5.5])
        SparseVector(4, {1: 1.0, 3: 5.5})
        """
        self.size = int(size)
        """ Size of the vector. """
        assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
        if len(args) == 1:
            pairs = args[0]
            if type(pairs) == dict:
                pairs = pairs.items()
            pairs = sorted(pairs)
            self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
            """ A list of indices corresponding to active entries. """
            self.values = np.array([p[1] for p in pairs], dtype=np.float64)
            """ A list of values corresponding to active entries. """
        else:
            if isinstance(args[0], bytes):
                # Raw buffers, as produced by __reduce__.
                assert isinstance(args[1], bytes), "values should be string too"
                if args[0]:
                    self.indices = np.frombuffer(args[0], np.int32)
                    self.values = np.frombuffer(args[1], np.float64)
                else:
                    # Empty buffer: build the empty arrays directly instead
                    # of going through np.frombuffer.
                    self.indices = np.array([], dtype=np.int32)
                    self.values = np.array([], dtype=np.float64)
            else:
                self.indices = np.array(args[0], dtype=np.int32)
                self.values = np.array(args[1], dtype=np.float64)
            assert len(self.indices) == len(self.values), "index and value arrays not same length"
            for i in xrange(len(self.indices) - 1):
                if self.indices[i] >= self.indices[i + 1]:
                    raise TypeError(
                        "Indices %s and %s are not strictly increasing"
                        % (self.indices[i], self.indices[i + 1]))

    def numNonzeros(self):
        """
        Number of nonzero elements. This scans all active values and counts non zeros.
        """
        return np.count_nonzero(self.values)

    def norm(self, p):
        """
        Calculates the norm of a SparseVector.

        >>> a = SparseVector(4, [0, 1], [3., -4.])
        >>> a.norm(1)
        7.0
        >>> a.norm(2)
        5.0
        """
        return np.linalg.norm(self.values, p)

    def __reduce__(self):
        """
        Pickle as the constructor plus the size and the raw index/value
        bytes, which ``__init__`` reads back with ``np.frombuffer``.
        """
        # tobytes() is the supported spelling of the deprecated tostring().
        return (
            SparseVector,
            (self.size, self.indices.tobytes(), self.values.tobytes()))

    @staticmethod
    def parse(s):
        """
        Parse string representation back into the SparseVector.

        >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )')
        SparseVector(4, {0: 4.0, 1: 5.0})

        :raises ValueError: if the tuple, the size or either array cannot be parsed.
        """
        start = s.find('(')
        if start == -1:
            raise ValueError("Tuple should start with '('")
        end = s.find(')')
        if end == -1:
            raise ValueError("Tuple should end with ')'")
        s = s[start + 1: end].strip()

        size = s[: s.find(',')]
        try:
            size = int(size)
        except ValueError:
            raise ValueError("Cannot parse size %s." % size)

        ind_start = s.find('[')
        if ind_start == -1:
            raise ValueError("Indices array should start with '['.")
        ind_end = s.find(']')
        if ind_end == -1:
            raise ValueError("Indices array should end with ']'")
        new_s = s[ind_start + 1: ind_end]
        ind_list = new_s.split(',')
        try:
            indices = [int(ind) for ind in ind_list if ind]
        except ValueError:
            raise ValueError("Unable to parse indices from %s." % new_s)
        # Continue after the indices array to find the values array.
        s = s[ind_end + 1:].strip()

        val_start = s.find('[')
        if val_start == -1:
            raise ValueError("Values array should start with '['.")
        val_end = s.find(']')
        if val_end == -1:
            raise ValueError("Values array should end with ']'.")
        val_list = s[val_start + 1: val_end].split(',')
        try:
            values = [float(val) for val in val_list if val]
        except ValueError:
            raise ValueError("Unable to parse values from %s." % s)
        return SparseVector(size, indices, values)

    def dot(self, other):
        """
        Dot product with a SparseVector or 1- or 2-dimensional Numpy array.

        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
        >>> a.dot(a)
        25.0
        >>> a.dot(array.array('d', [1., 2., 3., 4.]))
        22.0
        >>> b = SparseVector(4, [2], [1.0])
        >>> a.dot(b)
        0.0
        >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
        array([ 22., 22.])
        >>> a.dot([1., 2., 3.])
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> a.dot(np.array([1., 2.]))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> a.dot(DenseVector([1., 2.]))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> a.dot(np.zeros((3, 2)))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        """

        if isinstance(other, np.ndarray):
            if other.ndim not in [2, 1]:
                raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
            assert len(self) == other.shape[0], "dimension mismatch"
            # Only the rows at the active indices contribute.
            return np.dot(self.values, other[self.indices])

        assert len(self) == _vector_size(other), "dimension mismatch"

        if isinstance(other, DenseVector):
            return np.dot(other.array[self.indices], self.values)

        elif isinstance(other, SparseVector):
            # Keep only the entries whose index is active in both vectors.
            self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
            self_values = self.values[self_cmind]
            if self_values.size == 0:
                return 0.0
            else:
                other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
                return np.dot(self_values, other.values[other_cmind])

        else:
            return self.dot(_convert_to_vector(other))

    def squared_distance(self, other):
        """
        Squared distance from a SparseVector or 1-dimensional NumPy array.

        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
        >>> a.squared_distance(a)
        0.0
        >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
        11.0
        >>> a.squared_distance(np.array([1., 2., 3., 4.]))
        11.0
        >>> b = SparseVector(4, [2], [1.0])
        >>> a.squared_distance(b)
        26.0
        >>> b.squared_distance(a)
        26.0
        >>> b.squared_distance([1., 2.])
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        """
        assert len(self) == _vector_size(other), "dimension mismatch"

        if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
            if isinstance(other, np.ndarray) and other.ndim != 1:
                raise Exception("Cannot call squared_distance with %d-dimensional array" %
                                other.ndim)
            if isinstance(other, DenseVector):
                other = other.array
            # Boolean mask of the positions where this vector has an entry.
            sparse_ind = np.zeros(other.size, dtype=bool)
            sparse_ind[self.indices] = True
            dist = other[sparse_ind] - self.values
            result = np.dot(dist, dist)

            # At the remaining positions this vector is zero, so the
            # difference is just the other vector's value.
            other_ind = other[~sparse_ind]
            result += np.dot(other_ind, other_ind)
            return result

        elif isinstance(other, SparseVector):
            # Merge-walk the two sorted index arrays.
            result = 0.0
            i, j = 0, 0
            while i < len(self.indices) and j < len(other.indices):
                if self.indices[i] == other.indices[j]:
                    diff = self.values[i] - other.values[j]
                    result += diff * diff
                    i += 1
                    j += 1
                elif self.indices[i] < other.indices[j]:
                    result += self.values[i] * self.values[i]
                    i += 1
                else:
                    result += other.values[j] * other.values[j]
                    j += 1
            # Add whatever is left over in either vector.
            while i < len(self.indices):
                result += self.values[i] * self.values[i]
                i += 1
            while j < len(other.indices):
                result += other.values[j] * other.values[j]
                j += 1
            return result
        else:
            return self.squared_distance(_convert_to_vector(other))

    def toArray(self):
        """
        Returns a copy of this SparseVector as a 1-dimensional NumPy array.
        """
        arr = np.zeros((self.size,), dtype=np.float64)
        arr[self.indices] = self.values
        return arr

    def asML(self):
        """
        Convert this vector to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.SparseVector`

        .. versionadded:: 2.0.0
        """
        return newlinalg.SparseVector(self.size, self.indices, self.values)

    def __len__(self):
        return self.size

    def __str__(self):
        inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
        vals = "[" + ",".join([str(v) for v in self.values]) + "]"
        return "(" + ",".join((str(self.size), inds, vals)) + ")"

    def __repr__(self):
        inds = self.indices
        vals = self.values
        entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
                             for i in xrange(len(inds))])
        return "SparseVector({0}, {{{1}}})".format(self.size, entries)

    def __eq__(self, other):
        """
        Equal to a SparseVector with the same size, indices and values, or
        to a DenseVector of the same size with the same non-zero entries.
        """
        if isinstance(other, SparseVector):
            return other.size == self.size and np.array_equal(other.indices, self.indices) \
                and np.array_equal(other.values, self.values)
        elif isinstance(other, DenseVector):
            if self.size != len(other):
                return False
            return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
        return False

    def __getitem__(self, index):
        """
        Return the value at ``index`` (negative indices count from the end);
        positions without an active entry yield ``0.``.

        :raises TypeError: if ``index`` is not an ``int``.
        :raises IndexError: if ``index`` is outside ``[-size, size)``.
        """
        inds = self.indices
        vals = self.values
        if not isinstance(index, int):
            raise TypeError(
                "Indices must be of type integer, got type %s" % type(index))

        if index >= self.size or index < -self.size:
            raise IndexError("Index %d out of bounds." % index)
        if index < 0:
            index += self.size

        # Nothing stored, or the position lies past the last active index.
        if (inds.size == 0) or (index > inds.item(-1)):
            return 0.

        insert_index = np.searchsorted(inds, index)
        row_ind = inds[insert_index]
        if row_ind == index:
            return vals[insert_index]
        return 0.

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        """
        Hash built from the size and from the index and value bits of at
        most the first 128 non-zero stored entries.
        """
        result = 31 + self.size
        nnz = 0
        i = 0
        while i < len(self.values) and nnz < 128:
            if self.values[i] != 0:
                result = 31 * result + int(self.indices[i])
                bits = _double_to_long_bits(self.values[i])
                result = 31 * result + (bits ^ (bits >> 32))
                nnz += 1
            i += 1
        return result
0840
0841
class Vectors(object):

    """
    Factory methods for working with vectors.

    .. note:: Dense vectors are simply represented as NumPy array objects,
        so there is no need to convert them for use in MLlib. For sparse vectors,
        the factory methods in this class create an MLlib-compatible type, or users
        can pass in SciPy's `scipy.sparse` column vectors.
    """

    @staticmethod
    def sparse(size, *args):
        """
        Create a sparse vector, using either a dictionary, a list of
        (index, value) pairs, or two separate arrays of indices and
        values (sorted by index).

        :param size: Size of the vector.
        :param args: Non-zero entries, as a dictionary, list of tuples,
                     or two sorted lists containing indices and values.

        >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
        SparseVector(4, {1: 1.0, 3: 5.5})
        """
        return SparseVector(size, *args)

    @staticmethod
    def dense(*elements):
        """
        Create a dense vector of 64-bit floats from a Python list or numbers.

        >>> Vectors.dense([1, 2, 3])
        DenseVector([1.0, 2.0, 3.0])
        >>> Vectors.dense(1.0, 2.0)
        DenseVector([1.0, 2.0])
        """
        lone_argument = len(elements) == 1
        if lone_argument and not isinstance(elements[0], (float, int, long)):
            # A single non-numeric argument is itself the sequence of values.
            elements = elements[0]
        return DenseVector(elements)

    @staticmethod
    def fromML(vec):
        """
        Convert a vector from the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :param vec: a :py:class:`pyspark.ml.linalg.Vector`
        :return: a :py:class:`pyspark.mllib.linalg.Vector`

        .. versionadded:: 2.0.0
        """
        if isinstance(vec, newlinalg.DenseVector):
            return DenseVector(vec.array)
        if isinstance(vec, newlinalg.SparseVector):
            return SparseVector(vec.size, vec.indices, vec.values)
        raise TypeError("Unsupported vector type %s" % type(vec))

    @staticmethod
    def stringify(vector):
        """
        Converts a vector into a string, which can be recognized by
        Vectors.parse().

        >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0]))
        '(2,[1],[1.0])'
        >>> Vectors.stringify(Vectors.dense([0.0, 1.0]))
        '[0.0,1.0]'
        """
        return str(vector)

    @staticmethod
    def squared_distance(v1, v2):
        """
        Squared distance between two vectors.
        v1 and v2 can be of type SparseVector, DenseVector, np.ndarray
        or array.array.

        >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
        >>> b = Vectors.dense([2, 5, 4, 1])
        >>> a.squared_distance(b)
        51.0
        """
        left = _convert_to_vector(v1)
        right = _convert_to_vector(v2)
        return left.squared_distance(right)

    @staticmethod
    def norm(vector, p):
        """
        Find norm of the given vector.
        """
        converted = _convert_to_vector(vector)
        return converted.norm(p)

    @staticmethod
    def parse(s):
        """Parse a string representation back into the Vector.

        >>> Vectors.parse('[2,1,2 ]')
        DenseVector([2.0, 1.0, 2.0])
        >>> Vectors.parse(' ( 100, [0], [2])')
        SparseVector(100, {0: 2.0})
        """
        # A '(' anywhere marks the sparse format; otherwise a '[' marks dense.
        if s.find('(') != -1:
            return SparseVector.parse(s)
        if s.find('[') != -1:
            return DenseVector.parse(s)
        raise ValueError(
            "Cannot find tokens '[' or '(' from the input string.")

    @staticmethod
    def zeros(size):
        """Create a dense vector of ``size`` zeros."""
        return DenseVector(np.zeros(size))

    @staticmethod
    def _equals(v1_indices, v1_values, v2_indices, v2_values):
        """
        Check equality between sparse/dense vectors,
        v1_indices and v2_indices assume to be strictly increasing.
        """
        n1 = len(v1_values)
        n2 = len(v2_values)
        k1 = k2 = 0
        while True:
            # Explicit zeros never take part in the comparison.
            while k1 < n1 and v1_values[k1] == 0:
                k1 += 1
            while k2 < n2 and v2_values[k2] == 0:
                k2 += 1

            # Equal only if both sides ran out of non-zeros together.
            if k1 >= n1 or k2 >= n2:
                return k1 >= n1 and k2 >= n2

            same = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
            if not same:
                return same
            k1 += 1
            k2 += 1
0986
0987
class Matrix(object):
    """
    Represents a local matrix.
    """
    # The docstring must be the first statement of the class body; placed
    # after the assignment below it was just a discarded string expression
    # and ``Matrix.__doc__`` stayed None.

    __UDT__ = MatrixUDT()

    def __init__(self, numRows, numCols, isTransposed=False):
        """
        :param numRows: number of rows.
        :param numCols: number of columns.
        :param isTransposed: stored as-is on the instance; see the
                             subclasses for how it affects the value layout.
        """
        self.numRows = numRows
        self.numCols = numCols
        self.isTransposed = isTransposed

    def toArray(self):
        """
        Returns its elements in a NumPy ndarray.
        """
        raise NotImplementedError

    def asML(self):
        """
        Convert this matrix to the new mllib-local representation.
        This does NOT copy the data; it copies references.
        """
        raise NotImplementedError

    @staticmethod
    def _convert_to_array(array_like, dtype):
        """
        Convert Matrix attributes which are array-like or buffer to array.

        ``bytes`` input is read with ``np.frombuffer``; anything else goes
        through ``np.asarray``.
        """
        if isinstance(array_like, bytes):
            return np.frombuffer(array_like, dtype=dtype)
        return np.asarray(array_like, dtype=dtype)
1021
1022
class DenseMatrix(Matrix):
    """
    Column-major dense matrix.

    :param numRows: number of rows.
    :param numCols: number of columns.
    :param values: the ``numRows * numCols`` entries as a flat array-like, or
                   a ``bytes`` buffer holding float64 values.
    :param isTransposed: if true, ``values`` is interpreted in row-major
                         instead of column-major order.
    """
    def __init__(self, numRows, numCols, values, isTransposed=False):
        Matrix.__init__(self, numRows, numCols, isTransposed)
        values = self._convert_to_array(values, np.float64)
        assert len(values) == numRows * numCols
        self.values = values

    def __reduce__(self):
        """
        Pickle support: the matrix is rebuilt from its dimensions, the raw
        bytes of ``values`` and the transposition flag.
        """
        # ndarray.tostring() is deprecated and was removed in NumPy 2.0;
        # tobytes() returns exactly the same bytes.
        return DenseMatrix, (
            self.numRows, self.numCols, self.values.tobytes(),
            int(self.isTransposed))

    def __str__(self):
        """
        Pretty printing of a DenseMatrix

        >>> dm = DenseMatrix(2, 2, range(4))
        >>> print(dm)
        DenseMatrix([[ 0.,  2.],
                     [ 1.,  3.]])
        >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
        >>> print(dm)
        DenseMatrix([[ 0.,  1.],
                     [ 2.,  3.]])
        """
        # Start from NumPy's own repr, one string per printed row.
        array_lines = repr(self.toArray()).splitlines()

        # 'DenseMatrix' is six characters longer than 'array', so every
        # continuation line is shifted right by six spaces to stay aligned.
        x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
        return array_lines[0].replace("array", "DenseMatrix") + "\n" + x

    def __repr__(self):
        """
        Representation of a DenseMatrix

        >>> dm = DenseMatrix(2, 2, range(4))
        >>> dm
        DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
        """
        # Up to sixteen values are shown in full; longer matrices show the
        # first eight and the last eight values around an ellipsis.
        if len(self.values) < 17:
            entries = _format_float_list(self.values)
        else:
            entries = (
                _format_float_list(self.values[:8]) +
                ["..."] +
                _format_float_list(self.values[-8:])
            )

        entries = ", ".join(entries)
        return "DenseMatrix({0}, {1}, [{2}], {3})".format(
            self.numRows, self.numCols, entries, self.isTransposed)

    def toArray(self):
        """
        Return an numpy.ndarray

        >>> m = DenseMatrix(2, 2, range(4))
        >>> m.toArray()
        array([[ 0.,  2.],
               [ 1.,  3.]])
        """
        if self.isTransposed:
            # values is row-major here: reshape in C order, then hand back a
            # Fortran-ordered array like the non-transposed branch does.
            return np.asfortranarray(
                self.values.reshape((self.numRows, self.numCols)))
        else:
            return self.values.reshape((self.numRows, self.numCols), order='F')

    def toSparse(self):
        """Convert to SparseMatrix"""
        if self.isTransposed:
            # Bring the entries into column-major order first.
            values = np.ravel(self.toArray(), order='F')
        else:
            values = self.values
        # Flat (column-major) positions of the non-zero entries.
        indices = np.nonzero(values)[0]
        # Non-zeros per column; bincount stops at the last non-empty column,
        # so trailing empty columns are padded with zero counts.
        colCounts = np.bincount(indices // self.numRows)
        colPtrs = np.cumsum(np.hstack(
            (0, colCounts, np.zeros(self.numCols - colCounts.size))))
        values = values[indices]
        rowIndices = indices % self.numRows

        return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)

    def asML(self):
        """
        Convert this matrix to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.DenseMatrix`

        .. versionadded:: 2.0.0
        """
        return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed)

    def __getitem__(self, indices):
        """
        Return the entry at ``(i, j)``.

        :param indices: a ``(row, column)`` pair of non-negative indices.
        :raises IndexError: if either index is outside the matrix.
        """
        i, j = indices
        if i < 0 or i >= self.numRows:
            raise IndexError("Row index %d is out of range [0, %d)"
                             % (i, self.numRows))
        if j >= self.numCols or j < 0:
            raise IndexError("Column index %d is out of range [0, %d)"
                             % (j, self.numCols))

        if self.isTransposed:
            # Row-major layout.
            return self.values[i * self.numCols + j]
        else:
            # Column-major layout.
            return self.values[i + j * self.numRows]

    def __eq__(self, other):
        """
        Element-wise equality with another dense or sparse matrix.

        Matrices of different shape are never equal. Objects that do not
        look like a matrix yield ``NotImplemented`` so that e.g.
        ``matrix == None`` evaluates to ``False`` instead of raising.
        """
        if not all(hasattr(other, attr)
                   for attr in ("numRows", "numCols", "toArray")):
            return NotImplemented
        if (self.numRows != other.numRows or self.numCols != other.numCols):
            return False
        # Shapes agree, so one element-wise comparison covers both dense and
        # sparse operands.
        return np.all(self.toArray() == other.toArray())
1146
1147
class SparseMatrix(Matrix):
    """
    Sparse Matrix stored in CSC format.

    :param numRows: number of rows.
    :param numCols: number of columns.
    :param colPtrs: for each column, the offset into ``rowIndices`` and
                    ``values`` where that column starts; ``numCols + 1``
                    entries (``numRows + 1`` when ``isTransposed``).
    :param rowIndices: row index of every stored entry.
    :param values: the stored entries, same length as ``rowIndices``.
    :param isTransposed: if true, the arrays are read as CSR instead, i.e.
                         ``colPtrs`` delimits rows and ``rowIndices`` holds
                         column indices.

    Each of ``colPtrs``, ``rowIndices`` and ``values`` may be an array-like
    or a ``bytes`` buffer.

    :raises ValueError: if ``colPtrs`` has the wrong size or ``rowIndices``
                        and ``values`` differ in length.
    """
    def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
                 isTransposed=False):
        Matrix.__init__(self, numRows, numCols, isTransposed)
        self.colPtrs = self._convert_to_array(colPtrs, np.int32)
        self.rowIndices = self._convert_to_array(rowIndices, np.int32)
        self.values = self._convert_to_array(values, np.float64)

        if self.isTransposed:
            if self.colPtrs.size != numRows + 1:
                raise ValueError("Expected colPtrs of size %d, got %d."
                                 % (numRows + 1, self.colPtrs.size))
        else:
            if self.colPtrs.size != numCols + 1:
                raise ValueError("Expected colPtrs of size %d, got %d."
                                 % (numCols + 1, self.colPtrs.size))
        if self.rowIndices.size != self.values.size:
            # Expected length is that of values; the actual one is rowIndices'.
            raise ValueError("Expected rowIndices of length %d, got %d."
                             % (self.values.size, self.rowIndices.size))

    def __str__(self):
        """
        Pretty printing of a SparseMatrix

        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        >>> print(sm1)
        2 X 2 CSCMatrix
        (0,0) 2.0
        (1,0) 3.0
        (1,1) 4.0
        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
        >>> print(sm1)
        2 X 2 CSRMatrix
        (0,0) 2.0
        (0,1) 3.0
        (1,1) 4.0
        """
        spstr = "{0} X {1} ".format(self.numRows, self.numCols)
        if self.isTransposed:
            spstr += "CSRMatrix\n"
        else:
            spstr += "CSCMatrix\n"

        cur_col = 0
        # Index of the last column (row, when transposed) described by colPtrs.
        last_col = self.colPtrs.size - 2
        smlist = []

        # At most the first 16 stored entries are displayed.
        zipindval = zip(self.rowIndices[:16], self.values[:16])
        for i, (rowInd, value) in enumerate(zipindval):
            # Skip over every column that ends at or before entry i; a loop is
            # required because empty columns contribute no entries at all.
            while cur_col < last_col and self.colPtrs[cur_col + 1] <= i:
                cur_col += 1
            if self.isTransposed:
                smlist.append('({0},{1}) {2}'.format(
                    cur_col, rowInd, _format_float(value)))
            else:
                smlist.append('({0},{1}) {2}'.format(
                    rowInd, cur_col, _format_float(value)))
        spstr += "\n".join(smlist)

        if len(self.values) > 16:
            spstr += "\n.." * 2
        return spstr

    def __repr__(self):
        """
        Representation of a SparseMatrix

        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
        >>> sm1
        SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
        """
        rowIndices = list(self.rowIndices)
        colPtrs = list(self.colPtrs)

        # More than 16 entries: keep only the first and last eight of each
        # array, separated by an ellipsis.
        if len(self.values) <= 16:
            values = _format_float_list(self.values)

        else:
            values = (
                _format_float_list(self.values[:8]) +
                ["..."] +
                _format_float_list(self.values[-8:])
            )
            rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]

        if len(self.colPtrs) > 16:
            colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]

        values = ", ".join(values)
        rowIndices = ", ".join([str(ind) for ind in rowIndices])
        colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
        return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
            self.numRows, self.numCols, colPtrs, rowIndices,
            values, self.isTransposed)

    def __reduce__(self):
        """
        Pickle support: the matrix is rebuilt from its dimensions, the raw
        bytes of its three arrays and the transposition flag.
        """
        # ndarray.tostring() is deprecated and was removed in NumPy 2.0;
        # tobytes() returns exactly the same bytes.
        return SparseMatrix, (
            self.numRows, self.numCols, self.colPtrs.tobytes(),
            self.rowIndices.tobytes(), self.values.tobytes(),
            int(self.isTransposed))

    def __getitem__(self, indices):
        """
        Return the entry at ``(i, j)``, or ``0.0`` if it is not stored.

        :param indices: a ``(row, column)`` pair of non-negative indices.
        :raises IndexError: if either index is outside the matrix.
        """
        i, j = indices
        if i < 0 or i >= self.numRows:
            raise IndexError("Row index %d is out of range [0, %d)"
                             % (i, self.numRows))
        if j < 0 or j >= self.numCols:
            raise IndexError("Column index %d is out of range [0, %d)"
                             % (j, self.numCols))

        # For a CSR matrix colPtrs is indexed by row and rowIndices holds
        # column indices, so swapping i and j lets the CSC lookup below
        # serve both layouts.
        if self.isTransposed:
            j, i = i, j

        colStart = self.colPtrs[j]
        colEnd = self.colPtrs[j + 1]
        nz = self.rowIndices[colStart: colEnd]
        # Binary search; relies on the indices within a column being sorted.
        ind = np.searchsorted(nz, i) + colStart
        if ind < colEnd and self.rowIndices[ind] == i:
            return self.values[ind]
        else:
            return 0.0

    def toArray(self):
        """
        Return an numpy.ndarray
        """
        A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
        for k in xrange(self.colPtrs.size - 1):
            startptr = self.colPtrs[k]
            endptr = self.colPtrs[k + 1]
            if self.isTransposed:
                # CSR: segment k holds the entries of row k.
                A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
            else:
                # CSC: segment k holds the entries of column k.
                A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
        return A

    def toDense(self):
        """Convert to a (non-transposed) DenseMatrix."""
        densevals = np.ravel(self.toArray(), order='F')
        return DenseMatrix(self.numRows, self.numCols, densevals)

    def asML(self):
        """
        Convert this matrix to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.SparseMatrix`

        .. versionadded:: 2.0.0
        """
        return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices,
                                      self.values, self.isTransposed)

    def __eq__(self, other):
        """
        Element-wise equality with another dense or sparse matrix.

        Matrices of different shape are never equal. Objects without a
        ``toArray`` method yield ``NotImplemented`` so that e.g.
        ``matrix == None`` evaluates to ``False`` instead of raising.
        """
        to_array = getattr(other, "toArray", None)
        if to_array is None:
            return NotImplemented
        mine = self.toArray()
        theirs = to_array()
        # Without this check NumPy would broadcast, e.g. a 1 x n matrix
        # against an m x n one, and could report them as equal.
        if np.shape(mine) != np.shape(theirs):
            return False
        return np.all(mine == theirs)
1310
1311
class Matrices(object):
    """
    Factory methods for :class:`DenseMatrix` and :class:`SparseMatrix`.
    """

    @staticmethod
    def dense(numRows, numCols, values):
        """
        Build a :class:`DenseMatrix` from its dimensions and its values.
        """
        return DenseMatrix(numRows, numCols, values)

    @staticmethod
    def sparse(numRows, numCols, colPtrs, rowIndices, values):
        """
        Build a :class:`SparseMatrix` from its dimensions and its
        column pointers, row indices and values.
        """
        return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)

    @staticmethod
    def fromML(mat):
        """
        Convert a matrix from the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
        :return: a :py:class:`pyspark.mllib.linalg.Matrix`

        .. versionadded:: 2.0.0
        """
        if isinstance(mat, newlinalg.DenseMatrix):
            return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
        if isinstance(mat, newlinalg.SparseMatrix):
            return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
                                mat.values, mat.isTransposed)
        # Neither of the two known matrix kinds.
        raise TypeError("Unsupported matrix type %s" % type(mat))
1345
1346
class QRDecomposition(object):
    """
    Holder for the two factors of a QR decomposition.

    :param Q: the orthogonal factor; may be None if it was not computed.
    :param R: the upper triangular factor.
    """
    def __init__(self, Q, R):
        self._Q, self._R = Q, R

    @property
    @since('2.0.0')
    def Q(self):
        """
        The orthogonal matrix Q of the QR decomposition.
        May be None if it was not computed.
        """
        return self._Q

    @property
    @since('2.0.0')
    def R(self):
        """
        The upper triangular matrix R of the QR decomposition.
        """
        return self._R
1371
1372
def _test():
    """
    Run the doctests via ``doctest.testmod`` and exit with status -1 if
    any of them fails.
    """
    import doctest
    import numpy
    try:
        # The doctests expect the array layout printed before NumPy 1.14.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        # Presumably an older NumPy without the 'legacy' option, which
        # already prints the expected layout -- nothing to configure.
        pass
    failed = doctest.testmod(optionflags=doctest.ELLIPSIS)[0]
    if failed:
        sys.exit(-1)
1384
# Run the doctests only when this file is executed directly as a script.
if __name__ == "__main__":
    _test()