ml/linalg/__init__.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 """
0019 MLlib utilities for linear algebra. For dense vectors, MLlib
0020 uses the NumPy `array` type, so you can simply pass NumPy arrays
0021 around. For sparse vectors, users can construct a :class:`SparseVector`
0022 object from MLlib or pass SciPy `scipy.sparse` column vectors if
0023 SciPy is available in their environment.
0024 """
0025
0026 import sys
0027 import array
0028 import struct
0029
0030 if sys.version >= '3':
0031     basestring = str
0032     xrange = range
0033     import copyreg as copy_reg
0034     long = int
0035 else:
0036     from itertools import izip as zip
0037     import copy_reg
0038
0039 import numpy as np
0040
0041 from pyspark import since
0042 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
0043     IntegerType, ByteType, BooleanType
0044
0045
0046 __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
0047            'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices']
0048
0049
0050 if sys.version_info[:2] == (2, 7):
0051     # speed up pickling array in Python 2.7
0052     def fast_pickle_array(ar):
0053         return array.array, (ar.typecode, ar.tostring())
0054     copy_reg.pickle(array.array, fast_pickle_array)
0055
0056
0057 # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
0058 # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
0059
0060 try:
0061     import scipy.sparse
0062     _have_scipy = True
0063 except:
0064     # No SciPy in environment, but that's okay
0065     _have_scipy = False
0066
0067
0068 def _convert_to_vector(l):
0069     if isinstance(l, Vector):
0070         return l
0071     elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
0072         return DenseVector(l)
0073     elif _have_scipy and scipy.sparse.issparse(l):
0074         assert l.shape[1] == 1, "Expected column vector"
0075         # Make sure the converted csc_matrix has sorted indices.
0076         csc = l.tocsc()
0077         if not csc.has_sorted_indices:
0078             csc.sort_indices()
0079         return SparseVector(l.shape[0], csc.indices, csc.data)
0080     else:
0081         raise TypeError("Cannot convert type %s into Vector" % type(l))
0082
0083
0084 def _vector_size(v):
0085     """
0086     Returns the size of the vector.
0087
0088     >>> _vector_size([1., 2., 3.])
0089     3
0090     >>> _vector_size((1., 2., 3.))
0091     3
0092     >>> _vector_size(array.array('d', [1., 2., 3.]))
0093     3
0094     >>> _vector_size(np.zeros(3))
0095     3
0096     >>> _vector_size(np.zeros((3, 1)))
0097     3
0098     >>> _vector_size(np.zeros((1, 3)))
0099     Traceback (most recent call last):
0100         ...
0101     ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
0102     """
0103     if isinstance(v, Vector):
0104         return len(v)
0105     elif type(v) in (array.array, list, tuple, xrange):
0106         return len(v)
0107     elif type(v) == np.ndarray:
0108         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
0109             return len(v)
0110         else:
0111             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
0112     elif _have_scipy and scipy.sparse.issparse(v):
0113         assert v.shape[1] == 1, "Expected column vector"
0114         return v.shape[0]
0115     else:
0116         raise TypeError("Cannot treat type %s as a vector" % type(v))
0117
0118
0119 def _format_float(f, digits=4):
0120     s = str(round(f, digits))
0121     if '.' in s:
0122         s = s[:s.index('.') + 1 + digits]
0123     return s
0124
0125
0126 def _format_float_list(l):
0127     return [_format_float(x) for x in l]
0128
0129
0130 def _double_to_long_bits(value):
0131     if np.isnan(value):
0132         value = float('nan')
0133     # pack double into 64 bits, then unpack as long int
0134     return struct.unpack('Q', struct.pack('d', value))[0]
0135
0136
0137 class VectorUDT(UserDefinedType):
0138     """
0139     SQL user-defined type (UDT) for Vector.
0140     """
0141
0142     @classmethod
0143     def sqlType(cls):
0144         return StructType([
0145             StructField("type", ByteType(), False),
0146             StructField("size", IntegerType(), True),
0147             StructField("indices", ArrayType(IntegerType(), False), True),
0148             StructField("values", ArrayType(DoubleType(), False), True)])
0149
0150     @classmethod
0151     def module(cls):
0152         return "pyspark.ml.linalg"
0153
0154     @classmethod
0155     def scalaUDT(cls):
0156         return "org.apache.spark.ml.linalg.VectorUDT"
0157
0158     def serialize(self, obj):
0159         if isinstance(obj, SparseVector):
0160             indices = [int(i) for i in obj.indices]
0161             values = [float(v) for v in obj.values]
0162             return (0, obj.size, indices, values)
0163         elif isinstance(obj, DenseVector):
0164             values = [float(v) for v in obj]
0165             return (1, None, None, values)
0166         else:
0167             raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
0168
0169     def deserialize(self, datum):
0170         assert len(datum) == 4, \
0171             "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
0172         tpe = datum[0]
0173         if tpe == 0:
0174             return SparseVector(datum[1], datum[2], datum[3])
0175         elif tpe == 1:
0176             return DenseVector(datum[3])
0177         else:
0178             raise ValueError("do not recognize type %r" % tpe)
0179
0180     def simpleString(self):
0181         return "vector"
0182
0183
0184 class MatrixUDT(UserDefinedType):
0185     """
0186     SQL user-defined type (UDT) for Matrix.
0187     """
0188
0189     @classmethod
0190     def sqlType(cls):
0191         return StructType([
0192             StructField("type", ByteType(), False),
0193             StructField("numRows", IntegerType(), False),
0194             StructField("numCols", IntegerType(), False),
0195             StructField("colPtrs", ArrayType(IntegerType(), False), True),
0196             StructField("rowIndices", ArrayType(IntegerType(), False), True),
0197             StructField("values", ArrayType(DoubleType(), False), True),
0198             StructField("isTransposed", BooleanType(), False)])
0199
0200     @classmethod
0201     def module(cls):
0202         return "pyspark.ml.linalg"
0203
0204     @classmethod
0205     def scalaUDT(cls):
0206         return "org.apache.spark.ml.linalg.MatrixUDT"
0207
0208     def serialize(self, obj):
0209         if isinstance(obj, SparseMatrix):
0210             colPtrs = [int(i) for i in obj.colPtrs]
0211             rowIndices = [int(i) for i in obj.rowIndices]
0212             values = [float(v) for v in obj.values]
0213             return (0, obj.numRows, obj.numCols, colPtrs,
0214                     rowIndices, values, bool(obj.isTransposed))
0215         elif isinstance(obj, DenseMatrix):
0216             values = [float(v) for v in obj.values]
0217             return (1, obj.numRows, obj.numCols, None, None, values,
0218                     bool(obj.isTransposed))
0219         else:
0220             raise TypeError("cannot serialize type %r" % (type(obj)))
0221
0222     def deserialize(self, datum):
0223         assert len(datum) == 7, \
0224             "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
0225         tpe = datum[0]
0226         if tpe == 0:
0227             return SparseMatrix(*datum[1:])
0228         elif tpe == 1:
0229             return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
0230         else:
0231             raise ValueError("do not recognize type %r" % tpe)
0232
0233     def simpleString(self):
0234         return "matrix"
0235
0236
0237 class Vector(object):
0238
0239     __UDT__ = VectorUDT()
0240
0241     """
0242     Abstract class for DenseVector and SparseVector
0243     """
0244     def toArray(self):
0245         """
0246         Convert the vector into an numpy.ndarray
0247
0248         :return: numpy.ndarray
0249         """
0250         raise NotImplementedError
0251
0252
0253 class DenseVector(Vector):
0254     """
0255     A dense vector represented by a value array. We use numpy array for
0256     storage and arithmetics will be delegated to the underlying numpy
0257     array.
0258
0259     >>> v = Vectors.dense([1.0, 2.0])
0260     >>> u = Vectors.dense([3.0, 4.0])
0261     >>> v + u
0262     DenseVector([4.0, 6.0])
0263     >>> 2 - v
0264     DenseVector([1.0, 0.0])
0265     >>> v / 2
0266     DenseVector([0.5, 1.0])
0267     >>> v * u
0268     DenseVector([3.0, 8.0])
0269     >>> u / v
0270     DenseVector([3.0, 2.0])
0271     >>> u % 2
0272     DenseVector([1.0, 0.0])
0273     >>> -v
0274     DenseVector([-1.0, -2.0])
0275     """
0276     def __init__(self, ar):
0277         if isinstance(ar, bytes):
0278             ar = np.frombuffer(ar, dtype=np.float64)
0279         elif not isinstance(ar, np.ndarray):
0280             ar = np.array(ar, dtype=np.float64)
0281         if ar.dtype != np.float64:
0282             ar = ar.astype(np.float64)
0283         self.array = ar
0284
0285     def __reduce__(self):
0286         return DenseVector, (self.array.tostring(),)
0287
0288     def numNonzeros(self):
0289         """
0290         Number of nonzero elements. This scans all active values and count non zeros
0291         """
0292         return np.count_nonzero(self.array)
0293
0294     def norm(self, p):
0295         """
0296         Calculates the norm of a DenseVector.
0297
0298         >>> a = DenseVector([0, -1, 2, -3])
0299         >>> a.norm(2)
0300         3.7...
0301         >>> a.norm(1)
0302         6.0
0303         """
0304         return np.linalg.norm(self.array, p)
0305
0306     def dot(self, other):
0307         """
0308         Compute the dot product of two Vectors. We support
0309         (Numpy array, list, SparseVector, or SciPy sparse)
0310         and a target NumPy array that is either 1- or 2-dimensional.
0311         Equivalent to calling numpy.dot of the two vectors.
0312
0313         >>> dense = DenseVector(array.array('d', [1., 2.]))
0314         >>> dense.dot(dense)
0315         5.0
0316         >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
0317         4.0
0318         >>> dense.dot(range(1, 3))
0319         5.0
0320         >>> dense.dot(np.array(range(1, 3)))
0321         5.0
0322         >>> dense.dot([1.,])
0323         Traceback (most recent call last):
0324             ...
0325         AssertionError: dimension mismatch
0326         >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
0327         array([  5.,  11.])
0328         >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
0329         Traceback (most recent call last):
0330             ...
0331         AssertionError: dimension mismatch
0332         """
0333         if type(other) == np.ndarray:
0334             if other.ndim > 1:
0335                 assert len(self) == other.shape[0], "dimension mismatch"
0336             return np.dot(self.array, other)
0337         elif _have_scipy and scipy.sparse.issparse(other):
0338             assert len(self) == other.shape[0], "dimension mismatch"
0339             return other.transpose().dot(self.toArray())
0340         else:
0341             assert len(self) == _vector_size(other), "dimension mismatch"
0342             if isinstance(other, SparseVector):
0343                 return other.dot(self)
0344             elif isinstance(other, Vector):
0345                 return np.dot(self.toArray(), other.toArray())
0346             else:
0347                 return np.dot(self.toArray(), other)
0348
0349     def squared_distance(self, other):
0350         """
0351         Squared distance of two Vectors.
0352
0353         >>> dense1 = DenseVector(array.array('d', [1., 2.]))
0354         >>> dense1.squared_distance(dense1)
0355         0.0
0356         >>> dense2 = np.array([2., 1.])
0357         >>> dense1.squared_distance(dense2)
0358         2.0
0359         >>> dense3 = [2., 1.]
0360         >>> dense1.squared_distance(dense3)
0361         2.0
0362         >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
0363         >>> dense1.squared_distance(sparse1)
0364         2.0
0365         >>> dense1.squared_distance([1.,])
0366         Traceback (most recent call last):
0367             ...
0368         AssertionError: dimension mismatch
0369         >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
0370         Traceback (most recent call last):
0371             ...
0372         AssertionError: dimension mismatch
0373         """
0374         assert len(self) == _vector_size(other), "dimension mismatch"
0375         if isinstance(other, SparseVector):
0376             return other.squared_distance(self)
0377         elif _have_scipy and scipy.sparse.issparse(other):
0378             return _convert_to_vector(other).squared_distance(self)
0379
0380         if isinstance(other, Vector):
0381             other = other.toArray()
0382         elif not isinstance(other, np.ndarray):
0383             other = np.array(other)
0384         diff = self.toArray() - other
0385         return np.dot(diff, diff)
0386
0387     def toArray(self):
0388         """
0389         Returns the underlying numpy.ndarray
0390         """
0391         return self.array
0392
0393     @property
0394     def values(self):
0395         """
0396         Returns the underlying numpy.ndarray
0397         """
0398         return self.array
0399
0400     def __getitem__(self, item):
0401         return self.array[item]
0402
0403     def __len__(self):
0404         return len(self.array)
0405
0406     def __str__(self):
0407         return "[" + ",".join([str(v) for v in self.array]) + "]"
0408
0409     def __repr__(self):
0410         return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
0411
0412     def __eq__(self, other):
0413         if isinstance(other, DenseVector):
0414             return np.array_equal(self.array, other.array)
0415         elif isinstance(other, SparseVector):
0416             if len(self) != other.size:
0417                 return False
0418             return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
0419         return False
0420
0421     def __ne__(self, other):
0422         return not self == other
0423
0424     def __hash__(self):
0425         size = len(self)
0426         result = 31 + size
0427         nnz = 0
0428         i = 0
0429         while i < size and nnz < 128:
0430             if self.array[i] != 0:
0431                 result = 31 * result + i
0432                 bits = _double_to_long_bits(self.array[i])
0433                 result = 31 * result + (bits ^ (bits >> 32))
0434                 nnz += 1
0435             i += 1
0436         return result
0437
0438     def __getattr__(self, item):
0439         return getattr(self.array, item)
0440
0441     def __neg__(self):
0442         return DenseVector(-self.array)
0443
0444     def _delegate(op):
0445         def func(self, other):
0446             if isinstance(other, DenseVector):
0447                 other = other.array
0448             return DenseVector(getattr(self.array, op)(other))
0449         return func
0450
0451     __add__ = _delegate("__add__")
0452     __sub__ = _delegate("__sub__")
0453     __mul__ = _delegate("__mul__")
0454     __div__ = _delegate("__div__")
0455     __truediv__ = _delegate("__truediv__")
0456     __mod__ = _delegate("__mod__")
0457     __radd__ = _delegate("__radd__")
0458     __rsub__ = _delegate("__rsub__")
0459     __rmul__ = _delegate("__rmul__")
0460     __rdiv__ = _delegate("__rdiv__")
0461     __rtruediv__ = _delegate("__rtruediv__")
0462     __rmod__ = _delegate("__rmod__")
0463
0464
0465 class SparseVector(Vector):
0466     """
0467     A simple sparse vector class for passing data to MLlib. Users may
0468     alternatively pass SciPy's {scipy.sparse} data types.
0469     """
0470     def __init__(self, size, *args):
0471         """
0472         Create a sparse vector, using either a dictionary, a list of
0473         (index, value) pairs, or two separate arrays of indices and
0474         values (sorted by index).
0475
0476         :param size: Size of the vector.
0477         :param args: Active entries, as a dictionary {index: value, ...},
0478           a list of tuples [(index, value), ...], or a list of strictly
0479           increasing indices and a list of corresponding values [index, ...],
0480           [value, ...]. Inactive entries are treated as zeros.
0481
0482         >>> SparseVector(4, {1: 1.0, 3: 5.5})
0483         SparseVector(4, {1: 1.0, 3: 5.5})
0484         >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
0485         SparseVector(4, {1: 1.0, 3: 5.5})
0486         >>> SparseVector(4, [1, 3], [1.0, 5.5])
0487         SparseVector(4, {1: 1.0, 3: 5.5})
0488         >>> SparseVector(4, {1:1.0, 6:2.0})
0489         Traceback (most recent call last):
0490         ...
0491         AssertionError: Index 6 is out of the size of vector with size=4
0492         >>> SparseVector(4, {-1:1.0})
0493         Traceback (most recent call last):
0494         ...
0495         AssertionError: Contains negative index -1
0496         """
0497         self.size = int(size)
0498         """ Size of the vector. """
0499         assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
0500         if len(args) == 1:
0501             pairs = args[0]
0502             if type(pairs) == dict:
0503                 pairs = pairs.items()
0504             pairs = sorted(pairs)
0505             self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
0506             """ A list of indices corresponding to active entries. """
0507             self.values = np.array([p[1] for p in pairs], dtype=np.float64)
0508             """ A list of values corresponding to active entries. """
0509         else:
0510             if isinstance(args[0], bytes):
0511                 assert isinstance(args[1], bytes), "values should be string too"
0512                 if args[0]:
0513                     self.indices = np.frombuffer(args[0], np.int32)
0514                     self.values = np.frombuffer(args[1], np.float64)
0515                 else:
0516                     # np.frombuffer() doesn't work well with empty string in older version
0517                     self.indices = np.array([], dtype=np.int32)
0518                     self.values = np.array([], dtype=np.float64)
0519             else:
0520                 self.indices = np.array(args[0], dtype=np.int32)
0521                 self.values = np.array(args[1], dtype=np.float64)
0522             assert len(self.indices) == len(self.values), "index and value arrays not same length"
0523             for i in xrange(len(self.indices) - 1):
0524                 if self.indices[i] >= self.indices[i + 1]:
0525                     raise TypeError(
0526                         "Indices %s and %s are not strictly increasing"
0527                         % (self.indices[i], self.indices[i + 1]))
0528
0529         if self.indices.size > 0:
0530             assert np.max(self.indices) < self.size, \
0531                 "Index %d is out of the size of vector with size=%d" \
0532                 % (np.max(self.indices), self.size)
0533             assert np.min(self.indices) >= 0, \
0534                 "Contains negative index %d" % (np.min(self.indices))
0535
0536     def numNonzeros(self):
0537         """
0538         Number of nonzero elements. This scans all active values and count non zeros.
0539         """
0540         return np.count_nonzero(self.values)
0541
0542     def norm(self, p):
0543         """
0544         Calculates the norm of a SparseVector.
0545
0546         >>> a = SparseVector(4, [0, 1], [3., -4.])
0547         >>> a.norm(1)
0548         7.0
0549         >>> a.norm(2)
0550         5.0
0551         """
0552         return np.linalg.norm(self.values, p)
0553
0554     def __reduce__(self):
0555         return (
0556             SparseVector,
0557             (self.size, self.indices.tostring(), self.values.tostring()))
0558
0559     def dot(self, other):
0560         """
0561         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
0562
0563         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0564         >>> a.dot(a)
0565         25.0
0566         >>> a.dot(array.array('d', [1., 2., 3., 4.]))
0567         22.0
0568         >>> b = SparseVector(4, [2], [1.0])
0569         >>> a.dot(b)
0570         0.0
0571         >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
0572         array([ 22.,  22.])
0573         >>> a.dot([1., 2., 3.])
0574         Traceback (most recent call last):
0575             ...
0576         AssertionError: dimension mismatch
0577         >>> a.dot(np.array([1., 2.]))
0578         Traceback (most recent call last):
0579             ...
0580         AssertionError: dimension mismatch
0581         >>> a.dot(DenseVector([1., 2.]))
0582         Traceback (most recent call last):
0583             ...
0584         AssertionError: dimension mismatch
0585         >>> a.dot(np.zeros((3, 2)))
0586         Traceback (most recent call last):
0587             ...
0588         AssertionError: dimension mismatch
0589         """
0590
0591         if isinstance(other, np.ndarray):
0592             if other.ndim not in [2, 1]:
0593                 raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
0594             assert len(self) == other.shape[0], "dimension mismatch"
0595             return np.dot(self.values, other[self.indices])
0596
0597         assert len(self) == _vector_size(other), "dimension mismatch"
0598
0599         if isinstance(other, DenseVector):
0600             return np.dot(other.array[self.indices], self.values)
0601
0602         elif isinstance(other, SparseVector):
0603             # Find out common indices.
0604             self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
0605             self_values = self.values[self_cmind]
0606             if self_values.size == 0:
0607                 return 0.0
0608             else:
0609                 other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
0610                 return np.dot(self_values, other.values[other_cmind])
0611
0612         else:
0613             return self.dot(_convert_to_vector(other))
0614
0615     def squared_distance(self, other):
0616         """
0617         Squared distance from a SparseVector or 1-dimensional NumPy array.
0618
0619         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0620         >>> a.squared_distance(a)
0621         0.0
0622         >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
0623         11.0
0624         >>> a.squared_distance(np.array([1., 2., 3., 4.]))
0625         11.0
0626         >>> b = SparseVector(4, [2], [1.0])
0627         >>> a.squared_distance(b)
0628         26.0
0629         >>> b.squared_distance(a)
0630         26.0
0631         >>> b.squared_distance([1., 2.])
0632         Traceback (most recent call last):
0633             ...
0634         AssertionError: dimension mismatch
0635         >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
0636         Traceback (most recent call last):
0637             ...
0638         AssertionError: dimension mismatch
0639         """
0640         assert len(self) == _vector_size(other), "dimension mismatch"
0641
0642         if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
0643             if isinstance(other, np.ndarray) and other.ndim != 1:
0644                 raise Exception("Cannot call squared_distance with %d-dimensional array" %
0645                                 other.ndim)
0646             if isinstance(other, DenseVector):
0647                 other = other.array
0648             sparse_ind = np.zeros(other.size, dtype=bool)
0649             sparse_ind[self.indices] = True
0650             dist = other[sparse_ind] - self.values
0651             result = np.dot(dist, dist)
0652
0653             other_ind = other[~sparse_ind]
0654             result += np.dot(other_ind, other_ind)
0655             return result
0656
0657         elif isinstance(other, SparseVector):
0658             result = 0.0
0659             i, j = 0, 0
0660             while i < len(self.indices) and j < len(other.indices):
0661                 if self.indices[i] == other.indices[j]:
0662                     diff = self.values[i] - other.values[j]
0663                     result += diff * diff
0664                     i += 1
0665                     j += 1
0666                 elif self.indices[i] < other.indices[j]:
0667                     result += self.values[i] * self.values[i]
0668                     i += 1
0669                 else:
0670                     result += other.values[j] * other.values[j]
0671                     j += 1
0672             while i < len(self.indices):
0673                 result += self.values[i] * self.values[i]
0674                 i += 1
0675             while j < len(other.indices):
0676                 result += other.values[j] * other.values[j]
0677                 j += 1
0678             return result
0679         else:
0680             return self.squared_distance(_convert_to_vector(other))
0681
0682     def toArray(self):
0683         """
0684         Returns a copy of this SparseVector as a 1-dimensional numpy.ndarray.
0685         """
0686         arr = np.zeros((self.size,), dtype=np.float64)
0687         arr[self.indices] = self.values
0688         return arr
0689
0690     def __len__(self):
0691         return self.size
0692
0693     def __str__(self):
0694         inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
0695         vals = "[" + ",".join([str(v) for v in self.values]) + "]"
0696         return "(" + ",".join((str(self.size), inds, vals)) + ")"
0697
0698     def __repr__(self):
0699         inds = self.indices
0700         vals = self.values
0701         entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
0702                              for i in xrange(len(inds))])
0703         return "SparseVector({0}, {{{1}}})".format(self.size, entries)
0704
0705     def __eq__(self, other):
0706         if isinstance(other, SparseVector):
0707             return other.size == self.size and np.array_equal(other.indices, self.indices) \
0708                 and np.array_equal(other.values, self.values)
0709         elif isinstance(other, DenseVector):
0710             if self.size != len(other):
0711                 return False
0712             return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
0713         return False
0714
0715     def __getitem__(self, index):
0716         inds = self.indices
0717         vals = self.values
0718         if not isinstance(index, int):
0719             raise TypeError(
0720                 "Indices must be of type integer, got type %s" % type(index))
0721
0722         if index >= self.size or index < -self.size:
0723             raise IndexError("Index %d out of bounds." % index)
0724         if index < 0:
0725             index += self.size
0726
0727         if (inds.size == 0) or (index > inds.item(-1)):
0728             return 0.
0729
0730         insert_index = np.searchsorted(inds, index)
0731         row_ind = inds[insert_index]
0732         if row_ind == index:
0733             return vals[insert_index]
0734         return 0.
0735
0736     def __ne__(self, other):
0737         return not self.__eq__(other)
0738
0739     def __hash__(self):
0740         result = 31 + self.size
0741         nnz = 0
0742         i = 0
0743         while i < len(self.values) and nnz < 128:
0744             if self.values[i] != 0:
0745                 result = 31 * result + int(self.indices[i])
0746                 bits = _double_to_long_bits(self.values[i])
0747                 result = 31 * result + (bits ^ (bits >> 32))
0748                 nnz += 1
0749             i += 1
0750         return result
0751
0752
0753 class Vectors(object):
0754
0755     """
0756     Factory methods for working with vectors.
0757
0758     .. note:: Dense vectors are simply represented as NumPy array objects,
0759         so there is no need to covert them for use in MLlib. For sparse vectors,
0760         the factory methods in this class create an MLlib-compatible type, or users
0761         can pass in SciPy's `scipy.sparse` column vectors.
0762     """
0763
0764     @staticmethod
0765     def sparse(size, *args):
0766         """
0767         Create a sparse vector, using either a dictionary, a list of
0768         (index, value) pairs, or two separate arrays of indices and
0769         values (sorted by index).
0770
0771         :param size: Size of the vector.
0772         :param args: Non-zero entries, as a dictionary, list of tuples,
0773                      or two sorted lists containing indices and values.
0774
0775         >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
0776         SparseVector(4, {1: 1.0, 3: 5.5})
0777         >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
0778         SparseVector(4, {1: 1.0, 3: 5.5})
0779         >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
0780         SparseVector(4, {1: 1.0, 3: 5.5})
0781         """
0782         return SparseVector(size, *args)
0783
0784     @staticmethod
0785     def dense(*elements):
0786         """
0787         Create a dense vector of 64-bit floats from a Python list or numbers.
0788
0789         >>> Vectors.dense([1, 2, 3])
0790         DenseVector([1.0, 2.0, 3.0])
0791         >>> Vectors.dense(1.0, 2.0)
0792         DenseVector([1.0, 2.0])
0793         """
0794         if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
0795             # it's list, numpy.array or other iterable object.
0796             elements = elements[0]
0797         return DenseVector(elements)
0798
0799     @staticmethod
0800     def squared_distance(v1, v2):
0801         """
0802         Squared distance between two vectors.
0803         a and b can be of type SparseVector, DenseVector, np.ndarray
0804         or array.array.
0805
0806         >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
0807         >>> b = Vectors.dense([2, 5, 4, 1])
0808         >>> a.squared_distance(b)
0809         51.0
0810         """
0811         v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
0812         return v1.squared_distance(v2)
0813
0814     @staticmethod
0815     def norm(vector, p):
0816         """
0817         Find norm of the given vector.
0818         """
0819         return _convert_to_vector(vector).norm(p)
0820
0821     @staticmethod
0822     def zeros(size):
0823         return DenseVector(np.zeros(size))
0824
0825     @staticmethod
0826     def _equals(v1_indices, v1_values, v2_indices, v2_values):
0827         """
0828         Check equality between sparse/dense vectors,
0829         v1_indices and v2_indices assume to be strictly increasing.
0830         """
0831         v1_size = len(v1_values)
0832         v2_size = len(v2_values)
0833         k1 = 0
0834         k2 = 0
0835         all_equal = True
0836         while all_equal:
0837             while k1 < v1_size and v1_values[k1] == 0:
0838                 k1 += 1
0839             while k2 < v2_size and v2_values[k2] == 0:
0840                 k2 += 1
0841
0842             if k1 >= v1_size or k2 >= v2_size:
0843                 return k1 >= v1_size and k2 >= v2_size
0844
0845             all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
0846             k1 += 1
0847             k2 += 1
0848         return all_equal
0849
0850
0851 class Matrix(object):
0852
0853     __UDT__ = MatrixUDT()
0854
0855     """
0856     Represents a local matrix.
0857     """
0858     def __init__(self, numRows, numCols, isTransposed=False):
0859         self.numRows = numRows
0860         self.numCols = numCols
0861         self.isTransposed = isTransposed
0862
0863     def toArray(self):
0864         """
0865         Returns its elements in a numpy.ndarray.
0866         """
0867         raise NotImplementedError
0868
0869     @staticmethod
0870     def _convert_to_array(array_like, dtype):
0871         """
0872         Convert Matrix attributes which are array-like or buffer to array.
0873         """
0874         if isinstance(array_like, bytes):
0875             return np.frombuffer(array_like, dtype=dtype)
0876         return np.asarray(array_like, dtype=dtype)
0877
0878
0879 class DenseMatrix(Matrix):
0880     """
0881     Column-major dense matrix.
0882     """
0883     def __init__(self, numRows, numCols, values, isTransposed=False):
0884         Matrix.__init__(self, numRows, numCols, isTransposed)
0885         values = self._convert_to_array(values, np.float64)
0886         assert len(values) == numRows * numCols
0887         self.values = values
0888
0889     def __reduce__(self):
0890         return DenseMatrix, (
0891             self.numRows, self.numCols, self.values.tostring(),
0892             int(self.isTransposed))
0893
0894     def __str__(self):
0895         """
0896         Pretty printing of a DenseMatrix
0897
0898         >>> dm = DenseMatrix(2, 2, range(4))
0899         >>> print(dm)
0900         DenseMatrix([[ 0.,  2.],
0901                      [ 1.,  3.]])
0902         >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
0903         >>> print(dm)
0904         DenseMatrix([[ 0.,  1.],
0905                      [ 2.,  3.]])
0906         """
0907         # Inspired by __repr__ in scipy matrices.
0908         array_lines = repr(self.toArray()).splitlines()
0909
0910         # We need to adjust six spaces which is the difference in number
0911         # of letters between "DenseMatrix" and "array"
0912         x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
0913         return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
0914
0915     def __repr__(self):
0916         """
0917         Representation of a DenseMatrix
0918
0919         >>> dm = DenseMatrix(2, 2, range(4))
0920         >>> dm
0921         DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
0922         """
0923         # If the number of values are less than seventeen then return as it is.
0924         # Else return first eight values and last eight values.
0925         if len(self.values) < 17:
0926             entries = _format_float_list(self.values)
0927         else:
0928             entries = (
0929                 _format_float_list(self.values[:8]) +
0930                 ["..."] +
0931                 _format_float_list(self.values[-8:])
0932             )
0933
0934         entries = ", ".join(entries)
0935         return "DenseMatrix({0}, {1}, [{2}], {3})".format(
0936             self.numRows, self.numCols, entries, self.isTransposed)
0937
0938     def toArray(self):
0939         """
0940         Return a numpy.ndarray
0941
0942         >>> m = DenseMatrix(2, 2, range(4))
0943         >>> m.toArray()
0944         array([[ 0.,  2.],
0945                [ 1.,  3.]])
0946         """
0947         if self.isTransposed:
0948             return np.asfortranarray(
0949                 self.values.reshape((self.numRows, self.numCols)))
0950         else:
0951             return self.values.reshape((self.numRows, self.numCols), order='F')
0952
0953     def toSparse(self):
0954         """Convert to SparseMatrix"""
0955         if self.isTransposed:
0956             values = np.ravel(self.toArray(), order='F')
0957         else:
0958             values = self.values
0959         indices = np.nonzero(values)[0]
0960         colCounts = np.bincount(indices // self.numRows)
0961         colPtrs = np.cumsum(np.hstack(
0962             (0, colCounts, np.zeros(self.numCols - colCounts.size))))
0963         values = values[indices]
0964         rowIndices = indices % self.numRows
0965
0966         return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
0967
0968     def __getitem__(self, indices):
0969         i, j = indices
0970         if i < 0 or i >= self.numRows:
0971             raise IndexError("Row index %d is out of range [0, %d)"
0972                              % (i, self.numRows))
0973         if j >= self.numCols or j < 0:
0974             raise IndexError("Column index %d is out of range [0, %d)"
0975                              % (j, self.numCols))
0976
0977         if self.isTransposed:
0978             return self.values[i * self.numCols + j]
0979         else:
0980             return self.values[i + j * self.numRows]
0981
0982     def __eq__(self, other):
0983         if (self.numRows != other.numRows or self.numCols != other.numCols):
0984             return False
0985         if isinstance(other, SparseMatrix):
0986             return np.all(self.toArray() == other.toArray())
0987
0988         self_values = np.ravel(self.toArray(), order='F')
0989         other_values = np.ravel(other.toArray(), order='F')
0990         return np.all(self_values == other_values)
0991
0992
0993 class SparseMatrix(Matrix):
0994     """Sparse Matrix stored in CSC format."""
0995     def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
0996                  isTransposed=False):
0997         Matrix.__init__(self, numRows, numCols, isTransposed)
0998         self.colPtrs = self._convert_to_array(colPtrs, np.int32)
0999         self.rowIndices = self._convert_to_array(rowIndices, np.int32)
1000         self.values = self._convert_to_array(values, np.float64)
1001
1002         if self.isTransposed:
1003             if self.colPtrs.size != numRows + 1:
1004                 raise ValueError("Expected colPtrs of size %d, got %d."
1005                                  % (numRows + 1, self.colPtrs.size))
1006         else:
1007             if self.colPtrs.size != numCols + 1:
1008                 raise ValueError("Expected colPtrs of size %d, got %d."
1009                                  % (numCols + 1, self.colPtrs.size))
1010         if self.rowIndices.size != self.values.size:
1011             raise ValueError("Expected rowIndices of length %d, got %d."
1012                              % (self.rowIndices.size, self.values.size))
1013
1014     def __str__(self):
1015         """
1016         Pretty printing of a SparseMatrix
1017
1018         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1019         >>> print(sm1)
1020         2 X 2 CSCMatrix
1021         (0,0) 2.0
1022         (1,0) 3.0
1023         (1,1) 4.0
1024         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
1025         >>> print(sm1)
1026         2 X 2 CSRMatrix
1027         (0,0) 2.0
1028         (0,1) 3.0
1029         (1,1) 4.0
1030         """
1031         spstr = "{0} X {1} ".format(self.numRows, self.numCols)
1032         if self.isTransposed:
1033             spstr += "CSRMatrix\n"
1034         else:
1035             spstr += "CSCMatrix\n"
1036
1037         cur_col = 0
1038         smlist = []
1039
1040         # Display first 16 values.
1041         if len(self.values) <= 16:
1042             zipindval = zip(self.rowIndices, self.values)
1043         else:
1044             zipindval = zip(self.rowIndices[:16], self.values[:16])
1045         for i, (rowInd, value) in enumerate(zipindval):
1046             if self.colPtrs[cur_col + 1] <= i:
1047                 cur_col += 1
1048             if self.isTransposed:
1049                 smlist.append('({0},{1}) {2}'.format(
1050                     cur_col, rowInd, _format_float(value)))
1051             else:
1052                 smlist.append('({0},{1}) {2}'.format(
1053                     rowInd, cur_col, _format_float(value)))
1054         spstr += "\n".join(smlist)
1055
1056         if len(self.values) > 16:
1057             spstr += "\n.." * 2
1058         return spstr
1059
1060     def __repr__(self):
1061         """
1062         Representation of a SparseMatrix
1063
1064         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1065         >>> sm1
1066         SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
1067         """
1068         rowIndices = list(self.rowIndices)
1069         colPtrs = list(self.colPtrs)
1070
1071         if len(self.values) <= 16:
1072             values = _format_float_list(self.values)
1073
1074         else:
1075             values = (
1076                 _format_float_list(self.values[:8]) +
1077                 ["..."] +
1078                 _format_float_list(self.values[-8:])
1079             )
1080             rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
1081
1082         if len(self.colPtrs) > 16:
1083             colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
1084
1085         values = ", ".join(values)
1086         rowIndices = ", ".join([str(ind) for ind in rowIndices])
1087         colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
1088         return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
1089             self.numRows, self.numCols, colPtrs, rowIndices,
1090             values, self.isTransposed)
1091
1092     def __reduce__(self):
1093         return SparseMatrix, (
1094             self.numRows, self.numCols, self.colPtrs.tostring(),
1095             self.rowIndices.tostring(), self.values.tostring(),
1096             int(self.isTransposed))
1097
1098     def __getitem__(self, indices):
1099         i, j = indices
1100         if i < 0 or i >= self.numRows:
1101             raise IndexError("Row index %d is out of range [0, %d)"
1102                              % (i, self.numRows))
1103         if j < 0 or j >= self.numCols:
1104             raise IndexError("Column index %d is out of range [0, %d)"
1105                              % (j, self.numCols))
1106
1107         # If a CSR matrix is given, then the row index should be searched
1108         # for in ColPtrs, and the column index should be searched for in the
1109         # corresponding slice obtained from rowIndices.
1110         if self.isTransposed:
1111             j, i = i, j
1112
1113         colStart = self.colPtrs[j]
1114         colEnd = self.colPtrs[j + 1]
1115         nz = self.rowIndices[colStart: colEnd]
1116         ind = np.searchsorted(nz, i) + colStart
1117         if ind < colEnd and self.rowIndices[ind] == i:
1118             return self.values[ind]
1119         else:
1120             return 0.0
1121
1122     def toArray(self):
1123         """
1124         Return a numpy.ndarray
1125         """
1126         A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
1127         for k in xrange(self.colPtrs.size - 1):
1128             startptr = self.colPtrs[k]
1129             endptr = self.colPtrs[k + 1]
1130             if self.isTransposed:
1131                 A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
1132             else:
1133                 A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
1134         return A
1135
1136     def toDense(self):
1137         densevals = np.ravel(self.toArray(), order='F')
1138         return DenseMatrix(self.numRows, self.numCols, densevals)
1139
1140     # TODO: More efficient implementation:
1141     def __eq__(self, other):
1142         return np.all(self.toArray() == other.toArray())
1143
1144
1145 class Matrices(object):
1146     @staticmethod
1147     def dense(numRows, numCols, values):
1148         """
1149         Create a DenseMatrix
1150         """
1151         return DenseMatrix(numRows, numCols, values)
1152
1153     @staticmethod
1154     def sparse(numRows, numCols, colPtrs, rowIndices, values):
1155         """
1156         Create a SparseMatrix
1157         """
1158         return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
1159
1160
1161 def _test():
1162     import doctest
1163     try:
1164         # Numpy 1.14+ changed it's string format.
1165         np.set_printoptions(legacy='1.13')
1166     except TypeError:
1167         pass
1168     (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
1169     if failure_count:
1170         sys.exit(-1)
1171
1172 if __name__ == "__main__":
1173     _test()