0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 """
0019 MLlib utilities for linear algebra. For dense vectors, MLlib
0020 uses the NumPy `array` type, so you can simply pass NumPy arrays
0021 around. For sparse vectors, users can construct a :class:`SparseVector`
0022 object from MLlib or pass SciPy `scipy.sparse` column vectors if
0023 SciPy is available in their environment.
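
For example, a dense NumPy array and a :class:`SparseVector` holding the
same values (a small illustrative sketch):

>>> import numpy as np
>>> dense = np.array([1.0, 0.0, 3.0])
>>> sparse = SparseVector(3, [0, 2], [1.0, 3.0])
>>> sparse.toArray()
array([ 1.,  0.,  3.])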
0024 """
0025 
0026 import sys
0027 import array
0028 import struct
0029 
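# Python 2/3 compatibility aliases; this module still supports running on Python 2.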
0030 if sys.version >= '3':
0031     basestring = str
0032     xrange = range
0033     import copyreg as copy_reg
0034     long = int
0035 else:
0036     from itertools import izip as zip
0037     import copy_reg
0038 
0039 import numpy as np
0040 
0041 from pyspark import since
0042 from pyspark.ml import linalg as newlinalg
0043 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
0044     IntegerType, ByteType, BooleanType
0045 
0046 
0047 __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
0048            'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices',
0049            'QRDecomposition']
0050 
0051 
0052 if sys.version_info[:2] == (2, 7):
0053     # speed up pickling array in Python 2.7
0054     def fast_pickle_array(ar):
0055         return array.array, (ar.typecode, ar.tostring())
0056     copy_reg.pickle(array.array, fast_pickle_array)
0057 
0058 
0059 # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
0060 # such as _convert_to_vector and the dot/squared_distance methods, also accept scipy.sparse column vectors.
0061 
0062 try:
0063     import scipy.sparse
0064     _have_scipy = True
0065 except ImportError:
0066     # No SciPy in environment, but that's okay
0067     _have_scipy = False
0068 
0069 
0070 def _convert_to_vector(l):
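    """
    Convert an array-like or `scipy.sparse` column vector into an MLlib
    :class:`Vector`. A brief sketch of the accepted inputs:

    >>> _convert_to_vector([1.0, 2.0])
    DenseVector([1.0, 2.0])
    >>> _convert_to_vector(np.array([1.0, 2.0]))
    DenseVector([1.0, 2.0])
    """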
0071     if isinstance(l, Vector):
0072         return l
0073     elif type(l) in (array.array, np.ndarray, list, tuple, xrange):
0074         return DenseVector(l)
0075     elif _have_scipy and scipy.sparse.issparse(l):
0076         assert l.shape[1] == 1, "Expected column vector"
0077         # Make sure the converted csc_matrix has sorted indices.
0078         csc = l.tocsc()
0079         if not csc.has_sorted_indices:
0080             csc.sort_indices()
0081         return SparseVector(l.shape[0], csc.indices, csc.data)
0082     else:
0083         raise TypeError("Cannot convert type %s into Vector" % type(l))
0084 
0085 
0086 def _vector_size(v):
0087     """
0088     Returns the size of the vector.
0089 
0090     >>> _vector_size([1., 2., 3.])
0091     3
0092     >>> _vector_size((1., 2., 3.))
0093     3
0094     >>> _vector_size(array.array('d', [1., 2., 3.]))
0095     3
0096     >>> _vector_size(np.zeros(3))
0097     3
0098     >>> _vector_size(np.zeros((3, 1)))
0099     3
0100     >>> _vector_size(np.zeros((1, 3)))
0101     Traceback (most recent call last):
0102         ...
0103     ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
0104     """
0105     if isinstance(v, Vector):
0106         return len(v)
0107     elif type(v) in (array.array, list, tuple, xrange):
0108         return len(v)
0109     elif type(v) == np.ndarray:
0110         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
0111             return len(v)
0112         else:
0113             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
0114     elif _have_scipy and scipy.sparse.issparse(v):
0115         assert v.shape[1] == 1, "Expected column vector"
0116         return v.shape[0]
0117     else:
0118         raise TypeError("Cannot treat type %s as a vector" % type(v))
0119 
0120 
0121 def _format_float(f, digits=4):
0122     s = str(round(f, digits))
0123     if '.' in s:
0124         s = s[:s.index('.') + 1 + digits]
0125     return s
0126 
0127 
0128 def _format_float_list(l):
0129     return [_format_float(x) for x in l]
0130 
0131 
0132 def _double_to_long_bits(value):
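    # Map every NaN to a single canonical NaN so that all NaNs produce the
    # same bit pattern (and hence the same hash).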
0133     if np.isnan(value):
0134         value = float('nan')
0135     # pack double into 64 bits, then unpack as long int
0136     return struct.unpack('Q', struct.pack('d', value))[0]
0137 
0138 
0139 class VectorUDT(UserDefinedType):
0140     """
0141     SQL user-defined type (UDT) for Vector.
0142     """
0143 
0144     @classmethod
0145     def sqlType(cls):
0146         return StructType([
0147             StructField("type", ByteType(), False),
0148             StructField("size", IntegerType(), True),
0149             StructField("indices", ArrayType(IntegerType(), False), True),
0150             StructField("values", ArrayType(DoubleType(), False), True)])
0151 
0152     @classmethod
0153     def module(cls):
0154         return "pyspark.mllib.linalg"
0155 
0156     @classmethod
0157     def scalaUDT(cls):
0158         return "org.apache.spark.mllib.linalg.VectorUDT"
0159 
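    # Row layout used by serialize()/deserialize(): for example,
    # SparseVector(4, [1, 3], [1.0, 5.5]) maps to (0, 4, [1, 3], [1.0, 5.5])
    # and DenseVector([1.0, 2.0]) maps to (1, None, None, [1.0, 2.0]).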
0160     def serialize(self, obj):
0161         if isinstance(obj, SparseVector):
0162             indices = [int(i) for i in obj.indices]
0163             values = [float(v) for v in obj.values]
0164             return (0, obj.size, indices, values)
0165         elif isinstance(obj, DenseVector):
0166             values = [float(v) for v in obj]
0167             return (1, None, None, values)
0168         else:
0169             raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
0170 
0171     def deserialize(self, datum):
0172         assert len(datum) == 4, \
0173             "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
0174         tpe = datum[0]
0175         if tpe == 0:
0176             return SparseVector(datum[1], datum[2], datum[3])
0177         elif tpe == 1:
0178             return DenseVector(datum[3])
0179         else:
0180             raise ValueError("do not recognize type %r" % tpe)
0181 
0182     def simpleString(self):
0183         return "vector"
0184 
0185 
0186 class MatrixUDT(UserDefinedType):
0187     """
0188     SQL user-defined type (UDT) for Matrix.
0189     """
0190 
0191     @classmethod
0192     def sqlType(cls):
0193         return StructType([
0194             StructField("type", ByteType(), False),
0195             StructField("numRows", IntegerType(), False),
0196             StructField("numCols", IntegerType(), False),
0197             StructField("colPtrs", ArrayType(IntegerType(), False), True),
0198             StructField("rowIndices", ArrayType(IntegerType(), False), True),
0199             StructField("values", ArrayType(DoubleType(), False), True),
0200             StructField("isTransposed", BooleanType(), False)])
0201 
0202     @classmethod
0203     def module(cls):
0204         return "pyspark.mllib.linalg"
0205 
0206     @classmethod
0207     def scalaUDT(cls):
0208         return "org.apache.spark.mllib.linalg.MatrixUDT"
0209 
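    # Row layout used by serialize()/deserialize(): sparse matrices map to
    # (0, numRows, numCols, colPtrs, rowIndices, values, isTransposed) and
    # dense matrices to (1, numRows, numCols, None, None, values, isTransposed).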
0210     def serialize(self, obj):
0211         if isinstance(obj, SparseMatrix):
0212             colPtrs = [int(i) for i in obj.colPtrs]
0213             rowIndices = [int(i) for i in obj.rowIndices]
0214             values = [float(v) for v in obj.values]
0215             return (0, obj.numRows, obj.numCols, colPtrs,
0216                     rowIndices, values, bool(obj.isTransposed))
0217         elif isinstance(obj, DenseMatrix):
0218             values = [float(v) for v in obj.values]
0219             return (1, obj.numRows, obj.numCols, None, None, values,
0220                     bool(obj.isTransposed))
0221         else:
0222             raise TypeError("cannot serialize type %r" % (type(obj)))
0223 
0224     def deserialize(self, datum):
0225         assert len(datum) == 7, \
0226             "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
0227         tpe = datum[0]
0228         if tpe == 0:
0229             return SparseMatrix(*datum[1:])
0230         elif tpe == 1:
0231             return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
0232         else:
0233             raise ValueError("do not recognize type %r" % tpe)
0234 
0235     def simpleString(self):
0236         return "matrix"
0237 
0238 
0239 class Vector(object):
0240     """
0241     Abstract class for DenseVector and SparseVector.
0242     """
0243 
0244     __UDT__ = VectorUDT()
0245 
0246     def toArray(self):
0247         """
0248         Convert the vector into a numpy.ndarray.
0249 
0250         :return: numpy.ndarray
0251         """
0252         raise NotImplementedError
0253 
0254     def asML(self):
0255         """
0256         Convert this vector to the new mllib-local representation.
0257         This does NOT copy the data; it copies references.
0258 
0259         :return: :py:class:`pyspark.ml.linalg.Vector`
0260         """
0261         raise NotImplementedError
0262 
0263 
0264 class DenseVector(Vector):
0265     """
0266     A dense vector represented by a value array. We use a NumPy array for
0267     storage, and arithmetic is delegated to the underlying NumPy
0268     array.
0269 
0270     >>> v = Vectors.dense([1.0, 2.0])
0271     >>> u = Vectors.dense([3.0, 4.0])
0272     >>> v + u
0273     DenseVector([4.0, 6.0])
0274     >>> 2 - v
0275     DenseVector([1.0, 0.0])
0276     >>> v / 2
0277     DenseVector([0.5, 1.0])
0278     >>> v * u
0279     DenseVector([3.0, 8.0])
0280     >>> u / v
0281     DenseVector([3.0, 2.0])
0282     >>> u % 2
0283     DenseVector([1.0, 0.0])
0284     >>> -v
0285     DenseVector([-1.0, -2.0])
0286     """
0287     def __init__(self, ar):
0288         if isinstance(ar, bytes):
0289             ar = np.frombuffer(ar, dtype=np.float64)
0290         elif not isinstance(ar, np.ndarray):
0291             ar = np.array(ar, dtype=np.float64)
0292         if ar.dtype != np.float64:
0293             ar = ar.astype(np.float64)
0294         self.array = ar
0295 
0296     @staticmethod
0297     def parse(s):
0298         """
0299         Parse string representation back into the DenseVector.
0300 
0301         >>> DenseVector.parse(' [ 0.0,1.0,2.0,  3.0]')
0302         DenseVector([0.0, 1.0, 2.0, 3.0])
0303         """
0304         start = s.find('[')
0305         if start == -1:
0306             raise ValueError("Array should start with '['.")
0307         end = s.find(']')
0308         if end == -1:
0309             raise ValueError("Array should end with ']'.")
0310         s = s[start + 1: end]
0311 
0312         try:
0313             values = [float(val) for val in s.split(',') if val]
0314         except ValueError:
0315             raise ValueError("Unable to parse values from %s" % s)
0316         return DenseVector(values)
0317 
0318     def __reduce__(self):
0319         return DenseVector, (self.array.tostring(),)
0320 
0321     def numNonzeros(self):
0322         """
0323         Number of nonzero elements. This scans all active values and counts nonzeros.
0324         """
0325         return np.count_nonzero(self.array)
0326 
0327     def norm(self, p):
0328         """
0329         Calculates the norm of a DenseVector.
0330 
0331         >>> a = DenseVector([0, -1, 2, -3])
0332         >>> a.norm(2)
0333         3.7...
0334         >>> a.norm(1)
0335         6.0
0336         """
0337         return np.linalg.norm(self.array, p)
0338 
0339     def dot(self, other):
0340         """
0341         Compute the dot product of two Vectors. We support
0342         (Numpy array, list, SparseVector, or SciPy sparse)
0343         and a target NumPy array that is either 1- or 2-dimensional.
0344         Equivalent to calling numpy.dot of the two vectors.
0345 
0346         >>> dense = DenseVector(array.array('d', [1., 2.]))
0347         >>> dense.dot(dense)
0348         5.0
0349         >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
0350         4.0
0351         >>> dense.dot(range(1, 3))
0352         5.0
0353         >>> dense.dot(np.array(range(1, 3)))
0354         5.0
0355         >>> dense.dot([1.,])
0356         Traceback (most recent call last):
0357             ...
0358         AssertionError: dimension mismatch
0359         >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
0360         array([  5.,  11.])
0361         >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
0362         Traceback (most recent call last):
0363             ...
0364         AssertionError: dimension mismatch
0365         """
0366         if type(other) == np.ndarray:
0367             if other.ndim > 1:
0368                 assert len(self) == other.shape[0], "dimension mismatch"
0369             return np.dot(self.array, other)
0370         elif _have_scipy and scipy.sparse.issparse(other):
0371             assert len(self) == other.shape[0], "dimension mismatch"
0372             return other.transpose().dot(self.toArray())
0373         else:
0374             assert len(self) == _vector_size(other), "dimension mismatch"
0375             if isinstance(other, SparseVector):
0376                 return other.dot(self)
0377             elif isinstance(other, Vector):
0378                 return np.dot(self.toArray(), other.toArray())
0379             else:
0380                 return np.dot(self.toArray(), other)
0381 
0382     def squared_distance(self, other):
0383         """
0384         Squared distance of two Vectors.
0385 
0386         >>> dense1 = DenseVector(array.array('d', [1., 2.]))
0387         >>> dense1.squared_distance(dense1)
0388         0.0
0389         >>> dense2 = np.array([2., 1.])
0390         >>> dense1.squared_distance(dense2)
0391         2.0
0392         >>> dense3 = [2., 1.]
0393         >>> dense1.squared_distance(dense3)
0394         2.0
0395         >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
0396         >>> dense1.squared_distance(sparse1)
0397         2.0
0398         >>> dense1.squared_distance([1.,])
0399         Traceback (most recent call last):
0400             ...
0401         AssertionError: dimension mismatch
0402         >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
0403         Traceback (most recent call last):
0404             ...
0405         AssertionError: dimension mismatch
0406         """
0407         assert len(self) == _vector_size(other), "dimension mismatch"
0408         if isinstance(other, SparseVector):
0409             return other.squared_distance(self)
0410         elif _have_scipy and scipy.sparse.issparse(other):
0411             return _convert_to_vector(other).squared_distance(self)
0412 
0413         if isinstance(other, Vector):
0414             other = other.toArray()
0415         elif not isinstance(other, np.ndarray):
0416             other = np.array(other)
0417         diff = self.toArray() - other
0418         return np.dot(diff, diff)
0419 
0420     def toArray(self):
0421         """
0422         Returns a numpy.ndarray.
0423         """
0424         return self.array
0425 
0426     def asML(self):
0427         """
0428         Convert this vector to the new mllib-local representation.
0429         This does NOT copy the data; it copies references.
0430 
0431         :return: :py:class:`pyspark.ml.linalg.DenseVector`
0432 
0433         .. versionadded:: 2.0.0
0434         """
0435         return newlinalg.DenseVector(self.array)
0436 
0437     @property
0438     def values(self):
0439         """
0440         Returns the underlying numpy.ndarray of values.
0441         """
0442         return self.array
0443 
0444     def __getitem__(self, item):
0445         return self.array[item]
0446 
0447     def __len__(self):
0448         return len(self.array)
0449 
0450     def __str__(self):
0451         return "[" + ",".join([str(v) for v in self.array]) + "]"
0452 
0453     def __repr__(self):
0454         return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
0455 
0456     def __eq__(self, other):
0457         if isinstance(other, DenseVector):
0458             return np.array_equal(self.array, other.array)
0459         elif isinstance(other, SparseVector):
0460             if len(self) != other.size:
0461                 return False
0462             return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
0463         return False
0464 
0465     def __ne__(self, other):
0466         return not self == other
0467 
0468     def __hash__(self):
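        # Mix the vector size and the first 128 nonzero entries (their position
        # and the bit pattern of their value). SparseVector.__hash__ follows the
        # same scheme, so equal vectors hash alike across representations.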
0469         size = len(self)
0470         result = 31 + size
0471         nnz = 0
0472         i = 0
0473         while i < size and nnz < 128:
0474             if self.array[i] != 0:
0475                 result = 31 * result + i
0476                 bits = _double_to_long_bits(self.array[i])
0477                 result = 31 * result + (bits ^ (bits >> 32))
0478                 nnz += 1
0479             i += 1
0480         return result
0481 
0482     def __getattr__(self, item):
0483         return getattr(self.array, item)
0484 
0485     def __neg__(self):
0486         return DenseVector(-self.array)
0487 
0488     def _delegate(op):
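        # Build an arithmetic operator method that unwraps a DenseVector
        # operand and forwards the call to the underlying numpy array,
        # wrapping the result back into a DenseVector.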
0489         def func(self, other):
0490             if isinstance(other, DenseVector):
0491                 other = other.array
0492             return DenseVector(getattr(self.array, op)(other))
0493         return func
0494 
0495     __add__ = _delegate("__add__")
0496     __sub__ = _delegate("__sub__")
0497     __mul__ = _delegate("__mul__")
0498     __div__ = _delegate("__div__")
0499     __truediv__ = _delegate("__truediv__")
0500     __mod__ = _delegate("__mod__")
0501     __radd__ = _delegate("__radd__")
0502     __rsub__ = _delegate("__rsub__")
0503     __rmul__ = _delegate("__rmul__")
0504     __rdiv__ = _delegate("__rdiv__")
0505     __rtruediv__ = _delegate("__rtruediv__")
0506     __rmod__ = _delegate("__rmod__")
0507 
0508 
0509 class SparseVector(Vector):
0510     """
0511     A simple sparse vector class for passing data to MLlib. Users may
0512     alternatively pass SciPy's `scipy.sparse` column vectors.
0513     """
0514     def __init__(self, size, *args):
0515         """
0516         Create a sparse vector, using either a dictionary, a list of
0517         (index, value) pairs, or two separate arrays of indices and
0518         values (sorted by index).
0519 
0520         :param size: Size of the vector.
0521         :param args: Active entries, as a dictionary {index: value, ...},
0522           a list of tuples [(index, value), ...], or a list of strictly
0523           increasing indices and a list of corresponding values [index, ...],
0524           [value, ...]. Inactive entries are treated as zeros.
0525 
0526         >>> SparseVector(4, {1: 1.0, 3: 5.5})
0527         SparseVector(4, {1: 1.0, 3: 5.5})
0528         >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
0529         SparseVector(4, {1: 1.0, 3: 5.5})
0530         >>> SparseVector(4, [1, 3], [1.0, 5.5])
0531         SparseVector(4, {1: 1.0, 3: 5.5})
0532         """
0533         self.size = int(size)
0534         """ Size of the vector. """
0535         assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
0536         if len(args) == 1:
0537             pairs = args[0]
0538             if type(pairs) == dict:
0539                 pairs = pairs.items()
0540             pairs = sorted(pairs)
0541             self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
0542             """ A list of indices corresponding to active entries. """
0543             self.values = np.array([p[1] for p in pairs], dtype=np.float64)
0544             """ A list of values corresponding to active entries. """
0545         else:
0546             if isinstance(args[0], bytes):
0547                 assert isinstance(args[1], bytes), "values should be string too"
0548                 if args[0]:
0549                     self.indices = np.frombuffer(args[0], np.int32)
0550                     self.values = np.frombuffer(args[1], np.float64)
0551                 else:
0552                     # np.frombuffer() doesn't work well with an empty string in older NumPy versions
0553                     self.indices = np.array([], dtype=np.int32)
0554                     self.values = np.array([], dtype=np.float64)
0555             else:
0556                 self.indices = np.array(args[0], dtype=np.int32)
0557                 self.values = np.array(args[1], dtype=np.float64)
0558             assert len(self.indices) == len(self.values), "index and value arrays not same length"
0559             for i in xrange(len(self.indices) - 1):
0560                 if self.indices[i] >= self.indices[i + 1]:
0561                     raise TypeError(
0562                         "Indices %s and %s are not strictly increasing"
0563                         % (self.indices[i], self.indices[i + 1]))
0564 
0565     def numNonzeros(self):
0566         """
0567         Number of nonzero elements. This scans all active values and counts nonzeros.
0568         """
0569         return np.count_nonzero(self.values)
0570 
0571     def norm(self, p):
0572         """
0573         Calculates the norm of a SparseVector.
0574 
0575         >>> a = SparseVector(4, [0, 1], [3., -4.])
0576         >>> a.norm(1)
0577         7.0
0578         >>> a.norm(2)
0579         5.0
0580         """
0581         return np.linalg.norm(self.values, p)
0582 
0583     def __reduce__(self):
0584         return (
0585             SparseVector,
0586             (self.size, self.indices.tostring(), self.values.tostring()))
0587 
0588     @staticmethod
0589     def parse(s):
0590         """
0591         Parse string representation back into the SparseVector.
0592 
0593         >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )')
0594         SparseVector(4, {0: 4.0, 1: 5.0})
0595         """
0596         start = s.find('(')
0597         if start == -1:
0598             raise ValueError("Tuple should start with '('")
0599         end = s.find(')')
0600         if end == -1:
0601             raise ValueError("Tuple should end with ')'")
0602         s = s[start + 1: end].strip()
0603 
0604         size = s[: s.find(',')]
0605         try:
0606             size = int(size)
0607         except ValueError:
0608             raise ValueError("Cannot parse size %s." % size)
0609 
0610         ind_start = s.find('[')
0611         if ind_start == -1:
0612             raise ValueError("Indices array should start with '['.")
0613         ind_end = s.find(']')
0614         if ind_end == -1:
0615             raise ValueError("Indices array should end with ']'")
0616         new_s = s[ind_start + 1: ind_end]
0617         ind_list = new_s.split(',')
0618         try:
0619             indices = [int(ind) for ind in ind_list if ind]
0620         except ValueError:
0621             raise ValueError("Unable to parse indices from %s." % new_s)
0622         s = s[ind_end + 1:].strip()
0623 
0624         val_start = s.find('[')
0625         if val_start == -1:
0626             raise ValueError("Values array should start with '['.")
0627         val_end = s.find(']')
0628         if val_end == -1:
0629             raise ValueError("Values array should end with ']'.")
0630         val_list = s[val_start + 1: val_end].split(',')
0631         try:
0632             values = [float(val) for val in val_list if val]
0633         except ValueError:
0634             raise ValueError("Unable to parse values from %s." % s)
0635         return SparseVector(size, indices, values)
0636 
0637     def dot(self, other):
0638         """
0639         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
0640 
0641         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0642         >>> a.dot(a)
0643         25.0
0644         >>> a.dot(array.array('d', [1., 2., 3., 4.]))
0645         22.0
0646         >>> b = SparseVector(4, [2], [1.0])
0647         >>> a.dot(b)
0648         0.0
0649         >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
0650         array([ 22.,  22.])
0651         >>> a.dot([1., 2., 3.])
0652         Traceback (most recent call last):
0653             ...
0654         AssertionError: dimension mismatch
0655         >>> a.dot(np.array([1., 2.]))
0656         Traceback (most recent call last):
0657             ...
0658         AssertionError: dimension mismatch
0659         >>> a.dot(DenseVector([1., 2.]))
0660         Traceback (most recent call last):
0661             ...
0662         AssertionError: dimension mismatch
0663         >>> a.dot(np.zeros((3, 2)))
0664         Traceback (most recent call last):
0665             ...
0666         AssertionError: dimension mismatch
0667         """
0668 
0669         if isinstance(other, np.ndarray):
0670             if other.ndim not in [2, 1]:
0671                 raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
0672             assert len(self) == other.shape[0], "dimension mismatch"
0673             return np.dot(self.values, other[self.indices])
0674 
0675         assert len(self) == _vector_size(other), "dimension mismatch"
0676 
0677         if isinstance(other, DenseVector):
0678             return np.dot(other.array[self.indices], self.values)
0679 
0680         elif isinstance(other, SparseVector):
0681             # Find out common indices.
0682             self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
0683             self_values = self.values[self_cmind]
0684             if self_values.size == 0:
0685                 return 0.0
0686             else:
0687                 other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
0688                 return np.dot(self_values, other.values[other_cmind])
0689 
0690         else:
0691             return self.dot(_convert_to_vector(other))
0692 
0693     def squared_distance(self, other):
0694         """
0695         Squared distance from a SparseVector or 1-dimensional NumPy array.
0696 
0697         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
0698         >>> a.squared_distance(a)
0699         0.0
0700         >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
0701         11.0
0702         >>> a.squared_distance(np.array([1., 2., 3., 4.]))
0703         11.0
0704         >>> b = SparseVector(4, [2], [1.0])
0705         >>> a.squared_distance(b)
0706         26.0
0707         >>> b.squared_distance(a)
0708         26.0
0709         >>> b.squared_distance([1., 2.])
0710         Traceback (most recent call last):
0711             ...
0712         AssertionError: dimension mismatch
0713         >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
0714         Traceback (most recent call last):
0715             ...
0716         AssertionError: dimension mismatch
0717         """
0718         assert len(self) == _vector_size(other), "dimension mismatch"
0719 
0720         if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
0721             if isinstance(other, np.ndarray) and other.ndim != 1:
0722                 raise ValueError("Cannot call squared_distance with %d-dimensional array" %
0723                                  other.ndim)
0724             if isinstance(other, DenseVector):
0725                 other = other.array
0726             sparse_ind = np.zeros(other.size, dtype=bool)
0727             sparse_ind[self.indices] = True
0728             dist = other[sparse_ind] - self.values
0729             result = np.dot(dist, dist)
0730 
0731             other_ind = other[~sparse_ind]
0732             result += np.dot(other_ind, other_ind)
0733             return result
0734 
0735         elif isinstance(other, SparseVector):
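            # Merge-style walk over the two sorted index arrays: matching
            # indices contribute (a - b)^2, unmatched indices contribute the
            # square of the lone value.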
0736             result = 0.0
0737             i, j = 0, 0
0738             while i < len(self.indices) and j < len(other.indices):
0739                 if self.indices[i] == other.indices[j]:
0740                     diff = self.values[i] - other.values[j]
0741                     result += diff * diff
0742                     i += 1
0743                     j += 1
0744                 elif self.indices[i] < other.indices[j]:
0745                     result += self.values[i] * self.values[i]
0746                     i += 1
0747                 else:
0748                     result += other.values[j] * other.values[j]
0749                     j += 1
0750             while i < len(self.indices):
0751                 result += self.values[i] * self.values[i]
0752                 i += 1
0753             while j < len(other.indices):
0754                 result += other.values[j] * other.values[j]
0755                 j += 1
0756             return result
0757         else:
0758             return self.squared_distance(_convert_to_vector(other))
0759 
0760     def toArray(self):
0761         """
0762         Returns a copy of this SparseVector as a 1-dimensional NumPy array.
0763         """
0764         arr = np.zeros((self.size,), dtype=np.float64)
0765         arr[self.indices] = self.values
0766         return arr
0767 
0768     def asML(self):
0769         """
0770         Convert this vector to the new mllib-local representation.
0771         This does NOT copy the data; it copies references.
0772 
0773         :return: :py:class:`pyspark.ml.linalg.SparseVector`
0774 
0775         .. versionadded:: 2.0.0
0776         """
0777         return newlinalg.SparseVector(self.size, self.indices, self.values)
0778 
0779     def __len__(self):
0780         return self.size
0781 
0782     def __str__(self):
0783         inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
0784         vals = "[" + ",".join([str(v) for v in self.values]) + "]"
0785         return "(" + ",".join((str(self.size), inds, vals)) + ")"
0786 
0787     def __repr__(self):
0788         inds = self.indices
0789         vals = self.values
0790         entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
0791                              for i in xrange(len(inds))])
0792         return "SparseVector({0}, {{{1}}})".format(self.size, entries)
0793 
0794     def __eq__(self, other):
0795         if isinstance(other, SparseVector):
0796             return other.size == self.size and np.array_equal(other.indices, self.indices) \
0797                 and np.array_equal(other.values, self.values)
0798         elif isinstance(other, DenseVector):
0799             if self.size != len(other):
0800                 return False
0801             return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
0802         return False
0803 
0804     def __getitem__(self, index):
0805         inds = self.indices
0806         vals = self.values
0807         if not isinstance(index, int):
0808             raise TypeError(
0809                 "Indices must be of type integer, got type %s" % type(index))
0810 
0811         if index >= self.size or index < -self.size:
0812             raise IndexError("Index %d out of bounds." % index)
0813         if index < 0:
0814             index += self.size
0815 
0816         if (inds.size == 0) or (index > inds.item(-1)):
0817             return 0.
0818 
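        # Binary-search the stored indices: if `index` is among them, return
        # the corresponding value; otherwise the entry is an implicit zero.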
0819         insert_index = np.searchsorted(inds, index)
0820         row_ind = inds[insert_index]
0821         if row_ind == index:
0822             return vals[insert_index]
0823         return 0.
0824 
0825     def __ne__(self, other):
0826         return not self.__eq__(other)
0827 
0828     def __hash__(self):
0829         result = 31 + self.size
0830         nnz = 0
0831         i = 0
0832         while i < len(self.values) and nnz < 128:
0833             if self.values[i] != 0:
0834                 result = 31 * result + int(self.indices[i])
0835                 bits = _double_to_long_bits(self.values[i])
0836                 result = 31 * result + (bits ^ (bits >> 32))
0837                 nnz += 1
0838             i += 1
0839         return result
0840 
0841 
0842 class Vectors(object):
0843 
0844     """
0845     Factory methods for working with vectors.
0846 
0847     .. note:: Dense vectors are simply represented as NumPy array objects,
0848         so there is no need to convert them for use in MLlib. For sparse vectors,
0849         the factory methods in this class create an MLlib-compatible type, or users
0850         can pass in SciPy's `scipy.sparse` column vectors.
0851     """
0852 
0853     @staticmethod
0854     def sparse(size, *args):
0855         """
0856         Create a sparse vector, using either a dictionary, a list of
0857         (index, value) pairs, or two separate arrays of indices and
0858         values (sorted by index).
0859 
0860         :param size: Size of the vector.
0861         :param args: Non-zero entries, as a dictionary, list of tuples,
0862                      or two sorted lists containing indices and values.
0863 
0864         >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
0865         SparseVector(4, {1: 1.0, 3: 5.5})
0866         >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
0867         SparseVector(4, {1: 1.0, 3: 5.5})
0868         >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
0869         SparseVector(4, {1: 1.0, 3: 5.5})
0870         """
0871         return SparseVector(size, *args)
0872 
0873     @staticmethod
0874     def dense(*elements):
0875         """
0876         Create a dense vector of 64-bit floats from a Python list of numbers or individual numbers.
0877 
0878         >>> Vectors.dense([1, 2, 3])
0879         DenseVector([1.0, 2.0, 3.0])
0880         >>> Vectors.dense(1.0, 2.0)
0881         DenseVector([1.0, 2.0])
0882         """
0883         if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
0884             # it's list, numpy.array or other iterable object.
0885             elements = elements[0]
0886         return DenseVector(elements)
0887 
0888     @staticmethod
0889     def fromML(vec):
0890         """
0891         Convert a vector from the new mllib-local representation.
0892         This does NOT copy the data; it copies references.
0893 
0894         :param vec: a :py:class:`pyspark.ml.linalg.Vector`
0895         :return: a :py:class:`pyspark.mllib.linalg.Vector`
0896 
0897         .. versionadded:: 2.0.0
0898         """
0899         if isinstance(vec, newlinalg.DenseVector):
0900             return DenseVector(vec.array)
0901         elif isinstance(vec, newlinalg.SparseVector):
0902             return SparseVector(vec.size, vec.indices, vec.values)
0903         else:
0904             raise TypeError("Unsupported vector type %s" % type(vec))
0905 
0906     @staticmethod
0907     def stringify(vector):
0908         """
0909         Converts a vector into a string, which can be recognized by
0910         Vectors.parse().
0911 
0912         >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0]))
0913         '(2,[1],[1.0])'
0914         >>> Vectors.stringify(Vectors.dense([0.0, 1.0]))
0915         '[0.0,1.0]'
0916         """
0917         return str(vector)
0918 
0919     @staticmethod
0920     def squared_distance(v1, v2):
0921         """
0922         Squared distance between two vectors.
0923         v1 and v2 can be of type SparseVector, DenseVector, np.ndarray
0924         or array.array.
0925 
0926         >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
0927         >>> b = Vectors.dense([2, 5, 4, 1])
0928         >>> Vectors.squared_distance(a, b)
0929         51.0
0930         """
0931         v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
0932         return v1.squared_distance(v2)
0933 
0934     @staticmethod
0935     def norm(vector, p):
0936         """
0937         Find the norm of the given vector.
0938         """
0939         return _convert_to_vector(vector).norm(p)
0940 
0941     @staticmethod
0942     def parse(s):
0943         """Parse a string representation back into the Vector.
0944 
0945         >>> Vectors.parse('[2,1,2 ]')
0946         DenseVector([2.0, 1.0, 2.0])
0947         >>> Vectors.parse(' ( 100,  [0],  [2])')
0948         SparseVector(100, {0: 2.0})
0949         """
0950         if s.find('(') == -1 and s.find('[') != -1:
0951             return DenseVector.parse(s)
0952         elif s.find('(') != -1:
0953             return SparseVector.parse(s)
0954         else:
0955             raise ValueError(
0956                 "Cannot find tokens '[' or '(' from the input string.")
0957 
0958     @staticmethod
0959     def zeros(size):
0960         return DenseVector(np.zeros(size))
0961 
0962     @staticmethod
0963     def _equals(v1_indices, v1_values, v2_indices, v2_values):
0964         """
0965         Check equality between sparse/dense vectors;
0966         v1_indices and v2_indices are assumed to be strictly increasing.
0967         """
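        # Two-pointer walk over the nonzero entries of both vectors; explicitly
        # stored zeros are skipped, so e.g. indices [0, 1] with values [0.0, 2.0]
        # compare equal to indices [1] with values [2.0].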
0968         v1_size = len(v1_values)
0969         v2_size = len(v2_values)
0970         k1 = 0
0971         k2 = 0
0972         all_equal = True
0973         while all_equal:
0974             while k1 < v1_size and v1_values[k1] == 0:
0975                 k1 += 1
0976             while k2 < v2_size and v2_values[k2] == 0:
0977                 k2 += 1
0978 
0979             if k1 >= v1_size or k2 >= v2_size:
0980                 return k1 >= v1_size and k2 >= v2_size
0981 
0982             all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
0983             k1 += 1
0984             k2 += 1
0985         return all_equal
0986 
0987 
0988 class Matrix(object):
0989     """
0990     Represents a local matrix.
0991     """
0992 
0993     __UDT__ = MatrixUDT()
0994 
0995     def __init__(self, numRows, numCols, isTransposed=False):
0996         self.numRows = numRows
0997         self.numCols = numCols
0998         self.isTransposed = isTransposed
0999 
1000     def toArray(self):
1001         """
1002         Returns its elements in a NumPy ndarray.
1003         """
1004         raise NotImplementedError
1005 
1006     def asML(self):
1007         """
1008         Convert this matrix to the new mllib-local representation.
1009         This does NOT copy the data; it copies references.
1010         """
1011         raise NotImplementedError
1012 
1013     @staticmethod
1014     def _convert_to_array(array_like, dtype):
1015         """
1016         Convert Matrix attributes which are array-like or buffer to array.
1017         """
1018         if isinstance(array_like, bytes):
1019             return np.frombuffer(array_like, dtype=dtype)
1020         return np.asarray(array_like, dtype=dtype)
1021 
1022 
1023 class DenseMatrix(Matrix):
1024     """
1025     Column-major dense matrix.
1026     """
1027     def __init__(self, numRows, numCols, values, isTransposed=False):
1028         Matrix.__init__(self, numRows, numCols, isTransposed)
1029         values = self._convert_to_array(values, np.float64)
1030         assert len(values) == numRows * numCols
1031         self.values = values
1032 
1033     def __reduce__(self):
1034         return DenseMatrix, (
1035             self.numRows, self.numCols, self.values.tostring(),
1036             int(self.isTransposed))
1037 
1038     def __str__(self):
1039         """
1040         Pretty printing of a DenseMatrix
1041 
1042         >>> dm = DenseMatrix(2, 2, range(4))
1043         >>> print(dm)
1044         DenseMatrix([[ 0.,  2.],
1045                      [ 1.,  3.]])
1046         >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
1047         >>> print(dm)
1048         DenseMatrix([[ 0.,  1.],
1049                      [ 2.,  3.]])
1050         """
1051         # Inspired by __repr__ in scipy matrices.
1052         array_lines = repr(self.toArray()).splitlines()
1053 
1054         # We need to adjust six spaces which is the difference in number
1055         # of letters between "DenseMatrix" and "array"
1056         x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
1057         return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
1058 
1059     def __repr__(self):
1060         """
1061         Representation of a DenseMatrix
1062 
1063         >>> dm = DenseMatrix(2, 2, range(4))
1064         >>> dm
1065         DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
1066         """
1067         # If the number of values are less than seventeen then return as it is.
1068         # Else return first eight values and last eight values.
1069         if len(self.values) < 17:
1070             entries = _format_float_list(self.values)
1071         else:
1072             entries = (
1073                 _format_float_list(self.values[:8]) +
1074                 ["..."] +
1075                 _format_float_list(self.values[-8:])
1076             )
1077 
1078         entries = ", ".join(entries)
1079         return "DenseMatrix({0}, {1}, [{2}], {3})".format(
1080             self.numRows, self.numCols, entries, self.isTransposed)
1081 
1082     def toArray(self):
1083         """
1084         Return an numpy.ndarray
1085 
1086         >>> m = DenseMatrix(2, 2, range(4))
1087         >>> m.toArray()
1088         array([[ 0.,  2.],
1089                [ 1.,  3.]])
1090         """
1091         if self.isTransposed:
1092             return np.asfortranarray(
1093                 self.values.reshape((self.numRows, self.numCols)))
1094         else:
1095             return self.values.reshape((self.numRows, self.numCols), order='F')
1096 
1097     def toSparse(self):
1098         """Convert to SparseMatrix"""
1099         if self.isTransposed:
1100             values = np.ravel(self.toArray(), order='F')
1101         else:
1102             values = self.values
1103         indices = np.nonzero(values)[0]
1104         colCounts = np.bincount(indices // self.numRows)
1105         colPtrs = np.cumsum(np.hstack(
1106             (0, colCounts, np.zeros(self.numCols - colCounts.size))))
1107         values = values[indices]
1108         rowIndices = indices % self.numRows
1109 
1110         return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
1111 
1112     def asML(self):
1113         """
1114         Convert this matrix to the new mllib-local representation.
1115         This does NOT copy the data; it copies references.
1116 
1117         :return: :py:class:`pyspark.ml.linalg.DenseMatrix`
1118 
1119         .. versionadded:: 2.0.0
1120         """
1121         return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed)
1122 
1123     def __getitem__(self, indices):
1124         i, j = indices
1125         if i < 0 or i >= self.numRows:
1126             raise IndexError("Row index %d is out of range [0, %d)"
1127                              % (i, self.numRows))
1128         if j >= self.numCols or j < 0:
1129             raise IndexError("Column index %d is out of range [0, %d)"
1130                              % (j, self.numCols))
1131 
1132         if self.isTransposed:
1133             return self.values[i * self.numCols + j]
1134         else:
1135             return self.values[i + j * self.numRows]
1136 
1137     def __eq__(self, other):
1138         if (self.numRows != other.numRows or self.numCols != other.numCols):
1139             return False
1140         if isinstance(other, SparseMatrix):
1141             return np.all(self.toArray() == other.toArray())
1142 
1143         self_values = np.ravel(self.toArray(), order='F')
1144         other_values = np.ravel(other.toArray(), order='F')
1145         return np.all(self_values == other_values)
1146 
1147 
1148 class SparseMatrix(Matrix):
1149     """Sparse Matrix stored in CSC format."""
1150     def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
1151                  isTransposed=False):
1152         Matrix.__init__(self, numRows, numCols, isTransposed)
1153         self.colPtrs = self._convert_to_array(colPtrs, np.int32)
1154         self.rowIndices = self._convert_to_array(rowIndices, np.int32)
1155         self.values = self._convert_to_array(values, np.float64)
1156 
1157         if self.isTransposed:
1158             if self.colPtrs.size != numRows + 1:
1159                 raise ValueError("Expected colPtrs of size %d, got %d."
1160                                  % (numRows + 1, self.colPtrs.size))
1161         else:
1162             if self.colPtrs.size != numCols + 1:
1163                 raise ValueError("Expected colPtrs of size %d, got %d."
1164                                  % (numCols + 1, self.colPtrs.size))
1165         if self.rowIndices.size != self.values.size:
1166             raise ValueError("Expected rowIndices of length %d, got %d."
1167                              % (self.rowIndices.size, self.values.size))
1168 
1169     def __str__(self):
1170         """
1171         Pretty printing of a SparseMatrix
1172 
1173         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1174         >>> print(sm1)
1175         2 X 2 CSCMatrix
1176         (0,0) 2.0
1177         (1,0) 3.0
1178         (1,1) 4.0
1179         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
1180         >>> print(sm1)
1181         2 X 2 CSRMatrix
1182         (0,0) 2.0
1183         (0,1) 3.0
1184         (1,1) 4.0
1185         """
1186         spstr = "{0} X {1} ".format(self.numRows, self.numCols)
1187         if self.isTransposed:
1188             spstr += "CSRMatrix\n"
1189         else:
1190             spstr += "CSCMatrix\n"
1191 
1192         cur_col = 0
1193         smlist = []
1194 
1195         # Display first 16 values.
1196         if len(self.values) <= 16:
1197             zipindval = zip(self.rowIndices, self.values)
1198         else:
1199             zipindval = zip(self.rowIndices[:16], self.values[:16])
1200         for i, (rowInd, value) in enumerate(zipindval):
1201             if self.colPtrs[cur_col + 1] <= i:
1202                 cur_col += 1
1203             if self.isTransposed:
1204                 smlist.append('({0},{1}) {2}'.format(
1205                     cur_col, rowInd, _format_float(value)))
1206             else:
1207                 smlist.append('({0},{1}) {2}'.format(
1208                     rowInd, cur_col, _format_float(value)))
1209         spstr += "\n".join(smlist)
1210 
1211         if len(self.values) > 16:
1212             spstr += "\n.." * 2
1213         return spstr
1214 
1215     def __repr__(self):
1216         """
1217         Representation of a SparseMatrix
1218 
1219         >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1220         >>> sm1
1221         SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
1222         """
1223         rowIndices = list(self.rowIndices)
1224         colPtrs = list(self.colPtrs)
1225 
1226         if len(self.values) <= 16:
1227             values = _format_float_list(self.values)
1228 
1229         else:
1230             values = (
1231                 _format_float_list(self.values[:8]) +
1232                 ["..."] +
1233                 _format_float_list(self.values[-8:])
1234             )
1235             rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
1236 
1237         if len(self.colPtrs) > 16:
1238             colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
1239 
1240         values = ", ".join(values)
1241         rowIndices = ", ".join([str(ind) for ind in rowIndices])
1242         colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
1243         return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
1244             self.numRows, self.numCols, colPtrs, rowIndices,
1245             values, self.isTransposed)
1246 
1247     def __reduce__(self):
1248         return SparseMatrix, (
1249             self.numRows, self.numCols, self.colPtrs.tostring(),
1250             self.rowIndices.tostring(), self.values.tostring(),
1251             int(self.isTransposed))
1252 
1253     def __getitem__(self, indices):
1254         i, j = indices
1255         if i < 0 or i >= self.numRows:
1256             raise IndexError("Row index %d is out of range [0, %d)"
1257                              % (i, self.numRows))
1258         if j < 0 or j >= self.numCols:
1259             raise IndexError("Column index %d is out of range [0, %d)"
1260                              % (j, self.numCols))
1261 
1262         # If a CSR matrix is given, then the row index should be searched
1263         # for in ColPtrs, and the column index should be searched for in the
1264         # corresponding slice obtained from rowIndices.
1265         if self.isTransposed:
1266             j, i = i, j
1267 
1268         colStart = self.colPtrs[j]
1269         colEnd = self.colPtrs[j + 1]
1270         nz = self.rowIndices[colStart: colEnd]
1271         ind = np.searchsorted(nz, i) + colStart
1272         if ind < colEnd and self.rowIndices[ind] == i:
1273             return self.values[ind]
1274         else:
1275             return 0.0
1276 
1277     def toArray(self):
1278         """
1279         Return an numpy.ndarray
1280         """
1281         A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
1282         for k in xrange(self.colPtrs.size - 1):
1283             startptr = self.colPtrs[k]
1284             endptr = self.colPtrs[k + 1]
1285             if self.isTransposed:
1286                 A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
1287             else:
1288                 A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
1289         return A
1290 
1291     def toDense(self):
1292         densevals = np.ravel(self.toArray(), order='F')
1293         return DenseMatrix(self.numRows, self.numCols, densevals)
1294 
1295     def asML(self):
1296         """
1297         Convert this matrix to the new mllib-local representation.
1298         This does NOT copy the data; it copies references.
1299 
1300         :return: :py:class:`pyspark.ml.linalg.SparseMatrix`
1301 
1302         .. versionadded:: 2.0.0
1303         """
1304         return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices,
1305                                       self.values, self.isTransposed)
1306 
1307     # TODO: More efficient implementation:
1308     def __eq__(self, other):
1309         return np.all(self.toArray() == other.toArray())
1310 
1311 
1312 class Matrices(object):
1313     @staticmethod
1314     def dense(numRows, numCols, values):
1315         """
1316         Create a DenseMatrix
1317         """
1318         return DenseMatrix(numRows, numCols, values)
1319 
1320     @staticmethod
1321     def sparse(numRows, numCols, colPtrs, rowIndices, values):
1322         """
1323         Create a SparseMatrix
1324         """
1325         return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
1326 
1327     @staticmethod
1328     def fromML(mat):
1329         """
1330         Convert a matrix from the new mllib-local representation.
1331         This does NOT copy the data; it copies references.
1332 
1333         :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
1334         :return: a :py:class:`pyspark.mllib.linalg.Matrix`
1335 
1336         .. versionadded:: 2.0.0
1337         """
1338         if isinstance(mat, newlinalg.DenseMatrix):
1339             return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
1340         elif isinstance(mat, newlinalg.SparseMatrix):
1341             return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
1342                                 mat.values, mat.isTransposed)
1343         else:
1344             raise TypeError("Unsupported matrix type %s" % type(mat))
1345 
1346 
1347 class QRDecomposition(object):
1348     """
1349     Represents QR factors.
1350     """
1351     def __init__(self, Q, R):
1352         self._Q = Q
1353         self._R = R
1354 
1355     @property
1356     @since('2.0.0')
1357     def Q(self):
1358         """
1359         An orthogonal matrix Q in a QR decomposition.
1360         May be null if not computed.
1361         """
1362         return self._Q
1363 
1364     @property
1365     @since('2.0.0')
1366     def R(self):
1367         """
1368         An upper triangular matrix R in a QR decomposition.
1369         """
1370         return self._R
1371 
1372 
1373 def _test():
1374     import doctest
1375     import numpy
1376     try:
1377         # Numpy 1.14+ changed it's string format.
1378         numpy.set_printoptions(legacy='1.13')
1379     except TypeError:
1380         pass
1381     (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
1382     if failure_count:
1383         sys.exit(-1)
1384 
1385 if __name__ == "__main__":
1386     _test()