2222from pyspark .mllib .linalg import SparseVector
2323from pyspark .serializers import Serializer
2424
25+ """
26+ Common utilities shared throughout MLlib, primarily for dealing with
27+ different data types. These include:
28+ - Serialization utilities to / from byte arrays that Java can handle
29+ - Serializers for other data types, like ALS Rating objects
30+ - Common methods for linear models
31+ - Methods to deal with the different vector types we support, such as
32+ SparseVector and scipy.sparse matrices.
33+ """
34+
35+ # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
36+ # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
37+
# Module-level flags recording whether scipy.sparse could be imported.
# _scipy_issparse stays None when SciPy is unavailable.
_have_scipy = False
_scipy_issparse = None
try:
    import scipy.sparse
except ImportError:
    # No SciPy in environment, but that's okay. Only ImportError is
    # swallowed so that unrelated failures (and KeyboardInterrupt /
    # SystemExit) are not silently hidden.
    pass
else:
    _have_scipy = True
    _scipy_issparse = scipy.sparse.issparse
47+
2548# Dense double vector format:
2649#
2750# [1-byte 1] [4-byte length] [length*8 bytes of data]
@@ -67,6 +90,7 @@ def _serialize_double_vector(v):
6790 >>> array_equal(y, array([1.0, 2.0, 3.0]))
6891 True
6992 """
93+ v = _convert_vector (v )
7094 if type (v ) == ndarray :
7195 return _serialize_dense_vector (v )
7296 elif type (v ) == SparseVector :
@@ -201,6 +225,7 @@ def _deserialize_double_matrix(ba):
201225def _linear_predictor_typecheck (x , coeffs ):
202226 """Check that x is a one-dimensional vector of the right shape.
203227 This is a temporary hackaround until I actually implement bulk predict."""
228+ x = _convert_vector (x )
204229 if type (x ) == ndarray :
205230 if x .ndim == 1 :
206231 if x .shape != coeffs .shape :
@@ -245,23 +270,20 @@ def predict(self, x):
245270 """Predict the value of the dependent variable given a vector x"""
246271 """containing values for the independent variables."""
247272 _linear_predictor_typecheck (x , self ._coeff )
248- return x . dot ( self ._coeff ) + self ._intercept
273+ return _dot ( x , self ._coeff ) + self ._intercept
249274
# If we weren't given initial weights, build a default vector of the
# appropriate length from the first data element.
def _get_initial_weights(initial_weights, data):
    """
    Return the initial weight vector to use for training.

    If initial_weights is not None it is returned unchanged. Otherwise the
    first element of data (data.first()) is passed through _convert_vector
    and a NumPy vector of ones with one fewer entry than that element is
    returned.

    NOTE(review): the "- 1" presumably accounts for a label stored next to
    the features in each data element -- confirm against callers.
    NOTE(review): this default was previously described as a zero vector,
    but the code builds it with numpy.ones -- confirm which is intended.

    Raises TypeError if the first data element is an ndarray that is not
    one-dimensional.
    """
    if initial_weights is not None:
        return initial_weights

    first = _convert_vector(data.first())
    if type(first) == ndarray:
        if first.ndim != 1:
            # ndim is an int; it must be converted before concatenation,
            # otherwise the message itself fails to build.
            raise TypeError("At least one data element has "
                            + str(first.ndim) + " dimensions, which is not 1")
        return numpy.ones([first.shape[0] - 1])
    if type(first) == SparseVector:
        return numpy.ones([first.size - 1])
    # Any other type is handed back as converted, matching the fall-through
    # of the original if/elif chain.
    return first
266288
267289# train_func should take two parameters, namely data and initial_weights, and
@@ -327,6 +349,8 @@ def _squared_distance(v1, v2):
327349 >>> _squared_distance(sparse1, sparse2)
328350 2.0
329351 """
352+ v1 = _convert_vector (v1 )
353+ v2 = _convert_vector (v2 )
330354 if type (v1 ) == ndarray and type (v2 ) == ndarray :
331355 diff = v1 - v2
332356 return diff .dot (diff )
@@ -335,6 +359,41 @@ def _squared_distance(v1, v2):
335359 else :
336360 return v1 .squared_distance (v2 )
337361
def _convert_vector(vec):
    """
    Convert a vector to a format we support internally. This does
    the following:

    * For dense NumPy vectors (ndarray), returns them as is
    * For our SparseVector class, returns that as is
    * For scipy.sparse.*_matrix column vectors, converts them to
      our own SparseVector type.

    This should be called before passing any data to our algorithms
    or attempting to serialize it to Java.

    Raises ValueError if a scipy.sparse input is not a column vector
    (shape (n, 1)), and TypeError for any other unsupported input.
    """
    # Exact type comparison is kept on purpose: other helpers in this
    # module dispatch on type(x) == ndarray, so subclasses must not be
    # passed through here as if they were supported.
    if type(vec) == ndarray or type(vec) == SparseVector:
        return vec
    if _have_scipy and _scipy_issparse(vec):
        # Explicit raise instead of assert: asserts are removed under -O,
        # which would let a multi-column matrix be flattened silently.
        if len(vec.shape) != 2 or vec.shape[1] != 1:
            raise ValueError("Expected column vector")
        csc = vec.tocsc()
        # For a single CSC column, indices holds the row numbers of the
        # stored entries and data holds their values.
        return SparseVector(vec.shape[0], csc.indices, csc.data)
    raise TypeError("Expected NumPy array, SparseVector, or scipy.sparse matrix")
383+
def _dot(vec, target):
    """
    Dot product of vec with target.

    vec may be a NumPy ndarray, a SparseVector, or anything else that
    offers transpose() and dot() (intended for SciPy sparse column
    vectors). ndarray and SparseVector inputs delegate directly to their
    own dot(); any other input is transposed first and the first row of
    the product is returned.
    """
    vec_type = type(vec)
    if vec_type != ndarray and vec_type != SparseVector:
        # Column vector case: turn it into a row before multiplying and
        # unwrap the single resulting row.
        as_row = vec.transpose()
        return as_row.dot(target)[0]
    return vec.dot(target)
396+
338397def _test ():
339398 import doctest
340399 globs = globals ().copy ()
0 commit comments