Commit 889dde8

Support scipy.sparse matrices in all our algorithms and models
1 parent ab244d1 commit 889dde8

7 files changed: +285 -21 lines changed


python/epydoc.conf

Lines changed: 2 additions & 1 deletion
@@ -33,5 +33,6 @@ target: docs/
 private: no
 
 exclude: pyspark.cloudpickle pyspark.worker pyspark.join
-         pyspark.java_gateway pyspark.examples pyspark.shell pyspark.test
+         pyspark.java_gateway pyspark.examples pyspark.shell pyspark.tests
          pyspark.rddsampler pyspark.daemon pyspark.mllib._common
+         pyspark.mllib.tests

python/pyspark/mllib/_common.py

Lines changed: 64 additions & 5 deletions
@@ -22,6 +22,29 @@
 from pyspark.mllib.linalg import SparseVector
 from pyspark.serializers import Serializer
 
+"""
+Common utilities shared throughout MLlib, primarily for dealing with
+different data types. These include:
+- Serialization utilities to / from byte arrays that Java can handle
+- Serializers for other data types, like ALS Rating objects
+- Common methods for linear models
+- Methods to deal with the different vector types we support, such as
+  SparseVector and scipy.sparse matrices.
+"""
+
+# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
+# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
+
+_have_scipy = False
+_scipy_issparse = None
+try:
+    import scipy.sparse
+    _have_scipy = True
+    _scipy_issparse = scipy.sparse.issparse
+except:
+    # No SciPy in environment, but that's okay
+    pass
+
 # Dense double vector format:
 #
 # [1-byte 1] [4-byte length] [length*8 bytes of data]
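
The guarded import above is the standard optional-dependency pattern: probe for SciPy once at import time and record the result in module-level flags, so the rest of the module can branch cheaply. A minimal standalone sketch of the same pattern (the describe helper is hypothetical, not part of the commit; it also narrows the bare except: to except ImportError:, the more idiomatic guard):

    _have_scipy = False
    _scipy_issparse = None
    try:
        import scipy.sparse
        _have_scipy = True
        _scipy_issparse = scipy.sparse.issparse
    except ImportError:
        pass  # no SciPy; sparse-matrix support is simply disabled

    def describe(v):
        # Branch on the capability flag instead of re-importing at each call.
        if _have_scipy and _scipy_issparse(v):
            return "scipy sparse matrix of shape %s" % (v.shape,)
        return "dense or MLlib type: %s" % type(v).__name__
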
@@ -67,6 +90,7 @@ def _serialize_double_vector(v):
     >>> array_equal(y, array([1.0, 2.0, 3.0]))
     True
     """
+    v = _convert_vector(v)
     if type(v) == ndarray:
         return _serialize_dense_vector(v)
     elif type(v) == SparseVector:
@@ -201,6 +225,7 @@ def _deserialize_double_matrix(ba):
 def _linear_predictor_typecheck(x, coeffs):
     """Check that x is a one-dimensional vector of the right shape.
     This is a temporary hackaround until I actually implement bulk predict."""
+    x = _convert_vector(x)
     if type(x) == ndarray:
         if x.ndim == 1:
             if x.shape != coeffs.shape:
@@ -245,23 +270,20 @@ def predict(self, x):
         """Predict the value of the dependent variable given a vector x"""
         """containing values for the independent variables."""
         _linear_predictor_typecheck(x, self._coeff)
-        return x.dot(self._coeff) + self._intercept
+        return _dot(x, self._coeff) + self._intercept
 
 # If we weren't given initial weights, take a zero vector of the appropriate
 # length.
 def _get_initial_weights(initial_weights, data):
     if initial_weights is None:
-        initial_weights = data.first()
+        initial_weights = _convert_vector(data.first())
         if type(initial_weights) == ndarray:
             if initial_weights.ndim != 1:
                 raise TypeError("At least one data element has "
                                 + initial_weights.ndim + " dimensions, which is not 1")
             initial_weights = numpy.ones([initial_weights.shape[0] - 1])
         elif type(initial_weights) == SparseVector:
             initial_weights = numpy.ones([initial_weights.size - 1])
-        else:
-            raise TypeError("At least one data element has type "
-                            + type(initial_weights).__name__ + " which is not a vector")
     return initial_weights
 
 # train_func should take two parameters, namely data and initial_weights, and
@@ -327,6 +349,8 @@ def _squared_distance(v1, v2):
     >>> _squared_distance(sparse1, sparse2)
     2.0
     """
+    v1 = _convert_vector(v1)
+    v2 = _convert_vector(v2)
     if type(v1) == ndarray and type(v2) == ndarray:
         diff = v1 - v2
         return diff.dot(diff)
@@ -335,6 +359,41 @@ def _squared_distance(v1, v2):
     else:
         return v1.squared_distance(v2)
 
+def _convert_vector(vec):
+    """
+    Convert a vector to a format we support internally. This does
+    the following:
+
+    * For dense NumPy vectors (ndarray), returns them as is
+    * For our SparseVector class, returns that as is
+    * For scipy.sparse.*_matrix column vectors, converts them to
+      our own SparseVector type.
+
+    This should be called before passing any data to our algorithms
+    or attempting to serialize it to Java.
+    """
+    if type(vec) == ndarray or type(vec) == SparseVector:
+        return vec
+    elif _have_scipy:
+        if _scipy_issparse(vec):
+            assert vec.shape[1] == 1, "Expected column vector"
+            csc = vec.tocsc()
+            return SparseVector(vec.shape[0], csc.indices, csc.data)
+    raise TypeError("Expected NumPy array, SparseVector, or scipy.sparse matrix")
+
+def _dot(vec, target):
+    """
+    Compute the dot product of a vector of the types we support
+    (Numpy dense, SparseVector, or SciPy sparse) and a target NumPy
+    array that is either 1- or 2-dimensional. Equivalent to calling
+    numpy.dot of the two vectors, but for SciPy ones, we have to
+    transpose them because they're column vectors.
+    """
+    if type(vec) == ndarray or type(vec) == SparseVector:
+        return vec.dot(target)
+    else:
+        return vec.transpose().dot(target)[0]
+
 def _test():
     import doctest
     globs = globals().copy()
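
Together, _convert_vector and _dot are what let the predict methods in the files below accept SciPy input. A short usage sketch, not part of the commit, assuming SciPy is installed; it shows the (size, indices, values) triple that _convert_vector extracts from the CSC form, and the transpose trick _dot applies to column vectors:

    import numpy as np
    import scipy.sparse

    # A 4x1 sparse column vector with nonzeros at rows 1 and 3.
    col = scipy.sparse.csc_matrix(np.array([[0.0], [1.0], [0.0], [5.5]]))
    csc = col.tocsc()
    print(col.shape[0], list(csc.indices), list(csc.data))
    # 4 [1, 3] [1.0, 5.5]  -- the arguments handed to SparseVector(size, indices, values)

    # The transpose trick in _dot: the column vector is 2-D (4x1), so take its
    # 1x4 transpose, multiply by the target, and index row 0 of the result.
    target = np.array([1.0, 2.0, 3.0, 4.0])
    print(col.transpose().dot(target)[0])    # 1.0*2.0 + 5.5*4.0 = 24.0
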

python/pyspark/mllib/classification.py

Lines changed: 4 additions & 4 deletions
@@ -20,7 +20,7 @@
 from numpy import array, shape
 from pyspark import SparkContext
 from pyspark.mllib._common import \
-    _get_unmangled_rdd, _get_unmangled_double_vector_rdd, \
+    _dot, _get_unmangled_rdd, _get_unmangled_double_vector_rdd, \
     _serialize_double_matrix, _deserialize_double_matrix, \
     _serialize_double_vector, _deserialize_double_vector, \
     _get_initial_weights, _serialize_rating, _regression_train_wrapper, \

@@ -55,7 +55,7 @@ class LogisticRegressionModel(LinearModel):
     """
     def predict(self, x):
         _linear_predictor_typecheck(x, self._coeff)
-        margin = x.dot(self._coeff) + self._intercept
+        margin = _dot(x, self._coeff) + self._intercept
         prob = 1/(1 + exp(-margin))
         return 1 if prob > 0.5 else 0
 

@@ -91,7 +91,7 @@ class SVMModel(LinearModel):
     """
     def predict(self, x):
         _linear_predictor_typecheck(x, self._coeff)
-        margin = x.dot(self._coeff) + self._intercept
+        margin = _dot(x, self._coeff) + self._intercept
         return 1 if margin >= 0 else 0
 
 class SVMWithSGD(object):

@@ -138,7 +138,7 @@ def __init__(self, labels, pi, theta):
 
     def predict(self, x):
         """Return the most likely class for a data vector x"""
-        return self.labels[numpy.argmax(self.pi + x.dot(self.theta))]
+        return self.labels[numpy.argmax(self.pi + _dot(x, self.theta))]
 
 class NaiveBayes(object):
     @classmethod
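
The _dot substitution above leaves each decision rule unchanged; only the dot product becomes type-aware. For reference, a standalone sketch of the rule LogisticRegressionModel.predict implements (logistic_predict is a hypothetical free function, NumPy only, not the MLlib API):

    import numpy as np

    def logistic_predict(x, coeff, intercept):
        # Sigmoid of the margin, thresholded at 0.5, as in predict above.
        margin = np.dot(x, coeff) + intercept
        prob = 1.0 / (1.0 + np.exp(-margin))
        return 1 if prob > 0.5 else 0

    print(logistic_predict(np.array([1.0, 2.0]), np.array([0.5, -0.25]), 1.0))
    # margin = 0.5 - 0.5 + 1.0 = 1.0 -> prob ~= 0.73 -> prints 1
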

python/pyspark/mllib/clustering.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def __init__(self, centers_):
     def predict(self, x):
         """Find the cluster to which x belongs in this model."""
         best = 0
-        best_distance = 1e75
+        best_distance = float("inf")
         for i in range(0, self.centers.shape[0]):
             distance = _squared_distance(x, self.centers[i])
             if distance < best_distance:
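
A note on this change: float("inf") is the natural starting value for a running minimum, since every finite distance compares below it, whereas a magic constant like 1e75 is merely "probably large enough". A two-line illustration:

    best_distance = float("inf")
    for distance in [9.0, 4.0, 16.0]:
        best_distance = min(best_distance, distance)  # any finite value wins
    print(best_distance)                              # 4.0
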

python/pyspark/mllib/linalg.py

Lines changed: 13 additions & 8 deletions
@@ -16,7 +16,7 @@
 #
 
 """
-MLlib utilities for working with vectors. For dense vectors, MLlib
+MLlib utilities for linear algebra. For dense vectors, MLlib
 uses the NumPy C{array} type, so you can simply pass NumPy arrays
 around. For sparse vectors, users can construct a L{SparseVector}
 object from MLlib or pass SciPy C{scipy.sparse} column vectors if
@@ -29,7 +29,7 @@
 class SparseVector(object):
     """
     A simple sparse vector class for passing data to MLlib. Users may
-    alternatively pass use SciPy's {scipy.sparse} data types.
+    alternatively pass SciPy's {scipy.sparse} data types.
     """
 
     def __init__(self, size, *args):
@@ -40,7 +40,7 @@ def __init__(self, size, *args):
 
         @param size: Size of the vector.
        @param args: Non-zero entries, as a dictionary, list of tuples,
-               or two sorted lists containing indices and values.
+                     or two sorted lists containing indices and values.
 
         >>> print SparseVector(4, {1: 1.0, 3: 5.5})
         [1: 1.0, 3: 5.5]
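
For reference, the three argument forms the docstring above describes, using the doctest's values (all three produce the same vector):

    from pyspark.mllib.linalg import SparseVector

    v1 = SparseVector(4, {1: 1.0, 3: 5.5})        # dictionary of index: value
    v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])    # list of (index, value) tuples
    v3 = SparseVector(4, [1, 3], [1.0, 5.5])      # two sorted lists: indices, values
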
@@ -115,7 +115,7 @@ def squared_distance(self, other):
         >>> a.squared_distance(a)
         0.0
         >>> a.squared_distance(array([1., 2., 3., 4.]))
-        1.0
+        11.0
         >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
         >>> a.squared_distance(b)
         30.0
@@ -125,9 +125,14 @@ def squared_distance(self, other):
         if type(other) == ndarray:
             if other.ndim == 1:
                 result = 0.0
-                for i in xrange(len(self.indices)):
-                    diff = self.values[i] - other[self.indices[i]]
-                    result += diff * diff
+                j = 0   # index into our own array
+                for i in xrange(other.shape[0]):
+                    if j < len(self.indices) and self.indices[j] == i:
+                        diff = self.values[j] - other[i]
+                        result += diff * diff
+                        j += 1
+                    else:
+                        result += other[i] * other[i]
                 return result
             else:
                 raise Exception("Cannot call squared_distance with %d-dimensional array" %
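
The corrected doctest value above (11.0 rather than 1.0) follows directly from this loop change. The vector a is defined outside the hunk; a = SparseVector(4, [1, 3], [3.0, 4.0]) is assumed here, consistent with the other doctest results (0.0 against itself, 30.0 against b). The old loop visited only a's nonzero indices, giving (3-2)^2 + (4-4)^2 = 1.0 and silently ignoring the dense entries at positions 0 and 2; the new merge-style loop visits every position of the dense operand:

    # Standalone re-run of the new loop, with indices/values as assumed above.
    indices, values = [1, 3], [3.0, 4.0]
    other = [1.0, 2.0, 3.0, 4.0]

    result, j = 0.0, 0
    for i in range(len(other)):
        if j < len(indices) and indices[j] == i:
            diff = values[j] - other[i]       # both vectors nonzero here
            result += diff * diff
            j += 1
        else:
            result += other[i] * other[i]     # sparse side is zero here
    print(result)  # (0-1)^2 + (3-2)^2 + (0-3)^2 + (4-4)^2 = 11.0
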
@@ -191,7 +196,7 @@ def __ne__(self, other):
 
 class Vectors(object):
     """
-    Factory methods to create MLlib vectors. Note that dense vectors
+    Factory methods for working with vectors. Note that dense vectors
     are simply represented as NumPy array objects, so there is no need
     to convert them for use in MLlib. For sparse vectors, the factory
     methods in this class create an MLlib-compatible type, or users

python/pyspark/mllib/regression.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 from numpy import array, dot
 from pyspark import SparkContext
 from pyspark.mllib._common import \
-    _get_unmangled_rdd, _get_unmangled_double_vector_rdd, \
+    _dot, _get_unmangled_rdd, _get_unmangled_double_vector_rdd, \
     _serialize_double_matrix, _deserialize_double_matrix, \
     _serialize_double_vector, _deserialize_double_vector, \
     _get_initial_weights, _serialize_rating, _regression_train_wrapper, \

@@ -44,7 +44,7 @@ def predict(self, x):
         """Predict the value of the dependent variable given a vector x"""
         """containing values for the independent variables."""
         _linear_predictor_typecheck(x, self._coeff)
-        return x.dot(self._coeff) + self._intercept
+        return _dot(x, self._coeff) + self._intercept
 
 class LinearRegressionModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit.
