Skip to content

Commit 881fef7

Browse files
committed
Added a sparse vector in Python and made Java-Python format more compact
1 parent 07d72fe commit 881fef7

File tree

3 files changed

+176
-64
lines changed

3 files changed

+176
-64
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,24 @@ import org.apache.spark.rdd.RDD
3131
/**
3232
* :: DeveloperApi ::
3333
* The Java stubs necessary for the Python mllib bindings.
34+
*
35+
* See mllib/python/pyspark._common.py for the mutually agreed upon data format.
3436
*/
3537
@DeveloperApi
3638
class PythonMLLibAPI extends Serializable {
3739
private def deserializeDoubleVector(bytes: Array[Byte]): Array[Double] = {
3840
val packetLength = bytes.length
39-
if (packetLength < 16) {
41+
if (packetLength < 5) {
4042
throw new IllegalArgumentException("Byte array too short.")
4143
}
4244
val bb = ByteBuffer.wrap(bytes)
4345
bb.order(ByteOrder.nativeOrder())
44-
val magic = bb.getLong()
46+
val magic = bb.get()
4547
if (magic != 1) {
4648
throw new IllegalArgumentException("Magic " + magic + " is wrong.")
4749
}
48-
val length = bb.getLong()
49-
if (packetLength != 16 + 8 * length) {
50+
val length = bb.getInt()
51+
if (packetLength != 5 + 8 * length) {
5052
throw new IllegalArgumentException("Length " + length + " is wrong.")
5153
}
5254
val db = bb.asDoubleBuffer()
@@ -57,30 +59,30 @@ class PythonMLLibAPI extends Serializable {
5759

5860
private def serializeDoubleVector(doubles: Array[Double]): Array[Byte] = {
5961
val len = doubles.length
60-
val bytes = new Array[Byte](16 + 8 * len)
62+
val bytes = new Array[Byte](5 + 8 * len)
6163
val bb = ByteBuffer.wrap(bytes)
6264
bb.order(ByteOrder.nativeOrder())
63-
bb.putLong(1)
64-
bb.putLong(len)
65+
bb.put(1: Byte)
66+
bb.putInt(len)
6567
val db = bb.asDoubleBuffer()
6668
db.put(doubles)
6769
bytes
6870
}
6971

7072
private def deserializeDoubleMatrix(bytes: Array[Byte]): Array[Array[Double]] = {
7173
val packetLength = bytes.length
72-
if (packetLength < 24) {
74+
if (packetLength < 9) {
7375
throw new IllegalArgumentException("Byte array too short.")
7476
}
7577
val bb = ByteBuffer.wrap(bytes)
7678
bb.order(ByteOrder.nativeOrder())
77-
val magic = bb.getLong()
79+
val magic = bb.get()
7880
if (magic != 2) {
7981
throw new IllegalArgumentException("Magic " + magic + " is wrong.")
8082
}
81-
val rows = bb.getLong()
82-
val cols = bb.getLong()
83-
if (packetLength != 24 + 8 * rows * cols) {
83+
val rows = bb.getInt()
84+
val cols = bb.getInt()
85+
if (packetLength != 9 + 8 * rows * cols) {
8486
throw new IllegalArgumentException("Size " + rows + "x" + cols + " is wrong.")
8587
}
8688
val db = bb.asDoubleBuffer()
@@ -98,12 +100,12 @@ class PythonMLLibAPI extends Serializable {
98100
if (rows > 0) {
99101
cols = doubles(0).length
100102
}
101-
val bytes = new Array[Byte](24 + 8 * rows * cols)
103+
val bytes = new Array[Byte](9 + 8 * rows * cols)
102104
val bb = ByteBuffer.wrap(bytes)
103105
bb.order(ByteOrder.nativeOrder())
104-
bb.putLong(2)
105-
bb.putLong(rows)
106-
bb.putLong(cols)
106+
bb.put(2: Byte)
107+
bb.putInt(rows)
108+
bb.putInt(cols)
107109
val db = bb.asDoubleBuffer()
108110
for (i <- 0 until rows) {
109111
db.put(doubles(i))

python/pyspark/mllib/_common.py

Lines changed: 47 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,24 @@
1515
# limitations under the License.
1616
#
1717

18+
import struct
19+
import numpy
1820
from numpy import ndarray, float64, int64, int32, ones, array_equal, array, dot, shape, complex, issubdtype
1921
from pyspark import SparkContext, RDD
20-
import numpy as np
21-
22+
from pyspark.mllib.linalg import SparseVector
2223
from pyspark.serializers import Serializer
23-
import struct
2424

25-
# Double vector format:
25+
# Dense double vector format:
2626
#
2727
# [1-byte 1] [4-byte length] [length*8 bytes of data]
2828
#
29+
# Sparse double vector format:
30+
#
31+
# [1-byte 2] [4-byte size] [4-byte entries] [entries*4 bytes of indices] [entries*8 bytes of values]
32+
#
2933
# Double matrix format:
3034
#
31-
# [8-byte 2] [8-byte rows] [8-byte cols] [rows*cols*8 bytes of data]
35+
# [1-byte 3] [4-byte rows] [4-byte cols] [rows*cols*8 bytes of data]
3236
#
3337
# This is all in machine-endian. That means that the Java interpreter and the
3438
# Python interpreter must agree on what endian the machine is.
@@ -43,8 +47,7 @@ def _deserialize_byte_array(shape, ba, offset):
4347
>>> array_equal(x, _deserialize_byte_array(x.shape, x.data, 0))
4448
True
4549
"""
46-
ar = ndarray(shape=shape, buffer=ba, offset=offset, dtype="float64",
47-
order='C')
50+
ar = ndarray(shape=shape, buffer=ba, offset=offset, dtype="float64", order='C')
4851
return ar.copy()
4952

5053
def _serialize_double_vector(v):
@@ -58,21 +61,20 @@ def _serialize_double_vector(v):
5861
if type(v) != ndarray:
5962
raise TypeError("_serialize_double_vector called on a %s; "
6063
"wanted ndarray" % type(v))
61-
"""complex is only datatype that can't be converted to float64"""
62-
if issubdtype(v.dtype, complex):
63-
raise TypeError("_serialize_double_vector called on a %s; "
64-
"wanted ndarray" % type(v))
65-
if v.dtype != float64:
66-
v = v.astype(float64)
6764
if v.ndim != 1:
6865
raise TypeError("_serialize_double_vector called on a %ddarray; "
6966
"wanted a 1darray" % v.ndim)
67+
if v.dtype != float64:
68+
if numpy.issubdtype(v.dtype, numpy.complex):
69+
raise TypeError("_serialize_double_vector called on an ndarray of %s; "
70+
"wanted ndarray of float64" % v.dtype)
71+
v = v.astype('float64')
7072
length = v.shape[0]
71-
ba = bytearray(16 + 8*length)
72-
header = ndarray(shape=[2], buffer=ba, dtype="int64")
73-
header[0] = 1
74-
header[1] = length
75-
arr_mid = ndarray(shape=[length], buffer=ba, offset=16, dtype="float64")
73+
ba = bytearray(5 + 8 * length)
74+
ba[0] = 1
75+
length_bytes = ndarray(shape=[1], buffer=ba, offset=1, dtype="int32")
76+
length_bytes[0] = length
77+
arr_mid = ndarray(shape=[length], buffer=ba, offset=5, dtype="float64")
7678
arr_mid[...] = v
7779
return ba
7880

@@ -86,34 +88,34 @@ def _deserialize_double_vector(ba):
8688
if type(ba) != bytearray:
8789
raise TypeError("_deserialize_double_vector called on a %s; "
8890
"wanted bytearray" % type(ba))
89-
if len(ba) < 16:
91+
if len(ba) < 5:
9092
raise TypeError("_deserialize_double_vector called on a %d-byte array, "
9193
"which is too short" % len(ba))
92-
if (len(ba) & 7) != 0:
93-
raise TypeError("_deserialize_double_vector called on a %d-byte array, "
94-
"which is not a multiple of 8" % len(ba))
95-
header = ndarray(shape=[2], buffer=ba, dtype="int64")
96-
if header[0] != 1:
94+
if ba[0] != 1:
9795
raise TypeError("_deserialize_double_vector called on bytearray "
9896
"with wrong magic")
99-
length = header[1]
100-
if len(ba) != 8*length + 16:
97+
length = ndarray(shape=[1], buffer=ba, offset=1, dtype="int32")[0]
98+
if len(ba) != 8*length + 5:
10199
raise TypeError("_deserialize_double_vector called on bytearray "
102100
"with wrong length")
103-
return _deserialize_byte_array([length], ba, 16)
101+
return _deserialize_byte_array([length], ba, 5)
104102

105103
def _serialize_double_matrix(m):
106104
"""Serialize a double matrix into a mutually understood format."""
107-
if (type(m) == ndarray and m.dtype == float64 and m.ndim == 2):
105+
if (type(m) == ndarray and m.ndim == 2):
106+
if m.dtype != float64:
107+
if numpy.issubdtype(m.dtype, numpy.complex):
108+
raise TypeError("_serialize_double_matrix called on an ndarray of %s; "
109+
"wanted ndarray of float64" % m.dtype)
110+
m = m.astype('float64')
108111
rows = m.shape[0]
109112
cols = m.shape[1]
110-
ba = bytearray(24 + 8 * rows * cols)
111-
header = ndarray(shape=[3], buffer=ba, dtype="int64")
112-
header[0] = 2
113-
header[1] = rows
114-
header[2] = cols
115-
arr_mid = ndarray(shape=[rows, cols], buffer=ba, offset=24,
116-
dtype="float64", order='C')
113+
ba = bytearray(9 + 8 * rows * cols)
114+
ba[0] = 2
115+
lengths = ndarray(shape=[3], buffer=ba, offset=1, dtype="int32")
116+
lengths[0] = rows
117+
lengths[1] = cols
118+
arr_mid = ndarray(shape=[rows, cols], buffer=ba, offset=9, dtype="float64", order='C')
117119
arr_mid[...] = m
118120
return ba
119121
else:
@@ -125,22 +127,19 @@ def _deserialize_double_matrix(ba):
125127
if type(ba) != bytearray:
126128
raise TypeError("_deserialize_double_matrix called on a %s; "
127129
"wanted bytearray" % type(ba))
128-
if len(ba) < 24:
130+
if len(ba) < 9:
129131
raise TypeError("_deserialize_double_matrix called on a %d-byte array, "
130132
"which is too short" % len(ba))
131-
if (len(ba) & 7) != 0:
132-
raise TypeError("_deserialize_double_matrix called on a %d-byte array, "
133-
"which is not a multiple of 8" % len(ba))
134-
header = ndarray(shape=[3], buffer=ba, dtype="int64")
135-
if (header[0] != 2):
133+
if ba[0] != 2:
136134
raise TypeError("_deserialize_double_matrix called on bytearray "
137135
"with wrong magic")
138-
rows = header[1]
139-
cols = header[2]
140-
if (len(ba) != 8*rows*cols + 24):
136+
lengths = ndarray(shape=[2], buffer=ba, offset=1, dtype="int32")
137+
rows = lengths[0]
138+
cols = lengths[1]
139+
if (len(ba) != 8 * rows * cols + 9):
141140
raise TypeError("_deserialize_double_matrix called on bytearray "
142141
"with wrong length")
143-
return _deserialize_byte_array([rows, cols], ba, 24)
142+
return _deserialize_byte_array([rows, cols], ba, 9)
144143

145144
def _linear_predictor_typecheck(x, coeffs):
146145
"""Check that x is a one-dimensional vector of the right shape.
@@ -151,7 +150,7 @@ def _linear_predictor_typecheck(x, coeffs):
151150
pass
152151
else:
153152
raise RuntimeError("Got array of %d elements; wanted %d"
154-
% (shape(x)[0], shape(coeffs)[0]))
153+
% (numpy.shape(x)[0], numpy.shape(coeffs)[0]))
155154
else:
156155
raise RuntimeError("Bulk predict not yet supported.")
157156
elif (type(x) == RDD):
@@ -187,7 +186,7 @@ def predict(self, x):
187186
"""Predict the value of the dependent variable given a vector x"""
188187
"""containing values for the independent variables."""
189188
_linear_predictor_typecheck(x, self._coeff)
190-
return dot(self._coeff, x) + self._intercept
189+
return numpy.dot(self._coeff, x) + self._intercept
191190

192191
# If we weren't given initial weights, take a zero vector of the appropriate
193192
# length.
@@ -200,7 +199,7 @@ def _get_initial_weights(initial_weights, data):
200199
if initial_weights.ndim != 1:
201200
raise TypeError("At least one data element has "
202201
+ initial_weights.ndim + " dimensions, which is not 1")
203-
initial_weights = ones([initial_weights.shape[0] - 1])
202+
initial_weights = numpy.ones([initial_weights.shape[0] - 1])
204203
return initial_weights
205204

206205
# train_func should take two parameters, namely data and initial_weights, and

python/pyspark/mllib/linalg.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
"""
19+
MLlib utilities for working with vectors. For dense vectors, MLlib
20+
uses the NumPy C{array} type, so you can simply pass NumPy arrays
21+
around. For sparse vectors, users can construct a L{SparseVector}
22+
object from MLlib or pass SciPy C{scipy.sparse} column vectors if
23+
SciPy is available in their environment.
24+
"""
25+
26+
from numpy import array
27+
28+
29+
class SparseVector(object):
    """
    A simple sparse vector class for passing data to MLlib. Users may
    alternatively pass SciPy's C{scipy.sparse} data types.
    """

    def __init__(self, size, *args):
        """
        Create a sparse vector, using either an array of (index, value) pairs
        or two separate arrays of indices and values.

        :param size: dimension of the full (dense) vector
        :param args: either one iterable of (index, value) pairs, or two
                     parallel iterables of indices and values

        >>> print(SparseVector(4, [(1, 1.0), (3, 5.5)]))
        [1: 1.0, 3: 5.5]
        >>> print(SparseVector(4, [1, 3], [1.0, 5.5]))
        [1: 1.0, 3: 5.5]
        """
        self.size = size
        assert 1 <= len(args) <= 2, "must pass either 1 or 2 arguments"
        if len(args) == 1:
            # Single argument: an iterable of (index, value) pairs.
            pairs = args[0]
            self.indices = array([p[0] for p in pairs], dtype='int32')
            self.values = array([p[1] for p in pairs], dtype='float64')
        else:
            # Two arguments: parallel arrays of indices and values.
            assert len(args[0]) == len(args[1]), "index and value arrays not same length"
            self.indices = array(args[0], dtype='int32')
            self.values = array(args[1], dtype='float64')

    def __str__(self):
        """Return a compact "[index: value, ...]" rendering of the entries."""
        inds = self.indices
        vals = self.values
        # range() instead of Python 2's xrange(): identical behavior here,
        # and it also works on Python 3 where xrange no longer exists.
        entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in range(len(inds))])
        return "[" + entries + "]"

    def __repr__(self):
        """Return a constructor-style representation of this vector."""
        inds = self.indices
        vals = self.values
        entries = ", ".join(["({0}, {1})".format(inds[i], vals[i]) for i in range(len(inds))])
        return "SparseVector({0}, [{1}])".format(self.size, entries)
67+
68+
69+
70+
class Vectors(object):
    """
    Factory methods to create MLlib vectors. Note that dense vectors
    are simply represented as NumPy array objects, so there is no need
    to convert them for use in MLlib. For sparse vectors, the factory
    methods in this class create an MLlib-compatible type, or users
    can pass in SciPy's C{scipy.sparse} column vectors.
    """

    @staticmethod
    def sparse(size, *args):
        """
        Create a sparse vector, using either an array of (index, value) pairs
        or two separate arrays of indices and values.

        >>> print(Vectors.sparse(4, [(1, 1.0), (3, 5.5)]))
        [1: 1.0, 3: 5.5]
        >>> print(Vectors.sparse(4, [1, 3], [1.0, 5.5]))
        [1: 1.0, 3: 5.5]
        """
        return SparseVector(size, *args)

    @staticmethod
    def dense(elements):
        """
        Create a dense vector of 64-bit floats from a Python list. Always
        returns a NumPy array.

        >>> Vectors.dense([1, 2, 3])
        array([1., 2., 3.])
        """
        return array(elements, dtype='float64')
102+
103+
104+
def _test():
    """Run this module's doctests; exit with a nonzero status on failure."""
    import doctest
    results = doctest.testmod(optionflags=doctest.ELLIPSIS)
    if results.failed:
        exit(-1)


if __name__ == "__main__":
    _test()

0 commit comments

Comments
 (0)