
Commit 32229c7

Removing Row nested values and placing by generic types
1 parent 0ae9376 commit 32229c7

File tree

5 files changed (+136 additions, -143 deletions)


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala

Lines changed: 0 additions & 62 deletions
@@ -206,68 +206,6 @@ class GenericMutableRow(size: Int) extends GenericRow(size) with MutableRow {
   override def copy() = new GenericRow(values.clone())
 }
 
-// TODO: this is an awful lot of code duplication. If values would be covariant we could reuse
-// much of GenericRow
-class NativeRow[T](protected[catalyst] val values: Array[T]) extends Row {
-
-  /** No-arg constructor for serialization. */
-  def this() = this(null)
-
-  def this(elementType: NativeType, size: Int) =
-    this(elementType.classTag.newArray(size).asInstanceOf[Array[T]])
-
-  def iterator = values.iterator
-
-  def length = values.length
-
-  def apply(i: Int) = values(i)
-
-  def isNullAt(i: Int) = values(i) == null
-
-  def getInt(i: Int): Int = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive int value.")
-    values(i).asInstanceOf[Int]
-  }
-
-  def getLong(i: Int): Long = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive long value.")
-    values(i).asInstanceOf[Long]
-  }
-
-  def getDouble(i: Int): Double = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive double value.")
-    values(i).asInstanceOf[Double]
-  }
-
-  def getFloat(i: Int): Float = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive float value.")
-    values(i).asInstanceOf[Float]
-  }
-
-  def getBoolean(i: Int): Boolean = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive boolean value.")
-    values(i).asInstanceOf[Boolean]
-  }
-
-  def getShort(i: Int): Short = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive short value.")
-    values(i).asInstanceOf[Short]
-  }
-
-  def getByte(i: Int): Byte = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive byte value.")
-    values(i).asInstanceOf[Byte]
-  }
-
-  def getString(i: Int): String = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive String value.")
-    values(i).asInstanceOf[String]
-  }
-
-  def copy() = this
-}
-
-
 class RowOrdering(ordering: Seq[SortOrder]) extends Ordering[Row] {
   def this(ordering: Seq[SortOrder], inputSchema: Seq[Attribute]) =
     this(ordering.map(BindReferences.bindReference(_, inputSchema)))
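
In short, this file drops the NativeRow wrapper that held arrays of native values. For context, a minimal sketch (not part of the commit; sample values invented for illustration) of how nested field data is carried once it uses plain Scala types, matching the ArrayScalaType / StructScalaType / MapScalaType aliases the commit adds to CatalystConverter further down:

    // After this change, nested field data lives in plain Scala/JVM containers
    // rather than Row-like wrappers such as NativeRow:
    val arrayField: Array[Int]     = Array(1, 2, 3)            // ArrayType data  (ArrayScalaType)
    val structField: Seq[Any]      = Seq("name", 42)           // StructType data (StructScalaType)
    val mapField: Map[String, Int] = Map("a" -> 1, "b" -> 2)   // MapType data    (MapScalaType)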

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ case class GetItem(child: Expression, ordinal: Expression) extends Expression {
       null
     } else {
       if (child.dataType.isInstanceOf[ArrayType]) {
-        val baseValue = value.asInstanceOf[Seq[_]]
+        val baseValue = value.asInstanceOf[Array[_]]
         val o = key.asInstanceOf[Int]
         if (o >= baseValue.size || o < 0) {
           null
@@ -92,7 +92,7 @@ case class GetField(child: Expression, fieldName: String) extends UnaryExpression
   override lazy val resolved = childrenResolved && child.dataType.isInstanceOf[StructType]
 
   override def eval(input: Row): Any = {
-    val baseValue = child.eval(input).asInstanceOf[Row]
+    val baseValue = child.eval(input).asInstanceOf[Seq[_]]
     if (baseValue == null) null else baseValue(ordinal)
   }
 
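
A small illustrative sketch (sample data invented here, not from the commit) of what the changed casts expect: GetItem over an ArrayType child now receives a Scala Array, and GetField over a StructType child receives a Seq rather than a Row:

    // GetItem-style lookup over an ArrayType value:
    val arrayValue: Any = Array("a", "b", "c")
    val elems = arrayValue.asInstanceOf[Array[_]]
    val ordinal = 2
    val item = if (ordinal >= elems.size || ordinal < 0) null else elems(ordinal)  // "c"

    // GetField-style lookup over a StructType value:
    val structValue: Any = Seq("first", "second")
    val fields = structValue.asInstanceOf[Seq[_]]
    val field = if (fields == null) null else fields(1)  // "second"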

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala

Lines changed: 24 additions & 14 deletions
@@ -23,12 +23,12 @@ import parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter}
 import parquet.schema.MessageType
 
 import org.apache.spark.sql.catalyst.types._
-import org.apache.spark.sql.catalyst.expressions.{NativeRow, GenericRow, Row, Attribute}
+import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row, Attribute}
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
 
 /**
- * Collection of converters of Parquet types (Group and primitive types) that
- * model arrays and maps. The convertions are partly based on the AvroParquet
+ * Collection of converters of Parquet types (group and primitive types) that
+ * model arrays and maps. The conversions are partly based on the AvroParquet
  * converters that are part of Parquet in order to be able to process these
  * types.
  *
@@ -51,7 +51,7 @@ import org.apache.spark.sql.parquet.CatalystConverter.FieldType
  * </ul>
  */
 
-private[parquet] object CatalystConverter {
+private[sql] object CatalystConverter {
   // The type internally used for fields
   type FieldType = StructField
 
@@ -63,6 +63,10 @@ private[parquet] object CatalystConverter {
   val MAP_VALUE_SCHEMA_NAME = "value"
   val MAP_SCHEMA_NAME = "map"
 
+  type ArrayScalaType[T] = Array[T]
+  type StructScalaType[T] = Seq[T]
+  type MapScalaType[K, V] = Map[K, V]
+
   protected[parquet] def createConverter(
       field: FieldType,
       fieldIndex: Int,
@@ -325,7 +329,6 @@ private[parquet] class CatalystPrimitiveRowConverter(
 private[parquet] class CatalystPrimitiveConverter(
     parent: CatalystConverter,
     fieldIndex: Int) extends PrimitiveConverter {
-  // TODO: consider refactoring these together with ParquetTypesConverter
   override def addBinary(value: Binary): Unit =
     parent.updateBinary(fieldIndex, value)
 
@@ -404,6 +407,9 @@ private[parquet] class CatalystArrayConverter(
 
   override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
     // fieldIndex is ignored (assumed to be zero but not checked)
+    if(value == null) {
+      throw new IllegalArgumentException("Null values inside Parquet arrays are not supported!")
+    }
     buffer += value
   }
 
@@ -419,7 +425,8 @@ private[parquet] class CatalystArrayConverter(
 
   override def end(): Unit = {
     assert(parent != null)
-    parent.updateField(index, new GenericRow(buffer.toArray))
+    // here we need to make sure to use ArrayScalaType
+    parent.updateField(index, buffer.toArray)
     clearBuffer()
   }
 }
@@ -444,7 +451,8 @@ private[parquet] class CatalystNativeArrayConverter(
 
   type nativeType = elementType.JvmType
 
-  private var buffer: Array[nativeType] = elementType.classTag.newArray(capacity)
+  private var buffer: CatalystConverter.ArrayScalaType[nativeType] =
+    elementType.classTag.newArray(capacity)
 
   private var elements: Int = 0
 
@@ -515,16 +523,18 @@ private[parquet] class CatalystNativeArrayConverter(
 
   override def end(): Unit = {
     assert(parent != null)
+    // here we need to make sure to use ArrayScalaType
     parent.updateField(
       index,
-      new NativeRow[nativeType](buffer.slice(0, elements)))
+      buffer.slice(0, elements))
     clearBuffer()
   }
 
   private def checkGrowBuffer(): Unit = {
     if (elements >= capacity) {
       val newCapacity = 2 * capacity
-      val tmp: Array[nativeType] = elementType.classTag.newArray(newCapacity)
+      val tmp: CatalystConverter.ArrayScalaType[nativeType] =
+        elementType.classTag.newArray(newCapacity)
       Array.copy(buffer, 0, tmp, 0, capacity)
       buffer = tmp
       capacity = newCapacity
@@ -552,8 +562,10 @@ private[parquet] class CatalystStructConverter(
   // TODO: think about reusing the buffer
   override def end(): Unit = {
     assert(!isRootConverter)
-    // TODO: use iterators if possible, avoid Row wrapping!
-    parent.updateField(index, new GenericRow(current.toArray))
+    // here we need to make sure to use StructScalaType
+    // Note: we need to actually make a copy of the array since we
+    // may be in a nested field
+    parent.updateField(index, current.toArray.toSeq)
   }
 }
 
@@ -619,6 +631,7 @@ private[parquet] class CatalystMapConverter(
   }
 
   override def end(): Unit = {
+    // here we need to make sure to use MapScalaType
     parent.updateField(index, map.toMap)
   }
 
@@ -627,6 +640,3 @@ private[parquet] class CatalystMapConverter(
   override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
     throw new UnsupportedOperationException
 }
-
-
-
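
As an aside, the checkGrowBuffer change keeps the same doubling strategy, only retyped to the new alias. A standalone sketch of that grow-and-copy pattern (the generic grow helper is written here for illustration, it is not the converter's actual method):

    import scala.reflect.ClassTag

    // Double the capacity and copy the existing elements, as
    // CatalystNativeArrayConverter.checkGrowBuffer does when the buffer is full.
    def grow[T: ClassTag](buffer: Array[T], capacity: Int): (Array[T], Int) = {
      val newCapacity = 2 * capacity
      val tmp = new Array[T](newCapacity)
      Array.copy(buffer, 0, tmp, 0, capacity)
      (tmp, newCapacity)
    }

    val (bigger, newCap) = grow(Array(1, 2, 3, 4), 4)  // bigger.length == 8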

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala

Lines changed: 22 additions & 7 deletions
@@ -140,9 +140,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
   private[parquet] def writeValue(schema: DataType, value: Any): Unit = {
     if (value != null && value != Nil) {
       schema match {
-        case t @ ArrayType(_) => writeArray(t, value.asInstanceOf[Row])
-        case t @ MapType(_, _) => writeMap(t, value.asInstanceOf[Map[Any, Any]])
-        case t @ StructType(_) => writeStruct(t, value.asInstanceOf[Row])
+        case t @ ArrayType(_) => writeArray(
+          t,
+          value.asInstanceOf[CatalystConverter.ArrayScalaType[_]])
+        case t @ MapType(_, _) => writeMap(
+          t,
+          value.asInstanceOf[CatalystConverter.MapScalaType[_, _]])
+        case t @ StructType(_) => writeStruct(
+          t,
+          value.asInstanceOf[CatalystConverter.StructScalaType[_]])
         case _ => writePrimitive(schema.asInstanceOf[PrimitiveType], value)
       }
     }
@@ -166,7 +172,9 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
     }
   }
 
-  private[parquet] def writeStruct(schema: StructType, struct: Row): Unit = {
+  private[parquet] def writeStruct(
+      schema: StructType,
+      struct: CatalystConverter.StructScalaType[_]): Unit = {
     if (struct != null && struct != Nil) {
       val fields = schema.fields.toArray
       writer.startGroup()
@@ -183,7 +191,11 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
     }
   }
 
-  private[parquet] def writeArray(schema: ArrayType, array: Row): Unit = {
+  // TODO: support null values, see
+  // https://issues.apache.org/jira/browse/SPARK-1649
+  private[parquet] def writeArray(
+      schema: ArrayType,
+      array: CatalystConverter.ArrayScalaType[_]): Unit = {
     val elementType = schema.elementType
     writer.startGroup()
     if (array.size > 0) {
@@ -198,8 +210,11 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
     writer.endGroup()
   }
 
-  // TODO: this does not allow null values! Should these be supported?
-  private[parquet] def writeMap(schema: MapType, map: Map[_, _]): Unit = {
+  // TODO: support null values, see
+  // https://issues.apache.org/jira/browse/SPARK-1649
+  private[parquet] def writeMap(
+      schema: MapType,
+      map: CatalystConverter.MapScalaType[_, _]): Unit = {
     writer.startGroup()
     if (map.size > 0) {
       writer.startField(CatalystConverter.MAP_SCHEMA_NAME, 0)
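
For illustration, a simplified standalone sketch (not RowWriteSupport itself; the describeValue helper is invented here) of the dispatch writeValue now performs, casting complex values to the plain Scala types behind the CatalystConverter aliases instead of to Row:

    import org.apache.spark.sql.catalyst.types._

    // Match on the Catalyst DataType and treat the value as the corresponding
    // Scala container (Array / Map / Seq), mirroring the new writeValue cases.
    def describeValue(schema: DataType, value: Any): String = schema match {
      case ArrayType(_)  => s"array of ${value.asInstanceOf[Array[_]].length} elements"  // ArrayScalaType
      case MapType(_, _) => s"map with ${value.asInstanceOf[Map[_, _]].size} entries"    // MapScalaType
      case StructType(_) => s"struct with ${value.asInstanceOf[Seq[_]].size} fields"     // StructScalaType
      case _             => s"primitive: $value"
    }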
