
Commit 3e1456c

marmbrus authored and AndreSchumacher committed
WIP: Directly serialize catalyst attributes.
1 parent f7aeba3 commit 3e1456c

5 files changed: +110 −23 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala

Lines changed: 75 additions & 4 deletions
@@ -19,26 +19,86 @@ package org.apache.spark.sql.catalyst.types
 
 import java.sql.Timestamp
 
+import scala.util.parsing.combinator.RegexParsers
+
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.{typeTag, TypeTag, runtimeMirror}
 
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
 import org.apache.spark.util.Utils
 
+/**
+ *
+ */
+object DataType extends RegexParsers {
+  protected lazy val primitiveType: Parser[DataType] =
+    "StringType" ^^^ StringType |
+    "FloatType" ^^^ FloatType |
+    "IntegerType" ^^^ IntegerType |
+    "ByteType" ^^^ ByteType |
+    "ShortType" ^^^ ShortType |
+    "DoubleType" ^^^ DoubleType |
+    "LongType" ^^^ LongType |
+    "BinaryType" ^^^ BinaryType |
+    "BooleanType" ^^^ BooleanType |
+    "DecimalType" ^^^ DecimalType |
+    "TimestampType" ^^^ TimestampType
+
+  protected lazy val arrayType: Parser[DataType] =
+    "ArrayType" ~> "(" ~> dataType <~ ")" ^^ ArrayType
+
+  protected lazy val mapType: Parser[DataType] =
+    "MapType" ~> "(" ~> dataType ~ "," ~ dataType <~ ")" ^^ {
+      case t1 ~ _ ~ t2 => MapType(t1, t2)
+    }
+
+  protected lazy val structField: Parser[StructField] =
+    ("StructField(" ~> "[a-zA-Z0-9_]*".r) ~ ("," ~> dataType) ~ ("," ~> boolVal <~ ")") ^^ {
+      case name ~ tpe ~ nullable =>
+        StructField(name, tpe, nullable = nullable)
+    }
+
+  protected lazy val boolVal: Parser[Boolean] =
+    "true" ^^^ true |
+    "false" ^^^ false
+
+
+  protected lazy val structType: Parser[DataType] =
+    "StructType\\([A-zA-z]*\\(".r ~> repsep(structField, ",") <~ "))" ^^ {
+      case fields => new StructType(fields)
+    }
+
+  protected lazy val dataType: Parser[DataType] =
+    arrayType |
+    mapType |
+    structType |
+    primitiveType
+
+  /**
+   * Parses a string representation of a DataType.
+   *
+   * TODO: Generate parser as pickler...
+   */
+  def apply(asString: String): DataType = parseAll(dataType, asString) match {
+    case Success(result, _) => result
+    case failure: NoSuccess => sys.error(s"Unsupported dataType: $asString, $failure")
+  }
+}
+
 abstract class DataType {
   /** Matches any expression that evaluates to this DataType */
   def unapply(a: Expression): Boolean = a match {
     case e: Expression if e.dataType == this => true
     case _ => false
   }
 
-  def isPrimitive(): Boolean = false
+  def isPrimitive: Boolean = false
 }
 
 case object NullType extends DataType
 
 trait PrimitiveType extends DataType {
-  override def isPrimitive() = true
+  override def isPrimitive = true
 }
 
 abstract class NativeType extends DataType {
@@ -167,6 +227,17 @@ case object FloatType extends FractionalType {
 case class ArrayType(elementType: DataType) extends DataType
 
 case class StructField(name: String, dataType: DataType, nullable: Boolean)
-case class StructType(fields: Seq[StructField]) extends DataType
+
+object StructType {
+  def fromAttributes(attributes: Seq[Attribute]): StructType = {
+    StructType(attributes.map(a => StructField(a.name, a.dataType, a.nullable)))
+  }
+
+  //def apply(fields: Seq[StructField]) = new StructType(fields.toIndexedSeq)
+}
+
+case class StructType(fields: Seq[StructField]) extends DataType {
+  def toAttributes = fields.map(f => AttributeReference(f.name, f.dataType, f.nullable)())
+}
 
 case class MapType(keyType: DataType, valueType: DataType) extends DataType
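
The new DataType parser is meant to read back the default case-class toString of a schema, which is what the Parquet write path below stores in the job configuration. A minimal round-trip sketch (not part of the commit; the example schema and variable names are made up):

    import org.apache.spark.sql.catalyst.types._

    // Build a small schema and serialize it via the default case-class toString.
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = true)))
    val asString = schema.toString
    // e.g. "StructType(List(StructField(id,IntegerType,false), StructField(name,StringType,true)))"

    // Parse it back with the combinator parser defined above.
    val parsed = DataType(asString)
    assert(parsed == schema)   // case-class equality should hold after the round trip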

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala

Lines changed: 3 additions & 2 deletions
@@ -36,6 +36,7 @@ import parquet.schema.MessageType
 import org.apache.spark.{Logging, SerializableWritable, SparkContext, TaskContext}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row}
+import org.apache.spark.sql.catalyst.types.StructType
 import org.apache.spark.sql.execution.{LeafNode, SparkPlan, UnaryNode}
 
 /**
@@ -167,7 +168,7 @@ case class InsertIntoParquetTable(
     val job = new Job(sc.hadoopConfiguration)
 
     val writeSupport =
-      if (child.output.map(_.dataType).forall(_.isPrimitive())) {
+      if (child.output.map(_.dataType).forall(_.isPrimitive)) {
         logger.debug("Initializing MutableRowWriteSupport")
         classOf[org.apache.spark.sql.parquet.MutableRowWriteSupport]
       } else {
@@ -178,7 +179,7 @@ case class InsertIntoParquetTable(
 
     // TODO: move that to function in object
     val conf = ContextUtil.getConfiguration(job)
-    conf.set(RowWriteSupport.PARQUET_ROW_SCHEMA, relation.parquetSchema.toString)
+    conf.set(RowWriteSupport.PARQUET_ROW_SCHEMA, StructType.fromAttributes(relation.output).toString)
 
     val fspath = new Path(relation.path)
     val fs = fspath.getFileSystem(conf)

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala

Lines changed: 11 additions & 15 deletions
@@ -82,39 +82,35 @@ private[parquet] object RowReadSupport {
  * A `parquet.hadoop.api.WriteSupport` for Row ojects.
  */
 private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
-  def setSchema(schema: MessageType, configuration: Configuration) {
-    // for testing
-    this.schema = schema
-    // TODO: could use Attributes themselves instead of Parquet schema?
+
+
+  def setSchema(schema: Seq[Attribute], configuration: Configuration) {
     configuration.set(
       RowWriteSupport.PARQUET_ROW_SCHEMA,
-      schema.toString)
+      StructType.fromAttributes(schema).toString)
     configuration.set(
       ParquetOutputFormat.WRITER_VERSION,
       ParquetProperties.WriterVersion.PARQUET_1_0.toString)
   }
 
-  def getSchema(configuration: Configuration): MessageType = {
-    MessageTypeParser.parseMessageType(configuration.get(RowWriteSupport.PARQUET_ROW_SCHEMA))
-  }
-
-  private[parquet] var schema: MessageType = null
   private[parquet] var writer: RecordConsumer = null
   private[parquet] var attributes: Seq[Attribute] = null
 
   override def init(configuration: Configuration): WriteSupport.WriteContext = {
-    schema = if (schema == null) getSchema(configuration) else schema
-    attributes = ParquetTypesConverter.convertToAttributes(schema)
-    log.debug(s"write support initialized for requested schema $schema")
+    attributes = DataType(configuration.get(RowWriteSupport.PARQUET_ROW_SCHEMA)) match {
+      case s: StructType => s.toAttributes
+      case other => sys.error(s"Can convert $attributes to row")
+    }
+    log.debug(s"write support initialized for requested schema $attributes")
     ParquetRelation.enableLogForwarding()
     new WriteSupport.WriteContext(
-      schema,
+      ParquetTypesConverter.convertFromAttributes(attributes),
       new java.util.HashMap[java.lang.String, java.lang.String]())
   }
 
   override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
     writer = recordConsumer
-    log.debug(s"preparing for write with schema $schema")
+    log.debug(s"preparing for write with schema $attributes")
   }
 
   override def write(record: Row): Unit = {
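
Together with the change in ParquetTableOperations.scala above, the schema now travels through the Hadoop Configuration as a catalyst schema string instead of a Parquet MessageType. A rough sketch of that round trip, using the key and helpers from the diff (the standalone helper functions and the error message are illustrative only, and assume access from within the parquet package):

    import org.apache.hadoop.conf.Configuration
    import org.apache.spark.sql.catalyst.expressions.Attribute
    import org.apache.spark.sql.catalyst.types.{DataType, StructType}

    // Writer side (cf. InsertIntoParquetTable / setSchema): store the schema string in the conf.
    def storeSchema(attributes: Seq[Attribute], conf: Configuration): Unit =
      conf.set(RowWriteSupport.PARQUET_ROW_SCHEMA,
        StructType.fromAttributes(attributes).toString)

    // Reader side (cf. init above): parse the string back into catalyst attributes.
    def readSchema(conf: Configuration): Seq[Attribute] =
      DataType(conf.get(RowWriteSupport.PARQUET_ROW_SCHEMA)) match {
        case s: StructType => s.toAttributes
        case other => sys.error(s"Expected a StructType, found $other")
      }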

sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala

Lines changed: 18 additions & 1 deletion
@@ -67,7 +67,17 @@ case class Nested(i: Int, s: String)
 
 case class Data(array: Seq[Int], nested: Nested)
 
-class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
+case class AllDataTypes(
+    stringField: String,
+    intField: Int,
+    longField: Long,
+    floatField: Float,
+    doubleField: Double,
+    shortField: Short,
+    byteField: Byte,
+    booleanField: Boolean)
+
+class ParquetQuerySuite extends QueryTest with FunSuite with BeforeAndAfterAll {
   import TestData._
   TestData // Load test data tables.
 
@@ -100,6 +110,13 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
     // here we should also unregister the table??
   }
 
+  test("Read/Write All Types") {
+    val data = AllDataTypes("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true)
+    val tempDir = getTempFilePath("parquetTest").getCanonicalPath
+    sparkContext.parallelize(data :: Nil).saveAsParquetFile(tempDir)
+    assert(parquetFile(tempDir).collect().head === data)
+  }
+
   test("self-join parquet files") {
     val x = ParquetTestData.testData.as('x)
     val y = ParquetTestData.testData.as('y)

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 3 additions & 1 deletion
@@ -208,7 +208,9 @@ object HiveMetastoreTypes extends RegexParsers {
     }
 
   protected lazy val structType: Parser[DataType] =
-    "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ StructType
+    "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ {
+      case fields => new StructType(fields)
+    }
 
   protected lazy val dataType: Parser[DataType] =
     arrayType |
