Commit 14c3fd8 (1 parent: 3e1456c)

Attempting to fix Spark-Parquet schema conversion

File tree: 4 files changed, +97 -19 lines

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala

Lines changed: 2 additions & 2 deletions
@@ -106,8 +106,8 @@ private[sql] object CatalystConverter {
     }
   }
 
-  protected[parquet] def createRootConverter(parquetSchema: MessageType): CatalystConverter = {
-    val attributes = ParquetTypesConverter.convertToAttributes(parquetSchema)
+  protected[parquet] def createRootConverter(parquetSchema: MessageType, attributes: Seq[Attribute]): CatalystConverter = {
+    //val attributes = ParquetTypesConverter.convertToAttributes(parquetSchema)
     // For non-nested types we use the optimized Row converter
     if (attributes.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType))) {
       new CatalystPrimitiveRowConverter(attributes)

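The change inverts the flow of schema information: instead of re-deriving Catalyst attributes from the Parquet MessageType, the root converter now receives them from the caller, which typically already knows the logical schema. A minimal sketch of a call site under the new signature, assuming a MessageType named `parquetSchema` is in scope and using a purely hypothetical attribute list:

    // Hypothetical Catalyst attributes; in the real read path these come from
    // the schema the driver stored in the job configuration, not from the file.
    val attrs: Seq[Attribute] = Seq(
      AttributeReference("id", IntegerType, nullable = false)(),
      AttributeReference("name", StringType, nullable = true)())

    // Flat (all-primitive) schemas still get the optimized Row converter;
    // nested schemas fall through to the generic converter.
    val converter: CatalystConverter =
      CatalystConverter.createRootConverter(parquetSchema, attrs)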
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala

Lines changed: 8 additions & 3 deletions
@@ -66,9 +66,13 @@ case class ParquetTableScan(
     }
 
     // Store Parquet schema in `Configuration`
+    // TODO: should this here be just the projected fields?
     conf.set(
-      RowReadSupport.PARQUET_ROW_REQUESTED_SCHEMA,
-      ParquetTypesConverter.convertFromAttributes(output).toString)
+      RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
+      ParquetTypesConverter.convertToString(output))
+    //conf.set(
+    //  RowReadSupport.PARQUET_ROW_REQUESTED_SCHEMA,
+    //  ParquetTypesConverter.convertFromAttributes(output).toString)
 
     // Store record filtering predicate in `Configuration`
     // Note 1: the input format ignores all predicates that cannot be expressed
@@ -179,7 +183,8 @@ case class InsertIntoParquetTable(
 
     // TODO: move that to function in object
     val conf = ContextUtil.getConfiguration(job)
-    conf.set(RowWriteSupport.PARQUET_ROW_SCHEMA, StructType.fromAttributes(relation.output).toString)
+    //conf.set(RowWriteSupport.PARQUET_ROW_SCHEMA, StructType.fromAttributes(relation.output).toString)
+    RowWriteSupport.setSchema(relation.output, conf)
 
     val fspath = new Path(relation.path)
     val fs = fspath.getFileSystem(conf)

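Both operators now push the schema through the Hadoop Configuration as a serialized attribute list rather than a Parquet schema string: the scan stores the requested attributes under SPARK_ROW_REQUESTED_SCHEMA, and the insert path delegates to RowWriteSupport.setSchema. A rough sketch of the round trip from the job-setup side, assuming `output: Seq[Attribute]` and a Configuration named `conf` are in scope:

    // Driver side: encode the Catalyst attributes into the job configuration.
    conf.set(
      RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
      ParquetTypesConverter.convertToString(output))  // read path
    RowWriteSupport.setSchema(output, conf)           // write path

    // Task side: the write support decodes the same string back into attributes.
    val attributes: Seq[Attribute] = RowWriteSupport.getSchema(conf)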
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala

Lines changed: 74 additions & 14 deletions
@@ -29,6 +29,8 @@ import parquet.schema.{MessageType, MessageTypeParser}
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
 import org.apache.spark.sql.catalyst.types._
+import org.apache.spark.sql.execution.SparkSqlSerializer
+import com.google.common.io.BaseEncoding
 
 /**
  * A `parquet.io.api.RecordMaterializer` for Rows.
@@ -38,8 +40,8 @@ import org.apache.spark.sql.catalyst.types._
 private[parquet] class RowRecordMaterializer(root: CatalystConverter)
   extends RecordMaterializer[Row] {
 
-  def this(parquetSchema: MessageType) =
-    this(CatalystConverter.createRootConverter(parquetSchema))
+  def this(parquetSchema: MessageType, attributes: Seq[Attribute]) =
+    this(CatalystConverter.createRootConverter(parquetSchema, attributes))
 
   override def getCurrentRecord: Row = root.getCurrentRecord
 
@@ -57,50 +59,92 @@ private[parquet] class RowReadSupport extends ReadSupport[Row] with Logging {
       fileSchema: MessageType,
       readContext: ReadContext): RecordMaterializer[Row] = {
     log.debug(s"preparing for read with file schema $fileSchema")
-    new RowRecordMaterializer(readContext.getRequestedSchema)
+    //new RowRecordMaterializer(readContext.getRequestedSchema)
+    val parquetSchema = readContext.getRequestedSchema
+    var schema: Seq[Attribute] =
+      if (readContext.getReadSupportMetadata != null &&
+          readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY) != null) {
+        ParquetTypesConverter.convertFromString(
+          readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
+      } else {
+        // fall back to converting from Parquet schema
+        ParquetTypesConverter.convertToAttributes(parquetSchema)
+      }
+    new RowRecordMaterializer(parquetSchema, schema)
   }
 
   override def init(
       configuration: Configuration,
       keyValueMetaData: java.util.Map[String, String],
       fileSchema: MessageType): ReadContext = {
-    val requested_schema_string =
+    /*val requested_schema_string =
       configuration.get(RowReadSupport.PARQUET_ROW_REQUESTED_SCHEMA, fileSchema.toString)
     val requested_schema =
       MessageTypeParser.parseMessageType(requested_schema_string)
     log.debug(s"read support initialized for requested schema $requested_schema")
     ParquetRelation.enableLogForwarding()
-    new ReadContext(requested_schema, keyValueMetaData)
+    new ReadContext(requested_schema, keyValueMetaData) */
+
+    // GO ON HERE.. figure out why Avro distinguishes between requested read and read schema
+    // try to figure out what when needs to be written to metadata
+
+    var parquetSchema: MessageType = fileSchema
+    var metadata: java.util.Map[String, String] = null
+    val requestedAttributes = RowReadSupport.getRequestedSchema(configuration)
+
+    if (requestedAttributes != null) {
+      parquetSchema = ParquetTypesConverter.convertFromAttributes(requestedAttributes)
+    }
+
+    val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
+
+    if (origAttributesStr != null) {
+      metadata = new java.util.HashMap[String, String]()
+      metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
+    }
+
+    return new ReadSupport.ReadContext(parquetSchema, metadata)
   }
 }
 
 private[parquet] object RowReadSupport {
-  val PARQUET_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema"
+  val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema"
+  val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata"
+
+  private def getRequestedSchema(configuration: Configuration): Seq[Attribute] = {
+    val schemaString = configuration.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
+    if (schemaString == null) null else ParquetTypesConverter.convertFromString(schemaString)
+  }
 }
 
 /**
  * A `parquet.hadoop.api.WriteSupport` for Row ojects.
  */
 private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 
-
-  def setSchema(schema: Seq[Attribute], configuration: Configuration) {
+  /*def setSchema(schema: Seq[Attribute], configuration: Configuration) {
     configuration.set(
       RowWriteSupport.PARQUET_ROW_SCHEMA,
       StructType.fromAttributes(schema).toString)
     configuration.set(
       ParquetOutputFormat.WRITER_VERSION,
       ParquetProperties.WriterVersion.PARQUET_1_0.toString)
-  }
+  } */
 
+  private[parquet] var schema: MessageType = null
   private[parquet] var writer: RecordConsumer = null
  private[parquet] var attributes: Seq[Attribute] = null
 
   override def init(configuration: Configuration): WriteSupport.WriteContext = {
-    attributes = DataType(configuration.get(RowWriteSupport.PARQUET_ROW_SCHEMA)) match {
-      case s: StructType => s.toAttributes
-      case other => sys.error(s"Can convert $attributes to row")
-    }
+    //attributes = DataType(configuration.get(RowWriteSupport.PARQUET_ROW_SCHEMA))
+    attributes = if (attributes == null) {
+      RowWriteSupport.getSchema(configuration) match {
+        case s: StructType => s.toAttributes
+        case other => sys.error(s"Can convert $attributes to row")
+      }
+    } else attributes
+    schema = if (schema == null) ParquetTypesConverter.convertFromAttributes(attributes) else schema
+    // ParquetTypesConverter.convertToAttributes(schema)
     log.debug(s"write support initialized for requested schema $attributes")
     ParquetRelation.enableLogForwarding()
     new WriteSupport.WriteContext(
@@ -275,6 +319,22 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
 }
 
 private[parquet] object RowWriteSupport {
-  val PARQUET_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.schema"
+  val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes"
+
+  def getSchema(configuration: Configuration): Seq[Attribute] = {
+    val schemaString = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
+    if (schemaString == null) {
+      throw new RuntimeException("Missing schema!")
+    }
+    ParquetTypesConverter.convertFromString(schemaString)
+  }
+
+  def setSchema(schema: Seq[Attribute], configuration: Configuration) {
+    val encoded = ParquetTypesConverter.convertToString(schema)
+    configuration.set(SPARK_ROW_SCHEMA, encoded)
+    configuration.set(
+      ParquetOutputFormat.WRITER_VERSION,
+      ParquetProperties.WriterVersion.PARQUET_1_0.toString)
+  }
 }
 

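The read side now resolves the Catalyst schema in two stages: init copies the original attribute string into the read-support metadata, and prepareForRead prefers that metadata, falling back to a conversion from the Parquet schema only when it is absent. A condensed, equivalent sketch of that fallback (not the committed code, which spells it out with null checks):

    // Prefer the Catalyst attributes carried in the read-support metadata;
    // otherwise derive them from the requested Parquet schema.
    val parquetSchema = readContext.getRequestedSchema
    val attributes: Seq[Attribute] =
      Option(readContext.getReadSupportMetadata)
        .flatMap(m => Option(m.get(RowReadSupport.SPARK_METADATA_KEY)))
        .map(ParquetTypesConverter.convertFromString)
        .getOrElse(ParquetTypesConverter.convertToAttributes(parquetSchema))
    new RowRecordMaterializer(parquetSchema, attributes)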
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala

Lines changed: 13 additions & 0 deletions
@@ -33,6 +33,8 @@ import parquet.schema.Type.Repetition
 
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
 import org.apache.spark.sql.catalyst.types._
+import com.google.common.io.BaseEncoding
+import org.apache.spark.sql.execution.SparkSqlSerializer
 
 // Implicits
 import scala.collection.JavaConversions._
@@ -289,6 +291,16 @@ private[parquet] object ParquetTypesConverter {
     new MessageType("root", fields)
   }
 
+  def convertFromString(string: String): Seq[Attribute] = {
+    val decoded: Array[Byte] = BaseEncoding.base64().decode(string)
+    SparkSqlSerializer.deserialize(decoded)
+  }
+
+  def convertToString(schema: Seq[Attribute]): String = {
+    val serialized: Array[Byte] = SparkSqlSerializer.serialize(schema)
+    BaseEncoding.base64().encode(serialized)
+  }
+
   def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration) {
     if (origPath == null) {
       throw new IllegalArgumentException("Unable to write Parquet metadata: path is null")
@@ -313,6 +325,7 @@ private[parquet] object ParquetTypesConverter {
     }
     val extraMetadata = new java.util.HashMap[String, String]()
     extraMetadata.put("path", path.toString)
+    extraMetadata.put(RowReadSupport.SPARK_METADATA_KEY, ParquetTypesConverter.convertToString(attributes))
     // TODO: add extra data, e.g., table name, date, etc.?
 
     val parquetSchema: MessageType =

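convertToString and convertFromString round-trip the attribute list through the Spark SQL serializer and Guava's base64 codec, so the logical schema can travel as an ordinary string in a Hadoop Configuration and in the Parquet footer's key/value metadata. A self-contained sketch of the same encode/decode idea, substituting Java serialization and a plain Seq[String] for SparkSqlSerializer and Seq[Attribute] so that it runs standalone:

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
    import com.google.common.io.BaseEncoding

    // Encode: serialize to bytes, then base64 them into a Configuration-safe string.
    def encode(schema: Seq[String]): String = {
      val buffer = new ByteArrayOutputStream()
      val out = new ObjectOutputStream(buffer)
      out.writeObject(schema)
      out.close()
      BaseEncoding.base64().encode(buffer.toByteArray)
    }

    // Decode: undo the base64 step, then deserialize back into the schema.
    def decode(encoded: String): Seq[String] = {
      val bytes = BaseEncoding.base64().decode(encoded)
      val in = new ObjectInputStream(new ByteArrayInputStream(bytes))
      try in.readObject().asInstanceOf[Seq[String]] finally in.close()
    }

    assert(decode(encode(Seq("id: int", "name: string"))) == Seq("id: int", "name: string"))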