
Commit bf8b42d

Commit message: fix

1 parent 2ecabe4 commit bf8b42d

5 files changed: +56 −50 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala

Lines changed: 13 additions & 0 deletions
@@ -78,4 +78,17 @@ private[sql] object CreateJacksonParser extends Serializable {
   def inputStream(enc: String, jsonFactory: JsonFactory, is: InputStream): JsonParser = {
     jsonFactory.createParser(new InputStreamReader(is, enc))
   }
+
+  def internalRow(jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
+    val ba = row.getBinary(0)
+
+    jsonFactory.createParser(ba, 0, ba.length)
+  }
+
+  def internalRow(enc: String, jsonFactory: JsonFactory, row: InternalRow): JsonParser = {
+    val binary = row.getBinary(0)
+    val sd = getStreamDecoder(enc, binary, binary.length)
+
+    jsonFactory.createParser(sd)
+  }
 }
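
Note: for orientation, a minimal sketch of how the two new overloads might be exercised, assuming a single-column row whose binary field holds the raw JSON bytes. CreateJacksonParser is private[sql], so code like this would have to live under the org.apache.spark.sql package; the object name is hypothetical.

import java.nio.charset.StandardCharsets

import com.fasterxml.jackson.core.JsonFactory

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.json.CreateJacksonParser

// Hypothetical usage sketch, not part of the commit.
object InternalRowParserSketch {
  def main(args: Array[String]): Unit = {
    val factory = new JsonFactory()

    // Default overload: the binary column is parsed directly as bytes (UTF-8 here).
    val utf8Row = InternalRow("""{"a": 1}""".getBytes(StandardCharsets.UTF_8))
    val utf8Parser = CreateJacksonParser.internalRow(factory, utf8Row)

    // Encoding-aware overload: the bytes go through getStreamDecoder first,
    // so charsets other than UTF-8 can be handled.
    val utf16Row = InternalRow("""{"a": 1}""".getBytes(StandardCharsets.UTF_16BE))
    val utf16Parser = CreateJacksonParser.internalRow("UTF-16BE", factory, utf16Row)

    println(utf8Parser.nextToken())  // START_OBJECT
    println(utf16Parser.nextToken()) // START_OBJECT
  }
}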

sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala

Lines changed: 23 additions & 23 deletions
@@ -70,24 +70,15 @@ object SQLExecution {
     // streaming queries would give us call site like "run at <unknown>:0"
     val callSite = sc.getCallSite()

-    // Set all the specified SQL configs to local properties, so that they can be available at
-    // the executor side.
-    val allConfigs = sparkSession.sessionState.conf.getAllConfs
-    allConfigs.foreach {
-      // Excludes external configs defined by users.
-      case (key, value) if key.startsWith("spark") => sc.setLocalProperty(key, value)
-    }
-
-    sc.listenerBus.post(SparkListenerSQLExecutionStart(
-      executionId, callSite.shortForm, callSite.longForm, queryExecution.toString,
-      SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan), System.currentTimeMillis()))
-    try {
-      body
-    } finally {
-      sc.listenerBus.post(SparkListenerSQLExecutionEnd(
-        executionId, System.currentTimeMillis()))
-      allConfigs.foreach {
-        case (key, _) => sc.setLocalProperty(key, null)
+    withSQLConfPropagated(sparkSession) {
+      sc.listenerBus.post(SparkListenerSQLExecutionStart(
+        executionId, callSite.shortForm, callSite.longForm, queryExecution.toString,
+        SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan), System.currentTimeMillis()))
+      try {
+        body
+      } finally {
+        sc.listenerBus.post(SparkListenerSQLExecutionEnd(
+          executionId, System.currentTimeMillis()))
       }
     }
   } finally {
@@ -104,21 +95,30 @@
   def withExecutionId[T](sparkSession: SparkSession, executionId: String)(body: => T): T = {
     val sc = sparkSession.sparkContext
     val oldExecutionId = sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    withSQLConfPropagated(sparkSession) {
+      try {
+        sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, executionId)
+        body
+      } finally {
+        sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, oldExecutionId)
+      }
+    }
+  }
+
+  def withSQLConfPropagated[T](sparkSession: SparkSession)(body: => T): T = {
     // Set all the specified SQL configs to local properties, so that they can be available at
     // the executor side.
     val allConfigs = sparkSession.sessionState.conf.getAllConfs
-    allConfigs.foreach {
+    for ((key, value) <- allConfigs) {
       // Excludes external configs defined by users.
-      case (key, value) if key.startsWith("spark") => sc.setLocalProperty(key, value)
+      if (key.startsWith("spark")) sparkSession.sparkContext.setLocalProperty(key, value)
     }
     try {
-      sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, executionId)
       body
     } finally {
       allConfigs.foreach {
-        case (key, _) => sc.setLocalProperty(key, null)
+        case (key, _) => sparkSession.sparkContext.setLocalProperty(key, null)
       }
-      sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, oldExecutionId)
     }
   }
 }
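
Note: both execution entry points now share one set-then-restore helper. withSQLConfPropagated copies every "spark"-prefixed session conf into SparkContext local properties (which travel to executors with each task), runs the body, and clears the keys in a finally block. A self-contained sketch of the same loan pattern, with illustrative names rather than Spark's API:

object ConfPropagationSketch {
  // Stand-in for SparkContext local properties (per-thread, task-visible).
  private val localProps = new java.util.concurrent.ConcurrentHashMap[String, String]()

  def setLocalProperty(key: String, value: String): Unit =
    if (value == null) localProps.remove(key) else localProps.put(key, value)

  // The loan pattern: set the properties, run the body, always clean up.
  def withConfPropagated[T](confs: Map[String, String])(body: => T): T = {
    for ((key, value) <- confs) {
      if (key.startsWith("spark")) setLocalProperty(key, value)
    }
    try {
      body
    } finally {
      confs.keys.foreach(setLocalProperty(_, null))
    }
  }

  def main(args: Array[String]): Unit = {
    val confs = Map("spark.sql.shuffle.partitions" -> "10", "user.custom" -> "x")
    withConfPropagated(confs) {
      println(localProps) // only the spark-prefixed key was set
    }
    println(localProps) // empty again once the body finishes
  }
}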

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala

Lines changed: 13 additions & 13 deletions
@@ -34,6 +34,7 @@ import org.apache.spark.rdd.{BinaryFileRDD, RDD}
 import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions}
+import org.apache.spark.sql.execution.SQLExecution
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.text.TextFileFormat
 import org.apache.spark.sql.types.StructType
@@ -93,32 +94,30 @@ object TextInputJsonDataSource extends JsonDataSource {
       inputPaths: Seq[FileStatus],
       parsedOptions: JSONOptions): StructType = {
     val json: Dataset[String] = createBaseDataset(sparkSession, inputPaths, parsedOptions)
+
     inferFromDataset(json, parsedOptions)
   }

   def inferFromDataset(json: Dataset[String], parsedOptions: JSONOptions): StructType = {
     val sampled: Dataset[String] = JsonUtils.sample(json, parsedOptions)
-    if (parsedOptions.encoding.isDefined) {
-      // TODO: We should be able to parse the input string directly. Remove this hack when we
-      // support setting encoding when reading text files.
-      val encoding = parsedOptions.encoding.get
-      val textDS = sampled.map(new Text(_))(Encoders.javaSerialization)
-      val parser = CreateJacksonParser.text(encoding, _: JsonFactory, _: Text)
-      JsonInferSchema.infer(textDS, parsedOptions, parser)
-    } else {
-      JsonInferSchema.infer(sampled, parsedOptions, CreateJacksonParser.string)
+    val rdd: RDD[InternalRow] = sampled.queryExecution.toRdd
+    val rowParser = parsedOptions.encoding.map { enc =>
+      CreateJacksonParser.internalRow(enc, _: JsonFactory, _: InternalRow)
+    }.getOrElse(CreateJacksonParser.internalRow(_: JsonFactory, _: InternalRow))
+
+    SQLExecution.withSQLConfPropagated(json.sparkSession) {
+      JsonInferSchema.infer(rdd, parsedOptions, rowParser)
     }
   }

   private def createBaseDataset(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
       parsedOptions: JSONOptions): Dataset[String] = {
-    val paths = inputPaths.map(_.getPath.toString)
     sparkSession.baseRelationToDataFrame(
       DataSource.apply(
         sparkSession,
-        paths = paths,
+        paths = inputPaths.map(_.getPath.toString),
         className = classOf[TextFileFormat].getName,
         options = parsedOptions.parameters
       ).resolveRelation(checkFilesExist = false))
@@ -164,8 +163,9 @@ object MultiLineJsonDataSource extends JsonDataSource {
       .map(enc => createParser(enc, _: JsonFactory, _: PortableDataStream))
       .getOrElse(createParser(_: JsonFactory, _: PortableDataStream))

-    JsonInferSchema.infer[PortableDataStream](
-      sparkSession.createDataset(sampled)(Encoders.javaSerialization), parsedOptions, parser)
+    SQLExecution.withSQLConfPropagated(sparkSession) {
+      JsonInferSchema.infer[PortableDataStream](sampled, parsedOptions, parser)
+    }
   }

   private def createBaseRdd(
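
Note: the parser selection above works by partially applying an overloaded factory method into a plain function value, picking the encoding-aware overload only when an encoding is configured. A standalone illustration of the same trick (names and the string-length "parsing" are invented for the example):

object OverloadSelectionSketch {
  // Two overloads, analogous to CreateJacksonParser.internalRow.
  def makeParser(input: String): Int = input.length
  def makeParser(enc: String, input: String): Int = s"[$enc]$input".length

  def main(args: Array[String]): Unit = {
    val encoding: Option[String] = Some("UTF-16BE")

    // Eta-expand whichever overload applies; both branches yield a String => Int,
    // so downstream code never needs to know which one was picked.
    val parser: String => Int = encoding
      .map(enc => makeParser(enc, _: String))
      .getOrElse(makeParser(_: String))

    println(parser("{}")) // 12 here; 2 when encoding is None
  }
}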

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala

Lines changed: 5 additions & 12 deletions
@@ -22,7 +22,7 @@ import java.util.Comparator
 import com.fasterxml.jackson.core._

 import org.apache.spark.SparkException
-import org.apache.spark.sql.{Dataset, Encoders}
+import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis.TypeCoercion
 import org.apache.spark.sql.catalyst.json.JacksonUtils.nextUntil
 import org.apache.spark.sql.catalyst.json.JSONOptions
@@ -39,14 +39,14 @@ private[sql] object JsonInferSchema {
    * 3. Replace any remaining null fields with string, the top type
    */
   def infer[T](
-      json: Dataset[T],
+      json: RDD[T],
       configOptions: JSONOptions,
       createParser: (JsonFactory, T) => JsonParser): StructType = {
     val parseMode = configOptions.parseMode
     val columnNameOfCorruptRecord = configOptions.columnNameOfCorruptRecord

     // perform schema inference on each row and merge afterwards
-    val inferredTypes = json.mapPartitions { iter =>
+    val rootType = json.mapPartitions { iter =>
       val factory = new JsonFactory()
       configOptions.setJacksonOptions(factory)
       iter.flatMap { row =>
@@ -67,15 +67,8 @@
           }
         }
       }
-    }(Encoders.javaSerialization)
-
-    // TODO: use `Dataset.fold` once we have it.
-    val rootType = try {
-      inferredTypes.reduce(compatibleRootType(columnNameOfCorruptRecord, parseMode))
-    } catch {
-      case e: UnsupportedOperationException if e.getMessage == "empty collection" =>
-        StructType(Nil)
-    }
+    }.fold(StructType(Nil))(
+      compatibleRootType(columnNameOfCorruptRecord, parseMode))

     canonicalizeType(rootType) match {
       case Some(st: StructType) => st
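
Note: switching from reduce to fold removes the empty-input special case. RDD.reduce throws UnsupportedOperationException("empty collection") when no rows survive sampling, while RDD.fold simply returns its zero element, StructType(Nil) here. A minimal demonstration:

import org.apache.spark.sql.SparkSession

object FoldVsReduceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("fold-vs-reduce")
      .getOrCreate()
    val empty = spark.sparkContext.emptyRDD[Int]

    // empty.reduce(_ max _) would throw:
    //   java.lang.UnsupportedOperationException: empty collection

    // fold returns the zero value on an empty RDD; no try/catch needed.
    println(empty.fold(0)(_ max _)) // prints 0

    spark.stop()
  }
}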

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -1374,7 +1374,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
   test("SPARK-6245 JsonInferSchema.infer on empty RDD") {
     // This is really a test that it doesn't throw an exception
     val emptySchema = JsonInferSchema.infer(
-      empty,
+      empty.rdd,
       new JSONOptions(Map.empty[String, String], "GMT"),
       CreateJacksonParser.string)
     assert(StructType(Seq()) === emptySchema)
@@ -1401,7 +1401,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {

   test("SPARK-8093 Erase empty structs") {
     val emptySchema = JsonInferSchema.infer(
-      emptyRecords,
+      emptyRecords.rdd,
       new JSONOptions(Map.empty[String, String], "GMT"),
       CreateJacksonParser.string)
     assert(StructType(Seq()) === emptySchema)
