Commit 488e051

Add more checks
1 parent dedce0c commit 488e051

File tree

2 files changed: +82 −14 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala

Lines changed: 75 additions & 7 deletions
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.joins
 
+import scala.collection.mutable
+
 import org.apache.spark.TaskContext
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
@@ -61,25 +63,91 @@ case class BroadcastHashJoinExec(
   }
 
   override def outputPartitioning: Partitioning = {
-    def buildKeys: Seq[Expression] = buildSide match {
-      case BuildLeft => leftKeys
-      case BuildRight => rightKeys
+    val (buildKeys, streamedKeys) = buildSide match {
+      case BuildLeft => (leftKeys, rightKeys)
+      case BuildRight => (rightKeys, leftKeys)
     }
 
     joinType match {
       case _: InnerLike =>
         streamedPlan.outputPartitioning match {
           case h: HashPartitioning =>
-            PartitioningCollection(Seq(h, HashPartitioning(buildKeys, h.numPartitions)))
-          case c: PartitioningCollection
-              if c.partitionings.forall(_.isInstanceOf[HashPartitioning]) =>
-            PartitioningCollection(c.partitionings :+ HashPartitioning(buildKeys, c.numPartitions))
+            getBuildSidePartitioning(h, streamedKeys, buildKeys) match {
+              case Some(p) => PartitioningCollection(Seq(h, p))
+              case None => h
+            }
+          case c: PartitioningCollection =>
+            c.partitionings.foreach {
+              case h: HashPartitioning =>
+                getBuildSidePartitioning(h, streamedKeys, buildKeys) match {
+                  case Some(p) => return PartitioningCollection(c.partitionings :+ p)
+                  case None => ()
+                }
+              case _ => ()
+            }
+            c
           case other => other
         }
       case _ => streamedPlan.outputPartitioning
     }
   }
 
+  /**
+   * Returns a partitioning for the build side if the following conditions are met:
+   *   - The streamed side's output partitioning expressions consist of all the keys
+   *     from the streamed side.
+   *   - There is a one-to-one mapping from streamed keys to build keys.
+   *
+   * The build side partitioning will have expressions in the same order as the expressions
+   * in the streamed side partitioning. For example, for the following setup:
+   *   - streamed partitioning expressions: Seq(s1, s2)
+   *   - streamed keys: Seq(c1, c2)
+   *   - build keys: Seq(b1, b2)
+   * the expressions in the build side partitioning will be Seq(b1, b2), not Seq(b2, b1).
+   */
+  private def getBuildSidePartitioning(
+      streamedPartitioning: HashPartitioning,
+      streamedKeys: Seq[Expression],
+      buildKeys: Seq[Expression]): Option[HashPartitioning] = {
+    if (!satisfiesPartitioning(streamedKeys, streamedPartitioning)) {
+      return None
+    }
+
+    val streamedKeyToBuildKeyMap = mutable.Map.empty[Expression, Expression]
+    streamedKeys.zip(buildKeys).foreach {
+      case (streamedKey, buildKey) =>
+        val inserted = streamedKeyToBuildKeyMap.getOrElseUpdate(
+          streamedKey.canonicalized,
+          buildKey)
+
+        if (!inserted.semanticEquals(buildKey)) {
+          // One-to-many mapping from streamed keys to build keys found.
+          return None
+        }
+    }
+
+    // Ensure the one-to-one mapping from streamed keys to build keys.
+    if (streamedKeyToBuildKeyMap.size != streamedKeyToBuildKeyMap.values.toSet.size) {
+      return None
+    }
+
+    // The final expressions are built by mapping stream partitioning expressions ->
+    // streamed keys -> build keys.
+    val buildPartitioningExpressions = streamedPartitioning.expressions.map { e =>
+      streamedKeyToBuildKeyMap(e.canonicalized)
+    }
+
+    Some(HashPartitioning(buildPartitioningExpressions, streamedPartitioning.numPartitions))
+  }
+
+  // Returns true if `keys` consist of all the expressions in `partitioning`.
+  private def satisfiesPartitioning(
+      keys: Seq[Expression],
+      partitioning: HashPartitioning): Boolean = {
+    partitioning.expressions.length == keys.length &&
+      partitioning.expressions.forall(e => keys.exists(_.semanticEquals(e)))
+  }
+
   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
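To make the one-to-one mapping check concrete, here is a minimal, standalone sketch of the same idea with Catalyst expressions modeled as plain strings. The object KeyMappingSketch, the function buildSideKeysFor, and the string keys are illustrative stand-ins, not part of the commit:

import scala.collection.mutable

object KeyMappingSketch {
  // Derives build-side partitioning keys from the streamed side's
  // partitioning expressions, mirroring the logic of getBuildSidePartitioning.
  def buildSideKeysFor(
      streamedPartitioningExprs: Seq[String],
      streamedKeys: Seq[String],
      buildKeys: Seq[String]): Option[Seq[String]] = {
    // The partitioning must consist of exactly the streamed keys
    // (the satisfiesPartitioning check).
    val coversAllKeys =
      streamedPartitioningExprs.length == streamedKeys.length &&
        streamedPartitioningExprs.forall(streamedKeys.contains)
    if (!coversAllKeys) return None

    // Reject one-to-many mappings: the same streamed key paired with
    // two different build keys.
    val mapping = mutable.Map.empty[String, String]
    val oneToMany = streamedKeys.zip(buildKeys).exists {
      case (s, b) => mapping.getOrElseUpdate(s, b) != b
    }
    if (oneToMany) return None

    // Reject many-to-one mappings: two streamed keys sharing a build key.
    if (mapping.size != mapping.values.toSet.size) return None

    // Reorder the build keys to follow the streamed partitioning order.
    Some(streamedPartitioningExprs.map(mapping))
  }

  def main(args: Array[String]): Unit = {
    // Keys may appear in the partitioning in a different order than in the
    // join condition; the result follows the partitioning order.
    assert(buildSideKeysFor(Seq("j1", "i1"), Seq("i1", "j1"), Seq("i2", "j2"))
      .contains(Seq("j2", "i2")))
    // A condition like t1.i1 = t2.i2 AND t1.i1 = t2.j2 is one-to-many,
    // so no build-side partitioning is derived.
    assert(buildSideKeysFor(Seq("i1", "i1"), Seq("i1", "i1"), Seq("i2", "j2")).isEmpty)
  }
}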

sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala

Lines changed: 7 additions & 7 deletions
@@ -469,14 +469,14 @@ abstract class BroadcastJoinSuiteBase extends QueryTest with SQLTestUtils
     val df1 = (0 until 100).map(i => (i % 5, i % 13)).toDF("i1", "j1")
     val df2 = (0 until 20).map(i => (i % 7, i % 11)).toDF("i2", "j2")
     val df3 = (0 until 100).map(i => (i % 5, i % 13)).toDF("i3", "j3")
-    df1.write.format("parquet").bucketBy(8, "i1").saveAsTable("t1")
-    df3.write.format("parquet").bucketBy(8, "i3").saveAsTable("t3")
+    df1.write.format("parquet").bucketBy(8, "i1", "j1").saveAsTable("t1")
+    df3.write.format("parquet").bucketBy(8, "i3", "j3").saveAsTable("t3")
     val t1 = spark.table("t1")
     val t3 = spark.table("t3")
 
     // join1 is a broadcast join where df2 is broadcasted. Note that output partitioning on the
     // streamed side (t1) is HashPartitioning (bucketed files).
-    val join1 = t1.join(df2, t1("i1") === df2("i2"))
+    val join1 = t1.join(df2, t1("i1") === df2("i2") && t1("j1") === df2("j2"))
     val plan1 = join1.queryExecution.executedPlan
     assert(collect(plan1) { case e: ShuffleExchangeExec => e }.isEmpty)
     val broadcastJoins = collect(plan1) { case b: BroadcastHashJoinExec => b }
@@ -489,17 +489,17 @@ abstract class BroadcastJoinSuiteBase extends QueryTest with SQLTestUtils
       case _ => fail()
     }
 
-    // Join on the column from the broadcasted side (i2) and make sure output partitioning
+    // Join on the columns from the broadcasted side (i2, j2) and make sure output partitioning
     // is maintained by checking no shuffle exchange is introduced.
-    val join2 = join1.join(t3, join1("i2") === t3("i3"))
+    val join2 = join1.join(t3, join1("i2") === t3("i3") && join1("j2") === t3("j3"))
     val plan2 = join2.queryExecution.executedPlan
     assert(collect(plan2) { case s: SortMergeJoinExec => s }.size == 1)
     assert(collect(plan2) { case b: BroadcastHashJoinExec => b }.size == 1)
     assert(collect(plan2) { case e: ShuffleExchangeExec => e }.isEmpty)
 
-    // Validate the data with boradcast join off.
+    // Validate the data with broadcast join off.
     withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
-      val df = join1.join(t3, join1("i2") === t3("i3"))
+      val df = join1.join(t3, join1("i2") === t3("i3") && join1("j2") === t3("j3"))
       QueryTest.sameRows(join2.collect().toSeq, df.collect().toSeq)
     }
   }
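The behavior this test asserts can also be observed interactively. The following is a hypothetical spark-shell session, not part of the commit; it mirrors the test's setup (table names t1/t3, bucketing layout) and assumes a Spark build containing this change with a catalog that supports bucketed tables:

import org.apache.spark.sql.functions.broadcast
import spark.implicits._

val df1 = (0 until 100).map(i => (i % 5, i % 13)).toDF("i1", "j1")
val df2 = (0 until 20).map(i => (i % 7, i % 11)).toDF("i2", "j2")
val df3 = (0 until 100).map(i => (i % 5, i % 13)).toDF("i3", "j3")
df1.write.format("parquet").bucketBy(8, "i1", "j1").saveAsTable("t1")
df3.write.format("parquet").bucketBy(8, "i3", "j3").saveAsTable("t3")

val t1 = spark.table("t1")
val t3 = spark.table("t3")

// Broadcast join: the streamed side t1 keeps its bucketed HashPartitioning.
val join1 = t1.join(broadcast(df2), t1("i1") === df2("i2") && t1("j1") === df2("j2"))

// Join downstream on the broadcast side's columns (i2, j2). With this change,
// the derived build-side partitioning lets the sort-merge join appear without
// a ShuffleExchange on join1's side; without it, this join would repartition.
val join2 = join1.join(t3, join1("i2") === t3("i3") && join1("j2") === t3("j3"))
join2.explain()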
