
Commit a7a3935

maryannxue authored and gatorsmile committed
[SPARK-11150][SQL] Dynamic Partition Pruning
### What changes were proposed in this pull request?

This patch implements dynamic partition pruning by adding a dynamic-partition-pruning filter if there is a partitioned table and a filter on the dimension table. The filter is then planned using a heuristic approach:

1. As a broadcast relation if it is a broadcast hash join. The broadcast relation will then be transformed into a reused broadcast exchange by the `ReuseExchange` rule; or
2. As a subquery duplicate if the estimated benefit of the partition table scan being saved is greater than the estimated cost of the extra scan of the duplicated subquery; otherwise
3. As a bypassed condition (`true`).

### Why are the changes needed?

This is an important performance feature.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Added unit tests:
- Testing DPP by enabling / disabling the reuse broadcast results feature and / or the subquery duplication feature.
- Testing DPP with reused broadcast results.
- Testing the key iterators on different HashedRelation types.
- Testing the packing and unpacking of the broadcast keys in a LongType.

Closes #25600 from maryannxue/dpp.

Authored-by: maryannxue <[email protected]>
Signed-off-by: Xiao Li <[email protected]>
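To make the target query shape concrete, here is a hedged sketch (not part of this commit; the tables, columns, and session setup are illustrative assumptions): a star-schema join where the fact table is partitioned by the join key, so the filter on the dimension table can prune fact partitions at runtime.

import org.apache.spark.sql.SparkSession

// Hypothetical setup: `fact` is a table partitioned by `store_id`; `dim` is a
// small dimension table. With DPP enabled, the filter on `dim` is converted
// into a pruning predicate on `fact.store_id`, so only matching partitions of
// `fact` are scanned.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.sql("""
  SELECT f.units, d.country
  FROM fact f JOIN dim d ON f.store_id = d.store_id
  WHERE d.country = 'US'
""").explain()  // the fact-side scan should show a dynamicpruning#... subquery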
1 parent f96486b commit a7a3935

File tree

20 files changed, +2447 -26 lines changed


core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java

Lines changed: 7 additions & 0 deletions

@@ -414,6 +414,13 @@ public MapIterator iterator() {
     return new MapIterator(numValues, loc, false);
   }

+  /**
+   * Returns a thread-safe iterator that iterates over the entries of this map.
+   */
+  public MapIterator safeIterator() {
+    return new MapIterator(numValues, new Location(), false);
+  }
+
   /**
    * Returns a destructive iterator for iterating over the entries of this map. It frees each page
    * as it moves onto next one. Notice: it is illegal to call any method on the map after
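A hedged usage sketch (an illustration, not code from this commit): unlike iterator(), which hands out the map's shared cursor, each safeIterator() call gets a fresh Location, so independent readers can scan the same map concurrently.

import org.apache.spark.unsafe.map.BytesToBytesMap

// Count the entries of an already-built map via the new thread-safe iterator.
// (Constructing `map` is omitted; it requires a TaskMemoryManager and page size.)
def countEntries(map: BytesToBytesMap): Long = {
  val it = map.safeIterator()  // fresh Location, safe to use from another thread
  var n = 0L
  while (it.hasNext) {
    it.next()                  // returns a BytesToBytesMap.Location for the entry
    n += 1
  }
  n
}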
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala

Lines changed: 95 additions & 0 deletions

@@ -0,0 +1,95 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

trait DynamicPruning extends Predicate

/**
 * The DynamicPruningSubquery expression is only used in join operations to prune one side of the
 * join with a filter from the other side of the join. It is inserted in cases where partition
 * pruning can be applied.
 *
 * @param pruningKey the filtering key of the plan to be pruned.
 * @param buildQuery the build side of the join.
 * @param buildKeys the join keys corresponding to the build side of the join.
 * @param broadcastKeyIndex the index of the filtering key collected from the broadcast.
 * @param onlyInBroadcast when true, the pruning filter is applied only if it can reuse the
 *   results of the broadcast through ReuseExchange; when false, the filter is likely to be
 *   beneficial on its own and is executed even if it cannot reuse the broadcast results.
 */
case class DynamicPruningSubquery(
    pruningKey: Expression,
    buildQuery: LogicalPlan,
    buildKeys: Seq[Expression],
    broadcastKeyIndex: Int,
    onlyInBroadcast: Boolean,
    exprId: ExprId = NamedExpression.newExprId)
  extends SubqueryExpression(buildQuery, Seq(pruningKey), exprId)
  with DynamicPruning
  with Unevaluable {

  override def children: Seq[Expression] = Seq(pruningKey)

  override def plan: LogicalPlan = buildQuery

  override def nullable: Boolean = false

  override def withNewPlan(plan: LogicalPlan): DynamicPruningSubquery = copy(buildQuery = plan)

  override lazy val resolved: Boolean = {
    pruningKey.resolved &&
      buildQuery.resolved &&
      buildKeys.nonEmpty &&
      buildKeys.forall(_.resolved) &&
      broadcastKeyIndex >= 0 &&
      broadcastKeyIndex < buildKeys.size &&
      buildKeys.forall(_.references.subsetOf(buildQuery.outputSet)) &&
      pruningKey.dataType == buildKeys(broadcastKeyIndex).dataType
  }

  override def toString: String = s"dynamicpruning#${exprId.id} $conditionString"

  override lazy val canonicalized: DynamicPruning = {
    copy(
      pruningKey = pruningKey.canonicalized,
      buildQuery = buildQuery.canonicalized,
      buildKeys = buildKeys.map(_.canonicalized),
      exprId = ExprId(0))
  }
}

/**
 * Marker for a planned [[DynamicPruning]] expression.
 * The expression is created during planning, and it defers to its child for evaluation.
 *
 * @param child underlying predicate.
 */
case class DynamicPruningExpression(child: Expression)
  extends UnaryExpression
  with DynamicPruning {
  override def eval(input: InternalRow): Any = child.eval(input)

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    child.genCode(ctx)
  }
}
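For orientation, a hedged construction sketch (the wrapper function and its plan parameters are illustrative assumptions, not code from this commit) of how a planning rule can attach this expression to the pruned side of a join:

import org.apache.spark.sql.catalyst.expressions.{Attribute, DynamicPruningSubquery, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}

// Wrap the fact-table side with a pruning filter driven by the dimension side.
def insertPruningFilter(
    factPlan: LogicalPlan,       // scan side to be pruned (partitioned table)
    partitionCol: Attribute,     // fact-side partition column used as a join key
    buildPlan: LogicalPlan,      // filtered dimension (build) side of the join
    buildKey: Expression): LogicalPlan = {
  Filter(
    DynamicPruningSubquery(
      pruningKey = partitionCol,
      buildQuery = buildPlan,
      buildKeys = Seq(buildKey),
      broadcastKeyIndex = 0,     // which build key's values prune the partitions
      onlyInBroadcast = true),   // keep the filter only if broadcast reuse applies
    factPlan)
}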

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala

Lines changed: 38 additions & 1 deletion
@@ -21,9 +21,10 @@ import scala.collection.immutable.TreeSet

 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral, GenerateSafeProjection, GenerateUnsafeProjection, Predicate => BasePredicate}
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._

@@ -65,6 +66,42 @@ trait PredicateHelper {
   }
 }

+  /**
+   * Find the origin of where the input references of expression exp were scanned in the tree of
+   * plan, and whether they all originate from a single leaf node.
+   * Returns an optional tuple with the expression, undoing any projections and aliasing applied
+   * along the way from plan to origin, and the origin LeafNode from which all the references of
+   * exp originate.
+   */
+  def findExpressionAndTrackLineageDown(
+      exp: Expression,
+      plan: LogicalPlan): Option[(Expression, LogicalPlan)] = {
+
+    plan match {
+      case Project(projectList, child) =>
+        val aliases = AttributeMap(projectList.collect {
+          case a @ Alias(child, _) => (a.toAttribute, child)
+        })
+        findExpressionAndTrackLineageDown(replaceAlias(exp, aliases), child)
+      // we can unwrap only if there are row projections, and no aggregation operation
+      case Aggregate(_, aggregateExpressions, child) =>
+        val aliasMap = AttributeMap(aggregateExpressions.collect {
+          case a: Alias if a.child.find(_.isInstanceOf[AggregateExpression]).isEmpty =>
+            (a.toAttribute, a.child)
+        })
+        findExpressionAndTrackLineageDown(replaceAlias(exp, aliasMap), child)
+      case l: LeafNode if exp.references.subsetOf(l.outputSet) =>
+        Some((exp, l))
+      case other =>
+        other.children.flatMap {
+          child => if (exp.references.subsetOf(child.outputSet)) {
+            findExpressionAndTrackLineageDown(exp, child)
+          } else {
+            None
+          }
+        }.headOption
+    }
+  }
+
   protected def splitDisjunctivePredicates(condition: Expression): Seq[Expression] = {
     condition match {
       case Or(cond1, cond2) =>
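A hedged, test-style sketch of what the new helper returns (names and the Catalyst DSL setup are illustrative assumptions): lineage tracking undoes the alias `x` back to the underlying attribute `a` and reports the leaf it came from.

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

object LineageExample extends PredicateHelper {
  val a = 'a.int                     // an attribute of the leaf relation
  val rel = LocalRelation(a)
  val plan = rel.select(a.as("x"))   // Project(a AS x, rel)
  val x = plan.output.head           // the aliased output attribute
  // Expected result: Some((a, rel)); projections and aliasing are unwound.
  val origin = findExpressionAndTrackLineageDown(x, plan)
}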

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 3 additions & 1 deletion
@@ -48,7 +48,9 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
   }

   override protected val blacklistedOnceBatches: Set[String] =
-    Set("Extract Python UDFs")
+    Set(
+      "PartitionPruning",
+      "Extract Python UDFs")

   protected def fixedPoint = FixedPoint(SQLConf.get.optimizerMaxIterations)

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 43 additions & 0 deletions
@@ -216,6 +216,39 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val DYNAMIC_PARTITION_PRUNING_ENABLED =
+    buildConf("spark.sql.optimizer.dynamicPartitionPruning.enabled")
+      .doc("When true, we will generate a predicate for the partition column when it's used " +
+        "as a join key.")
+      .booleanConf
+      .createWithDefault(true)
+
+  val DYNAMIC_PARTITION_PRUNING_USE_STATS =
+    buildConf("spark.sql.optimizer.dynamicPartitionPruning.useStats")
+      .internal()
+      .doc("When true, distinct count statistics will be used for computing the data size of " +
+        "the partitioned table after dynamic partition pruning, in order to evaluate if it is " +
+        "worth adding an extra subquery as the pruning filter if broadcast reuse is not " +
+        "applicable.")
+      .booleanConf
+      .createWithDefault(true)
+
+  val DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO = buildConf(
+    "spark.sql.optimizer.dynamicPartitionPruning.fallbackFilterRatio")
+    .internal()
+    .doc("When statistics are not available or configured not to be used, this config will be " +
+      "used as the fallback filter ratio for computing the data size of the partitioned table " +
+      "after dynamic partition pruning, in order to evaluate if it is worth adding an extra " +
+      "subquery as the pruning filter if broadcast reuse is not applicable.")
+    .doubleConf
+    .createWithDefault(0.5)
+
+  val DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST =
+    buildConf("spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcast")
+      .internal()
+      .doc("When true, dynamic partition pruning will seek to reuse the broadcast results " +
+        "from a broadcast hash join operation.")
+      .booleanConf
+      .createWithDefault(true)
+
   val COMPRESS_CACHED = buildConf("spark.sql.inMemoryColumnarStorage.compressed")
     .doc("When set to true Spark SQL will automatically select a compression codec for each " +
       "column based on statistics of the data.")

@@ -1970,6 +2003,16 @@ class SQLConf extends Serializable with Logging {

   def optimizerPlanChangeBatches: Option[String] = getConf(OPTIMIZER_PLAN_CHANGE_LOG_BATCHES)

+  def dynamicPartitionPruningEnabled: Boolean = getConf(DYNAMIC_PARTITION_PRUNING_ENABLED)
+
+  def dynamicPartitionPruningUseStats: Boolean = getConf(DYNAMIC_PARTITION_PRUNING_USE_STATS)
+
+  def dynamicPartitionPruningFallbackFilterRatio: Double =
+    getConf(DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO)
+
+  def dynamicPartitionPruningReuseBroadcast: Boolean =
+    getConf(DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST)
+
   def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS)

   def stateStoreMinDeltasForSnapshot: Int = getConf(STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT)
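A hedged usage sketch of the new knobs (the session setup is an assumption; the values shown are simply the defaults defined above):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Public switch for the feature as a whole.
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")
// Internal tuning knobs: stats-based cost estimation, the fallback ratio used
// when stats are unavailable, and the broadcast-reuse preference.
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.useStats", "true")
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.fallbackFilterRatio", "0.5")
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcast", "true")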
sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/CleanupDynamicPruningFilters.scala

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.dynamicpruning

import org.apache.spark.sql.catalyst.expressions.{DynamicPruning, PredicateHelper}
import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
import org.apache.spark.sql.internal.SQLConf

/**
 * Removes the filter nodes with dynamic pruning that were not pushed down to the scan.
 * These nodes will not be pushed through projects and aggregates with non-deterministic
 * expressions.
 */
object CleanupDynamicPruningFilters extends Rule[LogicalPlan] with PredicateHelper {

  override def apply(plan: LogicalPlan): LogicalPlan = {
    if (!SQLConf.get.dynamicPartitionPruningEnabled) {
      return plan
    }

    plan.transform {
      // pass through anything that is pushed down into PhysicalOperation
      case p @ PhysicalOperation(_, _, LogicalRelation(_: HadoopFsRelation, _, _, _)) => p
      // remove any Filters with DynamicPruning that didn't get pushed down to PhysicalOperation.
      case f @ Filter(condition, _) =>
        val newCondition = condition.transform {
          case _: DynamicPruning => TrueLiteral
        }
        f.copy(condition = newCondition)
    }
  }
}
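The heart of the rule is the condition transform; below is a minimal standalone sketch of just that rewrite (the helper name is an assumption):

import org.apache.spark.sql.catalyst.expressions.{DynamicPruning, Expression}
import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral

// Any leftover DynamicPruning predicate degenerates to `true`, so an unpushed
// filter condition like `dynamicpruning#1 AND (id > 5)` becomes
// `true AND (id > 5)`, preserving the ordinary predicates.
def dropDynamicPruning(condition: Expression): Expression =
  condition.transform { case _: DynamicPruning => TrueLiteral }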
