fix strip usage, fix complex ordering projection

peter-toth · peter-toth · commit 1f1f093e646d · 2023-01-31T11:57:35.000+01:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/AliasAwareOutputExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/AliasAwareOutputExpression.scala
@@ -44,7 +44,7 @@ trait AliasAwareOutputExpression extends SQLConfHelper {
     val aliases = mutable.Map[Expression, mutable.ArrayBuffer[Attribute]]()
     outputExpressions.reverse.foreach {
       case a @ Alias(child, _) =>
-        val buffer = aliases.getOrElseUpdate(strip(child.canonicalized), mutable.ArrayBuffer.empty)
+        val buffer = aliases.getOrElseUpdate(strip(child).canonicalized, mutable.ArrayBuffer.empty)
         if (buffer.size < aliasCandidateLimit) {
           buffer += a.toAttribute
         }
@@ -96,7 +96,11 @@ trait AliasAwareQueryOutputOrdering[T <: QueryPlan[T]]
 
   override final def outputOrdering: Seq[SortOrder] = {
     if (hasAlias) {
-      orderingExpressions.flatMap { sortOrder =>
+      // Keep only the leading `SortOrder`s for as long as they can be projected.
+      // E.g. if the child ordering is `Seq(SortOrder(a), SortOrder(b))` and
+      // only `a AS x` can be projected then we can return `Seq(SortOrder(x))`,
+      // but if only `b AS y` can be projected we can't return `Seq(SortOrder(y))`.
+      orderingExpressions.iterator.map { sortOrder =>
         val orderingSet = mutable.Set.empty[Expression]
         val sameOrderings = sortOrder.children.toStream
           .flatMap(projectExpression)
@@ -108,7 +112,7 @@ trait AliasAwareQueryOutputOrdering[T <: QueryPlan[T]]
         } else {
           None
         }
-      }
+      }.takeWhile(_.isDefined).flatten.toSeq
     } else {
       orderingExpressions
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ProjectedOrderingAndPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ProjectedOrderingAndPartitioningSuite.scala
@@ -159,4 +159,22 @@ class ProjectedOrderingAndPartitioningSuite
     assert(outputOrdering.head.sameOrderExpressions.map(_.sql) ==
       Seq("aa", "(b + b)", "(a + b)", "(a + a)"))
   }
+
+  test("SPARK-42049: Improve AliasAwareOutputExpression - ordering partly projected") {
+    val df = spark.range(2).orderBy($"id" + 1, $"id" + 2)
+
+    val df1 = df.selectExpr("id + 1 AS a", "id + 2 AS b")  // both sort keys are aliased
+    val outputOrdering1 = df1.queryExecution.optimizedPlan.outputOrdering
+    assert(outputOrdering1.size == 2)
+    assert(outputOrdering1.map(_.sql) == Seq("a ASC NULLS FIRST", "b ASC NULLS FIRST"))
+
+    val df2 = df.selectExpr("id + 1 AS a")  // only the leading sort key is aliased
+    val outputOrdering2 = df2.queryExecution.optimizedPlan.outputOrdering
+    assert(outputOrdering2.size == 1)
+    assert(outputOrdering2.head.sql == "a ASC NULLS FIRST")
+
+    val df3 = df.selectExpr("id + 2 AS b")  // only the second sort key is aliased
+    val outputOrdering3 = df3.queryExecution.optimizedPlan.outputOrdering
+    assert(outputOrdering3.size == 0)  // expects no ordering when the leading key is dropped
+  }
 }