
Commit 021977f

Implemented another idea
1 parent 148b6d5 commit 021977f

9 files changed, +77 -94 lines changed


python/pyspark/sql/tests.py

Lines changed: 0 additions & 1 deletion
@@ -1137,7 +1137,6 @@ def test_access_column(self):
         self.assertTrue(isinstance(df['key'], Column))
         self.assertTrue(isinstance(df[0], Column))
         self.assertRaises(IndexError, lambda: df[2])
-        self.assertRaises(AnalysisException, lambda: df["bad_key"])
         self.assertRaises(TypeError, lambda: df[{}])

     def test_column_name_with_non_ascii(self):

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 38 additions & 26 deletions
@@ -603,15 +603,21 @@ class Analyzer(
       case q: LogicalPlan =>
         logTrace(s"Attempting to resolve ${q.simpleString}")
         q transformExpressionsUp {
-          case u @ UnresolvedAttribute(nameParts) =>
+          case u @ UnresolvedAttribute(nameParts, targetPlanIdOpt) =>
             // Leave unchanged if resolution fails. Hopefully will be resolved next round.
             val result =
-              withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) }
+              withPosition(u) {
+                targetPlanIdOpt match {
+                  case Some(targetPlanId) =>
+                    resolveExpressionFromSpecificLogicalPlan(nameParts, q, targetPlanId)
+                  case None =>
+                    q.resolveChildren(nameParts, resolver).getOrElse(u)
+                }
+              }
             logDebug(s"Resolving $u to $result")
             result
           case UnresolvedExtractValue(child, fieldExpr) if child.resolved =>
             ExtractValue(child, fieldExpr, resolver)
-          case l: LazilyDeterminedAttribute => resolveLazilyDeterminedAttribute(l, q)
         }
     }

@@ -684,22 +690,18 @@ class Analyzer(
       exprs.exists(_.find(_.isInstanceOf[UnresolvedDeserializer]).isDefined)
     }

-    private def resolveLazilyDeterminedAttribute(
-        expr: LazilyDeterminedAttribute,
-        plan: LogicalPlan): Expression = {
-
-      val foundPlanOpt = plan.findByBreadthFirst(_.planId == expr.plan.planId)
-      val foundPlan = foundPlanOpt.getOrElse {
-        failAnalysis(s"""Cannot resolve column name "${expr.name}" """)
-      }
-
-      if (foundPlan == expr.plan) {
-        expr.namedExpr
-      } else {
-        foundPlan.resolveQuoted(expr.name, resolver).getOrElse {
-          failAnalysis(s"""Cannot resolve column name "${expr.name}" """ +
-            s"""among (${foundPlan.schema.fieldNames.mkString(", ")})""")
-        }
+    private[sql] def resolveExpressionFromSpecificLogicalPlan(
+        nameParts: Seq[String],
+        planToSearchFrom: LogicalPlan,
+        targetPlanId: Long): Expression = {
+      lazy val name = UnresolvedAttribute(nameParts).name
+      planToSearchFrom.findByBreadthFirst(_.planId == targetPlanId) match {
+        case Some(foundPlan) =>
+          foundPlan.resolve(nameParts, resolver).getOrElse {
+            failAnalysis(s"Could not find $name in ${planToSearchFrom.output.mkString(", ")}")
+          }
+        case None =>
+          failAnalysis(s"Could not find $name in ${planToSearchFrom.output.mkString(", ")}")
       }
     }

@@ -714,11 +716,16 @@ class Analyzer(
     try {
       expr transformUp {
         case GetColumnByOrdinal(ordinal, _) => plan.output(ordinal)
-        case u @ UnresolvedAttribute(nameParts) =>
-          withPosition(u) { plan.resolve(nameParts, resolver).getOrElse(u) }
+        case u @ UnresolvedAttribute(nameParts, targetPlanIdOpt) =>
+          withPosition(u) {
+            targetPlanIdOpt match {
+              case Some(targetPlanId) =>
+                resolveExpressionFromSpecificLogicalPlan(nameParts, plan, targetPlanId)
+              case None => plan.resolve(nameParts, resolver).getOrElse(u)
+            }
+          }
         case UnresolvedExtractValue(child, fieldName) if child.resolved =>
           ExtractValue(child, fieldName, resolver)
-        case l: LazilyDeterminedAttribute => resolveLazilyDeterminedAttribute(l, plan)
       }
     } catch {
       case a: AnalysisException if !throws => expr

@@ -942,12 +949,17 @@ class Analyzer(
     plan transformDown {
       case q: LogicalPlan if q.childrenResolved && !q.resolved =>
         q transformExpressions {
-          case u @ UnresolvedAttribute(nameParts) =>
+          case u @ UnresolvedAttribute(nameParts, targetPlanIdOpt) =>
             withPosition(u) {
               try {
-                outer.resolve(nameParts, resolver) match {
-                  case Some(outerAttr) => OuterReference(outerAttr)
-                  case None => u
+                targetPlanIdOpt match {
+                  case Some(targetPlanId) =>
+                    resolveExpressionFromSpecificLogicalPlan(nameParts, outer, targetPlanId)
+                  case None =>
+                    outer.resolve(nameParts, resolver) match {
+                      case Some(outerAttr) => OuterReference(outerAttr)
+                      case None => u
+                    }
                 }
               } catch {
                 case _: AnalysisException => u
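
Note: the three hunks above share one dispatch pattern. As a minimal sketch under this commit's names (`targetPlanIdOpt` on `UnresolvedAttribute`, plus the new `resolveExpressionFromSpecificLogicalPlan`), the added branches boil down to:

// Sketch, not the exact rule body: an attribute that carries a target plan id
// is resolved against the plan found by that id; otherwise resolution falls
// back to the usual name-based lookup, leaving the attribute unresolved on a miss.
def resolveAttr(u: UnresolvedAttribute, plan: LogicalPlan): Expression =
  u.targetPlanIdOpt match {
    case Some(id) => resolveExpressionFromSpecificLogicalPlan(u.nameParts, plan, id)
    case None     => plan.resolve(u.nameParts, resolver).getOrElse(u)
  }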

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala

Lines changed: 3 additions & 40 deletions
@@ -83,7 +83,9 @@ case class UnresolvedTableValuedFunction(functionName: String, functionArgs: Seq
 /**
  * Holds the name of an attribute that has yet to be resolved.
  */
-case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Unevaluable {
+case class UnresolvedAttribute(
+    nameParts: Seq[String],
+    targetPlanIdOpt: Option[Long] = None) extends Attribute with Unevaluable {

   def name: String =
     nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".")
@@ -419,42 +421,3 @@ case class UnresolvedOrdinal(ordinal: Int)
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 }
-
-/**
- * This is used when we refer a column like `df("expr")`
- * and determines which expression `df("expr")` should point to lazily.
- * Normally, `df("expr")` should point the expression (say expr1 here.) which
- * the logical plan in `df` outputs. but we have some cases that `df("expr")` should
- * point to another expression (say expr2 here) rather than expr1
- * and in this case, expr2 is equally to expr1 except exprId.
- * This will happen when datasets are self-joined or in similar situations and in this situation,
- * logical plans and expressions of those outputs are re-created with new exprIds the analyzer.
- * [[LazilyDeterminedAttribute()]] can treat this case properly
- * to determine that `df("expr")` should point which expression in the analyzer.
- *
- * @param namedExpr The expression which a column reference should point to normally.
- * @param plan The logical plan which contains the expression
- *             which the column reference should point to lazily.
- */
-case class LazilyDeterminedAttribute(
-    namedExpr: NamedExpression)(
-    val plan: LogicalPlan)
-  extends Attribute with Unevaluable {
-  // We need to keep the constructor curried
-  // so that we can compare like df1("col1") == df2("col1") especially in case of test.
-
-  override def name: String = namedExpr.name
-  override def exprId: ExprId = throw new UnresolvedException(this, "exprId")
-  override def dataType: DataType = throw new UnresolvedException(this, "dataType")
-  override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
-  override def qualifier: Option[String] = throw new UnresolvedException(this, "qualifier")
-  override lazy val resolved = false
-
-  override def newInstance(): Attribute = throw new UnresolvedException(this, "newInstance")
-  override def withNullability(newNullability: Boolean): Attribute =
-    throw new UnresolvedException(this, "withNullability")
-  override def withName(newName: String): Attribute =
-    throw new UnresolvedException(this, "withName")
-  override def withQualifier(newQualifier: Option[String]): Attribute =
-    throw new UnresolvedException(this, "withQualifier")
-}
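
Note: `targetPlanIdOpt` replaces the curried `LazilyDeterminedAttribute` deleted above, so a column reference now carries a plan id rather than a captured plan. A hedged sketch of constructing such a reference (the `42L` id is illustrative; `parseAttributeName` is the existing helper this patch uses in `Dataset.col`):

// Illustrative: a reference to "a.b" that should resolve against plan 42.
val ref = UnresolvedAttribute(UnresolvedAttribute.parseAttributeName("a.b"), Some(42L))
assert(ref.name == "a.b") // the name is rebuilt from the parsed parts
assert(!ref.resolved)     // stays unresolved until the analyzer finds plan 42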

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ object ExpressionEncoder {
     } else {
       val input = GetColumnByOrdinal(index, enc.schema)
       val deserialized = enc.deserializer.transformUp {
-        case UnresolvedAttribute(nameParts) =>
+        case UnresolvedAttribute(nameParts, _) =>
           assert(nameParts.length == 1)
           UnresolvedExtractValue(input, Literal(nameParts.head))
         case GetColumnByOrdinal(ordinal, _) => GetStructField(input, ordinal)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 2 additions & 2 deletions
@@ -1168,8 +1168,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
   override def visitDereference(ctx: DereferenceContext): Expression = withOrigin(ctx) {
     val attr = ctx.fieldName.getText
     expression(ctx.base) match {
-      case UnresolvedAttribute(nameParts) =>
-        UnresolvedAttribute(nameParts :+ attr)
+      case UnresolvedAttribute(nameParts, targetPlanId) =>
+        UnresolvedAttribute(nameParts :+ attr, targetPlanId)
       case e =>
         UnresolvedExtractValue(e, Literal(attr))
     }

sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala

Lines changed: 8 additions & 8 deletions
@@ -161,7 +161,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
       if (f.dataType.isInstanceOf[NumericType] && cols.exists(col => columnEquals(f.name, col))) {
         fillCol[Double](f, value)
       } else {
-        df.colInternal(f.name)
+        df.col(f.name)
       }
     }
     df.select(projections : _*)
@@ -188,7 +188,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
       if (f.dataType.isInstanceOf[StringType] && cols.exists(col => columnEquals(f.name, col))) {
         fillCol[String](f, value)
       } else {
-        df.colInternal(f.name)
+        df.col(f.name)
       }
     }
     df.select(projections : _*)
@@ -363,7 +363,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
       } else if (f.dataType == targetColumnType && shouldReplace) {
         replaceCol(f, replacementMap)
       } else {
-        df.colInternal(f.name)
+        df.col(f.name)
       }
     }
     df.select(projections : _*)
@@ -395,7 +395,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
           case v: jl.Boolean => fillCol[Boolean](f, v.booleanValue())
           case v: String => fillCol[String](f, v)
         }
-      }.getOrElse(df.colInternal(f.name))
+      }.getOrElse(df.col(f.name))
     }
     df.select(projections : _*)
   }
@@ -407,8 +407,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
     val quotedColName = "`" + col.name + "`"
     val colValue = col.dataType match {
       case DoubleType | FloatType =>
-        nanvl(df.colInternal(quotedColName), lit(null)) // nanvl only supports these types
-      case _ => df.colInternal(quotedColName)
+        nanvl(df.col(quotedColName), lit(null)) // nanvl only supports these types
+      case _ => df.col(quotedColName)
     }
     coalesce(colValue, lit(replacement)).cast(col.dataType).as(col.name)
   }
@@ -420,8 +420,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
    * TODO: This can be optimized to use broadcast join when replacementMap is large.
    */
   private def replaceCol(col: StructField, replacementMap: Map[_, _]): Column = {
-    val keyExpr = df.colInternal(col.name).expr
-    def buildExpr(v: Any) = Cast(Literal(v), keyExpr.dataType)
+    val keyExpr = df.col(col.name).expr
+    def buildExpr(v: Any) = Cast(Literal(v), col.dataType)
     val branches = replacementMap.flatMap { case (source, target) =>
       Seq(buildExpr(source), buildExpr(target))
     }.toSeq
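
Note: these call sites move from the eager, `private[sql]` `colInternal` to the public `col`, which now defers resolution. For a single DataFrame the two should pick out the same attribute, since the tagged plan id points back at `df`'s own plan; a sketch of the assumed equivalence (column name illustrative, `colInternal` shown only for contrast):

// Assumed equivalence for a single DataFrame:
val eager  = df.colInternal("age") // resolved immediately against df's plan
val tagged = df.col("age")         // carries df's plan id; resolved by the analyzer
assert(df.select(eager).collect().sameElements(df.select(tagged).collect()))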

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 19 additions & 14 deletions
@@ -899,7 +899,7 @@ class Dataset[T] private[sql](
    */
   @scala.annotation.varargs
   def sort(sortCol: String, sortCols: String*): Dataset[T] = {
-    sort((sortCol +: sortCols).map(colInternal) : _*)
+    sort((sortCol +: sortCols).map(apply) : _*)
   }

   /**
@@ -953,8 +953,9 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def col(colName: String): Column = withStarResolved(colName) {
-    val candidateExpr = resolve(colName)
-    val expr = LazilyDeterminedAttribute(candidateExpr)(logicalPlan)
+    val expr = UnresolvedAttribute(
+      UnresolvedAttribute.parseAttributeName(colName),
+      Some(queryExecution.analyzed.planId))
     Column(expr)
   }

@@ -1703,8 +1704,7 @@ class Dataset[T] private[sql](
       val convert = CatalystTypeConverters.createToCatalystConverter(dataType)
       f(row(0).asInstanceOf[A]).map(o => InternalRow(convert(o)))
     }
-    val generator =
-      UserDefinedGenerator(elementSchema, rowFunction, colInternal(inputColumn).expr :: Nil)
+    val generator = UserDefinedGenerator(elementSchema, rowFunction, apply(inputColumn).expr :: Nil)

     withPlan {
       Generate(generator, join = true, outer = false,
@@ -1832,15 +1832,17 @@ class Dataset[T] private[sql](
    */
   def drop(col: Column): DataFrame = {
     val expression = col match {
-      case Column(u: UnresolvedAttribute) =>
-        queryExecution.analyzed.resolveQuoted(
-          u.name, sparkSession.sessionState.analyzer.resolver).getOrElse(u)
-      case Column(l: LazilyDeterminedAttribute) =>
-        val foundExpression =
-          logicalPlan.findByBreadthFirst(_.planId == l.plan.planId)
-            .flatMap(_.resolveQuoted(l.name, sparkSession.sessionState.analyzer.resolver))
-            .getOrElse(l.namedExpr)
-        foundExpression
+      case Column(u @ UnresolvedAttribute(nameParts, targetPlanIdOpt)) =>
+        val plan = queryExecution.analyzed
+        val analyzer = sparkSession.sessionState.analyzer
+        val resolver = analyzer.resolver
+
+        targetPlanIdOpt match {
+          case Some(targetPlanId) =>
+            analyzer.resolveExpressionFromSpecificLogicalPlan(nameParts, plan, targetPlanId)
+          case None =>
+            plan.resolveQuoted(u.name, resolver).getOrElse(u)
+        }
       case Column(expr: Expression) => expr
     }
     val attrs = this.logicalPlan.output
@@ -2633,6 +2635,9 @@ class Dataset[T] private[sql](
     }
   }

+  /** Another version of `col` which resolves an expression immediately.
+   * Mainly intended for tests, e.g. when passing columns to a SparkPlan.
+   */
   private[sql] def colInternal(colName: String): Column = withStarResolved(colName) {
     val expr = resolve(colName)
     Column(expr)
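
Note: the practical effect of the new `col` is that `df("name")` no longer resolves eagerly; it returns an `UnresolvedAttribute` tagged with the plan id of `df`'s analyzed plan, so the analyzer can pick the intended operand even after plans are re-created, as in a self-join. A hedged usage sketch (names are illustrative):

// Illustrative: both sides reference "id", but each Column carries the plan id
// of the Dataset it came from, so the join condition stays unambiguous.
val df1 = spark.range(10).toDF("id")
val df2 = df1.filter(df1("id") > 3)
df1.join(df2, df1("id") === df2("id")).show()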

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 5 additions & 1 deletion
@@ -607,7 +607,10 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       Row(id, name, age, salary)
     }.toSeq)
     assert(df.schema.map(_.name) === Seq("id", "name", "age", "salary"))
-    assert(df("id") == person("id"))
+    val dfAnalyzer = df.sparkSession.sessionState.analyzer
+    val personAnalyzer = person.sparkSession.sessionState.analyzer
+    assert(dfAnalyzer.resolveExpression(df("id").expr, df.queryExecution.analyzed) ==
+      personAnalyzer.resolveExpression(person("id").expr, person.queryExecution.analyzed))
   }

   test("drop top level columns that contains dot") {
@@ -1469,6 +1472,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       join2.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size === 4)
     }
   }
+
   test("sameResult() on aggregate") {
     val df = spark.range(100)
     val agg1 = df.groupBy().count()

sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ object SparkPlanTest {
     case plan: SparkPlan =>
       val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap
       plan transformExpressions {
-        case UnresolvedAttribute(Seq(u)) =>
+        case UnresolvedAttribute(Seq(u), _) =>
           inputMap.getOrElse(u,
             sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
       }
