apache
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala‎
Lines changed: 10 additions & 1 deletion b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala‎
Lines changed: 3 additions & 3 deletions b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala‎
Lines changed: 22 additions & 3 deletions b/‎sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala‎
Lines changed: 22 additions & 3 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala‎
Lines changed: 15 additions & 13 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala‎
Lines changed: 15 additions & 13 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/KeyedState.scala‎
Lines changed: 116 additions & 0 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/KeyedState.scala‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/State.scala‎
Lines changed: 0 additions & 101 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/State.scala‎
Lines changed: 0 additions & 101 deletions
@@ -46,8 +46,13 @@ object UnsupportedOperationChecker {
         "Queries without streaming sources cannot be executed with writeStream.start()")(plan)
     }
 
+    /** Gathers the streaming [[Aggregate]] operators contained in the given sub plan. */
+    def collectStreamingAggregates(subplan: LogicalPlan): Seq[Aggregate] = {
+      subplan.collect {
+        case agg: Aggregate if agg.isStreaming => agg
+      }
+    }
+
     // Disallow multiple streaming aggregations
-    val aggregates = plan.collect { case a@Aggregate(_, _, _) if a.isStreaming => a }
+    val aggregates = collectStreamingAggregates(plan)
 
     if (aggregates.size > 1) {
       throwError(
@@ -114,6 +119,10 @@ object UnsupportedOperationChecker {
         case _: InsertIntoTable =>
           throwError("InsertIntoTable is not supported with streaming DataFrames/Datasets")
 
+        case m: MapGroupsWithState if collectStreamingAggregates(m).nonEmpty =>
+          throwError("(map/flatMap)GroupsWithState is not supported after aggregation on a " +
+            "streaming DataFrame/Dataset")
+
         case Join(left, right, joinType, _) =>
 
           joinType match {
 
@@ -314,12 +314,12 @@ case class MapGroups(
     child: LogicalPlan) extends UnaryNode with ObjectProducer
 
 /** Internal class representing State */
-trait LogicalState[S]
+trait LogicalKeyedState[S]
 
 /** Factory for constructing new `MapGroupsWithState` nodes. */
 object MapGroupsWithState {
   def apply[K: Encoder, V: Encoder, S: Encoder, U: Encoder](
-      func: (Any, Iterator[Any], LogicalState[Any]) => Iterator[Any],
+      func: (Any, Iterator[Any], LogicalKeyedState[Any]) => Iterator[Any],
       groupingAttributes: Seq[Attribute],
       dataAttributes: Seq[Attribute],
       child: LogicalPlan): LogicalPlan = {
@@ -352,7 +352,7 @@ object MapGroupsWithState {
  * @param stateSerializer used to serialize updated state after calling `func`
  */
 case class MapGroupsWithState(
-    func: (Any, Iterator[Any], LogicalState[Any]) => Iterator[Any],
+    func: (Any, Iterator[Any], LogicalKeyedState[Any]) => Iterator[Any],
     keyDeserializer: Expression,
     valueDeserializer: Expression,
     groupingAttributes: Seq[Attribute],
 
@@ -22,13 +22,13 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal, NamedExpression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.Count
 import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.plans.logical.{MapGroupsWithState, _}
 import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._
 import org.apache.spark.sql.streaming.OutputMode
-import org.apache.spark.sql.types.IntegerType
+import org.apache.spark.sql.types.{IntegerType, LongType}
 
 /** A dummy command for testing unsupported operations. */
 case class DummyCommand() extends Command
@@ -111,6 +111,25 @@ class UnsupportedOperationsSuite extends SparkFunSuite {
     outputMode = Complete,
     expectedMsgs = Seq("distinct aggregation"))
 
+  // MapGroupsWithState: Supported, except after a streaming aggregation
+  val att = new AttributeReference(name = "a", dataType = LongType)()
+  assertSupportedInStreamingPlan(
+    "mapGroupsWithState - mapGroupsWithState on batch relation",
+    MapGroupsWithState(null, att, att, Seq(att), Seq(att), att, att, Seq(att), batchRelation),
+    outputMode = Append)
+
+  assertSupportedInStreamingPlan(
+    "mapGroupsWithState - mapGroupsWithState on streaming relation before aggregation",
+    MapGroupsWithState(null, att, att, Seq(att), Seq(att), att, att, Seq(att), streamRelation),
+    outputMode = Append)
+
+  assertNotSupportedInStreamingPlan(
+    "mapGroupsWithState - mapGroupsWithState on streaming relation after aggregation",
+    MapGroupsWithState(null, att, att, Seq(att), Seq(att), att, att, Seq(att),
+      Aggregate(Nil, aggExprs("c"), streamRelation)),
+    outputMode = Complete,
+    expectedMsgs = Seq("(map/flatMap)GroupsWithState"))
+
   // Inner joins: Stream-stream not supported
   testBinaryOperationInStreamingPlan(
     "inner join",
 
@@ -243,15 +243,15 @@ class KeyValueGroupedDataset[K, V] private[sql](
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
    *
-   * @see [[State]] for more details of how to update/remove state in the function.
+   * @see [[KeyedState]] for more details of how to update/remove state in the function.
    * @since 2.1.1
    */
   @Experimental
   @InterfaceStability.Evolving
   def mapGroupsWithState[STATE: Encoder, OUT: Encoder](
-      func: (K, Iterator[V], State[STATE]) => OUT): Dataset[OUT] = {
-    val f = (key: K, it: Iterator[V], s: State[STATE]) => Iterator(func(key, it, s))
-    flatMapGroupsWithState[STATE, OUT](f)
+      func: (K, Iterator[V], KeyedState[STATE]) => OUT): Dataset[OUT] = {
+    flatMapGroupsWithState[STATE, OUT](
+      (key: K, it: Iterator[V], s: KeyedState[STATE]) => Iterator(func(key, it, s)))
   }
 
   /**
@@ -279,7 +279,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
    *
-   * @see [[State]] for more details of how to update/remove state in the function.
+   * @see [[KeyedState]] for more details of how to update/remove state in the function.
    * @since 2.1.1
    */
   @Experimental
@@ -288,8 +288,9 @@ class KeyValueGroupedDataset[K, V] private[sql](
       func: MapGroupsWithStateFunction[K, V, STATE, OUT],
       stateEncoder: Encoder[STATE],
       outputEncoder: Encoder[OUT]): Dataset[OUT] = {
-    val f = (key: K, it: Iterator[V], s: State[STATE]) => Iterator(func.call(key, it.asJava, s))
-    flatMapGroupsWithState[STATE, OUT](f)(stateEncoder, outputEncoder)
+    flatMapGroupsWithState[STATE, OUT](
+      (key: K, it: Iterator[V], s: KeyedState[STATE]) => Iterator(func.call(key, it.asJava, s))
+    )(stateEncoder, outputEncoder)
   }
 
 
@@ -318,17 +319,17 @@ class KeyValueGroupedDataset[K, V] private[sql](
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
    *
-   * @see [[State]] for more details of how to update/remove state in the function.
+   * @see [[KeyedState]] for more details of how to update/remove state in the function.
    * @since 2.1.1
    */
   @Experimental
   @InterfaceStability.Evolving
   def flatMapGroupsWithState[STATE: Encoder, OUT: Encoder](
-      func: (K, Iterator[V], State[STATE]) => Iterator[OUT]): Dataset[OUT] = {
+      func: (K, Iterator[V], KeyedState[STATE]) => Iterator[OUT]): Dataset[OUT] = {
     Dataset[OUT](
       sparkSession,
       MapGroupsWithState[K, V, STATE, OUT](
-        func.asInstanceOf[(Any, Iterator[Any], LogicalState[Any]) => Iterator[Any]],
+        func.asInstanceOf[(Any, Iterator[Any], LogicalKeyedState[Any]) => Iterator[Any]],
         groupingAttributes,
         dataAttributes,
         logicalPlan))
@@ -359,7 +360,7 @@ class KeyValueGroupedDataset[K, V] private[sql](
    * (for example, by calling `toList`) unless they are sure that this is possible given the memory
    * constraints of their cluster.
    *
-   * @see [[State]] for more details of how to update/remove state in the function.
+   * @see [[KeyedState]] for more details of how to update/remove state in the function.
    * @since 2.1.1
    */
   @Experimental
@@ -368,8 +369,9 @@ class KeyValueGroupedDataset[K, V] private[sql](
       func: FlatMapGroupsWithStateFunction[K, V, STATE, OUT],
       stateEncoder: Encoder[STATE],
       outputEncoder: Encoder[OUT]): Dataset[OUT] = {
-    val f = (key: K, it: Iterator[V], s: State[STATE]) => func.call(key, it.asJava, s).asScala
-    flatMapGroupsWithState[STATE, OUT](f)(stateEncoder, outputEncoder)
+    flatMapGroupsWithState[STATE, OUT](
+      (key: K, it: Iterator[V], s: KeyedState[STATE]) => func.call(key, it.asJava, s).asScala
+    )(stateEncoder, outputEncoder)
   }
 
   /**
 
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.annotation.{Experimental, InterfaceStability}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalKeyedState
+
+/**
+ * :: Experimental ::
+ *
+ * Wrapper class for interacting with keyed state data in `mapGroupsWithState` and
+ * `flatMapGroupsWithState` operations on
+ * [[org.apache.spark.sql.KeyValueGroupedDataset KeyValueGroupedDataset]].
+ *
+ * Important points to note.
+ * - State can be `null`. So updating the state to null is not the same as removing the state.
+ * - Operations on state are not threadsafe. This is to avoid memory barriers.
+ * - If `remove()` is called, then `exists()` will return `false`, and
+ *   `getOption()` will return `None`.
+ * - If `update(newState)` is called after that, then `exists()` will return `true`,
+ *   and `getOption()` will return `Some(...)`.
+ *
+ * Scala example of using `KeyedState`:
+ * {{{
+ * // A mapping function that maintains an integer state for string keys and returns a string.
+ * def mappingFunction(key: String, value: Iterator[Int], state: KeyedState[Int]): String = {
+ *   // Check if state exists
+ *   if (state.exists) {
+ *     val existingState = state.get  // Get the existing state
+ *     val shouldRemove = ...         // Decide whether to remove the state
+ *     if (shouldRemove) {
+ *       state.remove()     // Remove the state
+ *     } else {
+ *       val newState = ...
+ *       state.update(newState)    // Set the new state
+ *     }
+ *   } else {
+ *     val initialState = ...
+ *     state.update(initialState)  // Set the initial state
+ *   }
+ *   ... // return something
+ * }
+ *
+ * }}}
+ *
+ * Java example of using `KeyedState`:
+ * {{{
+ * // A mapping function that maintains an integer state for string keys and returns a string.
+ * MapGroupsWithStateFunction<String, Integer, Integer, String> mappingFunction =
+ *    new MapGroupsWithStateFunction<String, Integer, Integer, String>() {
+ *
+ *      @Override
+ *      public String call(String key, Iterator<Integer> value, KeyedState<Integer> state) {
+ *        if (state.exists()) {
+ *          int existingState = state.get(); // Get the existing state
+ *          boolean shouldRemove = ...; // Decide whether to remove the state
+ *          if (shouldRemove) {
+ *            state.remove(); // Remove the state
+ *          } else {
+ *            int newState = ...;
+ *            state.update(newState); // Set the new state
+ *          }
+ *        } else {
+ *          int initialState = ...;
+ *          state.update(initialState); // Set the initial state
+ *        }
+ *        ... // return something
+ *      }
+ *    };
+ * }}}
+ *
+ * @tparam S User-defined type of the state to be stored for each key. Must be encodable into
+ *           Spark SQL types (see [[Encoder]] for more details).
+ * @since 2.1.1
+ */
+@Experimental
+@InterfaceStability.Evolving
+trait KeyedState[S] extends LogicalKeyedState[S] {
+
+  /** Whether state exists or not. */
+  def exists: Boolean
+
+  /** Get the state object if it is defined, otherwise throws NoSuchElementException. */
+  def get: S
+
+  /**
+   * Update the value of the state. Note that null is a valid value, and does not signify removing
+   * of the state.
+   */
+  def update(newState: S): Unit
+
+  /** Remove this keyed state. */
+  def remove(): Unit
+
+  /** (Scala-friendly) Get the state object as an [[Option]]. */
+  @inline final def getOption: Option[S] = if (exists) Some(get) else None
+
+  /**
+   * String form of the current state, or `"<undefined>"` when no state exists. A state that
+   * exists but holds `null` (a valid value) is rendered as `"null"` instead of throwing.
+   */
+  @inline final override def toString: String = {
+    // String.valueOf is null-safe, unlike invoking `toString` on the state value directly.
+    if (exists) String.valueOf(get) else "<undefined>"
+  }
+}