apache
diff --git a/‎common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java‎
Lines changed: 1 addition & 0 deletions b/‎common/utils-java/src/main/java/org/apache/spark/internal/LogKeys.java‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/streaming/ss-migration-guide.md‎
Lines changed: 4 additions & 0 deletions b/‎docs/streaming/ss-migration-guide.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala‎
Lines changed: 3 additions & 1 deletion b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala‎
Lines changed: 13 additions & 7 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala‎
Lines changed: 11 additions & 3 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryPlanTraverseHelper.scala‎
Lines changed: 61 additions & 0 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryPlanTraverseHelper.scala‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/IncrementalExecution.scala‎
Lines changed: 7 additions & 5 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/IncrementalExecution.scala‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/MicroBatchExecution.scala‎
Lines changed: 32 additions & 1 deletion b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/MicroBatchExecution.scala‎
Lines changed: 32 additions & 1 deletion
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/ProgressReporter.scala‎
Lines changed: 21 additions & 15 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/ProgressReporter.scala‎
Lines changed: 21 additions & 15 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/StreamExecution.scala‎
Lines changed: 7 additions & 2 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/StreamExecution.scala‎
Lines changed: 7 additions & 2 deletions
@@ -44,6 +44,7 @@ public enum LogKeys implements LogKey {
   APP_ID,
   APP_NAME,
   APP_STATE,
+  AQE_PLAN,
   ARCHIVE_NAME,
   ARGS,
   ARTIFACTS,
 
@@ -23,6 +23,10 @@ Note that this migration guide describes the items specific to Structured Stream
 Many items of SQL migration can be applied when migrating Structured Streaming to higher versions.
 Please refer [Migration Guide: SQL, Datasets and DataFrame](../sql-migration-guide.html).
 
+## Upgrading from Structured Streaming 4.0 to 4.1
+
+- Since Spark 4.1, AQE is supported for stateless workloads, which could change the behavior of a query after the upgrade (especially since AQE is turned on by default). In general, AQE helps achieve better performance, including handling of skewed partitions, but if you see a regression you can turn AQE off by setting `spark.sql.adaptive.enabled` to `false` to restore the previous behavior.
+
 ## Upgrading from Structured Streaming 3.5 to 4.0
 
 - Since Spark 4.0, Spark falls back to single batch execution if any source in the query does not support `Trigger.AvailableNow`. This is to avoid any possible correctness, duplication, and dataloss issue due to incompatibility between source and wrapper implementation. (See [SPARK-45178](https://issues.apache.org/jira/browse/SPARK-45178) for more details.)
 
@@ -52,7 +52,9 @@ class SparkPlanner(val session: SparkSession, val experimentalMethods: Experimen
       InMemoryScans ::
       SparkScripts ::
       Pipelines ::
-      BasicOperators :: Nil)
+      BasicOperators ::
+      // Needs to be here since users can specify withWatermark in a stateless streaming query.
+      EventTimeWatermarkStrategy :: Nil)
 
   /**
    * Override to add extra planning strategies to the planner. These strategies are tried after
 
@@ -421,13 +421,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
     }
   }
 
-  /**
-   * Used to plan streaming aggregation queries that are computed incrementally as part of a
-   * [[org.apache.spark.sql.streaming.StreamingQuery]]. Currently this rule is injected into the
-   * planner on-demand, only when planning in a
-   * [[org.apache.spark.sql.execution.streaming.StreamExecution]]
-   */
-  object StatefulAggregationStrategy extends Strategy {
+  object EventTimeWatermarkStrategy extends Strategy {
     override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case _ if !plan.isStreaming => Nil
 
@@ -445,6 +439,18 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
               "Please report your query to Spark user mailing list.")
         }
         UpdateEventTimeColumnExec(columnName, delay.get, None, planLater(child)) :: Nil
+    }
+  }
+
+  /**
+   * Used to plan streaming aggregation queries that are computed incrementally as part of a
+   * [[org.apache.spark.sql.streaming.StreamingQuery]]. Currently this rule is injected into the
+   * planner on-demand, only when planning in a
+   * [[org.apache.spark.sql.execution.streaming.StreamExecution]]
+   */
+  object StatefulAggregationStrategy extends Strategy {
+    override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+      case _ if !plan.isStreaming => Nil
 
       case PhysicalAggregation(
         namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) =>
 
@@ -32,6 +32,7 @@ import org.apache.spark.sql.execution.command.{DataWritingCommandExec, ExecutedC
 import org.apache.spark.sql.execution.datasources.V1WriteCommand
 import org.apache.spark.sql.execution.datasources.v2.V2CommandExec
 import org.apache.spark.sql.execution.exchange.Exchange
+import org.apache.spark.sql.execution.streaming.operators.stateful.StatefulOperator
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -55,6 +56,15 @@ case class InsertAdaptiveSparkPlan(
     case c: DataWritingCommandExec
         if !c.cmd.isInstanceOf[V1WriteCommand] || !conf.plannedWriteEnabled =>
       c.copy(child = apply(c.child))
+    // SPARK-53941: Do not apply AQE to stateful streaming workloads. With the recent change of
+    // the shuffle origin for shuffles added by stateful operators, we anticipate that stateful
+    // operators will work with AQE, but we want the adoption of AQE to be gradual to keep the
+    // risk under control. Note that the streaming engine also disables the AQE config
+    // explicitly; this pattern is kept here in addition, as defensive programming.
+    case _ if plan.exists {
+      case _: StatefulOperator => true
+      case _ => false
+    } => plan
     case _ if shouldApplyAQE(plan, isSubquery) =>
       if (supportAdaptive(plan)) {
         try {
@@ -114,9 +124,7 @@ case class InsertAdaptiveSparkPlan(
   }
 
   private def supportAdaptive(plan: SparkPlan): Boolean = {
-    sanityCheck(plan) &&
-      !plan.logicalLink.exists(_.isStreaming) &&
-    plan.children.forall(supportAdaptive)
+    sanityCheck(plan) && plan.children.forall(supportAdaptive)
   }
 
   private def sanityCheck(plan: SparkPlan): Boolean =
 
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.internal.{Logging, LogKeys}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
+import org.apache.spark.sql.execution.adaptive.QueryStageExec
+import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
+
+/**
+ * Utility methods for traversing the physical plan of a streaming query, shared by the call
+ * sites that repeat the same traversal pattern.
+ */
+object StreamingQueryPlanTraverseHelper extends Logging {
+  /**
+   * Applies `pf` to every node of `executedPlan` on which it is defined and returns the results.
+   * Nodes that wrap another plan ([[InMemoryTableScanExec]], [[AdaptiveSparkPlanExec]] and
+   * [[QueryStageExec]]) are never passed to `pf`; their underlying plan is traversed instead.
+   */
+  def collectFromUnfoldedPlan[B](
+      executedPlan: SparkPlan)(
+      pf: PartialFunction[SparkPlan, B]): Seq[B] = {
+    executedPlan.flatMap {
+      // InMemoryTableScanExec represents a cached plan; traverse the plan that was actually
+      // executed underneath it.
+      case s: InMemoryTableScanExec => collectFromUnfoldedPlan(s.relation.cachedPlan)(pf)
+
+      // The AQE node holds the executed plan. It is normally the final plan; if it is not (for
+      // whatever reason), the current plan is still used as a best effort.
+      case a: AdaptiveSparkPlanExec =>
+        if (!a.isFinalPlan) {
+          logWarning(log"AQE plan is captured, but the executed plan in AQE plan is not " +
+            log"the final one. Providing incomplete executed plan. AQE plan: ${MDC(
+              LogKeys.AQE_PLAN, a)}")
+        }
+        collectFromUnfoldedPlan(a.executedPlan)(pf)
+
+      // AQE-specific leaf nodes covering a shuffle wrap the actual executed nodes; unwrap them.
+      case e: QueryStageExec => collectFromUnfoldedPlan(e.plan)(pf)
+
+      // `lift` evaluates the partial function's pattern only once per node.
+      case p => pf.lift(p).toSeq
+    }
+  }
+}
@@ -38,6 +38,7 @@ import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, MergingSessi
 import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
 import org.apache.spark.sql.execution.python.streaming.{FlatMapGroupsInPandasWithStateExec, TransformWithStateInPySparkExec}
+import org.apache.spark.sql.execution.streaming.StreamingQueryPlanTraverseHelper
 import org.apache.spark.sql.execution.streaming.checkpointing.{CheckpointFileManager, OffsetSeqMetadata}
 import org.apache.spark.sql.execution.streaming.operators.stateful.{SessionWindowStateStoreRestoreExec, SessionWindowStateStoreSaveExec, StatefulOperator, StatefulOperatorStateInfo, StateStoreRestoreExec, StateStoreSaveExec, StateStoreWriter, StreamingDeduplicateExec, StreamingDeduplicateWithinWatermarkExec, StreamingGlobalLimitExec, StreamingLocalLimitExec, UpdateEventTimeColumnExec}
 import org.apache.spark.sql.execution.streaming.operators.stateful.flatmapgroupswithstate.FlatMapGroupsWithStateExec
@@ -638,10 +639,11 @@ class IncrementalExecution(
   def shouldRunAnotherBatch(newMetadata: OffsetSeqMetadata): Boolean = {
     val tentativeBatchId = currentBatchId + 1
     watermarkPropagator.propagate(tentativeBatchId, executedPlan, newMetadata.batchWatermarkMs)
-    executedPlan.collect {
-      case p: StateStoreWriter => p.shouldRunAnotherBatch(
-        watermarkPropagator.getInputWatermarkForEviction(tentativeBatchId,
-          p.stateInfo.get.operatorId))
-    }.exists(_ == true)
+    StreamingQueryPlanTraverseHelper
+      .collectFromUnfoldedPlan(executedPlan) {
+        case p: StateStoreWriter => p.shouldRunAnotherBatch(
+          watermarkPropagator.getInputWatermarkForEviction(tentativeBatchId,
+            p.stateInfo.get.operatorId))
+      }.exists(_ == true)
   }
 }
@@ -27,7 +27,7 @@ import org.apache.spark.internal.LogKeys
 import org.apache.spark.internal.LogKeys.BATCH_ID
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp, FileSourceMetadataAttribute, LocalTimestamp}
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, GlobalLimit, LeafNode, LocalRelation, LogicalPlan, Project, StreamSourceAwareLogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Deduplicate, DeduplicateWithinWatermark, Distinct, FlatMapGroupsInPandasWithState, FlatMapGroupsWithState, GlobalLimit, Join, LeafNode, LocalRelation, LogicalPlan, Project, StreamSourceAwareLogicalPlan, TransformWithState, TransformWithStateInPySpark}
 import org.apache.spark.sql.catalyst.streaming.{StreamingRelationV2, WriteToStream}
 import org.apache.spark.sql.catalyst.trees.TreePattern.CURRENT_LIKE
 import org.apache.spark.sql.catalyst.util.truncatedString
@@ -344,9 +344,40 @@ class MicroBatchExecution(
     setLatestExecutionContext(execCtx)
 
     populateStartOffsets(execCtx, sparkSessionForStream)
+
+    // SPARK-53941: This code path is executed for the first batch, regardless of whether it's a
+    // fresh new run or restart.
+    disableAQESupportInStatelessIfUnappropriated(sparkSessionForStream)
+
     logInfo(log"Stream started from ${MDC(LogKeys.STREAMING_OFFSETS_START, execCtx.startOffsets)}")
     execCtx
   }
+
+  /**
+   * Turns AQE off for this query when its analyzed plan contains a streaming stateful operator.
+   *
+   * @param sparkSessionToRunBatches session whose `SQLConf.ADAPTIVE_EXECUTION_ENABLED` conf is
+   *                                 set to "false" in that case; it is left untouched otherwise
+   */
+  private def disableAQESupportInStatelessIfUnappropriated(
+      sparkSessionToRunBatches: SparkSession): Unit = {
+    // Logical operators treated as stateful when they run on streaming input.
+    val hasStatefulOperator = analyzedPlan.exists {
+      // Only a join whose both sides are streaming is counted here.
+      case join: Join => join.left.isStreaming && join.right.isStreaming
+      case op @ (_: Aggregate | _: Deduplicate | _: DeduplicateWithinWatermark | _: Distinct |
+          _: FlatMapGroupsWithState | _: FlatMapGroupsInPandasWithState | _: TransformWithState |
+          _: TransformWithStateInPySpark | _: GlobalLimit) => op.isStreaming
+      case _ => false
+    }
+
+    // SPARK-53941: We disable AQE for stateful workloads as of now.
+    if (hasStatefulOperator) {
+      logWarning(log"Disabling AQE since AQE is not supported in stateful workloads.")
+      sparkSessionToRunBatches.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")
+    }
+  }
+
   /**
    * Repeatedly attempts to run batches as data arrives.
    */
 
@@ -37,6 +37,7 @@ import org.apache.spark.sql.connector.catalog.Table
 import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, ReportsSinkMetrics, ReportsSourceMetrics, SparkDataStream}
 import org.apache.spark.sql.execution.{QueryExecution, StreamSourceAwareSparkPlan}
 import org.apache.spark.sql.execution.datasources.v2.{MicroBatchScanExec, StreamingDataSourceV2ScanRelation, StreamWriterCommitProgress}
+import org.apache.spark.sql.execution.streaming.StreamingQueryPlanTraverseHelper
 import org.apache.spark.sql.execution.streaming.checkpointing.OffsetSeqMetadata
 import org.apache.spark.sql.execution.streaming.operators.stateful.{EventTimeWatermarkExec, StateStoreWriter}
 import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef
@@ -443,8 +444,8 @@ abstract class ProgressContext(
 
     val sources = newData.keys.toSet
 
-    val sourceToInputRowsTuples = lastExecution.executedPlan
-      .collect {
+    val sourceToInputRowsTuples = StreamingQueryPlanTraverseHelper
+      .collectFromUnfoldedPlan(lastExecution.executedPlan) {
         case node: StreamSourceAwareSparkPlan if node.getStream.isDefined =>
           val numRows = node.metrics.get("numOutputRows").map(_.value).getOrElse(0L)
           node.getStream.get -> numRows
@@ -502,12 +503,13 @@ abstract class ProgressContext(
       // It's possible that multiple DataSourceV2ScanExec instances may refer to the same source
       // (can happen with self-unions or self-joins). This means the source is scanned multiple
       // times in the query, we should count the numRows for each scan.
-      val sourceToInputRowsTuples = lastExecution.executedPlan.collect {
-        case s: MicroBatchScanExec =>
-          val numRows = s.metrics.get("numOutputRows").map(_.value).getOrElse(0L)
-          val source = s.stream
-          source -> numRows
-      }
+      val sourceToInputRowsTuples = StreamingQueryPlanTraverseHelper
+        .collectFromUnfoldedPlan(lastExecution.executedPlan) {
+          case s: MicroBatchScanExec =>
+            val numRows = s.metrics.get("numOutputRows").map(_.value).getOrElse(0L)
+            val source = s.stream
+            source -> numRows
+        }
       logDebug("Source -> # input rows\n\t" + sourceToInputRowsTuples.mkString("\n\t"))
       sumRows(sourceToInputRowsTuples)
     } else {
@@ -544,7 +546,10 @@ abstract class ProgressContext(
       val finalLogicalPlan = unrollCTE(lastExecution.logical)
 
       val allLogicalPlanLeaves = finalLogicalPlan.collectLeaves() // includes non-streaming
-      val allExecPlanLeaves = lastExecution.executedPlan.collectLeaves()
+      val allExecPlanLeaves = StreamingQueryPlanTraverseHelper
+        .collectFromUnfoldedPlan(lastExecution.executedPlan) {
+          case p if p.children.isEmpty => p
+        }
       if (allLogicalPlanLeaves.size == allExecPlanLeaves.size) {
         val execLeafToSource = allLogicalPlanLeaves.zip(allExecPlanLeaves).flatMap {
           case (_, ep: MicroBatchScanExec) =>
@@ -580,10 +585,11 @@ abstract class ProgressContext(
   private def extractStateOperatorMetrics(
       lastExecution: IncrementalExecution): Seq[StateOperatorProgress] = {
     assert(lastExecution != null, "lastExecution is not available")
-    lastExecution.executedPlan.collect {
-      case p if p.isInstanceOf[StateStoreWriter] =>
-        p.asInstanceOf[StateStoreWriter].getProgress()
-    }
+    StreamingQueryPlanTraverseHelper
+      .collectFromUnfoldedPlan(lastExecution.executedPlan) {
+        case p if p.isInstanceOf[StateStoreWriter] =>
+          p.asInstanceOf[StateStoreWriter].getProgress()
+      }
   }
 
   /** Extracts statistics from the most recent query execution. */
@@ -609,8 +615,8 @@ abstract class ProgressContext(
       return ExecutionStats(Map.empty, stateOperators, watermarkTimestamp, sinkOutput)
     }
 
-    val eventTimeStats = lastExecution.executedPlan
-      .collect {
+    val eventTimeStats = StreamingQueryPlanTraverseHelper
+      .collectFromUnfoldedPlan(lastExecution.executedPlan) {
         case e: EventTimeWatermarkExec if e.eventTimeStats.value.count > 0 =>
           val stats = e.eventTimeStats.value
           Map(
 
@@ -43,6 +43,7 @@ import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, ReadLi
 import org.apache.spark.sql.connector.write.{LogicalWriteInfoImpl, SupportsTruncate, Write}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.command.StreamingExplainCommand
+import org.apache.spark.sql.execution.streaming.ContinuousTrigger
 import org.apache.spark.sql.execution.streaming.checkpointing.{CheckpointFileManager, CommitLog, OffsetSeqLog, OffsetSeqMetadata}
 import org.apache.spark.sql.execution.streaming.operators.stateful.{StatefulOperator, StateStoreWriter}
 import org.apache.spark.sql.execution.streaming.sources.{ForeachBatchUserFuncException, ForeachUserFuncException}
@@ -304,8 +305,6 @@ abstract class StreamExecution(
 
       // While active, repeatedly attempt to run batches.
       sparkSessionForStream.withActive {
-        // Adaptive execution can change num shuffle partitions, disallow
-        sparkSessionForStream.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")
         // Disable cost-based join optimization as we do not want stateful operations
         // to be rearranged
         sparkSessionForStream.conf.set(SQLConf.CBO_ENABLED.key, "false")
@@ -315,6 +314,12 @@ abstract class StreamExecution(
         sparkSessionForStream.conf.set(SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_DISTRIBUTION.key,
           "false")
 
+        if (trigger.isInstanceOf[ContinuousTrigger]) {
+          // SPARK-53941: AQE does not make sense for continuous processing, disable it.
+          logWarning("Disabling AQE since the query runs with continuous mode.")
+          sparkSessionForStream.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")
+        }
+
         getLatestExecutionContext().updateStatusMessage("Initializing sources")
         // force initialization of the logical plan so that the sources can be created
         logicalPlan