I am migrating code from Apache Spark 2.4 to Spark 3.3 (on Azure Synapse). The same code works on 2.4 but fails on 3.3, even though I am running it on a bigger Apache Spark pool (cluster) for Spark 3.3. The code is the following (there are more withColumn calls, but this is a summary):
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val windowTrans = Window.partitionBy("COD_X", "COD_Y", "date")
  .orderBy("COD_X", "COD_Y", "date")

val windowTransMinMax = Window.partitionBy("COD_X", "COD_Y", "date")

def dataframeAvg(spark: SparkSession, df: DataFrame): DataFrame = {
  import spark.implicits._

  val dfAvg = df
    .withColumn("row", row_number().over(windowTrans))
    .withColumn("LONG_HIST", count("row").over(windowTrans))
    .withColumn("AVG_COL1", avg("col1").over(windowTrans))
    .withColumn("AVG2", avg("col2").over(windowTrans))
    .withColumn("AVG3", avg("col3").over(windowTrans))
    .withColumn("AVG6", avg("col6").over(windowTrans))
    // ... more avg columns ...
    .withColumn("STD_COL1", stddev_samp("col1").over(windowTrans))
    .withColumn("STD2", stddev_samp("col2").over(windowTrans))
    .withColumn("STD3", stddev_samp("col3").over(windowTrans))
    .withColumn("STD6", stddev_samp("col6").over(windowTrans))
    // ... more stddev columns ...
    .withColumn("MIN_COL1", min("col1").over(windowTransMinMax))
    .withColumn("MIN2", min("col2").over(windowTransMinMax))
    .withColumn("MIN3", min("col3").over(windowTransMinMax))
    .withColumn("MIN6", min("col6").over(windowTransMinMax))
    // ... more min columns ...
    .where(col("row") === 1)
    .select(/* list of columns */)

  dfAvg
}
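To make the shape of the problem easier to see, here is a stripped-down, self-contained version of the same pattern (synthetic data and generic names; the real job has many more withColumn calls, so this small version alone does not overflow). It runs as-is in spark-shell:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import spark.implicits._

val df = Seq(
  ("A", "B", "2023-01-01", 1.0, 2.0),
  ("A", "B", "2023-01-01", 3.0, 4.0)
).toDF("COD_X", "COD_Y", "date", "col1", "col2")

val w = Window.partitionBy("COD_X", "COD_Y", "date")
  .orderBy("COD_X", "COD_Y", "date")

// Same shape as the real job: every derived column stacks another
// projection on top of the logical plan.
val out = df
  .withColumn("row", row_number().over(w))
  .withColumn("AVG_COL1", avg("col1").over(w))
  .withColumn("STD_COL1", stddev_samp("col1").over(w))
  .withColumn("MIN_COL1", min("col1").over(w))
  .where(col("row") === 1)

out.show()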
The error is the following:
java.lang.StackOverflowError
at org.apache.spark.sql.catalyst.trees.TreeNode$$Lambda$5466/589672638.get$Lambda(Unknown Source)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$mapChildren$1(TreeNode.scala:777)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:427)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:745)
at org.apache.spark.sql.catalyst.trees.TreeNode.clone(TreeNode.scala:868)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.clone(AnalysisHelper.scala:295)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.clone$(AnalysisHelper.scala:294)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$clone$1(TreeNode.scala:868)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$mapChildren$1(TreeNode.scala:747)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:427)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:745)
at org.apache.spark.sql.catalyst.trees.TreeNode.clone(TreeNode.scala:868)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.clone(AnalysisHelper.scala:295)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.clone$(AnalysisHelper.scala:294)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.clone(LogicalPlan.scala:31)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$clone$1(TreeNode.scala:868)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$mapChildren$1(TreeNode.scala:747)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:427)
I have been researching this, but I cannot find the reason or a way to fix it. From the repeating clone/mapChildren frames it looks like Catalyst is recursing over a very deep logical plan, but I do not understand why 2.4 handles it and 3.3 does not.
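The closest lead I have found is the warning in the Scaladoc for Dataset.withColumn: it says the method introduces a projection internally, so calling it many times can generate big plans that cause performance issues and even stack overflows, and it recommends a single select instead. If that is what is happening here, one rewrite I am considering is collapsing the whole chain into one select, so the plan gets a single projection instead of one per derived column. A sketch of what I mean (the real column list is longer; aggCols and the generated names are illustrative):

// Hypothetical rewrite: one projection instead of one withColumn per
// column. aggCols stands in for the real list of measure columns.
val aggCols = Seq("col1", "col2", "col3", "col6")

val withRow = df.withColumn("row", row_number().over(windowTrans))

val dfAvg = withRow
  .select(
    withRow.columns.map(col) ++
      Seq(count("row").over(windowTrans).as("LONG_HIST")) ++
      aggCols.map(c => avg(c).over(windowTrans).as(s"AVG_$c")) ++
      aggCols.map(c => stddev_samp(c).over(windowTrans).as(s"STD_$c")) ++
      aggCols.map(c => min(c).over(windowTransMinMax).as(s"MIN_$c")): _*
  )
  .where(col("row") === 1)

As a stopgap I could presumably also raise the JVM thread stack size (for example, passing a larger -Xss through spark.driver.extraJavaOptions), but I would rather understand why the same plan analyzes fine on 2.4 and overflows on 3.3.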