The code below is close to what I'm trying to do. I have an RDD of dataframes and I want to append those dataframes to make up 1 dataframe in the end.
import org.apache.spark.sql.SparkSession
//scalastyle:off
object test{
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("Ingester")
.master("local[2]")
.getOrCreate()
val rdd = spark.sparkContext.parallelize(List(spark.emptyDataFrame, spark.emptyDataFrame))
rdd
.fold(spark.emptyDataFrame)(_.union(_))
}
}
For some reason I get an NPE when trying to do this.
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:73)
at org.apache.spark.sql.Dataset.withSetOperator(Dataset.scala:3313)
at org.apache.spark.sql.Dataset.union(Dataset.scala:1834)
at test$$anonfun$main$1.apply(test.scala:14)
at test$$anonfun$main$1.apply(test.scala:14)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:155)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:155)
at scala.collection.Iterator$class.foreach(Iterator.scala:742)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:155)
at org.apache.spark.InterruptibleIterator.foldLeft(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.fold(TraversableOnce.scala:210)
at org.apache.spark.InterruptibleIterator.fold(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$fold$1$$anonfun$19.apply(RDD.scala:1096)
at org.apache.spark.rdd.RDD$$anonfun$fold$1$$anonfun$19.apply(RDD.scala:1096)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2130)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2130)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Does anybody know why this happens?