I am trying to get employeeId
from employee_table
and use this id to query employee_address
table to fetch the address.
There is nothing wrong with tables. But when I run the below code, I get org.apache.spark.SparkException: Task not serializable
I think I know the issue. The issue is sparkContext is with master and not with worker. But I don't know how to get my head around this.
val employeeRDDRdd = sc.cassandraTable("local_keyspace", "employee_table")
try {
val data = employeeRDDRdd
.map(row => {
row.getStringOption("employeeID") match {
case Some(s) if (s != null) && s.nonEmpty => s
case None => ""
}
})
//create tuple of employee id and address. Filtering out cases when for an employee address is empty.
val id = data
.map(s => (s,getID(s)))
filter(tups => tups._2.nonEmpty)
//printing out total size of rdd.
println(id.count())
} catch {
case e: Exception => e.printStackTrace()
}
def getID(employeeID: String): String = {
val addressRDD = sc.cassandraTable("local_keyspace", "employee_address")
val data = addressRDD.map(row => row.getStringOption("address") match {
case Some(s) if (s != null) && s.nonEmpty => s
case None => ""
})
data.collect()(0)
}
Exception ==>
rg.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2039)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:366)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:365)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.map(RDD.scala:365)