I am running the following Spark code and it runs out of memory every time. The data size is not huge, yet over time the job fails with a GC error, which suggests there are too many objects for the garbage collector to collect. Selecting a few columns from a table should not carry much overhead or create that many objects on the heap, and my understanding is that `spark.sql` is lazy anyway, so nothing should materialize until the write at the end. Am I creating too many immutable objects just by firing a select query? I am not sure why it complains about GC.
import org.apache.log4j.LogManager
import org.apache.spark.sql.{DataFrame, SparkSession}

object O {

  def main(args: Array[String]): Unit = {
    val log = LogManager.getRootLogger
    val TGTTBL = "XYZ"
    val outputLocation = "somepath"
    val dql = "select col1, col2, col3 from SOURCETBL where condition"
    val appName = "createDFS"

    val spark = SparkSession.builder()
      .appName(appName)
      .enableHiveSupport()
      .config("hive.exec.dynamic.partition", "true")
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .getOrCreate()

    log.warn("Select Query........................")
    log.warn(dql)
    val tblDF = spark.sql(dql)

    // Build the column-definition list for the DDL: "name type" pairs,
    // minus the three trailing partition columns, with the "Type" suffix
    // stripped from the Spark data type names.
    def getCols(df: DataFrame): String = {
      df.columns
        .map(c => s"$c ${df.schema(c).dataType}")
        .dropRight(3)
        .mkString(",")
        .replace("Type", "")
    }

    val colString = getCols(tblDF)
    log.warn("Create Table def........................")
    log.warn(colString)

    spark.sql(s"drop table if exists $TGTTBL")
    spark.sql(s"create external table if not exists $TGTTBL ($colString)" +
      s" partitioned by (col1 string, col2 string, col3 string) stored as orc location '$outputLocation'")

    tblDF.write.partitionBy("col1", "col2", "col3").format("orc").mode("overwrite").save(outputLocation)
  }
}
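For clarity, `getCols` only rebuilds the column definitions for the DDL, dropping the partition columns. A hypothetical illustration of its output (made-up schema, not my real table; run in the same scope as `getCols` above):

```scala
import spark.implicits._

// Made-up schema just to show what getCols produces:
val demo = Seq(("a", 1L, "p1", "p2", "p3"))
  .toDF("data1", "data2", "col1", "col2", "col3")

// dropRight(3) removes the three trailing partition columns and
// replace("Type", "") turns StringType/LongType into String/Long:
println(getCols(demo)) // prints: data1 String,data2 Long
```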
**Error:**
`Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded`
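One thing I am considering (an assumption on my part, not something the current job does) is repartitioning by the partition columns before the write, so that each task does not keep many ORC writers and their buffers alive at once. A sketch with the same column names as above:

```scala
import org.apache.spark.sql.functions.col

// Sketch only: co-locate rows of each (col1, col2, col3) partition in the
// same task before writing, to reduce the number of concurrent ORC writers.
tblDF
  .repartition(col("col1"), col("col2"), col("col3"))
  .write
  .partitionBy("col1", "col2", "col3")
  .format("orc")
  .mode("overwrite")
  .save(outputLocation)
```

Would that be the right direction, or is the GC pressure likely coming from somewhere else?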