This has already been answered nicely by other contributors, but I had one question which is how do i access each nested value/unit of the dataframe.
So, for collections, we can use explode and for struct types we can directly call the unit by dot(.)
.
scala> val a = spark.read.option("multiLine", true).option("mode", "PERMISSIVE").json("file:///home/hdfs/spark_2.json")
a: org.apache.spark.sql.DataFrame = [atom: array<struct<dailydata:array<struct<datatimezone:string,intervaltime:bigint,intervalvalue:bigint,utcacquisitiontime:string>>,usage:string>>, titlename: string]
scala> a.printSchema
root
|-- atom: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dailydata: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- datatimezone: string (nullable = true)
| | | | |-- intervaltime: long (nullable = true)
| | | | |-- intervalvalue: long (nullable = true)
| | | | |-- utcacquisitiontime: string (nullable = true)
| | |-- usage: string (nullable = true)
|-- titlename: string (nullable = true)
scala> val b = a.withColumn("exploded_atom", explode(col("atom")))
b: org.apache.spark.sql.DataFrame = [atom: array<struct<dailydata:array<struct<datatimezone:string,intervaltime:bigint,intervalvalue:bigint,utcacquisitiontime:string>>,usage:string>>, titlename: string ... 1 more field]
scala> b.printSchema
root
|-- atom: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dailydata: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- datatimezone: string (nullable = true)
| | | | |-- intervaltime: long (nullable = true)
| | | | |-- intervalvalue: long (nullable = true)
| | | | |-- utcacquisitiontime: string (nullable = true)
| | |-- usage: string (nullable = true)
|-- titlename: string (nullable = true)
|-- exploded_atom: struct (nullable = true)
| |-- dailydata: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- datatimezone: string (nullable = true)
| | | |-- intervaltime: long (nullable = true)
| | | |-- intervalvalue: long (nullable = true)
| | | |-- utcacquisitiontime: string (nullable = true)
| |-- usage: string (nullable = true)
scala>
scala> val c = b.withColumn("exploded_atom_struct", explode(col("`exploded_atom`.dailydata")))
c: org.apache.spark.sql.DataFrame = [atom: array<struct<dailydata:array<struct<datatimezone:string,intervaltime:bigint,intervalvalue:bigint,utcacquisitiontime:string>>,usage:string>>, titlename: string ... 2 more fields]
scala>
scala> c.printSchema
root
|-- atom: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dailydata: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- datatimezone: string (nullable = true)
| | | | |-- intervaltime: long (nullable = true)
| | | | |-- intervalvalue: long (nullable = true)
| | | | |-- utcacquisitiontime: string (nullable = true)
| | |-- usage: string (nullable = true)
|-- titlename: string (nullable = true)
|-- exploded_atom: struct (nullable = true)
| |-- dailydata: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- datatimezone: string (nullable = true)
| | | |-- intervaltime: long (nullable = true)
| | | |-- intervalvalue: long (nullable = true)
| | | |-- utcacquisitiontime: string (nullable = true)
| |-- usage: string (nullable = true)
|-- exploded_atom_struct: struct (nullable = true)
| |-- datatimezone: string (nullable = true)
| |-- intervaltime: long (nullable = true)
| |-- intervalvalue: long (nullable = true)
| |-- utcacquisitiontime: string (nullable = true)
scala> val d = c.withColumn("exploded_atom_struct_last", col("`exploded_atom_struct`.utcacquisitiontime"))
d: org.apache.spark.sql.DataFrame = [atom: array<struct<dailydata:array<struct<datatimezone:string,intervaltime:bigint,intervalvalue:bigint,utcacquisitiontime:string>>,usage:string>>, titlename: string ... 3 more fields]
scala> d.printSchema
root
|-- atom: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dailydata: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- datatimezone: string (nullable = true)
| | | | |-- intervaltime: long (nullable = true)
| | | | |-- intervalvalue: long (nullable = true)
| | | | |-- utcacquisitiontime: string (nullable = true)
| | |-- usage: string (nullable = true)
|-- titlename: string (nullable = true)
|-- exploded_atom: struct (nullable = true)
| |-- dailydata: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- datatimezone: string (nullable = true)
| | | |-- intervaltime: long (nullable = true)
| | | |-- intervalvalue: long (nullable = true)
| | | |-- utcacquisitiontime: string (nullable = true)
| |-- usage: string (nullable = true)
|-- exploded_atom_struct: struct (nullable = true)
| |-- datatimezone: string (nullable = true)
| |-- intervaltime: long (nullable = true)
| |-- intervalvalue: long (nullable = true)
| |-- utcacquisitiontime: string (nullable = true)
|-- exploded_atom_struct_last: string (nullable = true)
scala> val d = c.select(col("titlename"), col("exploded_atom_struct.*"))
d: org.apache.spark.sql.DataFrame = [titlename: string, datatimezone: string ... 3 more fields]
scala> d.show
+---------+------------+------------+-------------+--------------------+
|titlename|datatimezone|intervaltime|intervalvalue| utcacquisitiontime|
+---------+------------+------------+-------------+--------------------+
| periodic| +02:00| 15| 28128|2017-03-27T22:00:00Z|
| periodic| +02:00| 15| 25687|2017-03-27T22:15:00Z|
+---------+------------+------------+-------------+--------------------+
So thought of posting it here, in case if anyone has similar questions seeing this question.