I am reading an XML file in Scala with this structure:
<tag1>
<tag2 id="0" attr1="abc" ... />
..
</tag1>
This was already reported as an issue and closed: https://github.com/databricks/spark-xml/pull/303
However, I am not able to resolve it.
import org.apache.spark.sql.SparkSession
import com.databricks.spark.xml._
import org.apache.spark.sql.types.{StructType, StructField, DoubleType, StringType}
import org.apache.spark.sql.{Row, SaveMode}

object stack {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.getOrCreate()

    // Explicit schema for the attribute-only <tag2> rows
    val customSchema = StructType(Array(
      StructField("id", DoubleType, nullable = true),
      StructField("attr1", StringType, nullable = true),
      ...
      ...
    ))

    // One DataFrame row per <tag2 ... /> element
    val df = spark.read
      .option("rowTag", "tag2")
      .format("com.databricks.spark.xml")
      .schema(customSchema)
      .load("dummy.xml")

    import spark.sql
    import spark.implicits._

    df.createOrReplaceTempView("temp1")
    sql("SELECT * FROM temp1 LIMIT 5").show()
  }
}
However, df.show(5) displays no rows.
The resolution in that thread talks about using XmlInputFormat, which I have not tried; if someone can guide me on that, it would be helpful.
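From what I can tell, the lower-level XmlInputFormat route would look roughly like the sketch below, run with the same spark session as above. This is untested: the start/end tag values are my own guesses, and I am not sure how a self-closing <tag2 ... /> element should be configured.

import com.databricks.spark.xml.XmlInputFormat
import org.apache.hadoop.io.{LongWritable, Text}

// Tell the input format which markers delimit one record (guessed values for a self-closing tag)
spark.sparkContext.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, "<tag2")
spark.sparkContext.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, "/>")

// Each record comes back as the raw XML text of one element; turning it into columns would still be manual
val records = spark.sparkContext.newAPIHadoopFile(
  "dummy.xml",
  classOf[XmlInputFormat],
  classOf[LongWritable],
  classOf[Text])

records.map { case (_, xml) => xml.toString }.take(5).foreach(println)

Is this the intended way to use it, or is there a reader option I am missing?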
A similar approach does work for me with a nested XML file (a sketch of that read follows the snippet):
<books>
<book> .. </book>
<name> abc </name>
</books>
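The read for that nested file is along these lines (the file name and rowTag are placeholders for my actual values, and I let the schema be inferred), and it returns rows as expected:

val booksDf = spark.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")   // one row per <book> element
  .load("books.xml")          // placeholder file name

booksDf.show(5)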
I want the DataFrame to show its values, and later I want to read many XML files and join them in a SQL query.
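To make that end goal concrete, this is roughly what I am aiming for once the first read works (the second file, its rowTag, and the join column are placeholders):

// Hypothetical second XML file, read the same way
val df2 = spark.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "tag3")
  .load("dummy2.xml")

df2.createOrReplaceTempView("temp2")

// Join the two XML-backed views in SQL ("id" as the join key is just an example)
spark.sql(
  """SELECT t1.*, t2.attr1 AS t2_attr1
     FROM temp1 t1
     JOIN temp2 t2 ON t1.id = t2.id""").show()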