I am new to Spark and I am coding using scala. I want to read a file from HDFS or S3 and convert it into Spark Data frame. The first line of the Csv file is the schema. but how can I create a dataframe with a schema having unknown columns? I was using the following piece of code to create the dataframe for a known schema.
def loadData(path:String): DataFrame = {
val rdd = sc.textFile(path);
val firstLine = rdd.first();
val schema = StructType(firstLine.split(',').map(fieldName=>StructField(fieldName,StringType,true)));
val noHeader = rdd.mapPartitionsWithIndex(
(i, iterator) =>
if (i == 0 && iterator.hasNext) {
iterator.next
iterator
} else iterator)
val rowRDD = noHeader.map(_.split(",")).map(p => Row(p(0), p(1), p(2), p(3), p(4),p(5)))
val dataFrame = sqlContext.createDataFrame(rowRDD, schema);
return dataFrame;
}