How do I create a DataFrame by reading multiple sheets of an Excel file in Spark 2?
Note: this is about reading multiple sheets from a single Excel file, not just one sheet.
/**
 * Reads one sheet of an Excel file into a DataFrame via the
 * crealytics spark-excel data source.
 *
 * @param file      path to the .xls/.xlsx file
 * @param sheetName sheet to read; `None` (the default) keeps the data
 *                  source's own default sheet, so existing callers are
 *                  unaffected
 * @return the sheet's contents, with the first row used as column names
 */
def readExcel(file: String, sheetName: Option[String] = None): DataFrame = {
  val reader = sqlContext.read
    .format("com.crealytics.spark.excel")
    .option("location", file)
    .option("useHeader", "true")               // first row holds column names
    .option("treatEmptyValuesAsNulls", "true") // blank cells become null
    .option("inferSchema", "true")
    .option("addColorColumns", "false")        // lowercase, consistent with the other option values
  // only set sheetName when the caller asked for a specific sheet
  sheetName.fold(reader)(reader.option("sheetName", _)).load()
}
val data = readExcel("path to your excel file")
data.show(false)
public void loadMultipleExcel()throws Exception{
try {
File file = new File("your xls file path");
FileInputStream fIP = new FileInputStream(file)
StructType customSchema = new StructType(new StructField[] {
new StructField("year", DataTypes.DateType, true,Metadata.empty()),
new StructField("make", DataTypes.DateType, true,Metadata.empty()),
new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
//Get the workbook instance for XLSX file
XSSFWorkbook wb = new XSSFWorkbook(fIP);
System.out.println(wb.getNumberOfSheets()); for (int i = 0; i < wb.getNumberOfSheets(); i++) {
XSSFSheet sheet = wb.getSheetAt(i);
System.out.print(sheet.getSheetName()+"\n");
Dataset<Row> dataSet =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "your file path")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("excel sheet name");
Row header = dataSet.first() ;
dataSet = dataSet.filter(x->x != header) ;
dataSet.printSchema();
dataSet.show();
}
} catch (Exception ex ) {
ex.printStackTrace();
}
}
public void loadMultipleExcel()throws Exception{
try {
File file = new File("/PATH TO YOUR XLS FILE INCLUDING FILENAME");
FileInputStream fIP = new FileInputStream(file)
//use customSchema when the schema is not inferred properly orelse ignore go ahead with inferSchema=true when creating dataset
StructType customSchema = new StructType(new StructField[] {
new StructField("year", DataTypes.DateType, true,Metadata.empty()),
new StructField("make", DataTypes.DateType, true,Metadata.empty()),
new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
//Get the workbook instance for XLSX file
//import org.apache.poi.xssf.usermodel.XSSFSheet;
//import org.apache.poi.xssf.usermodel.XSSFWorkbook;
XSSFWorkbook wb = new XSSFWorkbook(fIP);
System.out.println(wb.getNumberOfSheets());
//get the 1st sheet as dataset1
XSSFSheet sheet1 = wb.getSheetAt(0);
Dataset<Row> ds1 =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet1.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("NAME OF YOUR XLSM OR XLS FILE");
Row header1 = ds1.first() ;
ds1 = ds1.filter(x->x != header1) ;
//get rest of the sheets as dataset2 and union it with dataset1
for (int i = 1; i < wb.getNumberOfSheets(); i++) {
XSSFSheet sheet = wb.getSheetAt(i);
System.out.print(sheet.getSheetName()+"\n");
Dataset<Row> ds2 =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("NAME OF YOUR XLSM OR XLS FILE");
Row header = ds2.first() ;
ds2 = ds2.filter(x->x != header) ;
ds1=ds1.union(ds2);
ds2.unpersist();
ds2=null;
} catch (Exception ex ) {
ex.printStackTrace();
}
}