How do I create a DataFrame by reading multiple sheets of an Excel file in Spark 2?
Note: this is about reading multiple sheets from a single Excel file, not just one sheet.
/**
 * Reads one sheet of an Excel file into a DataFrame via the
 * crealytics spark-excel data source.
 *
 * @param file      path to the .xls/.xlsx file
 * @param sheetName sheet to read; `None` (the default) keeps the data
 *                  source's own default sheet, so existing callers are
 *                  unaffected
 * @return the sheet's contents, with the first row used as column names
 */
def readExcel(file: String, sheetName: Option[String] = None): DataFrame = {
  val reader = sqlContext.read
    .format("com.crealytics.spark.excel")
    .option("location", file)
    .option("useHeader", "true")               // first row holds column names
    .option("treatEmptyValuesAsNulls", "true") // blank cells become null
    .option("inferSchema", "true")
    .option("addColorColumns", "false")        // lowercase, consistent with the other option values
  // only set sheetName when the caller asked for a specific sheet
  sheetName.fold(reader)(reader.option("sheetName", _)).load()
}
val data = readExcel("path to your excel file")
data.show(false)
public void loadMultipleExcel()throws Exception{
try {
File file = new File("your xls file path");
FileInputStream fIP = new FileInputStream(file)
StructType customSchema = new StructType(new StructField[] {
new StructField("year", DataTypes.DateType, true,Metadata.empty()),
new StructField("make", DataTypes.DateType, true,Metadata.empty()),
new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
//Get the workbook instance for XLSX file
XSSFWorkbook wb = new XSSFWorkbook(fIP);
System.out.println(wb.getNumberOfSheets()); for (int i = 0; i < wb.getNumberOfSheets(); i++) {
XSSFSheet sheet = wb.getSheetAt(i);
System.out.print(sheet.getSheetName()+"\n");
Dataset<Row> dataSet =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "your file path")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("excel sheet name");
Row header = dataSet.first() ;
dataSet = dataSet.filter(x->x != header) ;
dataSet.printSchema();
dataSet.show();
}
} catch (Exception ex ) {
ex.printStackTrace();
}
}
public void loadMultipleExcel()throws Exception{
try {
File file = new File("/PATH TO YOUR XLS FILE INCLUDING FILENAME");
FileInputStream fIP = new FileInputStream(file)
//use customSchema when the schema is not inferred properly orelse ignore go ahead with inferSchema=true when creating dataset
StructType customSchema = new StructType(new StructField[] {
new StructField("year", DataTypes.DateType, true,Metadata.empty()),
new StructField("make", DataTypes.DateType, true,Metadata.empty()),
new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
//Get the workbook instance for XLSX file
//import org.apache.poi.xssf.usermodel.XSSFSheet;
//import org.apache.poi.xssf.usermodel.XSSFWorkbook;
XSSFWorkbook wb = new XSSFWorkbook(fIP);
System.out.println(wb.getNumberOfSheets());
//get the 1st sheet as dataset1
XSSFSheet sheet1 = wb.getSheetAt(0);
Dataset<Row> ds1 =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet1.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("NAME OF YOUR XLSM OR XLS FILE");
Row header1 = ds1.first() ;
ds1 = ds1.filter(x->x != header1) ;
//get rest of the sheets as dataset2 and union it with dataset1
for (int i = 1; i < wb.getNumberOfSheets(); i++) {
XSSFSheet sheet = wb.getSheetAt(i);
System.out.print(sheet.getSheetName()+"\n");
Dataset<Row> ds2 =spark.read().format("com.crealytics.spark.excel")
.option("sheetName", sheet.getSheetName())
.option("useHeader", "true")
.option("inferSchema", true)
.option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
.option("treatEmptyValuesAsNulls", true)
.option("addColorColumns", "false")
.load("NAME OF YOUR XLSM OR XLS FILE");
Row header = ds2.first() ;
ds2 = ds2.filter(x->x != header) ;
ds1=ds1.union(ds2);
ds2.unpersist();
ds2=null;
} catch (Exception ex ) {
ex.printStackTrace();
}
}