-3

How to create a DataFrame by read multiple sheets of an excel file in spark2 ?

Note : Reading multiple sheets from an excel file(single file) not a single sheet

swcraft
  • 1
  • 1
  • 7
  • This Question isn't duplicated. please read the question again and check with the other one So,please reopen it because still i didn't get solution. – swcraft Oct 19 '18 at 10:05

3 Answers3

0
def readExcel(file: String): DataFrame = sqlContext.read
.format("com.crealytics.spark.excel")
.option("location", file)
.option("useHeader", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "true")
.option("addColorColumns", "False")
.load()

val data = readExcel("path to your excel file")

data.show(false)

  • Above snippet is n't working throwing Illegal argument exception saying as use 'path'. using with 'path' it is only reading first sheet only . but, my query is how to read multiple excel sheets using spark 2.0 API – swcraft Oct 12 '18 at 13:31
-1

public void loadMultipleExcel()throws Exception{

try {

    File file = new File("your xls file path");
    FileInputStream fIP = new FileInputStream(file)
   StructType  customSchema = new StructType(new StructField[] {

            new StructField("year", DataTypes.DateType, true,Metadata.empty()),
            new StructField("make", DataTypes.DateType, true,Metadata.empty()),
            new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
            new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
            new StructField("blank", DataTypes.StringType,true,Metadata.empty())});
    //Get the workbook instance for XLSX file 
    XSSFWorkbook wb = new XSSFWorkbook(fIP);

System.out.println(wb.getNumberOfSheets()); for (int i = 0; i < wb.getNumberOfSheets(); i++) {

        XSSFSheet sheet = wb.getSheetAt(i);
        System.out.print(sheet.getSheetName()+"\n");

        Dataset<Row> dataSet =spark.read().format("com.crealytics.spark.excel")
                .option("sheetName", sheet.getSheetName())
                .option("useHeader", "true")
                .option("inferSchema", true)
                .option("location", "your file path")
                .option("treatEmptyValuesAsNulls", true)
                .option("addColorColumns", "false") 
                .load("excel sheet name");
      Row  header = dataSet.first() ;
                dataSet = dataSet.filter(x->x != header) ; 

        dataSet.printSchema();
        dataSet.show();

    }

} catch (Exception  ex ) {
    ex.printStackTrace();
}

}

Nitya Yekkirala
  • 265
  • 3
  • 3
  • 1
    Please reformat your source code and give some information, explaining how this code solves the question. – Dominique Sep 11 '18 at 10:23
-1
public void loadMultipleExcel()throws Exception{
    try {
        File file = new File("/PATH TO YOUR XLS FILE INCLUDING FILENAME");
        FileInputStream fIP = new FileInputStream(file)

//use customSchema when the schema is not inferred properly orelse ignore go ahead with inferSchema=true when creating dataset

        StructType  customSchema = new StructType(new StructField[] {
            new StructField("year", DataTypes.DateType, true,Metadata.empty()),
            new StructField("make", DataTypes.DateType, true,Metadata.empty()),
            new StructField("model", DataTypes.DoubleType,true,Metadata.empty()),
            new StructField("comment", DataTypes.StringType,true,Metadata.empty()),
            new StructField("blank", DataTypes.StringType,true,Metadata.empty())});



//Get the workbook instance for XLSX file 
//import org.apache.poi.xssf.usermodel.XSSFSheet;
//import org.apache.poi.xssf.usermodel.XSSFWorkbook;




            XSSFWorkbook wb = new XSSFWorkbook(fIP);
            System.out.println(wb.getNumberOfSheets());
//get the 1st sheet as dataset1
            XSSFSheet sheet1 = wb.getSheetAt(0);
            Dataset<Row> ds1 =spark.read().format("com.crealytics.spark.excel")
            .option("sheetName", sheet1.getSheetName())
            .option("useHeader", "true")
            .option("inferSchema", true)
            .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
            .option("treatEmptyValuesAsNulls", true)
            .option("addColorColumns", "false") 
            .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header1 = ds1.first() ;
            ds1 = ds1.filter(x->x != header1) ; 

//get rest of the sheets as dataset2 and union it with dataset1
        for (int i = 1; i < wb.getNumberOfSheets(); i++) {
            XSSFSheet sheet = wb.getSheetAt(i);
            System.out.print(sheet.getSheetName()+"\n");
            Dataset<Row> ds2 =spark.read().format("com.crealytics.spark.excel")
                    .option("sheetName", sheet.getSheetName())
                    .option("useHeader", "true")
                    .option("inferSchema", true)
                    .option("location", "/PATH TO YOUR XLS FILE INCLUDING FILENAME")
                    .option("treatEmptyValuesAsNulls", true)
                    .option("addColorColumns", "false") 
                    .load("NAME OF YOUR XLSM OR XLS FILE");

            Row  header = ds2.first() ;
            ds2 = ds2.filter(x->x != header) ; 
            ds1=ds1.union(ds2);
            ds2.unpersist();
            ds2=null;

    } catch (Exception  ex ) {
        ex.printStackTrace();
    }
}   
Nitya Yekkirala
  • 265
  • 3
  • 3