1

The project that I'm working on is trying to read a very large Excel file (a couple hundred columns and around 3000 rows) and recognize patterns in a series of letters. It works just fine on smaller files, but when I try to run it using this file I receive a java.lang.OutOfMemoryError: Java heap space error even when I'm only trying to analyze the first few rows. The error appears to be at Workbook wb = WorkbookFactory.create(new File(filepath));

I've tried a few of the solutions on this website, but have not come across any success. My code is as follows:

import java.awt.List;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class ExcelReader {

    public int Reader(File file) throws IOException, EncryptedDocumentException, InvalidFormatException {
        String filepath = file.getPath();
        Workbook wb = WorkbookFactory.create(new File(filepath));
        XSSFSheet sheet = (XSSFSheet) wb.getSheetAt(0);
        XSSFRow row;
        XSSFCell cell;
        ArrayList<Integer> list = new ArrayList<Integer>();

        int rows;
        int cols = 0;
        int temp = 0;
        rows = sheet.getPhysicalNumberOfRows();

        for (int i = 0; i <= 1; i++) {
            row = sheet.getRow(i);
            if (row != null) {
                temp = sheet.getRow(i).getPhysicalNumberOfCells();
                if (temp > cols)
                    cols = temp;
            }
        }
        for (int r = 0; r <= 60; r++) {
            row = sheet.getRow(r);
            if (row != null) {
                for (int c = 0; c <= cols; c++) {
                    int numblanks = 0;
                    cell = row.getCell((short) c);
                    if (cell != null) {
                        //System.out.print(cell + "\t\t");
                    } else {
                        //System.out.print("\t\t");
                    }
                    if (cell != null && cell.getCellType() == XSSFCell.CELL_TYPE_STRING) {
                        if ("N".equals(cell.getStringCellValue())) {
                            for (int k = c; k <= cols; k++) {
                                if ("-".equals(row.getCell(k).getStringCellValue())) {
                                    numblanks++;
                                    continue;
                                }
                                if ("S".equals(row.getCell(c + 2 + numblanks).getStringCellValue())
                                        || "T".equals(row.getCell(c + 2 + numblanks).getStringCellValue())) {
                                    list.add((int) sheet.getRow(1).getCell(c).getNumericCellValue());
                                    break;
                                }
                            }
                        }
                    }
                }
                System.out.println();
            }
        }
        System.out.println();
        System.out.println("Rows: " + rows);
        System.out.println("Columns: " + cols);
        System.out.println(list);
        return temp;
    }
}

Thank you for any help you can give me!

rgettman
  • 176,041
  • 30
  • 275
  • 357
jmc1094
  • 135
  • 1
  • 11

1 Answers1

1

I solved this issue before. My case was to read a 23M Excel file which contains 230k rows.

Increase maximum heap size is not a good solution. Apache poi does not have a streaming mode to read data. This non-streaming mode costs too much memory.

My solution is to convert the data to xml and then use XMLReader to parse the data.

Please check the following sample code:

    protected List<Entity> parseData(InputStream in) throws Exception {
        OPCPackage pkg = OPCPackage.open(in);
        XSSFReader r = new XSSFReader(pkg);
        SharedStringsTable sst = r.getSharedStringsTable();
        XMLReader parser = fetchSheetParser(sst);
        XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) r.getSheetsData();

        while (sheets.hasNext()) {
            InputStream sheet = sheets.next();
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
            break; // if only need to process one sheet.
        }
        return SheetHandler.getRawData();
    }

    private XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser =
                XMLReaderFactory.createXMLReader();
        ContentHandler handler = new SheetHandler(sst);
        parser.setContentHandler(handler);
        return parser;
    }

    private static class SheetHandler extends DefaultHandler {

        private SharedStringsTable sst;
        private String lastContents;
        private boolean nextIsString;
        private boolean nextIsInlineString;
        private boolean nextIsNull;

        private SheetHandler(SharedStringsTable sst) {
            this.sst = sst;
            rawData = new ArrayList<Entity>();
        }

        public static List<Entity> getRawData() {
            return rawData;
        }


        @Override
        public void startElement(String uri, String localName, String name,
                                 Attributes attributes) throws SAXException {

        }

        @Override
        public void endElement(String uri, String localName, String name)
                throws SAXException {


        }

        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            lastContents += new String(ch, start, length);
        }
    }
}
Bryan
  • 400
  • 1
  • 3
  • 15
  • 4
    Apache POI does have a semi-easy streaming mode - use [XSSFReader](https://poi.apache.org/apidocs/org/apache/poi/xssf/eventusermodel/XSSFReader.html) and [SheetContentsHandler](https://poi.apache.org/apidocs/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.SheetContentsHandler.html) – Gagravarr Feb 26 '16 at 23:42