1

I need to read a very large Excel file that has both date and not date numbers. All of the examples I find seem to be able to do one or the other (identify a cell as a date value or read the file in constant memory).

The only solution that seems to work for the very large file is the StreamingReader approach as described here (the other examples described here either do not work for the file format I have or give out of memory heap errors).

Read Huge Excel file(500K rows) in java

http://poi.apache.org/components/spreadsheet/how-to.html#event_api

What I'm doing to read the file is shown below. The entire example with test-excel.xmls (a small test file) is available in github here:

https://github.com/greshje/example-poi-streaming

POM.XML:

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <build.version>1.0.4-001</build.version>
</properties>

<modelVersion>4.0.0</modelVersion>
<groupId>com.greshje.examples</groupId>
<artifactId>poi-streaming-example</artifactId>
<version>1.0.4-SNAPSHOT</version>
<packaging>jar</packaging>

<!-- 
*
* dependencies
*
-->

<dependencies>
    
    <!-- JUNIT https://mvnrepository.com/artifact/junit/junit -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    
    <!-- JUNIT-TOOLBOX https://mvnrepository.com/artifact/com.googlecode.junit-toolbox/junit-toolbox -->
    <dependency>
        <groupId>com.googlecode.junit-toolbox</groupId>
        <artifactId>junit-toolbox</artifactId>
        <version>2.4</version>
        <scope>test</scope>
    </dependency>

    <!-- SLF4J LOGBACK CLASSIC https://mvnrepository.com/artifact/ch.qos.logback/logback-classic -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- POI https://mvnrepository.com/artifact/org.apache.poi/poi -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- POI-OOXML https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- XERCES https://mvnrepository.com/artifact/xerces/xerces -->
    <dependency>
        <groupId>xerces</groupId>
        <artifactId>xerces</artifactId>
        <version>2.4.0</version>
    </dependency>

    <!-- XERCES-IMPL https://mvnrepository.com/artifact/xerces/xercesImpl -->
    <dependency>
        <groupId>xerces</groupId>
        <artifactId>xercesImpl</artifactId>
        <version>2.12.0</version>
    </dependency>

    <!-- XLSX-STREAMER https://mvnrepository.com/artifact/com.monitorjbl/xlsx-streamer -->
    <dependency>
        <groupId>com.monitorjbl</groupId>
        <artifactId>xlsx-streamer</artifactId>
        <version>0.2.3</version>
    </dependency>

</dependencies>

<!-- 
*
* build
*
-->

<build>
    <plugins>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.7.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.3.2</version>
            <!--  
            <configuration> 
                <finalName></finalName>                   
            </configuration>
            -->
        </plugin>      
    </plugins>
</build>

Java Code:

package com.greshje.example.poi.streaming;

import java.io.InputStream;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.monitorjbl.xlsx.StreamingReader;

public class PoiStreamingExample {

    private static final Logger log = LoggerFactory.getLogger(PoiStreamingExample.class);

    private static final String FILE_NAME = "/com/greshje/example/poi/streaming/test-file.xlsx";

    public static void main(String[] args) {
        log.info("Starting test...");
        log.info("Getting file");
        InputStream in = PoiStreamingExample.class.getResourceAsStream(FILE_NAME);
        log.info("Got file");
        StreamingReader reader = getReader(in, 0);
        log.info("File contents:");
        for (Row row : reader) {
            String rowString = "";
            for (Cell cell : row) {
                if (rowString != "") {
                    rowString += ",";
                }
                // NEED A WAY TO GET A DATE WHERE APPROPRIATE HERE
                rowString += cell.getStringCellValue();
            }
            log.info(rowString);
        }
        log.info("Done.");
    }

    public static StreamingReader getReader(InputStream in, int sheetIndex) {
        try {
            StreamingReader reader = StreamingReader.builder()
                    .rowCacheSize(100) // number of rows to keep in memory (defaults to 10)
                    .bufferSize(4096) // buffer size to use when reading InputStream to file (defaults to 1024)
                    .sheetIndex(sheetIndex) // index of sheet to use
                    .read(in); // read the file
            return reader;
        } catch (Exception exp) {
            throw new RuntimeException(exp);
        }
    }

}

My test data looks like this:

enter image description here

The output looks like this (date and not date values are represented as numbers).

2020-09-06 10:47:13,814 10:47:13.814 [main] INFO  (PoiStreamingExample.java:19) - Starting test...
2020-09-06 10:47:13,822 10:47:13.822 [main] INFO  (PoiStreamingExample.java:20) - Getting file
2020-09-06 10:47:13,823 10:47:13.823 [main] INFO  (PoiStreamingExample.java:22) - Got file
2020-09-06 10:47:15,117 10:47:15.117 [main] INFO  (PoiStreamingExample.java:24) - File contents:
2020-09-06 10:47:15,149 10:47:15.149 [main] INFO  (PoiStreamingExample.java:33) - Number,Date (mostly),Date (mostly)
2020-09-06 10:47:15,150 10:47:15.150 [main] INFO  (PoiStreamingExample.java:33) - 123456,43550
2020-09-06 10:47:15,150 10:47:15.150 [main] INFO  (PoiStreamingExample.java:33) - 123456,43685,44019
2020-09-06 10:47:15,150 10:47:15.150 [main] INFO  (PoiStreamingExample.java:33) - 123456,43522,43535
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 123456,43503,43538
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 123456,43535,43564
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 123456,43536,43574
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43553,43700
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,44041
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43521,43550
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43558,43580
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43567,43599
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43633,43641
2020-09-06 10:47:15,151 10:47:15.151 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43573,43615
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43577,43606
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43719,43754
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - 7890123,43634,43641
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - 123,43550
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - smith,43550
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - jones,43550
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:33) - 43550,43550
2020-09-06 10:47:15,152 10:47:15.152 [main] INFO  (PoiStreamingExample.java:35) - Done.

--- EDIT ------------------------

I updated the xls-stream version and DO NOT still have the issue UPDATE TO NEW VERSION FIXED THIS!!!

<!-- XLSX-STREAMER https://mvnrepository.com/artifact/com.monitorjbl/xlsx-streamer -->
<dependency>
    <groupId>com.monitorjbl</groupId>
    <artifactId>xlsx-streamer</artifactId>
    <version>2.1.0</version>
</dependency>

THIS WAS WITH THE OLD VERSION: There just doesn't seem to be any information that would allow the cell type to be determined:

enter image description here

Also, code to get cell type doesn't seem to be supported (in the old version)

enter image description here

The new version has much more cell information and works with dates and numbers giving the results shown in the accepted answer.

John
  • 3,458
  • 4
  • 33
  • 54
  • 1
    Why using that old version of `Streaming Excel Reader`? Current version is `2.1.0`. Why not using this? – Axel Richter Sep 06 '20 at 15:43
  • Good catch, I must have gotten it from and old post :) I'll update and see if that helps, thanks! – John Sep 06 '20 at 15:46

1 Answers1

2

Using latest version of Excel Streaming Reader, which is 2.1.0, this problem is gone.

Using your test-file.xlsx and following code:

import java.io.InputStream;

import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Cell;

import com.monitorjbl.xlsx.StreamingReader;

public class PoiStreamingExample {

 private static final String FILE_NAME = "./test-file.xlsx";

 public static void main(String[] args) {

  try (
   InputStream is = PoiStreamingExample.class.getResourceAsStream(FILE_NAME);
   Workbook workbook = StreamingReader.builder()
    .rowCacheSize(100)
    .bufferSize(4096)
    .open(is)) {
   Sheet sheet =  workbook.getSheetAt(0);
   for (Row r : sheet) {
    String rowString = "";
    for (Cell c : r) {
     if (rowString != "") {
      rowString += ",";
     } 
     rowString += c.getStringCellValue();
    }
    System.out.println(rowString);
   }
  } catch (Exception ex) {
   ex.printStackTrace();
  }

 }
}

It prints:

Number,Date (mostly),Date (mostly)
123456,3/26/19
123456,8/8/19,7/7/20
123456,2/26/19,3/11/19
123456,2/7/19,3/14/19
123456,3/11/19,4/9/19
123456,3/12/19,4/19/19
7890123,3/29/19,8/23/19
7890123,7/29/20
7890123,2/25/19,3/26/19
7890123,4/3/19,4/25/19
7890123,4/12/19,5/14/19
7890123,6/17/19,6/25/19
7890123,4/18/19,5/30/19
7890123,4/22/19,5/21/19
7890123,9/11/19,10/16/19
7890123,6/18/19,6/25/19
123,43550
smith,43550
jones,43550
43550,43550
Axel Richter
  • 56,077
  • 6
  • 60
  • 87