Apache Commons CSV ➙ 12 seconds for million rows
Is there any existing library that would help me to speed up things?
Yes, the Apache Commons CSV project works very well in my experience.
Here is an example app that uses Apache Commons CSV library to write and read rows of 24 columns: An integer sequential number, an Instant
, and the rest are random UUID
objects.
For 10,000 rows, the writing and the read each take about half a second. The reading includes reconstituting the Integer
, Instant
, and UUID
objects.
My example code lets you toggle on or off the reconstituting of objects. I ran both with a million rows. This creates a file of 850 megs. I am using Java 12 on a MacBook Pro (Retina, 15-inch, Late 2013), 2.3 GHz Intel Core i7, 16 GB 1600 MHz DDR3, Apple built-in SSD.
For a million rows, ten seconds for reading plus two seconds for parsing:
- Writing: PT25.994816S
- Reading only: PT10.353912S
- Reading & parsing: PT12.219364S
Source code is a single .java
file. Has a write method, and a read
method. Both methods called from a main
method.
I opened a BufferedReader
by calling Files.newBufferedReader
.
package work.basil.example;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.time.Instant;
import java.util.UUID;
public class CsvReadingWritingDemo
{
public static void main ( String[] args )
{
CsvReadingWritingDemo app = new CsvReadingWritingDemo();
app.write();
app.read();
}
private void write ()
{
Instant start = Instant.now();
int limit = 1_000_000; // 10_000 100_000 1_000_000
Path path = Paths.get( "/Users/basilbourque/IdeaProjects/Demo/csv.txt" );
try (
Writer writer = Files.newBufferedWriter( path, StandardCharsets.UTF_8 );
CSVPrinter printer = new CSVPrinter( writer , CSVFormat.RFC4180 );
)
{
printer.printRecord( "id" , "instant" , "uuid_01" , "uuid_02" , "uuid_03" , "uuid_04" , "uuid_05" , "uuid_06" , "uuid_07" , "uuid_08" , "uuid_09" , "uuid_10" , "uuid_11" , "uuid_12" , "uuid_13" , "uuid_14" , "uuid_15" , "uuid_16" , "uuid_17" , "uuid_18" , "uuid_19" , "uuid_20" , "uuid_21" , "uuid_22" );
for ( int i = 1 ; i <= limit ; i++ )
{
printer.printRecord( i , Instant.now() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() , UUID.randomUUID() );
}
} catch ( IOException ex )
{
ex.printStackTrace();
}
Instant stop = Instant.now();
Duration d = Duration.between( start , stop );
System.out.println( "Wrote CSV for limit: " + limit );
System.out.println( "Elapsed: " + d );
}
private void read ()
{
Instant start = Instant.now();
int count = 0;
Path path = Paths.get( "/Users/basilbourque/IdeaProjects/Demo/csv.txt" );
try (
Reader reader = Files.newBufferedReader( path , StandardCharsets.UTF_8) ;
)
{
CSVFormat format = CSVFormat.RFC4180.withFirstRecordAsHeader();
CSVParser parser = CSVParser.parse( reader , format );
for ( CSVRecord csvRecord : parser )
{
if ( true ) // Toggle parsing of the string data into objects. Turn off (`false`) to see strictly the time taken by Apache Commons CSV to read & parse the lines. Turn on (`true`) to get a feel for real-world load.
{
Integer id = Integer.valueOf( csvRecord.get( 0 ) ); // Annoying zero-based index counting.
Instant instant = Instant.parse( csvRecord.get( 1 ) );
for ( int i = 3 - 1 ; i <= 22 - 1 ; i++ ) // Subtract one for annoying zero-based index counting.
{
UUID uuid = UUID.fromString( csvRecord.get( i ) );
}
}
count++;
if ( count % 1_000 == 0 ) // Every so often, report progress.
{
//System.out.println( "# " + count );
}
}
} catch ( IOException e )
{
e.printStackTrace();
}
Instant stop = Instant.now();
Duration d = Duration.between( start , stop );
System.out.println( "Read CSV for count: " + count );
System.out.println( "Elapsed: " + d );
}
}