I have been tasked with reading large CSV files (300k+ records) and applying regexp patterns to each record. I have always been a PHP developer and have never really tried any other language, but I decided to take the plunge and attempt this in Java, which I assumed would be much faster.
In fact, just reading the CSV file line by line was 3x faster in Java. However, once I added the regexp matching, the Java implementation took 10-20% longer than the PHP script.
It is quite possible that I did something wrong in Java, because I only learned it as I went today. Below are the two scripts; any advice would be greatly appreciated. I would really like not to give up on Java for this particular project.
PHP CODE
<?php
/**
 * Benchmark: read largeCSV.txt row by row and run every pattern in $patterns
 * against the column at index 6 of each row, then print the elapsed time.
 */

// microtime(true) returns fractional seconds; time() only has whole-second
// resolution, which is too coarse to compare runs that differ by 10-20%.
$bgtime = microtime(true);

$patterns = array(
    "/SOME REGEXP/",
    "/SOME REGEXP/",
    "/SOME REGEXP/",
    "/SOME REGEXP/"
);

$fh = fopen('largeCSV.txt', 'r');
if ($fh === false) {
    // Without this check the read loop and fclose() would be handed `false`.
    print "Unable to open largeCSV.txt\n";
    exit(1);
}

while (($currentLineString = fgetcsv($fh, 10000, ",")) !== false) {
    // A blank line or a short row has no index 6; skip it instead of raising
    // an undefined-offset notice and matching against null.
    if (!isset($currentLineString[6])) {
        continue;
    }

    foreach ($patterns as $pattern) {
        preg_match_all($pattern, $currentLineString[6], $matches);
    }
}

fclose($fh);

printf("Execution Time: %.3f", microtime(true) - $bgtime);
?>
JAVA CODE
import au.com.bytecode.opencsv.CSVReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.ArrayList;
/**
 * Benchmark that reads {@code largeCSV.txt} with opencsv's {@link CSVReader}
 * and runs a fixed set of regular expressions against one column of every
 * data row, then prints the elapsed wall-clock time.
 */
public class testParser
{
    /** Zero-based index of the CSV column the patterns are applied to. */
    private static final int DESCRIPTION_COLUMN = 6;

    /**
     * Runs the benchmark.
     *
     * <p>The first row of the file is treated as a header and skipped. Rows
     * that have no column at {@link #DESCRIPTION_COLUMN} are skipped as well.
     * An {@link IOException} while reading is reported on standard error; the
     * elapsed time is printed in either case.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        long start = System.currentTimeMillis();

        String[] rawPatterns = {
            "SOME REGEXP",
            "SOME REGEXP",
            "SOME REGEXP",
            "SOME REGEXP"
        };

        // Compile each pattern once and keep a single reusable Matcher per
        // pattern; Matcher.reset(CharSequence) re-targets it at a new input
        // without allocating a fresh Matcher for every row.
        Matcher[] matchers = new Matcher[rawPatterns.length];
        for (int i = 0; i < rawPatterns.length; i++)
        {
            matchers[i] = Pattern.compile(rawPatterns[i]).matcher("");
        }

        String fileName = "largeCSV.txt";
        try
        {
            CSVReader reader = new CSVReader(new FileReader(fileName));
            try
            {
                // Discard the header row so it is not matched as data.
                reader.readNext();

                String[] nextLine;
                while ((nextLine = reader.readNext()) != null)
                {
                    // Short or blank rows have no description column; indexing
                    // them would throw ArrayIndexOutOfBoundsException.
                    if (nextLine.length <= DESCRIPTION_COLUMN)
                    {
                        continue;
                    }

                    String description = nextLine[DESCRIPTION_COLUMN];
                    for (Matcher m : matchers)
                    {
                        m.reset(description);
                        while (m.find())
                        {
                            // Each match is available here via m.group(0);
                            // the benchmark only measures the search itself.
                        }
                    }
                }
            }
            finally
            {
                // Always release the underlying file handle.
                reader.close();
            }
        }
        catch (IOException ioe)
        {
            System.err.println("Failed to read " + fileName + ": " + ioe.getMessage());
        }

        long end = System.currentTimeMillis();
        // Divide by 1000.0 so sub-second differences are not truncated away.
        System.out.println("Execution time was " + ((end - start) / 1000.0) + " seconds.");
    }
}