I have a .csv file with 500K records where each record has 4 columns. I want to import all of these records into an SQLite table using Java (JDBC).

I have tried executeUpdate() and executeBatch(), but both are really slow, processing only 400-500 records per minute.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Scanner;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.text.ParseException;
import java.sql.*;

public class MyClass {
    public static void main(String[] args) throws FileNotFoundException, ParseException, SQLException, ClassNotFoundException {
        Connection c = null;
        Statement stmt = null;
        try {
            Class.forName("org.sqlite.JDBC");
            c = DriverManager.getConnection("jdbc:sqlite:mydb.db");
            stmt = c.createStatement();

            String drop_sql = "DROP TABLE IF EXISTS MyTable";
            stmt.executeUpdate(drop_sql);

            String create_sql = "CREATE TABLE MyTable " +
                    "(VAR1 CHAR(50) NOT NULL, " +
                    " VAR2 CHAR(10) PRIMARY KEY NOT NULL, " +
                    " VAR3 TEXT NOT NULL, " +
                    " VAR4 TEXT NOT NULL)";
            stmt.executeUpdate(create_sql);

            File premFile = new File("MyFile.csv");
            DateFormat df = new SimpleDateFormat("dd/MM/yyyy");
            Scanner scanner = new Scanner(premFile);
            scanner.useDelimiter(",");
            int i = 0, count = 500000;

            while (i < count) {
                String myRecord = scanner.nextLine();
                String[] cols = myRecord.split(",");
                String var1 = cols[0];
                String var2 = cols[1];
                Date var3 = df.parse(cols[2]);
                Date var4 = df.parse(cols[3]);

                String query = "INSERT INTO MyTable VALUES (" +
                        "'" + var1 + "', " +
                        "'" + var2 + "', " +
                        "'" + var3 + "', " +
                        "'" + var4 + "')";
                stmt.addBatch(query);
                i++;
            }
            stmt.executeBatch();
            stmt.close();
            c.close();
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
            System.exit(0);
        }
    }
}

If I go the SQLite way and import the csv into the table with .import my_file.csv my_table, the whole task finishes within seconds. Is there any way to achieve similar performance using only Java code?

I also tried PreparedStatement, but it did not bring any visible improvement.

Sonu Mishra

1 Answer

I think your biggest problem here might be that you are going back to the file on each iteration; I would try loading the lines into an array first and processing them from there (see the sketch below).

P.S. You might not want to use scanner.useDelimiter(",") since you are calling scanner.nextLine() anyway rather than scanner.next(). I believe it does nothing here, although I might be wrong about that, so give it a go.
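
For what it's worth, a minimal sketch of that "load the lines first" suggestion, assuming the same MyFile.csv as in the question (the class name and the use of java.nio Files.readAllLines are just illustrative choices):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class LoadLinesFirst {
    public static void main(String[] args) throws IOException {
        // Read the whole CSV into memory up front...
        List<String> lines = Files.readAllLines(Paths.get("MyFile.csv"));
        // ...then iterate over the in-memory list instead of pulling from the Scanner
        for (String line : lines) {
            String[] cols = line.split(",");
            // build and batch the INSERT for cols[0]..cols[3] here, as in the question's loop
        }
    }
}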

J. Knight
  • The while loop executes very quickly anyway; the bottleneck is the database insert (stmt.executeBatch() here) – Sonu Mishra Apr 18 '17 at 03:26
  • In that case, look at this answer which suggests turning off autoCommit. http://stackoverflow.com/questions/9319366/getting-executebatch-to-execute-faster – J. Knight Apr 18 '17 at 03:33
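
Building on that comment, here is a minimal sketch of what the import might look like with auto-commit turned off and a PreparedStatement batch. The class name, the batch size of 10,000, and the choice to insert the date columns as raw text (rather than parsing them with SimpleDateFormat as in the question) are illustrative assumptions; the table schema and file name are taken from the question.

import java.io.File;
import java.io.FileNotFoundException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Scanner;

public class FastCsvImport {
    public static void main(String[] args) throws SQLException, FileNotFoundException {
        // A JDBC 4 driver such as sqlite-jdbc registers itself, so Class.forName() is usually unnecessary
        try (Connection c = DriverManager.getConnection("jdbc:sqlite:mydb.db")) {
            c.setAutoCommit(false); // one transaction for the whole import instead of one per statement

            String sql = "INSERT INTO MyTable (VAR1, VAR2, VAR3, VAR4) VALUES (?, ?, ?, ?)";
            try (PreparedStatement ps = c.prepareStatement(sql);
                 Scanner scanner = new Scanner(new File("MyFile.csv"))) {

                final int batchSize = 10000; // arbitrary chunk size; tune for your data
                int n = 0;
                while (scanner.hasNextLine()) {
                    String[] cols = scanner.nextLine().split(",");
                    ps.setString(1, cols[0]);
                    ps.setString(2, cols[1]);
                    ps.setString(3, cols[2]); // stored as raw text; parse/reformat here if you need a canonical date format
                    ps.setString(4, cols[3]);
                    ps.addBatch();
                    if (++n % batchSize == 0) {
                        ps.executeBatch(); // flush a chunk so the batch does not grow without bound
                    }
                }
                ps.executeBatch(); // flush the remaining rows
            }
            c.commit(); // a single commit at the end of the import
        }
    }
}

The single transaction is normally what makes the difference: with auto-commit on, every statement is committed (and synced to disk) individually, which is typically what caps throughput, whereas one big transaction behaves much closer to what the sqlite3 .import command does.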