
I'm trying to import all googlebooks-1gram files into a PostgreSQL database. I wrote the following Java code for that:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long lastId = -1L;
                for (String filename: files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        Long id = -1L;
                        if (lastWord.equals(data[0])) {
                            id = lastId;
                        } else {
                            wordInsert.setString(1, data[0]);
                            wordInsert.executeUpdate();
                            ResultSet resultSet = wordInsert.getGeneratedKeys();
                            if (resultSet != null && resultSet.next()) {
                                id = resultSet.getLong(1);
                            }
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        lastId = id;
                    }
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

However, after running this for ~3 hours it had only placed 1,000,000 entries in the wordcounts table. The entire 1-gram dataset is 500,000,000 lines, so importing everything would take about 62.5 days. I could accept it taking about a week, but 2 months? I think I'm doing something seriously wrong here. (I do have a server that runs 24/7, so I can actually run it for that long, but faster would be nice XD)

EDIT: This is how I solved it:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            c.setAutoCommit(false); // commit manually in batches instead of after every statement
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (id, word) VALUES (?,?)"
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long id = 0L;
                for (String filename: files) {
                    BufferedReader input = new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    int i = 0;
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        if (!lastWord.equals(data[0])) { // new word: assign the next id ourselves
                            id++;
                            wordInsert.setLong(1, id);
                            wordInsert.setString(2, data[0]);
                            wordInsert.executeUpdate();
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        if (i % 10000 == 0) {
                            c.commit();
                        }
                        if (i % 100000 == 0) {
                            System.out.println(i+" mark file "+filename);
                        }
                        i++;
                    }
                    c.commit();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

It now reaches 1.5 million rows in about 15 minutes. That's fast enough for me, thanks all!

teuneboon
  • Try running without executing the SQL; you'll probably find you need to tweak your PostgreSQL database more than anything. – Steve-o Apr 09 '11 at 15:31

6 Answers


JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:

c.setAutoCommit(false);

then commit in batches, something along the lines of:

long ops = 0;

for (String filename : files) {
    // ...
    while ((line = input.readLine()) != null) {
        // insert some stuff...

        ops++;

        if (ops % 1000 == 0) {
            c.commit();
        }
    }
}

c.commit();
SimonJ
  • I'll try that; in combination with all of the other improvements suggested here it could surely help :) – teuneboon Apr 09 '11 at 15:36

If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later (a sketch follows at the end of this answer).

Setting autocommit off and doing a manual commit every 10,000 records or so (check the documentation for a reasonable value; there is some limit) could speed things up as well.

Generating the index/foreign key yourself and keeping track of it should be faster than wordInsert.getGeneratedKeys(), but I'm not sure whether it is possible with your content.

There is an approach called 'bulk insert'. I don't remember the details, but it's a starting point for a search.
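
To make the first suggestion concrete, here is a minimal sketch of dropping an index before the load and rebuilding it afterwards. The index name and column are hypothetical; query pg_indexes to find the real ones:

Statement s = c.createStatement();
// Drop the index so each INSERT no longer has to maintain it
// ("wordcounts_word_id_idx" is an assumed name, for illustration only).
s.execute("DROP INDEX IF EXISTS wordcounts_word_id_idx");

// ... run the whole import here ...

// Rebuild the index once, after all rows are in.
s.execute("CREATE INDEX wordcounts_word_id_idx ON wordcounts (word_id)");
s.close();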

user unknown

Write it to use threading, running 4 threads at the same time, or split the work into sections (read from a config file) and distribute it over X machines so they import the data together.

NKCSS
  • Hmm, I suppose threading could help, but unfortunately I've only got 1 machine I can run it on, and that has only 2 cores. So the best it could do would still be 31.25 days – teuneboon Apr 09 '11 at 15:31
  • I'm not sure that math is correct: while reading files or updating the database the cores sit idle, so having 5-6-7 threads will probably still show an improvement. – MeBigFatGuy Apr 09 '11 at 15:59

Use batch statements to execute multiple inserts at once, rather than one INSERT at a time (see the sketch below).

In addition, I would remove the part of your algorithm that updates the word count after each insert into the words table; instead, calculate all of the word counts once inserting the words is complete.
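
For the batching suggestion, a sketch of what it could look like with the question's countInsert statement and read loop (line, input, data, and id come from the question; the batch size of 1000 is an arbitrary pick):

int batchSize = 1000;
int pending = 0;
while ((line = input.readLine()) != null) {
    String[] data = line.split("\t");
    countInsert.setLong(1, id);
    countInsert.setInt(2, Integer.parseInt(data[1]));
    countInsert.setInt(3, Integer.parseInt(data[2]));
    countInsert.setInt(4, Integer.parseInt(data[3]));
    countInsert.setInt(5, Integer.parseInt(data[4]));
    countInsert.addBatch();            // queue the row instead of executing it
    if (++pending == batchSize) {
        countInsert.executeBatch();    // one round trip for the whole batch
        pending = 0;
    }
}
if (pending > 0) {
    countInsert.executeBatch();        // flush the remainder
}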

matt b

Another approach would be to do bulk inserts rather than single inserts. See the question "What's the fastest way to do a bulk insert into Postgres?" for more information.
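
With the PostgreSQL JDBC driver, bulk insert usually means the COPY command, exposed through the CopyManager API. A rough sketch, assuming the files are first loaded into a staging table (raw_1grams is a hypothetical name) whose five columns match the tab-separated input, with c being the connection from the question:

import org.postgresql.PGConnection;
import org.postgresql.copy.CopyManager;

CopyManager copy = ((PGConnection) c).getCopyAPI();
for (int i = 0; i < 10; i++) {
    String file = "./googlebooks-eng-all-1gram-20090715-" + i + ".csv";
    FileReader in = new FileReader(file);
    // The 1-gram files are tab-separated, which matches COPY's default text format.
    copy.copyIn("COPY raw_1grams (word, year, total_count, total_pages, total_books) FROM STDIN", in);
    in.close();
}

The words and wordcounts tables can then be filled from the staging table with set-based INSERT ... SELECT statements on the server, instead of one round trip per line.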

btlog

Create threads, one per file:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

public class ToPostgres {
    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i = 0; i < 10; i++) {
            files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
        }
        // One thread per file. Each thread opens its own connection and
        // prepares its own statements, because JDBC connections and
        // prepared statements are not safe to share between threads.
        for (String filename : files) {
            new MyThread(filename).start();
        }
    }
}

class MyThread extends Thread {
    private final String file;

    public MyThread(String file) {
        this.file = file;
    }

    @Override
    public void run() {
        try {
            Connection c = DriverManager.getConnection(
                    "jdbc:postgresql://localhost/googlebooks", "postgres", "xxxxxx");
            PreparedStatement wordInsert = c.prepareStatement(
                "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
            );
            PreparedStatement countInsert = c.prepareStatement(
                "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                "VALUES (?,?,?,?,?)"
            );
            // Per-thread cache of the last word's generated key.
            String lastWord = "";
            Long lastId = -1L;
            BufferedReader input = new BufferedReader(new FileReader(new File(file)));
            String line;
            while ((line = input.readLine()) != null) {
                String[] data = line.split("\t");
                Long id = -1L;
                if (lastWord.equals(data[0])) {
                    id = lastId;
                } else {
                    wordInsert.setString(1, data[0]);
                    wordInsert.executeUpdate();
                    ResultSet resultSet = wordInsert.getGeneratedKeys();
                    if (resultSet != null && resultSet.next()) {
                        id = resultSet.getLong(1);
                    }
                }
                countInsert.setLong(1, id);
                countInsert.setInt(2, Integer.parseInt(data[1]));
                countInsert.setInt(3, Integer.parseInt(data[2]));
                countInsert.setInt(4, Integer.parseInt(data[3]));
                countInsert.setInt(5, Integer.parseInt(data[4]));
                countInsert.executeUpdate();
                lastWord = data[0];
                lastId = id;
            }
            input.close();
            c.close();
        } catch (NumberFormatException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
Nirmal- thInk beYond