The code you are using is very inefficient because you are committing your data one row at a time. That is what you want for a transactional DB or process, but not for a one-off dump.
There are a number of ways to speed this up, ranging from great to not so great. Here are 4 approaches, including the naive implementation (above):
#!/usr/bin/env python
import pandas as pd
import numpy as np
import odo
import profilehooks
import sqlalchemy
import csv
import os

def create_test_data():
    n = 100000
    df = pd.DataFrame(dict(
        id=np.random.randint(0, 1000000, n),
        col1=np.random.choice(['hello', 'world', 'python', 'large string for testing ' * 10], n),
        col2=np.random.randint(-1000000, 1000000, n),
        col3=np.random.randint(-9000000, 9000000, n),
        col4=(np.random.random(n) - 0.5) * 99999
    ), columns=['id', 'col1', 'col2', 'col3', 'col4'])
    df.to_csv('tmp.csv', index=False)

@profilehooks.timecall
def using_pandas(table_name, uri):
    df = pd.read_csv('tmp.csv')
    df.to_sql(table_name, con=uri, if_exists='append', index=False)


@profilehooks.timecall
def using_odo(table_name, uri):
    odo.odo('tmp.csv', '%s::%s' % (uri, table_name))

@profilehooks.timecall
def using_cursor(table_name, uri):
    engine = sqlalchemy.create_engine(uri)
    query = 'INSERT INTO {} (id, col1, col2, col3, col4) VALUES(%s, %s, %s, %s, %s)'
    query = query.format(table_name)

    con = engine.raw_connection()
    with con.cursor() as cursor:
        with open('tmp.csv') as fh:
            reader = csv.reader(fh)
            next(reader)  # Skip first line (headers)

            # Naive approach: one INSERT and one commit per row
            for row in reader:
                cursor.execute(query, row)
                con.commit()
    con.close()

@profilehooks.timecall
def using_cursor_correct(table_name, uri):
    engine = sqlalchemy.create_engine(uri)
    query = 'INSERT INTO {} (id, col1, col2, col3, col4) VALUES(%s, %s, %s, %s, %s)'
    query = query.format(table_name)

    with open('tmp.csv') as fh:
        reader = csv.reader(fh)
        next(reader)  # Skip first line (headers)
        data = list(reader)

    # Bulk approach: hand all rows to SQLAlchemy in a single call (executemany)
    engine.execute(query, data)

def main():
    uri = 'mysql+pymysql://root:%s@localhost/test' % os.environ['pass']
    engine = sqlalchemy.create_engine(uri)

    # Recreate one target table per approach
    for i in (1, 2, 3, 4):
        engine.execute("DROP TABLE IF EXISTS table%s" % i)
        engine.execute("""
            CREATE TABLE table%s(
                id INT,
                col1 VARCHAR(255),
                col2 INT,
                col3 INT,
                col4 DOUBLE
            );
        """ % i)

    create_test_data()

    using_odo('table1', uri)
    using_pandas('table4', uri)
    using_cursor_correct('table3', uri)
    using_cursor('table2', uri)

    # Sanity check: all four tables should contain 100000 rows
    for i in (1, 2, 3, 4):
        count = pd.read_sql('SELECT COUNT(*) as c FROM table%s' % i, con=uri)['c'][0]
        print("Count for table%s - %s" % (i, count))


if __name__ == '__main__':
    main()
The odo method is the fastest (it uses MySQL's LOAD DATA INFILE under the hood; see the sketch after this list).
Next is pandas (its critical code paths are optimized).
Next is using a raw cursor, but inserting all rows in one bulk statement.
Last is the naive method, committing one row at a time.
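For comparison, here is roughly the kind of statement odo ends up issuing. This is only a sketch, not odo's actual code: the helper name using_load_data_infile is made up, and it assumes a MySQL server that allows LOAD DATA LOCAL INFILE plus the pymysql driver with its local_infile connection flag enabled.

def using_load_data_infile(table_name, uri):
    # Sketch: load the CSV in a single server-side bulk operation.
    # Assumes local_infile is allowed on both the server and the client (pymysql).
    engine = sqlalchemy.create_engine(uri, connect_args={'local_infile': True})
    con = engine.raw_connection()
    with con.cursor() as cursor:
        cursor.execute("""
            LOAD DATA LOCAL INFILE 'tmp.csv'
            INTO TABLE {}
            FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
            LINES TERMINATED BY '\\n'
            IGNORE 1 LINES
            (id, col1, col2, col3, col4)
        """.format(table_name))
    con.commit()
    con.close()

In practice odo takes care of these details (escaping, dialect differences) for you, which is why it is the easiest way to get this kind of speed.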
Here are some example timings, run against a local MySQL server.
using_odo (./test.py:29):
    0.516 seconds
using_pandas (./test.py:23):
    3.039 seconds
using_cursor_correct (./test.py:50):
    12.847 seconds
using_cursor (./test.py:34):
    43.470 seconds
Count for table1 - 100000
Count for table2 - 100000
Count for table3 - 100000
Count for table4 - 100000
As you can see, the naive implementation is roughly 85 times slower than odo and about 14 times slower than pandas.
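If you want to stay with pandas but claw back some of that difference, newer pandas versions can also batch rows per INSERT. This is just a sketch, assuming pandas 0.24 or later (where to_sql gained the method argument); it is not included in the timings above:

df = pd.read_csv('tmp.csv')
df.to_sql('table4', con=uri, if_exists='append', index=False,
          chunksize=10000, method='multi')  # multi-row INSERTs, 10000 rows per chunk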