It takes ≈105 seconds to insert 1 million rows (≈9,500 rows/second) into a local PostgreSQL table with 2 indexes and 4 columns. Is that slow or fast?
Python Code:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from sqlalchemy import create_engine

num = 32473068      # total number of rows in app_events.csv
batch = 1000000     # rows read from the CSV per iteration

def main(data):
    # NOTE: a new engine (and connection pool) is created for every chunk;
    # host and tbl_name are defined elsewhere, credentials are redacted
    engine = create_engine('postgresql://***:***@' + host + ':5432/kaggle')
    data.to_sql(con=engine, name=tbl_name, if_exists='append', index=False)

for i in range(0, num, batch):
    # read the CSV in 1M-row slices
    data = pd.read_csv(data_path + 'app_events.csv', skiprows=i, nrows=batch)
    data.columns = ['event_id', 'app_id', 'is_installed', 'is_active']
    data = data.reset_index(drop=True)

    # split the slice into 10k-row chunks and insert them from 30 threads
    batchSize = 10000
    batchList = [data.iloc[x:x + batchSize].reset_index(drop=True)
                 for x in range(0, len(data), batchSize)]
    with ThreadPoolExecutor(max_workers=30) as executor:
        futures = [executor.submit(main, d) for d in batchList]
        for future in as_completed(futures):
            future.result()   # propagate any exception raised in a worker
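For comparison: INSERT-based loading through to_sql pays per-statement overhead plus maintenance of both indexes for every row, and PostgreSQL's COPY protocol usually loads the same data several times faster. Below is a minimal sketch of a COPY-based loader using psycopg2's copy_expert; the connection details are placeholders mirroring the question's setup (credentials redacted), and copy_chunk is a hypothetical helper name:

import io
import pandas as pd
import psycopg2

# Placeholder connection details mirroring the question's setup
conn = psycopg2.connect(host='localhost', port=5432, dbname='kaggle',
                        user='***', password='***')

def copy_chunk(chunk: pd.DataFrame, table: str) -> None:
    # Serialize the chunk to CSV in memory and stream it through COPY,
    # avoiding one INSERT statement per row/batch.
    buf = io.StringIO()
    chunk.to_csv(buf, index=False, header=False)
    buf.seek(0)
    with conn.cursor() as cur:
        cur.copy_expert(
            'COPY ' + table + ' (event_id, app_id, is_installed, is_active) '
            'FROM STDIN WITH (FORMAT csv)',
            buf)
    conn.commit()

# e.g. copy_chunk(data, tbl_name) in place of the to_sql call above

If the table can be taken offline during the load, dropping the two indexes first and recreating them after the bulk load usually helps further, since index updates tend to dominate per-row insert cost. Without switching drivers, pandas' to_sql also accepts method='multi' and a chunksize argument to batch rows into multi-row INSERTs.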