If you want to do this purely in Python, one approach is given below. It's possible that you won't need to chunk the list at all, i.e. that you could hold all of the files in memory at once rather than processing them in batches. It's also possible that the files are radically different sizes, in which case you'd need something more sophisticated than fixed-size batches to avoid building an in-memory file object that exceeds your RAM (a rough size-aware sketch follows the main example). Or you might choose to do it as 2000 separate transactions, but I suspect some kind of batching will be faster (untested).
import csv
import io
import os
import psycopg2
CSV_DIR = 'the_csv_folder/' # Relative path here, might need to be an absolute path
def chunks(l, n):
    """
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]
# Get a list of all the CSV files in the directory
all_files = os.listdir(CSV_DIR)
# Chunk the list of files. Let's go with 100 files per chunk, can be changed
chunked_file_list = chunks(all_files, 100)
# Iterate the chunks and aggregate the files in each chunk into a single
# in-memory file
for chunk in chunked_file_list:
    # This is the file to aggregate into
    string_buffer = io.StringIO()
    csv_writer = csv.writer(string_buffer)

    for file in chunk:
        with open(os.path.join(CSV_DIR, file)) as infile:
            reader = csv.reader(infile)
            # Transfer the parsed rows to the aggregated in-memory file
            csv_writer.writerows(reader)
    # Rewind the buffer so copy_from reads it from the beginning
    string_buffer.seek(0)

    # Now we have aggregated the chunk, copy the file to Postgres
    with psycopg2.connect(dbname='the_database_name',
                          user='the_user_name',
                          password='the_password',
                          host='the_host') as conn:
        c = conn.cursor()

        # Headers need to be the table field names, in the order they appear
        # in the csv
        headers = ['first_name', 'last_name', ...]

        # Now upload the data as though it was a file
        c.copy_from(string_buffer, 'the_table_name', sep=',', columns=headers)
        conn.commit()
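One caveat on the copy step: copy_from sends the buffer through Postgres's plain-text COPY format, so sep=',' will break on quoted fields that contain embedded commas or newlines. If your CSVs only ever hold simple unquoted values it's fine; if not, a hedged alternative is cursor.copy_expert with FORMAT csv, roughly like this in place of the headers/copy_from lines above (same placeholder table and column names as the example):

# Sketch only: let Postgres parse the buffer as real CSV so quoting is
# handled server-side. Extend the column list to match your table.
copy_sql = (
    "COPY the_table_name (first_name, last_name) "
    "FROM STDIN WITH (FORMAT csv)"
)
c.copy_expert(copy_sql, string_buffer)
conn.commit()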
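And as mentioned at the top, if the file sizes vary wildly, a fixed count of files per chunk may not be the right unit. Here is an untested sketch of size-aware batching (chunks_by_size and max_bytes are my own names, and it assumes on-disk size is a fair proxy for in-memory size) that yields batches of filenames compatible with the loop above:

import os

def chunks_by_size(directory, filenames, max_bytes):
    """Group filenames so each batch stays under roughly max_bytes on disk."""
    batch, batch_bytes = [], 0
    for name in filenames:
        size = os.path.getsize(os.path.join(directory, name))
        # Start a new batch once adding this file would blow the byte budget
        if batch and batch_bytes + size > max_bytes:
            yield batch
            batch, batch_bytes = [], 0
        batch.append(name)
        batch_bytes += size
    if batch:
        yield batch

# Drop-in replacement for the count-based chunks() call above,
# e.g. roughly 100 MB of raw CSV per batch:
# chunked_file_list = list(chunks_by_size(CSV_DIR, all_files, 100 * 1024 * 1024))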