I am trying to export a whole database schema (around 20 GB) from PostgreSQL into a single final HDF5 file.
Because the data doesn't fit in my computer's memory, I am using the chunksize argument.
First I use this function to establish the connection:
def make_connectstring(prefix, db, uname, passa, hostname, port):
    """return an sql connectstring"""
    connectstring = prefix + "://" + uname + ":" + passa + "@" + hostname + \
                    ":" + port + "/" + db
    return connectstring
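As an aside, building the connect string by concatenation breaks if the password contains characters like @ or /. Here is a sketch of the same helper using SQLAlchemy's URL API (assuming SQLAlchemy 1.4+, where sqlalchemy.engine.URL.create is available; create_engine accepts the resulting URL object directly):

from sqlalchemy.engine import URL

def make_connect_url(prefix, db, uname, passa, hostname, port):
    """Build a connect URL with each component escaped properly."""
    return URL.create(
        drivername=prefix,   # e.g. "postgresql"
        username=uname,
        password=passa,      # special characters are escaped automatically
        host=hostname,
        port=int(port),
        database=db,
    )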
Then I create a temporary folder and save each chunk as its own HDF5 file:
import tempfile

import pandas as pd
import sqlalchemy

def query_to_hdf5(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # write each chunk of the query result to its own temporary HDF5 file;
    # mkdtemp keeps the directory alive after the function returns, so the
    # chunk files still exist when we merge them (tempfile.TemporaryDirectory
    # would delete them as soon as its with-block exits)
    td = tempfile.mkdtemp()
    paths_chunks = []
    for i, df in enumerate(pd.read_sql_query(sql=query, con=engine,
                                             chunksize=chunksize)):
        path = td + "/chunk" + str(i) + ".hdf5"
        df.to_hdf(path, key='data')
        if verbose:
            print("wrote", path)
        paths_chunks.append(path)
    return paths_chunks
connectstring = make_connectstring(prefix, db, uname, passa, hostname, port)
query = "SELECT * FROM public.zz_ges"
paths_chunks = query_to_hdf5(connectstring, query)
What is the best way to merge all these files into a single file that represents the whole DataFrame?
I tried something like this:
df = pd.DataFrame()
for path in paths_chunks:
    df_scratch = pd.read_hdf(path)
    df = pd.concat([df, df_scratch])
    if verbose:
        print("read", path)
However, memory usage climbs very quickly, because each pd.concat copies the entire accumulated DataFrame. I need something more memory-efficient, such as the append-to-store approach sketched below.
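One memory-friendly option is to append each chunk file to a single on-disk HDFStore, so only one chunk is ever in memory at a time. A minimal sketch, assuming paths_chunks from above and that every chunk shares the same columns (merged.h5 is a hypothetical output name; HDFStore.append writes in the table format, so you may need its min_itemsize argument to reserve room for the longest strings across all chunks):

import pandas as pd

with pd.HDFStore('merged.h5', mode='w') as store:
    for path in paths_chunks:
        df_scratch = pd.read_hdf(path)    # load a single chunk
        store.append('data', df_scratch)  # append it to the on-disk table
        del df_scratch                    # drop the chunk before reading the next

This keeps peak memory at roughly one chunk instead of the whole table.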
Update: I ended up streaming each chunk straight into a single HDFStore instead of writing separate files (make_connectstring is unchanged):
def query_to_df(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # stream the query results straight into one on-disk HDF5 table
    with pd.HDFStore('output.h5', 'w') as store:
        for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
            store.append('data', df)
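Because store.append stores the data in PyTables' table format, the resulting output.h5 can itself be read back incrementally. A short usage sketch (process is a hypothetical per-chunk function):

# iterate over the merged file without loading it all into memory
for chunk in pd.read_hdf('output.h5', key='data', chunksize=50000):
    process(chunk)  # hypothetical: handle one chunk at a time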