I have a loop that creates a new dataframe and writes it to a parquet file in S3. It's a batch process that handles about 100,000 rows per iteration.
Once a dataframe has been written out as a parquet file I no longer need it in memory, and I want to delete it and release the space. But the commands I'm running below don't seem to free anything: with every iteration my available memory goes down and the memory consumed goes up.
import gc
import os

import awswrangler as wr
import pandas as pd
import psutil

while True:
    # code snippet to pull data from source
    df_to_write = get_pandas_data_frame(col_name_data_type_map, records)  # create dataframe
    wr.s3.to_parquet(df=df_to_write)  # write to S3

    # check memory usage
    with_df = psutil.virtual_memory().available * 100 / psutil.virtual_memory().total  # percent of total memory still available
    print("memory with df: {}".format(with_df))
    print_memory_usage()

    # delete and release memory
    lst = [df_to_write]
    del df_to_write
    del lst
    gc.collect()
    df_to_write = pd.DataFrame()  # saw an answer that said this can help with releasing memory (it doesn't in my case)

    # check memory usage after deleting
    print_memory_usage()
    without_df = psutil.virtual_memory().available * 100 / psutil.virtual_memory().total
    print("memory without df: {}".format(without_df))
def get_pandas_data_frame(col_name_data_type_map, records):
    table_columns = [item[0] for item in col_name_data_type_map]
    df = pd.DataFrame(records, columns=table_columns)
    for item in col_name_data_type_map:
        if item[1] == 'json':
            df[item[0]] = df[item[0]].astype(dtype=str)  # jsonb columns are converted to strings; one could also flatten them
    return df
def print_memory_usage():
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    print(f'Memory used: {memory_info.rss / 1024 ** 2} MB')
I referenced this question: Delete and release memory of a single pandas dataframe.
The most common approach there seems to be some combination of del df and gc.collect(), but I'm seeing no effect on my end.
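For reference, this is the bare pattern from that question that I tried on its own (just a minimal sketch with a dummy dataframe standing in for my real one):

    import gc

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(100_000, 10))  # stand-in for my real dataframe
    del df        # drop the only reference to the dataframe
    gc.collect()  # force a garbage-collection pass, expecting the memory to be returned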
Is there something else I should be doing?