I have a set of files in numerically labelled folders and I am trying to collate and extract statistics on the included data. My minimal code is:
import psutil as psutil
import sys as sys
import pandas as pd
import os as os
import glob as glob
import numpy as np
import matplotlib.pyplot as plt
@profile
def main():
temp=[]
strj=pd.DataFrame()
ener_list=[]
ref_ener_list=[]
main_dir="./"
column_list=["1","2","3","4","5","6","7"]
f1_list=["part1","part2"]
for folder in sorted(glob.iglob(main_dir+"/0????")):
print(folder)
print(psutil.Process().memory_info().rss / (1024 * 1024))
for fld2 in sorted(glob.iglob(folder+"/run?????")):
strj=pd.DataFrame()
for fld1 in f1_list:
for data_file in sorted(glob.iglob(fld2+"/prdv"+fld1+"/track.txt")):
temp=pd.read_table(data_file, comment="#",delim_whitespace=True,names=column_list)
strj=pd.concat([strj, temp])
del(temp)
ener_list.append(strj.values.tolist())
del(strj)
print(np.shape(ener_list))
avg_ener_list=(np.array(ener_list,dtype=object)).mean(axis=0)
avg_ener_df=pd.DataFrame(avg_ener_list, columns=column_list)
print(avg_ener_df,np.shape(avg_ener_df))
main()
exit()
For the total dataset, I have 50 folders with 1000 subfolders with 2 parts each. The size of the individual file can be either 5.6Kb(small) or 320Kb(large). When I tried running my code over all folders I noticed that I had used over 28GB memory causing the program to killed while halfway through the data extraction section. I used the memory_profiler tool to track the memory leakage but I couldn't really figure out what changes I need to make. I am inexperienced in this scenario as I haven't had to deal with memory issues before. I tried tracking memory used by variables as well but there was nothing weird there. Are there some other aspects I am overlooking here, or should I change the way I am extracting data here?
Line # Mem usage Increment Occurrences Line Contents
=============================================================
23 99.961 MiB 99.961 MiB 1 @profile
24 def main():
25 99.961 MiB 0.000 MiB 1 temp=[]
26 99.961 MiB 0.000 MiB 1 strj=pd.DataFrame()
27
28 99.961 MiB 0.000 MiB 1 ener_list=[]
29 99.961 MiB 0.000 MiB 1 ref_ener_list=[]
30 99.961 MiB 0.000 MiB 1 main_dir="./"
31 99.961 MiB 0.000 MiB 1 column_list=["1","2","3","4","5","6","7"]
32 #f_list=["part1","part2","part3"]
33 99.961 MiB 0.000 MiB 1 f1_list=["part1","part2"]
34 99.961 MiB 0.000 MiB 1 f2_list=["part1"]
35
36
37 8065.902 MiB 0.000 MiB 10 for folder in sorted(glob.iglob(main_dir+"/0????")):
38 7181.180 MiB 0.000 MiB 9 print(folder)
39 7181.180 MiB 0.000 MiB 9 print(psutil.Process().memory_info().rss / (1024 * 1024))
40 8065.902 MiB -0.527 MiB 9009 for fld2 in sorted(glob.iglob(folder+"/run?????")):
41 8065.020 MiB -0.527 MiB 9000 strj=pd.DataFrame()
42 8065.133 MiB -4.262 MiB 27000 for fld1 in f1_list:
43 8065.133 MiB -3.449 MiB 36000 for data_file in sorted(glob.iglob(fld2+"/prdv"+fld1+"/track.txt")):
44 #pass
45 8066.078 MiB 9237.312 MiB 18000 temp=pd.read_table(data_file, comment="#",delim_whitespace=True,names=column_list)
46 8066.078 MiB -8199.547 MiB 18000 strj=pd.concat([strj, temp])
47 8065.133 MiB -16399.094 MiB 18000 del(temp)
48 #strj=pd.concat([strj, pd.read_table(data_file, comment="#",delim_whitespace=True,names=column_list)])
49 #temp.append(pd.read_csv(data_file, delim_whitespace=True, skiprows=1))
50 8065.902 MiB 6923.656 MiB 9000 ener_list.append(strj.values.tolist())
51 8065.902 MiB -0.270 MiB 9000 del(strj)
52 #print(sys.getsizeof(strj)/(1024*1024), sys.getsizeof(ener_list)/(1024*1024))
53 #print(ener_list,np.shape(ener_list))
54 8067.801 MiB 1.898 MiB 1 print(np.shape(ener_list))
55
56 8067.926 MiB 0.125 MiB 1 avg_ener_list=(np.array(ener_list,dtype=object)).mean(axis=0)
57 8067.926 MiB 0.000 MiB 1 avg_ener_df=pd.DataFrame(avg_ener_list, columns=column_list)
58
59 8068.469 MiB 0.543 MiB 1 print(avg_ener_df,np.shape(avg_ener_df))