This may not be the most efficient way, but give it a go. Reduce or increase the chunk size depending on your RAM availability (one rough way to pick a starting value is sketched after the snippet).
import pandas as pd

# Read the file in fixed-size chunks, then stitch the pieces back together.
chunks = pd.read_csv('report_OOP_Full.csv', chunksize=10000)

chunk_list = []
for chunk in chunks:
    chunk_list.append(chunk)

df = pd.concat(chunk_list, sort=True)
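If you want an automated starting point for the chunk size instead of guessing, here is one rough sketch. It assumes the third-party psutil package is installed, and both row_bytes and the safety divisor are arbitrary guesses, not rules:

import psutil

# Available RAM in bytes; budget only a fraction of it for the reader.
available = psutil.virtual_memory().available
row_bytes = 1000  # rough guess at the in-memory size of one row
chunksize = max(10000, available // (row_bytes * 20))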
If that doesn't work, try this:
chunks = pd.read_csv('report_OOP_Full.csv', chunksize=10000)

i = 0
chunk_list = []
for chunk in chunks:
    if i >= 10:
        break
    i += 1
    chunk_list.append(chunk)

df1 = pd.concat(chunk_list, sort=True)

# skiprows=range(1, 100001) skips the first 100,000 data rows (10 chunks
# of 10,000) while keeping row 0, the header, so column names survive.
chunks = pd.read_csv('report_OOP_Full.csv', skiprows=range(1, 100001), chunksize=10000)

i = 0
chunk_list = []
for chunk in chunks:
    if i >= 10:
        break
    i += 1
    chunk_list.append(chunk)

df2 = pd.concat(chunk_list, sort=True)

df3 = pd.concat([df1, df2], sort=True)
skiprows was calculated from how many rows the previous pass had already read: 10 chunks of 10,000 rows each, i.e. 100,000 rows. The first loop breaks after 10 chunks are loaded and stores them as df1; the second read then starts at chunk 11, stores the next 10 chunks as df2, and the two are concatenated. If you want this pattern wrapped up, see the sketch below.
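As a minimal sketch of the same idea (read_row_block is a hypothetical helper, not part of pandas), you can keep the header while skipping already-read rows like this:

import pandas as pd

def read_row_block(path, start_row, n_rows):
    # Skip the data rows that were already read, but keep row 0 (the header).
    return pd.read_csv(path, skiprows=range(1, start_row + 1), nrows=n_rows)

df1 = read_row_block('report_OOP_Full.csv', 0, 100000)       # data rows 1-100,000
df2 = read_row_block('report_OOP_Full.csv', 100000, 100000)  # data rows 100,001-200,000
df3 = pd.concat([df1, df2], sort=True)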
I understand that you're working with some big data, so I encourage you to take a look at this function I found; the link below explains how it works.
Credit for this function is here: credit
import numpy as np

def reduce_mem_usage(df):
    """Downcast each numeric column to the smallest dtype that holds its values."""
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:  # skip string/object columns
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest signed or unsigned integer type that fits.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            else:
                # Floats: float16 saves the most memory but costs precision.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
This will make sure your dataframe uses as little memory as possible while you're working with it.
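You can combine it with the chunked read from above, shrinking each chunk before concatenating so peak memory stays low. A sketch, assuming reduce_mem_usage is defined as above (note that if different chunks downcast to different dtypes, concat will widen them back to a common type):

import pandas as pd

chunk_list = []
for chunk in pd.read_csv('report_OOP_Full.csv', chunksize=10000):
    # Downcast each chunk before it goes into the list.
    chunk_list.append(reduce_mem_usage(chunk))

df = pd.concat(chunk_list, sort=True)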