I have a Python script that needs to run a loop about 50,000 times. I defined a separate function to perform the data processing, so that the function's local variables are freed from memory once it returns. However, although the total size of all objects in my script is less than about 300 MB, the total memory consumption reported by “top” grows to several GB after only a few hundred iterations. A sample script is given below:
from random import randint
from copy import deepcopy
class dataHolder(object):
"""
This is a sample class for holding data.
The actual class in my script holds lots of
different data types
"""
def __init__(self,x,y):
self.x = x
self.y = y
def sub_process(data,x_change , y_change):
    # This function processes its input arguments and writes the results
    # into a text file. The following operations are just a sample of what I have in my actual script
# create a random index
x_ind = randint(1,len(data.x)) - 1
y_ind = randint(1,len(data.y)) - 1
data.x[x_ind] += x_change
data.y[y_ind] += y_change
# Write the results into file
with open('test.txt','w') as f:
f.write('x[%i] = %i y[%i] = %i\n' % (x_ind,data.x[x_ind],y_ind,data.y[y_ind]))
def master_func(start_pos):
# This is an example of the main function in my actual script,
# which creates data used as input arguments
# for function sub_process().
# The approximate size of the input data
    # for sub_process() is 320 MB in my actual script,
# so in the following example I create data objects with
# approximately the same size. The following data are constant
# for all iterations
mydata = dataHolder(x = range(5000000), y = range(start_pos,5000000 + start_pos))
# This is the main loop that must be run 50000 times
for k in range(50000):
# x_change and y_change vary from one iteration to another
x_change = randint(1,10)
y_change = randint(1,10)
# Perform the data processing in a separate function
sub_process(data = deepcopy(mydata), x_change = x_change, y_change = y_change)
if __name__ == "__main__":
master_func(start_pos = 2)
Following the suggestion given here, I am trying to use sub-processes to resolve the memory issue, but I am not quite sure how to put it into the context of this particular problem. Any suggestions are greatly appreciated.
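To see the growth from inside the script rather than by watching top, a small helper like the sketch below can be called once per iteration of the main loop; it reads the same resident-set-size figure that top reports. It is Linux-specific (it parses /proc/self/status), and the name print_rss is only for illustration:

def print_rss(label):
    # Current resident set size of this process, read from the
    # VmRSS line of /proc/self/status (reported in kB, Linux only)
    with open('/proc/self/status') as f:
        for line in f:
            if line.startswith('VmRSS:'):
                rss_kb = int(line.split()[1])
                print('%s: RSS = %.1f MB' % (label, rss_kb / 1024.0))
                break

# Example usage inside the loop of master_func():
#     sub_process(data = deepcopy(mydata), x_change = x_change, y_change = y_change)
#     print_rss('iteration %i' % k)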
EDIT: The following modification of the sample code above resolved the memory issue:
from random import randint
from copy import deepcopy
from multiprocessing import Process
class dataHolder(object):
def __init__(self,x,y):
self.x = x
self.y = y
def sub_process(input_data):
data = deepcopy(input_data['data'])
x_change = input_data['x_change']
y_change = input_data['y_change']
x_ind = randint(1,len(data.x)) - 1
y_ind = randint(1,len(data.y)) - 1
    data.x[x_ind] += x_change
    data.y[y_ind] += y_change
with open('test.txt','w') as f:
f.write('x[%i] = %i y[%i] = %i\n' % (x_ind,data.x[x_ind],y_ind,data.y[y_ind]))
def master_func(start_pos):
input_data = {}
input_data['data'] = dataHolder(x = range(5000000), y = range(start_pos,5000000 + start_pos))
# This is the main loop that must be run 50000 times
for k in range(50000):
        input_data['x_change'] = randint(1,10)
input_data['y_change'] = randint(1,10)
p = Process(target = sub_process, args = (input_data,))
p.start()
p.join()
if __name__ == "__main__":
master_func(start_pos = 2)
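This works, but it starts and joins a new process for every single iteration, which adds noticeable overhead over 50,000 runs. A possible refinement (untested, and assuming the iterations are independent of each other) is to hand each child process a batch of iterations, so memory is still released every time a child exits while the fork/join cost is paid far less often. The names sub_process_batch and master_func_batched and the batch size of 500 are only illustrative:

from random import randint
from copy import deepcopy
from multiprocessing import Process

class dataHolder(object):
    def __init__(self,x,y):
        self.x = x
        self.y = y

def sub_process_batch(input_data, changes):
    # Process a whole batch of (x_change, y_change) pairs inside one
    # child process; its memory is returned to the OS when it exits
    data = deepcopy(input_data['data'])
    for x_change, y_change in changes:
        x_ind = randint(1,len(data.x)) - 1
        y_ind = randint(1,len(data.y)) - 1
        data.x[x_ind] += x_change
        data.y[y_ind] += y_change
        with open('test.txt','w') as f:
            f.write('x[%i] = %i y[%i] = %i\n' % (x_ind,data.x[x_ind],y_ind,data.y[y_ind]))

def master_func_batched(start_pos, batch_size = 500):
    input_data = {}
    input_data['data'] = dataHolder(x = range(5000000), y = range(start_pos,5000000 + start_pos))
    # The 50000 iterations are split into batches of batch_size
    for start in range(0, 50000, batch_size):
        changes = [(randint(1,10), randint(1,10)) for _ in range(batch_size)]
        p = Process(target = sub_process_batch, args = (input_data, changes))
        p.start()
        p.join()

if __name__ == "__main__":
    master_func_batched(start_pos = 2)

On Linux the child receives input_data through fork, so the large data object is not serialized for every batch; with the spawn start method (e.g. on Windows) it would be pickled and sent to each child, which would be much slower.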