I have a large list containing binary encoded strings that I used to process in a single function before, like so:
""" just included this to demonstrate the 'data' structure """
data=np.zeros(250,dtype='float32, (250000,2)float32')
def func numpy_array(data, peaks):
rt_counter=0
for x in peaks:
if rt_counter %(len(peaks)/20) == 0:
update_progress()
peak_counter=0
data_buff=base64.b64decode(x)
buff_size=len(data_buff)/4
unpack_format=">%dL" % buff_size
index=0
for y in struct.unpack(unpack_format,data_buff):
buff1=struct.pack("I",y)
buff2=struct.unpack("f",buff1)[0]
if (index % 2 == 0):
data[rt_counter][1][peak_counter][0]=float(buff2)
else:
data[rt_counter][1][peak_counter][1]=float(buff2)
peak_counter+=1
index+=1
rt_counter+=1
I have been reading up on multiprocessing and figured that I wanted to try that to see if I could get a big increase in performance, I rewrote my function into 2 (helper and 'caller') like so:
def numpy_array(data, peaks):
processors=mp.cpu_count #Might as well throw this directly in the mp.Pool (just for clarity for now)
pool = mp.Pool(processes=processors)
chunk_size=len(peaks)/processors
for i in range(processors):
counter = i*chunk_size
chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
pool.map(decode(data,chunk,counter))
def decode(data,chunk,counter):
for x in chunk:
peak_counter=0
data_buff=base64.b64decode(x)
buff_size=len(data_buff)/4
unpack_format=">%dL" % buff_size
index=0
for y in struct.unpack(unpack_format,data_buff):
buff1=struct.pack("I",y)
buff2=struct.unpack("f",buff1)[0]
if (index % 2 == 0):
data[counter][1][peak_counter][0]=float(buff2)
else:
data[counter][1][peak_counter][1]=float(buff2)
peak_counter+=1
index+=1
print data[counter][1][10][0]
counter+=1
The program runs but only uses 100-110% of CPU (according to top) and once it should be finished it throws TypeError: map() takes at least 3 arguments (2 given)
at me, could anyone with some more experience with multiprocess give me a hint as to what things to look out for (that could cause the TypeError)? What might be causing my low cpu usage?
-- Code after incorporating answers --
def decode((data,chunk,counter)):
print len(chunk), counter
for x in chunk:
peak_counter=0
data_buff=base64.b64decode(x)
buff_size=len(data_buff)/4
unpack_format=">%dL" % buff_size
index=0
for y in struct.unpack(unpack_format,data_buff):
buff1=struct.pack("I",y)
buff2=struct.unpack("f",buff1)[0]
if (index % 2 == 0):
data[counter][1][peak_counter][0]=float(buff2)
else:
data[counter][1][peak_counter][1]=float(buff2)
peak_counter+=1
index+=1
counter+=1
def numpy_array(data, peaks):
"""Fills the NumPy array 'data' with m/z-intensity values acquired
from b64 decoding and unpacking the binary string read from the
mzXML file, which is stored in the list 'peaks'.
The m/z values are assumed to be ordered without validating this
assumption.
Note: This function uses multi-processing
"""
processors=mp.cpu_count()
pool = mp.Pool(processes=processors)
chunk_size=int(len(peaks)/processors)
map_parameters=[]
for i in range(processors):
counter = i*chunk_size
chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
map_parameters.append((data,chunk,counter))
pool.map(decode,map_parameters)
This latest version 'works' so far that it fills the array in the processes (where the array contains values) but once all processes are done accessing the array yields zero values only because each process gets a local copy of the array.