I want to shuffle values in a 3D numpy
-array, but only when they are > 0.
When I run my function with a single core, it is much faster than with even 2 cores. It is way beyond the overhead of creating new python processes. What am I missing?
The following code outputs:
random shuffling of markers started
time in serial execution: 1.0288s
time executing in parallel with num_cores=1: 0.9056s
time executing in parallel with num_cores=2: 273.5253s
import numpy as np
import time
from random import shuffle
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
def randomizeVoxels(V,markerLUT):
V_rand=V.copy()
# the xyz naming here does not match outer convention, which will depend on permutation
for ix in range(V.shape[0]):
for iy in range(V.shape[1]):
if V[ix,iy]>0:
V_rand[ix,iy]=markerLUT[V[ix,iy]]
return V_rand
V_ori=np.arange(1000000,-1000000,-1).reshape(100,100,200)
V_rand=V_ori.copy()
listMarkers=np.unique(V_ori)
listMarkers=[val for val in listMarkers if val>0]
print("random shuffling of markers started\n")
reassignedMarkers=listMarkers.copy()
#random shuffling of original markers
shuffle(reassignedMarkers)
markerLUT={}
for i,iMark in enumerate(listMarkers):
markerLUT[iMark]=reassignedMarkers[i]
tic=time.perf_counter()
for ix in range(len(V_ori)):
for iy in range(len(V_ori[0])):
for iz in range(len(V_ori[0][0])):
if V_ori[ix,iy,iz]>0:
V_rand[ix,iy,iz]=markerLUT[V_ori[ix,iy,iz]]
toc=time.perf_counter()
print("time in serial execution: \t\t\t{: >4.4f} s".format(toc-tic))
#######################################################################3
num_cores = 1
V_rand=V_ori.copy()
tic=time.perf_counter()
results= Parallel(n_jobs=num_cores)\
(delayed(randomizeVoxels)\
(V_ori[imSlice,:,:],
markerLUT
)for imSlice in range(V_ori.shape[0]))
for i,resTuple in enumerate(results):
V_rand[i,:,:]=resTuple
toc=time.perf_counter()
print("time executing in parallel with num_cores={}:\t{: >4.4f} s".format(num_cores,toc-tic))
num_cores = 2
V_rand=V_ori.copy()
MASK = "time executing in parallel with num_cores={}:\t {: >4.4f}s"
tic=time.perf_counter() #----------------------------- [PERF-me]
results= Parallel(n_jobs=num_cores)\
(delayed(randomizeVoxels)\
(V_ori[imSlice,:,:],
markerLUT
)for imSlice in range(V_ori.shape[0]))
for i,resTuple in enumerate(results):
V_rand[i,:,:]=resTuple
toc=time.perf_counter() #----------------------------- [PERF-me]
print( MASK.format(num_cores,toc-tic) )