I am trying a parallelize a piece of code that looks like follows:
%%cython --compile-args=-fopenmp --link-args=-fopenmp -a
cimport openmp
cimport cython
#cimport openmp
"""
%%cython -a
"""
from cpython cimport array
import array
import cython.parallel as cp
from cython.parallel import parallel, prange
"""stuff"""
@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.
@cython.cdivision(True) # Deactivate division by 0 checking.
cdef function(arr1, arr2):
len_out = 3*length
cdef np.ndarray[np.float64_t, ndim=2] output_arr
output_arr = np.zeros((len_out, 6))
cdef np.ndarray[np.float64_t, ndim=1] current_arr1 = arr1
cdef np.ndarray[np.float64_t, ndim=1] current_arr2 = arr2
cdef np.ndarray[np.int_t, ndim=2] lkupcoord = np.array(lookup)
"""
stuff
"""
#for k in range(length):
for k in prange(0,length,1, nogil=True, schedule = "static",chunksize = 200,num_threads =2):
#Block 1
if "check some conditions":
"do computation"
output_arr[k,0] = val
.
.
.
output_arr[k,5] = val
else:
"do some computation"
output_arr[k,0] = val
.
.
.
output_arr[k,5] = val
#Block 2
if "check some conditions":
"do computation"
output_arr[k+length,0] = val
.
.
.
output_arr[k+length,5] = val
elif:
"do some computation"
output_arr[k+2*length,0] = val
.
.
.
output_arr[k+2*length,5] = val
#Block 3
if "check some conditions":
"do computation"
output_arr[k,0] = val
.
.
.
output_arr[k,5] = val
elif:
"do some computation"
output_arr[k,0] = val
.
.
.
output_arr[k,5] = val
return output_arr
the length is approximate 2000. Using prange is the same or even slightly worse than using regular range. I am not quite sure why. I don't have any python/GIL errors as far as I can tell as it runs.
I suspect that length is not long enough to benefit from prange, and not worth the extra overhead?
I was wondering if Block 1, Block 2 and Block 3 can be launched on separate threads by wrapping into different functions, and then return output_arr? that would in principle give me a 3x speed up?
I am willing to provide exact code, and some sample input/output and annotate cython file if that helps? It would be greatly appreciated if I could get some help speeding this up
Running on cluster as an interactive job using srun in a jupter notebook. requested resources at the time of the job 1 node with 2 cores and 16G of memory.
NodeName=node443 Arch=x86_64 CoresPerSocket=14
CPUAlloc=28 CPUErr=0 CPUTot=28 CPULoad=27.09
AvailableFeatures=node443,centos7
ActiveFeatures=node443,centos7
Gres=(null)
NodeAddr=node443 NodeHostName=node443
OS=Linux 3.10.0-514.26.2.el7.x86_64 #1 SMP Tue Jul 4 15:04:05 UTC 2017
RealMemory=128000 AllocMem=64440 FreeMem=21369 Sockets=2 Boards=1
State=ALLOCATED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=sched_mit_binz
BootTime=2021-08-11T14:47:16 SlurmdStartTime=2021-08-11T14:47:54
CfgTRES=cpu=28,mem=125G,billing=28
AllocTRES=cpu=28,mem=64440M
CapWatts=n/a
CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
arr1 and arr2 are 1d nd.arrays with entries either 0,1, and their sizes are 1700 and 60 entries respectively.
Time with prange, average time per call: 0.00308809373805772
Time without prange, average time per call: 0.0014715276634957217
Timed using from timeit import default_timer as timer
updated code, this are the computations that are being performed inside the loop over k
for k in range(lenpop):
q_c = current_c[k]
i,j = lkupcoord[k]
i_m = current_m[i]
j_m = current_m[j]
if i_m == j_m and i_m == 1.0:
e = 0.0 #some float
else:
e = 0.0 #some float
if fabs(i-j) == 1.0:
bb = 0.0
else:
bb = 1.0
if q_c ==0 and bb == 1.0:
dq = +1.0
q_n = q_c + dq
r = kc
output_arr[k+2*s, 0] = k
output_arr[k+2*s, 1] = i
output_arr[k+2*s, 2] = j
output_arr[k+2*s,3] = dq
output_arr[k+2*s, 4] = r
output_arr[k+2*s, 5] = 0
elif q_c == 1 and bb == 1.0:
dq = -1.0
q_n = q_c + dq
Qc = (mu-(N-1))/Mc
Qn = (mu+dq-(N-1))/Mc
fudge = (
Mc*(Qn*log(Qn + 0.0001) + (1-Qn)*log(1-Qn + 0.0001))
- Mc*(Qc*log(Qc + 0.0001) + (1-Qc)*log(1-Qc + 0.0001))
)
dF = ((100*(Qn-0.5)**4 -25*(Qn-0.5)**2 + e*q_n)-(100*(Qc-0.5)**4 -25*(Qc-0.5)**2 + e*q_c)-fudge
) #some quartic polynomial in general
r = kc*exp(-dF)
output_arr[k+2*s, 0] = k
output_arr[k+2*s, 1] = i
output_arr[k+2*s, 2] = j
output_arr[k+2*s, 3] = dq
output_arr[k+2*s, 4] = r
output_arr[k+2*s, 5] = 0
:
if i_m == 1:
d_im = -1
if bb==0.0:
r = 0.5*c2*i_m + q_c*cy1*i_m*(1-j_m)
else:
r = q_c*cy1*i_m*(1-j_m)
output_arr[k+2*s+1, 0] = k
output_arr[k+2*s+1, 1] = i
output_arr[k+2*s+1, 2] = j
output_arr[k+2*s+1, 3] = d_im
output_arr[k+2*s+1, 4] = r
output_arr[k+2*s+1, 5] = 1
elif i_m == 0:
d_im = +1
if bb==0.0:
r = 0.5*c2*(1-i_m) + q_c*cx1*(1-i_m)*j_m
else:
r = q_c*cx1*(1-i_m)*j_m
output_arr[k+2*s+1, 0] = k
output_arr[k+2*s+1, 1] = i
output_arr[k+2*s+1, 2] = j
output_arr[k+2*s+1, 3] = d_im
output_arr[k+2*s+1, 4] = r
output_arr[k+2*s+1, 5] = 1
if j_m == 1:
d_jm = -1
if bb==0.0:
r = 0.5*c2*j_m + q_c*cy1*j_m*(1-i_m)
else:
r = q_c*cy1*j_m*(1-i_m)
output_arr[k+2*s+2,0] = k
output_arr[k+2*s+2,1] = i
output_arr[k+2*s+2,2] = j
output_arr[k+2*s+2,3] = d_jm
output_arr[k+2*s+2,4] = r
output_arr[k+2*s+2,5] = 2
elif j_m == 0:
d_jm = +1
if bb==0.0:
r = 0.5*c2*(1-j_m) + q_c*cx1*(1-j_m)*i_m
else:
r = q_c*cx1*(1-j_m)*i_m
output_arr[k+2*s+2,0] = k
output_arr[k+2*s+2,1] = i
output_arr[k+2*s+2,2] = j
output_arr[k+2*s+2,3] = d_jm
output_arr[k+2*s+2,4] = r
output_arr[k+2*s+2,5] = 2
s=s+1
output_arr = output_arr[~np.all(output_arr == 0, axis=1)]
return output_arr
sample function([1,0,0,1],[0,1,0,1])