I'm struggling to find where the leak is in this code
kullback.pyx
import numpy as np
cimport numpy as np
from libcpp.vector cimport vector
import scipy.stats as st
import matplotlib.pyplot as plt
cdef vector[double] minmax(double i, dict a):
cdef double minmax
cdef vector[double] out
try:
minmax= min(list(filter(lambda x: x > i, a.keys())))
except ValueError:
minmax = min(a.keys())
cdef double maxmin
try:
maxmin = max(list(filter(lambda x: x < i, a.keys())))
except ValueError:
maxmin = max(a.keys())
out.push_back(minmax)
out.push_back(maxmin)
return out
def KullbackLeibler(args):
cdef np.ndarray[np.double_t, ndim = 1] psample = args[0]
cdef np.ndarray[np.double_t, ndim = 1] qsample = args[1]
cdef int n = args[2]
a = plt.hist(psample, bins = n)
cdef np.ndarray[np.double_t, ndim = 1] ax = a[1]
cdef np.ndarray[np.double_t, ndim = 1] ay = a[0]
b = plt.hist(qsample, bins = ax)
adict = dict(zip(ax, ay))
ax = ax[:-1]
cdef np.ndarray[np.double_t, ndim = 1] bx = b[1]
cdef np.ndarray[np.double_t, ndim = 1] by = b[0]
bdict = dict(zip(bx, by))
bx = bx[:-1]
cdef vector[double] kl
cdef int N = np.sum(ay)
cdef int i
cdef double p_minmax, p_maxmin, q_minmax, q_maxmin
cdef double KL
for i in range(len(psample)):
ptmp = minmax(psample[i], adict)
p_minmax = ptmp[0]
p_maxmin = ptmp[1]
qtmp = minmax(psample[i], bdict)
q_minmax = qtmp[0]
q_maxmin = qtmp[1]
pdensity = adict[p_maxmin]/ N
qdensity = np.max([bdict[q_maxmin]/ N, 10e-20])
KL = pdensity * np.log(pdensity/qdensity)
kl.push_back(KL)
cdef double res = np.sum(kl)
del args, psample, qsample, ax, ay, bx, by, adict, bdict
return res
here the main from which I launch
main.py
import kullback as klcy #@unresolvedimport
import datetime
import numpy as np
import pathos.pools as pp
import objgraph
np.random.seed(10)
ncore = 4
pool = pp.ProcessPool(ncore)
KL = []
for i in range(2500):
time1 = datetime.datetime.now()
n = 500
x = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
y = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
data = np.array(list(zip(x,y,[n/10]*ncore)))
kl = pool.map(klcy.KullbackLeibler, data)
time2 = datetime.datetime.now()
print(i, time2 - time1, sep = " ")
print(objgraph.show_growth())
KL.append(kl)
The function KullbackLeibler
takes as input two arrays and an integer
What I've already tried:
using objgraph to identify growing objects, unfortunately it seems it doesn't work with C-defined arrays (it identifies only the list in which I'm appending the result as growing) Why can't objgraph capture the growth of np.array()?
deleting all the arrays at the end of the pyx function
tried placing a
gc.collect()
call both in the pyx file and in the main file, but nothing has changed
Memory consumption grows linearly with the number of iterations, along with the time required for each iteration (from 0.6s to over 4s). It's my first attempt with cython, any suggestion would be useful.