I have a simple C++ function that I've attempted to wrap with pybind11 (the ehvi3d_sliceupdate function from the KMAC library). It's called deep inside a loop in my Python module, a few hundred thousand to a million times per run. Unfortunately, it seems to be leaking memory (12+ GB after ~700k calls), and I'm not sure what the cause might be. The wrapper that I've compiled looks like this:
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <iostream>
#include <deque>   // for std::deque (may already be pulled in by the KMAC headers)
#include <memory>  // for std::shared_ptr / std::make_shared, used in the update below
#include "helper.h"
#include "ehvi_calculations.h"
#include "ehvi_sliceupdate.h"
namespace py = pybind11;
using namespace std;  // deque, cerr, etc. are used unqualified below
// Copied from main.cc
// Checks if p dominates P. Removes points dominated by p from P and returns the number of points removed.
int checkdominance(deque<individual*> & P, individual* p){
    int nr = 0;
    for (int i = P.size() - 1; i >= 0; i--){
        if (p->f[0] >= P[i]->f[0] && p->f[1] >= P[i]->f[1] && p->f[2] >= P[i]->f[2]){
            cerr << "Individual " << (i+1) << " is dominated or the same as another point; removing." << endl;
            P.erase(P.begin()+i);
            nr++;
        }
    }
    return nr;
}
// Wrap the ehvi3d_sliceupdate function - not sure how to pass straight in
double wrap_ehvi3d_sliceupdate(py::array_t<double> y_par, py::array_t<double> ref_point, py::array_t<double> mean_vector, py::array_t<double> std_dev) {
    deque<individual*> nd_samples;
    // Get y_par and feed by individual via numpy direct access
    // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html
    auto yp = y_par.unchecked<2>(); // y_par must have ndim = 2
    for (py::ssize_t i = 0; i < yp.shape(0); i++) {
        individual * tempvidual = new individual;
        tempvidual->f[0] = yp(i, 0);
        tempvidual->f[1] = yp(i, 1);
        tempvidual->f[2] = yp(i, 2);
        // cerr << i << ": " << yp(i, 0) << " " << yp(i, 1) << " " << yp(i, 2) << endl;
        checkdominance(nd_samples, tempvidual);
        nd_samples.push_back(tempvidual);
    }
    // Marshall ref_point, mean_vector, and std_dev into an array
    // (might be better ways to do this..)
    auto rp = ref_point.unchecked<1>(); // ref_point must have ndim = 1, len 3
    double r[] = {rp(0), rp(1), rp(2)};
    auto mv = mean_vector.unchecked<1>(); // mean_vector must have ndim = 1, len 3
    double mu[] = {mv(0), mv(1), mv(2)};
    auto sd = std_dev.unchecked<1>(); // std_dev must have ndim = 1, len 3
    double s[] = {sd(0), sd(1), sd(2)};
    double hvi = ehvi3d_sliceupdate(nd_samples, r, mu, s);
    return hvi;
}
PYBIND11_MODULE(kmac, m) {
    // module docstring
    m.doc() = "EHVI using KMAC";
    // define the EHVI slice-update function
    m.def("ehvi3d_sliceupdate", &wrap_ehvi3d_sliceupdate, "O(n^3) slice-update scheme for calculating the EHVI.");
}
There's probably a much easier way to wrap this, as I just cobbled together bits I found in the pybind11 docs and here on SO. I'm not that familiar with C++, so I may well have committed some other heinous coding errors while creating arrays or passing pointers. Am I creating something that needs to be cleaned up on each call? At first I thought I might need to manage the lifetime of my numpy arrays as in this and this previous post, but I'm only returning a double, so there's no numpy array to handle on the Python side.
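For reference, this is the kind of explicit cleanup I've been wondering whether I need (a sketch only, assuming ehvi3d_sliceupdate neither keeps nor frees the individual pointers itself):

// Sketch: free dominated points as they are erased, then free the survivors.
// Assumes ehvi3d_sliceupdate neither stores nor deletes the pointers itself.
int checkdominance(deque<individual*> & P, individual* p){
    int nr = 0;
    for (int i = P.size() - 1; i >= 0; i--){
        if (p->f[0] >= P[i]->f[0] && p->f[1] >= P[i]->f[1] && p->f[2] >= P[i]->f[2]){
            cerr << "Individual " << (i+1) << " is dominated or the same as another point; removing." << endl;
            delete P[i];              // without this, the erased pointer leaks
            P.erase(P.begin() + i);
            nr++;
        }
    }
    return nr;
}

// ...and at the end of wrap_ehvi3d_sliceupdate:
double hvi = ehvi3d_sliceupdate(nd_samples, r, mu, s);
for (individual* ind : nd_samples)
    delete ind;                       // free every remaining point
nd_samples.clear();
return hvi;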
EDIT
I've tried changing the tempvidual block to use stack memory (I believe that's what it's called?), as I read that it would clean itself up, by using:
individual tempvidual;
tempvidual.f[0] = yp(i, 0);
tempvidual.f[1] = yp(i, 1);
tempvidual.f[2] = yp(i, 2);
checkdominance(nd_samples, &tempvidual);
nd_samples.push_back(&tempvidual);
and before returning hvi at the end, I tried adding nd_samples.clear(); to clear the deque before returning to Python, but I'm still seeing memory grow with each call to the wrapper. Is there anything else left to clean up?
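From what I've since read, clear() only destroys the deque's elements, i.e. the pointers themselves, not the individuals they point to, so in the original new-based version the objects would still leak. A standalone toy sketch of what I mean (the struct here is a stand-in, not the real one):

#include <deque>

// Toy stand-in for the real individual struct.
struct individual { double f[3]; };

int main() {
    std::deque<individual*> q;
    q.push_back(new individual);  // heap allocation, as in the original wrapper
    q.clear();                    // drops the pointer; the individual itself leaks
    // The objects need an explicit delete before the pointers go away:
    // for (individual* p : q) delete p;
    return 0;
}

(I've also since realized that in the stack-variable version above, tempvidual dies at the end of each loop iteration, so the pointers stored in nd_samples would be dangling by the time ehvi3d_sliceupdate reads them, which is presumably its own problem.)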
EDIT 2
So it turns out part of the problem was the library itself, which leaked more than 4 kB per call according to valgrind. Thanks (and a big shout-out) to the amazing help of @ajum on the pybind11 gitter, who practically walked me through refactoring most of the code to use shared_ptr and make_shared instead of raw pointers, fixing all of the leaks in the library. This also required a small update to the wrapper; see below. Unfortunately, even using the (I think) leak-free library and updated wrapper, I'm getting a report of:
==1932812== LEAK SUMMARY:
==1932812== definitely lost: 676 bytes in 1 blocks
==1932812== indirectly lost: 0 bytes in 0 blocks
==1932812== possibly lost: 145,291 bytes in 80 blocks
==1932812== still reachable: 1,725,888 bytes in 1,013 blocks
which is less than before, but I don't know what's causing it.
Edited sections of the wrapper:
// Copied from main.cc
// Checks if p dominates P. Removes points dominated by p from P and returns the number of points removed.
int checkdominance(deque<shared_ptr<individual>> & P, shared_ptr<individual> p){
    int nr = 0;
    for (int i = P.size() - 1; i >= 0; i--){
        if (p->f[0] >= P[i]->f[0] && p->f[1] >= P[i]->f[1] && p->f[2] >= P[i]->f[2]){
            cerr << "Individual " << (i+1) << " is dominated or the same as another point; removing." << endl;
            P.erase(P.begin()+i);
            nr++;
        }
    }
    return nr;
}
// Wrap the ehvi3d_sliceupdate function - not sure how to pass straight in
double wrap_ehvi3d_sliceupdate(py::array_t<double> y_par, py::array_t<double> ref_point, py::array_t<double> mean_vector, py::array_t<double> std_dev) {
    // deque<individual*> nd_samples;
    deque<shared_ptr<individual>> nd_samples;
    // Get y_par and feed by individual via numpy direct access
    // https://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html
    auto yp = y_par.unchecked<2>(); // y_par must have ndim = 2
    for (py::ssize_t i = 0; i < yp.shape(0); i++) {
        auto tempvidual = make_shared<individual>();
        // individual * tempvidual = new individual;
        tempvidual->f[0] = yp(i, 0);
        tempvidual->f[1] = yp(i, 1);
        tempvidual->f[2] = yp(i, 2);
        // cerr << i << ": " << yp(i, 0) << " " << yp(i, 1) << " " << yp(i, 2) << endl;
        // cerr << i << ": " << tempvidual->f[0] << " " << tempvidual->f[1] << " " << tempvidual->f[2] << endl;
        checkdominance(nd_samples, tempvidual);
        nd_samples.push_back(tempvidual);
    }
    // Marshall ref_point, mean_vector, and std_dev into an array
    // (might be better ways to do this..)
    auto rp = ref_point.unchecked<1>(); // ref_point must have ndim = 1, len 3
    double r[] = {rp(0), rp(1), rp(2)};
    auto mv = mean_vector.unchecked<1>(); // mean_vector must have ndim = 1, len 3
    double mu[] = {mv(0), mv(1), mv(2)};
    auto sd = std_dev.unchecked<1>(); // std_dev must have ndim = 1, len 3
    double s[] = {sd(0), sd(1), sd(2)};
    double hvi = ehvi3d_sliceupdate(nd_samples, r, mu, s);
    return hvi;
}
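Regarding the "might be better ways to do this" comment in the marshalling block: one alternative I've seen in the pybind11 docs is to hand the raw numpy buffers straight to the library instead of copying element by element. A sketch only, assuming the three inputs are contiguous 1-D double arrays of length 3 (with no shape checking):

// Sketch: pass the numpy buffers straight through instead of copying.
// request() returns a py::buffer_info whose .ptr is the raw data pointer.
auto rbuf = ref_point.request();
auto mbuf = mean_vector.request();
auto sbuf = std_dev.request();
double* r  = static_cast<double*>(rbuf.ptr);
double* mu = static_cast<double*>(mbuf.ptr);
double* s  = static_cast<double*>(sbuf.ptr);
double hvi = ehvi3d_sliceupdate(nd_samples, r, mu, s);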
In the output of running valgrind on the python test script, I couldn't identify what the issue was. An excerpt of the output containing the definitely lost block looks like this:
==1932812== 676 bytes in 1 blocks are definitely lost in loss record 212 of 485
==1932812== at 0x4C30F0B: malloc (vg_replace_malloc.c:307)
==1932812== by 0x2D595F: _PyMem_RawWcsdup (obmalloc.c:592)
==1932812== by 0x166786: _PyCoreConfig_Copy.cold (main.c:2535)
==1932812== by 0x34C4C7: _Py_InitializeCore (pylifecycle.c:850)
==1932812== by 0x34CCB3: pymain_init (main.c:3041)
==1932812== by 0x3503EB: pymain_main (main.c:3063)
==1932812== by 0x35085B: _Py_UnixMain (main.c:3103)
==1932812== by 0x5A137B2: (below main) (in /usr/lib64/libc-2.28.so)
==1932812==
==1932812== 688 bytes in 1 blocks are possibly lost in loss record 214 of 485
==1932812== at 0x4C33419: realloc (vg_replace_malloc.c:834)
==1932812== by 0x21E8F8: _PyObject_GC_Resize (gcmodule.c:1758)
==1932812== by 0x2345DA: UnknownInlinedFun (frameobject.c:726)
==1932812== by 0x2345DA: UnknownInlinedFun (call.c:272)
==1932812== by 0x2345DA: _PyFunction_FastCallKeywords (call.c:408)
==1932812== by 0x2979C7: call_function (ceval.c:4616)
==1932812== by 0x2BE4AB: _PyEval_EvalFrameDefault (ceval.c:3124)
==1932812== by 0x233E93: UnknownInlinedFun (ceval.c:547)
==1932812== by 0x233E93: UnknownInlinedFun (call.c:283)
==1932812== by 0x233E93: _PyFunction_FastCallKeywords (call.c:408)
==1932812== by 0x2979C7: call_function (ceval.c:4616)
==1932812== by 0x2BE4AB: _PyEval_EvalFrameDefault (ceval.c:3124)
==1932812== by 0x233E93: UnknownInlinedFun (ceval.c:547)
==1932812== by 0x233E93: UnknownInlinedFun (call.c:283)
==1932812== by 0x233E93: _PyFunction_FastCallKeywords (call.c:408)
==1932812== by 0x2979C7: call_function (ceval.c:4616)
==1932812== by 0x2BE4AB: _PyEval_EvalFrameDefault (ceval.c:3124)
==1932812== by 0x233E93: UnknownInlinedFun (ceval.c:547)
==1932812== by 0x233E93: UnknownInlinedFun (call.c:283)
==1932812== by 0x233E93: _PyFunction_FastCallKeywords (call.c:408)
==1932812==
==1932812== 1,056 bytes in 2 blocks are possibly lost in loss record 350 of 485
==1932812== at 0x4C30F0B: malloc (vg_replace_malloc.c:307)
==1932812== by 0x221130: UnknownInlinedFun (obmalloc.c:520)
==1932812== by 0x221130: UnknownInlinedFun (obmalloc.c:1584)
==1932812== by 0x221130: UnknownInlinedFun (obmalloc.c:1576)
==1932812== by 0x221130: UnknownInlinedFun (obmalloc.c:633)
==1932812== by 0x221130: UnknownInlinedFun (gcmodule.c:1693)
==1932812== by 0x221130: UnknownInlinedFun (gcmodule.c:1715)
==1932812== by 0x221130: _PyObject_GC_NewVar (gcmodule.c:1744)
==1932812== by 0x2344F2: UnknownInlinedFun (frameobject.c:713)
==1932812== by 0x2344F2: UnknownInlinedFun (call.c:272)
==1932812== by 0x2344F2: _PyFunction_FastCallKeywords (call.c:408)
==1932812== by 0x2979C7: call_function (ceval.c:4616)
==1932812== by 0x2BE4AB: _PyEval_EvalFrameDefault (ceval.c:3124)
==1932812== by 0x206EAC: UnknownInlinedFun (ceval.c:547)
==1932812== by 0x206EAC: UnknownInlinedFun (call.c:283)
==1932812== by 0x206EAC: _PyFunction_FastCallDict (call.c:322)
==1932812== by 0x20F1BA: UnknownInlinedFun (call.c:98)
==1932812== by 0x20F1BA: object_vacall (call.c:1200)
==1932812== by 0x28E2E6: _PyObject_CallMethodIdObjArgs (call.c:1250)
==1932812== by 0x1FC4A6: UnknownInlinedFun (import.c:1652)
==1932812== by 0x1FC4A6: PyImport_ImportModuleLevelObject (import.c:1764)
==1932812== by 0x2C069F: UnknownInlinedFun (ceval.c:4770)
==1932812== by 0x2C069F: _PyEval_EvalFrameDefault (ceval.c:2600)
==1932812== by 0x205AF1: UnknownInlinedFun (ceval.c:547)
==1932812== by 0x205AF1: _PyEval_EvalCodeWithName (ceval.c:3930)
==1932812== by 0x206D08: PyEval_EvalCodeEx (ceval.c:3959)
Is this due to pybind11 itself or to the way I called it? (If I'm reading the traces right, the definitely lost block comes from interpreter startup via _Py_InitializeCore rather than from my module, but I don't know what to make of the possibly lost ones.)
P.S. Not sure if it's SO style to add edits or to replace the original question with the (long) update. Thanks!