I'm trying to create a Cython wrapper for the __gnu_parallel::sort in the same way as they do in this thread Parallel in-place sort for numpy arrays.
This is my simplified code for wrapparallel.pyx:
import cython
cimport cython
cdef extern from "<parallel/algorithm>" namespace "__gnu_parallel":
cdef void sort[T](T first, T last) nogil
def parallel_sort(double[::1] a):
sort(&a[0], &a[a.shape[0] - 1])
I generate c++ code with:
cython --cplus wrapparallel.pyx
Compile and link with:
g++ -g -march=native -Ofast -fpic -c wrapparallel.cpp -o wrapparallel.o -fopenmp -I/usr/include/python2.7 -I/usr/include/x86_64-linux-gnu/python2.7
g++ -g -march=native -Ofast -shared -o wrapparallel.so wrapparallel.o -lpthread -ldl -lutil -lm -lpython2.7 -lgomp
Now to test it:
In [1]: import numpy as np
from wrapparallel import parallel_sort
a = np.random.randn(10)
parallel_sort(a)
a
Out[1]: array([-1.23569683, -1.05639448, -0.76990205, -0.2512839 , -0.25022328,
0.12711458, 0.81659571, 0.92205287, 2.15019125, -0.45902146])
As pointed out in the comment in the original thread this code does not sort the last element and the commentator suggests that the " - 1" be removed in the call to sort in the pyx-file. However this change would not solve anything since a[a.shape[0]] would be out of bounds.
This made me suspect that there might be an issue in the c++ code. The snippet where the actual call to __gnu_parallel::sort happens looks like this:
static PyObject *__pyx_pf_12wrapparallel_parallel_sort(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_a) {
PyObject *__pyx_r = NULL;
__Pyx_RefNannyDeclarations
Py_ssize_t __pyx_t_1;
int __pyx_t_2;
Py_ssize_t __pyx_t_3;
int __pyx_lineno = 0;
const char *__pyx_filename = NULL;
int __pyx_clineno = 0;
__Pyx_RefNannySetupContext("parallel_sort", 0);
__pyx_t_1 = 0;
__pyx_t_2 = -1;
if (__pyx_t_1 < 0) {
__pyx_t_1 += __pyx_v_a.shape[0];
if (unlikely(__pyx_t_1 < 0)) __pyx_t_2 = 0;
} else if (unlikely(__pyx_t_1 >= __pyx_v_a.shape[0])) __pyx_t_2 = 0;
if (unlikely(__pyx_t_2 != -1)) {
__Pyx_RaiseBufferIndexError(__pyx_t_2);
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
}
__pyx_t_3 = ((__pyx_v_a.shape[0]) - 1);
__pyx_t_2 = -1;
if (__pyx_t_3 < 0) {
__pyx_t_3 += __pyx_v_a.shape[0];
if (unlikely(__pyx_t_3 < 0)) __pyx_t_2 = 0;
} else if (unlikely(__pyx_t_3 >= __pyx_v_a.shape[0])) __pyx_t_2 = 0;
if (unlikely(__pyx_t_2 != -1)) {
__Pyx_RaiseBufferIndexError(__pyx_t_2);
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
}
__gnu_parallel::sort<double *>((&(*((double *) ( /* dim=0 */ ((char *) (((double *) __pyx_v_a.data) + __pyx_t_1)) )))), (&(*((double *) ( /* dim=0 */ ((char *) (((double *) __pyx_v_a.data) + __pyx_t_3)) )))));
/* function exit code */
__pyx_r = Py_None; __Pyx_INCREF(Py_None);
goto __pyx_L0;
__pyx_L1_error:;
__Pyx_AddTraceback("wrapparallel.parallel_sort", __pyx_clineno, __pyx_lineno, __pyx_filename);
__pyx_r = NULL;
__pyx_L0:;
__PYX_XDEC_MEMVIEW(&__pyx_v_a, 1);
__Pyx_XGIVEREF(__pyx_r);
__Pyx_RefNannyFinishContext();
return __pyx_r;
}
My knowledge of c++ is not enough to grasp whats going on here so my question is: Is there something wrong with call to __gnu_parallel::sort and how might I change it to also include the last element in the memoryview?
Edit:
The answer to change sort(&a[0], &a[a.shape[0] - 1])
to sort(&a[0], &a[a.shape[0]])
is correct. However, this will raise a IndexError: Out of bounds on buffer access (axis 0)
unless the cython compiler is instructed to use boundscheck = False
directive. For completeness the wrapparallel.pyx file should look like this:
# cython: boundscheck = False
import cython
cimport cython
cdef extern from "<parallel/algorithm>" namespace "__gnu_parallel":
cdef void sort[T](T first, T last) nogil
def parallel_sort(double[::1] a):
sort(&a[0], &a[a.shape[0]])