I have a function written in regular numpy ndarray
and another one with a typed memoryview
. However, I couldn't get the memoryview
version to work faster than the regular version (unlike many of the blogs such as memoryview benchmarks).
Any pointers / suggestions to increase the speed of the memoryview code versus the numpy alternative will be greatly appreciated! ... OR ... if anyone can point out any glaring reason why the memoryview version is not much faster than the regular numpy version
In the code below there are two functions, both of which takes in two vectors bi
and xi
and returns a matrix. The first function shrink_correl
is the regular numpy version and the second function shrink_correl2
is the memoryview alternative (let the file be sh_cor.pyx
).
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
cimport cython
cimport numpy as np
import numpy as np
from numpy cimport ndarray as ar
# -- ***this is the Regular Cython version*** -
cpdef ar[double, ndim=2, mode='c'] shrink_correl(ar[double, ndim=1, mode='c'] bi, ar[double, ndim=1, mode='c'] xi):
cdef:
int n_ = xi.shape[0]
int n__ = int(n_*(n_-1)/2)
ar[double, ndim=2, mode='c'] f = np.zeros([n__, n_+1])
int x__ = 0
ar[double, ndim=2, mode='c'] f1 = np.zeros([n_, n_+1])
ar[double, ndim=2, mode='c'] f2 = np.zeros([n__, n_+1])
ar[double, ndim=1, mode='c'] g = np.zeros(n_+1)
ar[double, ndim=1, mode='c'] s = np.zeros(n__)
ar[double, ndim=2, mode='c'] cori_ = np.zeros([n_, n_])
Py_ssize_t j, k
with nogil:
for j in range(0, n_-1):
for k in range(j+1, n_):
x__ += 1
f[x__-1, j] = bi[k]*xi[k]*1000
f[x__-1, k] = bi[j]*xi[j]*1000
f1 = np.dot(np.transpose(f), f)
with nogil:
for j in range(0, n_):
f1[n_, j] = xi[j]*1000
f1[j, n_] = f1[n_, j]
f2 = np.dot(f, np.linalg.inv(f1))
with nogil:
for j in range(0, n_):
g[j] = -bi[j]*xi[j]*1000
s = np.dot(f2, g)
with nogil:
for j in range(0, n_):
cori_[j, j] = 1.0
x__ = 0
with nogil:
for j in range(0, n_-1):
for k in range(j+1, n_):
x__ += 1
cori_[j, k] = s[x__-1]
cori_[k, j] = cori_[j, k]
return cori_
# -- ***this is the MemoryView Cython version*** -
cpdef ar[double, ndim=2, mode='c'] shrink_correl2(double[:] bi, double[:] xi):
cdef:
int n_ = xi.shape[0]
int n__ = int(n_*(n_-1)/2)
double[:, ::1] f = np.zeros([n__, n_+1])
int x__ = 0
double[:, ::1] f1 = np.zeros([n_, n_+1])
double[:, ::1] f2 = np.zeros([n__, n_+1])
double[:] g = np.zeros(n_+1)
double[:] s = np.zeros(n__)
double[:, ::1] cori_ = np.zeros([n_, n_])
ar[double, ndim=2, mode='c'] cori__ = np.zeros([n_, n_])
Py_ssize_t j, k
with nogil:
for j in range(0, n_-1):
for k in range(j+1, n_):
x__ += 1
f[x__-1, j] = bi[k]*xi[k]*1000
f[x__-1, k] = bi[j]*xi[j]*1000
f1 = np.dot(np.transpose(f), f)
with nogil:
for j in range(0, n_):
f1[n_, j] = xi[j]*1000
f1[j, n_] = f1[n_, j]
f2 = np.dot(f, np.linalg.inv(f1))
with nogil:
for j in range(0, n_):
g[j] = -bi[j]*xi[j]*1000
s = np.dot(f2, g)
with nogil:
for j in range(0, n_):
cori_[j, j] = 1.0
x__ = 0
with nogil:
for j in range(0, n_-1):
for k in range(j+1, n_):
x__ += 1
cori_[j, k] = s[x__-1]
cori_[k, j] = cori_[j, k]
cori__[:, :] = cori_
return cori__
This is compiled using the following setup.py
code
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np
import os
ext_modules = [Extension('sh_cor', ['sh_cor.pyx'], include_dirs=[np.get_include(),
os.path.join(np.get_include(), 'numpy')],
define_macros=[('NPY_NO_DEPRECATED_API', None)],
extra_compile_args=['-O3', '-march=native', '-ffast-math', '-flto'],
libraries=['m']
)]
setup(
name="Sh Cor",
cmdclass={'build_ext': build_ext},
ext_modules=ext_modules
)
The code used to test the speeds is
import numpy as np
import sh_cor # this the library created by the setup.py file
import time
b = np.random.random(400)
b = b/np.sum(b)
x = np.random.random(400)-0.5
n = 10
t0 = time.time()
for i in range(n):
v1 = sh_cor.shrink_correl(b, x)
t1 = time.time()
print((t1-t0)/n)
t0 = time.time()
for i in range(n):
v2 = sh_cor.shrink_correl2(b, x)
t1 = time.time()
print((t1-t0)/n)
The output on my PC is:
0.7070999860763549 # regular numpy
0.6726999998092651 # memoryview
using memoryview (in the codes above) only gives me a 5% speed boost (unlike the huge speed boost in the blogs).