Here is how I would rewrite the MATLAB code as equivalent Python code (beware that indices start at 1 in MATLAB and at 0 in Python; since I cannot know how this should be adapted without more context, I went with the simplest approach):
import numpy as np
def func(f_arr, nech, n):
    """For each start index i, average the squared differences between
    f_arr[i] and every later sample up to index n - 1."""
    sf2 = np.zeros(nech)
    count = np.zeros(nech)
    for i in range(nech):
        base = f_arr[i]
        # Accumulate squared increments for every lag reachable from i.
        for lag in range(n - i):
            sf2[i] += (f_arr[i + lag] - base) ** 2
            count[i] += 1
        sf2[i] /= count[i]
    return sf2
Note that count[i] += 1
is useless, since the final value of count[i]
is known in advance — in fact the whole count
array is unnecessary, e.g.:
import numpy as np
def func2(f_arr, nech, n):
    """Same computation as func, but dividing by the known term count
    n - i instead of tallying it in a separate array."""
    sf2 = np.zeros(nech)
    for i in range(nech):
        acc = 0.0
        for j in range(n - i):
            acc += (f_arr[i + j] - f_arr[i]) ** 2
        # The inner loop runs exactly n - i times, so that is the divisor.
        sf2[i] = acc / (n - i)
    return sf2
Speed-up
This is a textbook case for a Numba speed-up. It is as easy as adding/using a Numba @njit
decorator:
import numba as nb
# JIT-compile the two pure-Python implementations defined above;
# the wrapped functions compute the same results at machine-code speed.
func_nb = nb.njit(func)
func2_nb = nb.njit(func2)
Now, func
, func_nb
, func2
and func2_nb
all perform the same computation:
# Sanity check: the pure-Python and Numba-compiled variants all agree.
nech = n = 6
f_arr = np.arange(n)
print(func(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
print(func_nb(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
print(func2(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
print(func2_nb(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
If you really need to stick to Cython, here is an implementation based on func2
:
%%cython -c-O3 -c-march=native -a
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, infer_types=True
import numpy as np
import cython as cy
cpdef func2_cy(f_arr, nech, n):
    # Python-facing wrapper: allocates the output, coerces the input to
    # float64, and dispatches to the typed C-level loop below.
    sf2 = np.zeros(nech)
    # np.float_ was removed in NumPy 2.0; np.float64 is the portable spelling.
    _func2_cy(f_arr.astype(np.float64), sf2, nech, n)
    return sf2
cdef _func2_cy(double[:] f_arr, double[:] sf2, cy.int nech, cy.int n):
    # Typed memoryview loops: no Python interaction, runs at C speed.
    # The inner loop starts at 1 because the j == 0 term is always zero.
    for i in range(nech):
        for j in range(1, n - i):
            sf2[i] = sf2[i] + (f_arr[i + j] - f_arr[i]) ** 2
        sf2[i] = sf2[i] / (n - i)
This is significantly more complicated to write compared to Numba, but achieves similar performances.
The trick is to have a function _func2_cy
which operates with little to no interaction with Python (read: it goes at C speed).
Again the result is the same as func2
:
# The Cython build reproduces the reference output as well.
nech = n = 6
f_arr = np.arange(n)
print(func2_cy(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
Timings
With some little toy benchmarking we get a feeling of the speed-ups, including a comparable function written as you did, and the vectorized solution proposed in Andras Deak's very fine answer (but fixing the indices to match the above):
def func_OP(f, nech, n):
    """The question's original loop, kept deliberately un-optimized for a
    fair benchmark; only the index range is corrected."""
    sf2 = np.zeros(n)
    count = np.zeros(n)
    for i in range(nech):
        # <-- indexing fixed: iterate only over the n - i valid lags
        for j in np.arange(n - i):
            sf2[i] += np.power((f[i + j] - f[i]), 2)
            count[i] += 1
        sf2[i] = sf2[i] / count[i]
    return sf2
func_OP_nb = nb.njit(func_OP)  # Numba-compiled variant of the OP's loop.
def func_pdisty(f, nech, n):
    """Vectorized version (from Andras Deak's answer, indices fixed):
    all pairwise squared differences at once via pdist, then summed per
    starting index with np.add.at."""
    # Local import: `scipy` is not imported anywhere at the top of this file,
    # so the bare `scipy.spatial.distance` reference would raise NameError.
    from scipy.spatial.distance import pdist
    res = np.zeros(nech)
    # Condensed distance matrix: entries ordered (0,1), (0,2), ..., (n-2,n-1).
    dists = pdist(f[:, None], metric='sqeuclidean')
    # Number of pairs starting at each index i: n - 1 - i.
    counts = np.arange(n - 1, n - nech - 1, -1)
    inds = np.repeat(np.arange(res.size), counts)
    np.add.at(res, inds, dists[:inds.size])
    # Divide by n - i (the loop versions include the zero j == 0 term).
    res /= (counts + 1)
    return res
# func_OP and the vectorized func_pdisty also match the reference output.
nech = n = 6
f_arr = np.arange(n)
print(func_OP(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
print(func_pdisty(f_arr, nech, n))
# [9.16666667 6. 3.5 1.66666667 0.5 0. ]
# Benchmarks at increasing sizes (timings recorded on the author's machine).
nech = n = 1000
f_arr = np.arange(n)
%timeit func_OP(f_arr, nech, n)
# 1 loop, best of 5: 1.5 s per loop
%timeit func(f_arr, nech, n)
# 1 loop, best of 5: 567 ms per loop
%timeit func2(f_arr, nech, n)
# 1 loop, best of 5: 352 ms per loop
%timeit func_OP_nb(f_arr, nech, n)
# 1000 loops, best of 5: 1.87 ms per loop
%timeit func_nb(f_arr, nech, n)
# 1000 loops, best of 5: 1.7 ms per loop
%timeit func2_nb(f_arr, nech, n)
# 1000 loops, best of 5: 768 µs per loop
%timeit func_pdisty(f_arr, nech, n)
# 10 loops, best of 5: 44.5 ms per loop
%timeit func2_cy(f_arr, nech, n)
# 1000 loops, best of 5: 1 ms per loop
nech = n = 2000
f_arr = np.arange(n)
%timeit func_OP(f_arr, nech, n)
# 1 loop, best of 5: 6.01 s per loop
%timeit func(f_arr, nech, n)
# 1 loop, best of 5: 2.3 s per loop
%timeit func2(f_arr, nech, n)
# 1 loop, best of 5: 1.42 s per loop
%timeit func_OP_nb(f_arr, nech, n)
# 100 loops, best of 5: 7.31 ms per loop
%timeit func_nb(f_arr, nech, n)
# 100 loops, best of 5: 6.82 ms per loop
%timeit func2_nb(f_arr, nech, n)
# 100 loops, best of 5: 3.05 ms per loop
%timeit func_pdisty(f_arr, nech, n)
# 1 loop, best of 5: 344 ms per loop
%timeit func2_cy(f_arr, nech, n)
# 100 loops, best of 5: 3.95 ms per loop
nech = n = 4000
f_arr = np.arange(n)
%timeit func_OP(f_arr, nech, n)
# 1 loop, best of 5: 24.3 s per loop
%timeit func(f_arr, nech, n)
# 1 loop, best of 5: 9.27 s per loop
%timeit func2(f_arr, nech, n)
# 1 loop, best of 5: 5.71 s per loop
%timeit func_OP_nb(f_arr, nech, n)
# 10 loops, best of 5: 29 ms per loop
%timeit func_nb(f_arr, nech, n)
# 10 loops, best of 5: 27.3 ms per loop
%timeit func2_nb(f_arr, nech, n)
# 100 loops, best of 5: 12.2 ms per loop
%timeit func_pdisty(f_arr, nech, n)
# 1 loop, best of 5: 706 ms per loop
%timeit func2_cy(f_arr, nech, n)
# 100 loops, best of 5: 15.9 ms per loop
...and with the input sizes you provided:
# Benchmark at the question's original input sizes.
nech = 24 * 30 * 24
n = 70299
f_arr = np.random.random(n)
%timeit -n1 -r1 func_OP(f_arr, nech, n)
# 1 loop, best of 1: 1h 4min 50s per loop
%timeit -n1 -r1 func(f_arr, nech, n)
# 1 loop, best of 1: 21min 14s per loop
%timeit -n1 -r1 func2(f_arr, nech, n) # only one run / loop
# 1 loop, best of 1: 13min 9s per loop
%timeit func_OP_nb(f_arr, nech, n)
# 1 loop, best of 5: 4.74 s per loop
%timeit func_nb(f_arr, nech, n)
# 1 loop, best of 5: 4 s per loop
%timeit func2_nb(f_arr, nech, n)
# 1 loop, best of 5: 1.62 s per loop
# %timeit func_pdisty(f_arr, nech, n)
# -- MEMORY ERROR --
%timeit func2_cy(f_arr, nech, n)
# 1 loop, best of 5: 2.2 s per loop