For windowed summation convolution is actually overkill since a simple O(n) solution exists:
import numpy as np
from scipy.signal import convolve
def winsum(in_arr, windowsize):
in_arr = np.pad(in_arr, windowsize//2+1, mode='reflect')[:-1, :-1]
in_arr[0] = 0
in_arr[:, 0] = 0
ps = in_arr.cumsum(0).cumsum(1)
return ps[windowsize:, windowsize:] + ps[:-windowsize, :-windowsize] \
- ps[windowsize:, :-windowsize] - ps[:-windowsize, windowsize:]
This is already fast but you can save even more because ps
calculated once for the largest window size could be reused for all smaller window sizes.
However, there is one potential drawback, which are the very large numbers that may arise from summing everything like that. A numerically more sound version eliminates this problem by taking the differences first. Downside: the extra saving through sharing ps
is no longer available.
def winsum_safe(in_arr, windowsize):
in_arr = np.pad(in_arr, windowsize//2, mode='reflect')
in_arr[windowsize:] -= in_arr[:-windowsize]
in_arr[:, windowsize:] -= in_arr[:, :-windowsize]
return in_arr.cumsum(0)[windowsize-1:].cumsum(1)[:, windowsize-1:]
For reference, here is the closest competitor which is fft
based convolution. You need an up-to-date version of scipy for this to work efficiently. On older versions use fftconvolve
instead of convolve
.
def winsumc(in_arr, windowsize):
in_arr = np.pad(in_arr, windowsize//2, mode='reflect')
kernel = np.ones((windowsize, windowsize), in_arr.dtype)
return convolve(in_arr, kernel, 'valid')
The next one is to simulate scipy's old - and excruciatingly slow - behavior.
def winsum_nofft(in_arr, windowsize):
in_arr = np.pad(in_arr, windowsize//2, mode='reflect')
kernel = np.ones((windowsize, windowsize), in_arr.dtype)
return convolve(in_arr, kernel, 'valid', method='direct')
Testing and benchmarking:
data = np.random.random((1000, 1000))
assert np.allclose(winsum(data, 333), winsumc(data, 333))
assert np.allclose(winsum(data, 333), winsum_safe(data, 333))
kwds = dict(globals=globals(), number=10)
from timeit import timeit
from time import perf_counter
print('data 100x1000, window 333x333')
print('cumsum: ', timeit('winsum(data, 333)', **kwds)*100, 'ms')
print('cumsum safe: ', timeit('winsum_safe(data, 333)', **kwds)*100, 'ms')
print('fftconv: ', timeit('winsumc(data, 333)', **kwds)*100, 'ms')
t = perf_counter()
res = winsum_nofft(data, 99) # 333 just takes too long
t = perf_counter() - t
assert np.allclose(winsum(data, 99), res)
print('data 100x1000, window 99x99')
print('conv: ', t*1000, 'ms')
Sample output:
data 100x1000, window 333x333
cumsum: 70.33260859316215 ms
cumsum safe: 59.98647050000727 ms
fftconv: 298.60571819590405 ms
data 100x1000, window 99x99
conv: 135224.8261970235 ms