As @Reti43 has mentioned in the comments, there already exist libraries that do this, but I suspect you just want to play around with a homemade implementation.
I too have been interested in how to implement convolutions manually in Python. Python loops are terribly slow, so if you care about speed you should stay away from pure Python loops and stick to vectorized methods instead.
The best I have so far is to use numpy.lib.stride_tricks.as_strided, which allows you to get very customized views of numpy arrays. I use as_strided to get a sliding window view of the image, then I use np.tensordot to do a "more general matrix multiplication" (docs) with the kernel. Furthermore, numpy 1.20 (iirc) has numpy.lib.stride_tricks.sliding_window_view, which is a slightly less general version of my code below (as of this date), as it does not support custom strides.
import numpy as np
from numpy.lib.stride_tricks import as_strided
def get_sliding_window(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
    imgChannels, imgRows, imgCols = x.shape
    _, kernelRows, kernelCols = k.shape
    u = np.array(x.itemsize)  # Scales the strides from element counts to bytes, which is what as_strided expects
    return as_strided(x,
        shape=((imgRows-kernelRows)//rowstride+1, (imgCols-kernelCols)//colstride+1, imgChannels, kernelRows, kernelCols),
        strides=u*(imgCols*rowstride, colstride, imgRows*imgCols, imgCols, 1)
    )
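# For a (C, N, M) image, a (C, P, Q) kernel and unit strides, the view returned above
# has shape (N-P+1, M-Q+1, C, P, Q): one (C, P, Q) patch of the image per output pixel.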
def conv2d(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
    """
    Performs 2d convolution on images with an arbitrary number of channels where you
    can specify the strides as well.

    x: np.ndarray, image array of shape (C x N x M), where C is number of channels
    k: np.ndarray, convolution kernel of shape (C x P x Q), where C is number of channels
    rowstride: int, "vertical" step size
    colstride: int, "horizontal" step size
    """
    sliding_window_view = get_sliding_window(x, k, rowstride, colstride)
    return np.tensordot(sliding_window_view, k, axes=3)
x = np.array([
    [[1,1,1,1],
     [1,1,1,1],
     [2,2,2,2],
     [2,2,2,2]],

    [[1,1,2,2],
     [1,1,2,2],
     [4,4,8,8],
     [4,4,8,8]]
])

k = np.array([
    [[1,1],
     [1,1]],

    [[1,1],
     [1,1]]
]) / 8
print(conv2d(x,k,1,1))
#[[1. 1.25 1.5 ]
# [2. 2.625 3.25 ]
# [3. 4. 5. ]]
print(conv2d(x,k,2,2))
#[[1. 1.5]
# [3. 5. ]]
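For comparison, here is roughly what the same thing could look like with the built-in sliding_window_view mentioned above (numpy ≥ 1.20). It has no stride parameter itself, but slicing the view afterwards gets you the same effect; the helper name conv2d_swv below is just for illustration:
from numpy.lib.stride_tricks import sliding_window_view

def conv2d_swv(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
    # Window only over the two spatial axes; the view has shape (C, outRows, outCols, P, Q)
    v = sliding_window_view(x, k.shape[1:], axis=(1, 2))
    # sliding_window_view has no stride argument, but slicing the view achieves the same thing
    v = v[:, ::rowstride, ::colstride]
    # Contract the channel and kernel axes against the kernel
    return np.tensordot(v, k, axes=((0, 3, 4), (0, 1, 2)))

print(conv2d_swv(x, k, 1, 1))  # should match conv2d(x, k, 1, 1)
print(conv2d_swv(x, k, 2, 2))  # should match conv2d(x, k, 2, 2)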
Bonus
I implemented a little ASCII visualization to sanity check that the sliding windows are correct:
import time
def conv2d_asciiviz(x: np.ndarray, k: np.ndarray, rowstride: int, colstride: int):
    x = x.copy().astype(object)
    sliding_window_view = get_sliding_window(x, k, rowstride, colstride)
    # Wraps entries in ANSI escape codes so the current window is printed in yellow
    highlighter = np.vectorize(lambda x: f"\x1b[33m{x}\x1b[0m")
    r = np.full(sliding_window_view.shape[:2], np.nan)
    with np.printoptions(nanstr="", formatter={"all": lambda x: str(x)}):
        for i, row in enumerate(sliding_window_view):
            for j, window in enumerate(row):
                temp = window.copy()
                r[i,j] = np.tensordot(window, k, axes=3)
                # Temporarily highlight the current window, redraw in place
                # (clear below the cursor, print, then move the cursor back up)
                window[...] = highlighter(window)
                print(f"\x1b[JChannels:\n{x}\n\nResult:\n{str(r)}\x1b[{x.shape[0]*x.shape[1]+len(r)+4}A")
                window[...] = temp
                time.sleep(0.69)
    # Move the cursor back down past the drawing before returning
    print(f"\x1b[{x.shape[0]*x.shape[1]+len(r)+4}B")
    return r
print("Output:\n",conv2d(x,k,1,1))
