I'm trying to learn more about the use of shared memory to improve performance in some cuda kernels in Numba
, for this I was looking at the Matrix multiplication Example in the Numba documentation and tried to implement to see the gain.
This is my test implementation, I'm aware that the example in the documentation has some issues that I followed from Here, so I copied the fixed example code.
from timeit import default_timer as timer
import numba
from numba import cuda, jit, int32, int64, float64, float32
import numpy as np
from numpy import *
@cuda.jit
def matmul(A, B, C):
"""Perform square matrix multiplication of C = A * B
"""
i, j = cuda.grid(2)
if i < C.shape[0] and j < C.shape[1]:
tmp = 0.
for k in range(A.shape[1]):
tmp += A[i, k] * B[k, j]
C[i, j] = tmp
# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 16
@cuda.jit
def fast_matmul(A, B, C):
# Define an array in the shared memory
# The size and type of the arrays must be known at compile time
sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bpg = cuda.gridDim.x # blocks per grid
# Each thread computes one element in the result matrix.
# The dot product is chunked into dot products of TPB-long vectors.
tmp = 0.
for i in range(bpg):
# Preload data into shared memory
sA[ty, tx] = 0
sB[ty, tx] = 0
if y < A.shape[0] and (tx+i*TPB) < A.shape[1]:
sA[ty, tx] = A[y, tx + i * TPB]
if x < B.shape[1] and (ty+i*TPB) < B.shape[0]:
sB[ty, tx] = B[ty + i * TPB, x]
# Wait until all threads finish preloading
cuda.syncthreads()
# Computes partial product on the shared memory
for j in range(TPB):
tmp += sA[ty, j] * sB[j, tx]
# Wait until all threads finish computing
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
size = 1024*4
tpbx,tpby = 16, 16
tpb = (tpbx,tpby)
bpgx, bpgy = int(size/tpbx), int(size/tpby)
bpg = (bpgx, bpgy)
a_in = cuda.to_device(np.arange(size*size, dtype=np.float32).reshape((size, size)))
b_in = cuda.to_device(np.ones(size*size, dtype=np.float32).reshape((size, size)))
c_out1 = cuda.device_array_like(a_in)
c_out2 = cuda.device_array_like(a_in)
s = timer()
cuda.synchronize()
matmul[bpg,tpb](a_in, b_in, c_out1);
cuda.synchronize()
gpu_time = timer() - s
print(gpu_time)
c_host1 = c_out1.copy_to_host()
print(c_host1)
s = timer()
cuda.synchronize()
fast_matmul[bpg,tpb](a_in, b_in, c_out2);
cuda.synchronize()
gpu_time = timer() - s
print(gpu_time)
c_host2 = c_out2.copy_to_host()
print(c_host2)
The time of execution of the above kernels are essentially the same, actually the matmul
was making faster for some larger input matrices. I would like to know what I'm missing in order to see the gain as the documentation suggests.
Thanks, Bruno.