I have been playing around with numba and trying to implement a simple element-wise matrix multiplication. When using 'vectorize' I get the same result as the numpy multiplication but when I'm using 'cuda.jit' they are not same. Many of them are zeros. I'm providing a minimum working example for this purpose. Any help with the problem will be appreciated. I'm using numba o.35.0 and python 2.7
from __future__ import division
from __future__ import print_function
import numpy as np
from numba import vectorize, cuda, jit
M = 80
N = 40
P = 40
# Set the number of threads in a block
threadsperblock = 32
# Calculate the number of thread blocks in the grid
blockspergrid = (M*N*P + (threadsperblock - 1)) // threadsperblock
@vectorize(['float32(float32,float32)'], target='cuda')
def VectorMult3d(a, b):
return a*b
@cuda.jit('void(float32[:, :, :], float32[:, :, :], float32[:, :, :])')
def mult_gpu_3d(a, b, c):
[x, y, z] = cuda.grid(3)
if x < c.shape[0] and y < c.shape[1] and z < c.shape[2]:
c[x, y, z] = a[x, y, z] * b[x, y, z]
if __name__ == '__main__':
A = np.random.normal(size=(M, N, P)).astype(np.float32)
B = np.random.normal(size=(M, N, P)).astype(np.float32)
numpy_C = A*B
A_gpu = cuda.to_device(A)
B_gpu = cuda.to_device(B)
C_gpu = cuda.device_array((M,N,P), dtype=np.float32) # cuda.device_array_like(A_gpu)
mult_gpu_3d[blockspergrid,threadsperblock](A_gpu,B_gpu,C_gpu)
cudajit_C = C_gpu.copy_to_host()
print('------- using cuda.jit -------')
print('Is close?: {}'.format(np.allclose(numpy_C,cudajit_C)))
print('{} of {} elements are close'.format(np.sum(np.isclose(numpy_C,cudajit_C)), M*N*P))
print('------- using cuda.jit -------\n')
vectorize_C_gpu = VectorMult3d(A_gpu, B_gpu)
vectorize_C = vectorize_C_gpu.copy_to_host()
print('------- using vectorize -------')
print('Is close?: {}'.format(np.allclose(numpy_C,vectorize_C)))
print('{} of {} elements are close'.format(np.sum(np.isclose(numpy_C,vectorize_C)), M*N*P))
print('------- using vectorize -------\n')
import numba; print("numba version: "+numba.__version__)