I am doing a matrix multiplication in CUDA. The following setup works:
// Tiled launch: every block is TILE x TILE threads (8 x 8 = 64 threads).
int TILE = 8;
// Ceil-division, (n - 1) / TILE + 1, rounds the grid up so the blocks cover all
// numCColumns columns (x) and numCRows rows (y) even when they are not multiples of TILE.
dim3 DimGrid((numCColumns - 1)/TILE + 1, (numCRows - 1)/TILE + 1, 1);
dim3 DimBlock(TILE, TILE, 1);
But if I use a single block for the whole matrix, the result is all zeros. What is the reason for that? Assume one block can contain the whole matrix (the input is 64x64).
// Single-block launch: one block whose dimensions equal the output matrix.
// NOTE(review): assuming numCColumns == numCRows == 64 (the 64x64 input mentioned
// in the question), this block requests 64 * 64 = 4096 threads. CUDA devices cap a
// block at 1024 threads, so such a launch is presumably rejected with an
// invalid-configuration error, the kernel never runs, and C is never written --
// TODO confirm by calling cudaGetLastError() right after the launch.
dim3 DimGrid(1,1,1);
dim3 DimBlock(numCColumns, numCRows, 1);
This is how I call the kernel in the main function:
// Launch matrixMultiply with the DimGrid/DimBlock configuration chosen above.
// NOTE(review): no cudaGetLastError() / synchronization check is visible after this
// launch in the snippet; a rejected launch configuration would go unnoticed here --
// verify against the full main function.
matrixMultiply<<<DimGrid, DimBlock>>>(deviceA, deviceB, deviceC,
numARows, numAColumns,
numBRows, numBColumns,
numCRows, numCColumns);
And this is the kernel:
/**
 * Computes C = A * B for float matrices stored row-major, one thread per
 * element of C.
 *
 * Launch layout: a 2D grid of 2D blocks. The x components of the block/thread
 * indices select the output column, the y components select the output row.
 * Threads that land outside numCRows x numCColumns exit without touching memory.
 *
 * Indexing: A uses a row stride of numAColumns, B a row stride of numBColumns,
 * and C a row stride of numCColumns. The dot product runs over numAColumns
 * terms. numARows and numBRows are accepted but not read by the kernel.
 */
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    const int outCol = threadIdx.x + blockDim.x * blockIdx.x;
    const int outRow = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard clause: surplus threads in the last row/column of blocks do nothing.
    if (outRow >= numCRows || outCol >= numCColumns) {
        return;
    }

    // Start of this thread's row in A and of its column in B.
    const float *aRow = A + outRow * numAColumns;
    const float *bCol = B + outCol;

    // Accumulate in ascending k, walking B one full row (numBColumns) per step.
    float acc = 0.0f;
    for (int k = 0; k < numAColumns; ++k) {
        acc += aRow[k] * bCol[k * numBColumns];
    }

    C[outRow * numCColumns + outCol] = acc;
}