I have written this program and I am having some trouble understanding how to use multiple blocks by using dim3 variable in the kernel call line. This code works fine when I am doing 1000*1000 matrix multiplication, but not getting correct answer for lower dimensions like 100*100 , 200*200.
#include <stdio.h>
#include <cuda.h>
#define width 1000
__global__ void kernel(int *a,int *b,int *c)
{
int tx = threadIdx.x + blockIdx.x*blockDim.x;
int ty = threadIdx.y + blockIdx.y*blockDim.y;
int sum=0,k;
for(k=0;k<(width);++k)
{
sum += a[ty*width +k]*b[k*width + tx];
}
c[ty*width + tx] = sum;
}
int main()
{
int a[width*width],c[width*width],b[width*width];
int *dev_a,*dev_b,*dev_c;
int i,count=0;
int size = (width*width)*sizeof(int);
for(i=0;i<(width*width);i++)
{
a[i] = 1;
b[i] = 1;
}
cudaMalloc((void **)&dev_a,size);
cudaMalloc((void **)&dev_b,size);
cudaMalloc((void **)&dev_c,size);
cudaMemcpy(dev_a,&a,size,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,&b,size,cudaMemcpyHostToDevice);
dim3 dimBlock(20,20);
dim3 blockID(50,50);
kernel<<<blockID,dimBlock>>>(dev_a,dev_b,dev_c);
cudaMemcpy(&c,dev_c,size,cudaMemcpyDeviceToHost);
for(i=0;i<(width*width);i++)
{
count++;
if(count == (width+1))
{
count = 1;
printf("\n");
}
printf("%d ",c[i]);
}
printf("\n");
return 0;
}