I am trying to implement matrix multiplication using CUDA. I have two matrices of order Mw and wN. I launched (w*w) threads in each block and grid dimension = (M/w,N/w). I created a matrix in shared memory of size 32*32. I want to implement matrix multiplication using only one matrix in shared memory. Here's my code
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
#include<stdlib.h>
#include<unistd.h>
#include<math.h>
__global__ void add(int *a,int *b, int *c,int *p,int *q){
// __shared__ int aTile[*p][*p];
//const int a=*p;
__shared__ int aTile[32][32];
int row = blockIdx.x*blockDim.x+threadIdx.x;
int col = blockIdx.y*blockDim.y+threadIdx.y;
int sum=0;
aTile[threadIdx.x][threadIdx.y] = a[row*(*p)+threadIdx.y];
__syncthreads();
if(row< *q && col< *q)
{
for(int k=0;k<*p;k++)
{
sum+= aTile[threadIdx.x][k]*b[col+(*q)*k];
// __syncthreads();
}
c[col+(*q)*row]=sum;
//__syncthreads();
}
}
int main(){
printf("Enter the number of rows of matrix 1\n");
int row_1;
scanf("%d",&row_1);
printf("Enter the number of columns of matrix 1\n");
int col_1;
scanf("%d",&col_1);
/*printf("Enter the values of matrix 1 \n");
*/
int a[row_1][col_1];
for(int i=0;i<row_1;i++)
{
for(int j=0;j<col_1;j++)
{
//scanf("%d",&a[i][j]);
a[i][j]=1;
}
}
printf("Enter the number of rows of matrix 2\n");
int row_2;
scanf("%d",&row_2);
printf("Enter the number of columns of matrix 2\n");
int col_2;
scanf("%d",&col_2);
/* printf("Enter the values of matrix 2 \n");
*/
int b[row_2][col_2];
for(int i=0;i<row_2;i++)
{
for(int j=0;j<col_2;j++)
{
// scanf("%d",&b[i][j]);
b[i][j]=1;
}
}
int c[row_1][col_2];
//dim3 dimBlock(col_1, col_1);// in one block u have row_1*col_2 threads;
dim3 dimBlock(col_1,col_1);
//dim3 dimGrid((row_1/col_1)+1,(col_2/col_1)+1); // in one grid you have 1*1 blocks
dim3 dimGrid(ceil(row_1/col_1),ceil(col_2/col_1));
int *p;
int *q;
int *dev_a,*dev_b,*dev_c;
int size_a=row_1*col_1*sizeof(int);
int size_b=row_2*col_2*sizeof(int);
int size_c = row_1*col_2*sizeof(int);
cudaMalloc((void**)&dev_a,size_a);
cudaMalloc((void**)&dev_b,size_b);
cudaMalloc((void**)&dev_c,size_c);
cudaMalloc((void**)&p,sizeof(int));
cudaMalloc((void**)&q,sizeof(int));
cudaMemcpy(dev_a,a,size_a,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,size_b,cudaMemcpyHostToDevice);
cudaMemcpy(dev_c,c,size_c,cudaMemcpyHostToDevice);
cudaMemcpy(p,&col_1,sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(q,&col_2,sizeof(int),cudaMemcpyHostToDevice);
add<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c,p,q);
cudaMemcpy(c,dev_c,size_c,cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf("output matrix is : \n");
for(int i=0;i<10;i++)
{
for(int j=0;j<10;j++)
{
printf("%d ",c[i][j]);
}
printf("\n");
}
}
I get the correct output for when i multiply matrices of size 32*32 and 32*32 but when i Multiply matrices of sizes 33*33 and 33*33(and above) , the resultant multiplied matrix contains all zeros. I have tried to increased the size of matrix in shared memory, but I get the following error
ptxas error : Entry function '_Z3addPiS_S_S_S_' uses too much shared data (0x10038 bytes, 0x4000 max)
I am pretty new to CUDA. Sorry, if this was too much basic question