I am new to CUDA. I have tried to add two vectors and it works fine. Now I want to add two matrix. I want to add two matrix using two dimension threads(threadIdx.x and threadIdx.y). I have found this code in Internet, and I have made some changes to display the results. It compiles. But displays unexpected results, it looks like memory addresses. Please help me, Thank you in advance.
#include <stdio.h>
#include <stdlib.h>
#define N 5
#define BLOCK_DIM 10
__global__ void matrixAdd (int *a, int *b, int *c) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
if (col < N && row < N) {
c[index] = a[index] + b[index];
}
}
int main() {
int a[N][N], b[N][N], c[N][N];
int *dev_a, *dev_b, *dev_c;
int size = N * N;
for(int i=0; i<N; i++)
for (int j=0; j<N; j++){
a[i][j] = 1;
b[i][j] = 2;
}
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));
matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
cudaDeviceSynchronize();
for(int i=0; i<N; i++){
for (int j=0; j<N; j++){
printf("%d\t", c[i][j] );
}
printf("\n");
}
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
and the output is
0 0 -780197879 32659 1
0 452489360 32764 6303208 0
4198328 0 452489376 32764 4198181
0 2 0 4198557 0
4196864 0 0 0 4198480
my expected output is a 5x5 matrix of element 3. Please help me.