0

Here I'm trying to access a dynamically allocated array in CUDA. However, after running the output is c[0][0] = 0. Am I accessing the allocated array correctly? I think the way I'm copying the arrays is probably correct and for some reason, the value of C has not been changed on the device.

#include<iostream>
using namespace std;

__global__ void add_matrix(float *A, float *B, float *C, int n) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i < n) && (j < n)){
        C[i*n+j] = A[i*n+j] + B[i*n+j];
    }
}

int main(){
    const size_t N = 1024;
    const size_t size = N * N * sizeof(float);
    float *A, *B, *C;
    A = (float*) malloc(size);
    B = (float*) malloc(size);
    C = (float*) malloc(size);
    for (size_t i=0; i<N*N; i++){
        A[i] = 5.0;
        B[i] = 6.0;
    }
    float *A_d, *B_d, *C_d;
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, size);
    cudaMalloc((void**)&C_d, size);
    auto code = cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying A to device" << endl;
    }
    code = cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying B to device" << endl;
    }

    dim3 threads(N, N);
    dim3 blocks(1,1);
    add_matrix<<<blocks, threads>>>(A_d, B_d, C_d, N);
    code = cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);
    if (code != cudaSuccess){
        cout << "Error copying C from device" << endl;
    }

    std::cout << "C[0][0] : " << C[0] << std::endl;

    free(A); free(B); free(C);
    cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
    return 0;
}
Amirabbas asadi
  • 182
  • 1
  • 1
  • 7

1 Answers1

2

The problem was arranging the blocks. I totally forgot each block can have a limited number of threads. we can obtain the maximum threads per block by getting maxThreadsPerBlock property using cudaDeviceGetAttribute. It seems the Colab GPU supports 1024 threads in each block. so I changed the arrangement this way:

dim3 threads(32,32);
dim3 blocks(32,32);

And it worked

Amirabbas asadi
  • 182
  • 1
  • 1
  • 7
  • By the way, you do not need a 2D grid for this since the array are flatten ;) . Note this should certainly be faster. Additionally, I guess you could check the result of the kernel launch so to be warned about the problem as early as possible. – Jérôme Richard May 11 '22 at 00:07
  • @JérômeRichard Of course, this example didn't need a 2D array. but it's strange that I didn't get any error for thread allocation. – Amirabbas asadi May 11 '22 at 00:09
  • 1
    See https://stackoverflow.com/questions/6419700/way-to-verify-kernel-was-executed-in-cuda . Is this better with this? – Jérôme Richard May 11 '22 at 08:33
  • @JérômeRichard Thanks this is so useful! – Amirabbas asadi May 11 '22 at 15:32