#define TS 32
int num_devices = 0;

__global__ void shared_kernel(float* A, float* B, float* C, int M, int N, int K) {

   int global_col = blockDim.x * blockIdx.x + threadIdx.x;
   int global_row = blockDim.y * blockIdx.y + threadIdx.y;
   int local_col  = threadIdx.x;
   int local_row  = threadIdx.y;
   if (global_row >= M || global_col >= N) return;


   __shared__ float Asub[TS][TS];
   __shared__ float Bsub[TS][TS];
 
   const int num_tiles = K / TS;

   float acc = 0;
 
   for(int t = 0; t < num_tiles; t++){
       const int t_row = TS * t + local_row;
       const int t_col = TS * t + local_col;
       Asub[local_row][local_col] = A[global_row * K + t_col];
       Bsub[local_row][local_col] = B[t_row * N + global_col];


       __syncthreads();
       printf("[DEBUG] first sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
 

       for (int k = 0; k < K; ++k) {
         acc += Asub[local_row][k] * Bsub[k][local_col];
       }
 

       __syncthreads();
       printf("[DEBUG] second sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
   }

   C[global_row * N + global_col] = acc;
}

static float *a_d, *b_d, *c_d;

void mat_mul(float *A, float *B, float *C, int M, int N, int K) {
 cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice);
 cudaMemcpy(b_d, B, K * N * sizeof(float), cudaMemcpyHostToDevice);

 dim3 blockDim(TS, TS);
 dim3 gridDim(M/TS, N/TS);
 shared_kernel<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K); 

 cudaMemcpy(C, c_d, M * N * sizeof(float), cudaMemcpyDeviceToHost);

 cudaDeviceSynchronize();
}

void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {

 cudaGetDeviceCount(&num_devices); 
 cudaSetDevice(0);

 cudaMalloc(&a_d, M * K * sizeof(float));
 cudaMalloc(&b_d, K * N * sizeof(float));
 cudaMalloc(&c_d, M * N * sizeof(float));
}


The above example is a matrix multiplication kernel using shared memory. I ran the kernel with dim3 blockDim(TS, TS) and dim3 gridDim(M/TS, N/TS), with M = N = K = 128.

I checked that float *C is all zeros after the kernel launch. I also found that only a few values of global_row are printed (from 37 to 81) after the first __syncthreads(), and there is no printf DEBUG message at all after the second __syncthreads().

I suspect that __syncthreads() is causing the problem, but I don't know how to fix it. My code is almost the same as matrix multiplication code on other sites.

Could you give me a hint on how to solve this?

alryosha
  • Could you at least post the complete kernel, better still an [MCVE]. You are asking for someone to diagnose a runtime error on code that can't be compiled, let alone run. Does that sound like a reasonable proposition to you? – talonmies Jun 14 '21 at 04:36
  • I'm sorry; I've uploaded the full kernel now. – alryosha Jun 14 '21 at 04:42

1 Answer


Any time you are having trouble with CUDA code, I recommend using proper CUDA error checking and running your code with compute-sanitizer or cuda-memcheck. For this type of analysis, it will be easier if you don't use in-kernel printf.
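For reference, here is a minimal sketch of what "proper CUDA error checking" can look like (the macro name CUDA_CHECK and the file names matmul.cu / matmul below are placeholders, not part of your code): wrap every runtime API call, and check again after the kernel launch:

#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                             \
  do {                                                               \
    cudaError_t err_ = (call);                                       \
    if (err_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                   \
              cudaGetErrorString(err_), __FILE__, __LINE__);         \
      exit(1);                                                       \
    }                                                                \
  } while (0)

// usage, with the names from your mat_mul:
CUDA_CHECK(cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
shared_kernel<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);
CUDA_CHECK(cudaGetLastError());       // reports launch/configuration errors
CUDA_CHECK(cudaDeviceSynchronize());  // reports errors that occur during kernel execution

To use the sanitizer, just run your binary under it, e.g. nvcc -lineinfo -o matmul matmul.cu followed by compute-sanitizer ./matmul (or cuda-memcheck ./matmul on older toolkits); the -lineinfo flag lets the tool attribute a fault to a source line.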

If you did that, you would see output like this:

========= Invalid __shared__ read of size 4
=========     at 0x000002f0 in shared_kernel(float*, float*, float*, int, int, int)
=========     by thread (0,2,0) in block (0,1,0)
=========     Address 0x00002000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
...  (and more output)

So from that, we can see that your kernel is making invalid __shared__ read operations. Where is that happening in your kernel? You could use the methodology here to identify the specific line of code. However, this is a fairly simple kernel, and there is only one line that reads from shared memory; it is here:

   for (int k = 0; k < K; ++k) {
     acc += Asub[local_row][k] * Bsub[k][local_col];  // shared reads here

A quick inspection will show that if you let this loop iterate over a range of K=128, then you will index out of bounds here:

   for (int k = 0; k < K; ++k) {
     acc += Asub[local_row][k] * Bsub[k][local_col];
                            ^         ^

when k is greater than 31, because this would exceed your shared array dimensions:

#define TS 32

__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
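The reported address is consistent with this. Assuming Asub and Bsub are placed back to back starting at offset 0 of the shared space (which is what the report suggests), the two arrays together occupy 2 × 32 × 32 × 4 bytes = 8192 bytes = 0x2000, so the faulting address 0x00002000 is the first byte past the end of the declared shared memory: exactly what you would expect from Bsub[k][local_col] once k reaches 32.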

I'm not going to bother writing a fixed kernel/code for you, because as you've already pointed out, this topic is covered in many other places, and a canonical example is already provided in the programming guide.

FWIW, if I change your for-loop to this:

   for (int k = 0; k < TS; ++k) {

then the run-time errors go away for me, and cuda-memcheck reports no errors. That bound is the correct one: the outer loop over num_tiles = K/TS already walks across the full K dimension one tile at a time, so the inner product only needs to traverse the TS columns of the current tile.
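Spelled out in context, that one-line change gives the following loop region (just a sketch using the variable names from your kernel; like the original, it assumes M, N and K are exact multiples of TS):

   for (int t = 0; t < num_tiles; t++) {
       const int t_row = TS * t + local_row;
       const int t_col = TS * t + local_col;
       Asub[local_row][local_col] = A[global_row * K + t_col];  // load one tile of A
       Bsub[local_row][local_col] = B[t_row * N + global_col];  // load one tile of B

       __syncthreads();                // make sure both tiles are fully loaded

       for (int k = 0; k < TS; ++k)    // TS, not K: stay inside the 32x32 tile
           acc += Asub[local_row][k] * Bsub[k][local_col];

       __syncthreads();                // finish reading before the next load
   }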

Robert Crovella