
EDIT: It seems that, at least in this case, transposing the grid has a negative effect on L2 cache bandwidth. This was measured with the Visual Profiler; the reason is not yet clear to me.
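(For reference, comparable L2 metrics can presumably also be collected from the command line with nvprof; something along these lines should work, though the exact metric names depend on the GPU architecture:)

➜  transpose-grid nvprof --metrics l2_read_throughput,l2_write_throughput ./prog 10000 10000 100 0
➜  transpose-grid nvprof --metrics l2_read_throughput,l2_write_throughput ./prog 10000 10000 100 1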

I have come to a GPU computing situation in which I need to transpose a CUDA grid. So, if block_{x,y} originally acted on data region d_{x,y}, it now acts on data region d_{y,x}; accordingly, block_{y,x} now acts on data region d_{x,y}. An example is presented in the following figure.

[figure: block-to-data mapping for the normal vs. transposed grid]

It is worth mentioning that threads are not transposed inside each block; that is, once the block is located, threadIdx.x and threadIdx.y are used in the normal way for the x and y offsets, respectively.

From what I know, in theory this design should not harm performance, as the memory coalescing pattern is still preserved, i.e., threads inside a block are not transposed; it is just the grid that has its blocks rearranged. However, I found that when transposing the grid, the kernel runs approx. 2X slower than in the normal case. I made a toy example to illustrate the situation.

➜  transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
➜  transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok

I would really appreciate any explanation for this issue. Here is the source code to reproduce the behavior.

 // -----------------------------------
 // can compile as nvcc main.cu -o prog
 // -----------------------------------

 #include <cuda.h>
 #include <cstdio>

 #define BSIZE2D 32

 __global__ void normal_kernel(int *dmat, const int m, const int n){
     // normal grid: block (bx, by) starts at row by*blockDim.y, col bx*blockDim.x
     const int i = blockIdx.y*blockDim.y + threadIdx.y;
     const int j = blockIdx.x*blockDim.x + threadIdx.x;
     if(i < m && j < n){
         dmat[i*n + j] = 1;
     }
 }

 __global__ void transp_kernel(int *dmat, const int m, const int n){
     // transposed grid: block (bx, by) starts at row bx*blockDim.x, col by*blockDim.y
     // (threads keep their usual roles inside the block; this pairing assumes square blocks)
     const int i = blockIdx.x*blockDim.x + threadIdx.y;
     const int j = blockIdx.y*blockDim.y + threadIdx.x;
     if(i < m && j < n){
         dmat[i*n + j] = 1;
     }
 }



 int verify(int *hmat, const int m, const int n){
     printf("verifying correctness........."); fflush(stdout);
     for(int i=0; i<m*n; ++i){
         if(hmat[i] != 1){
             fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n, hmat[i]);
             return 0;
         }
     }
     printf("ok\n"); fflush(stdout);
     return 1;
 }
 int main(int argc, char **argv){
     if(argc != 5){
         printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
         exit(EXIT_FAILURE);
     }
     const int m = atoi(argv[1]);
     const int n = atoi(argv[2]);
     const int r = atoi(argv[3]);
     const int t = atoi(argv[4]);
     const unsigned int size = m*n;
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     float time;
     int *hmat, *dmat;


     printf("init data....................."); fflush(stdout);
     hmat = (int*)malloc(sizeof(int)*(size));
     for(int i=0; i<size; ++i){
         hmat[i] = 0;
     }
     printf("done: zero matrix of %i rows x %i cols\n", m, n);




     printf("copy data to GPU.............."); fflush(stdout);
     cudaMalloc(&dmat, sizeof(int)*(size));
     cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
     printf("done\n");



     printf("preparing grid................"); fflush(stdout);
     dim3 block(BSIZE2D, BSIZE2D, 1);
     dim3 grid;
     // if transpose or not
     if(t){
         grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
     }
     else{
         grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
     }
     printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);


     if(t){
         printf("transp_kernel (%3i rep).......", r); fflush(stdout);
         cudaEventRecord(start, 0);
         for(int i=0; i<r; ++i){
             transp_kernel<<<grid, block>>>(dmat, m, n);
             cudaDeviceSynchronize();
         }
         cudaEventRecord(stop,0);
         cudaEventSynchronize(stop);
         cudaEventElapsedTime(&time, start, stop); // that's our time!
         printf("done: %f ms\n", time/(float)r);
     }
     else{
         printf("normal_kernel (%3i rep).......", r); fflush(stdout);
         cudaEventRecord(start, 0);
         for(int i=0; i<r; ++i){
             normal_kernel<<<grid, block>>>(dmat, m, n);
             cudaDeviceSynchronize();
         }
         cudaEventRecord(stop,0);
         cudaEventSynchronize(stop);
         cudaEventElapsedTime(&time, start, stop); // that's our time!
         printf("done: %f ms\n", time/(float)r);
     }


     cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
     verify(hmat, m, n);
     exit(EXIT_SUCCESS);
 }
  • Use the CUDA profiler to find out. It will probably report significantly different efficiency of memory access, based on the different access patterns of the two kernels. – njuffa Feb 08 '18 at 18:37
  • ok, any interesting finding will post back – Cristobal Navarro Feb 08 '18 at 19:10
  • The CUDA profiler showed around 2X of L2 cache bandwidth performance between the normal and transposed grid. I will keep looking into it. – Cristobal Navarro Feb 28 '18 at 14:51
  • Added a picture to illustrate that it is still coalesced memory. – Cristobal Navarro Feb 28 '18 at 16:07
  • Adding the actual benchmarking data to the question would be far more constructive than a pretty picture of the transpose operation – talonmies Mar 04 '18 at 09:00
  • I'm not sure what you mean with 'threads are not transposed inside each block'. Looking at your code, the indexes are used differently, which means reads will be in consecutive order in one case and will be scattered across cache lines in the other, even within a block. – StarShine Mar 05 '18 at 12:17
  • StarShine, I mean that "blockIdx.x * blockDim.x" and "blockIdx.y * blockDim.y" are just offsets to the starting point of the block in the data. Then, each thread offsets itself with its "threadIdx" value. Both kernels apply the thread offset in the same way, so what you describe inside the block happens in both kernels. – Cristobal Navarro Mar 05 '18 at 12:23
  • After receiving help from the Nvidia forums, there is a high chance that the slowdown in performance comes from a particular behavior in the L2 cache associative mechanism for caching and replacing pages. For the moment it is a strong possibility. – Cristobal Navarro Mar 07 '18 at 13:36
  • Similar issues occur when using local memory, by way of a memory bank conflict based on the address ranges. Perhaps the driver internally follows similar paths for your particular setup. – StarShine Mar 10 '18 at 15:28

1 Answer


Since I could not find any literature on this topic, here is my guess at an explanation, based on experience (an old problem of mine with memory read speed).

As you wrote, your example preserves the memory coalescing pattern, but it does so only at the warp level (consecutive 32 threads). To achieve full speed, however, coalescing is also needed at the inter-warp level, and here it is not clear whether such coalescing actually happens, or whether the cache and memory simply work better in this scenario (probably, as described here, we get better utilization of the memory burst mode).

So in your normal_kernel execution, not only is a single warp coalesced, but so are warps from the next block(s).
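To make that concrete, here is a small host-side sketch (my own illustration, not part of the question's code) that prints the starting linear offset touched by thread (0,0) of the first few blocks under both mappings. In the normal grid, blocks with consecutive blockIdx.x start 32 ints apart (adjacent segments of the same rows), while in the transposed grid they start 32*n ints apart:

 // plain C++, compiles with e.g. g++ or nvcc; BSIZE2D and n taken from the question
 #include <cstdio>

 #define BSIZE2D 32

 int main(){
     const long n  = 10000; // matrix width, as in the question
     const int  by = 0;     // fix blockIdx.y and walk along blockIdx.x
     for(int bx = 0; bx < 4; ++bx){
         // normal grid:     thread (0,0) of block (bx,by) writes dmat[(by*32)*n + bx*32]
         const long normal = (long)by*BSIZE2D*n + (long)bx*BSIZE2D;
         // transposed grid: thread (0,0) of block (bx,by) writes dmat[(bx*32)*n + by*32]
         const long transp = (long)bx*BSIZE2D*n + (long)by*BSIZE2D;
         printf("block(%d,%d): normal base = %8ld   transposed base = %8ld\n",
                bx, by, normal, transp);
     }
     return 0;
 }

So while the accesses inside each warp stay coalesced in both kernels, blocks that are likely to be resident at the same time touch memory regions that are far apart in the transposed case, which plausibly changes how well the L2 cache and the DRAM burst behavior can be exploited.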

To check this on your example, I modified your code to use different block sizes; here are my results on a 1080 Ti:

Block size (32, 32) same as yours:

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
normal_kernel (100 rep).......done: 1.020545 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
transp_kernel (100 rep).......done: 1.564084 ms
verifying correctness.........ok

Block size (64, 16); unfortunately we cannot use (64, 64) because of the per-block thread limit:

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
normal_kernel (100 rep).......done: 1.020420 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
transp_kernel (100 rep).......done: 1.205506 ms
verifying correctness.........ok

Block size (128, 8):

~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
normal_kernel (100 rep).......done: 1.019547 ms
verifying correctness.........ok

~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
transp_kernel (100 rep).......done: 1.058236 ms
verifying correctness.........ok

The wider blockDim.x is, the smaller the gap between the normal and the transposed grid (1.56 ms → 1.21 ms → 1.06 ms, against ~1.02 ms for the normal case). I'm not sure if this helps with your particular problem, but at least we have some more data to discuss.

Krzysztof