EDIT: It seems that, at least in this case, transposing the grid has a negative effect on the L2 Cache Bandwidth. This was obtained from the visual profiler. The reason why is not clear to me yet.
I have come to a GPU computing situation in which require to transpose a CUDA grid. So, if block_{x,y} originally acted on data region d_{x,y}, now it acts on data region d_{y,x}, therefore block_{y,x} would act on data region d_{x,y}. An example is presented in the following figure.
It is worth mentioning that threads are not transposed inside each block, that is, once the block is located, the threadIdx.x and threadIdx.y values are used in a normal way for their x and y offsets, respectively.
From what I know, in theory this design should do no harm in performance, as the memory coalescing pattern is still preserved, i.e., threads inside a block are not transposed, it is just the grid that re-arranged its blocks. However I found that when transposing the grid, the kernel runs approx. 2X slower than in the normal case. I made a toy example to illustrate the situation.
➜ transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
➜ transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok
I would really appreciate any explanation for this issue. Here is the source code to reproduce the behavior.
// -----------------------------------
// can compile as nvcc main.cu -o prog
// -----------------------------------
#include <cuda.h>
#include <cstdio>
#define BSIZE2D 32
__global__ void normal_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.y*blockDim.y + threadIdx.y;
const int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
__global__ void transp_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.x*blockDim.x + threadIdx.y;
const int j = blockIdx.y*blockDim.y + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
int verify(int *hmat, const int m, const int n){
printf("verifying correctness........."); fflush(stdout);
for(int i=0; i<m*n; ++i){
if(hmat[i] != 1){
fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n);
return 0;
}
}
printf("ok\n"); fflush(stdout);
return 1;
}
int main(int argc, char **argv){
if(argc != 5){
printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
exit(EXIT_FAILURE);
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int r = atoi(argv[3]);
const int t = atoi(argv[4]);
const unsigned int size = m*n;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float time;
int *hmat, *dmat;
printf("init data....................."); fflush(stdout);
hmat = (int*)malloc(sizeof(int)*(size));
for(int i=0; i<size; ++i){
hmat[i] = 0;
}
printf("done: zero matrix of %i rows x %i cols\n", m, n);
printf("copy data to GPU.............."); fflush(stdout);
cudaMalloc(&dmat, sizeof(int)*(size));
cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
printf("done\n");
printf("preparing grid................"); fflush(stdout);
dim3 block(BSIZE2D, BSIZE2D, 1);
dim3 grid;
// if transpose or not
if(t){
grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
}
else{
grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
}
printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);
if(t){
printf("transp_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
transp_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
else{
printf("normal_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
normal_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
verify(hmat, m, n);
exit(EXIT_SUCCESS);
}