How to parallelize four nested for loops in cuda
in case of dct i have four nested for loops i want my dct function in cuda code
for(y = 0; y < HEIGHT; y+=BLOCK_H) {
for(x = 0; x < WIDTH; x+= BLOCK_W) {
for(i = 0; i < BLOCK_H; i++) {
for(j = 0; j < BLOCK_W; j++) {
block_in[i][j] = cur_frame[(x+j)+(WIDTH*(y+i))];
}
}
}
}