I'm doing an assignment for my university, and the main Idea is to compare CUDA Data parallelism with CUDA Task parallelism. I came up with an idea to parallelize the Conway's game of life. The problem is, I cannot figure out how to navigate through an 2D array in CUDA in multiple directions, i.e. above/under/right/left and the corners around the cell, which the kernel evaluates.
So far I came up with following:
The first Kernel Code
//determines the alive cell and save value of each cell into an array
__global__ void numAliveAround(int *oldBoard, int *newBoard, int xSize, int ySize, size_t pitchOld, size_t pitchNew)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if(x < xSize && y < ySize)
{
//cell above
//xMod is to make sure the number wraps when it overflows the board
xMod = ((x + 1) % xSize + xSize) % xSize;
//idx calculation
idx = xMod * xSize + y;
outputNumber += board[idx];
//more of the same code, just for cell under, left, right, and corners
newBoard[x * xSize + y] = outputNumber;
}
}
The second Kernel code
//sets new cell status according to the number of alive cells around
__global__ void determineNextState(int *board, int *newBoard, int xSize, int ySize, size_t pitchOld, size_t pitchNew)
{
//getting threads
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < xSize && y < ySize)
{
int idxNew = x * xSize + y;
int idxOld = x * xSize + y;
int state = board[idxOld];
//ALIVE = 1, DEAD = 0;
int output = DEAD;
//checking if any alive condition is met
if (state == ALIVE)
{
if ((newBoard[idxNew] == 2 || newBoard[idxNew] == 3))
{
output = ALIVE;
}
}
else
{
if (newBoard[idxNew] == 3)
{
output = ALIVE;
}
}
newBoard[idxNew] = output;
}
}
Kernel calling function
void SendToCUDA(int oldBoard[COLUMNS][ROWS], int newBoard[COLUMNS][ROWS])
{
//CUDA pointers
int *d_oldBoard;
int *d_newBoard;
size_t pitchOld;
size_t pitchNew;
cudaMallocPitch(&d_oldBoard, &pitchOld, COLUMNS * sizeof(int), ROWS);
cudaMallocPitch(&d_newBoard, &pitchNew, COLUMNS * sizeof(int), ROWS);
cudaMemcpy2D(d_oldBoard, pitchOld, oldBoard, COLUMNS * sizeof(int), COLUMNS * sizeof(int), ROWS, cudaMemcpyHostToDevice);
dim3 grid(divideAndRound(COLUMNS, BLOCKSIZE_X), divideAndRound(ROWS, BLOCKSIZE_Y));
dim3 block(BLOCKSIZE_Y, BLOCKSIZE_X);
printf("counting \n");
numberAliveAround <<<block, grid>>> (d_oldBoard, d_newBoard, COLUMNS, ROWS, pitchOld, pitchNew);
cudaDeviceSynchronize();
printf("determining \n");
determineNextState <<<block, grid>>> (d_oldBoard, d_newBoard, COLUMNS, ROWS, pitchOld, pitchNew);
cudaDeviceSynchronize();
//using newBoard later (outside the function) to display the Board
cudaMemcpy2D(newBoard, COLUMNS * sizeof(int), d_newBoard, pitchNew, COLUMNS * sizeof(int), ROWS, cudaMemcpyDeviceToHost);
cudaFree(d_oldBoard);
cudaFree(d_newBoard);
}
I found multiple ways of accessing flattened 2d array, of which some contradict each other, like:
//what is usually used as an exmplanation
idx = x * widht + y;
//sometimes x and y are swapped
idx = y * width + x;
//what works with simple access
int *value = (int *)((char *)(d_matrix + y * pitch)) + x;
//or
idx = x * xDim + y + pitch;
the funny thing is that 2 later ones work when I just access a single point in the array (for example increase all the values in it by 1) but completely do not work with more complex navigation. I've been sitting on this Problem for quite some time at this point. So any kind of insight would be extremely helpful.