I'm planning to build image processing using CUDA. To represent an image I use a matrix (values are randomly generated). I want to apply average filter to this matrix. The filter size I used is 3. Here is the code I have written. This works fine when the number (N = 10) is less than the block dimension size (BLOCK_DIM = 32). I tried with N=5 and BLOCK_DIM = 3. It works fine.
Why does this code result unexpected results (0 instead of average) when the BLOCK_DIM increases, how can I solve this ?
#include <stdio.h>
#include <stdlib.h>
#define N 10
#define BLOCK_DIM 32
__global__ void averageKernel (int *a, int *c) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
c[index] = 1;
int sum = 0;
int avg = 0;
if (row > 0 && col > 0 && col < N-1 && row < N-1 ) {
sum = sum + a[index - 1];
sum = sum + a[index + 1];
sum = sum + a[index - N-1];
sum = sum + a[index - N];
sum = sum + a[index - N+1];
sum = sum + a[index + N-1];
sum = sum + a[index + N];
sum = sum + a[index + N+1];
sum = sum + a[index];
avg = sum/9;
}
c[index] = avg;
}
void printMatrix(int a[N][N] )
{
for(int i=0; i<N; i++){
for (int j=0; j<N; j++){
printf("%d\t", a[i][j] );
}
printf("\n");
}
}
int main() {
int a[N][N], c[N][N];
int *dev_a, *dev_c;
int size = N * N * sizeof(int);
for(int i=0; i<N; i++)
for (int j=0; j<N; j++){
a[i][j] = rand() % 256;
}
printf("Matrix A\n");
printMatrix(a);
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_c, size);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);
printf("dimGrid.x = %d, dimGrid.y = %d\n", dimGrid.x, dimGrid.y);
averageKernel<<<dimGrid,dimBlock>>>(dev_a,dev_c);
cudaDeviceSynchronize();
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
printf("Matrix c\n");
printMatrix(c);
cudaFree(dev_a);
cudaFree(dev_c);
}