I just begin to programming CUDA. Now I have a problem when I find the max value of matrix. I have a matrix and I have a window size(ie 2x2, 4x4, 8x8), and I find the max value of matrix for each window size.
Everything is ok if my matrix is small number, when matrix larger, ie 256x256 or 1024x1024, the order of max array is not true. For exmaple, the expected output when matrix large is: {4,5,9,1,2,3,7,9,9}. But, actually it is {4,5,1,2,9,3 ....} the order change in each time we run.
#include <stdio.h>
#include <stdlib.h>
__global__ void calculate_emax_kernel(float emap[],float emax[], int img_height, int img_width,int windows_size)
{
int x_index = blockIdx.x*blockDim.x+threadIdx.x;
int y_index = blockIdx.y*blockDim.y+threadIdx.y;
int num_row_block = img_height/windows_size;
int num_col_block = img_width/windows_size;
int index;
__shared__ float window_elements[256];
__shared__ int counter;
if(y_index >= img_height|| x_index >= img_width) return;
for(int i = 0; i < num_row_block; i++)
{
for(int j = 0; j < num_col_block; j++)
{
counter = 0;
if(y_index >= i*windows_size && y_index < (i+1)*windows_size
&& x_index >= j*windows_size && x_index < (j+1)*windows_size)
{
int idx = y_index*img_height + x_index;
index = atomicAdd(&counter, 1);
window_elements[index] = emap[idx];
__syncthreads();
//printf("row_block = %d ||col_block = %d || y_index =%d || x_index = %d || index = %d , window_elements[%d] = %d\n",i,j,y_index,x_index,(int)emap[idx],index,(int)window_elements[index]);
// reduction
unsigned int k = (windows_size*windows_size)/2;
while(k != 0)
{
if(index < k)
{
window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
//printf ("i = %d || j = %d || k = %d ||win_idx =%d ||%d = fmaxf( %d, %d) \n",i,j,k,index,(int)fmaxf(window_elements[index], window_elements[index+k]),
// (int)window_elements[index], (int)window_elements[index+k]);
}
k /= 2;
}
if(index == 0)
{
printf ("max = %d || index = %d \n",(int)window_elements[index],i*num_row_block+j);
emax[i*num_row_block+j] = window_elements[index];
}
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
void max_reduction_1D(float * array, float * max,int array_height , int array_width)
{
float * d_array;
float * d_max;
int *d_mutex;
int elements = array_height*array_width;
cudaMalloc((void**)&d_array, elements*sizeof(float));
cudaMalloc((void**)&d_max, sizeof(float));
cudaMalloc((void**)&d_mutex, sizeof(int));
cudaMemset(d_mutex, 0, sizeof(float));
cudaMemcpy(d_array, array, elements*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(array_width+blocksize.x-1)/blocksize.x;
gridsize.y=(array_height+blocksize.y-1)/blocksize.y;
calculate_emax_kernel<<< gridsize, blocksize >>>(d_array, d_max, array_height,array_width,8);
cudaMemcpy(max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
//printf("Maximum number is: %d \n",(int)*max);
cudaFree(d_array);
cudaFree(d_max);
}
int main()
{
int array_height = 6;
int array_width = 6;
unsigned int N = array_height*array_width;
float *h_array;
float *h_max;
h_array = (float*)malloc(N*sizeof(float));
h_max = (float*)malloc(sizeof(float));
float array[] = {1, 2, 2, 3, 6, 7, 5, 6,
3, 4, 4, 5, 8, 9, 7, 8,
1, 1, 2, 2, 3, 3, 4, 4,
1, 1, 2, 2, 3, 3, 4, 4,
4, 5, 6, 6, 9, 7, 1, 2,
6, 7, 8, 9, 8, 8, 2, 3,
1, 2, 3, 4, 5, 6, 7, 8,
8, 7, 6, 5, 4, 3, 2, 1
};
for(int i = 0; i < N; i++)
{
h_array[i] = array[i];
}
//max_reduction_1D(h_array, h_max, array_height , array_width);
max_reduction_1D(h_array, h_max, 32 ,32);
free(h_array);
}