Find max of matrix with window size in CUDA

Question

I just begin to programming CUDA. Now I have a problem when I find the max value of matrix. I have a matrix and I have a window size(ie 2x2, 4x4, 8x8), and I find the max value of matrix for each window size.

Everything is ok if my matrix is small number, when matrix larger, ie 256x256 or 1024x1024, the order of max array is not true. For exmaple, the expected output when matrix large is: {4,5,9,1,2,3,7,9,9}. But, actually it is {4,5,1,2,9,3 ....} the order change in each time we run.

#include <stdio.h>
#include <stdlib.h>

   __global__ void calculate_emax_kernel(float emap[],float emax[],         int img_height, int img_width,int windows_size)
    {
    int x_index = blockIdx.x*blockDim.x+threadIdx.x;
    int y_index = blockIdx.y*blockDim.y+threadIdx.y;

    int num_row_block = img_height/windows_size;
    int num_col_block = img_width/windows_size;
    int index;
    __shared__ float window_elements[256];
    __shared__ int counter;


    if(y_index >= img_height|| x_index >= img_width) return;
    for(int i = 0; i < num_row_block; i++)
    {
        for(int j = 0; j < num_col_block; j++)
        {
            counter = 0;
            if(y_index >= i*windows_size && y_index < (i+1)*windows_size
                && x_index >= j*windows_size && x_index < (j+1)*windows_size)
            {
                int idx = y_index*img_height + x_index;
                index = atomicAdd(&counter, 1);

                window_elements[index] = emap[idx];
                __syncthreads();
            //printf("row_block = %d ||col_block = %d || y_index =%d || x_index = %d || index = %d , window_elements[%d] = %d\n",i,j,y_index,x_index,(int)emap[idx],index,(int)window_elements[index]);

            // reduction
            unsigned int k = (windows_size*windows_size)/2;
            while(k != 0)
            {
                if(index < k)
                {
                    window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
                    //printf ("i = %d || j = %d || k = %d ||win_idx =%d ||%d = fmaxf( %d, %d) \n",i,j,k,index,(int)fmaxf(window_elements[index], window_elements[index+k]),
                    //                                                                              (int)window_elements[index], (int)window_elements[index+k]);
                }
                k /= 2;
            }
            if(index == 0)
            {
                printf ("max = %d || index = %d \n",(int)window_elements[index],i*num_row_block+j);
                emax[i*num_row_block+j] = window_elements[index];
            }
        }
        __syncthreads();
    }
    __syncthreads();
}
__syncthreads();
}


void max_reduction_1D(float * array, float * max,int array_height , int array_width)
{
    float * d_array;
    float * d_max;
    int *d_mutex;
    int elements = array_height*array_width;
    cudaMalloc((void**)&d_array, elements*sizeof(float));
    cudaMalloc((void**)&d_max, sizeof(float));
    cudaMalloc((void**)&d_mutex, sizeof(int));
    cudaMemset(d_mutex, 0, sizeof(float));

    cudaMemcpy(d_array, array, elements*sizeof(float), cudaMemcpyHostToDevice);

    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(array_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(array_height+blocksize.y-1)/blocksize.y;

    calculate_emax_kernel<<< gridsize, blocksize >>>(d_array, d_max, array_height,array_width,8);

    cudaMemcpy(max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
    //printf("Maximum number is: %d \n",(int)*max);

    cudaFree(d_array);
    cudaFree(d_max);
}

int main()
{
    int array_height = 6;
    int array_width = 6;
    unsigned int N = array_height*array_width;
    float *h_array;
    float *h_max;
    h_array = (float*)malloc(N*sizeof(float));
    h_max = (float*)malloc(sizeof(float));
    float array[] = {1, 2, 2, 3, 6, 7, 5, 6,
                 3, 4, 4, 5, 8, 9, 7, 8,
                 1, 1, 2, 2, 3, 3, 4, 4,
                 1, 1, 2, 2, 3, 3, 4, 4,
                 4, 5, 6, 6, 9, 7, 1, 2,
                 6, 7, 8, 9, 8, 8, 2, 3,
                 1, 2, 3, 4, 5, 6, 7, 8,
                 8, 7, 6, 5, 4, 3, 2, 1
    };

for(int i = 0; i < N; i++)
{
    h_array[i] = array[i];
}
//max_reduction_1D(h_array, h_max, array_height , array_width);
max_reduction_1D(h_array, h_max, 32 ,32);
free(h_array);
}

@kangshiyin because I cannot applied your code. First i just understand a lit of bit your ideal and cannot applied in my code. Second, I just implement in CUDA without thrust. Could you help me in more details. Thank you very much — Nguyễn Cường, Jul 21 '16 at 18:49
@kangshiyin I don't understand your algorithms, can you instruction me step by step. Thanks a lot. — Nguyễn Cường, Jul 21 '16 at 19:18

Find max of matrix with window size in CUDA

0 Answers0