I want to convert an OpenMP program to CUDA C.
I tried to find my way through the web and the SDK, but the material is beyond my level.
My C program loops over n = 2^30 indices and adds up the weight of each index.

1) What are the correct grid_size and block_size?
My guess is to replicate OpenMP and do:

grid_size=n/max_number_of_cuda_threads;
block_size=1;

2) How can I implement the OpenMP reduction in CUDA?
I tried a cudaMemcpy followed by a reduction in standard C, but it seems slow.
I looked at the Thrust library and its reduce operator, but I don't see how to integrate it with my current code.

program.c

#include <math.h>
#include <omp.h>

float get_weigth_of_index(long index,float* data){
    int i;
    float v=0;
    for(i=0;i<4;i++)
        v+=index*data[i];
    return v;
}

int main(){
    long i;
    float r=0;
    long n=pow(2,30);
    float data[4]={0,1,2,3};
    #pragma omp parallel for reduction (+:r)
    for(i=0;i<n;i++)
        r+=get_weigth_of_index(i,data);
    return 0;
}

program.cu

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <math.h>

__device__ float get_weigth_of_index(long index,float* data){
    int i;
    float v=0;
    for(i=0;i<4;i++)
        v+=index*data[i];
    return v;
}

__global__ void looper(long max_number_of_cuda_threads, float* data,float* result){
    long bid=blockIdx.x;
    long start=bid*max_number_of_cuda_threads;
    long end=start+max_number_of_cuda_threads;
    long i;
    float r=0;
    for(i=start;i<end;i++)
        r+=get_weigth_of_index(i,data);
    result[bid]=r;
}

int main(){
    long n=pow(2,30);
    int max_number_of_cuda_threads=1024; // I'm not sure this is correct
    long grid_size=n/max_number_of_cuda_threads; // 2^20 blocks, which is over this card's 65535 grid-dimension limit
    long block_size=1;

    float data_host[4]={0,1,2,3};
    float* data_device=0;
    float* result_device=0;
    cudaMalloc((void**)&data_device, sizeof(float)*4);
    cudaMemcpy(data_device, data_host, sizeof(float)*4, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&result_device, sizeof(float)*grid_size);

    looper<<<grid_size,block_size>>>(max_number_of_cuda_threads,data_device,result_device);

    //reduction with standard c: cudaMemcpy seems slow
    float* result_host=(float*)malloc(sizeof(float)*grid_size);
    cudaMemcpy(result_host, result_device, sizeof(float)*grid_size, cudaMemcpyDeviceToHost); 

    long i;
    float v=0;
    #pragma omp parallel for reduction(+:v)
    for(i=0;i<grid_size;i++)    
        v+=result_host[i];
    printf("result:%f",v);

    return 0;
}

My GPU card:

Device 0: "Tesla M2050"
  Number of multiprocessors:                     14
  Number of cores:                               448
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 1
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
    You might want to read [this answer](http://stackoverflow.com/a/5643838/681865) and [this answer](http://stackoverflow.com/a/9986748/681865) to get a better feel for what should influence your choice of block and grid sizes. – talonmies May 26 '12 at 15:19
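
In short, those answers recommend many threads per block (a multiple of the warp size, e.g. 256) and a launch shape decoupled from n via a grid-stride loop. A minimal sketch of that pattern (the block and grid numbers are illustrative, not tuned; it assumes the same get_weigth_of_index device function as above):

__global__ void looper(long n, float* data, float* result){
    long tid = (long)blockIdx.x * blockDim.x + threadIdx.x;
    long stride = (long)gridDim.x * blockDim.x;
    float r = 0;
    for(long i = tid; i < n; i += stride) // each thread sums a strided slice of [0,n)
        r += get_weigth_of_index(i, data);
    result[tid] = r; // one partial sum per thread; reduce these afterwards
}

// launch: enough blocks to keep the 14 multiprocessors busy, well under the 65535 grid limit
int block_size = 256;
int grid_size = 14 * 8;
// result_device must hold grid_size * block_size floats
looper<<<grid_size, block_size>>>(n, data_device, result_device);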

1 Answer

I think that thrust::transform_reduce can solve your problem. This code shows how you can use it:

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h>
#include <cmath>
#include <iostream>

struct get_weigth_of_index
{

    get_weigth_of_index(float* data, size_t n)
    {
        // copy the weight coefficients to the device once, up front
        cudaMalloc((void**)&_data, n * sizeof(float));
        cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
        _n = n;
    }

    float* _data;
    size_t _n;
    __host__ __device__
    float operator()(const float& index) const // float: matches the device_vector<float> below
    { 
        float v=0;
        for(size_t i=0; i<_n; i++)
            v += index * _data[i];
        return v;
    }
};

int main(void)
{

    float x[4] = {1.0, 2.0, 3.0, 4.0};

    size_t len = 1024;  // set to your number of indices
    float* index;       // allocate and fill your index array here
    // transfer to device
    thrust::device_vector<float> d_index(index, index + len);

    get_weigth_of_index unary_op(x, 4);
    thrust::plus<float> binary_op;
    float init = 0;

    float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);

    std::cout << sum << std::endl;

    return 0;
}
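
One caveat with this functor: it cudaMallocs _data in its constructor but never frees it, so in anything beyond a one-shot program you would want a cudaFree(unary_op._data) once the reduction is done (the sketch below does this).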
  • Is there a `thrust` operator that doesn't require initializing the `index` array? If I take a very large `index`, say `long index={1,..,2**40}`, `index` won't fit in device memory, and it would be better if `thrust` iterated through the values. – Nicolas Essis-Breton May 27 '12 at 02:40
  • @NicolasEssis-Breton: yes, see `thrust::counting_iterator` – talonmies May 27 '12 at 05:50
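
For reference, a minimal sketch of that counting_iterator suggestion. It reuses the functor from the answer (typed for long indices here) so the indices are generated on the fly on the device and no index array is ever allocated:

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>

struct get_weigth_of_index // same functor as in the answer, taking a long index
{
    float* _data;
    size_t _n;

    get_weigth_of_index(float* data, size_t n) : _n(n)
    {
        cudaMalloc((void**)&_data, n * sizeof(float));
        cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
    }

    __host__ __device__
    float operator()(const long& index) const
    {
        float v = 0;
        for(size_t i = 0; i < _n; i++)
            v += index * _data[i];
        return v;
    }
};

int main(void)
{
    float x[4] = {1.0, 2.0, 3.0, 4.0};
    long n = 1L << 30;

    get_weigth_of_index unary_op(x, 4);

    // counting_iterator generates 0,1,...,n-1 on the fly; nothing of size n is stored
    float sum = thrust::transform_reduce(thrust::counting_iterator<long>(0),
                                         thrust::counting_iterator<long>(n),
                                         unary_op, 0.0f, thrust::plus<float>());

    std::cout << sum << std::endl;

    cudaFree(unary_op._data); // the functor does not free its device buffer itself
    return 0;
}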