
In another post I asked about measuring time in C. Now I want to know how to compare the result of the C "function" vs. the OpenCL "function".

This is the host code for both the OpenCL and the plain C versions:

#define PROGRAM_FILE "sum.cl"
#define KERNEL_FUNC "float_sum"
#define ARRAY_SIZE 1000000


#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <CL/cl.h>

int main()
{
    /* OpenCL Data structures */

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel;    
    cl_command_queue queue;
    cl_mem vec_buffer, result_buffer;

    cl_event prof_event;

    /* ********************* */

    /* C Data Structures / Data types */
    FILE *program_handle; //Kernel file handle
    char *program_buffer; //Kernel buffer

    float *vec, *non_parallel;
    float *result; //heap-allocated below: a 4 MB array would risk overflowing the stack

    size_t program_size; //Kernel file size

    cl_ulong time_start, time_end, total_time = 0; //initialize before accumulating below

    int i;
    /* ****************************** */

    /* Errors */
    cl_int err;
    /* ****** */

    non_parallel = (float*)malloc(ARRAY_SIZE * sizeof(float));
    vec          = (float*)malloc(ARRAY_SIZE * sizeof(float));
    result       = (float*)malloc(ARRAY_SIZE * sizeof(float));

    //Initialize the vector of floats
    for(i = 0; i < ARRAY_SIZE; i++)
        vec[i] = i + 1;

    /************************* C Function **************************************/
    clock_t start, end;

    start = clock();

    for(i = 0; i < ARRAY_SIZE; i++)
    {
        non_parallel[i] = vec[i] * vec[i];
    }
    end = clock();
    printf("Number of seconds: %f\n", (end - start)/(double)CLOCKS_PER_SEC); //use "end", not a second clock() call

    free(non_parallel);
    /***************************************************************************/




    clGetPlatformIDs(1, &platform, NULL);//Just want NVIDIA platform
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

    // Context error?
    if(err)
    {
        perror("Cannot create context");
        return 1;
    }

    //Read the kernel file
    program_handle = fopen(PROGRAM_FILE,"r");
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);

    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    //Create the program
    program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, 
                    &program_size, &err);

    if(err)
    {
        perror("Cannot create program");
        return 1;
    }

    free(program_buffer);

    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    kernel = clCreateKernel(program, KERNEL_FUNC, &err);

    if(err)
    {
        perror("Cannot create kernel");
        return 1;
    }

    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);

    if(err)
    {
        perror("Cannot create command queue");
        return 1;
    }

    vec_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                sizeof(float) * ARRAY_SIZE, vec, &err);
    result_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*ARRAY_SIZE, NULL, &err);

    if(err)
    {
        perror("Cannot create the buffers");
        return 1;
    }

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &vec_buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &result_buffer);

    size_t global_size = ARRAY_SIZE;
    size_t local_size = 0; //unused: NULL is passed below, so the runtime picks the work-group size

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &prof_event);

    clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(float)*ARRAY_SIZE, result, 0, NULL, NULL);
    clFinish(queue);



    clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
    total_time += time_end - time_start;

    printf("\nAverage time in nanoseconds = %lu\n", total_time/ARRAY_SIZE);



    clReleaseMemObject(vec_buffer);
    clReleaseMemObject(result_buffer);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);

    free(vec);
    free(result);

    return 0;
}

And the kernel is:

__kernel void float_sum(__global float* vec, __global float* result){
    int gid = get_global_id(0);
    result[gid] = vec[gid] * vec[gid];
}

Now, the results are:

Number of seconds: 0.010000 <- This is the for the C code

Average time in nanoseconds = 140737284 <- OpenCL function

0.1407 seconds is the OpenCL kernel execution time, and it's more than the C function takes. Is that correct? Because I thought OpenCL should be faster than the non-parallel C algorithm...

  • I am very surprised by these results, especially because you are dividing the OpenCL time by the size of the array. Are you sure that you are timing the code properly? Are you using Windows or Linux? What GPU are you using? – KLee1 Apr 15 '12 at 07:53
  • Are you experimenting with this specific example? If you use the float4 type you can dot-product and sum 4 values in a single operation. I have answered below, assuming you aren't looking for such an optimization but rather for general OpenCL pointers. There is also a closed-form formula for the sum of squares you could use. – mfa Oct 26 '12 at 03:18

3 Answers


Executing parallel code on the GPU is not necessarily faster than executing it on the CPU. Take into account that, in addition to the computation, you also have to transfer the data to and from GPU memory.

In your example you are transferring 2 * N items and doing an O(N) operation in parallel, which is a very inefficient use of the GPU. Therefore, it's quite likely that the CPU is indeed faster for this particular computation.
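For completeness, the transfer cost can be measured with the same event-profiling mechanism used for the kernel. A minimal sketch against the question's queue, result_buffer, and result array (the read_event name is illustrative):

cl_event read_event;
cl_ulong read_start, read_end;

//Attach an event to the (blocking) read, then query its profiling timestamps
clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0,
        sizeof(float) * ARRAY_SIZE, result, 0, NULL, &read_event);
clWaitForEvents(1, &read_event);

clGetEventProfilingInfo(read_event, CL_PROFILING_COMMAND_START,
        sizeof(read_start), &read_start, NULL);
clGetEventProfilingInfo(read_event, CL_PROFILING_COMMAND_END,
        sizeof(read_end), &read_end, NULL);

printf("Device-to-host transfer: %llu ns\n",
        (unsigned long long)(read_end - read_start));

A fair CPU-vs-GPU comparison adds this (and the host-to-device copy) to the kernel time.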

Tudor
  • Is there any way to make a kernel more efficient? I'm thinking of parallelizing the data as well... – Apr 14 '12 at 17:50
  • @facundo: Not really. I'm afraid your only option is to run a more significant computation on the GPU. For example you could try to implement a matrix multiplication. – Tudor Apr 14 '12 at 17:52
  • This doesn't really make any sense. Only the kernel execution time was measured for this. It does not take into account data transfer. – KLee1 Apr 15 '12 at 07:48
  • A modest GPU will often beat a high-end CPU at this task. The transfer time was not measured, but it was only 4 MB up and 4 MB down. The example is a little bit contrived to begin with: sum of the first N squares = (N * (N + 1) * (2N + 1)) / 6. – mfa Oct 26 '12 at 03:28

Just for others coming here for help: a short introduction to profiling kernel runtime with OpenCL.

Enable profiling mode:

cmdQueue = clCreateCommandQueue(context, *devices, CL_QUEUE_PROFILING_ENABLE, &err);

Profiling kernel:

cl_event prof_event; 
clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, &prof_event);

Read profiling data in:

cl_ulong ev_start_time = (cl_ulong)0;
cl_ulong ev_end_time = (cl_ulong)0;

clFinish(cmdQueue);
err = clWaitForEvents(1, &prof_event);
err |= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ev_start_time, NULL);
err |= clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);

Calculate kernel execution time:

float run_time_gpu = (float)(ev_end_time - ev_start_time)/1000; // in usec

Your approach with

total_time/ARRAY_SIZE

is not what you want. It will give you the run time per work item.

Floating-point operations divided by the time in nanoseconds gives you GFLOPS (giga floating-point operations per second), since one operation per nanosecond is 10^9 operations per second.
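For example, with the question's kernel (one multiplication per element is an assumption read off its source):

//One multiply per array element; ev_start_time/ev_end_time as queried above
double flop = (double)ARRAY_SIZE;
double gflops = flop / (double)(ev_end_time - ev_start_time); //ops per ns == GFLOPS
printf("Throughput: %f GFLOPS\n", gflops);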

jaba

This is one big problem with your application:

size_t global_size = ARRAY_SIZE;
size_t local_size = 0;

You are creating single-item work groups, which will let most of the GPU sit idle. In many cases, using single-item work groups will only utilize 1/15th of your GPU.

Instead try this:

size_t global_size = ARRAY_SIZE; //total work items; must be evenly divisible by local_size
size_t local_size = 250; //a power of 2 works well; 250 is not one, but it divides your 1M array size evenly

Now you're creating large groups that will better saturate the ALUs of your graphics hardware. The kernel will run fine the way you have it now, but there are ways to get more out of the kernel portion too.
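To use these sizes, pass local_size explicitly to the enqueue call instead of NULL (a sketch against the question's host code):

//Explicit work-group size instead of letting the runtime choose
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size,
        0, NULL, &prof_event);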

Kernel optimization: pass ARRAY_SIZE into the kernel as an additional parameter, and use fewer groups of a more optimal group size. This also eliminates the need for global_size to be exactly equal to ARRAY_SIZE. The work item's global id is never used in this kernel, and it is not needed because the total size is passed in.

__kernel void float_sum(__global float* vec, __global float* result, int count){
  int lId = get_local_id(0);
  int lSize = get_local_size(0);
  int grId = get_group_id(0);
  int totalOps = count / get_num_groups(0); //elements handled per work group
  int startIndex = grId * totalOps;
  int endIndex = startIndex + totalOps;
  if(grId == get_num_groups(0) - 1){
    endIndex = count; //the last group also covers any remainder
  }
  for(int i = startIndex + lId; i < endIndex; i += lSize){
    result[i] = vec[i] * vec[i];
  }
}
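One host-side detail: since count is now passed by value, it has to be set as a third kernel argument before enqueueing; a minimal sketch using the question's variable names:

//count is a scalar kernel argument, not a buffer
cl_int count = ARRAY_SIZE;
clSetKernelArg(kernel, 2, sizeof(cl_int), &count);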

Now you might be thinking that there are an awful lot of variables for such a simple kernel. Remember that each execution of the kernel will do multiple operations on the data, rather than just one. Using the values below, on my Radeon 5870 (20 compute units), each work item ends up computing 781 or 782 values in its for loop. Each group computes 50000 pieces of data. The overhead of the variables I use is far less than the overhead of creating 4000 work groups -- or 1 million.

size_t global_size = ARRAY_SIZE / numComputeUnits;
size_t local_size = 64; //also try other multiples of 16 or 64 for gpu; or multiples of your core-count for a cpu kernel

See here about how to get the value for numComputeUnits
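That query itself is a one-liner; a sketch using the question's device handle:

//Ask the device how many compute units it has
cl_uint numComputeUnits;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
        sizeof(numComputeUnits), &numComputeUnits, NULL);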

mfa