4

Hi, I'm new to JOCL (OpenCL). I wrote this code to compute the sum of the intensities per image. The kernel takes a 1D array of all the pixels of all the images placed one after another. An image is 300x300, so that's 90000 pixels per image. At the moment it runs slower than when I do this sequentially.

My code:

package PAR;

/*
 * JOCL - Java bindings for OpenCL
 * 
 * Copyright 2009 Marco Hutter - http://www.jocl.org/
 */
import IMAGE_IO.ImageReader;
import IMAGE_IO.Input_Folder;
import static org.jocl.CL.*;

import org.jocl.*;

/**
 * A small JOCL sample.
 */
public class IPPARA {

    /**
     * The source code of the OpenCL program to execute
     */
    private static String programSource =
            "__kernel void "
            + "sampleKernel(__global uint *a,"
            + "             __global uint *c)"
            + "{"
            + "__private uint intensity_core=0;"
            + "      uint i = get_global_id(0);"
            + "      for(uint j=i*90000; j < (i+1)*90000; j++){ "
            + "              intensity_core += a[j];"
            + "     }"
            + "c[i]=intensity_core;" 
            + "}";

    /**
     * The entry point of this sample
     *
     * @param args Not used
     */
    public static void main(String args[]) {
        long numBytes[] = new long[1];

        ImageReader imagereader = new ImageReader() ;
        int srcArrayA[]  = imagereader.readImages();

        int size[] = new int[1];
        size[0] = srcArrayA.length;
        long before = System.nanoTime();
        int dstArray[] = new int[size[0]/90000];


        Pointer srcA = Pointer.to(srcArrayA);
        Pointer dst = Pointer.to(dstArray);


        // Obtain the platform IDs and initialize the context properties
        System.out.println("Obtaining platform...");
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a CPU device
        cl_context context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            // If no context could be created on the first attempt,
            // try once more (also for a CPU device).
            context = clCreateContextFromType(
                    contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);

            if (context == null) {
                System.out.println("Unable to create a context");
                return;
            }
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Get the list of devices associated with the context
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_id for the first device
        int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                Pointer.to(devices), null);

        // Create a command-queue
        cl_command_queue commandQueue =
                clCreateCommandQueue(context, devices[0], 0, null);

        // Allocate the memory objects for the input- and output data
        cl_mem memObjects[] = new cl_mem[2];
        memObjects[0] = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                Sizeof.cl_uint * srcArrayA.length, srcA, null);
        memObjects[1] = clCreateBuffer(context,
                CL_MEM_READ_WRITE,
                Sizeof.cl_uint * (srcArrayA.length/90000), null, null);

        // Create the program from the source code
        cl_program program = clCreateProgramWithSource(context,
                1, new String[]{programSource}, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        cl_kernel kernel = clCreateKernel(program, "sampleKernel", null);

        // Set the arguments for the kernel
        clSetKernelArg(kernel, 0,
                Sizeof.cl_mem, Pointer.to(memObjects[0]));
        clSetKernelArg(kernel, 1,
                Sizeof.cl_mem, Pointer.to(memObjects[1]));

        // Set the work-item dimensions
        long local_work_size[] = new long[]{1};
        long global_work_size[] = new long[]{(srcArrayA.length/90000)*local_work_size[0]};


        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                global_work_size, local_work_size, 0, null, null);

        // Read the output data
        clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0,
                (srcArrayA.length/90000) * Sizeof.cl_uint, dst, 0, null, null);

        // Release kernel, program, and memory objects
        clReleaseMemObject(memObjects[0]);
        clReleaseMemObject(memObjects[1]);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(commandQueue);
        clReleaseContext(context);


        long after = System.nanoTime();

        System.out.println("Time: " + (after - before) / 1e9);

    }
}

After the suggestions in the answers, the parallel code via the CPU is almost as fast as the sequential code. Are there any more improvements that can be made?

Olivier_s_j

2 Answers

2
 for(uint j=i*90000; j < (i+1)*90000; j++){ 
         c[i] += a[j];

1) You are using global memory (c[]) to accumulate the sum, and this is slow. Use a private variable to make it faster, something like this:

          "__kernel void "
        + "sampleKernel(__global uint *a,"
        + "             __global uint *c)"
        + "{"
        + "__private uint intensity_core=0;" <---this is a private variable of each core
        + "      uint i = get_global_id(0);"
        + "      for(uint j=i*90000; j < (i+1)*90000; j++){ "
        + "              intensity_core += a[j];" <---register is at least 100x faster than global memory
         //but we cannot get rid of a[] so the calculation time cannot be less than %50
        + "     }"
        + "c[i]=intensity_core;"   
        + "}";  //expecting %100 speedup

Now c[] holds one sum of intensities per image.

Your local work size is 1, so as long as you have at least 160 images (your GPU's core count) the calculation will use all cores.

You still need 90000*num_images global reads and num_images global writes, plus 90000*num_images register reads/writes. Using registers should roughly halve your kernel time.

2) You are doing only 1 arithmetic operation per 2 memory accesses. You need at least 10 arithmetic operations per memory access to use even a small fraction of your GPU's peak GFLOPS (250 GFLOPS peak for the 6490M).

Your i7 CPU can easily reach 100 GFLOPS, but your memory will be the bottleneck. It is even worse when you send the whole data set through PCI Express. (The HD Graphics 3000 is rated at 125 GFLOPS.)
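To put rough numbers on that (using the ballpark bandwidth figure from the comments below, not a measurement): the kernel does one addition per 4-byte uint it reads from global memory, so even at ~26 GB/s it cannot do better than about

    26 GB/s / 4 bytes per add  ≈  6.5 billion adds/s  ≈  6.5 GFLOPS

which is only a few percent of the 250 GFLOPS peak, regardless of how many cores are busy.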

 // Obtain a device ID 
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
    cl_device_id device = devices[deviceIndex];
    // One of the devices[] elements should be your HD 3000.
    // Example: devices[0] -> GPU, devices[1] -> CPU, devices[2] -> HD 3000

In your program:

 // Obtain the cl_device_id for the first device
    int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

takes the first device, which is probably the GPU.
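As an illustration (this sketch is not from the original answer), one way in JOCL to pick a specific device, such as the HD 3000, is to enumerate every platform and device and match on the device name. The "HD Graphics" string is an assumption; check what clGetDeviceInfo actually reports on your machine:

import static org.jocl.CL.*;
import org.jocl.*;

public class DevicePicker {

    // Returns the first device whose CL_DEVICE_NAME contains nameFragment,
    // e.g. findDeviceByName("HD Graphics"), or null if none is found.
    public static cl_device_id findDeviceByName(String nameFragment) {
        int numPlatforms[] = new int[1];
        clGetPlatformIDs(0, null, numPlatforms);
        cl_platform_id platforms[] = new cl_platform_id[numPlatforms[0]];
        clGetPlatformIDs(platforms.length, platforms, null);

        for (cl_platform_id platform : platforms) {
            int numDevices[] = new int[1];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, null, numDevices);
            cl_device_id devices[] = new cl_device_id[numDevices[0]];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices.length, devices, null);

            for (cl_device_id device : devices) {
                // Query the device name: the first call fetches the size, the second the bytes
                long size[] = new long[1];
                clGetDeviceInfo(device, CL_DEVICE_NAME, 0, null, size);
                byte buffer[] = new byte[(int) size[0]];
                clGetDeviceInfo(device, CL_DEVICE_NAME, buffer.length, Pointer.to(buffer), null);
                String name = new String(buffer, 0, buffer.length - 1);
                if (name.contains(nameFragment)) {
                    return device;
                }
            }
        }
        return null;
    }
}

You could then create the context for just that device with clCreateContext instead of clCreateContextFromType.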

huseyin tugrul buyukisik
  • And would you be able to give some insight on how to do this with my current code ? – Olivier_s_j Nov 24 '12 at 17:08
  • What's your CPU/GPU? Your local work size is 1, which is too low. – huseyin tugrul buyukisik Nov 24 '12 at 17:10
  • Graphics AMD Radeon HD 6490M 256 MB || Processor Intel(R) Core(TM) i7-2635QM CPU @ 2.00GHz thx for trying :) – Olivier_s_j Nov 24 '12 at 17:17
  • You have 160 processing elements, right? Just divide 90000 by 160 and you get about 560 pixels per core. This is (maybe) the optimal ratio. – huseyin tugrul buyukisik Nov 24 '12 at 17:20
  • "Did you try default or bigger than-1 local-work size?", I do not think so. I did not change this when I started from the example code long local_work_size[] = new long[]{1}; – Olivier_s_j Nov 24 '12 at 17:23
  • With your example I won't get the intensity sum per image, I think. I'll just get the sum of all the intensities of all the images. But I need them per image (each image has 90000 pixels). – Olivier_s_j Nov 24 '12 at 17:25
  • What you wrote in your question calculates the sum of all of a picture's pixels in each core, so you do the same calculation 160 times. That's why the computation is so slow. – huseyin tugrul buyukisik Nov 24 '12 at 17:29
  • I thought that would speed it up :p. Take the sum of the intensities per image per core. That's why I'm here, to get some help on how this can be enhanced :). But to make myself clearer: if I have for example 10,000 images I have 10,000*300*300 pixels. I need 10,000 values (so a sum per image, not 1 big sum of all images). – Olivier_s_j Nov 24 '12 at 17:31
  • Does your c[] have one image or `all` images? If all, you are right. But keep in mind that your CPU could be better than your GPU :) – huseyin tugrul buyukisik Nov 24 '12 at 17:33
  • Yes c[] has all the pixels of all the images. And yes I know that is a possibility. But with this code I can test parallel cpu and also gpu .. one of the two should be faster I hope – Olivier_s_j Nov 24 '12 at 17:34
  • Use local/private variables for the core sums so it doesn't access main memory each time, just like I wrote in the answer. Your GPU's global memory bandwidth is 14-26 GB/s, but local variables are more than 200 GB/s and register (private) bandwidth is more than 1200 GB/s. – huseyin tugrul buyukisik Nov 24 '12 at 17:34
  • example ? (really new to jocl/ opencl :s) – Olivier_s_j Nov 24 '12 at 17:35
  • Okay, add pixels in a __private variable per core to make it faster. Then in the end, put all private variables to global memory. – huseyin tugrul buyukisik Nov 24 '12 at 17:39
  • Keep in mind that 90000 elements stride is not making any global-memory bank-conflicts. – huseyin tugrul buyukisik Nov 24 '12 at 17:46
  • Adding the local variable slows it down it seems. And I can use up to 1500 images btw. – Olivier_s_j Nov 24 '12 at 17:59
  • It shouldn't be slowing it down; did you try __private also? – huseyin tugrul buyukisik Nov 24 '12 at 18:01
  • It is faster than before (especially on the GPU), but not faster than the sequential calculations. Do you reckon it is not possible to be faster than the sequential calculations? – Olivier_s_j Nov 24 '12 at 18:15
  • If you are not doing 10 maths per memory access, you are not using your cores. You are using memory only :(. Try to use your HD 3000 (Intel) for calculations. Closer to memory :) – huseyin tugrul buyukisik Nov 24 '12 at 18:17
  • Is that possible ? :o, but isn't that a slower one ? How would that be done in code ? – Olivier_s_j Nov 24 '12 at 18:27
  • I mean the `ratio` `(10 math):(1 memory)`. Your intensity-sum operation needs 1:1, so it is bounded by memory bandwidth. Rather than sending to the GPU, you can just use the CPU (OpenCL on the CPU or Intel HD 3000). You have many cores eating faster than your CPU can throw data at them. They are hungry animals. The poor PCI Express (and memory) cannot keep up. – huseyin tugrul buyukisik Nov 24 '12 at 18:29
  • Ok I'll look into that tomorrow :) – Olivier_s_j Nov 24 '12 at 18:48
  • "Stride" term is important to not to have any "memory bank conflict". Please look at this from sources of internet. – huseyin tugrul buyukisik Nov 24 '12 at 18:49
  • Hi, if you are stilling willing to help. Mind helping by telling me how i can access the intel HD 3000 via my code ? – Olivier_s_j Nov 25 '12 at 09:53
0

You should be using an entire work group per 300x300 image. This will help saturate the GPU cores and let you use local memory. The kernel should also be able to process as many images simultaneously as you have compute units on your device.

The kernel below does your reduction in three steps.

  1. read the values into one private variable per work item
  2. write the private variable to local memory (very simple step, but important)
  3. reduce the values in local memory to get the final value. Two ways to do this are shown here.

WG_MAX_SIZE is defined because I am not a fan of passing in variable-sized local memory blocks. The value is 64 because that is a good value on most platforms. Make sure you set this value higher if you want to experiment with larger work groups. Work groups smaller than WG_MAX_SIZE will still work fine.

#define WORK_SIZE 90000
#define WG_MAX_SIZE 64
__kernel void sampleKernel(__global uint *a, __global uint *c)
{

    local uint intensity_core[WG_MAX_SIZE];
    private uint workItemIntensity = 0;

    int gid = get_group_id(0);
    int lid = get_local_id(0);
    int wgsize = get_local_size(0);
    int i;

    // each work item starts at its own local id and strides by the work group size
    for(i=gid*WORK_SIZE+lid; i < (gid+1)*WORK_SIZE; i+=wgsize){ 
        workItemIntensity += a[i];
    }
    intensity_core[lid] = workItemIntensity;
    barrier(CLK_LOCAL_MEM_FENCE); // barrier, not mem_fence, so all work items see the local writes

    //option #1 (use either this or option #2 below, not both in the same kernel)
    //loop to reduce the final values in O(n) time
    if(lid == 0){
        for(i=1;i<wgsize;i++){
            workItemIntensity += intensity_core[i];
        }
        c[gid]=workItemIntensity;
    }

    //option #2
    //O(logn) time reduction
    //assumes work group size is a power of 2
    int steps = 32 - clz(wgsize);
    for(i=1;i<steps;i++){
        if(lid % (1 << i) == 0){
            intensity_core[lid] += intensity_core[lid + (1 << (i-1))];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0){
        c[gid]=intensity_core[0];
    }
}
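To actually use a work group per image, the host-side launch from the question would also have to change (it currently uses a local work size of 1). A minimal sketch, not part of the original answer, assuming 64 work items per group to match WG_MAX_SIZE:

int numImages = srcArrayA.length / 90000;
long local_work_size[]  = new long[]{64};
long global_work_size[] = new long[]{(long) numImages * 64};

// one work group of 64 work items per image
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
        global_work_size, local_work_size, 0, null, null);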
mfa