Shouldn't be 3x3 convolution much faster on GPU (OpenCL)

Question

I'm learning how to optimize code for GPU. I read about importance of memory locality. I've also seen some tutorials and examples of GPU convolution. Based on that I wrote and tested several own kernels. Surprisingly I found that the simplest naive kernell is the fastest!? and it is jut <10x faster than CPU. (Yes I amortized upload/download time by running the kenrnel 64x).

What I do wrong? I woud expect that convolution is just that kind of operation for which GPUs are optimized. If I can get 100x speed-up on matrix multiplication, why convolution is so slow?

performance [CPU ticks/pixel] (lower is better):

CPU-naive 9.5
GPU-naive 1.64
GPU-local 2.56
GPU-local_async 15.10
GPU-scanline-private 7.35
GPU-scanline_async 15.37

EDIT: GPU-scanline_async I made later after reading advices about async_work_group_copy

I wonder 2 things:

Is the kernel speed limited by memory bandwidth or by computing power? From what I read I would expect memory. But the results of the tests show the opposite.
- Kernel GPU-local is slower than GPU-naive even though it does much less global memory reads
- modification of the kernel by gaussian-filter coeffs (i.e. add multiplication per each pixel) makes it >2x slower, although it does same number of memory reads
- But if it is limited by processing power than why I get 100x faster matrix-multiplication on GPU than on CPU ?
why the kernel GPU-scanline-private is so slow? The memory locality is much better (just 3 instead of 9 reads from global memory per pixel) and the logic is minimal (no ifs/switches)

The test was done on my laptop with CPU Intel Core i7 6700HQ Skylake and GPU nVidia 960M by running the kernels 64x/frame on floating point array of 256x256 pixels. The code full can be seen here.

=========== Kernel codes ===========

kernel GPU-Naive 2D global=(256,256) local=(16,16)

__kernel void blur2D_naive(
    __global float* I, 
    __global float* O
){
    const int ix = get_global_id (0)+1;
    const int iy = get_global_id (1)+1;
    const int nx = get_global_size(0)+2;

    int i = iy * nx + ix;

    // 1.6 ticks/pixel
    O[i] =( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
            I[i   -1] + I[i   ] + I[i   +1] +
            I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * 0.11111111111;
    // modified with gaussian mask 4.9 ticks/pixel
    //O[i] =( 0.0625*I[i-nx-1] + 0.125*I[i-nx] + 0.0625*I[i-nx+1] +
    //        0.125 *I[i   -1] + 0.25 *I[i   ] + 0.125 *I[i   +1] +
    //        0.0625*I[i+nx-1] + 0.125*I[i+nx] + 0.0625*I[i+nx+1] );
}

kernel GPU-local 2D global=(256,256) local=(16,16)

#define NBx 18 // tile size including borders [halo] 16+2
#define NBy 18
// seems to be slower than naive method
__kernel void blur2D_local(
    __global float* I, 
    __global float* O
){
    __local float L[NBx*NBy];
    const int2 iG  = (int2)(get_global_id  (0)+1 , get_global_id  (1)+1 );
    const int2 nG  = (int2)(get_global_size(0)+2 , get_global_size(1)+2 );
    const int2 iL  = (int2)(get_local_id   (0)+1 , get_local_id   (1)+1 );
    const int2 nL  = (int2)(get_local_size (0)+2 , get_local_size (1)+2 );
    const int2 iGR = (int2)(get_group_id   (0)   , get_group_id   (1)   );

    // copy boundary pixels to local memory
    switch( get_local_id(1) ){ // some threads copy one more of boundary (halo) pixels
        case 4: 
        switch( get_local_id(0) ){ // copy corner points
            case 0: L[        0      ] = I[ nG.x* get_group_id(1)*get_local_size(1)          + get_group_id(0)*get_local_size(0)         ]; break; // upper-left
            case 1: L[         NBx-1 ] = I[ nG.x* get_group_id(1)*get_local_size(1)          + get_group_id(0)*get_local_size(0)+(NBx-1) ]; break; // upper-right
            case 2: L[ (NBy-1)*NBx   ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0)         ]; break; // lower-left
            case 3: L[ NBy*    NBx-1 ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0)+(NBx-1) ]; break; // lower-rigth
        }
        // copy border lines 
        case 0: L[               iL.x    ] = I[ nG.x* get_group_id(1)*get_local_size(1)                   + iG.x                                        ]; break; // top    line
        case 1: L[ NBx*(NBy-1) + iL.x    ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)         ) + iG.x                                        ]; break; // botton line
        case 2: L[ NBx*iL.x              ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0) ) +  get_group_id(0)*get_local_size(0)          ]; break; // left   line
        case 3: L[ NBx*iL.x    + (NBx-1) ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0) ) + (get_group_id(0)*get_local_size(0)+(NBx-1)) ]; break; // right  line
    } // each thread coppied at max. 1 border pixels

    int ig = iG.y*nG.x + iG.x;
    int il = iL.y*nL.x + iL.x;
    L[il] = I[ig];             // each thread copy his pixel to local memory

    barrier(CLK_LOCAL_MEM_FENCE);

    const float renorm = 1.0/9.0;
    O[ig] =( L[il-NBx-1] + L[il-NBx] + L[il-NBx+1] +
             L[il    -1] + L[il    ] + L[il    +1] +
             L[il+NBx-1] + L[il+NBx] + L[il+NBx+1] ) / 9.0;
}

kernel GPU-local_async 2D global=(256,16) local=(16,16)

#define nTiles 16
#define NBx 18
#define NBy 18 
#define copy_tile(event,ig0,I,L) { int ig_=ig0; int il_=0; for(int i=0; i<NBy; i++){   event = async_work_group_copy( L+il_, I+ig_, NBx, event ); ig_+=nx; il_+=NBx; } }
// https://streamcomputing.eu/blog/2014-06-19/using-async_work_group_copy-on-2d-data/
__kernel void blur2D_local_async(
    __global float* I, 
    __global float* O
){
    const int nx = get_global_size(0)+2;        
    __local float LI[NBx*NBy*2];
    int iL0 = 0;
    int iL1 = NBx*NBy;        
    event_t event = 0;
    int ig0 = get_group_id(0)*get_local_size(0);
    copy_tile(event,ig0,I,LI);
    for( int it=0; it<nTiles; it++ ){
        int ig   = ig0 + (get_local_id(1)+1)*nx  + get_local_id(0)+1;
        int il   =       (get_local_id(1)+1)*NBx + get_local_id(0) + iL0;
        ig0     += get_local_size(1)*nx;
        event_t event_ = 0;
        copy_tile(event_,ig0,I,LI+iL1);
        wait_group_events(1, &event);
        //barrier(CLK_LOCAL_MEM_FENCE);
        O[ig] =( LI[il-NBx] + LI[il-NBx+1] + LI[il-NBx+2] +
                 LI[il    ] + LI[il    +1] + LI[il    +2] +
                 LI[il+NBx] + LI[il+NBx+1] + LI[il+NBx+2] ) * 0.11111111111;
        int iLtmp=iL0; iL0=iL1; iL1=iLtmp;
        event = event_;
    }
}

kernel GPU-scanline_private 1D global=(256) local=(32)

__kernel void blur2D_scanline_priv(
    int nx, int ny,
    __global float* I, 
    __global float* O
){ 
    int ig    = get_global_id(0)+1;
    float3 Lm = (float3)( I[ig-1], I[ig], I[ig+1] );  ig += nx;
    float3 L0 = (float3)( I[ig-1], I[ig], I[ig+1] ); 
    for(int iy=1; iy<(ny-1); iy++ ){
        ig += nx;
        float3 Lp= (float3)( I[ig-1], I[ig], I[ig+1] );  
        O[ig-nx] = 
            ( Lm.x + Lm.y + Lm.z +
              L0.x + L0.y + L0.z +
              Lp.x + Lp.y + Lp.z ) * 0.11111111111;              
        Lm=L0; L0=Lp; 
    }
}

kernel GPU-scanline_async 1D global=(256) local=(32)

 #define NB 34
__kernel void blur2D_scanline_async(
    int nx, int ny,
    __global float* I, 
    __global float* O
){
    __local float  L[NB*4];
    int i0=0;
    int i1=NB;
    int i2=NB*2;
    int i3=NB*3;
    event_t event = 0;
    int ig0 = get_group_id(0)*get_local_size(0);
    event = async_work_group_copy(  L     , I+ig0, NB, event );    ig0 += nx;
    event = async_work_group_copy(  L+NB  , I+ig0, NB, event );    ig0 += nx;   
    event = async_work_group_copy(  L+NB*2, I+ig0, NB, event );    ig0 += nx;
    const int il = get_local_id(0);
    int ig = get_global_id(0)+1;
    for(int iy=1; iy<(ny-2); iy++ ){
        wait_group_events(1, &event);
        event = async_work_group_copy(  L+i3, I+ig0, NB, event ); ig0 += nx;
        ig += nx;
        O[ig] =  
            ( L[i0+il] + L[i0+il+1] + L[i0+il+2] +
              L[i1+il] + L[i1+il+1] + L[i1+il+2] +
              L[i2+il] + L[i2+il+1] + L[i2+il+2] ) * 0.11111111111;
        __local float *Ltmp;
        int itmp=i0; i0=i1; i1=i2; i2=i3; i3=itmp;
    }
}

kernel CPU-naive

void blur(int nx, int ny, float * I, float * O ){
    float renorm = 1.0/9.0;
    for(int iy=1;iy<ny-1;iy++){ for(int ix=1;ix<nx-1;ix++){
        int i   = iy*nx+ix;
        O[i] =( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
                I[i   -1] + I[i   ] + I[i   +1] +
                I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * renorm;
    } }
}

huseyin tugrul buyukisik · Accepted Answer · 2017-04-04T23:33:06.673

In matrix multiplication, each sub-matrix (patch) is used for all patches in all lines in the other matrix. If there is 2x2 sub-matrix in a patch and if main matrix is 20x20 then each sub-matrix is used 10 times for multiplication. GPU generally uses 16x16 or 32x32 sized patches which means, for a 2kx2k multiplication each 16x16 patch is re-used for 128 times at least.

MM reuse = 128

and add the sub-matrix - sub-matrix multiplication re-use, it is enough to push gpu to limits.

In a 3x3 convolution, 3x3 patch is not used for a whole scanline or a whole picture. Only its pixels are re-used.

3x3 stencil: each pixel is re-used by neighbouring 8 stencils.

5x5 stencil: each pixel is re-used by neighbouring 24 stencils.

to catch up with matrix multiplication, it would need

11x11 stencil to have a reuse of 120

which is also more local than matrix multiplication and should get more gflops than it but it is not doing equal amounts of multiplications and additions.

It is doing 9 additions + 1 multiplications.

8 potential multiplications are lost. Nearly half of GFLOPS limit is lost.

You should try async workgroup copies.

load top-left 18x18,
load top 18x18 and compute top-left async
load top-right 18x18 and compute top async and store top-left async
load right 18x18 and compute top-left async and store top async
load .... compute ... store... all async so both local memory and main memory could be used(main memory would take advantage of naive version, L1 maybe)

Matrix multiplication/with 16x16 sub matrices) vs convolution(17x17 brush size):

Matrix: L2 re-use ratio increases with main matrix size, or L1 re-use ratio increases with sub-matrix size (L1)
- Convolution: total re-use ratio is same for all image sizes but L1 usage ratio increases with brush size(good)
Matrix: 16*16*16 multiplications + 16*16*16 additions per workgroup
- Convolution: 17*17 additions + 1 multiplication per thread(bad)
Matrix: uniform thread usage, no if-else, all local memory is re-used
- Convolution: needs to load at least 16 pixel further than borders(ghost walls with 16 thickness) which are to be re-used by neighbour workgroups but those neighbour workgroups may be in another compute unit and just use L2 instead of being on same compute unit to use L1 (ugly)
  - That is why I suggested async work group copies to use those neighbours on same compute unit (and L1) and increase re-use ratio.
Matrix: increasing patch size also increases re-use by cubic power rate in sub-matrix multiplications(but decreases L2 re-usage because of having less patches per line, which makes total re-use like square-power rate)
- Convolution: increasing patch size increases re-use by square power rate
Matrix: local memory must be at least 2x tile area (sub mat-mat mul)
- Convolution: local memory must be at least tile area + ghost walls area
Matrix: can do 4x4 sub-sub-multiplications in private memory(which use each element 4 times) which means 4x4 memory = 64 add+64 mul
- Convolution: loading 4x4 into private memory doesnt do anything but just a 4-pixel compute (for a 3x3 brush) which means 4x4 memory = 36 add + 4 mul

Having an addition-heavy kernel leaves room for another multiplication-heavy kernel to work concurrently or in same kernel asynchronously. Maybe if you are using this for an image processing, maybe you can add some "blend" or "resize" kernels inside so they work together?

Scanline version is loading 3 elements, doing 9 add + 1 mul then repeats, loaded elements stay for 3 turns which means they are re-used for 3 times only and its neighbours(x or y directio) may not fall in neighbour thread or even neighbour workgroup. Also 3 loads versus 1 stores is unbalanced. If memory bandwidth is 100 GB/s then it would use 50GB/s for loads, 15 GB/s for stores unless they are coming from L1.

You can decrease add/mul imbalance by using accumulator.

store = (accumulator) * 0.1111111
accumulator+=new vector  // 3 adds
accumulator-=old vecotr  // 3 adds

so now it is 6 adds + 1 muls so more balanced like: 1Tflops GPU will have 500Gflops for adds, 90 Gflops for muls.

Naive version doesn't use local memory, leaving more room for more wavefronts in-flight. Local memory version actually breaks L1 access pattern and let less wavefronts in-flight. This reduces VALU occupation.

You can decrease local memory usage by doing scanline on workgroup level instead of thread level. What I mean is something like:

load from memory: x x x x x x x x x x do scanline for it: (left to right,1-D) a b c d e f g h i j now use it for scanline on workgroup level: a c c u m u l a t o r (+new) (top to bottom) z x z x z x z x z x (- old)

calculate frontline 1-d scanline:  30 additions for each new row
calculate wide vector 2-d scanline:30*30 additions
each pixel get 1 value instead of adding 3 values
storing: 16x16 multiplications
much less local memory used, more balanced (~8 add 1 mul)

this has 1-d scanline that is single-thread for N cycles or multi threaded reduce for LogN cycles(considering enough threads in a compute unit).

Thanks, you seems to be very knowledgeble. I have to read it several times to understand. What is not clear to me: 1) Is your recommendation about *async workgroup copies* applicable to 3x3 convolution or it is ment for larger stencils? 2) you say my kernel are slow because they are "unbalanced" and have low reusability (few multiplication, many reads, adds). Then I would expect that adding multiplications like in *gaussian mask* mod of naive kernel would not affect speed? — Prokop Hapala, Apr 05 '17 at 06:52
In general I'm testing this 3x3 convolution just as a simple model for algorithms like finite differences solvers of partial differential equations. The memory access pattern is the same - each pixels needs to know value of its the neighbors. But the operations will be different - different stencils, (dot product with stencil mask, minimum of neighbors). In that case I guess your suggestions about *accumulator* is not applicable (?) — Prokop Hapala, Apr 05 '17 at 07:00
(first comment)Async workgroups may be applicable to biger chunks such as 18x18 you mention. Yes, when mul units are free, using them would fill pipeline an get hidden. (second comment) if there are not only additions then accumulator is not usable but you can still use multiple reductions instead. — huseyin tugrul buyukisik, Apr 05 '17 at 08:03
Aha I finally got understood what you mean by reading this https://streamcomputing.eu/blog/2014-06-19/using-async_work_group_copy-on-2d-data/ ... I did not know about this function. great tip ! thanks — Prokop Hapala, Apr 05 '17 at 21:06
Hi. I tried that `async_work_group_copy` (see EDIT kernel *GPU-scanline_async* ). But it does not help, actually it is almost 3x slower than previous *GPU-scanline-private*. Can you comment on that? I hope I don't bother you too much. — Prokop Hapala, Apr 06 '17 at 21:25
@ProkopHapala I checked your kernel, it seems each thread is issuing a different async copy. Main rule about async copy is that all threads must give same parameters to it, and all parameters must hit it. Also they are doing less job I think. I have meant loading whole locals with interior+ghost with single async copy(probably its strided version). Lastly, async command itself is not a barrier, kernel may need a barrier before it if the local memory is to be used just after some data written onto it. Another lastly, asnyc copies can be from local to global too, which can async copy with others — huseyin tugrul buyukisik, Apr 06 '17 at 23:30
The address `L+i3-il` is actually the same for all threads. Now, I modified the kernel to equivalent code the to make it obvious (in case compiler was somehow confused) (you can see updated version). The indexes `i0,i1,i2,i3` depend just on `get_group_id` and constants. I hope that the shuffling `int itmp=i0; i0=i1; i1=i2; i2=i3; i3=itmp;` does not confuse the compiler to recognize that addresses are the same. For the other comments - yes, I understood you about 18x18 tiles, but I wanted to start with smaller/simple mod. Still I wonder why it is slower than previous versions of *scanline*. — Prokop Hapala, Apr 07 '17 at 07:28
Hi, so I implemented `blur2D_local_async` which copy 18x18 tiles, but still it is 10x slower than the naive kernel :-( ... I don't understand that. `async_work_group_copy` seems to be much (>5x) slower than manual global-to-local copy (*blur2D_local*). — Prokop Hapala, Apr 08 '17 at 08:31
Could the implementation forcing them serialized? Whats the gpu you using? — huseyin tugrul buyukisik, Apr 08 '17 at 08:34
Currently I have conspiracy theory that the compiler somehow automatically generate some optimal memory caching pattern in *blur2D_naive* since it is so simple and regular. — Prokop Hapala, Apr 08 '17 at 08:41
I'm using nVidia 960M (on laptop) https://compubench.com/device.jsp?benchmark=compu20&D=NVIDIA+GeForce+GTX+960M&testgroup=info — Prokop Hapala, Apr 08 '17 at 08:43
@ProkopHapala did you profile total local memory usage, private register usage? Maybe they are limiting number of wavefronts(warps)? — huseyin tugrul buyukisik, Apr 08 '17 at 09:04
no I have no experience with using these tools. perhaps I should learn it. — Prokop Hapala, Apr 08 '17 at 09:40
@ProkopHapala profiling is very useful to see bottlenecks. If it is local memory usage, adding more local just makes it worse. — huseyin tugrul buyukisik, Apr 08 '17 at 09:59
Hmm, I cannot find any way how to profile my OpenCL apps. [This way](http://uob-hpc.github.io/2015/05/27/nvvp-import-opencl.html) seems to not work since it was probably [deprecated](https://stackoverflow.com/questions/41698992/how-to-profile-opencl-application-with-cuda-8-0-nvprof) (I'm on ubuntu 16.04 with nVidia). How do you profile OpenCL — Prokop Hapala, Apr 28 '17 at 12:54
@ProkopHapala I used only Intel's codebuilder with its analyzer addon on my amd cards and ofcourse codexl of Amd. I'm not sure about nvidia solution. — huseyin tugrul buyukisik, Apr 28 '17 at 12:59

Shouldn't be 3x3 convolution much faster on GPU (OpenCL)

1 Answers1