0

I have encountered this exercise which asks for which code is faster between the following two.

First code.

int sum = 0;
for(int i = 0; i < n; i++) {
    sum += array[i*n + thread_id];
}

Second code.

int sum = 0;
for(int i = 0; i < n; i++) {
    sum += array[n*thread_id + i];
}

I would try the code myself, but I will not have an Nvidia GPU in the following days. I think that the first code takes advantage of memory coalescing (see here), while the second one would take advantage of caching.

Nisba
  • 3,210
  • 2
  • 27
  • 46
  • There is no order of execution of threads within a block. Similarly, the execution of blocks also has no order, it depends on the required resources available. Yes, the second code will see much better memory coherence. Further, I am assuming that "n" is not the size of the array, otherwise both codes will suffer from out of bounds error. – MuneshSingh Dec 10 '17 at 03:48
  • @MuneshSingh thank you. Have you done some benchmarks? – Nisba Dec 10 '17 at 08:35
  • here https://stackoverflow.com/questions/11816786/why-bother-to-know-about-cuda-warps?rq=1 in the last point of the answer of Roger Dahl, it is written that for n=32 the first code is better! – Nisba Dec 10 '17 at 08:48
  • No, I have not yet done any benchmark tests. Will go through the link and let you know. – MuneshSingh Dec 10 '17 at 08:52
  • 3
    If you have to load the same data in each case, coalesced loads will always be more efficient and faster than any other non-coalesced method, for the purposes of loading data. The two presented codes are not doing the same thing, however. One is effectively summing columns (using coalesced loads), and the other is effectively rows (in a non-coalesced fashion.) If we only care about the execution performance of the two codes, and not their functionality, the coalesced version is definitely more efficient. – Robert Crovella Dec 10 '17 at 15:01
  • @RobertCrovella thank you. But isn't the second code taking advantage of caching? Because when the first element of a row is read by a thread then the same thread will find all the other elements already in cache? – Nisba Dec 10 '17 at 15:44
  • @Nisba I am too learning from your query. By the way Robert Crovella is absolutely correct. The first code shall provide concurrent access to different threads across memory banks separated by different strides. I am not very sure about the second reason as anyway the concurrent reads should not have any conflicts. – MuneshSingh Dec 10 '17 at 17:28

1 Answer

1

Many thanks to @RobertCrovella for clarifying the issues regarding memory coalescing. This is my attempt to benchmark the two codes as asked for. It can be clearly noticed from the output (run on an NVS5400M GPU laptop) that the first code is about twice as efficient as the second one. This is because of the memory coalescing taking place in the first one (kernel1).

#include <cuda.h>
#include <ctime>
#include <iostream>
#include <stdio.h>
using namespace std;

#define BLOCK_SIZE 1024
#define GRID_SIZE 1024

// Error Handling
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Report a CUDA runtime error with its source location; optionally abort.
// Used via the gpuErrchk() macro, which supplies __FILE__/__LINE__.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;  // fast path: nothing to report
   fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

//kernel1<<<8,8>>>(d_array,d_sum1,n);

// kernel1: each thread sums n elements read with index array[i*n + thread_id].
// On iteration i, adjacent threads of a warp read adjacent addresses, so the
// loads are fully coalesced. Expects a 1D grid of 1D blocks; the caller must
// guarantee that (n-1)*n + thread_id stays within the bounds of `array` for
// every launched thread, and that `sum` has one slot per launched thread.
// `const __restrict__` lets the compiler route the reads through the
// read-only data cache (interface stays call-compatible: int* converts).
__global__ void kernel1(const int *__restrict__ array, long *__restrict__ sum, int n) {
    long result = 0;

    int thread_id = threadIdx.x + blockIdx.x*blockDim.x;

    for (int i = 0; i < n; i++) {
        result += array[i*n + thread_id];
    }
    // No barrier needed: each thread touches only its own output slot.
    sum[thread_id] = result;
}

// kernel2: each thread sums n contiguous elements array[n*thread_id + i].
// Per-thread rows mean adjacent threads read addresses n*sizeof(int) apart —
// a strided, non-coalesced pattern, which is what this benchmark measures.
// Same launch-shape and bounds preconditions as kernel1.
// NOTE: the original __syncthreads() was removed — no shared memory is used
// and threads are fully independent, so the barrier only distorted the
// timing comparison against kernel1 (which had none).
__global__ void kernel2(const int *__restrict__ array, long *__restrict__ sum, int n) {
    long result = 0;

    int thread_id = threadIdx.x + blockIdx.x*blockDim.x;

    for (int i = 0; i < n; i++) {
        result += array[n*thread_id + i];
    }
    sum[thread_id] = result;
}


// Benchmarks kernel1 (coalesced loads) against kernel2 (strided loads) on the
// same input and verifies both device results against a CPU reference.
int main() {
    srand((unsigned)time(0));

    long *h_sum1, *d_sum1;
    long *h_sum2, *d_sum2;
    int n = 10;
    // Large enough that every thread can read n elements under either
    // indexing scheme: max index is n*(GRID_SIZE*BLOCK_SIZE-1) + n-1 < size1.
    int size1 = n*BLOCK_SIZE*GRID_SIZE + n;
    int *h_array;

    h_array = new int[size1];
    h_sum1 = new long[size1];
    h_sum2 = new long[size1];

    // Random values in [min, max]; result buffers zeroed.
    int min = 1, max = 10;
    for (int i = 0; i < size1; i++) {
        h_array[i] = min + (rand() % static_cast<int>(max - min + 1));
        h_sum1[i] = 0;
        h_sum2[i] = 0;
    }

    int *d_array;
    gpuErrchk(cudaMalloc((void**)&d_array, size1*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sum1, size1*sizeof(long)));

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaMemcpy(d_array, h_array, size1*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_sum1, h_sum1, size1*sizeof(long), cudaMemcpyHostToDevice));

    // --- Timed run 1: coalesced access (kernel1) ---
    gpuErrchk(cudaEventRecord(start));
    kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(d_array, d_sum1, n);
    gpuErrchk(cudaGetLastError());          // catch launch-configuration errors
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));  // events must complete before ElapsedTime

    gpuErrchk(cudaMemcpy(h_sum1, d_sum1, size1*sizeof(long), cudaMemcpyDeviceToHost));

    float milliSeconds1 = 0;
    gpuErrchk(cudaEventElapsedTime(&milliSeconds1, start, stop));

    gpuErrchk(cudaMalloc((void**)&d_sum2, size1*sizeof(long)));
    gpuErrchk(cudaMemcpy(d_sum2, h_sum2, size1*sizeof(long), cudaMemcpyHostToDevice));

    // --- Timed run 2: strided access (kernel2) ---
    gpuErrchk(cudaEventRecord(start));
    kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(d_array, d_sum2, n);  // was a hard-coded 10
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));

    gpuErrchk(cudaMemcpy(h_sum2, d_sum2, size1*sizeof(long), cudaMemcpyDeviceToHost));

    float milliSeconds2 = 0;
    gpuErrchk(cudaEventElapsedTime(&milliSeconds2, start, stop));

    // Aggregate the per-thread partial sums produced on the device.
    long result_device1 = 0, result_host1 = 0;
    long result_device2 = 0, result_host2 = 0;
    for (int i = 0; i < size1; i++) {
        result_device1 += h_sum1[i];
        result_device2 += h_sum2[i];
    }

    // CPU reference: replicate both indexing schemes (using n, not literal 10,
    // so the check stays correct if n changes).
    for (int thread_id = 0; thread_id < GRID_SIZE*BLOCK_SIZE; thread_id++)
        for (int i = 0; i < n; i++) {
            result_host1 += h_array[i*n + thread_id];
            result_host2 += h_array[n*thread_id + i];
        }

    cout << "Device result1 = " <<  result_device1 << endl;
    cout << "Host result1 = " <<  result_host1 << endl;
    cout << "Time1 (ms) = " << milliSeconds1 << endl;

    cout << "Device result2 = " <<  result_device2 << endl;
    cout << "Host result2 = " <<  result_host2 << endl;
    cout << "Time2 (ms) = " << milliSeconds2 << endl;

    // Release all resources (the original leaked host buffers and events).
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_array));
    gpuErrchk(cudaFree(d_sum1));
    gpuErrchk(cudaFree(d_sum2));
    delete[] h_array;
    delete[] h_sum1;
    delete[] h_sum2;

    return 0;
}

The Cuda Event timer output is as under:

Device result1 = 57659226
Host result1 = 57659226
Time1 (ms) = 5.21952
Device result2 = 57674257
Host result2 = 57674257
Time2 (ms) = 11.8356
MuneshSingh
  • 162
  • 1
  • 10