3

Assume the array size is SOME_CONSTANT. So I have this AOS (array of structs)

struct abc {
  float a;
  float b;
  float c;
};

And I allocate memory for it by first defining the pointer and then allocating an AOS.

abc *foo = 0;
foo = (abc *)malloc(SOME_CONSTANT * sizeof(abc));

So this is fine. Now I want to make a Struct of arrays (SOA) like this:

struct abc {
  float *a;
  float *b;
  float *c;
};

But I cannot seem to think of a way of allocating memory to the struct pointer abc *foo

The best I could come up with was this:

struct abc {
  float a[SOME_CONSTANT];
  float b[SOME_CONSTANT];
  float c[SOME_CONSTANT];
};

and then by doing:

abc *foo = 0;
foo = (abc *)malloc(sizeof(abc));

I am trying to see the performance difference between AOS and SOA with CUDA. Is there any other way I can allocate memory for an SOA (below)? Is using my above method a good practice?

struct abc {
  float *a;
  float *b;
  float *c;
};
If_You_Say_So
  • 1,195
  • 1
  • 10
  • 25
paulplusx
  • 213
  • 4
  • 13
  • Just allocate memory and assign the pointer to that memory to each member. – talonmies May 17 '20 at 14:13
  • @talonmies Do you mean that I should do 3 different memory allocations for the three member arrays ? 1. First create struct pointer `abc *foo = 0;` 2. Then allocate memory and assign for each member `foo->a= (float *)malloc(SOME_CONSTANT * sizeof(float ));` `foo->b= (float *)malloc(SOME_CONSTANT * sizeof(float ));` `foo->c= (float *)malloc(SOME_CONSTANT * sizeof(float ));` – paulplusx May 17 '20 at 16:48

1 Answer

3

But I cannot seem to think of a way of allocating memory to the struct pointer abc *foo ...... Is there any other way I can allocate memory for an SOA (below)?

I am not sure what your difficulty is. As long as you don't have an array of structures of arrays, why not simply use:

abc *foo;
cudaMalloc((void **)&foo, SOME_CONSTANT*sizeof(abc));

Is using my above method a good practice?

The question of AoS vs SoA is application dependent and there are many excellent questions/answers regarding this topic for CUDA applications on SO (like this answer). The bottom line is that coalesced memory access occurs when all threads in a warp access a contiguous chunk of memory. So you can expect to see a higher memory bandwidth when working with an SoA if the access to each field can be coalesced. With your given example, let's run a simple test to quantify the performance difference:

#include <stdio.h>
#include <stdlib.h>

// Evaluate a CUDA runtime call once and abort with file/line context on
// failure.  Wrapped in do { } while (0) so the macro expands to a single
// statement and stays safe inside an unbraced if/else (the original bare
// { } block would break `if (x) CHECK_CUDA(...); else ...`).
#define CHECK_CUDA(call)                                                \
do {                                                                    \
    const cudaError_t error = (call);                                   \
    if (error != cudaSuccess)                                           \
    {                                                                   \
        printf("ERROR:: File: %s, Line: %d, ", __FILE__, __LINE__);    \
        printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE);                                             \
    }                                                                   \
} while (0)

const int SOME_CONSTANT = 1024 * 1000; // element count; launched below as 1024 blocks of 1000 threads (<<<SOME_CONSTANT/1000, 1000>>>)

// To be used as a SoA 
// Struct of Arrays: each field lives in its own device array (allocated
// separately in main), so consecutive threads touching the same field hit
// consecutive addresses -> coalesced accesses.
struct soa_abc {
    float *a;  // device pointer to SOME_CONSTANT floats
    float *b;  // device pointer to SOME_CONSTANT floats
    float *c;  // device pointer to SOME_CONSTANT floats
};

// To be used as an AoS    
// Array of Structs element: the three fields of one element are interleaved
// in memory, so consecutive threads accessing one field are strided by
// sizeof(aos_abc) bytes -> poorly coalesced accesses.
struct aos_abc {
    float a;
    float b;
    float c;
};

// Writes one element of each field array (SoA layout: a warp's 32 stores per
// field land on 32 consecutive floats).  Expects a 1D launch; any grid that
// covers at least SOME_CONSTANT threads is now safe thanks to the guard.
__global__ void kernel_soa(soa_abc foo) {
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= SOME_CONSTANT) return;  // guard: grid rarely divides data exactly
    foo.a[tid] = 1.f;
    foo.b[tid] = 2.f;
    foo.c[tid] = 3.f;
}

// Writes the three fields of one struct element (AoS layout: a warp's stores
// to one field are strided by sizeof(aos_abc)).  Expects a 1D launch; the
// guard makes any grid covering at least SOME_CONSTANT threads safe.
__global__ void kernel_aos(aos_abc *bar) {
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= SOME_CONSTANT) return;  // guard: grid rarely divides data exactly
    bar[tid].a = 1.f;
    bar[tid].b = 2.f;
    bar[tid].c = 3.f;
}

// Benchmark the same element-wise fill with SoA vs AoS layouts using CUDA
// events.  Events are timestamped on the GPU, so the correct pattern is:
// record start -> launch -> record stop -> cudaEventSynchronize(stop).
// The original called cudaDeviceSynchronize() *between* the kernel and the
// stop record, which folded host-side sync latency into the measurement.
int main()
{
    float milliseconds = 0;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // SoA: one device allocation per field.
    soa_abc foo;
    CHECK_CUDA(cudaMalloc((void **)&foo.a, SOME_CONSTANT * sizeof(float)));
    CHECK_CUDA(cudaMalloc((void **)&foo.b, SOME_CONSTANT * sizeof(float)));
    CHECK_CUDA(cudaMalloc((void **)&foo.c, SOME_CONSTANT * sizeof(float)));

    CHECK_CUDA(cudaEventRecord(start));
    kernel_soa <<<SOME_CONSTANT/1000, 1000 >>> (foo);
    CHECK_CUDA(cudaGetLastError());           // catch launch-configuration errors
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));   // blocks until the kernel is done;
                                              // also surfaces in-kernel faults
    milliseconds = 0;
    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("Time for SoA is %f ms.\n", milliseconds);

    CHECK_CUDA(cudaFree(foo.a));
    CHECK_CUDA(cudaFree(foo.b));
    CHECK_CUDA(cudaFree(foo.c));

    // AoS: one allocation of interleaved structs.
    aos_abc *bar;
    CHECK_CUDA(cudaMalloc((void **)&bar, SOME_CONSTANT*sizeof(aos_abc)));

    CHECK_CUDA(cudaEventRecord(start));
    kernel_aos <<<SOME_CONSTANT/1000, 1000 >>> (bar);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    milliseconds = 0;
    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("Time for AoS is %f ms.\n", milliseconds);

    // Release resources the original leaked.
    CHECK_CUDA(cudaFree(bar));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

Tested with a Quadro P400 on Windows and CUDA 10 the results are:

Time for SoA is 0.492384 ms.
Time for AoS is 1.217568 ms.

which confirms that the SoA is a better choice.

If_You_Say_So
  • 1,195
  • 1
  • 10
  • 25