CUDA Memory Allocation for AoS inside a SoA

Question

i've been working in a program that requires to use array of structs inside another array of structs or structure of arrays, i decided to use this approach given the initial conditions (there are dynamic), the following are the structs that i'm trying to allocate in CUDA

struct population
{
    int id;
    tour *tours;
};

struct tour
{
    int id;
    node *nodes;
    double value;
    int node_qty;
};

struct node 
{
    int id;
    double x;
    double y;
    int item_qty;
    item *items;
};

struct item 
{
    float weight;
    float value;
};

As you can see, this group of structures are one inside another and as i have said most of the properties are dynamic (P.E: the amount of nodes, the amount of items and the amount of tours). I have made many attempts to allocate memory but the result is almost always the same "Access violation writing location". As a side note i have tried to follow some advices from other questions like this: cudaMemcpy segmentation fault or this Memory allocation on GPU for dynamic array of structs.

The following code allocates most of the memory, but when i try to access the properties of the structures the result is "an illegal memory access"

// 1. cudaMalloc a pointer to device memory that hold population
population* d_initial_population;

HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
    
// 2. Create a separate tour pointer on the host.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
   
// 3. Create a separate node pointer on the host.
node* d_node_ptr[POPULATION_SIZE];
   
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
    // 4. cudaMalloc node storage on the device for node pointer
    HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
    // 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
    HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
    // Optional: Copy an instantiated object on the host to the device pointer
    HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));

After this initial approach my next attempt was trying to allocate the inner structures first and then go upwards, my attempt was with the node and item structs as follows

// Define a pointer for struct "node"
node* dev_node; 

// 1. cudaMalloc a pointer to device memory that will hold the struct "node", in this case is called "dev_node"
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));

// 2. (optionally) copy an instantiated object of struct "node" on the host to the device pointer "dev_node" from step 1 using cudaMemcpy
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));

// 3. Create a separate "item" pointer on the host, in this case it's called "dev_item"
item* dev_item;

// 4. cudaMalloc "item" storage on the device for "dev_item"
HANDLE_ERROR(cudaMalloc((void**)&dev_item, node_quantity));
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
}

// 5. cudaMemcpy the pointer value of "dev_item" from the host to the device pointer &(dev_node->i)
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item[i]), sizeof(item*), cudaMemcpyHostToDevice));
}

// 6. Copy the embedded data
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(&dev_item[i], n[i].items, sizeof(item) * dev_node[i].item_qty, cudaMemcpyHostToDevice));
}

But this last attempt gives me an Access violation writing location 0x0000000B00700C00 in the following line

HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));

I suppose that the error(s) are associated to some kind of missing or bad memory allocation but i haven't been able to figure it out where.

UPDATE 1: After some research as indicated by talonmies I have done a simplified version of my code only to solve this but still doesn't work.

This is my new code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item
{
    int id;
    float weight;
    float value;
    int node;
    int taken;
};

struct node
{
    int id;
    double x;
    double y;
    int item_qty;
    item* items;
};

struct tour
{
    int id;
    int node_qty;
    node* nodes;
};

struct population
{
    int id;
    tour* tours;
};

static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        getchar();
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
    
int main()
{
    // Get user defined values
    int population_size, tour_size, node_size, item_size;

    printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
    // For this exercise the values are 1 10 5 4
    scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size);
    printf("\n");

    printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);

#pragma region ALLOCATE CPU MEMORY

    // Declare pointers
    population* host_population;
    tour* host_tour;
    node* host_node;
    item* host_item;

    // Allocate host memory for population
    host_population = (population*)malloc(sizeof(population) * population_size);
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
    }

    // Allocate host memory for tour
    host_tour = (tour*)malloc(sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
    }

    // Allocate host memory for node
    host_node = (node*)malloc(sizeof(node) * node_size);
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].items = (item*)malloc(sizeof(item) * item_size);
    }

    // Allocate memory for item
    host_item = (item*)malloc(sizeof(item) * item_size);

#pragma endregion

#pragma region FILL CPU DATA

    //Fill the full structure with information, for test purposes these values are going to be taken

    // 1. Item Data
    int item_id[4] = { 1,2,3,4 };
    float item_value[4] = { 300,50,30,40 };
    float item_weight[4] = { 400,200,40,2 };
    int item_node[4] = { 3,4,5,2 };

    // 2. Node Data
    int node_id[5] = { 1,2,3,4,5 };
    double node_x[5] = { 0,6,14,11,7 };
    double node_y[5] = { 0,-5,5,13,5 };
    int node_item[5] = { 0,1,1,1,1 };

    // 3. Tour Data
    int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };

    // 4. Population Data
    int population_id = 1;

    for (int i = 0; i < item_size; i++)
    {
        host_item[i].id = item_id[i];
        host_item[i].value = item_value[i];
        host_item[i].taken = rand() % 2;
        host_item[i].node = item_node[i];
        host_item[i].weight = item_weight[i];
    }

    for (int n = 0; n < node_size; n++)
    {
        host_node[n].id = node_id[n];
        host_node[n].x = node_x[n];
        host_node[n].y = node_y[n];
        host_node[n].item_qty = node_item[n];
        for (int i = 0; i < item_size; i++)
        {
            if (host_node[n].id == host_item[i].node)
            {
                memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
            }
        }
    }

    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].id = tour_id[t];
        host_tour[t].node_qty = node_size;
        memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
    }

    for (int p = 0; p < population_size; p++)
    {
        host_population[p].id = population_id;
        memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
    }

    //printStructure(host_population, population_size, tour_size);

#pragma endregion

    population* device_population;
    tour *device_tour;
    node* device_node;
    item* device_item;

    // Allocate host memory for population
    HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));

    // Allocate host memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }

    return 0;
}

After this exercise i have a more specific question to ask: What's the difference between this:

    // Allocate host memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }

And this

// Allocate host memory for tour
    device_tour[1];
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }

The second block works without any issue, but the first one returns an exception "Access violation writing location". Is there a way to make the first block work without using static arrays?

You have to create host memory versions of each level of the structure, build the device structures in host memory, and then copy the final device tree to the device. There are probably about 100 questions on this topic on [SO], if you care to do some searching — talonmies, Apr 08 '22 at 06:32
@talonmies thanks for your advice, as suggested i have read some questions about this topic and i have refined a little bit my code, and i have updated my question. The point is that in the other questions that i have seen about memory allocation of arrays inside arrays in cuda, the inner array is of fixed size while my arrays are dynamic. I just want to know if there's a way to make it work with dynamic arrays — Wisk, Apr 09 '22 at 06:42

score 1 · Accepted Answer · answered Apr 28 '22 at 05:17

Finally after some further research i was able to find a solution, effectively as sugested the solution is to create host memory versions of each level of the structure. Following the full working example:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
    
/// <summary>
    /// Struct to define item
    /// </summary>
    /// <returns></returns>
struct item
{
    int id;
    float weight;
    float value;
    int node;
    int taken;
};

/// <summary>
/// Struct to define a node
/// </summary>
/// <returns></returns>
struct node
{
    int id;
    double x;
    double y;
    int item_qty;
    item* items;
};

/// <summary>
/// Struct to define a tour
/// </summary>
/// <returns></returns>
struct tour
{
    int id;
    int node_qty;
    node* nodes;
};

/// <summary>
/// Struct to define population
/// </summary>
/// <returns></returns>
struct population
{
    int id;
    int tour_qty;
    tour* tours;
};

static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        getchar();
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

void printStructure(population* pop, int pop_size, int tour_size);

__global__ void populationTest(population* population, int population_size)
{
    for (int p = 0; p < population_size; ++p)
    {
        printf(" > population[%d].id: %d\n", p, population[p].id);
        printf(" > population[%d].tour_qty: %d\n", p, population[p].tour_qty);
        if (population[p].tour_qty > 0)
        {
            for (int t = 0; t < population[p].tour_qty; ++t)
            {
                printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, population[p].tours[t].node_qty);
                if (population[p].tours[t].node_qty > 0)
                {
                    for (int n = 0; n < population[p].tours[t].node_qty; ++n)
                    {
                        printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, population[p].tours[t].nodes[n].id);
                        printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, population[p].tours[t].nodes[n].x);
                        printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, population[p].tours[t].nodes[n].y);
                        printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, population[p].tours[t].nodes[n].item_qty);
                        if (population[p].tours[t].nodes[n].item_qty > 0)
                        {
                            for (int i = 0; i < population[p].tours[t].nodes[n].item_qty; ++i)
                            {
                                printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].id);
                                printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].node);
                                printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].taken);
                                printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].value);
                                printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].weight);
                            }
                        }
                    }
                }
            }
        }
    }
    printf("\n\n");
}

int main()
{
    // Get user defined values
    int population_size = 1;
    int tour_size = 10; 
    int node_size = 5;
    int item_size = 4;

#pragma region ALLOCATE CPU MEMORY

    // Declare pointers
    population* host_population;
    tour* host_tour;
    node* host_node;
    item* host_item;

    // Allocate host memory for population
    host_population = (population*)malloc(sizeof(population) * population_size);
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
    }

    // Allocate host memory for tour
    host_tour = (tour*)malloc(sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
    }

    // Allocate host memory for node
    host_node = (node*)malloc(sizeof(node) * node_size);
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].items = (item*)malloc(sizeof(item) * item_size);
    }

    // Allocate memory for item
    //host_item = (item*)malloc(sizeof(item) * item_size);

    //Test for AoSoA
    host_item = (item*)malloc(sizeof(item) * item_size * node_size);

#pragma endregion

#pragma region FILL CPU DATA

    //Fill the full structure with information, for test purposes these values are going to be taken

    // 1. Item Data
    int item_id[4] = { 1,2,3,4 };
    float item_value[4] = { 300,50,30,40 };
    float item_weight[4] = { 400,200,40,2 };
    int item_node[4] = { 3,4,5,2 };

    // 2. Node Data
    int node_id[5] = { 1,2,3,4,5 };
    double node_x[5] = { 0,6,14,11,7 };
    double node_y[5] = { 0,-5,5,13,5 };
    int node_item[5] = { 0,1,1,1,1 };

    // 3. Tour Data
    int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };

    // 4. Population Data
    int population_id = 1;

    for (int i = 0; i < item_size; i++)
    {
        host_item[i].id = item_id[i];
        host_item[i].value = item_value[i];
        host_item[i].taken = rand() % 2;
        host_item[i].node = item_node[i];
        host_item[i].weight = item_weight[i];
    }

    for (int n = 0; n < node_size; n++)
    {
        host_node[n].id = node_id[n];
        host_node[n].x = node_x[n];
        host_node[n].y = node_y[n];
        host_node[n].item_qty = node_item[n];
        for (int i = 0; i < item_size; i++)
        {
            if (host_node[n].id == host_item[i].node)
            {
                memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
            }
        }
    }

    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].id = tour_id[t];
        host_tour[t].node_qty = node_size;
        memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
    }

    for (int p = 0; p < population_size; p++)
    {
        host_population[p].id = population_id;
        host_population[p].tour_qty = tour_size;
        memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
    }

    printStructure(host_population, population_size, tour_size);

#pragma endregion

#pragma region ALLOCATE GPU MEMORY

    // Define pointers for device structs
    population* device_population;
    tour* device_tour;
    node* device_node;
    item* device_item;

    // Allocate device memory for population
    HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * size_t(population_size)));
    
    // Allocate device memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour) * size_t(tour_size)));
    
    // Allocate device memory for node
    HANDLE_ERROR(cudaMalloc((void**)&device_node, sizeof(node) * size_t(node_size)));
    
    // Allocate device memory for item
    HANDLE_ERROR(cudaMalloc((void**)&device_item, sizeof(item) * size_t(item_size)));

    // Copy host item struct with device pointers to device
    HANDLE_ERROR(cudaMemcpy(device_item, host_item, sizeof(item) * size_t(item_size), cudaMemcpyHostToDevice));

    // Offset pointers
    for (int n = 0; n < node_size; ++n)
    {
        for (int i = 0; i < item_size; ++i)
        {
            if (host_node[n].id == host_item[i].node)
            {
                host_node[n].items = device_item + i;
            }
        }
    }

    // Copy host node struct with device pointers to device
    HANDLE_ERROR(cudaMemcpy(device_node, host_node, sizeof(node) * size_t(node_size), cudaMemcpyHostToDevice));

    for (int t = 0; t < tour_size; ++t)
    {
        host_tour[t].nodes = device_node;
    }

    // Copy host tour struct with device pointers to device
    HANDLE_ERROR(cudaMemcpy(device_tour, host_tour, sizeof(tour) * size_t(tour_size), cudaMemcpyHostToDevice));

    for (int p = 0; p < population_size; ++p)
    {
        host_population[p].tours = device_tour;
    }

    host_population->tour_qty = tour_size;

    HANDLE_ERROR(cudaMemcpy(device_population, host_population, sizeof(population) * size_t(population_size), cudaMemcpyHostToDevice));

   populationTest << <1, 1 >> > (device_population, population_size);
   HANDLE_ERROR(cudaDeviceSynchronize());

#pragma endregion

    return 0;
}

/// <summary>
/// Function to print the tree struct of population
/// </summary>
/// <param name="p"></param>
/// <param name="pop_size"></param>
/// <param name="tour_size"></param>
void printStructure(population* pop, int pop_size, int tour_size)
{
    for (int p = 0; p < pop_size; ++p)
    {
        printf(" > population[%d].id: %d\n", p, pop[p].id);
        for (int t = 0; t < tour_size; ++t)
        {
            printf(" > population[%d].tours[%d].id: %d\n", p, t, pop[p].tours[t].id);
            printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, pop[p].tours[t].node_qty);
            if (pop[p].tours[t].node_qty > 0)
            {
                for (int n = 0; n < pop[p].tours[t].node_qty; ++n)
                {
                    printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, pop[p].tours[t].nodes[n].id);
                    printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, pop[p].tours[t].nodes[n].x);
                    printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, pop[p].tours[t].nodes[n].y);
                    printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, pop[p].tours[t].nodes[n].item_qty);
                    if (pop[p].tours[t].nodes[n].item_qty > 0)
                    {
                        for (int i = 0; i < pop[p].tours[t].nodes[n].item_qty; ++i)
                        {
                            printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].id);
                            printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].node);
                            printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].taken);
                            printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].value);
                            printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].weight);
                        }
                    }
                }
            }
        }
    }
    printf("\n\n");
}

CUDA Memory Allocation for AoS inside a SoA

1 Answers1