i've been working in a program that requires to use array of structs inside another array of structs or structure of arrays, i decided to use this approach given the initial conditions (there are dynamic), the following are the structs that i'm trying to allocate in CUDA
struct population
{
int id;
tour *tours;
};
struct tour
{
int id;
node *nodes;
double value;
int node_qty;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item *items;
};
struct item
{
float weight;
float value;
};
As you can see, this group of structures are one inside another and as i have said most of the properties are dynamic (P.E: the amount of nodes, the amount of items and the amount of tours). I have made many attempts to allocate memory but the result is almost always the same "Access violation writing location". As a side note i have tried to follow some advices from other questions like this: cudaMemcpy segmentation fault or this Memory allocation on GPU for dynamic array of structs.
The following code allocates most of the memory, but when i try to access the properties of the structures the result is "an illegal memory access"
// 1. cudaMalloc a pointer to device memory that hold population
population* d_initial_population;
HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
// 2. Create a separate tour pointer on the host.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
// 3. Create a separate node pointer on the host.
node* d_node_ptr[POPULATION_SIZE];
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
// 4. cudaMalloc node storage on the device for node pointer
HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
// 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
// Optional: Copy an instantiated object on the host to the device pointer
HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));
After this initial approach my next attempt was trying to allocate the inner structures first and then go upwards, my attempt was with the node and item structs as follows
// Define a pointer for struct "node"
node* dev_node;
// 1. cudaMalloc a pointer to device memory that will hold the struct "node", in this case is called "dev_node"
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));
// 2. (optionally) copy an instantiated object of struct "node" on the host to the device pointer "dev_node" from step 1 using cudaMemcpy
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));
// 3. Create a separate "item" pointer on the host, in this case it's called "dev_item"
item* dev_item;
// 4. cudaMalloc "item" storage on the device for "dev_item"
HANDLE_ERROR(cudaMalloc((void**)&dev_item, node_quantity));
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
}
// 5. cudaMemcpy the pointer value of "dev_item" from the host to the device pointer &(dev_node->i)
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item[i]), sizeof(item*), cudaMemcpyHostToDevice));
}
// 6. Copy the embedded data
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&dev_item[i], n[i].items, sizeof(item) * dev_node[i].item_qty, cudaMemcpyHostToDevice));
}
But this last attempt gives me an Access violation writing location 0x0000000B00700C00 in the following line
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
I suppose that the error(s) are associated to some kind of missing or bad memory allocation but i haven't been able to figure it out where.
UPDATE 1: After some research as indicated by talonmies I have done a simplified version of my code only to solve this but still doesn't work.
This is my new code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct item
{
int id;
float weight;
float value;
int node;
int taken;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item* items;
};
struct tour
{
int id;
int node_qty;
node* nodes;
};
struct population
{
int id;
tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
getchar();
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
int main()
{
// Get user defined values
int population_size, tour_size, node_size, item_size;
printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
// For this exercise the values are 1 10 5 4
scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size);
printf("\n");
printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);
#pragma region ALLOCATE CPU MEMORY
// Declare pointers
population* host_population;
tour* host_tour;
node* host_node;
item* host_item;
// Allocate host memory for population
host_population = (population*)malloc(sizeof(population) * population_size);
for (int p = 0; p < population_size; p++)
{
host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
}
// Allocate host memory for tour
host_tour = (tour*)malloc(sizeof(tour) * tour_size);
for (int t = 0; t < tour_size; t++)
{
host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
}
// Allocate host memory for node
host_node = (node*)malloc(sizeof(node) * node_size);
for (int n = 0; n < node_size; n++)
{
host_node[n].items = (item*)malloc(sizeof(item) * item_size);
}
// Allocate memory for item
host_item = (item*)malloc(sizeof(item) * item_size);
#pragma endregion
#pragma region FILL CPU DATA
//Fill the full structure with information, for test purposes these values are going to be taken
// 1. Item Data
int item_id[4] = { 1,2,3,4 };
float item_value[4] = { 300,50,30,40 };
float item_weight[4] = { 400,200,40,2 };
int item_node[4] = { 3,4,5,2 };
// 2. Node Data
int node_id[5] = { 1,2,3,4,5 };
double node_x[5] = { 0,6,14,11,7 };
double node_y[5] = { 0,-5,5,13,5 };
int node_item[5] = { 0,1,1,1,1 };
// 3. Tour Data
int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
// 4. Population Data
int population_id = 1;
for (int i = 0; i < item_size; i++)
{
host_item[i].id = item_id[i];
host_item[i].value = item_value[i];
host_item[i].taken = rand() % 2;
host_item[i].node = item_node[i];
host_item[i].weight = item_weight[i];
}
for (int n = 0; n < node_size; n++)
{
host_node[n].id = node_id[n];
host_node[n].x = node_x[n];
host_node[n].y = node_y[n];
host_node[n].item_qty = node_item[n];
for (int i = 0; i < item_size; i++)
{
if (host_node[n].id == host_item[i].node)
{
memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
}
}
}
for (int t = 0; t < tour_size; t++)
{
host_tour[t].id = tour_id[t];
host_tour[t].node_qty = node_size;
memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
}
for (int p = 0; p < population_size; p++)
{
host_population[p].id = population_id;
memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
}
//printStructure(host_population, population_size, tour_size);
#pragma endregion
population* device_population;
tour *device_tour;
node* device_node;
item* device_item;
// Allocate host memory for population
HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
return 0;
}
After this exercise i have a more specific question to ask: What's the difference between this:
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
And this
// Allocate host memory for tour
device_tour[1];
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
The second block works without any issue, but the first one returns an exception "Access violation writing location". Is there a way to make the first block work without using static arrays?