I think your impression 1 could be implemented approximately with what is described here (per-thread stack, pre-allocated). It has the issue you mention, namely over-allocation. On newer GPUs, several gigabytes of memory (or more) is common, so the over-allocation concern may not be very serious if total memory is not an issue.
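If it helps, here is a minimal sketch of that idea, assuming a known worst-case bound per thread: each thread writes into its own pre-allocated slice of a global buffer. MAX_PER_THREAD, the buffer names, and the toy traversal loop are illustrative placeholders I've invented, not anything from your code:

#include <cstdio>

const int MAX_PER_THREAD = 256;  // assumed worst-case items per thread

__global__ void per_thread_stack(unsigned *store, unsigned *counts, int n){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= n) return;
    unsigned *my_stack = store + (size_t)tid*MAX_PER_THREAD;  // this thread's private slice
    unsigned my_count = 0;
    for (unsigned node = tid+1; node > 1; node /= 2)          // stand-in for the real traversal
        if (my_count < MAX_PER_THREAD)
            my_stack[my_count++] = node;                      // plain store, no atomics needed
    counts[tid] = my_count;                                   // report how much of the slice was used
}

int main(){
    const int n = 4;
    unsigned *store, *counts;
    cudaMalloc(&store, (size_t)n*MAX_PER_THREAD*sizeof(unsigned));  // this product is the over-allocation
    cudaMalloc(&counts, n*sizeof(unsigned));
    per_thread_stack<<<1, n>>>(store, counts, n);
    cudaDeviceSynchronize();
    cudaFree(store); cudaFree(counts);
}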
I think your impression 2 could be implemented approximately with what is described here (device-wide thread-safe vector push_back). It has the issue you mention, namely the lack of ordering of the results in the result vector. That could possibly be resolved with a sorting operation after the collection operation is complete.
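Likewise, a minimal sketch of the device-wide push_back idea: an atomicAdd on a global counter reserves one slot per item. Again, the traversal loop and names are illustrative placeholders:

#include <cstdio>

__global__ void global_push_back(unsigned *result, unsigned *result_count, unsigned capacity, int n){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= n) return;
    for (unsigned node = tid+1; node > 1; node /= 2){  // stand-in for the real traversal
        unsigned idx = atomicAdd(result_count, 1);     // reserve one slot, device-wide
        if (idx < capacity)
            result[idx] = node;                        // arrival order across threads is arbitrary
    }
}

int main(){
    const int n = 4;
    const unsigned capacity = 1024;
    unsigned *result, *result_count;
    cudaMalloc(&result, capacity*sizeof(unsigned));
    cudaMalloc(&result_count, sizeof(unsigned));
    cudaMemset(result_count, 0, sizeof(unsigned));
    global_push_back<<<1, n>>>(result, result_count, capacity, n);
    cudaDeviceSynchronize();
    // *result_count now holds the total pushed; sort result afterwards if ordering matters
    cudaFree(result); cudaFree(result_count);
}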
(It sounds like you probably already have an idea of how to do your "worst-case" impression 4.)
That leaves impression 3. We could use an idea that combines impressions 1 and 2, i.e. create a per-thread vector push_back, but use on-demand allocation via in-kernel malloc or new. In-kernel memory allocation like this is pretty slow, and not without its own issues (e.g. you may have to reserve additional heap space, in-kernel allocated heap memory cannot participate in a transfer to the host, and small allocations may be inefficient in memory usage), but there is really no way to tell which of these approaches might be best without more information about the dimensions of your problem. If keeping track of the parent nodes is a relatively infrequent operation as you traverse the graph, the dynamic allocation approach may not be a problem.
Here's an example of how a simple vector (per-thread) could be created:
$ cat t376.cu
#include <iostream>
#include <cstdio>
#include <assert.h>

template <typename T>
class cu_vec{  // simple implementation of per-thread "vector"
    const size_t alloc_block_size = 4096;  // tuning parameter
    T *my_ptr;
    size_t n_items;
    size_t alloc_blocks;
    public:
    __host__ __device__
    cu_vec(){
        assert(sizeof(T) <= alloc_block_size);
        n_items = 0;
        my_ptr = (T *)new char[alloc_block_size];
        assert(my_ptr != NULL);
        alloc_blocks = 1;}

    __host__ __device__
    cu_vec(size_t sz){
        assert(sizeof(T) <= alloc_block_size);
        n_items = sz;
        alloc_blocks = (n_items*sizeof(T)+alloc_block_size-1)/alloc_block_size;
        my_ptr = (T *)new char[alloc_blocks*alloc_block_size];
        assert(my_ptr != NULL);
        memset(my_ptr, 0, alloc_blocks*alloc_block_size);}

    __host__ __device__
    ~cu_vec(){
        if (my_ptr != NULL) delete[] (char *)my_ptr;}  // storage was allocated as char[]

    __host__ __device__
    void push_back(T const &item){  // first test if we can just store new item
        if ((n_items+1)*sizeof(T) > alloc_blocks*alloc_block_size){
            // grow by one block: allocate, copy, release the old storage
            T *temp = (T *)new char[(alloc_blocks+1)*alloc_block_size];
            assert(temp != NULL);
            memcpy(temp, my_ptr, alloc_blocks*alloc_block_size);
            delete[] (char *)my_ptr;
            my_ptr = temp;
            alloc_blocks++;}
        my_ptr[n_items] = item;
        n_items++;}

    __host__ __device__
    size_t size(){
        return n_items;}

    __host__ __device__
    void clear(){
        n_items = 0;}

    __host__ __device__
    T& operator[](size_t idx){
        assert(idx < n_items);
        return my_ptr[idx];}

    __host__ __device__
    T& pop_back(){
        if (n_items > 0){
            n_items--;}
        return my_ptr[n_items];}

    __host__ __device__
    T* data(){
        return my_ptr;}

    __host__ __device__
    size_t storage_ratio(){
        return alloc_block_size/sizeof(T);}
};

struct ss
{
    unsigned x;
    float y;
};

__global__ void test(){
    cu_vec<ss> my_vec;
    ss temp = {threadIdx.x, 2.0f};
    my_vec.push_back(temp);
    assert(my_vec.size() == 1);
    assert(my_vec.storage_ratio() >= 1);
    ss temp2 = my_vec[0];
    printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp2.x, temp2.y);
    temp.y = 3.0f;
    my_vec[0].x = temp.x;
    my_vec[0].y = temp.y;
    ss temp3 = my_vec.pop_back();
    printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp3.x, temp3.y);
    my_vec.clear();
    temp.x = 0;
    for (int i = 0; i < 10000; i++){  // exercise repeated growth
        my_vec.push_back(temp);
        temp.x++;}
    temp.x--;
    for (int i = 0; i < 10000; i++){
        assert(my_vec.pop_back().x == temp.x);
        temp.x--;}
    cu_vec<ss> my_vec2(2);  // sized constructor zero-initializes its storage
    assert(my_vec2[1].x == 0);
    assert(my_vec2[1].y == 0.0f);
}

int main(){
    // default heap space is 8MB; if needed, reserve more with:
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576*32));
    test<<<1, 4>>>();
    cudaDeviceSynchronize();
}
$ nvcc -std=c++11 -o t376 t376.cu
$ cuda-memcheck ./t376
========= CUDA-MEMCHECK
threadIdx.x: 0, ss.x: 0, ss.y: 2.000000
threadIdx.x: 1, ss.x: 1, ss.y: 2.000000
threadIdx.x: 2, ss.x: 2, ss.y: 2.000000
threadIdx.x: 3, ss.x: 3, ss.y: 2.000000
threadIdx.x: 0, ss.x: 0, ss.y: 3.000000
threadIdx.x: 1, ss.x: 1, ss.y: 3.000000
threadIdx.x: 2, ss.x: 2, ss.y: 3.000000
threadIdx.x: 3, ss.x: 3, ss.y: 3.000000
========= ERROR SUMMARY: 0 errors
$
The code hasn't been tested any more than what you see here.