I think your impression 1 could be implemented approximately with what is described here (per-thread stack, pre-allocated). It has the issue you mention, namely over-allocation. On newer GPUs, several gigabytes of memory (or more) is common, so the over-allocation concern may not be very serious if total memory is not an issue.
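If it helps, here is a minimal sketch of that idea, assuming a known worst-case bound per thread: each thread writes into its own pre-allocated slice of a global buffer. MAX_PER_THREAD, the buffer names, and the toy traversal loop are illustrative placeholders I've invented, not anything from your code:

#include <cstdio>

const int MAX_PER_THREAD = 256;  // assumed worst-case items per thread

__global__ void per_thread_stack(unsigned *store, unsigned *counts, int n){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= n) return;
    unsigned *my_stack = store + (size_t)tid*MAX_PER_THREAD;  // this thread's private slice
    unsigned my_count = 0;
    for (unsigned node = tid+1; node > 1; node /= 2)          // stand-in for the real traversal
        if (my_count < MAX_PER_THREAD)
            my_stack[my_count++] = node;                      // plain store, no atomics needed
    counts[tid] = my_count;                                   // report how much of the slice was used
}

int main(){
    const int n = 4;
    unsigned *store, *counts;
    cudaMalloc(&store, (size_t)n*MAX_PER_THREAD*sizeof(unsigned));  // this product is the over-allocation
    cudaMalloc(&counts, n*sizeof(unsigned));
    per_thread_stack<<<1, n>>>(store, counts, n);
    cudaDeviceSynchronize();
    cudaFree(store); cudaFree(counts);
}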
I think your impression 2 could be implemented approximately with what is described here (device-wide thread-safe vector push_back). It has the issue you mention, namely the lack of ordering of the results in the result vector. That could possibly be resolved with a sorting operation after the collection operation is complete.
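Likewise, a minimal sketch of the device-wide push_back idea: an atomicAdd on a global counter reserves one slot per item. Again, the traversal loop and names are illustrative placeholders:

#include <cstdio>

__global__ void global_push_back(unsigned *result, unsigned *result_count, unsigned capacity, int n){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= n) return;
    for (unsigned node = tid+1; node > 1; node /= 2){  // stand-in for the real traversal
        unsigned idx = atomicAdd(result_count, 1);     // reserve one slot, device-wide
        if (idx < capacity)
            result[idx] = node;                        // arrival order across threads is arbitrary
    }
}

int main(){
    const int n = 4;
    const unsigned capacity = 1024;
    unsigned *result, *result_count;
    cudaMalloc(&result, capacity*sizeof(unsigned));
    cudaMalloc(&result_count, sizeof(unsigned));
    cudaMemset(result_count, 0, sizeof(unsigned));
    global_push_back<<<1, n>>>(result, result_count, capacity, n);
    cudaDeviceSynchronize();
    // *result_count now holds the total pushed; sort result afterwards if ordering matters
    cudaFree(result); cudaFree(result_count);
}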
(It sounds like you probably already have an idea of how to do your "worst-case" impression 4.)
That leaves impression 3. We could use an idea that combines impressions 1 and 2, i.e. create a per-thread vector push_back, but use on-demand allocation via in-kernel malloc or new. In-kernel memory allocation like this is pretty slow, and not without its own issues (e.g. you may have to reserve additional heap space, in-kernel allocated heap memory cannot participate in a transfer to the host, and small allocations may be inefficient in memory usage), but there is really no way to tell which of these approaches might be best without more information about the dimensions of your problem. If keeping track of the parent nodes is a relatively infrequent operation as you traverse the graph, the dynamic allocation approach may not be a problem.
Here's an example of how a simple vector (per-thread) could be created:
$ cat t376.cu
#include <iostream>
#include <cstdio>
#include <assert.h>

template <typename T>
class cu_vec{  // simple implementation of per-thread "vector"
    const size_t alloc_block_size = 4096;  // tuning parameter
    T *my_ptr;
    size_t n_items;
    size_t alloc_blocks;
    public:
    __host__ __device__
    cu_vec(){
        assert(sizeof(T) <= alloc_block_size);
        n_items = 0;
        my_ptr = (T *)new char[alloc_block_size];
        assert(my_ptr != NULL);
        alloc_blocks = 1;}

    __host__ __device__
    cu_vec(size_t sz){
        assert(sizeof(T) <= alloc_block_size);
        n_items = sz;
        alloc_blocks = (n_items*sizeof(T)+alloc_block_size-1)/alloc_block_size;
        my_ptr = (T *)new char[alloc_blocks*alloc_block_size];
        assert(my_ptr != NULL);
        memset(my_ptr, 0, alloc_blocks*alloc_block_size);}

    __host__ __device__
    ~cu_vec(){
        if (my_ptr != NULL) delete[] (char *)my_ptr;}  // storage was allocated as char[]

    __host__ __device__
    void push_back(T const &item){  // first test if we can just store new item
        if ((n_items+1)*sizeof(T) > alloc_blocks*alloc_block_size){
            // grow by one block: allocate, copy, release the old storage
            T *temp = (T *)new char[(alloc_blocks+1)*alloc_block_size];
            assert(temp != NULL);
            memcpy(temp, my_ptr, alloc_blocks*alloc_block_size);
            delete[] (char *)my_ptr;
            my_ptr = temp;
            alloc_blocks++;}
        my_ptr[n_items] = item;
        n_items++;}

    __host__ __device__
    size_t size(){
        return n_items;}

    __host__ __device__
    void clear(){
        n_items = 0;}

    __host__ __device__
    T& operator[](size_t idx){
        assert(idx < n_items);
        return my_ptr[idx];}

    __host__ __device__
    T& pop_back(){
        if (n_items > 0){
            n_items--;}
        return my_ptr[n_items];}

    __host__ __device__
    T* data(){
        return my_ptr;}

    __host__ __device__
    size_t storage_ratio(){
        return alloc_block_size/sizeof(T);}
};

struct ss
{
    unsigned x;
    float y;
};

__global__ void test(){
    cu_vec<ss> my_vec;
    ss temp = {threadIdx.x, 2.0f};
    my_vec.push_back(temp);
    assert(my_vec.size() == 1);
    assert(my_vec.storage_ratio() >= 1);
    ss temp2 = my_vec[0];
    printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp2.x, temp2.y);
    temp.y = 3.0f;
    my_vec[0].x = temp.x;
    my_vec[0].y = temp.y;
    ss temp3 = my_vec.pop_back();
    printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp3.x, temp3.y);
    my_vec.clear();
    temp.x = 0;
    for (int i = 0; i < 10000; i++){  // exercise repeated growth
        my_vec.push_back(temp);
        temp.x++;}
    temp.x--;
    for (int i = 0; i < 10000; i++){
        assert(my_vec.pop_back().x == temp.x);
        temp.x--;}
    cu_vec<ss> my_vec2(2);  // sized constructor zero-initializes its storage
    assert(my_vec2[1].x == 0);
    assert(my_vec2[1].y == 0.0f);
}

int main(){
    // default heap space is 8MB; if needed, reserve more with:
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576*32));
    test<<<1, 4>>>();
    cudaDeviceSynchronize();
}
$ nvcc -std=c++11 -o t376 t376.cu
$ cuda-memcheck ./t376
========= CUDA-MEMCHECK
threadIdx.x: 0, ss.x: 0, ss.y: 2.000000
threadIdx.x: 1, ss.x: 1, ss.y: 2.000000
threadIdx.x: 2, ss.x: 2, ss.y: 2.000000
threadIdx.x: 3, ss.x: 3, ss.y: 2.000000
threadIdx.x: 0, ss.x: 0, ss.y: 3.000000
threadIdx.x: 1, ss.x: 1, ss.y: 3.000000
threadIdx.x: 2, ss.x: 2, ss.y: 3.000000
threadIdx.x: 3, ss.x: 3, ss.y: 3.000000
========= ERROR SUMMARY: 0 errors
$
The code hasn't been tested any more than what you see here.