Error memcpy in device code

Question

I write a code that get first _var positions of a vector of possibilities (i.e., matrix _size*_var with _var=3 and _size=27) and calling this function in my kernel (32 threads, ie, each has an object) but I do not get any return value of the function neither the NULL pointer.
The program exit without error but the printf lines in the kernel is not executed or displayed (even compiled with sm_20 or higher) as if the program stopped before.
dataIntern.h:

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1

#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif

template <class a_type>
class dataIntern{
private:
    a_type *possibilities;
    int _assign;
    int _size;
    int _var;
    int _maxsize;

public:
    CUDA_CALLABLE_MEMBER dataIntern(){
    }

    CUDA_CALLABLE_MEMBER dataIntern(int var){
        _var = var;
        _size = (int)pow(3.0, (double)_var);
        _maxsize = _size * _var;
        _assign = 1;
        possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
        if(!possibilities){
            exit(1);
        }
        createTable();
    }

    CUDA_CALLABLE_MEMBER void createTable(){
        int i, j, k, limit, pos;
        a_type value;
        if(_assign == 1){
            for(i=0; i<_var; i++){
                #ifdef __CUDA_ARCH__
                    limit = (int)pow(3.0, _var-i-1);
                #else
                    limit = (int)pow(3.0, (double)_var-i-1);
                #endif

                value = (a_type)_MIN;
                k = 0;
                for(j=0; j<_size; j++){
                    pos = _var*j+i;
                    if(k >= limit){
                        value++;
                        if(value > _MAX){
                            value = (a_type)_MIN;
                        }
                        k = 0;
                    }
                    possibilities[pos] = value;
                    k++;
                }
            }           
        }
    }

    CUDA_CALLABLE_MEMBER void print(){
        int i;

        printf("Printing.\n");
        if(_assign == 1){
            for(i=0; i<_size*_var; i++){
                printf("%d ", possibilities[i]);
                if(i%_var == _var-1){
                    printf("\n");
                }
            }
        }
        else{
            printf("Not assigned.\n");
        }
    }

    CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
        int i, pos, count, initpos, attrib;
        a_type *newnode;
        a_type *newlist = NULL, *morelist = NULL;
        pos = posChanged;
        initpos = 0;
        count = 0;      

        if(_assign == 1){
            attrib = 0;
            newnode = (a_type*)malloc(_var*sizeof(a_type));
            for(i=0; i<_size; i++){
                if(possibilities[pos] == valueRetified){
                    memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));

                    count++;
                    if(newlist!=NULL){
                        morelist = (a_type*)malloc(count*_var*sizeof(a_type));
                        memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
                    }
                    newlist = (a_type*)malloc(count*_var*sizeof(a_type));
                    memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
                    memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));

                    initpos+=_var;
                    attrib = 1;
                }
                pos+=_var;
            }

            if(attrib == 1){
                _size = count;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Allocation fail in newlist retify.\n");
                    exit(1);
                }
                memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
            }
            else{
                _assign = 0;
            }
        }
    }

    CUDA_CALLABLE_MEMBER a_type* unstack(){
        a_type* solution = NULL, *backup = NULL;

        if(_assign == 1){
            if(_size>0){
                backup = (a_type*)malloc(_var*_size*sizeof(a_type));
                if(backup == NULL){
                    printf("Erro to alloc backup pointer on unstack function in data intern\n");
                    return NULL;
                }

                solution = (a_type*)malloc(_var*sizeof(a_type));
                if(solution == NULL){
                    printf("Erro to alloc solution pointer on unstack function in data intern\n");
                    return NULL;
                }
                memcpy(backup, possibilities, _size*_var*sizeof(a_type));                   
                memcpy(solution, possibilities, _var*sizeof(a_type));

                free(possibilities);

                _size--;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Error to realloc possibilities pointer in data intern\n");
                    return NULL;
                }
                memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));

                free(backup);
                return solution;
            }           
        }
        return NULL;
    }

    CUDA_CALLABLE_MEMBER int get_size(){
        return _size;
    }

    CUDA_CALLABLE_MEMBER ~dataIntern(){
        _assign = 0;
        if(possibilities)
            free(possibilities);
    }
};

deviceCode.h:

#ifndef DEVICECODE_H
#define DEVICECODE_H

void CallingInMain();
__global__ void kernel();
#endif

deviceCode.cu:

#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>


//I declared like this to my kernel:
__global__ void kernel(){
    __shared__ dataIntern<int> data[32];
    int *vetor;

    vetor = NULL;
    data[threadIdx.x] = dataIntern<int>(3);

    //_var == 3 in the class above
    vetor = (int*)malloc(sizeof(int)*3);

    vetor = data[threadIdx.x].unstack();
    while(vetor!=NULL){
        //never past here
        printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
        vetor = data[threadIdx.x].unstack();
    }
    //neither here in if or else
    if(vetor)
        printf("Not null\n");
    else
        printf("Null final\n");

    free(vetor);
}

void CallingInMain(){
    kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
}

main.cu:

#include <iostream>
#include <stdio.h>

#ifndef deviceCode_H
#include "deviceCode.h"
#endif

int main(int argc, char* argv[]){


    CallingInMain();

    return 0;
}

Your code is not compilable. It is, among other things, missing a copy-constructor, and a `main` routine. SO [expects](http://stackoverflow.com/help/on-topic) for questions like these ("Why isn't this code working?"), that you provide an [MCVE](http://stackoverflow.com/help/mcve). That is a complete code that someone else could compile, and run, and see the issue. Also, you should use [proper cuda error checking](http://stackoverflow.com/questions/14038589). What happens if you run your code with `cuda-memcheck` ? — Robert Crovella, Oct 20 '15 at 23:45
I post only main part. I don't have a copy-constructor, it's really necessary? My main question is about return of dynamic pointer in device, it is possible and has good perform? — realbas, Oct 21 '15 at 16:12
Your code as posted now will compile (although it has warnings that you should pay attention to) but at runtime it has various errors. I suggest adding the proper cuda error checking I mentioned as well as running your code with `cuda-memcheck`. The question you have now posted in the comments appears to be **entirely** different than what you have posted in your question, which seems to be asking why the code is not working. Nevertheless, your code is not working correctly, and I would suggest tackling that first before your question about performance (which to me is unclear). — Robert Crovella, Oct 21 '15 at 16:22
In fact, I am confused if the second question (return of pointers) may be the cause of the first. About cuda error checking, I will try use it. — realbas, Oct 21 '15 at 16:51

Robert Crovella · Answer 1 · 2015-10-31T16:14:58.047

Some colleagues pointed out to me that your code seems to have an error in it.

Consider this line in your kernel:

data[threadIdx.x] = dataIntern<int>(3);

This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then does a copy from that object to the storage in data[threadIdx.x]. Note that the constructor performs a malloc operation:

CUDA_CALLABLE_MEMBER dataIntern(int var){
    ...
    possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));

But since the original object is temporary, the C++ standard allows the object to be deleted at the termination of the statement, i.e. at the semicolon here:

data[threadIdx.x] = dataIntern<int>(3);
                                      ^

after the copy-construction process is complete. But the deletion of the object triggers the destructor, which does a free operation on possibilities:

CUDA_CALLABLE_MEMBER ~dataIntern(){
    _assign = 0;
    if(possibilities)
        free(possibilities);
}

Therefore usage of the pointer so allocated subsequent to this line of code:

data[threadIdx.x] = dataIntern<int>(3);

such as in unstack here:

vetor = data[threadIdx.x].unstack();

will be invalid.

This is a violation of C++ programming rules, and the error is not specific to CUDA.

I thought that was the error because when I took the destructor working perfectly so I changed my data structure to a struct, initializes out of the kernel and passed them to device memory. Thank you anyway. — realbas, Nov 03 '15 at 17:52
I have a question outside that context. When declares a variable in shared memory your content is the same for all blocks? i.e. if I change your value with block 0, the contents of the variable will be changed to the other blocks as well? — realbas, Nov 26 '15 at 18:45
no, the scope of shared memory is only for a single block. Each block in your kernel has its own separately scoped shared memory. — Robert Crovella, Feb 13 '16 at 02:13

Error memcpy in device code

1 Answers1