0

I'm currently doing some CUDA C++ work for deep learning (pencil-and-paper technique), but I'm stuck on some odd behavior from CUDA.

Here is my class :

// A simple host-side matrix: dimensions plus an owned element buffer.
// The destructor frees `data`, and the copy constructor performs a deep
// copy, so each Matrix owns its own storage.
class Matrix
{
public:
    // Element buffer of width * height floats (allocated with new[]).
    // Printed by operator<< as `height` rows of `width` values.
    float* data;
    int width;
    int height;

    // NOTE(review): the default constructor leaves all three members
    // uninitialised; callers must assign them before use.
    Matrix();
    Matrix(const Matrix&);
    ~Matrix();
    void reset();
    friend std::ostream& operator<<(std::ostream&, const Matrix*);
};

Its definition :

// Default-construct to a safe empty state. The destructor unconditionally
// deletes `data`, so it must never be left uninitialised: destroying a
// default-constructed Matrix would otherwise delete a garbage pointer.
Matrix::Matrix()
    : data(NULL), width(0), height(0)
{
}

// Copy constructor: deep-copies the element buffer so the two objects
// never share storage.
Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    const int n = width * height;
    data = new float[n];
    std::copy(copy.data, copy.data + n, data);
}

// Releases the owned element buffer.
// Fix: `data` is allocated with new[], so it must be released with
// delete[] — plain `delete` on a new[] array is undefined behaviour.
Matrix::~Matrix()
{
    delete[] data;
}

// Zeroes every element of the matrix in a single pass.
void Matrix::reset()
{
    const size_t bytes = width * height * sizeof(float);
    memset(data, 0, bytes);
}

// Streams the matrix as `height` rows of `width` tab-separated values,
// each row terminated by a newline.
std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{
    for (int row = 0; row < matrix->height; ++row)
    {
        for (int col = 0; col < matrix->width; ++col)
        {
            out << matrix->data[row * matrix->width + col];
            out << (col == matrix->width - 1 ? "\n" : "\t");
        }
    }
    return out;
}

And here is a Minimal, Complete and Verifiable example :

// Kernel: adds 1 to one matrix element per thread. Expects the Matrix
// object AND its data pointer to be fully resident in device memory.
// Host launches this as <<< width, height >>>.
// NOTE(review): the index uses blockIdx.x as a row (stride `width`) and
// threadIdx.x as a column, while the launch supplies `width` blocks of
// `height` threads — this only covers the matrix exactly because
// width == height here. There is also no bounds guard; confirm before
// reusing with non-square sizes.
__global__ void add_and_display(Matrix* dev_weights)
{
    dev_weights->data[blockIdx.x * dev_weights->width + threadIdx.x] += 1.f;
}

int main()
{
    // Host-side matrix, plus a pointer that will hold the device-side copy
    // of the Matrix object itself.
    Matrix* weights = new Matrix(), *dev_weights;
    float* weights_elements;

    //For the purpose of testing, creating a checked pattern Matrix
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
    {
        if (i % 2 == 0) 
            weights->data[i] = 0;
        else 
            weights->data[i] = 1;
    }

    int weights_size = weights->width * weights->height * sizeof(float);

    // Device buffer that will hold the matrix elements.
    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));

    //Allocate objects on the device
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));

    //Copy the data to the object allocated on the device
    // Deep copy in three steps: (1) the struct (carries width/height but a
    // host data pointer), (2) the elements into the device buffer, (3) patch
    // the device struct's `data` member to point at that device buffer.
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    // width blocks of height threads (9x9).
    add_and_display <<< weights->width, weights->height >>> (dev_weights);

    HANDLE_ERROR(cudaDeviceSynchronize());

    //Copy back data from device
    float* hostPointer = new float[weights->width * weights->height];
    // NOTE(review): this struct copy overwrites weights->data with the
    // DEVICE pointer (weights_elements) and it is never repointed to host
    // memory afterwards; the original host array is also leaked.
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));

    //Display and get errors here
    // NOTE(review): operator<< dereferences weights->data on the host while
    // it still holds the device pointer -> host segfault. Assigning
    // `weights->data = hostPointer;` before this line fixes the crash.
    cout << weights << endl;

    cudaFree(dev_weights);

    return 0;
}

Here is my macro for errors checking :

// Reports a failed CUDA runtime call (with source location) and aborts.
// Intended to be invoked through the HANDLE_ERROR macro below, which
// supplies the call site's file and line.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess)
        return;
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

So the problem is the program crashes when you try to display the data on the host. I guess the copy back from the device isn't working but I can't find a way to correct it.

Thanks if you can help me out to find the issue.


EDIT 1 : Simplified my post so it's testable for everyone.

  • 1
    Can you edit the shortest complete [MCVE] into your question (please read everything at the link first)? Incomplete code which can't be compiled to run isn't very helpful when the problem is apparently a runtime error. – talonmies May 13 '18 at 11:00
  • 3
    My guess is your `input->data` stores a host pointer which is then copied into `dev_input->data` and eventually accessed from the device, but it's hard to say without a [mcve]. – Angew is no longer proud of SO May 13 '18 at 11:56
  • I edited the post, thanks for your help. – Adrien Lenoir May 13 '18 at 19:50
  • You need to allocate device pointers using cudamalloc or cudamallocmanaged – Regis Portalez May 14 '18 at 06:00
  • Please take a look at [this answer](https://stackoverflow.com/a/31135377/1231073) for information on how to create a device object with member variables also allocated on the device. – sgarizvi May 14 '18 at 07:25
  • Despite what the other commentators are saying, there is absolutely nothing wrong with the device memory allocation or passing of data to and from the GPU. There is only a very small mistake on the host side causing all of this – talonmies May 14 '18 at 11:15

1 Answer

0

There is actually nothing wrong with the allocation, transfer, and use of objects on the device. There is one small mistake in the transfer of data back from the device at the end of the MCVE which is the source of the segfaults. This:

//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));

leaves weights with a device pointer to the underlying weight data which causes a segfault when you attempt to output the matrix contents on the host. The code needs to be changed to this:

//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
weights->data = hostPointer; // weights data must point to hostPointer

Then the code runs correctly:

$ cat weights.cu
#include <iostream>
#include <cstdio>

// Aborts with a diagnostic when a CUDA runtime call fails; the macro
// below forwards the call site's file name and line number.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess)
        return;  // nothing to report
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Host-side matrix: dimensions plus an owned element buffer of
// width * height floats. The destructor frees `data` and the copy
// constructor deep-copies it, so each instance owns its storage.
class Matrix
{
public:
    // Element buffer (new[] allocated); operator<< prints it as
    // `height` rows of `width` values.
    float* data;
    int width;
    int height;

    // NOTE(review): the default constructor leaves all members
    // uninitialised; callers assign them manually (see main below).
    Matrix();
    Matrix(const Matrix&);
    ~Matrix();
    void reset();
    friend std::ostream& operator<<(std::ostream&, const Matrix*);
};

// Default-construct into a safe empty state. The destructor deletes
// `data` unconditionally, so leaving it uninitialised would make
// destroying a default-constructed Matrix undefined behaviour.
Matrix::Matrix()
    : data(NULL), width(0), height(0)
{
}

// Deep copy: allocate a fresh buffer and duplicate the source elements,
// so the copy and the original never share storage.
Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    const int count = copy.width * copy.height;
    data = new float[count];
    std::copy(copy.data, copy.data + count, data);
}

// Frees the owned element buffer.
// Fix: the buffer comes from new[], so it must be released with
// delete[]; plain `delete` on a new[] array is undefined behaviour.
Matrix::~Matrix()
{
    delete[] data;
}

// Sets every element of the buffer back to zero.
void Matrix::reset()
{
    const size_t byte_count = width * height * sizeof(float);
    memset(data, 0, byte_count);
}

// Prints the matrix as `height` rows of `width` tab-separated values,
// ending each row with a newline.
std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{
    for (int r = 0; r < matrix->height; ++r)
    {
        for (int c = 0; c < matrix->width; ++c)
        {
            out << matrix->data[r * matrix->width + c];
            out << (c + 1 == matrix->width ? "\n" : "\t");
        }
    }
    return out;
}

// Kernel: adds 1 to one matrix element per thread. The Matrix object and
// its `data` pointer must both live in device memory. Host launches this
// as <<< width, height >>>.
// NOTE(review): blockIdx.x is used as a row index (stride `width`) while
// the grid has `width` blocks of `height` threads — full coverage relies
// on width == height, and there is no bounds guard.
__global__ void add_and_display(Matrix* dev_weights)
{
    dev_weights->data[blockIdx.x * dev_weights->width + threadIdx.x] += 1.f;
}

int main()
{
    // Host-side matrix, plus a pointer that will receive the device-side
    // copy of the Matrix object itself.
    Matrix* weights = new Matrix(), *dev_weights;
    float* weights_elements;

    // For the purpose of testing, create a checkered-pattern matrix.
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
        weights->data[i] = (i % 2 == 0) ? 0.f : 1.f;

    int weights_size = weights->width * weights->height * sizeof(float);

    // Device buffer for the matrix elements.
    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));

    // Allocate the Matrix object itself on the device.
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));

    // Deep copy to the device in three steps: (1) the struct (width/height,
    // but still carrying a host data pointer), (2) the elements into the
    // device buffer, (3) patch the device struct's `data` member so it
    // points at that device buffer.
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    // width blocks of height threads (9x9 here).
    add_and_display <<< weights->width, weights->height >>> (dev_weights);

    HANDLE_ERROR(cudaGetLastError());       // catch launch-configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize());  // catch asynchronous execution errors

    // Copy back from the device. The struct copy overwrites weights->data
    // with the DEVICE pointer, so remember the original host buffer first
    // (it would otherwise leak), then repoint `data` at host memory so
    // operator<< can safely dereference it.
    float* hostPointer = new float[weights->width * weights->height];
    float* original_host_data = weights->data;
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
    weights->data = hostPointer;            // weights data must point to hostPointer
    delete[] original_host_data;

    // Display the result.
    std::cout << weights << std::endl;

    // Release device resources.
    HANDLE_ERROR(cudaFree(weights_elements));
    HANDLE_ERROR(cudaFree(dev_weights));

    return 0;
}

$ nvcc -g -arch=sm_52 -o weights weights.cu 

$ cuda-memcheck weights
========= CUDA-MEMCHECK
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1

========= ERROR SUMMARY: 0 errors
talonmies
  • 70,661
  • 34
  • 192
  • 269