How to return pointers from CUDA to C without memory allocation?

Question

I have CUDA function that returns 3 pointers: csrVal, csrRowPtr, csrColInd.

void dense2Csr (int dim,
             cuComplex *dnMatr,
             cuComplex *csrVal,
             int *csrRowPtr,
             int *csrColInd)
{
cusparseHandle_t   cusparseH = NULL;   // residual evaluation
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix
cusparseStatus_t cudaStat1 = CUSPARSE_STATUS_SUCCESS;
int nnZ;

//Input GPU Copy
cuComplex *d_dnMatr;
int *d_nnzRow;


//Output GPU Copy
cuComplex *d_csrVal;
int *d_csrRowPtr;
int *d_csrColInd;


cusparseCreate(&cusparseH); //Create SparseStructure
cudaStreamCreate(&stream);
cusparseSetStream(cusparseH, stream);
cusparseCreateMatDescr(&descrA);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); //Set First Element RowPtr eq. to zero


cudaMalloc((void **)&d_dnMatr   , sizeof(cuComplex)*dim*dim);
cudaMalloc((void **)&d_nnzRow   , sizeof(int)*dim);
cudaMemcpy(d_dnMatr  , dnMatr   , sizeof(cuComplex)*dim*dim  , cudaMemcpyHostToDevice);


cusparseCnnz(cusparseH,
             CUSPARSE_DIRECTION_ROW,
             dim,
             dim,
             descrA,
             d_dnMatr,
             dim,
             d_nnzRow,
             &nnZ);




cudaMalloc((void **)&d_csrRowPtr   , sizeof(int)*(dim+1));
cudaMalloc((void **)&d_csrColInd   , sizeof(int)*nnZ);
cudaMalloc((void **)&d_csrVal   , sizeof(cuComplex)*nnZ);


cudaStat1 = cusparseCdense2csr(cusparseH,
                   dim,
                   dim,
                   descrA,
                   d_dnMatr,
                   dim,
                   d_nnzRow,
                   d_csrVal,
                   d_csrRowPtr,
                   d_csrColInd);

assert(cudaStat1 == CUSPARSE_STATUS_SUCCESS);

cudaMallocHost((void **)&csrRowPtr   , sizeof(int)*(dim+1));
cudaMallocHost((void **)&csrColInd   , sizeof(int)*nnZ);
cudaMallocHost((void **)&csrVal   , sizeof(cuComplex)*nnZ);

cudaMemcpy(csrVal, d_csrVal, sizeof(cuComplex)*nnZ, cudaMemcpyDeviceToHost);
cudaMemcpy(csrRowPtr, d_csrRowPtr, sizeof(int)*(dim+1), cudaMemcpyDeviceToHost);
cudaMemcpy(csrColInd, d_csrColInd, sizeof(int)*(nnZ), cudaMemcpyDeviceToHost);



if (d_csrVal) cudaFree(d_csrVal);
if (d_csrRowPtr) cudaFree(d_csrRowPtr);
if (d_csrColInd) cudaFree(d_csrColInd);
if (cusparseH  ) cusparseDestroy(cusparseH);
if (stream     ) cudaStreamDestroy(stream);

And I call it in C code (with 100% proper linking):

dense2Csr(dim, Sigma, csrValSigma, csrRowPtrSigma, csrColIndSigma);

or

dense2Csr(dim, Sigma, &csrValSigma[0], &csrRowPtrSigma[0], &csrColIndSigma[0]);

And in both ways it writes me

Process finished with exit code 139 (interrupted by signal 11: SIGSEGV)

So, it is a memory error, and I solved it just by allocating a host memory in the main program (and without cudaMallocHost in the function) just before calling dense2Csr. But now I am unable to do it in this way. So, is there a recipe to make the function eat a null poiters, and make it return a pointer to a memory region in a such setup?

Pass the pointers by reference, not value. Look at how cudaMalloc works. — talonmies, May 06 '18 at 15:43
Neither is cudaMalloc. C has a pass by reference idiom. It doesn't have references. What you you do if dim needed to be modified within the function? There in lies the answer for the pointers also — talonmies, May 06 '18 at 16:12
Well, I would use &dim instead of dim, but as I already wrote, It doesn't work. I don't understand you. What I didn't mention is that dense2Csr function is inside a shared library. — Indian, May 06 '18 at 17:28
I will repeat my first comment again. Look at the prototype for cudaMalloc and then ask yourself why that works. That is exactly what you need here — talonmies, May 06 '18 at 17:44
I do appreciate that you still answering, but still, I'm too dumb to understand either you or cudaMalloc. — Indian, May 06 '18 at 18:38

talonmies · Accepted Answer · 2018-05-07T16:59:06.240

It appears you have found the C pass by reference idiom by yourself, and that will work perfectly fine for what you seem to need to do. A much more elegant and logical way to do the same thing is to define a structure containing the pointers you allocate within your function, and have the function return the structure by value.

So your code could be modified like this:

#include <cusparse.h>
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
    cuComplex *csrVal;
    int *csrRowPtr;
    int *csrColInd;
} csr_struct;

csr_struct dense2Csr (int dim, cuComplex *dnMatr)
{
    cusparseHandle_t   cusparseH = NULL;   // residual evaluation
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix
    cusparseStatus_t cudaStat1 = CUSPARSE_STATUS_SUCCESS;
    int nnZ;

    //Input GPU Copy
    cuComplex *d_dnMatr;
    int *d_nnzRow;

    //Output GPU Copy
    cuComplex *d_csrVal;
    int *d_csrRowPtr;
    int *d_csrColInd;

    // return value
    csr_struct mat;

    cusparseCreate(&cusparseH); //Create SparseStructure
    cudaStreamCreate(&stream);
    cusparseSetStream(cusparseH, stream);
    cusparseCreateMatDescr(&descrA);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); //Set First Element RowPtr eq. to zero

    cudaMalloc((void **)&d_dnMatr   , sizeof(cuComplex)*dim*dim);
    cudaMalloc((void **)&d_nnzRow   , sizeof(int)*dim);
    cudaMemcpy(d_dnMatr  , dnMatr   , sizeof(cuComplex)*dim*dim  , cudaMemcpyHostToDevice);

    cusparseCnnz(cusparseH,
            CUSPARSE_DIRECTION_ROW,
            dim, dim, descrA, d_dnMatr, dim,
            d_nnzRow, &nnZ);

    cudaMalloc((void **)&d_csrRowPtr   , sizeof(int)*(dim+1));
    cudaMalloc((void **)&d_csrColInd   , sizeof(int)*nnZ);
    cudaMalloc((void **)&d_csrVal   , sizeof(cuComplex)*nnZ);

    cudaStat1 = cusparseCdense2csr(cusparseH,
            dim, dim, descrA, d_dnMatr, dim, d_nnzRow,
            d_csrVal, d_csrRowPtr, d_csrColInd); 
    assert(cudaStat1 == CUSPARSE_STATUS_SUCCESS);

    cudaMallocHost((void **)&mat.csrRowPtr   , sizeof(int)*(dim+1));
    cudaMallocHost((void **)&mat.csrColInd   , sizeof(int)*nnZ);
    cudaMallocHost((void **)&mat.csrVal   , sizeof(cuComplex)*nnZ);

    cudaMemcpy(mat.csrVal, d_csrVal, sizeof(cuComplex)*nnZ, cudaMemcpyDeviceToHost);
    cudaMemcpy(mat.csrRowPtr, d_csrRowPtr, sizeof(int)*(dim+1), cudaMemcpyDeviceToHost);
    cudaMemcpy(mat.csrColInd, d_csrColInd, sizeof(int)*(nnZ), cudaMemcpyDeviceToHost);

    if (d_csrVal) cudaFree(d_csrVal);
    if (d_csrRowPtr) cudaFree(d_csrRowPtr);
    if (d_csrColInd) cudaFree(d_csrColInd);
    if (cusparseH  ) cusparseDestroy(cusparseH);
    if (stream     ) cudaStreamDestroy(stream);

    return mat;
}

int main()
{
    const int dim = 1024;
    const size_t sz = sizeof(cuComplex) * dim * dim;
    cuComplex* dmat = malloc(sz);
    memset(dmat, 0, sz);
    const cuComplex ten_plus_nine_i = { 10.0, 9.0 };
    for(int i=0; i<dim; i++)
        dmat[i * (dim + 1)] = ten_plus_nine_i;

    csr_struct smat = dense2Csr(dim, dmat);

    for(int j=0; j<10; j++) {
        cuComplex x = smat.csrVal[j];
        printf("%d %d %f + %f i\n", smat.csrColInd[j], smat.csrRowPtr[j], x.x, x.y);
    }

    return 0;
}

which appears to work correctly (note this example requires a C99 compliant compiler even if the structure return code doesn't):

$ nvcc -Xcompiler="-std=c99" -o intialainen intialainen.c -lcudart -lcusparse
cc1plus: warning: command line option -std=c99 is valid for C/ObjC but not for C++ [enabled by default]

$ ./intialainen 
0 0 10.000000 + 9.000000 i
1 1 10.000000 + 9.000000 i
2 2 10.000000 + 9.000000 i
3 3 10.000000 + 9.000000 i
4 4 10.000000 + 9.000000 i
5 5 10.000000 + 9.000000 i
6 6 10.000000 + 9.000000 i
7 7 10.000000 + 9.000000 i
8 8 10.000000 + 9.000000 i
9 9 10.000000 + 9.000000 i

or directly using gcc:

$ gcc -std=c99 -o intialainen intialainen.c -I /opt/cuda-9.0/include -L /opt/cuda-9.0/lib64 -lcudart -lcusparse -lcuda
$ ./intialainen 
0 0 10.000000 + 9.000000 i
1 1 10.000000 + 9.000000 i
2 2 10.000000 + 9.000000 i
3 3 10.000000 + 9.000000 i
4 4 10.000000 + 9.000000 i
5 5 10.000000 + 9.000000 i
6 6 10.000000 + 9.000000 i
7 7 10.000000 + 9.000000 i
8 8 10.000000 + 9.000000 i
9 9 10.000000 + 9.000000 i

The function call csr_struct smat = dense2Csr(dim, dmat) is simpler and easier to follow than something like dense2Csr(dim, dmat, &p1, &p2, &p2) which would be the alternative, although this is entirely a matter of taste.

How to return pointers from CUDA to C without memory allocation?

1 Answers1