I have CUDA function that returns 3 pointers: csrVal, csrRowPtr, csrColInd.
void dense2Csr (int dim,
cuComplex *dnMatr,
cuComplex *csrVal,
int *csrRowPtr,
int *csrColInd)
{
cusparseHandle_t cusparseH = NULL; // residual evaluation
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix
cusparseStatus_t cudaStat1 = CUSPARSE_STATUS_SUCCESS;
int nnZ;
//Input GPU Copy
cuComplex *d_dnMatr;
int *d_nnzRow;
//Output GPU Copy
cuComplex *d_csrVal;
int *d_csrRowPtr;
int *d_csrColInd;
cusparseCreate(&cusparseH); //Create SparseStructure
cudaStreamCreate(&stream);
cusparseSetStream(cusparseH, stream);
cusparseCreateMatDescr(&descrA);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); //Set First Element RowPtr eq. to zero
cudaMalloc((void **)&d_dnMatr , sizeof(cuComplex)*dim*dim);
cudaMalloc((void **)&d_nnzRow , sizeof(int)*dim);
cudaMemcpy(d_dnMatr , dnMatr , sizeof(cuComplex)*dim*dim , cudaMemcpyHostToDevice);
cusparseCnnz(cusparseH,
CUSPARSE_DIRECTION_ROW,
dim,
dim,
descrA,
d_dnMatr,
dim,
d_nnzRow,
&nnZ);
cudaMalloc((void **)&d_csrRowPtr , sizeof(int)*(dim+1));
cudaMalloc((void **)&d_csrColInd , sizeof(int)*nnZ);
cudaMalloc((void **)&d_csrVal , sizeof(cuComplex)*nnZ);
cudaStat1 = cusparseCdense2csr(cusparseH,
dim,
dim,
descrA,
d_dnMatr,
dim,
d_nnzRow,
d_csrVal,
d_csrRowPtr,
d_csrColInd);
assert(cudaStat1 == CUSPARSE_STATUS_SUCCESS);
cudaMallocHost((void **)&csrRowPtr , sizeof(int)*(dim+1));
cudaMallocHost((void **)&csrColInd , sizeof(int)*nnZ);
cudaMallocHost((void **)&csrVal , sizeof(cuComplex)*nnZ);
cudaMemcpy(csrVal, d_csrVal, sizeof(cuComplex)*nnZ, cudaMemcpyDeviceToHost);
cudaMemcpy(csrRowPtr, d_csrRowPtr, sizeof(int)*(dim+1), cudaMemcpyDeviceToHost);
cudaMemcpy(csrColInd, d_csrColInd, sizeof(int)*(nnZ), cudaMemcpyDeviceToHost);
if (d_csrVal) cudaFree(d_csrVal);
if (d_csrRowPtr) cudaFree(d_csrRowPtr);
if (d_csrColInd) cudaFree(d_csrColInd);
if (cusparseH ) cusparseDestroy(cusparseH);
if (stream ) cudaStreamDestroy(stream);
And I call it in C code (with 100% proper linking):
dense2Csr(dim, Sigma, csrValSigma, csrRowPtrSigma, csrColIndSigma);
or
dense2Csr(dim, Sigma, &csrValSigma[0], &csrRowPtrSigma[0], &csrColIndSigma[0]);
And in both ways it writes me
Process finished with exit code 139 (interrupted by signal 11: SIGSEGV)
So, it is a memory error, and I solved it just by allocating a host memory in the main program (and without cudaMallocHost in the function) just before calling dense2Csr. But now I am unable to do it in this way. So, is there a recipe to make the function eat a null poiters, and make it return a pointer to a memory region in a such setup?