The following program tests the dense to sparse conversion using cuSPARSE
. It produces garbage in the first several lines of output. But if I move the lines marked with (2)
to the place after the lines marked with (1)
, the program works fine. Can someone tell me what could be the reason?
EDIT:
To make the presentation clearer, I rewrote the program with thrust
, the same issue persists.
EDIT:
As suggested by Robert, I changed it back to the version without thrust
and added api level error check code.
#include <iostream>
#include <cusparse_v2.h>
using std::cerr;
using std::cout;
using std::endl;
#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP( \
cusparseStatus_t err = (x); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \
<< __LINE__ << " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define CHKcuda(x) WRAP( \
cudaError_t err = (x); \
if (err != cudaSuccess) { \
cerr << "Cuda Error #" << int(err) << ", \"" \
<< cudaGetErrorString(err) << "\" at Line " << __LINE__ \
<< " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define ALLOC(X, T, N) do { \
h##X = (T*) malloc(sizeof(T) * (N)); \
CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)
int main() {
srand(100);
cusparseHandle_t g_cusparse_handle;
CHKcusparse(cusparseCreate(&g_cusparse_handle));
const int n = 100, in_degree = 10;
int nnz = n * in_degree, nn = n * n;
int *dnnz, *dridx, *dcols;
int *hnnz, *hridx, *hcols;
float *dvals, *dmat;
float *hvals, *hmat;
// (1) The number of non-zeros in each column.
ALLOC(nnz, int, n);
// The dense matrix.
ALLOC(mat, float, nn);
// The values in sparse matrix.
ALLOC(vals, float, nnz);
// (2) The row indices of the sparse matrix.
ALLOC(ridx, int, nnz);
// The column offsets of the sparse matrix.
ALLOC(cols, int, n+1);
// Fill and copy dense matrix and number of non-zeros.
for (int i = 0; i < nn; i++) {hmat[i] = rand();}
for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
CHKcuda(cudaDeviceSynchronize());
// Perform dense to CSC format
cusparseMatDescr_t cspMatDesc;
CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
CHKcusparse(cusparseSdense2csc(
g_cusparse_handle, n, n, cspMatDesc, dmat, n,
dnnz, dvals, dridx, dcols
));
// Copy row indices back.
CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
CHKcuda(cudaDeviceSynchronize());
CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));
// Display row indices.
for (int i = 0; i < n; i++) {
for (int j = 0; j < in_degree; j++) {
std::cout << hridx[i * in_degree + j] << ", ";
}
std::cout << std::endl;
}
CHKcuda(cudaFree(dnnz));
CHKcuda(cudaFree(dvals));
CHKcuda(cudaFree(dridx));
CHKcuda(cudaFree(dcols));
CHKcuda(cudaFree(dmat));
free(hnnz);
free(hmat);
free(hvals);
free(hridx);
free(hcols);
return 0;
}