
The following program tests dense-to-sparse conversion using cuSPARSE. It produces garbage in the first several lines of output, but if I move the lines marked with (2) to just after the lines marked with (1), the program works fine. Can someone tell me what the reason could be?

EDIT: To make the presentation clearer, I rewrote the program with thrust; the same issue persists.

EDIT: As suggested by Robert, I changed it back to the version without thrust and added API-level error-checking code.

#include <iostream>
#include <cusparse_v2.h>

using std::cerr;
using std::cout;
using std::endl;

#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP(                                        \
  cusparseStatus_t err = (x);                                       \
  if (err != CUSPARSE_STATUS_SUCCESS) {                             \
    cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line "   \
         << __LINE__ << " of " << __FILE__ << ": " << #x << endl;   \
    exit(1);                                                        \
  }                                                                 \
)
#define CHKcuda(x) WRAP(                                             \
  cudaError_t err = (x);                                             \
  if (err != cudaSuccess) {                                          \
    cerr << "Cuda Error #" << int(err) << ", \""                     \
         << cudaGetErrorString(err) << "\" at Line " << __LINE__     \
         << " of " << __FILE__ << ": " << #x << endl;                \
    exit(1);                                                         \
  }                                                                  \
)
#define ALLOC(X, T, N) do {                            \
  h##X = (T*) malloc(sizeof(T) * (N));                 \
  CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)

int main() {
  srand(100);

  cusparseHandle_t g_cusparse_handle;
  CHKcusparse(cusparseCreate(&g_cusparse_handle));

  const int n = 100, in_degree = 10;
  int nnz = n * in_degree, nn = n * n;

  int *dnnz, *dridx, *dcols;
  int *hnnz, *hridx, *hcols;
  float *dvals, *dmat;
  float *hvals, *hmat;

  // (1) The number of non-zeros in each column.
  ALLOC(nnz, int, n);

  // The dense matrix.
  ALLOC(mat, float, nn);

  // The values in sparse matrix.
  ALLOC(vals, float, nnz);

  // (2) The row indices of the sparse matrix.
  ALLOC(ridx, int, nnz);

  // The column offsets of the sparse matrix.
  ALLOC(cols, int, n+1);

  // Fill and copy dense matrix and number of non-zeros.
  for (int i = 0; i < nn; i++) {hmat[i] = rand();}
  for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
  CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
  CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
  CHKcuda(cudaDeviceSynchronize());

  // Convert the dense matrix to CSC format.
  cusparseMatDescr_t cspMatDesc;
  CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
  CHKcusparse(cusparseSdense2csc(
      g_cusparse_handle, n, n, cspMatDesc, dmat, n,
      dnnz, dvals, dridx, dcols
  ));

  // Copy row indices back.
  CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
  CHKcuda(cudaDeviceSynchronize());
  CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));

  // Display row indices.
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < in_degree; j++) {
      std::cout << hridx[i * in_degree + j] << ", ";
    }
    std::cout << std::endl;
  }

  CHKcuda(cudaFree(dnnz));
  CHKcuda(cudaFree(dvals));
  CHKcuda(cudaFree(dridx));
  CHKcuda(cudaFree(dcols));
  CHKcuda(cudaFree(dmat));
  free(hnnz);
  free(hmat);
  free(hvals);
  free(hridx);
  free(hcols);
  return 0;
}
shaoyl85
  • No error checking? Before asking others for help, you should take advantage of basic API-level error checking for CUDA and cusparse APIs. You have cusparse functions that are returning errors regardless of the positioning of the lines marked (1) or (2). You are declaring that nnz per column is 10, but in fact you are initializing your dense matrix with more than 10 non-zero elements per column, which is causing the dense-to-sparse conversion to blow up. Cusparse provides a function to precompute the nnz per column, but in your case you can eliminate the error simply by setting `in_degree` to 100 instead of 10. – Robert Crovella Jan 13 '14 at 03:49
  • Thank you for the reminder. I extracted this from a large code base to ask the question, and I have tested these calls; they all return successfully. As for the dense matrix, I intend to make it n by n, then convert it to a sparse matrix of size n by n with each column having 10 non-zero elements. If this is the setting, is my way of calling the conversion function correct? Or is there anything I understood wrong? – shaoyl85 Jan 13 '14 at 03:52
  • @RobertCrovella Sorry, I see your point now. Do you mean that if my dense matrix has more than 10 non-zero elements in each column, I cannot call the conversion with nnz per column equal to 10? Won't the conversion automatically select the 10 largest non-zero elements? – shaoyl85 Jan 13 '14 at 03:55
  • No, it does not automatically select the 10 largest non-zero elements. If you think about this carefully, I think you will realize that if the nnz per column does not match the matrix you pass, the results can be ambiguous. The nnz per column that you pass is expected to match the actual dense matrix that you pass. Use the [cusparse function available for this purpose](http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-nnz), if you like. And when I run the first code you had posted and test for API-level errors, I get an error on the last `cudaMemcpyAsync` call. – Robert Crovella Jan 13 '14 at 04:08
  • You can see the errors with your thrust version as well. Either run your code with `cuda-memcheck` or review [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) (which also discusses thrust error checking). If you are not seeing errors, your error-checking methodology is broken. – Robert Crovella Jan 13 '14 at 04:09
  • Thank you, I will study `cuda-memcheck` and give it a try – shaoyl85 Jan 13 '14 at 04:27

2 Answers


The basic problem is that you are passing internally inconsistent data to the dense-to-sparse routine: you are passing a dense matrix that has 100 non-zero elements per column, while telling cusparse that there are only 10 non-zero elements per column.

If you run your code with cuda-memcheck, you will see that there are errors coming out of cusparse.

For this code, you can fix the issue by changing your `in_degree` variable to 100.

For the general case, cusparse provides a convenient routine, [cusparse<t>nnz](http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-nnz), to populate the number of non-zero elements per column correctly; a sketch follows.
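
A minimal sketch in terms of the question's variable names (the handle, descriptor, dmat and dnnz are assumed to be set up exactly as in the question; the CSC allocations move after the count so they can be sized from the computed total):

// Let cusparse count the non-zeros per column (written into dnnz) and
// the total (returned through a host pointer, the default pointer mode).
int totalNnz = 0;
CHKcusparse(cusparseSnnz(
    g_cusparse_handle, CUSPARSE_DIRECTION_COLUMN, n, n,
    cspMatDesc, dmat, n, dnnz, &totalNnz));

// Size the CSC arrays from the computed total instead of a hard-coded guess.
CHKcuda(cudaMalloc((void**)&dvals, sizeof(float) * totalNnz));
CHKcuda(cudaMalloc((void**)&dridx, sizeof(int) * totalNnz));
CHKcuda(cudaMalloc((void**)&dcols, sizeof(int) * (n + 1)));

// Now dnnz is guaranteed to be consistent with dmat.
CHKcusparse(cusparseSdense2csc(
    g_cusparse_handle, n, n, cspMatDesc, dmat, n,
    dnnz, dvals, dridx, dcols));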

Robert Crovella
  • So I think my real question is how to efficiently select k elements with largest absolute values from each column and set the rest to zero. Do you have any idea? I think it boils down to the problem of multiple k-selection in parallel. – shaoyl85 Jan 14 '14 at 02:41
  • The only thing that comes to mind is doing a [sort-by-key](http://thrust.github.io/doc/group__sorting.html#ga2bb765aeef19f6a04ca8b8ba11efff24) on each column (key = element absolute value, value = element index), then populating the top k original elements by value (element index) back into a column of zeroes. I suggest you ask it as a new SO question for better ideas. – Robert Crovella Jan 14 '14 at 03:43
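
A minimal sketch of that per-column sort-by-key idea (keepTopK is a hypothetical helper, not from the thread; in practice you would call it once per column, or batch the columns with a segmented sort):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <cmath>

// Absolute value functor used as the sort key.
struct AbsVal {
  __host__ __device__ float operator()(float x) const { return fabsf(x); }
};

// Keep only the k largest-magnitude entries of one column, zeroing the rest.
void keepTopK(thrust::device_vector<float> &col, int k) {
  const int n = static_cast<int>(col.size());
  thrust::device_vector<float> keys(n);  // |value| sort keys
  thrust::device_vector<int>   idx(n);   // original positions
  thrust::transform(col.begin(), col.end(), keys.begin(), AbsVal());
  thrust::sequence(idx.begin(), idx.end());
  // Descending sort by |value|, carrying the original positions along.
  thrust::sort_by_key(keys.begin(), keys.end(), idx.begin(),
                      thrust::greater<float>());
  // Save the k winners, zero the column, then scatter the winners back.
  thrust::device_vector<float> topVals(k);
  thrust::gather(idx.begin(), idx.begin() + k, col.begin(), topVals.begin());
  thrust::fill(col.begin(), col.end(), 0.0f);
  thrust::scatter(topVals.begin(), topVals.end(), idx.begin(), col.begin());
}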

As already underlined by Robert Crovella, the conversion from dense to sparse can be performed effectively with cuSPARSE using the cusparse<t>nnz() and cusparse<t>dense2csr() routines; the reverse conversion can be done with the cusparse<t>csr2dense() routine. Below is a fully worked example showing how to pass from dense to sparse and vice versa with cuSPARSE in CSR format.

cuSparseUtilities.cuh

#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH

#include "cusparse_v2.h"

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
    const cusparseHandle_t handle, const int Nrows, const int Ncols);

#endif

cuSparseUtilities.cu

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}

/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
                   int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
                   const cusparseHandle_t handle, const int Nrows, const int Ncols) {

    const int lda = Nrows;                      // --- Leading dimension of dense matrix

    gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));

    // --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));

    // --- Device side sparse matrix
    gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));

}
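
The listings above and kernel.cu below rely on Utilities.cuh for gpuErrchk and cusparseSafeCall, which the answer does not reproduce. A minimal sketch of the two wrappers, assuming the usual check-and-abort behavior:

#ifndef UTILITIES_CUH
#define UTILITIES_CUH

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cusparse_v2.h>

// Abort with a readable message if a CUDA runtime call failed.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Abort with the numeric status if a cuSPARSE call failed.
#define cusparseSafeCall(err) { cusparseAssert((err), __FILE__, __LINE__); }
inline void cusparseAssert(cusparseStatus_t status, const char *file, int line) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "cuSPARSE error %d at %s:%d\n", int(status), file, line);
        exit(1);
    }
}

#endif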

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <cusparse_v2.h>

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/********/
/* MAIN */
/********/
int main() {

    cusparseHandle_t    handle;

    // --- Initialize cuSPARSE
    cusparseSafeCall(cusparseCreate(&handle));

    cusparseMatDescr_t  descrA = 0;

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int Nrows = 5;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));

    // --- Column-major storage
    h_A_dense[ 0] = 0.4612;  h_A_dense[ 5] = -0.0006;   h_A_dense[10] = 1.3;     h_A_dense[15] = 0.0;
    h_A_dense[ 1] = 0.0;     h_A_dense[ 6] = 1.443;     h_A_dense[11] = 0.0;     h_A_dense[16] = 0.0;
    h_A_dense[ 2] = -0.0006; h_A_dense[ 7] = 0.4640;    h_A_dense[12] = 0.0723;  h_A_dense[17] = 0.0;
    h_A_dense[ 3] = 0.3566;  h_A_dense[ 8] = 0.0723;    h_A_dense[13] = 0.7543;  h_A_dense[18] = 0.0;
    h_A_dense[ 4] = 0.0;     h_A_dense[ 9] = 0.0;       h_A_dense[14] = 0.0;     h_A_dense[19] = 0.1;

    // --- Create device array and copy host array to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    /*******************************/
    /* FROM DENSE TO SPARSE MATRIX */
    /*******************************/
    // --- Descriptor for sparse matrix A
    setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    int *d_nnzPerVector;                        // --- Device side number of nonzero elements per row

    double *d_A;                                // --- Sparse matrix values - array of size nnz
    int *d_A_RowIndices;                        // --- CSR row pointers - array of size Nrows + 1
    int *d_A_ColIndices;                        // --- CSR column indices - array of size nnz

    dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);

    /*******************************************************/
    /* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
    /*******************************************************/
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Host side sparse matrix
    double *h_A = (double *)malloc(nnz * sizeof(double));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix in CSR format\n\n");
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    /*******************************/
    /* FROM SPARSE TO DENSE MATRIX */
    /*******************************/
    double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
                                        d_A_denseReconstructed, Nrows));

    /*******************************************************/
    /* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
    /*******************************************************/
    double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
    gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));

    printf("\nReconstructed dense matrix \n");
    for (int m = 0; m < Nrows; m++) {
        for (int n = 0; n < Ncols; n++) 
            printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
        printf("\n");
    }

    return 0;
}
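
Assuming the three files sit in one directory (with the Utilities.cuh sketch above), a build line along these lines should work:

nvcc -o dense2sparse kernel.cu cuSparseUtilities.cu -lcusparse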
Vitality