function cusparseScsr2csc in cuSPARSE library return strange result

Question

I want to test the cusparseScsr2csc which is a function used to convert a csr format matrix to a csc format matrix (or just say transpose a csr format matrix), so I write the code below to test it.

the wrapper:

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    
    // malloc space on video card and copy data
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    
    // use the API from cuSPARSE
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    
    // copy the data from device (video card) to host (CPU)
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    
    // return the answer
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

this is the CSR class:

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

and this is the code in the main:

int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();

the CSR format in the main is derived from the matrix below which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds):

the result I got (definitely wrong):

My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2

Full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
#include <vector>
#include <complex>

using namespace std;

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

int main()
{
    int m = 4, n = 6, nnz = 8;
    float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
    int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
    int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

    cusparseStatus_t st;
    auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);

    if (st == CUSPARSE_STATUS_SUCCESS) {
        cout << "success" << endl;
        res.out();
    }
    return 0;
}

this is the documentation page, the function cusparseScsr2csc is located in ch. 9. And I find the text below, it says the function executes asynchronously, maybe this is the problem, but I still don't know how to deal with it.

Edit:

I tried the solution mentioned by paleonix (add cudaDeviceSynchronize() right after the cusparseScsr2csc(...)), but still got the exact same wrong answer.

`cudaDeviceSynchronize()` is the function you need to use to get the results of any asynchronous operation in CUDA. — paleonix, Jul 18 '22 at 08:39
@paleonix Thanks for your reply. I have added `cudaDeviceSynchronize()` function right after the `cusparseScsr2csc(...)`, but still got the same output... — yys_c, Jul 18 '22 at 08:45
Maybe you will find the problem using proper [CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api). See also the [Conjugate Gradient](https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp) CUDA sample and [this header](https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h) which is used for error checking in the samples. — paleonix, Jul 18 '22 at 09:50
It might not make a difference for `cusparseScsr2csc`, but there are newer versions of the documentation you linked. For CUDA 9.2, see [here](https://docs.nvidia.com/cuda/archive/9.2/pdf/CUSPARSE_Library.pdf). — paleonix, Jul 18 '22 at 13:09

score 2 · Accepted Answer · answered Jul 18 '22 at 14:56

The main problem is here:

cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

That should be:

cudaMemcpy(csr_col_inds, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

A few other notes:

The function cusparseScsr2csc is no longer available in recent versions of CUDA (it was evidently deprecated and later removed). I assume this might be one reason you are using CUDA 9.2. One possible replacement would be cusparseCsr2cscEx2()
There is no particular need for an additional cudaDeviceSynchronize() here. The cudaMemcpy functions after the cusparse function call serve the same purpose.

function cusparseScsr2csc in cuSPARSE library return strange result

Edit:

1 Answers1