0

I want to test the cusparseScsr2csc which is a function used to convert a csr format matrix to a csc format matrix (or just say transpose a csr format matrix), so I write the code below to test it.

the wrapper:

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    
    // malloc space on video card and copy data
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    
    // use the API from cuSPARSE
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    
    // copy the data from device (video card) to host (CPU)
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    
    // return the answer
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

this is the CSR class:

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

and this is the code in the main:

int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();

the CSR format in the main is derived from the matrix below which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds): the matrix

the result I got (definitely wrong): result

My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2

Full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
#include <vector>
#include <complex>

using namespace std;

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

int main()
{
    int m = 4, n = 6, nnz = 8;
    float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
    int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
    int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

    cusparseStatus_t st;
    auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);

    if (st == CUSPARSE_STATUS_SUCCESS) {
        cout << "success" << endl;
        res.out();
    }
    return 0;
}

this is the documentation page, the function cusparseScsr2csc is located in ch. 9. And I find the text below, it says the function executes asynchronously, maybe this is the problem, but I still don't know how to deal with it.

piece

Edit:

I tried the solution mentioned by paleonix (add cudaDeviceSynchronize() right after the cusparseScsr2csc(...)), but still got the exact same wrong answer.

yys_c
  • 15
  • 6
  • 1
    `cudaDeviceSynchronize()` is the function you need to use to get the results of any asynchronous operation in CUDA. – paleonix Jul 18 '22 at 08:39
  • @paleonix Thanks for your reply. I have added `cudaDeviceSynchronize()` function right after the `cusparseScsr2csc(...)`, but still got the same output... – yys_c Jul 18 '22 at 08:45
  • Maybe you will find the problem using proper [CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api). See also the [Conjugate Gradient](https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp) CUDA sample and [this header](https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h) which is used for error checking in the samples. – paleonix Jul 18 '22 at 09:50
  • It might not make a difference for `cusparseScsr2csc`, but there are newer versions of the documentation you linked. For CUDA 9.2, see [here](https://docs.nvidia.com/cuda/archive/9.2/pdf/CUSPARSE_Library.pdf). – paleonix Jul 18 '22 at 13:09

1 Answers1

2

The main problem is here:

cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

That should be:

cudaMemcpy(csr_col_inds, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

A few other notes:

  • The function cusparseScsr2csc is no longer available in recent versions of CUDA (it was evidently deprecated and later removed). I assume this might be one reason you are using CUDA 9.2. One possible replacement would be cusparseCsr2cscEx2()

  • There is no particular need for an additional cudaDeviceSynchronize() here. The cudaMemcpy functions after the cusparse function call serve the same purpose.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257