I want to test the cusparseScsr2csc
which is a function used to convert a csr format matrix to a csc format matrix (or just say transpose a csr format matrix), so I write the code below to test it.
the wrapper:
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
cusparseHandle_t handle;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseCreate(&handle);
// malloc space on video card and copy data
float *csr_values;
int *csr_row_ptrs;
int *csr_col_inds;
float *csc_values;
int *csc_col_ptrs;
int *csc_row_inds;
cudaMalloc(&csr_values, sizeof(float) * nnz);
cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
cudaMalloc(&csc_values, sizeof(float) * nnz);
cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
// use the API from cuSPARSE
st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
csr_col_inds, csc_values, csc_row_inds,
csc_col_ptrs, copyValues, idxBase);
// copy the data from device (video card) to host (CPU)
vector<float> res_values;
vector<int> res_row_ptrs, res_col_inds;
res_row_ptrs.resize(n + 1);
res_col_inds.resize(nnz);
res_values.resize(nnz);
cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
// return the answer
return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
this is the CSR class:
template<class T>
struct CSR {
vector<T> values;
vector<int> row_ptrs;
vector<int> col_inds;
CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
void out() {
cout << "valuse = ";
for (auto &t : values) cout << t << ' ';
cout << "\nrow_ptrs = ";
for (auto &t : row_ptrs) cout << t << ' ';
cout << "\ncol_inds = ";
for (auto &t : col_inds) cout << t << ' ';
cout << endl;
}
};
and this is the code in the main:
int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();
the CSR format in the main is derived from the matrix below which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds):
the result I got (definitely wrong):
My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2
Full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
#include <vector>
#include <complex>
using namespace std;
template<class T>
struct CSR {
vector<T> values;
vector<int> row_ptrs;
vector<int> col_inds;
CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
void out() {
cout << "valuse = ";
for (auto &t : values) cout << t << ' ';
cout << "\nrow_ptrs = ";
for (auto &t : row_ptrs) cout << t << ' ';
cout << "\ncol_inds = ";
for (auto &t : col_inds) cout << t << ' ';
cout << endl;
}
};
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
cusparseHandle_t handle;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseCreate(&handle);
float *csr_values;
int *csr_row_ptrs;
int *csr_col_inds;
float *csc_values;
int *csc_col_ptrs;
int *csc_row_inds;
cudaMalloc(&csr_values, sizeof(float) * nnz);
cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
cudaMalloc(&csc_values, sizeof(float) * nnz);
cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
csr_col_inds, csc_values, csc_row_inds,
csc_col_ptrs, copyValues, idxBase);
vector<float> res_values;
vector<int> res_row_ptrs, res_col_inds;
res_row_ptrs.resize(n + 1);
res_col_inds.resize(nnz);
res_values.resize(nnz);
cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
int main()
{
int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
if (st == CUSPARSE_STATUS_SUCCESS) {
cout << "success" << endl;
res.out();
}
return 0;
}
this is the documentation page, the function cusparseScsr2csc
is located in ch. 9.
And I find the text below, it says the function executes asynchronously, maybe this is the problem, but I still don't know how to deal with it.
Edit:
I tried the solution mentioned by paleonix (add cudaDeviceSynchronize()
right after the cusparseScsr2csc(...)
), but still got the exact same wrong answer.