cusolverSpDcsrlsvlu or QR method using CUDA

Question

I have searched the whole world but unable to solve this problem! "Unhandled exception at 0x00007FFF3AD3D430 (cusolver64_70.dll) in cusolver test.exe: 0xC0000005: Access violation reading location 0x0000000400960004." i want to solve Ax=B using least square solver or by Qr method .... my codes compile without error but later on i get this error! the code breaks at the last line of this code! my code is

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cublas.h>
#include <cusolver_common.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>

# include <memory.h>
# include <thrust/device_vector.h>
# include <thrust/host_vector.h>
# include <thrust/device_ptr.h>
# include <thrust/system/cuda/execution_policy.h>



double *dX, *X;
double *dY, *Y;

int NoOfBuses = 4;


void main(void)
{
cusparseStatus_t status;
int row;
double *matA, *d_matA;
size_t pitchd_matA;
int *dNnzPerRow;
double *dCsrValA, *H_CsrVal;
int *dCsrRowPtrA, *HCsrRowPtrA;
int *dCsrColIndA, *HCsrColIndA;
int totalNnz;

cusparseHandle_t handle = 0;
cusparseMatDescr_t descr = 0;


//---------------------------------------------------------------------------------------------------------------

matA = (double *)calloc(((NoOfBuses+1)*(NoOfBuses+1)), sizeof(double));
Y = (double *)calloc((NoOfBuses + 1), sizeof(double));
X = (double *)calloc((NoOfBuses + 1), sizeof(double));

//STTORING IN col MAJOR FORM
for (int Row = 1; Row <= NoOfBuses; Row++)
{
    double value = 1;

    for (int Col = 1; Col <= NoOfBuses; Col++)
    {
        matA[Row + Col*(NoOfBuses + 1)] = value;
        value++;
    }
}


double value = 1;

for (int index = 1; index <= NoOfBuses; index++)
{
    Y[index] = value;
    value++;
}

printf("\n");
printf("A matrix\n");
for (int Row = 0; Row <= NoOfBuses; Row++)
{
    for (int Col = 0; Col <= NoOfBuses; Col++)
    {
        printf("%f\t",matA[Col + Row*(NoOfBuses + 1)] );
    }
    printf("\n");
}

printf("Y matrix\n\n");
for (int index = 0; index <= NoOfBuses; index++)
{
    printf("%f\n",Y[index]);
}

//-------------------------------------------------------------------------------------------------------

 cusparseCreate(&handle);

 // Allocate device memory to store the sparse CSR representation of A
 cudaMalloc((void **)&dCsrValA, sizeof(double)* 16 );
 cudaMalloc((void **)&dCsrColIndA, sizeof(int)* 16);
 cudaMalloc((void **)&dCsrRowPtrA, sizeof(int)* (NoOfBuses + 2));


// Allocate device memory for vectors and the dense form of the matrix A 

//cudaMallocPitch((void **) &d_matA, &pitchd_matA, sizeof(double)*(NoOfBuses + 1), (NoOfBuses + 1));
cudaMalloc((void **)&d_matA, sizeof(double)* (NoOfBuses + 1)*(NoOfBuses + 1));
cudaMalloc((void **)&dNnzPerRow, sizeof(int)* NoOfBuses);


//transfer Matrix A,X & Y to the GPU
cudaMemcpy(dX, X, sizeof(double) * (NoOfBuses +1), cudaMemcpyHostToDevice);
cudaMemcpy(dY, Y, sizeof(double)* (NoOfBuses + 1), cudaMemcpyHostToDevice);
//cudaMemcpy2D(d_matA, pitchd_matA, matA, sizeof(double)*(NoOfBuses + 1), sizeof(double)*(NoOfBuses + 1), (NoOfBuses + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_matA,matA,sizeof(double) * (NoOfBuses + 1)*(NoOfBuses + 1), cudaMemcpyHostToDevice);


// Construct a descriptor of the matrix A
 cusparseCreateMatDescr(&descr);
 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
 //cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);

cusparseDnnz(handle, CUSPARSE_DIRECTION_COLUMN, (NoOfBuses+1), (NoOfBuses+1), descr, d_matA, (NoOfBuses+1), dNnzPerRow, &totalNnz);

printf("the total number of Non zero elements are = %d",totalNnz);




cusparseDdense2csr(handle,  (NoOfBuses+1), (NoOfBuses+1), descr, d_matA, (NoOfBuses+1), dNnzPerRow, dCsrValA, dCsrRowPtrA, dCsrColIndA);


cudaMalloc((void **)&dX, sizeof(double)* (NoOfBuses + 1));
cudaMalloc((void **)&dY, sizeof(double)* (NoOfBuses + 1));
H_CsrVal = (double *)calloc((totalNnz), sizeof(double));
HCsrRowPtrA = (int *)calloc((NoOfBuses + 2), sizeof(int));
HCsrColIndA = (int *)calloc((totalNnz), sizeof(int));


cudaMemcpy(H_CsrVal, dCsrValA, sizeof(double) * (totalNnz), cudaMemcpyDeviceToHost);
cudaMemcpy(HCsrRowPtrA, dCsrRowPtrA, sizeof(int)* (NoOfBuses + 2), cudaMemcpyDeviceToHost);
cudaMemcpy(HCsrColIndA, dCsrColIndA, sizeof(int)* (totalNnz), cudaMemcpyDeviceToHost);

printf("Values\n\n");
for (int index = 0; index <= (totalNnz-1); index++)
{
    printf("%f\n", H_CsrVal[index]);
}

printf("\ncol pointer matrix\n\n");
for (int index = 0; index <= (totalNnz - 1); index++)
{
    printf("%d\n", HCsrColIndA[index]);
}

printf("\nrow ofssett pointer matrix\n\n");
for (int index = 0; index <= (NoOfBuses +2 - 1); index++)
{
    printf("%d\n", HCsrRowPtrA[index]);
}


//------------------------------------------------------------------------------------

cusolverSpHandle_t handleSolver;
double tol = 0.0000001;
int reorder = 0;
int valuefor,*singularity = &valuefor;
*singularity = 0;
cudaStream_t streamId = NULL;
cusolverStatus_t Checker;

Checker=cusolverSpCreate(&handleSolver);
cusolverStatus_t cudasu = cusolverSpSetStream(handleSolver, streamId);
cusolverStatus_t pakao = cusolverSpDcsrlsvluHost(handleSolver,5, totalNnz, descr, dCsrValA, dCsrRowPtrA, dCsrColIndA, dY, tol, reorder, dX, singularity);


getchar();
}

you should use [error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api/14038590#14038590) to find out if your CUDA API calls are executed correctly. — m.s., May 05 '15 at 18:09

score 1 · Accepted Answer · answered May 05 '15 at 18:55

1

You are using the Host version of the API, but you are passing device variables to it:

cudaMalloc((void **)&dCsrValA, sizeof(double)* 16 );
...

cusolverStatus_t pakao = cusolverSpDcsrlsvluHost(handleSolver,5, totalNnz, descr, dCsrValA, dCsrRowPtrA, dCsrColIndA, dY, tol, reorder, dX, singularity);
                                            ^^^^                                  ^^

Referring to the cusolver documentation:

enter image description here

We see that for the host path, all variables must be on the host, not the device.

answered May 05 '15 at 18:55

Robert Crovella

143,785
11
213
257

1

U are awesome..... sir !!! but sir do u think that this has a negative aspect that we need to copy back the whole matrix from the device back!!! and then again back to device!!!! so it will reduce the speed – Taha Saeedkhan May 05 '15 at 19:37
I'm not really sure what your question is. If the matrix starts out on the host (as in the code you have shown here ie. `matA`) then there doesn't need to be any copying back and forth with the `Host` api. If you actually had the `A` matrix starting out on the device, then yes, you would need to copy it back to the host. Perhaps in the future this particular API will be extended to include a device version. – Robert Crovella May 05 '15 at 19:47
@TahaSaeedkhan You are also awesome sir, you should have started using this years back. – Faizan May 05 '15 at 20:15

cusolverSpDcsrlsvlu or QR method using CUDA

1 Answers1