0

I need to find eigenvalue of a matrices 1000 on 1000 and bigger with CUDA in parallel as fast as possible. I found cusolver library and run the code from documentation:

#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>


void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
    for(int row = 0 ; row < m ; row++){
        for(int col = 0 ; col < n ; col++){
            double Areg = A[row + col*lda];
            printf("%s(%d,%d) = %f\n", name, row+1, col+1, Areg);
        }
    }
}

int main(int argc, char*argv[])
{
    cusolverDnHandle_t cusolverH = NULL;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    const int m = 25;
    const int lda = m;
/*       | 3.5 0.5 0 |
 *   A = | 0.5 3.5 0 |
 *       | 0   0   2 |
 *
 */
    double A[lda*m];
    double lambda[m] = { 2.0, 3.0, 4.0};
    for(int i = 0; i < m*m; i++){
        A[i] = 0;
    }

    double V[lda*m]; // eigenvectors
    double W[m]; // eigenvalues

    double *d_A = NULL;
    double *d_W = NULL;
    int *devInfo = NULL;
    double *d_work = NULL;
    int  lwork = 0;

    int info_gpu = 0;

    printf("A = (matlab base-1)\n");
    printMatrix(m, m, A, lda, "A");
    printf("=====\n");

    // step 1: create cusolver/cublas handle
        cusolver_status = cusolverDnCreate(&cusolverH);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // step 2: copy A and B to device
        cudaStat1 = cudaMalloc ((void**)&d_A, sizeof(double) * lda * m);
        cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * m);
        cudaStat3 = cudaMalloc ((void**)&devInfo, sizeof(int));
        assert(cudaSuccess == cudaStat1);
        assert(cudaSuccess == cudaStat2);
        assert(cudaSuccess == cudaStat3);

        cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
        assert(cudaSuccess == cudaStat1);

    // step 3: query working space of syevd
        cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
        cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
        float time;
        cudaEvent_t start, stop;

        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        cusolver_status = cusolverDnDsyevd_bufferSize(
            cusolverH,
            jobz,
            uplo,
            m,
            d_A,
            lda,
            d_W,
            &lwork);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);

        printf("Time to find eigenvector:  %3.1f ms \n", time);
        assert (cusolver_status == CUSOLVER_STATUS_SUCCESS);

        cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
        assert(cudaSuccess == cudaStat1);

    // step 4: compute spectrum
        cudaEventCreate (&start);
        cudaEventCreate (&stop);
        cusolver_status = cusolverDnDsyevd(
            cusolverH,
            jobz,
            uplo,
            m,
            d_A,
            lda,
            d_W,
            d_work,
            lwork,
            devInfo);
        cudaStat1 = cudaDeviceSynchronize();
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
        assert(cudaSuccess == cudaStat1);

        cudaStat1 = cudaMemcpy(W, d_W, sizeof(double)*m, cudaMemcpyDeviceToHost);
        cudaStat2 = cudaMemcpy(V, d_A, sizeof(double)*lda*m, cudaMemcpyDeviceToHost);
        cudaStat3 = cudaMemcpy(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
        assert(cudaSuccess == cudaStat1);
        assert(cudaSuccess == cudaStat2);
        assert(cudaSuccess == cudaStat3);
        printf("after syevd: info_gpu = %d\n", info_gpu);
        assert(0 == info_gpu);

        printf("eigenvalue = (matlab base-1), ascending order\n");
        for(int i = 0 ; i < m ; i++){
            printf("W[%d] = %E\n", i+1, W[i]);
        }

        printf("V = (matlab base-1)\n");
        printMatrix(m, m, V, lda, "V");
        printf("=====\n");

    // step 4: check eigenvalues
        double lambda_sup = 0;
        for(int i = 0 ; i < m ; i++){
            double error = fabs( lambda[i] - W[i]);
            lambda_sup = (lambda_sup > error)? lambda_sup : error;
        }
        printf("|lambda - W| = %E\n", lambda_sup);

    // free resources
        if (d_A    ) cudaFree(d_A);
        if (d_W    ) cudaFree(d_W);
        if (devInfo) cudaFree(devInfo);
        if (d_work ) cudaFree(d_work);

        if (cusolverH) cusolverDnDestroy(cusolverH);

        cudaDeviceReset();

        return 0;
    }

It worked pretty fast, but the problem is that I can't calculate eigenvalue of matrix bigger than 25*25(variable m is responsible for size of the matrix0 when I try to set value of m greater than 25 I get: Segmentation fault (core dumped). Shall I change the library or use cusolver?

talonmies
  • 70,661
  • 34
  • 192
  • 269
  • A segfault certainly means that there is a bug somewhere, either in the library or in this code. – Hulk Jul 15 '20 at 15:07
  • 1
    Note that this code uses Variable Length Arrays (VLA), if they get large you might hit the limitiations of the size of stack frames on your platform. For a real application, it might be better to either use dynamic allocation (`malloc`), or constant sizes (i.e. defines). – Hulk Jul 15 '20 at 15:15
  • Don't quite get what you mean... as far as i understood you suggest to `# define N 1000`, but than, where the error is generated from your point of view? – anonymus321 Jul 15 '20 at 15:56
  • These are not formally VLA because the size arguments are compile time constants denoted with `const`, but nevertheless the point is valid. Wherever you see something like this `double A[lda*m];` that is a stack-based array, and if you make it very large (e.g. by increasing `m`) you will overrun the limits of the stack. A simple alternative is to do something like `double *A = new double[lda*m];` and that will not be a stack based array. If you do that kind of modification for each stack based array in the code (for `A`, `lambda`, `V`, `W`), there is a good chance the segfaults will disappear – Robert Crovella Jul 15 '20 at 16:02
  • @RobertCrovella I think you refer to C++, but this question is about C - no `new` here. – Hulk Jul 15 '20 at 16:19
  • 1
    Fair enough. Use `malloc()` instead. `double *A = malloc(lda*m*sizeof(double));` (I actually suspect the question is mistagged, the bulk of new users to CUDA are using C++ not C, however I don't wish to argue it. Since C purists will yell at me if I cast the `malloc` return value, I've left that off, but if the language here is actually C++, the cast will be necessary. I suspect the OP is at a knowledge level where this is all just going to be horribly distracting, so I'd suggest starting with the `new` suggestion and see if that works. YMMV.) – Robert Crovella Jul 15 '20 at 16:26
  • @RobertCrovella , thank you, it made a trick! – anonymus321 Jul 15 '20 at 16:58
  • 2
    Your example here (when you make `m` large) falls into the "Stack Overflow" reason in the linked duplicate. – Robert Crovella Jul 15 '20 at 18:29

0 Answers0