0

I'm trying to invert a matrix composed of complex numbers, where I'm using matrix inversion code for real numbers posted in the following link by 'user' cuda matrix inverse gaussian jordan

code compiles, no bugs, but problem is output is wrong! I don't know where I went wrong. Can anyone, please, help. Thank you in advance!

here is the complete code:

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){ 
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P; 
            if(y>=i){ 
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;  
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
            if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
            }
        }
    __syncthreads();

}

int main()
{
    int const n = 3;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f  ", cuCimag(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *d_L, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(16,16);     //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation    
    cudaMalloc( (void**)  &d_A, ddsize);   
    cudaMalloc( (void**)   &dI, ddsize); 

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); 
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice); 
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n); 

    cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); 
    cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f  ", cuCimag(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

    system("Pause");
 return 0;
}

Thank you @RobertCrovella for ur fast and very insightful suggestion! Regarding your answer to my question: I changed my threadsPerBlock(4,4) and numBlocks(1,1) so I'll be using 1 block with 16 threads for my 4x4 matrix. My input matrix is the following

1  0  0  0
0  2  0  0 
0  0  3  0
0  0  0  4

all numbers are real in here, then expected inverted matrix should look like

1   0    0   0
0   1/2  0   0 
0   0    1/3 0
0   0    0   1/4

and i'm not getting this at all. I inputted cuda memcheck tool to see if my kernel is not lunching but it didn't show any error massages. I started learning CUDA very recently and don't have much experience. Can anyone give more detailed response? Thank You!

here is my modified code.

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}




 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){ 
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P; 
            if(y>=i){ 
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;  
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if( cuCimag(d_A[x*h+x]) != 0 ){
            if( cuCreal(d_A[x*h+x]) != 0 ){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
            }
        }
    __syncthreads();
}

int main()
{
    int const n= 4;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f ", cuCreal(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(1,1);       //!!!!!!!!!!!!!!!!!!

// memory allocation    
    cudaMalloc( (void**)  &d_A, ddsize);   
    cudaMalloc( (void**)   &dI, ddsize); 

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); 
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice); 
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
        gpuErrchk( cudaPeekAtLastError() );
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n); 

    gpuErrchk( cudaPeekAtLastError() );

    gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost )); 
    gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost )); 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f ", cuCreal(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

    system("Pause");
 return 0;
}
Community
  • 1
  • 1
  • note, gauss jordan is a lot slower than simpler gaussian elimination – Steve Cox Aug 26 '14 at 20:07
  • 1
    Your `threadsPerBlock` and `numBlocks` calculations and usage are messed up in at least 2 ways. If you do [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) you would discover that your kernels are not launching. – Robert Crovella Aug 26 '14 at 20:08
  • There are several approaches for Gaussian elimination, which was suggested by @SteveCox. You may wish to take a look at [Gaussian elimination with CUDA](http://www.orangeowlsolutions.com/archives/721). – Vitality Aug 26 '14 at 21:54
  • Thank you very much @RobertCrovella for super fast response! – user3753370 Aug 27 '14 at 20:47
  • Thank you @RobertCrovella for ur fast and very insightful suggestion! Regarding your answer to my question: I changed my threadsPerBlock(4,4) and numBlocks(1,1) so I'll be using 1 block with 16 threads for my 4x4 matrix. Can you please take a look below? – user3753370 Aug 28 '14 at 01:05

1 Answers1

1

DISCLAIMER: I am not an expert on matrix inversion. I have not worked through the details of the differences between real matrix inversion and complex matrix inversion (there shouldn't be many differences, I don't think). As suggested already, there are probably better/faster ways to invert matrices.

The immediate problem seems to be in your dev kernel, particularly here:

    if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
        if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){

This is requiring that both the real and imaginary parts of the d_A matrix element in question be non-zero in order for the dev kernel to do any work. However I don't think this condition should be necessary. For division, we probably only require that either the real or the imaginary part be non-zero. I think in the complex domain we are actually dividing by zero only if both real and imaginary parts are zero. If you inspect the cuCdiv function provided in cuComplex.h, you can ascertain for yourself under what conditions it will "blow up" and therefore what conditions need to be tested for and avoided. I'm confident your test is not correct.

The following modified code works correctly for me, for your simple test case:

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <math.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}




 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
            if(y>=i){
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if(( cuCimag(d_A[x*h+x]) != 0 ) || ( cuCreal(d_A[x*h+x]) != 0 )){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];

        }
    __syncthreads();
}

int main()
{
    int const n= 4;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f ", cuCreal(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(1,1);       //!!!!!!!!!!!!!!!!!!

// memory allocation
    cudaMalloc( (void**)  &d_A, ddsize);
    cudaMalloc( (void**)   &dI, ddsize);

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice);
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);
//timer start
    cudaEventRecord( start, 0);
// L^(-1)
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
        gpuErrchk( cudaPeekAtLastError() );
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n);

    gpuErrchk( cudaPeekAtLastError() );

    gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
    gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f ", cuCreal(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

 return 0;
}

FINAL DISCLAIMER: I'm not saying this is a fully-validated approach to inversion of matrices of arbitrary dimensions. I'm simply pointing out a critical bug that seems to make it fail for your simple test case. I also expressed some reservations in the previous question you linked.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257