1

I didn't find any similar question to mine. I'm trying to write the gaussian-jordan inverse matrix algorithm. The idea of the algorithm is simple:)

I want to inverse only a lower triangular matrix. I got almost correct answer. Where did I do sth wrong? Does anyone can help me? I will appreciate it.

  • d_ A lower triangular matrix (nxn)
  • dI identity matrix (nxn)
  • n size of a matrix in one direction (n%16=0)

  • dim3 threadsPerBlock(n/16,n/16);

  • dim3 numBlocks(16,16);

I know it is a simple implementation but at first I need it to work correctly :) Does anyone can help me or give me any hint? I will appreciate it. Thanks a lot!

there is the whole cpu code:

  #include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

using namespace std;

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i)
            if(y>=i){
                P=A[x*n+i]/A[i*n+i];
                I[x*n+y]-= I[i*n+y]*P;
                A[x*n+y]-= A[i*n+y]*P;
            }
            __syncthreads();
 }


 __global__ void dev(float *d_A,  float *dI, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x<h && y<h)
        if(d_A[x*h+x]!=0){
            dI[x*h+y]  /= d_A[x*h+x];
            d_A[x*h+y] /= d_A[x*h+x];
        }
    __syncthreads();

}

void savetofile(float *A, string s, int n, int h)
{
    std::ofstream plik;
    plik.open(s);

    for(int j=0;j<h;j++){
        for(int i=0;i<h;i++){
            plik<<A[j*n+i]<<"\t";}
            plik<<endl;}
    plik.close();
}

int main()
{
    int n = 16;
// creating input
    float *iL = new float [n*n];
    float *L = new float [n*n];
    for(int i=0;i<n;i++)
        for(int j=0;j<n;j++)
            if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
            else L[i*n+j] = 0;

    savetofile(L,"L.txt",n,n);

    cout<<"inv\n";
    float *d_A, *d_L,*I, *dI;
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(float);

    dim3 threadsPerBlock(n/16,n/16);
    dim3 numBlocks(16,16);
// memory allocation    
    err= cudaMalloc( (void**)  &d_A, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err= cudaMalloc( (void**)   &dI, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    I = new float[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=1.0;
                else I[i*n+j]=0.0;}}
 //copy data from GPU to CPU
    err =cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err =cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);  if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,    threadsPerBlock>>>(d_A, dI, n); 

    err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 
    err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
    savetofile(iL,"inv.txt",n,n);
    savetofile(L,"I.txt",n,n);
    cudaFree(d_A);
    cudaFree(dI);
    delete []I;
    delete []L;
    delete []iL;
    system("Pause");
 return 0;
}

there is my output:

60.6061 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
-34.1233    -2.13675    -0  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-48.5115    1.91603 -0.0799201  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.4891    1.8697  0.0748167   -0.0196634  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.8332    1.84732 0.0725876   0.018747    -0.00767828 -0  -0  -0  0   0   0   0   0   0   0   0   
-50.0073    1.83403 0.071321    0.0182352   0.00739595  -0.00376795 -0  -0  0   0   0   0   0   0   0   0   
-50.112 1.82521 0.0705011   0.0179073   0.0072164   0.00365346  -0.00212282 -0  0   0   0   0   0   0   0   0   
-50.1818    1.81893 0.0699261   0.0176789   0.00709196  0.00357445  0.00206784  -0.00131234 0   0   0   0   0   0   0   0   
-50.2316    1.81423 0.0695003   0.0175105   0.00700059  0.00351662  0.0020277   0.00128271  -0.00086736 -0  -0  -0  -0  -0  -0  -0  
-50.2689    1.81057 0.0691722   0.0173813   0.00693062  0.00347244  0.00199711  0.00126017  0.000850006 -0.000602925    -0  -0  -0  -0  -0  -0  
-50.2979    1.80765 0.0689115   0.0172789   0.0068753   0.00343758  0.00197301  0.00124245  0.000836382 0.000592093 -0.000435975    -0  -0  -0  -0  -0  
-50.321 1.80527 0.0686993   0.0171957   0.00683047  0.00340937  0.00195354  0.00122815  0.000825401 0.000583374 0.000428868 -0.00032541 -0  -0  -0  -0  
-50.34  1.80328 0.0685233   0.0171269   0.0067934   0.00338607  0.00193748  0.00121637  0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293    -0  -0  -0  
-50.3557    1.80159 0.0683749   0.0170689   0.00676223  0.0033665   0.001924    0.00120649  0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186    -0  -0  
-50.369 1.80015 0.0682481   0.0170195   0.00673566  0.00334983  0.00191253  0.00119809  0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673    -0  
-50.3805    1.7989  0.0681385   0.0169768   0.00671274  0.00333547  0.00190265  0.00119086  0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382  -0.000126146

and it should be:

60,6060606060606    4,44089209850063e-16    4,85722573273506e-17    -3,12250225675825e-17   0   1,73472347597681e-18    -1,08420217248550e-18   -7,58941520739853e-19   4,33680868994202e-19    -5,42101086242752e-19   0   -6,93889390390723e-18   0   -1,38777878078145e-17   0   1,18720137887163e-17
-34,1232841232841   -2,13675213675214   0   8,67361737988404e-18    3,03576608295941e-18    8,67361737988404e-19    -1,73472347597681e-18   1,35525271560688e-18    -8,67361737988404e-19   1,00288700954909e-18    0   0   6,93889390390723e-18    6,93889390390723e-18    -1,38777878078145e-17   3,02221355580334e-18
-17,9130271437964   1,91603268526345    -0,0799200799200800 1,30104260698261e-18    1,95156391047391e-18    -9,75781955236954e-19   1,95156391047391e-18    2,16840434497101e-19    -3,52365706057789e-19   -1,62630325872826e-19   1,38777878078145e-17    -3,46944695195361e-18   0   0   0   -2,72405795836983e-18
-2,86140643299924   0,0760191125748172  0,0748166415934231  -0,0196633632216454 -2,41234983378025e-18   7,99599102208060e-19    3,25260651745651e-19    -4,74338450462408e-19   2,67662411332359e-19    2,91379333855479e-19    -2,16840434497101e-18   -4,33680868994202e-19   1,30104260698261e-18    0   0   6,86096687275983e-20
-1,33482739506506   0,0346053236774996  0,00125734163772674 0,0187469132242915  -0,00767825058738617    5,35324822664718e-19    -2,23616698075135e-19   5,08219768352580e-20    5,92923063078010e-20    1,74488787134386e-19    -4,33680868994202e-19   4,33680868994202e-19    -2,16840434497101e-19   2,16840434497101e-19    0   -1,19008129089229e-19
-0,793561224702690  0,0203250367373064  0,000727127971238783    0,000177630032830862    0,00739591005669882 -0,00376795430225022    4,98055372985529e-19    -3,84552958053452e-19   3,20178454062126e-19    -1,35525271560688e-19   6,50521303491303e-19    -1,08420217248550e-19   1,08420217248550e-19    -2,16840434497101e-19   0   -7,15742840429884e-20
-0,532255026297144  0,0135340901236068  0,000479383336751935    0,000115847127348313    4,51920594555328e-05    0,00365346070706817 -0,00212282675610843    1,37219337455197e-19    -5,14996031930615e-19   3,30342849429177e-19    0   -2,71050543121376e-19   1,08420217248550e-19    0   0   5,08219768352580e-20
-0,384130052448431  0,00972113086608457 0,000342250536212794    8,21235560483452e-05    3,18129608485860e-05    1,56232096436654e-05    0,00206784220009096 -0,00131233595800525    6,39509875176997e-20    -3,37542629480839e-19   -1,08420217248550e-19   2,16840434497101e-19    0   0   0   -8,47032947254300e-22
-0,291692030052418  0,00735419164507677 0,000257375648850429    6,15185225200113e-05    2,37495210052671e-05    1,16038017329438e-05    6,53368676878396e-06    0,00128271813402154 -0,000867362869930264   1,77876918923403e-19    1,62630325872826e-19    -1,89735380184963e-19   1,62630325872826e-19    0   0   -9,07384044746169e-20
-0,229596895430646  0,00578230937666655 0,000201707743336976    4,79768824589291e-05    1,84020572663637e-05    8,96002707181433e-06    5,05525466995835e-06    3,12009781742606e-06    0,000850011219708818    -0,000602925394011745   0   2,71050543121376e-20    -8,13151629364128e-20   5,42101086242752e-20    -5,42101086242752e-20   7,73976355553617e-20
-0,185720949479909  0,00466765632076680 0,000162419592307734    3,85318721641536e-05    1,47407053519860e-05    7,17308297585328e-06    4,02178178072207e-06    2,48428717850195e-06    1,64547815065802e-06    0,000592092919336558    -0,000435974905284452   0   0   8,13151629364128e-20    -1,08420217248550e-19   2,64697796016969e-20
-0,153867987373140  0,00385473267086607 0,000133863548213241    3,17506489004575e-05    1,20962229586152e-05    5,86799087221288e-06    3,28276799988068e-06    2,02338706451671e-06    1,33735029942045e-06    9,34275734555363e-07    0,000428867197061432    -0,000325409609345764   0   2,71050543121376e-20    0   -1,09055491958991e-20
-0,129703518509601  0,00324211947468978 0,000112403568308126    2,65969300905272e-05    1,01402805713936e-05    4,89779294849866e-06    2,73496124917826e-06    1,68586638861081e-06    1,11012300345236e-06    7,73556738632873e-07    5,60933254708493e-07    0,000320553621268105    -0,000249293253625970   5,42101086242752e-20    0   -1,01114558078482e-20
-0,110691345431593  0,00276839969825208 9,59884298624889e-05    2,25961759289096e-05    8,63052307521336e-06    4,15554692230644e-06    2,31688356971108e-06    1,42511604039733e-06    9,39229137057347e-07    6,51934526276135e-07    4,72019315851685e-07    3,53897320062806e-07    0,000245863313382516    -0,000195185934120844   0   -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05    1,95227026042985e-05    7,41637441475814e-06    3,57424367962823e-06    1,99334817579930e-06    1,21993241781196e-06    8,05577604288488e-07    5,57554928001086e-07    4,03155267486669e-07    3,01723475812485e-07    2,31838854154289e-07    0,000192695260333710    -0,000155673036807333   -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05    1,70229067498473e-05    6,46008752692950e-06    3,11455737751181e-06    1,73159030599080e-06    1,06073213436631e-06    6,96842172109705e-07    4,82764206408816e-07    3,49217230232344e-07    2,60145440758586e-07    2,00286821017368e-07    1,56906945950947e-07    0,000153820426928509    -0,000126146355001072
user
  • 95
  • 1
  • 9
  • What does "almost correct answer" mean? what is the problem *exactly*? You haven't provided runnable code, so how are we suppose to know what is wrong? – talonmies Feb 26 '14 at 12:39
  • the range of values in the output matrix is let's say (0.02,0.14) and my result is (0.01,0.14). below for loop there is just allocations and transferring input data and after it is transferring result back – user Feb 26 '14 at 13:37
  • 1
    suggestion: provide a complete code, one that can be copied, pasted, compiled, without adding anything or changing anything, and include a simple test case like a 3x3 matrix, and show the actual output from your code as well as the desired output. – Robert Crovella Feb 26 '14 at 14:35
  • It seems to me that the reference example was compiled and ran on a CPU with double precision, that will interfere in your results. – prmottajr Feb 26 '14 at 15:40
  • so results from gpu and cpu always be so different? I can't agree with @prmottajr. There can be differences but not so big. other wise the gpu result changes the whole solution if it will be multiply in the next step. – user Feb 26 '14 at 15:51
  • @user my point is that even in the CPU if you use float against double you will have differences on the results. I am not stating that this will is "the" problem, just a small part of it. – prmottajr Feb 26 '14 at 15:57
  • ok, I should have misunderstand what you had on your mind. of course float and double can a little issue but I don't believe it can be so significant ///////*(to my previous post) if it is a multiply in the next step – user Feb 26 '14 at 16:01
  • 1
    Is it your intent to launch one thread per block and a grid of 16x16 blocks? Because that is the way your code is written. the `__syncthreads()` in your kernels aren't serving any purpose. Remember, threadblocks execute in some undefined order. – Robert Crovella Feb 26 '14 at 16:54
  • 16x16 blocks idea - yes, it is. I know, but __syncthreads() is not connecting with outputs in this case, is it? general, matrix is more then 64x64, but just for testing purpose – user Feb 26 '14 at 22:01
  • Frankly, my matrix is more then 256x256, but just for the testing purpose I decrease size to 16x16 :) – user Feb 26 '14 at 22:07
  • where is the code that generates the CPU results to compare to? – Robert Crovella Feb 28 '14 at 21:58
  • I used matlab for that – user Mar 02 '14 at 21:30

1 Answers1

2

It seems the problem was in your gaussjordan kernel.

When you are doing gauss-jordan elimination on the original (L) matrix, it is acceptable to work only on the row elements to the right of the pivot point.

But when you are applying the same row operations to the identity matrix to create the inverse (I), it's necessary to apply the equivalent row operations to every member of the row, not just those to the right of the pivot point.

So if you modify your gaussjordan kernel like this:

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i){ // this limits operation to rows below the pivot point
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] -= I[i*n+y]*P;  // apply for every row member
            if(y>=i){ //limits  to row members to the right of the pivot
                A[x*n+y] -= A[i*n+y]*P;  // apply only to members right of pivot
            }
        }
 }

I believe you'll have better results. With the above changes, I was able to duplicate your expected results within the accuracy of float vs. double, I believe.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thank you very much! what a simple mistake I made. I know about float and double but cuda gpu doesn't like double (cuda capability 1.0). – user Mar 05 '14 at 09:31
  • 1
    I believe the `gaussjordan` kernel is safe from race conditions, but I'm not sure the `dev` kernel is safe from race conditions. – Robert Crovella Mar 05 '14 at 21:19
  • right, it may be a race conditions in general. but in that case, where all elements of array are under the main diagonal so even if value of d_P[x*h+x] is changed it doesn't influence on the result but: if I do P=d_P[x*h+x] and then I will use P instead of d_P[x*h+x] it gives me a race condition free code. am I right? – user Mar 17 '14 at 12:19
  • @RobertCrovella How is the OP compiling the said code. I modified the the function `savetofile()` by using conventional `FILE *fp` and `fprintf(fp,"");` . Is there anything else we have to do? What is the use of user defined header `device_launch_parameters.h` in the OP code? – pradyot Jun 10 '16 at 06:33