I'm trying to invert a matrix composed of complex numbers, where I'm using matrix inversion code for real numbers posted in the following link by 'user' cuda matrix inverse gaussian jordan
code compiles, no bugs, but problem is output is wrong! I don't know where I went wrong. Can anyone, please, help. Thank you in advance!
here is the complete code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n = 3;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCimag(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *d_L, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
dim3 numBlocks(16,16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost );
cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCimag(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
Thank you @RobertCrovella for ur fast and very insightful suggestion! Regarding your answer to my question: I changed my threadsPerBlock(4,4) and numBlocks(1,1) so I'll be using 1 block with 16 threads for my 4x4 matrix. My input matrix is the following
1 0 0 0
0 2 0 0
0 0 3 0
0 0 0 4
all numbers are real in here, then expected inverted matrix should look like
1 0 0 0
0 1/2 0 0
0 0 1/3 0
0 0 0 1/4
and i'm not getting this at all. I inputted cuda memcheck tool to see if my kernel is not lunching but it didn't show any error massages. I started learning CUDA very recently and don't have much experience. Can anyone give more detailed response? Thank You!
here is my modified code.
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != 0 ){
if( cuCreal(d_A[x*h+x]) != 0 ){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}