I'm working on a 3D Laplacian. My code is successful with the size N=32 but with N=64 or N=128 I've some incorrect results:
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Let's start the main program.
int main(void) {
// Choice of N.
int N;
cout<<"Choose matrix dimension (32, 64 or 128)"<<endl;
cin>>N;
int size=(N+2)*(N+2)*(N+2)*sizeof(float);
// Variable statement.
struct timeval t1, t2;
float *x_d, *y_d;
float *x,*y;
float gflops;
float NumOps;
//Init x and y.
x = new float[size];
y = new float[size];
for (int i=1;i<N+1;i++)
for (int j=1;j<N+1;j++)
for (int k=1;k<N+1;k++) {
x[i*(N+2)*(N+2)+j*(N+2)+k]=1;
}
// Shadow cases.
for (int i=1;i<N+1;i++) {
for (int j=1;j<N+1;j++) {
x[i*(N+2)*(N+2)+j*(N+2)]=x[i*(N+2)*(N+2)+j*(N+2)+1];
x[i*(N+2)*(N+2)+j*(N+2)+N+1]=x[i*(N+2)*(N+2)+j*(N+2)+N];
}
for (int k=0;k<N+2;k++) {
x[i*(N+2)*(N+2)+k]=x[i*(N+2)*(N+2)+(N+2)+k];
x[i*(N+2)*(N+2)+(N+1)*(N+2)+k]=x[i*(N+2)*(N+2)+N*(N+2)+k];}
}
for (int j=0;j<N+2;j++)
for (int k=0;k<N+2;k++) {
x[(N+2)*j+k]=x[(N+2)*(N+2)+(N+2)*j+k];
x[(N+1)*(N+2)*(N+2)+(N+2)*j+k]=x[(N+2)*(N+2)*N+(N+2)*j+k];
}
// Display of initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
cout<<"Which initial matrix's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int j=0;j<N+2;j++) {
cout<<"| ";
for (int k=0;k<N+2;k++) {cout<<x[id_stage*(N+2)*(N+2)+j*(N+2)+k]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);
cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;
// Solver parameters.
dim3 dimGrid(N/32, N/8, N/8);
dim3 dimBlock(16, 8, 8);
// Solver loop.
gettimeofday(&t1, 0);
res3d<<<dimGrid, dimBlock>>>(x_d, y_d, N);
cudaDeviceSynchronize();
gettimeofday(&t2, 0);
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;
// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time));
// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);
// Display of final matrix.
id_stage=-2;
while (id_stage!=-1) {
cout<<"Which output's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int j=0;j<N+2;j++) {
cout<<"| ";
for (int k=0;k<N+2;k++) {cout<<y[id_stage*(N+2)*(N+2)+j*(N+2)+k]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
cout<<"Time : "<<time<<endl;
cout<<"Gflops/s : "<<gflops<<endl;
}
Where :
#ifndef RES2D_MAT_GPU_HPP
#define RES2D_GPU_HPP
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
__global__ void res3d(volatile float* x, float* y, int N)
{
// Variable statement.
__shared__ float sdata[18][10][10];
__shared__ float idata[18][10][10];
int tid = threadIdx.x+1;
int tjd = threadIdx.y+1;
int tkd = threadIdx.z+1;
int i = threadIdx.x + blockIdx.x*(blockDim.x)+1;
int j = threadIdx.y + blockIdx.y*(blockDim.y)+1;
int k = threadIdx.z + blockIdx.z*(blockDim.z)+1;
// Overloading of shared variable's outlines.
float data=0,data1=0;
if (threadIdx.x==0) {
data += x[(N+2)*(N+2)*(i-1)+(N+2)*j+k];
data1 += x[(N+2)*(N+2)*(i-1)+(N+2)*j+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.x==15) {
data += x[(N+2)*(N+2)*(i+1)+(N+2)*j+k];
data1 += x[(N+2)*(N+2)*(i+1)+(N+2)*j+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.y==0) {
data += x[(N+2)*(N+2)*i+(N+2)*(j-1)+k];
data1 += x[(N+2)*(N+2)*i+(N+2)*(j-1)+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.y==7) {
data += x[(N+2)*(N+2)*i+(N+2)*(j+1)+k];
data1 += x[(N+2)*(N+2)*i+(N+2)*(j+1)+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.z==0) {
data += x[(N+2)*(N+2)*i+(N+2)*j+k-1];
data1 += x[(N+2)*(N+2)*i+(N+2)*j+k-1+N*(N+2)*(N+2)/2];
}
if (threadIdx.z==7) {
data += x[(N+2)*(N+2)*i+(N+2)*j+k+1];
data1 += x[(N+2)*(N+2)*i+(N+2)*j+k+1+N*(N+2)*(N+2)/2];
}
// Init shared variable.
sdata[tid][tjd][tkd] = x[(N+2)*(N+2)*i+(N+2)*j+k];
idata[tid][tjd][tkd]=x[(N+2)*(N+2)*i+(N+2)*j+k+N*(N+2)*(N+2)/2];
__syncthreads();
// (small) tiling.
y[(N+2)*(N+2)*i+(N+2)*j+k] = sdata[tid][tjd+1][tkd]
+ sdata[tid][tjd-1][tkd]
+ sdata[tid][tjd][tkd+1]
+ sdata[tid][tjd][tkd-1]
+ sdata[tid+1][tjd][tkd]
+ sdata[tid-1][tjd][tkd]
- 6*sdata[tid][tjd][tkd]+data;
y[(N+2)*(N+2)*i+(N+2)*j+k+N*(N+2)*(N+2)/2] = idata[tid][tjd+1][tkd]
+ idata[tid][tjd-1][tkd]
+ idata[tid][tjd][tkd+1]
+ idata[tid][tjd][tkd-1]
+ idata[tid+1][tjd][tkd]
+ idata[tid-1][tjd][tkd]
- 6*idata[tid][tjd][tkd]+data1;
}
#endif
Questions :
Is my code erroneous? Or is it a problem from GPU's architecure if results are false with N=64 and N=128?
Does "if" is the good way to overloading shared variable's outlines?
Thanks in advance for your help.