I'm writing my first CUDA code for 2D/3D nonlinear diffusion. The 2D case is working fine, however I'm struggling with the 3D one. Basically I'm getting zeros on the stage of finite differences calculation and surprisingly the 'deltaN' (pls see code below) is giving the right answer but the other ones are not working (zeros in answer). I'm trying to process 256x256x256 volume. Any suggestions please? Thanks!
#define BLKXSIZE 8
#define BLKYSIZE 8
#define BLKZSIZE 8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
void AnisotropDiff_GPU(double* A, double* B, int N, int M, int Z, double sigma, int iter, double tau, int type)
{
// Nonlinear Diffusion in 3D
double *Ad;
dim3 dimBlock(BLKXSIZE, BLKYSIZE, BLKZSIZE);
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE), idivup(Z,BLKYSIZE));
cudaMalloc((void**)&Ad,N*M*Z*sizeof(double));
cudaMemcpy(Ad,A,N*M*Z*sizeof(double),cudaMemcpyHostToDevice);
int n = 1;
while (n <= iter) {
anis_diff3D<<<dimGrid,dimBlock>>>(Ad, N, M, Z, sigma, iter, tau, type);
n++;}
cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(Ad);
}
and here is a part for calculating finite differences
__global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{
int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
{
int index_out = xIndex + M*yIndex + N*M*zIndex;
double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
double cN, cS, cW, cE, cU, cD;
int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
int indexD = xIndex + M*yIndex + N*M*(zIndex+1);
if (xIndex>1)
deltaN = A[indexN]-A[index_out];
if (xIndex<N)
deltaS = A[indexS]-A[index_out];
if (yIndex>1)
deltaW = A[indexW]-A[index_out];
if (yIndex<M)
deltaE = A[indexE]-A[index_out];
if (zIndex>1)
deltaU = A[indexU]-A[index_out];
if (zIndex<Z)
deltaD = A[indexD]-A[index_out];
A[index_out] = deltaN ; // works for deltaN but not for deltaS, deltaW... .
}
Thanks a lot for help!