I'm starting to use cuda, trying to improve the speed of my code.
So, I have a chain.h file where I define the Chain_1D structure and some functions
extern int N;
struct Chain_1D
{
int N_mons;
double *x_old = (double*) malloc(sizeof(double)*N);
float *X;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&X, N*sizeof(float));
__global__ void step();
};
__global__
void Chain_1D::step()
{
// Update x_old!
for (int i = 0; i < N_mons; i++) x_old[i] = X[i] ;
int index = threadIdx.x;
int stride = blockDim.x;
for (int j=index ; j<N_mons ; j += stride)
{
if (j==0)
{
X[0] += - (x_old[2]-3*x_old[1]+2*x_old[0])*dt;
} else if (j==1)
{
X[1] += - (x_old[3]-4*x_old[2]+6*x_old[1]-3*x_old[0])*dt ;
} else if (j==N_mons-2)
{
X[N_mons-2] += -(x_old[N_mons-4]-4*x_old[N_mons-3]+6*x_old[N_mons-2]-3*x_old[N_mons-1])*dt;
} else if (j==N_mons-1)
{
X[N_mons-1] += -(x_old[N_mons-3]-3*x_old[N_mons-2]+2*x_old[N_mons-1])*dt;
} else
{
X[j] += - (x_old[j-2]+x_old[j+2]-4*x_old[j-1]-4*x_old[j+1]+6*x_old[j])*dt;
}
}
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
}
Then I intend to use this in the main()
function (and main.cu file)
Chain_1D chain;
chain.init_Chain();
for (int i = 1; i < N_runs; i++)
{
chain.step<<<1, 200>>>();
}
cudaFree(chain.X);
It doesn't work, I believe it is due to the definition of this __global__
function. Can anyone help?