I have a CUDA kernel that does some computation on a local variable (held in a register); once the value has been computed, it is written into a global array `p`:
// Computes one value per (i, j, k) grid point and stores it in the flattened
// array p.
//
// Launch layout: 3D grid of 3D blocks; the thread with global coordinates
// (i, j, k) handles element (i, j, k). Threads that fall outside
// dimX x dimY x dimZ exit without writing, so the grid may be rounded up.
// p must hold at least dimX*dimY*dimZ floats, laid out x-fastest, then y,
// then z. No shared memory is used.
__global__ void dd( float* p, int dimX, int dimY, int dimZ )
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;
    const int k = blockIdx.z*blockDim.z + threadIdx.z;

    if (i >= dimX || j >= dimY || k >= dimZ)
    {
        return;
    }

    // The slice offset has to be driven by k. The previous formula
    // (j*dimX*dimY + j*dimX + i) never used k, so all threads sharing (i, j)
    // raced on the same element and most of p was never written.
    // Computed in size_t so large volumes cannot overflow int.
    const size_t idx = (static_cast<size_t>(k)*dimY + j)*dimX + i;

    const float val = SomeComputationOnVal();
    p[idx] = val;

    // No __syncthreads() here: the kernel shares no data between threads, and
    // a barrier placed after the divergent early return above would not be
    // reached by every thread of a partially out-of-range block.
}
Unfortunately, this kernel executes very slowly.
However, it runs very fast if I do this:
// Timing variant of dd with the computation disabled: each in-range thread
// only stores 0 to its element of the flattened array p.
//
// Launch layout: 3D grid of 3D blocks; the thread with global coordinates
// (i, j, k) handles element (i, j, k). Threads outside dimX x dimY x dimZ exit
// without writing. p must hold at least dimX*dimY*dimZ floats, laid out
// x-fastest, then y, then z. No shared memory is used.
__global__ void dd( float* p, int dimX, int dimY, int dimZ )
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;
    const int k = blockIdx.z*blockDim.z + threadIdx.z;

    if (i >= dimX || j >= dimY || k >= dimZ)
    {
        return;
    }

    // The slice offset has to be driven by k. The previous formula
    // (j*dimX*dimY + j*dimX + i) never used k, so all threads sharing (i, j)
    // wrote to the same element.
    // Computed in size_t so large volumes cannot overflow int.
    const size_t idx = (static_cast<size_t>(k)*dimY + j)*dimX + i;

    float val = 0;
    //val = SomeComputationOnVal();   // intentionally disabled in this variant
    // With the call disabled the kernel does nothing but this store, so its
    // runtime reflects only the memory write, not the computation.
    p[idx] = val;

    // No __syncthreads() here: the kernel shares no data between threads, and
    // a barrier placed after the divergent early return above would not be
    // reached by every thread of a partially out-of-range block.
}
It also runs very fast if I do this:
// Timing variant of dd with the global store disabled: each in-range thread
// calls SomeComputationOnVal() but the result is never written anywhere.
//
// Launch layout: 3D grid of 3D blocks; the thread with global coordinates
// (i, j, k) corresponds to element (i, j, k). Threads outside
// dimX x dimY x dimZ exit immediately. No shared memory is used.
__global__ void dd( float* p, int dimX, int dimY, int dimZ )
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    const int j = blockIdx.y*blockDim.y + threadIdx.y;
    const int k = blockIdx.z*blockDim.z + threadIdx.z;

    if (i >= dimX || j >= dimY || k >= dimZ)
    {
        return;
    }

    // The slice offset has to be driven by k. The previous formula
    // (j*dimX*dimY + j*dimX + i) never used k. Kept (although unused while the
    // store below is disabled) so the store can be re-enabled as-is.
    const size_t idx = (static_cast<size_t>(k)*dimY + j)*dimX + i;

    float val = 0;
    val = SomeComputationOnVal();
    // NOTE(review): with the store disabled, val is never observed. If
    // SomeComputationOnVal() has no other visible side effects (its definition
    // is not shown here), the compiler is free to drop the call as dead code,
    // so a fast runtime for this variant probably does not measure the
    // computation at all - confirm by inspecting the generated SASS/PTX.
    // p[idx ]= val;

    // No __syncthreads() here: the kernel shares no data between threads, and
    // a barrier placed after the divergent early return above would not be
    // reached by every thread of a partially out-of-range block.
}
So I am confused and have no idea how to solve this problem. I stepped through the kernel with Nsight and did not find any access violations.
Here is how I launch the kernel (dimX: 924, dimY: 16, dimZ: 1120):
// Launch configuration for dd: one thread per (x, y, z) element.
dim3
    blockSize(8,16,2),
    // Round every grid dimension up (ceil-div) so that partial blocks at the
    // edges are still launched when a dimension is not a multiple of the block
    // size; the kernel's bounds check discards the surplus threads. The
    // previous form truncated in y and z and launched a spare block in x
    // whenever dimX was an exact multiple.
    gridSize((dimX + blockSize.x - 1)/blockSize.x,
             (dimY + blockSize.y - 1)/blockSize.y,
             (dimZ + blockSize.z - 1)/blockSize.z);
// NOTE(review): with blockDim.x == 8, a 32-thread warp spans four different
// rows of p, so its stores are split across several memory segments; a wider
// x extent (e.g. 32) may coalesce better - profile before changing.

float* dev_p;
// Widen to size_t before multiplying so the byte count cannot overflow int.
// NOTE(review): the return values of cudaMalloc and of the launch (via
// cudaGetLastError) are not checked here; a failure would go unnoticed.
cudaMalloc((void**)&dev_p, static_cast<size_t>(dimX)*dimY*dimZ*sizeof(float));
dd<<<gridSize, blockSize>>>( dev_p,dimX,dimY,dimZ);
Could anyone please give me some pointers? This does not make much sense to me. All of the computation of `val` is fast, and the final step is just to move `val` into `p`. `p` is never involved in the computation, and it only shows up once. So why is it so slow?
The computation is basically a loop over a 512 x 512 matrix. It is a fairly large amount of computation, I'd say.