I want to prepare my CUDA kernels to work on a large number of particles (far exceeding 65535, the maximum grid dimension on my card). I tried to create a proper thread-index mapping that works for any <<<numBlocks, threadsPerBlock>>> configuration.
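(For context: the general pattern I've seen for covering an arbitrary element count is a grid-stride loop. A minimal sketch of what I mean — the kernel name fill_indices is mine, not from my project:)

__global__ void fill_indices(unsigned int* out, size_t n)
{
    // Each thread starts at its global 1D index and strides by the
    // total number of threads in the grid, so any launch configuration
    // covers all n elements.
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        out[i] = (unsigned int)i;
    }
}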
My attempt instead flattens the full 3D thread coordinates into one linear index. I wrote this:
__device__ unsigned int calculateIndex();   // forward declaration so step_k compiles

__global__ void step_k(float* position, size_t numElements, unsigned int* blabla)
{
    unsigned int i = calculateIndex();
    if (i < numElements) {
        blabla[i] = i;
    }
}

__device__ unsigned int calculateIndex()
{
    // Flatten (x, y, z) global thread coordinates into a linear index:
    // index = z * (xSize * ySize) + y * xSize + x
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int zIndex = blockIdx.z * blockDim.z + threadIdx.z;
    unsigned int xSize  = gridDim.x * blockDim.x;
    unsigned int ySize  = gridDim.y * blockDim.y;
    return xSize * ySize * zIndex + xSize * yIndex + xIndex;
}
and I use it this way:
void CudaSphFluids::step(void)
{
    //dim3 threadsPerBlock(1024, 1024, 64);
    //dim3 numBlocks(65535, 65535, 65535);
    dim3 numBlocks(1, 1, 1);
    dim3 threadsPerBlock(256, 256, 1);

    unsigned int result[256] = {};
    unsigned int* d_results;
    cudaMalloc((void**) &d_results, sizeof(unsigned int) * 256);

    step_k<<<numBlocks, threadsPerBlock>>>(d_position, 256, d_results);

    cudaMemcpy(result, d_results, sizeof(unsigned int) * 256, cudaMemcpyDeviceToHost);

    CLOG(INFO, "SPH") << "STEP";
    for (unsigned int t = 0; t < 256; t++) {
        cout << result[t] << "; ";
    }
    cout << endl;

    cudaFree(d_results);
    Sleep(200);
}
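(For what it's worth, I'm not checking CUDA errors after the launch yet; a minimal check would look something like this sketch, assuming CLOG(ERROR, "SPH") behaves like my CLOG(INFO, "SPH"):)

step_k<<<numBlocks, threadsPerBlock>>>(d_position, 256, d_results);
// cudaGetLastError() reports launch-configuration errors;
// cudaDeviceSynchronize() surfaces errors from the kernel run itself.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    CLOG(ERROR, "SPH") << "launch failed: " << cudaGetErrorString(err);
}
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
    CLOG(ERROR, "SPH") << "kernel failed: " << cudaGetErrorString(err);
}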
It seems to be ok (incrementing numbers from 0 to 255) for :
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 1, 1);
It works for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 3, 1);
but when I try to run it for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 5, 1);
or for:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 10, 1);
and for larger values like:
dim3 numBlocks(1, 1, 1);
dim3 threadsPerBlock(256, 256, 1);
the output goes crazy (the numbers are wrong).
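To double-check what my card actually allows, I could query the device limits (a sketch using cudaGetDeviceProperties on device 0):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// Per-block and per-grid limits as reported by the runtime.
printf("maxThreadsPerBlock = %d\n", prop.maxThreadsPerBlock);
printf("maxThreadsDim      = (%d, %d, %d)\n",
       prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf("maxGridSize        = (%d, %d, %d)\n",
       prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);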
Then I tried to use another mapping from some smart guy's website:
__device__ int getGlobalIdx_3D_3D()
{
    // Linear block index within the grid, then linear thread index
    // within the whole grid.
    int blockId = blockIdx.x
                + blockIdx.y * gridDim.x
                + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
                 + (threadIdx.z * (blockDim.x * blockDim.y))
                 + (threadIdx.y * blockDim.x)
                 + threadIdx.x;
    return threadId;
}
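I plugged it into the kernel the same way as before (a sketch; step_k2 is just my renamed copy of step_k):

__global__ void step_k2(float* position, size_t numElements, unsigned int* blabla)
{
    int i = getGlobalIdx_3D_3D();
    if (i >= 0 && (size_t)i < numElements) {
        blabla[i] = i;
    }
}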
Unfortunately it doesn't work either (the numbers are different, but still wrong).
Any ideas what the reason for this strange behavior could be?
I'm using CUDA 6.0 on a GeForce GTX 560 Ti (sm_21), with VS2012 and Nsight.