I've got the following kernel:
__global__ void myKernel(int k, int inc, int width, int* d_Xco, int* d_Xnum,
                         bool* d_Xvalid, float* d_X)
{
    int i, k1;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    //k1 = threadIdx.y + blockIdx.y * blockDim.y;
    if( i < k ){
        for(k1 = 0; k1 < inc; k1++){
            int mul = (d_X[i*inc + k1] >= 2e2);   // 1 if this value is >= 200, 0 otherwise
            d_X[i*inc + k1] *= (float)(!mul);     // zero out values >= 200
            d_Xco[i*width + k1] = k*mul;
            d_Xnum[i] += mul;                     // count how many values were zeroed in row i
            d_Xvalid[i*inc + k1] = (!mul);
        }
    } // end of if
}
which is called this way:
int bx = (int)(k/32)+1;
int by = (int)(inc/32)+1;
dim3 b(bx, by);
dim3 t(32, 32);
myKernel<<< b, t >>>( k, inc, width, d_Xco, d_Xnum, d_Xvalid, d_X );
cudaThreadSynchronize();
k is around 9000 and inc is around 5000, so I am sure I am not exceeding the maximum number of blocks. If myKernel is launched with 1 thread / 1 block in the y dimension, it seems to work fine; however, just changing the number of threads and blocks in the y dimension to 10, for example, gives wrong output, even though inside the kernel I am not actually using the threads and blocks in y. Ideally, I would like to get rid of the for() loop by using k1 = threadIdx.y + blockIdx.y * blockDim.y, as in the sketch below.
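Something like this is what I have in mind (an untested sketch; I am assuming the 32x32 thread-block launch above, and I suspect the d_Xnum[i] += mul accumulation would then need atomicAdd, since several threads along y would update the same row i):

__global__ void myKernel2D(int k, int inc, int width, int* d_Xco, int* d_Xnum,
                           bool* d_Xvalid, float* d_X)
{
    int i  = threadIdx.x + blockIdx.x * blockDim.x;   // row index, 0..k-1
    int k1 = threadIdx.y + blockIdx.y * blockDim.y;   // column index, 0..inc-1
    if( i < k && k1 < inc ){
        int mul = (d_X[i*inc + k1] >= 2e2);
        d_X[i*inc + k1] *= (float)(!mul);
        d_Xco[i*width + k1] = k*mul;
        atomicAdd(&d_Xnum[i], mul);    // many threads update row i concurrently
        d_Xvalid[i*inc + k1] = (!mul);
    }
}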