I'm trying to utilize cuda to accelerate my codes, it's been working, until this kernel. What the kernel should do, is to copy a mxn matrix d_A, to another memory space, ignoring the pth row. Resulting in a (m-1)xn matrix.
__global__ void copyWOp(int m, int n, int p, double* d_tobeCopied, double* d_Copied) //copy tobeCopied to copied without pth row
{
int thread2Dpx=blockIdx.x * blockDim.x + threadIdx.x;
int thread2Dpy=blockIdx.y * blockDim.y + threadIdx.y;
if (thread2Dpx>=m || thread2Dpy>=n)
return;
int thread1Dp=thread2Dpy*(m)+thread2Dpx;
if (thread2Dpx<p)
d_Copied[thread2Dpy*(m-1)+thread2Dpx]=d_tobeCopied[thread1Dp];
else if (thread2Dpx==p)
return;
else
d_Copied[thread2Dpy*(m-1)+thread2Dpx-1]=d_tobeCopied[thread1Dp];
}
and the following is how I call the kernel
cudaMalloc(&d_newA,(m-1)*n*sizeof(double));
const dim3 blockSize1(32,32,1);
const dim3 gridSize1 ((m + blockSize1.x - 1) / blockSize1.x, (n + blockSize1.y - 1) / blockSize1.y,1);
copyWOp<<<blockSize1,gridSize1>>>(m,n,p,d_A,d_newA);
cudaFree(d_A);
d_A=d_newA;
But somehow, when I checked the norm of d_newA after the kernel, it gives a flat zero, where d_A is not. So the kernel is clearly not working.
I have several similar kernel written, and they all work using the exact same indexing variables.
I know the kernel is quite naive, but I want to get things working first before optimizing.