I have an array of two rows and I have two kernels calculating values of each row. The second kernel needs to use the first row to calculate the second row. I thought this would not be a problem since device variables are alive throughout the application and wrote the code below.
#define IDX(x, y, N) (x * N + y)
__global__ void first_forward(int N, int K, float * alpha)
{
int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
alpha[IDX(0, x, N)] = tex1Dfetch(text_pi,x) + tex1Dfetch(text_B,IDX(x, tex1Dfetch(text_O, 0), K));
}
__global__ void forward_step(float * alpha, int N, int K, int step){
int bx = blockIdx.x;
int tx = threadIdx.x;
int x = bx * BLOCK_SIZE + tx;
float sum = logf(0);
__shared__ float salpha[BLOCK_SIZE];
__shared__ float sB[BLOCK_SIZE];
int i,j;
for(i = 0; i < N; i+= BLOCK_SIZE){
// if i + tx < N
salpha[tx] = alpha[IDX(step-1,i + tx, N)];
sB[tx] = tex1Dfetch(text_B, IDX(x, tex1Dfetch(text_O, step), K));
__syncthreads();
printf("thread %x, loop %d , indexes: %d, %d ,salpha %.4f \n", x, i, step-1, i+ tx, exp(alpha[IDX(step-1,i + tx, N)]));
for(j = 0; j < BLOCK_SIZE; j++)
sum = add_logs(sum, salpha[j] + tex1Dfetch(text_A, IDX(i + j, x, N)) + sB[tx]);
__syncthreads();
}
alpha[IDX(step, x, N)] = sum;
printf("thread %d, step %d result %.4f \n", x, step, sum);
}
int main(int argc, char *argv[]){
float *A, *B, *pi, *alpha, *beta, *xi, *gamm;
float *dA, *dB, *dpi, *dbeta, *dalpha, *dxi, *dgamm;
....
checkCudaErrors( cudaMalloc((void**)&dalpha, sizeof(float) * N * L));
checkCudaErrors( cudaMalloc((void**)&dbeta, sizeof(float) * N * L));
first_forward<<<dimGrid, dimBlock, 0, stream_forw>>>(N,K, dalpha);
forward_step<<<dimGrid, dimBlock, 0, stream_forw>>>(dalpha, N, K, i);
checkCudaErrors(cudaMemcpyAsync(alpha + i * N,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
checkCudaErrors(cudaMemcpyAsync(alpha,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
}
When I print the first row values of dalpha inside the second kernel (forward_step), all values are printed as 1.0s. However when I copy the variable dalpha to host and I print the first row values, they are all correct. Why can this be happening? Since my second kernel does not get the first row values, all my calculations are wrong.