0

I have an array of two rows and I have two kernels calculating values of each row. The second kernel needs to use the first row to calculate the second row. I thought this would not be a problem since device variables are alive throughout the application and wrote the code below.

#define IDX(x, y, N) (x * N + y)

__global__ void first_forward(int N, int K, float * alpha)
{
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    alpha[IDX(0, x, N)] = tex1Dfetch(text_pi,x) + tex1Dfetch(text_B,IDX(x, tex1Dfetch(text_O, 0), K)); 
}

__global__ void forward_step(float * alpha, int N, int K, int step){
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int x = bx * BLOCK_SIZE + tx;
    float sum = logf(0);

    __shared__ float salpha[BLOCK_SIZE];
    __shared__ float sB[BLOCK_SIZE];

    int i,j;
    for(i = 0; i < N; i+= BLOCK_SIZE){
        // if i + tx < N

        salpha[tx] = alpha[IDX(step-1,i + tx, N)];

        sB[tx] = tex1Dfetch(text_B, IDX(x, tex1Dfetch(text_O, step), K));


        __syncthreads();
        printf("thread %x, loop %d , indexes: %d, %d ,salpha %.4f \n", x, i, step-1, i+ tx, exp(alpha[IDX(step-1,i + tx, N)]));


        for(j = 0; j < BLOCK_SIZE; j++)
            sum = add_logs(sum, salpha[j] + tex1Dfetch(text_A, IDX(i + j, x, N)) + sB[tx]);

        __syncthreads();
    }

    alpha[IDX(step, x, N)] = sum;
    printf("thread %d, step %d result %.4f \n", x, step, sum);



}

int main(int argc, char *argv[]){

float *A, *B, *pi, *alpha, *beta, *xi, *gamm;
float *dA, *dB, *dpi, *dbeta, *dalpha, *dxi, *dgamm;

....


checkCudaErrors( cudaMalloc((void**)&dalpha, sizeof(float) * N * L));
checkCudaErrors( cudaMalloc((void**)&dbeta, sizeof(float) * N * L));

first_forward<<<dimGrid, dimBlock, 0, stream_forw>>>(N,K, dalpha);
forward_step<<<dimGrid, dimBlock, 0, stream_forw>>>(dalpha, N, K, i);

checkCudaErrors(cudaMemcpyAsync(alpha + i * N,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
checkCudaErrors(cudaMemcpyAsync(alpha,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
}


When I print the first row values of dalpha inside the second kernel (forward_step), all values are printed as 1.0s. However when I copy the variable dalpha to host and I print the first row values, they are all correct. Why can this be happening? Since my second kernel does not get the first row values, all my calculations are wrong.

talonmies
  • 70,661
  • 34
  • 192
  • 269
diane
  • 13
  • 3

1 Answers1

1

I solved the problem. For some reason whenever I was trying to reach alpha in the kernel as alpha[IDX(step-1,i + tx, N)], I was not getting the right results. I changed it to alpha[IDX((step-1),(i + tx), N)] and everything is fine.

talonmies
  • 70,661
  • 34
  • 192
  • 269
diane
  • 13
  • 3