Device variables among different kernels in CUDA

Question

I have an array of two rows and I have two kernels calculating values of each row. The second kernel needs to use the first row to calculate the second row. I thought this would not be a problem since device variables are alive throughout the application and wrote the code below.

#define IDX(x, y, N) (x * N + y)

__global__ void first_forward(int N, int K, float * alpha)
{
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    alpha[IDX(0, x, N)] = tex1Dfetch(text_pi,x) + tex1Dfetch(text_B,IDX(x, tex1Dfetch(text_O, 0), K)); 
}

__global__ void forward_step(float * alpha, int N, int K, int step){
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int x = bx * BLOCK_SIZE + tx;
    float sum = logf(0);

    __shared__ float salpha[BLOCK_SIZE];
    __shared__ float sB[BLOCK_SIZE];

    int i,j;
    for(i = 0; i < N; i+= BLOCK_SIZE){
        // if i + tx < N

        salpha[tx] = alpha[IDX(step-1,i + tx, N)];

        sB[tx] = tex1Dfetch(text_B, IDX(x, tex1Dfetch(text_O, step), K));


        __syncthreads();
        printf("thread %x, loop %d , indexes: %d, %d ,salpha %.4f \n", x, i, step-1, i+ tx, exp(alpha[IDX(step-1,i + tx, N)]));


        for(j = 0; j < BLOCK_SIZE; j++)
            sum = add_logs(sum, salpha[j] + tex1Dfetch(text_A, IDX(i + j, x, N)) + sB[tx]);

        __syncthreads();
    }

    alpha[IDX(step, x, N)] = sum;
    printf("thread %d, step %d result %.4f \n", x, step, sum);



}

int main(int argc, char *argv[]){

float *A, *B, *pi, *alpha, *beta, *xi, *gamm;
float *dA, *dB, *dpi, *dbeta, *dalpha, *dxi, *dgamm;

....


checkCudaErrors( cudaMalloc((void**)&dalpha, sizeof(float) * N * L));
checkCudaErrors( cudaMalloc((void**)&dbeta, sizeof(float) * N * L));

first_forward<<<dimGrid, dimBlock, 0, stream_forw>>>(N,K, dalpha);
forward_step<<<dimGrid, dimBlock, 0, stream_forw>>>(dalpha, N, K, i);

checkCudaErrors(cudaMemcpyAsync(alpha + i * N,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
checkCudaErrors(cudaMemcpyAsync(alpha,dalpha,N * sizeof(float),cudaMemcpyDeviceToHost, stream_forw));
}

When I print the first row values of dalpha inside the second kernel (forward_step), all values are printed as 1.0s. However when I copy the variable dalpha to host and I print the first row values, they are all correct. Why can this be happening? Since my second kernel does not get the first row values, all my calculations are wrong.

score 1 · Accepted Answer · edited Nov 28 '19 at 08:30

1

I solved the problem. For some reason whenever I was trying to reach alpha in the kernel as alpha[IDX(step-1,i + tx, N)], I was not getting the right results. I changed it to alpha[IDX((step-1),(i + tx), N)] and everything is fine.

edited Nov 28 '19 at 08:30

talonmies

70,661
34
192
269

answered Nov 28 '19 at 02:09

diane

13
3

This is related to the C preprocessor and nothing to do specifically with CUDA – talonmies Nov 28 '19 at 08:30

Device variables among different kernels in CUDA

1 Answers1