0

I have a CUDA application in which I am trying to use constant memory. The data in constant memory is only recognized inside the kernel when the kernel is written in the same file as the main function. If I define the kernel function in some other file, the constant memory reads as 0, although the rest of the operation runs properly. Below is a simple dummy program that shows the problem more clearly. It has a 48x48 matrix divided into 16x16 blocks, filled with random numbers from 1 to 50. Inside the kernel, I add the numbers stored in constant memory to each row of a block. The code is given below:

Header File:

#include <windows.h>
#include <dos.h>
#include <stdio.h>
#include <conio.h>
#include <math.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <curand.h>
#include <curand_kernel.h>

// 16-entry int table in constant memory. The host fills it with
// cudaMemcpyToSymbol and test_kernel_1 reads it indexed by threadIdx.x.
// NOTE(review): unless the build uses separate compilation with device
// linking, the definition, the cudaMemcpyToSymbol call and the kernel that
// reads it should all live in the same file -- confirm the build setup.
__constant__ int test_cons[16];

// Kernel prototype: (input matrix, output matrix), both device pointers.
__global__ void test_kernel_1(int *,int *);

Main Program :

/* Abort with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status,const char *what)
{   if(status != cudaSuccess)
    {   fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Fills a 48x48 host matrix with random values, uploads it, launches
 * test_kernel_1 (which adds test_cons[threadIdx.x] to every element) on a
 * 3x3 grid of 16x16 blocks, then prints the input and the result.
 *
 * Every CUDA runtime call is checked so that a failing copy, allocation or
 * launch is reported instead of silently producing zeros.
 *
 * Returns 0 on success; exits with status 1 on any allocation or CUDA error.
 */
int main(int argc,char *argv[])
{   const int n    = 48;                       /* matrix is n x n            */
    const int tile = 16;                       /* block edge; matches test_cons[16] */
    const size_t bytes = n*n*sizeof(int);
    int *mat,*dev_mat,*res,*dev_res;
    int i,j;
    int test[16 ]   = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};

    /* Upload the per-column offsets into constant memory. */
    check_cuda(cudaMemcpyToSymbol(test_cons,test,16*sizeof(int)),
               "cudaMemcpyToSymbol(test_cons)");

    mat = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if(mat == NULL || res == NULL)
    {   fprintf(stderr,"host allocation of %lu bytes failed\n",(unsigned long)bytes);
        free(mat);
        free(res);
        return 1;
    }
    memset(res,0,bytes);

    srand(time(NULL));
    for(i=0;i<n;i++)
    {   for(j=0;j<n;j++)
        {   /* rand()%49+1 yields values in 1..49 (50 itself is never produced). */
            mat[i*n+j] = rand()%(50-1)+1;
            printf("%d\t",mat[i*n+j] );
        }
        printf("\n");
    }

    check_cuda(cudaMalloc((void **)&dev_mat,bytes),"cudaMalloc(dev_mat)");
    check_cuda(cudaMemcpy(dev_mat,mat,bytes,cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> dev_mat)");
    check_cuda(cudaMalloc((void **)&dev_res,bytes),"cudaMalloc(dev_res)");

    dim3 grid(n/tile,n/tile,1);
    dim3 block(tile,tile,1);

    test_kernel_1<<< grid,block>>>(dev_mat,dev_res);
    /* A launch does not return a status; bad configurations surface here. */
    check_cuda(cudaGetLastError(),"test_kernel_1 launch");

    /* Blocking copy: also reports errors raised while the kernel was running. */
    check_cuda(cudaMemcpy(res,dev_res,bytes,cudaMemcpyDeviceToHost),
               "cudaMemcpy(dev_res -> host)");

    printf("\n\n\n\n");
    for(i=0;i<n;i++)
    {   for(j=0;j<n;j++)
        {   printf("%d\t",res[i*n+j] );
        }
        printf("\n");
    }

    check_cuda(cudaFree(dev_mat),"cudaFree(dev_mat)");
    check_cuda(cudaFree(dev_res),"cudaFree(dev_res)");
    free(mat);
    free(res);
    return 0;
}

Kernel Function :

/*
 * Adds test_cons[threadIdx.x] to each element of a 48x48 row-major int
 * matrix: dev_res[r][c] = dev_mat[r][c] + test_cons[threadIdx.x].
 *
 * Launch layout: 2D grid of 2D blocks, one thread per matrix element
 * (the host code uses a 3x3 grid of 16x16 blocks). test_cons has 16
 * entries, so blockDim.x must not exceed 16.
 *
 * dev_mat: device pointer to the 48*48 input values.
 * dev_res: device pointer to the 48*48 output values.
 */
__global__ void test_kernel_1(int *dev_mat,int* dev_res)
{
    const int n = 48;   /* matrix edge length assumed by the indexing below */
    int row = blockIdx.y*blockDim.y+threadIdx.y;
    int col = blockIdx.x*blockDim.x +threadIdx.x;

    /* Threads outside the matrix (grid larger than n x n) must not touch memory. */
    if (row >= n || col >= n)
        return;

    dev_res[row*n+col] = dev_mat[row*n+col] + test_cons[threadIdx.x];
}

Now, when I define the kernel function in the same file as the main program, the constant memory values are correct; if the kernel is in a different file, the test_cons[threadIdx.x] values become 0.

I came across this link, which discusses more or less the same problem, but I do not fully understand it. It would be very helpful if someone could tell me why this is happening and what I need to do to avoid this problem. Any sort of help would be highly appreciated. Thanks.

Community
  • 1
  • 1
duttasankha
  • 717
  • 2
  • 10
  • 32
  • You have answered your own question with the link you have provided. The link exactly says how to solve your problem. You have to declare a wrapper in main file and call that wrapper from the file where you have the kernel to copy the values into the constant memory. – Sagar Masuti Sep 23 '13 at 02:59
  • Yeah I kind of understand that the link actually answers my question but it would be helpful if you could write the necessary codes in the program I have provided and then post it as answer. I am not able to understand the link totally. Thanks – duttasankha Sep 23 '13 at 03:03
  • I did something mentioned in this link http://stackoverflow.com/questions/2450556/allocate-constant-memory but it is still not working. I am not totally understanding how the wrapper would be done to avoid the situation. – duttasankha Sep 23 '13 at 03:48

2 Answers

2

I just recently answered a similar question here

CUDA can handle code that references device code (entry points) or symbols in other files, but it requires separate compilation with device linking (as described and linked in the link I gave above). (And separate compilation/linking requires CC 2.0 or greater)

So if you modify the link steps you can have your __constant__ variable in a given file, and reference it from a different file.

If not (if you don't specify separate compilation and device linking), then the device code that references the __constant__ variable, the host code that references the __constant__ variable, and the definition/declaration of the variable itself, all need to be in the same file.

So this:

__constant__ int test_cons[16];

This:

cudaMemcpyToSymbol(test_cons,test,16*sizeof(int));

And this:

dev_res[row*48+col] = dev_mat[row*48+col] + test_cons[threadIdx.x];

all need to be in the same file.

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • I did exactly the same you have mentioned. I declared the constant variable, cudaMemcpyToSymbol and kernel statement in the same file. But it is still not working. I did the cudaMemcpyToSymbol in a function which I am calling from the main function, declared __constant__ and kernel, all three in the same file but it is still not working. – duttasankha Sep 23 '13 at 03:57
  • Perhaps you should post the example where you are doing everything in a single file. I don't see that you are doing any [cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) in what you have posted so far. I took the 3 different pieces of code you have posted so far, put them all in the same file, and it seemed to compile and run successfully for me. For instance the first 3 numbers in the data were 2 28 17 and the first 3 numbers in the results were 3 30 20 etc. – Robert Crovella Sep 23 '13 at 13:16
  • Actually there I was missing something in the main function and I corrected it. Now it is working fine and giving the correct result. I have accepted the answer after it started working properly. Thanks for your help. – duttasankha Sep 23 '13 at 20:07
1

The above answer is totally acceptable; I am adding this one since the user was not able to make it work. You can accept the above answer; this is just for your reference.

Kernel.cu file:

#include <stdio.h>

// 16-entry int table in constant memory; written by copymemory() and read by
// test_kernel_1, both defined in this same file.
__constant__ int test_cons[16];

/*
 * Host-side wrapper that copies 16 ints from `test` into the __constant__
 * array test_cons. It is defined in the same file as test_cons so that code
 * in other files can fill the table without referencing the symbol directly.
 *
 * test: host pointer to at least 16 ints.
 *
 * The function keeps its void signature; a NULL argument or a failed copy is
 * reported on stderr instead of being silently ignored.
 */
void copymemory (int *test)
{
      if (test == NULL)
      {
            fprintf(stderr, "copymemory: NULL source pointer\n");
            return;
      }

      cudaError_t status = cudaMemcpyToSymbol(test_cons,test,16*sizeof(int));
      if (status != cudaSuccess)
      {
            fprintf(stderr, "copymemory: cudaMemcpyToSymbol failed: %s\n",
                    cudaGetErrorString(status));
      }
}

/*
 * Adds test_cons[threadIdx.x] to each element of a 48x48 row-major int
 * matrix and, for debugging, prints the first constant-table entry from
 * every thread whose threadIdx.x is 0.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per matrix element
 * (the host code uses a 3x3 grid of 16x16 blocks). test_cons has 16
 * entries, so blockDim.x must not exceed 16.
 *
 * dev_mat: device pointer to the 48*48 input values.
 * dev_res: device pointer to the 48*48 output values.
 */
__global__ void test_kernel_1(int *dev_mat,int* dev_res)
{
    const int n = 48;   /* matrix edge length assumed by the indexing below */
    int row = blockIdx.y*blockDim.y+threadIdx.y;
    int col = blockIdx.x*blockDim.x +threadIdx.x;

    /* Debug output: device printf is slow, keep it for diagnosis only. */
    if (threadIdx.x ==0)
    {
        printf ("testcons[0] is %d\n", test_cons[threadIdx.x]) ;
    }

    /* Only threads that map to a real matrix element may touch memory. */
    if (row < n && col < n)
    {
        dev_res[row*n+col] = dev_mat[row*n+col] + test_cons[threadIdx.x];
    }
 }

simple.cu file

#include <stdio.h>
#include <math.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>


// Defined in Kernel.cu: copies 16 ints from the given host array into the
// __constant__ array test_cons.
void copymemory (int *temp) ;

// Kernel defined in Kernel.cu; arguments are (input matrix, output matrix),
// both device pointers.
__global__ void test_kernel_1(int *,int *);

/* Abort with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Fills a 48x48 host matrix with random values, loads the constant table
 * through copymemory(), launches test_kernel_1 on a 3x3 grid of 16x16 blocks
 * and copies the result back. Host-side printing is disabled; the kernel's
 * own printf shows the value it sees in test_cons[0].
 *
 * Every CUDA runtime call made here is checked so failures are reported
 * instead of silently yielding zeros.
 *
 * Returns 0 on success; exits with status 1 on any allocation or CUDA error.
 */
int main(int argc,char *argv[])
{
    const int n    = 48;                 /* matrix is n x n                    */
    const int tile = 16;                 /* block edge; matches test_cons[16]  */
    const size_t bytes = n*n*sizeof(int);
    int *mat,*dev_mat,*res,*dev_res;
    int i,j;
    int test[16 ]   = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};

    mat = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if (mat == NULL || res == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(mat);
        free(res);
        return 1;
    }
    memset(res,0,bytes);

    /* Upload the per-column offsets into constant memory. */
    copymemory (test) ;

    srand(time(NULL));
    for(i=0;i<n;i++)
    {
            for(j=0;j<n;j++)
            {
                /* rand()%49+1 yields values in 1..49 (50 itself is never produced). */
                mat[i*n+j] = rand()%(50-1)+1;
                //printf("%d\t",mat[i*n+j] );
            }
            //printf("\n");
    }

    check_cuda(cudaMalloc((void **)&dev_mat,bytes), "cudaMalloc(dev_mat)");
    check_cuda(cudaMemcpy(dev_mat,mat,bytes,cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> dev_mat)");
    check_cuda(cudaMalloc((void **)&dev_res,bytes), "cudaMalloc(dev_res)");

    dim3 grid(n/tile,n/tile,1);
    dim3 block(tile,tile,1);

    test_kernel_1<<< grid,block>>>(dev_mat,dev_res);
    /* A launch does not return a status; bad configurations surface here. */
    check_cuda(cudaGetLastError(), "test_kernel_1 launch");

    /* Blocking copy: also reports errors raised while the kernel was running. */
    check_cuda(cudaMemcpy(res,dev_res,bytes,cudaMemcpyDeviceToHost),
               "cudaMemcpy(dev_res -> host)");

    /* Result printing is disabled; to re-enable, loop over i,j and
       printf("%d\t",res[i*n+j]) followed by printf("\n") per row. */

    check_cuda(cudaFree(dev_mat), "cudaFree(dev_mat)");
    check_cuda(cudaFree(dev_res), "cudaFree(dev_res)");
    free(mat);
    free(res);
    return 0;
}

I have commented your printf. And the printf in the kernel prints the value 1. I also tested by changing the value of test[0] in main function and it works perfectly.

Sagar Masuti
  • 1,271
  • 2
  • 11
  • 30