
I want to declare my texture once and use it in all my kernels and files. Therefore, I declare it as extern in a header and include the header in all other files (following the SO question How do I use extern to share variables between source files?).

I have a header file cudaHeader.cuh containing my texture:

extern texture<uchar4, 2, cudaReadModeElementType> texImage;

In my file1.cu, I allocate my CUDA array and bind it to the texture:

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc< uchar4 >( );
cudaStatus=cudaMallocArray( &cu_array_image, &channelDesc, width, height ); 
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMallocArray failed! cu_array_image couldn't be created.\n");
    return cudaStatus;
}

cudaStatus=cudaMemcpyToArray( cu_array_image, 0, 0, image, size_image, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpyToArray failed! Copy from the host memory to the device texture memory failed.\n");
    return cudaStatus;
}


// set texture parameters
texImage.addressMode[0] = cudaAddressModeWrap;
texImage.addressMode[1] = cudaAddressModeWrap;
texImage.filterMode = cudaFilterModePoint;
texImage.normalized = false;    // do not use normalized texture coordinates

// Bind the array to the texture
cudaStatus=cudaBindTextureToArray( texImage, cu_array_image, channelDesc);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaBindTextureToArray failed! cu_array couldn't be bind to texImage.\n");
    return cudaStatus;
}

In file2.cu, I use the texture in the kernel function as follows:

__global__ void kernel(int width, int height, unsigned char *dev_image) {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    if(y< height) {
        uchar4 tempcolor=tex2D(texImage, x, y);

        //if(tempcolor.x==0)
        //  printf("tempcolor.x %d \n", tempcolor.x);

        dev_image[y*width*3+x*3]= tempcolor.x;
        dev_image[y*width*3+x*3+1]= tempcolor.y;
        dev_image[y*width*3+x*3+2]= tempcolor.z;
    }
}

The problem is that my texture contains nothing, or corrupted values, when I use it in file2.cu. Even if I call the kernel function directly from file1.cu, the data are not correct.

If I add: texture<uchar4, 2, cudaReadModeElementType> texImage; in file1.cu and file2.cu, the compiler says that there is a redefinition.

EDIT:

I tried the same thing with CUDA 5.0, but the same problem appears. If I print the address of texImage in file1.cu and in file2.cu, I don't get the same address. There must be a problem with the declaration of the variable texImage.

Seltymar
  • What CUDA version are you using? – talonmies Oct 12 '12 at 05:53
  • @talonmies CUDA is 4.2, I'm using a quadro 5000 with CC 2.0. – Seltymar Oct 12 '12 at 05:55
  • 1
    OK then, the one line answer is no, you can't do this. The CUDA version you are using doesn't have a device code linker. All device symbols - kernels, textures and other memory declarations *must* be made within the same compilation unit. No external linkage is allowed. – talonmies Oct 12 '12 at 05:58
  • @talonmies In this case, I have to bind my data to a new texture in every file. What happens to the texture when I declare it as a global in the file? When is the variable `texImage` destroyed? Isn't it dangerous to declare my texture as a global in every file? – Seltymar Oct 12 '12 at 06:02
  • 1
    @talonmies Maybe make your comment as an answer so I can accept it. – Seltymar Oct 12 '12 at 06:29
  • Agreed, that is the answer, make it so. :) – harrism Oct 12 '12 at 07:39
  • @Seltymar: I don't think you have fully understood the implications of no external linkage in device code. You can't have "every file" when compiling device code pre-cuda 5.0. You can only have 1 compilation unit containing all device definitions. This effectively means 1 .cu file, into which everything is either directly defined or defined via preprocessor includes. – talonmies Oct 12 '12 at 08:37
  • @talonmies I will have to reorder my program. At the moment, I have in each file a function which sets the device, binds textures, calls its own kernel, and so on. They don't include each other (I guess they compile in different units). As globals and arrays stay in memory until I free them, I can still call different kernels from different compilation units (pointers to global and texture memory are passed as parameters). I don't know what happens during compilation, but it seems to work. The main function can call all functions because they are prototyped. – Seltymar Oct 12 '12 at 09:05
  • Just to clarify on the compilation unit issue, pre-CUDA-5.0 you *can* have different compilation units, but they cannot reference each other (meaning no extern linkage). In other words, you could have kernel_a and all its device functions and textures etc. in one compilation unit and kernel_b and all *its* device functions etc. in another unit. – Tom Oct 15 '12 at 15:54
  • @Tom OK, thanks. And what is the advantage of putting everything in one compilation unit? Does the compiler optimize the whole program better? – Seltymar Oct 16 '12 at 00:32
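
A minimal sketch of the single-compilation-unit layout that talonmies and Tom describe above (the file names and their contents here are hypothetical, not the asker's actual files): the texture is defined once, and every kernel that reads it is pulled into the same translation unit with preprocessor includes, so the pre-CUDA-5.0 toolchain never needs external linkage for device symbols.

// main.cu -- the only .cu file handed to nvcc (pre-CUDA 5.0 workaround, hypothetical layout)
texture<uchar4, 2, cudaReadModeElementType> texImage;   // single definition, no extern needed

#include "image_kernels.cu"    // __global__ functions reading texImage (the former file1.cu kernels)
#include "filter_kernels.cu"   // more __global__ functions reading texImage (the former file2.cu kernels)

int main() {
    // allocate the cudaArray, bind it with cudaBindTextureToArray(texImage, ...),
    // then launch kernels from both included files; everything lives in one compilation unit
    return 0;
}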

1 Answer


This is a very old question, and the answers were provided in the comments by talonmies and Tom. In the pre-CUDA 5.0 scenario, extern textures were not feasible because there was no true device code linker enabling external linkage. As a consequence, and as mentioned by Tom,

you can have different compilation units, but they cannot reference each other

In the post-CUDA 5.0 scenario, extern textures are possible, and I provide a simple example below showing this, in the hope that it could be useful to other users.

kernel.cu compilation unit

#include <stdio.h>

texture<int, 1, cudaReadModeElementType> texture_test;

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/*************************/
/* LOCAL KERNEL FUNCTION */
/*************************/
__global__ void kernel1() {

    printf("ThreadID = %i; Texture value = %i\n", threadIdx.x, tex1Dfetch(texture_test, threadIdx.x));

}

__global__ void kernel2();

/********/
/* MAIN */
/********/
int main() {

    const int N = 16;

    // --- Host data allocation and initialization
    int *h_data = (int*)malloc(N * sizeof(int));
    for (int i=0; i<N; i++) h_data[i] = i;

    // --- Device data allocation and host->device memory transfer
    int *d_data; gpuErrchk(cudaMalloc((void**)&d_data, N * sizeof(int)));
    gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));

    gpuErrchk(cudaBindTexture(NULL, texture_test, d_data, N * sizeof(int)));

    kernel1<<<1, 16>>>();
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    kernel2<<<1, 16>>>();
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaUnbindTexture(texture_test));

}

kernel2.cu compilation unit

#include <stdio.h>

extern texture<int, 1, cudaReadModeElementType> texture_test;

/**********************************************/
/* DIFFERENT COMPILATION UNIT KERNEL FUNCTION */
/**********************************************/
__global__ void kernel2() {

    printf("Texture value = %i\n", tex1Dfetch(texture_test, threadIdx.x));

}

Remember to compile generating relocatable device code, i.e., with -rdc=true, to enable external linkage.
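
For example, assuming the two files above are saved as kernel.cu and kernel2.cu (the output name is arbitrary):

nvcc -rdc=true kernel.cu kernel2.cu -o extern_texture_test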

Vitality