I am trying to use an array in constant memory which is to be filled in a cpp main. When I copy to the symbol and copy back from it I observe all 0s, similarly if I try to use the array in the constant memory I end up getting all 0s.
What am I doing wrong?
(I have tried declaring the array "extern", but all of my attempts ended in unresolved-symbol build errors; I also tried renaming dummy.h to dummy.cuh. I will probably hear that symbols are defined with file-local scope, but that alone probably will not help me much :) )
Here is the edited version, I am using VS2017 community edition with CUDA 10.1:
//dummy.h
#pragma once  // guard against double inclusion (was missing)
#include <cuda_runtime.h>

// Enqueues dummyBackTransferKernel on `stream`, copying the first `size`
// elements of the device-side __constant__ array `order` into d_array.
// d_array must be a device pointer with room for `size` floats;
// stream must be non-null. The call is asynchronous.
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream);
//dummy.cpp
#include "dummy.h"

#include <cstdio>
__constant__ float order[300];
// Integer ceiling division: returns the smallest integer >= x/y
// (for non-negative x and positive y).
inline int idivCeil(int x, int y)
{
    const int biased = x + y - 1;  // bias numerator so truncation rounds up
    return biased / y;
}
// Copies the first `size` entries of the file-level __constant__ array
// `order` into the device buffer d_array, one thread per element.
// Launch with at least `size` total threads; extra threads exit early.
__global__ void dummyBackTransferKernel(float* d_array, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size)
        return;  // tail threads past the data do nothing
    d_array[idx] = order[idx];
}
// Host-side launcher for dummyBackTransferKernel.
// d_array: device pointer with room for `size` floats.
// size:    element count; must not exceed the 300-entry `order` array,
//          or the kernel reads past the constant buffer.
// stream:  non-null CUDA stream the kernel is enqueued on (asynchronous).
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream)
{
    const dim3 blockSize(32);
    const dim3 gridSize(idivCeil(size, blockSize.x));
    dummyBackTransferKernel<<<gridSize, blockSize, 0, *stream>>>(d_array, size);
    // Kernel launches return no status directly; a bad configuration is
    // only visible through cudaGetLastError() (was previously unchecked).
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "dummyBackTransferKernel launch failed: %s\n",
                     cudaGetErrorString(err));
    }
}
//main.cpp
#include "dummy.h"
#include "../Preprocessor.h"
#include <iostream>
#define TEST_SIZE 250
int main(int argc, char** argv)
{
CHK_CUDA(cudaSetDevice(0));
cudaStream_t testStream;
cudaStreamCreate(&testStream);
std::cout << "Const test" << std::endl;
float* c_buf; //Host array as input
CHK_CUDA(cudaMallocHost((void**)&c_buf, sizeof(float) * TEST_SIZE));
float* ct_buf; //Host array to contain the symbols after cudaMemcpyToSymbol+cudaMemcpyFromSymbol
CHK_CUDA(cudaMallocHost((void**)&ct_buf, sizeof(float) * TEST_SIZE));
float* cd_buf; //Device array to contain result of a kernel using the constant memory
CHK_CUDA(cudaMalloc((void**)&cd_buf, sizeof(float) * TEST_SIZE));
float* ch_buf; //Result of the kernel copied back to host
CHK_CUDA(cudaMallocHost((void**)&ch_buf, sizeof(float) * TEST_SIZE));
for (int pp = 0; pp < TEST_SIZE; ++pp) {
c_buf[pp] = (float)rand() / RAND_MAX;
}
cudaMemcpyToSymbolAsync(order, c_buf, TEST_SIZE * sizeof(float), 0, cudaMemcpyHostToDevice, testStream);
cudaMemcpyFromSymbolAsync(ct_buf, order, TEST_SIZE * sizeof(float), 0, cudaMemcpyDeviceToHost, testStream);
dummyBackTransferStream(cd_buf, TEST_SIZE, &testStream);
CHK_CUDA(cudaMemcpy(ch_buf, cd_buf , sizeof(float) * TEST_SIZE, cudaMemcpyDeviceToHost));
cudaStreamSynchronize(testStream);
for (int pp = 0; pp < TEST_SIZE; ++pp) {
std::cout << c_buf[pp] << " " << ch_buf[pp] << " " << ct_buf[pp] << std::endl;
}
std::cout << "done!" << std::endl;
return 0;
}
The outcome
1>main.cpp(29): error C2065: 'order': undeclared identifier
1>main.cpp(30): error C2065: 'order': undeclared identifier
1> 0 Warning(s)
1> 2 Error(s)
1>
1>Time Elapsed 00:00:04.29
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========
Here is the related portion of the CMake file:
set(CUDA_NVCC_FLAGS
${CUDA_NVCC_FLAGS};
-gencode arch=compute_70,code=sm_70
-gencode arch=compute_62,code=sm_62
-gencode arch=compute_61,code=sm_61
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_35,code=sm_35
-use_fast_math
-rdc=true
)
Thank you for your help, it will be greatly appreciated.