I am trying to use an array in constant memory which is to be filled in a cpp main. When I copy to the symbol and copy back from it I observe all 0s, similarly if I try to use the array in the constant memory I end up getting all 0s.
What am I doing wrong?
(I have tried declaring the array "extern", but all of my attempts ended in unresolved-symbol build errors; I also tried renaming dummy.h to dummy.cuh. I will probably hear that symbols are defined with file-local scope, but that alone probably will not help me much :) )
Here is the edited version, I am using VS2017 community edition with CUDA 10.1:
//dummy.h
#pragma once  // guard against double inclusion (was missing)
#include <cuda_runtime.h>

// Enqueues dummyBackTransferKernel on `stream`, copying the first `size`
// elements of the device-side __constant__ array `order` into d_array.
// d_array must be a device pointer with room for `size` floats;
// stream must be non-null. The call is asynchronous.
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream);
//dummy.cpp
#include "dummy.h"

#include <cstdio>
__constant__ float order[300];
// Integer ceiling division: returns the smallest integer >= x/y
// (for non-negative x and positive y).
inline int idivCeil(int x, int y)
{
    const int biased = x + y - 1;  // bias numerator so truncation rounds up
    return biased / y;
}
// Copies the first `size` entries of the file-level __constant__ array
// `order` into the device buffer d_array, one thread per element.
// Launch with at least `size` total threads; extra threads exit early.
__global__ void dummyBackTransferKernel(float* d_array, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size)
        return;  // tail threads past the data do nothing
    d_array[idx] = order[idx];
}
// Host-side launcher for dummyBackTransferKernel.
// d_array: device pointer with room for `size` floats.
// size:    element count; must not exceed the 300-entry `order` array,
//          or the kernel reads past the constant buffer.
// stream:  non-null CUDA stream the kernel is enqueued on (asynchronous).
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream)
{
    const dim3 blockSize(32);
    const dim3 gridSize(idivCeil(size, blockSize.x));
    dummyBackTransferKernel<<<gridSize, blockSize, 0, *stream>>>(d_array, size);
    // Kernel launches return no status directly; a bad configuration is
    // only visible through cudaGetLastError() (was previously unchecked).
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "dummyBackTransferKernel launch failed: %s\n",
                     cudaGetErrorString(err));
    }
}
//main.cpp
#include "dummy.h"
#include "../Preprocessor.h"
#include <iostream>
#define TEST_SIZE 250
int main(int argc, char** argv)
{
CHK_CUDA(cudaSetDevice(0));
cudaStream_t testStream;
cudaStreamCreate(&testStream);
std::cout << "Const test" << std::endl;
float* c_buf; //Host array as input
CHK_CUDA(cudaMallocHost((void**)&c_buf, sizeof(float) * TEST_SIZE));
float* ct_buf; //Host array to contain the symbols after cudaMemcpyToSymbol+cudaMemcpyFromSymbol
CHK_CUDA(cudaMallocHost((void**)&ct_buf, sizeof(float) * TEST_SIZE));
float* cd_buf; //Device array to contain result of a kernel using the constant memory
CHK_CUDA(cudaMalloc((void**)&cd_buf, sizeof(float) * TEST_SIZE));
float* ch_buf; //Result of the kernel copied back to host
CHK_CUDA(cudaMallocHost((void**)&ch_buf, sizeof(float) * TEST_SIZE));
for (int pp = 0; pp < TEST_SIZE; ++pp) {
c_buf[pp] = (float)rand() / RAND_MAX;
}
cudaMemcpyToSymbolAsync(order, c_buf, TEST_SIZE * sizeof(float), 0, cudaMemcpyHostToDevice, testStream);
cudaMemcpyFromSymbolAsync(ct_buf, order, TEST_SIZE * sizeof(float), 0, cudaMemcpyDeviceToHost, testStream);
dummyBackTransferStream(cd_buf, TEST_SIZE, &testStream);
CHK_CUDA(cudaMemcpy(ch_buf, cd_buf , sizeof(float) * TEST_SIZE, cudaMemcpyDeviceToHost));
cudaStreamSynchronize(testStream);
for (int pp = 0; pp < TEST_SIZE; ++pp) {
std::cout << c_buf[pp] << " " << ch_buf[pp] << " " << ct_buf[pp] << std::endl;
}
std::cout << "done!" << std::endl;
return 0;
}
The outcome
1>main.cpp(29): error C2065: 'order': undeclared identifier
1>main.cpp(30): error C2065: 'order': undeclared identifier
1> 0 Warning(s)
1> 2 Error(s)
1>
1>Time Elapsed 00:00:04.29
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========
Here is the related portion of the CMake file:
set(CUDA_NVCC_FLAGS
${CUDA_NVCC_FLAGS};
-gencode arch=compute_70,code=sm_70
-gencode arch=compute_62,code=sm_62
-gencode arch=compute_61,code=sm_61
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_35,code=sm_35
-use_fast_math
-rdc=true
)
Thank you for your help, it will be greatly appreciated.