I want to modify a piece of mapped memory from the host WHILE the kernel is executing, and then read this value from the kernel.
I'm trying to do this in the following way. I have a kernel like this:
// Stores 1 to the int at d_ptr, then busy-waits until a read of *d_ptr yields 0.
//
// d_ptr: pointer that the kernel both writes and polls; the surrounding text
//        describes it as mapped to memory that the host can also access.
//
// NOTE(review): d_ptr is not volatile-qualified, so the compiler is presumably
// free to load *d_ptr once and keep testing that cached value. That would match
// the reported behaviour of never observing the 0 written by the host; confirm
// against the generated code.
__global__ void kernel(int* d_ptr)
{
// Raise the flag before polling it.
*d_ptr = 1;
// Empty-bodied spin loop: leaves only when *d_ptr reads as 0.
while( *d_ptr);
}
d_ptr is mapped to a piece of memory accessible from the host.
I also have a host thread that looks like this:
// Host thread entry point: keeps storing 0 to the shared flag for as long as
// kernel_running is set.
//
// input: host pointer to the int flag, passed as void* by the thread API.
void run( void* input )
{
    // Volatile store: every iteration must emit a real write to memory, because
    // the reader of this location is outside this thread.
    volatile int* flag = (int*)input;

    // kernel_running is changed by another thread. Reading it through a
    // volatile reference forces a fresh load per iteration instead of a
    // hoisted one. (std::atomic<bool> would be the portable choice; it is
    // avoided here to keep this snippet free of extra headers.)
    const volatile bool& keep_going = kernel_running;

    while (keep_going)
    {
        *flag = 0;
    }
}
So, the host thread is repeatedly writing a 0 to a place that the kernel is repeatedly reading from until it sees a 0. Theoretically, the kernel should stop right after it reads the value written by the host thread. The problem is that the kernel never reads this 0, so it never terminates.
The weird part is that if I add a print statement in the kernel like this:
// Variant of the polling kernel with a device-side printf in the loop body:
// stores 1 to *d_ptr, then prints *d_ptr for as long as *d_ptr reads non-zero.
//
// d_ptr: pointer that the kernel both writes and polls; the surrounding text
//        describes it as mapped to memory that the host can also access.
//
// NOTE(review): the surrounding text reports that this variant does terminate.
// Presumably the printf call stops the compiler from reusing an earlier load of
// *d_ptr; without a volatile qualifier that is a side effect, not a guarantee.
__global__ void kernel(int* d_ptr)
{
// Raise the flag before polling it.
*d_ptr = 1;
// *d_ptr is read twice per iteration: once for the loop test and once as the
// printf argument.
while( *d_ptr) printf("%d\n", *d_ptr);
}
Then it does read the 0 and terminates. I have no clue what's happening. The CUDA Programming Guide does not provide much information about concurrent access to mapped memory, and it's been pretty hard to find a question that addresses this either. Any pointers? I'm using Windows and I'm compiling with:
nvcc -g -arch=sm_20 -lineinfo
The whole code looks like this:
// Stop flag shared between main() and the host writer thread run().
// volatile: it is written by one thread and polled by another, so every access
// must be a real load or store rather than a value cached in a register.
volatile bool kernel_running = false;
// Stores 1 to the flag at d_ptr, then spins until the flag reads as 0, printing
// each non-zero value it observes.
//
// d_ptr: device-side pointer to an int flag that is also written from outside
//        the kernel while the kernel is running (main() passes the device alias
//        of a mapped host allocation).
//
// Launch layout: written for a single polling thread (launched as <<<1,1>>>).
__global__ void kernel(int* d_ptr)
{
    // The flag changes behind the compiler's back, so it must be accessed
    // through a volatile pointer. With a plain int* the compiler may load the
    // value once and spin on a register copy, never seeing the external write.
    volatile int* flag = d_ptr;

    *flag = 1;

    // One load per iteration: the value that is printed is exactly the value
    // that was tested, so a 0 is never printed.
    int observed;
    while ((observed = *flag) != 0)
    {
        printf("%d\n", observed);
    }
}
// Host thread entry point: keeps storing 0 to the shared flag for as long as
// kernel_running is set.
//
// input: host pointer to the int flag, passed as void* by the thread API.
void run( void* input )
{
    // Volatile store: every iteration must emit a real write to memory, because
    // the reader of this location is the running kernel, not this thread.
    volatile int* flag = (int*)input;

    // kernel_running is cleared by main() on another thread. Reading it through
    // a volatile reference forces a fresh load per iteration instead of a
    // hoisted one. (std::atomic<bool> would be the portable choice; it is
    // avoided here to keep this block free of extra headers.)
    const volatile bool& keep_going = kernel_running;

    while (keep_going)
    {
        *flag = 0;
    }
}
// Allocates one mapped, page-locked int, starts the host writer thread run(),
// launches kernel<<<1,1>>> on the device alias of that int, and waits for the
// kernel to finish.
//
// Returns 0 on success, 1 if the thread could not be started or any CUDA call
// reported an error.
int main()
{
    // HOST AND DEVICE POINTERS
    int* h_ptr = 0;
    int* d_ptr = 0;
    cudaError_t status;

    // INITIALIZE POINTERS
    // The CUDA calls are made outside assert(): with NDEBUG defined, assert()
    // discards its argument and the calls would silently never run.
    status = cudaHostAlloc(&h_ptr, sizeof(int), cudaHostAllocMapped);
    if (status != cudaSuccess)
    {
        printf("cudaHostAlloc failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    status = cudaHostGetDevicePointer(&d_ptr, h_ptr, 0);
    if (status != cudaSuccess)
    {
        printf("cudaHostGetDevicePointer failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // RUN KERNEL
    kernel_running = 1;

    // Without the writer thread the kernel would spin forever, so a failed
    // thread start must stop the program before the launch.
    if (_beginthread( run, 0, h_ptr) == (uintptr_t)-1)
    {
        kernel_running = 0;
        printf("_beginthread failed\n");
        return 1;
    }

    kernel<<<1,1>>>(d_ptr);

    // A launch reports configuration errors only through cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronize call.
    status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    // Stop the writer thread on both the success and the failure path.
    kernel_running = 0;

    if (status != cudaSuccess)
    {
        printf("kernel failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // h_ptr is deliberately not freed: run() is never joined and may still be
    // storing through it. The allocation is reclaimed when the process exits.
    return 0;
}