1

Updated, I wrote a test program to test the following idea:

  1. a cuda stream copies data to gpu
  2. a second cuda stream reads and processes data.
  3. The first stream copies the next piece of data only after the previous piece has been processed by the second stream.

However, it does not work: it only copies the first piece of data and then waits there forever.

#include "cuda.h"
#include <iostream>
#include <pthread.h>

// Hand-shake state values shared between the host copier thread and the
// device kernel.
const int UNPROCESSED = 1;   // item has been copied in; kernel may consume it
const int PROCESSED = 2;     // kernel finished the item; host may send the next
const int DONE = 3;          // host has no more items to send
const int RUNNING= 0;        // initial "neither" state (also used for cudaMemset)
const int NUM_OF_DATA = 100;       // number of items the host feeds the GPU
const int NUM_OF_BLOCKS = 1;       // single-block launch
const int THREADS_PER_BLOCK = 1;   // single-thread kernel

//int data_states[NUM_OF_DATA];
cudaStream_t cuda_stream[2];    // [0] = copy stream (copy_data), [1] = compute stream (kernel)
volatile int* process_state;    // mapped pinned host memory: host<->device hand-shake flag
volatile int* d_process_state;  // device alias of process_state (via cudaHostGetDevicePointer)
volatile int* d_copier_state;   // device memory: host writes DONE here to stop the kernel
int* d_data_state;              // per-item state the kernel polls
                                // NOTE(review): never cudaMalloc'ed in main() -- bug
int* h_data_states;             // pinned host array of per-item states
cudaError_t cuda_status;        // shared status of the most recent CUDA call

using namespace std;

// Host thread body: feeds NUM_OF_DATA items to the GPU one at a time.
// Spins until the kernel marks the previous item consumed (via the mapped
// *process_state flag), then enqueues the next H2D copy on cuda_stream[0].
// After the last item it writes DONE into d_copier_state so the kernel's
// polling loop can exit.
// NOTE(review): operates entirely on file-scope globals; `arg` is unused.
void* copy_data(void* arg){
   (void)arg;  // data comes from the global h_data_states, not the argument
   int i = 0;
   while(i < NUM_OF_DATA){
      // Wait until the kernel has consumed the previously copied item.
      if (*process_state != UNPROCESSED){
         cout << "Now copy data " << i << " with state = " << h_data_states[i] <<  endl;
         *process_state = UNPROCESSED;
         cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
         if (cuda_status != cudaSuccess){
            // BUG FIX: message previously claimed a pinned-host allocation failed.
            cout << "Error in cudaMemcpyAsync (copy_data), item " << i << endl;
         }
         i++;
      }
   }
   // Signal the kernel that no more data is coming.
   int copier_state = DONE;
   cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
   // BUG FIX: copier_state is a stack local handed to an *async* copy; wait
   // for the copy to complete before this thread's stack disappears.
   cudaStreamSynchronize(cuda_stream[0]);
   // BUG FIX: function is declared void* but had no return statement (UB).
   return NULL;
}

// Single-thread kernel (launched <<<1,1>>> on cuda_stream[1]): consumes the
// items the host copier thread feeds through *data_state, until the host
// writes DONE into *copier_state.  All three pointers are volatile so each
// poll re-reads memory instead of a cached value.
__global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
   int i = 0;
   printf(" i = %d\n", i);
   while(*copier_state != DONE){
      printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
      if(*data_state == UNPROCESSED){
        printf("now processing data %d\n", i);
        i++;
        // process data here, skipped
        // BUG FIX: mark the item consumed *before* raising the host-visible
        // hand-shake flag; the old order let the host enqueue the next item
        // and then had this stale PROCESSED write race with the new copy.
        *data_state = PROCESSED;
        // BUG FIX: system-wide fence re-enabled -- process_state lives in
        // mapped (zero-copy) host memory, so the write must be made visible
        // to the host, not just to the device.
        __threadfence_system();
        *process_state = PROCESSED;
      }
   }
   printf("process_data is done\n");
}

// Driver: sets up mapped/pinned/device buffers, starts the host copier
// thread and the polling kernel on separate streams, then joins and cleans up.
int main(int argc, char **argv){
  int i;

  // Must be set before the CUDA context is created for zero-copy mapping.
  cudaSetDeviceFlags(cudaDeviceMapHost);

  // Hand-shake flag in mapped (zero-copy) pinned host memory.
  cuda_status = cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int),  cudaHostAllocMapped);
  if (cuda_status != cudaSuccess){
      cout << "Error when allocating pinned host memory (process_state)" << endl;
  }
  cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
  // BUG FIX: initialize the flag -- copy_data read it uninitialized.
  *process_state = PROCESSED;

  cuda_status = cudaMalloc((void**) &d_copier_state, NUM_OF_BLOCKS*sizeof(int));
  if (cuda_status != cudaSuccess){
      cout << "Error when allocating device memory (d_copier_state)" << endl;
  }
  // Byte-wise memset is only correct here because RUNNING == 0.
  cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));

  // BUG FIX: d_data_state was never allocated, so the kernel polled an
  // uninitialized pointer -- the likely cause of the observed hang.
  cuda_status = cudaMalloc((void**) &d_data_state, sizeof(int));
  if (cuda_status != cudaSuccess){
      cout << "Error when allocating device memory (d_data_state)" << endl;
  }
  cudaMemset((void*) d_data_state, RUNNING, sizeof(int));

  cuda_status = cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
  if (cuda_status != cudaSuccess){
      cout << "Error when allocating pinned host memory (h_data_states)" << endl;
  }
  for(i = 0; i < NUM_OF_DATA; i++){
     h_data_states[i] = UNPROCESSED;
  }

  cudaStreamCreate(&cuda_stream[0]);   // copy stream (used by copy_data)
  cudaStreamCreate(&cuda_stream[1]);   // compute stream (kernel)

  pthread_t thread;
  int thread_state = pthread_create(&thread, NULL, &copy_data, h_data_states);
  if(thread_state){
     cout << "Error: unable to create thread (copy_data), "<< thread_state << endl;
     exit(-1);
  }

  process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
  // BUG FIX: kernel launches do not return errors directly; check explicitly.
  cuda_status = cudaGetLastError();
  if (cuda_status != cudaSuccess){
      cout << "Kernel launch failed: " << cudaGetErrorString(cuda_status) << endl;
  }

  // BUG FIX: join the copier thread before tearing anything down; it issues
  // async copies and must not be abandoned mid-flight.
  pthread_join(thread, NULL);

  cudaDeviceSynchronize();
  cudaFree(d_data_state);
  cudaFree((void*) d_copier_state);
  cudaFreeHost((void*) process_state);
  cudaFreeHost(h_data_states);   // BUG FIX: was leaked

  return 0;
}

My program has a variable (state) in mapped memory (cudaMallocHost with cudaHostAllocMapped flag). On CPU, the variable is accessed by a pointer (state_pointer), while on gpu, the corresponding pointer is d_state_pointer.

CPU sets the variable to be UNPROCESSED via state_pointer, then gpu checks the d_state_variable: if it is UNPROCESSED, process something and then change it to PROCESSED.

My problems are

  1. it seems that the gpu does not read the correct value of d_state_pointer, so it is always waiting for d_state_pointer to change to UNPROCESSED.
  2. However, if a statement if (threadIdx.x==0) printf("weird\n"); is added in the while loop, it reads the correct value of d_state_pointer (which should be UNPROCESSED) and then continue.

Cuda docs mentions that printf could change the order of thread executions. But I do not understand why the correct value of d_state_pointer can not be read without the if-printf statement? Moreover, without the if(threadIdx.x==0), the printf statement itself does not help. On the other hand, without printf, the if(threadIdx.x==0) also does not help.

Any suggestions?

BhushanK
  • 1,205
  • 6
  • 23
  • 39
Yang Liu
  • 85
  • 7

1 Answers1

3

Probably d_state_pointer is not volatile. That means the GPU is free to cache the value in L2, and has no idea if/when the host updated it.

Make it a volatile variable/pointer instead. This makes the GPU code retrieve the value from the source, rather than from the cache, on each access. Furthermore, to make sure updates go in the other direction, use __threadfence()

If you need help, take a look at this answer, or post a short, complete, compilable code that demonstrates the issue (SO expects this).

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Does volatile work for a pointer which points to the starting address of an array? Will array elements never be cached? – Yang Liu Aug 14 '14 at 20:08
  • Yes, `volatile` [can be a type applied to a pointer](http://stackoverflow.com/questions/9935190/why-is-a-point-to-volatile-pointer-like-volatile-int-p-useful). Any element referenced from that pointer will be treated as `volatile`. – Robert Crovella Aug 14 '14 at 20:21
  • 1
    Please read my answer and the various links. [The (answer) code I linked](http://stackoverflow.com/questions/20345702/how-can-i-check-the-progress-of-matrix-multiplication/20381924#20381924) demonstrates that a volatile pointer can be passed as a kernel parameter. – Robert Crovella Aug 14 '14 at 20:22
  • Below is a simple test program to test the following idea:one stream to copy data in a host thread, and the second stream to read and process data. But the program just copy one data and then wait for the gpu to process. The gpu seems not reading and processing the copied data. – Yang Liu Aug 15 '14 at 15:03
  • If you're still having trouble, start a new question, and post a short, *complete* code, that demonstrates the problem. The code should be something that someone can copy, paste, and compile, without adding anything or changing anything. If you can do that, I'm sure someone can help you. – Robert Crovella Aug 15 '14 at 15:19