I have been using the code sample supplied by Robert Crovella:

thrust::max_element slow in comparison cublasIsamax - More efficient implementation?

It is a very fast reduction code. I modified it to also return the index of the max in the input array of floats. When I use it in my code, it will only execute correctly one time. If I try calling the routine again, it does not find a new max value; it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?

#include <cuda.h>  
#include <cublas_v2.h>  
#include <thrust/extrema.h>  
#include <thrust/device_ptr.h>  
#include <thrust/device_vector.h>  
#include <stdio.h>  
#include <stdlib.h>  

#define DSIZE 4096*4  // nTPB should be a power-of-2  
#define nTPB 512  
#define MAX_KERNEL_BLOCKS 30  
#define MAX_BLOCKS ((DSIZE/nTPB)+1)  
#define MIN(a,b) ((a>b)?b:a)  
#define FLOAT_MIN -1.0f  

#include <helper_functions.h>  
#include <helper_cuda.h>  

// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];  
__device__ volatile int   blk_idxs[MAX_BLOCKS];  
__device__ int   blk_num = 0;  

//template <typename T>  
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){  

  __shared__ volatile float   vals[nTPB];  
  __shared__ volatile int idxs[nTPB];  
  __shared__ volatile int last_block;  
  int idx = threadIdx.x+blockDim.x*blockIdx.x;  
  last_block = 0;  
  float   my_val = FLOAT_MIN;  
  int my_idx = -1;  
  // sweep from global memory  
  while (idx < dsize){  
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}  
    idx += blockDim.x*gridDim.x;}  
  // populate shared memory  
  vals[threadIdx.x] = my_val;  
  idxs[threadIdx.x] = my_idx;  
  __syncthreads();  
  // sweep in shared memory  
  for (int i = (nTPB>>1); i > 0; i>>=1){  
    if (threadIdx.x < i)  
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }  
    __syncthreads();}  
  // perform block-level reduction  
  if (!threadIdx.x){  
    blk_vals[blockIdx.x] = vals[0];  
    blk_idxs[blockIdx.x] = idxs[0];  
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block  
      last_block = 1;}  
  __syncthreads();  
  if (last_block){  
    idx = threadIdx.x;  
    my_val = FLOAT_MIN;  
    my_idx = -1;  
    while (idx < gridDim.x){  
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }  
      idx += blockDim.x;}  
  // populate shared memory  
    vals[threadIdx.x] = my_val;  
    idxs[threadIdx.x] = my_idx;  
    __syncthreads();  
  // sweep in shared memory  
    for (int i = (nTPB>>1); i > 0; i>>=1){  
      if (threadIdx.x < i)  
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }  
      __syncthreads();}  
    if (!threadIdx.x)  
      *result = idxs[0];  
    }  
}  



int main(){  

  int nrElements = DSIZE;  
  float *d_vector, *h_vector;  

  StopWatchInterface *hTimer = NULL;  
  sdkCreateTimer(&hTimer);  
  double gpuTime;  
  int k;  
  int max_index;  
  int *d_max_index;  
  cudaMalloc(&d_max_index, sizeof(int));  


  h_vector = new float[DSIZE];  
  for(k=0; k < 5; k++){  
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;  
       h_vector[10+k] = 10;  // create definite max element that changes with each loop iteration   
   cublasHandle_t my_handle;  
   cublasStatus_t my_status = cublasCreate(&my_handle);  
   cudaMalloc(&d_vector, DSIZE*sizeof(float));  
   cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.  
       thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);  
       thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);  
       max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;  
       cudaDeviceSynchronize();  
       gpuTime = sdkGetTimerValue(&hTimer);  
       std::cout << "loop: " << k << "  thrust time:   " << gpuTime << " max index: " << max_index << std::endl;  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);  
       cudaDeviceSynchronize();  
       gpuTime = sdkGetTimerValue(&hTimer);   
       std::cout << "loop: " << k << "  cublas time:   " << gpuTime << " max index: " << max_index-1 << std::endl;  

       max_index = 0;  
       sdkResetTimer(&hTimer);  
       sdkStartTimer(&hTimer);  
       max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);  
       cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);  
       gpuTime = sdkGetTimerValue(&hTimer);  
       std::cout << "loop: " << k << "  idx kern time: " << gpuTime << " max index: " << max_index << std::endl;  
       std::cout <<  std::endl;  

  } // end for loop on k  

   cudaFree(d_max_index);  
   cudaFree(d_vector);  

  return 0;  
}  

1 Answer


The primary issue in re-using this code as-is across multiple loop iterations lies in this static initialization of a device (global) variable:

__device__ int   blk_num = 0;  

That's OK if you're only going to run the routine once. But if you intend to re-use it, you will need to re-initialize this variable to zero before each call to the kernel.

We could fix this by putting an explicit initialization of this variable to zero before each call to the reduction kernel:

cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));

(I'm using max_index here simply because it is a convenient host int variable that has just been set to zero.)

That's the only change needed to get the code "working".
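As a side note (not part of the fix above): if you'd rather not depend on a host variable that happens to be zero, the same reset can be written by zeroing the device symbol directly. A minimal sketch, to be placed before each launch of max_idx_kernel:

int *d_blk_num = NULL;
cudaGetSymbolAddress((void **)&d_blk_num, blk_num);  // get the device address of the __device__ counter (only needed once)
cudaMemset(d_blk_num, 0, sizeof(int));               // zero it before each kernel launch

The cudaGetSymbolAddress lookup only needs to happen once; only the cudaMemset has to be repeated per iteration.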

However, the introduction of the loop has created some other "issues" that I would point out. These 3 lines of code:

cublasHandle_t my_handle;  
cublasStatus_t my_status = cublasCreate(&my_handle);  
cudaMalloc(&d_vector, DSIZE*sizeof(float));  

don't belong inside the for-loop on k. Leaving them there effectively creates a memory leak (a new cudaMalloc on every iteration) and unnecessarily re-initializes the cublas library.

The following code has those changes and seems to work for me:

$ cat t1183.cu
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <stdio.h>
#include <stdlib.h>

#define DSIZE 4096*4  // nTPB should be a power-of-2
#define nTPB 512
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f

#include <helper_functions.h>
#include <helper_cuda.h>

// this code has been modified to return the index of the max instead of the actual max value - for my application
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int   blk_idxs[MAX_BLOCKS];
__device__ int   blk_num;

//template <typename T>
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){

  __shared__ volatile float   vals[nTPB];
  __shared__ volatile int idxs[nTPB];
  __shared__ volatile int last_block;
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  last_block = 0;
  float   my_val = FLOAT_MIN;
  int my_idx = -1;
  // sweep from global memory
  while (idx < dsize){
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
    idx += blockDim.x*gridDim.x;}
  // populate shared memory
  vals[threadIdx.x] = my_val;
  idxs[threadIdx.x] = my_idx;
  __syncthreads();
  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();}
  // perform block-level reduction
  if (!threadIdx.x){
    blk_vals[blockIdx.x] = vals[0];
    blk_idxs[blockIdx.x] = idxs[0];
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
      last_block = 1;}
  __syncthreads();
  if (last_block){
    idx = threadIdx.x;
    my_val = FLOAT_MIN;
    my_idx = -1;
    while (idx < gridDim.x){
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
      idx += blockDim.x;}
  // populate shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
  // sweep in shared memory
    for (int i = (nTPB>>1); i > 0; i>>=1){
      if (threadIdx.x < i)
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
      __syncthreads();}
    if (!threadIdx.x)
      *result = idxs[0];
    }
}



int main(){

  int nrElements = DSIZE;
  float *d_vector, *h_vector;

  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  double gpuTime;
  int k;
  int max_index;
  int *d_max_index;
  cudaMalloc(&d_max_index, sizeof(int));


  h_vector = new float[DSIZE];
  cublasHandle_t my_handle;
  cublasStatus_t my_status = cublasCreate(&my_handle);
  cudaMalloc(&d_vector, DSIZE*sizeof(float));
  for(k=0; k < 5; k++){
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
       h_vector[10+k] = 10;  // create definite max element that changes with each loop iteration
    cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
       //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
    thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
    max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  thrust time:   " << gpuTime << " max index: " << max_index << std::endl;

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  cublas time:   " << gpuTime << " max index: " << max_index-1 << std::endl;

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    cudaMemcpyToSymbol(blk_num, &max_index, sizeof(int));
    max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << "  idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
    std::cout <<  std::endl;

  } // end for loop on k

   cudaFree(d_max_index);
   cudaFree(d_vector);

  return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1183.cu -o t1183 -lcublas
$ cuda-memcheck ./t1183
========= CUDA-MEMCHECK
loop: 0  thrust time:   2.806 max index: 10
loop: 0  cublas time:   0.441 max index: 10
loop: 0  idx kern time: 0.395 max index: 10

loop: 1  thrust time:   1.298 max index: 11
loop: 1  cublas time:   0.419 max index: 11
loop: 1  idx kern time: 0.424 max index: 11

loop: 2  thrust time:   1.303 max index: 12
loop: 2  cublas time:   0.43 max index: 12
loop: 2  idx kern time: 0.419 max index: 12

loop: 3  thrust time:   1.291 max index: 13
loop: 3  cublas time:   0.423 max index: 13
loop: 3  idx kern time: 0.415 max index: 13

loop: 4  thrust time:   1.299 max index: 14
loop: 4  cublas time:   0.423 max index: 14
loop: 4  idx kern time: 0.417 max index: 14

========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • Isn't there overhead in calling the cudaMemcpyToSymbol() since this is a copy from Host to Device? Would it be better to just reset blk_num to zero when we reach the if(last_blk){ } section of the kernel code? – brumby Jul 05 '16 at 18:41
  • Yes, you should be able to make that work also. In that case you would want to add back in the static initialization for `blk_num` (`=0;`) that I removed in the above code. – Robert Crovella Jul 05 '16 at 18:54
  • @RobertCrovella Hello Robert. Thank you for providing this kernel. I do have one concern. You are checking `atomicAdd(&blk_num, 1) == gridDim.x - 1`. It seems to me that it should be checking for `gridDim.x` instead of `gridDim.x - 1`. For example, if I have 256 blocks and each one adds 1 to a running count that starts at 0, then after all blocks have finished, the running count should be 256, which would be equivalent to `gridDim.x`, and not 255, which would be `gridDim.x - 1`. If I am mistaken I would greatly appreciate any insight you can give into that line of code. – Dillydill123 May 22 '19 at 19:49
  • Never mind I have figured it out. I read the documentation for atomicAdd and realized even though it calculates `old + val` it returns `old`. Sorry for assuming things :) – Dillydill123 May 22 '19 at 19:55
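Following up on the in-kernel reset idea from the comments above, here is a minimal, self-contained sketch (my own illustration, not the code from the answer) of the pattern where the last block to arrive zeroes blk_num itself, so the kernel can be relaunched without any host-side reset. It also shows why the gridDim.x - 1 comparison is correct: atomicAdd returns the old value of the counter.

#include <cstdio>
#include <cuda_runtime.h>

__device__ int blk_num = 0;          // static init only covers the first launch

__global__ void last_block_demo(int *last_block_id){
  if (threadIdx.x == 0){
    // atomicAdd returns the OLD value, so the block that observes
    // gridDim.x - 1 is the last one to arrive.
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1){
      *last_block_id = blockIdx.x;   // "last block" work would go here
      blk_num = 0;                   // reset the counter so the next launch starts from zero
    }
  }
}

int main(){
  int *d_id, h_id;
  cudaMalloc(&d_id, sizeof(int));
  for (int k = 0; k < 3; k++){       // relaunching works without any host-side reset
    last_block_demo<<<8, 32>>>(d_id);
    cudaMemcpy(&h_id, d_id, sizeof(int), cudaMemcpyDeviceToHost);
    printf("launch %d: last block to finish was %d\n", k, h_id);
  }
  cudaFree(d_id);
  return 0;
}

The reset is safe within a launch because the block that sees gridDim.x - 1 only runs after every other block has already done its atomicAdd, and launches on the same stream are serialized, so the zeroed value is visible to the next launch.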