I have been using the code sample supplied by Robert Crovella in:
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
which is a very fast reduction. I modified it to also return the index of the maximum in the input array of floats. When I use it in my code, it works only the first time: if I call the routine again it does not find the new max value, it just returns the previous max. Is there something about the volatile global memory that the routine uses that needs to be reset before it can be called again?
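My guess is that the global block counter (blk_num in the code below) is the state that goes stale, and that I may need to zero it from the host before every launch. A minimal sketch of what I have in mind (untested, and assuming cudaMemcpyToSymbol is the right way to write to a __device__ variable from the host):

// hypothetical host-side reset before each launch of max_idx_kernel
int zero = 0;
cudaMemcpyToSymbol(blk_num, &zero, sizeof(int));   // zero the device-side block counter
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);

The complete test code follows: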
#include <cuda.h>
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <helper_functions.h>  // CUDA sample helpers (StopWatchInterface timing)
#include <helper_cuda.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

#define DSIZE (4096*4)
#define nTPB 512                       // threads per block; should be a power of 2
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) (((a)>(b))?(b):(a))   // fully parenthesized to avoid macro-expansion surprises
#define FLOAT_MIN -1.0f
// this code has been modified to return the index of the max instead of the actual max value - for my application

// Scratch state in global memory: per-block partial results, plus a counter used to
// detect which block finishes last. Note that these __device__ variables persist
// across kernel launches.
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;   // initialized once at module load
__global__ void max_idx_kernel(const float *data, const int dsize, int *result){

  __shared__ volatile float vals[nTPB];
  __shared__ volatile int idxs[nTPB];
  __shared__ volatile int last_block;

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  last_block = 0;
  float my_val = FLOAT_MIN;
  int my_idx = -1;
  // sweep from global memory
  while (idx < dsize){
    if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
    idx += blockDim.x*gridDim.x;}
  // populate shared memory
  vals[threadIdx.x] = my_val;
  idxs[threadIdx.x] = my_idx;
  __syncthreads();
  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();}
  // write this block's result, then use an atomic counter to find the last block
  if (!threadIdx.x){
    blk_vals[blockIdx.x] = vals[0];
    blk_idxs[blockIdx.x] = idxs[0];
    __threadfence();   // ensure the partial result is visible to other blocks before signaling
    if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
      last_block = 1;}
  __syncthreads();
  // the last block reduces the per-block partial results
  if (last_block){
    idx = threadIdx.x;
    my_val = FLOAT_MIN;
    my_idx = -1;
    while (idx < gridDim.x){
      if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
      idx += blockDim.x;}
    // populate shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
    // sweep in shared memory
    for (int i = (nTPB>>1); i > 0; i>>=1){
      if (threadIdx.x < i)
        if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
      __syncthreads();}
    if (!threadIdx.x)
      *result = idxs[0];
  }
}
int main(){

  int nrElements = DSIZE;
  float *d_vector, *h_vector;
  StopWatchInterface *hTimer = NULL;
  sdkCreateTimer(&hTimer);
  double gpuTime;
  int k;
  int max_index;
  int *d_max_index;
  cudaMalloc(&d_max_index, sizeof(int));
  h_vector = new float[DSIZE];
  // create the cuBLAS handle and the device buffer once, outside the loop,
  // so they are not re-created (and leaked) on every iteration
  cublasHandle_t my_handle;
  cublasStatus_t my_status = cublasCreate(&my_handle);
  cudaMalloc(&d_vector, DSIZE*sizeof(float));

  for (k = 0; k < 5; k++){
    for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
    h_vector[10+k] = 10; // create a definite max element that changes with each loop iteration
    cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);

    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    // d_vector points to the beginning of the device vector, containing nrElements floats
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
    thrust::device_ptr<float> d_max = thrust::max_element(d_ptr, d_ptr + nrElements);
    max_index = d_max - d_ptr;
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " thrust time: " << gpuTime << " max index: " << max_index << std::endl;
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
    cudaDeviceSynchronize();
    gpuTime = sdkGetTimerValue(&hTimer);
    // cublasIsamax returns a 1-based index, hence the -1
    std::cout << "loop: " << k << " cublas time: " << gpuTime << " max index: " << max_index-1 << std::endl;
    max_index = 0;
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost); // blocks until the kernel is done
    gpuTime = sdkGetTimerValue(&hTimer);
    std::cout << "loop: " << k << " idx kern time: " << gpuTime << " max index: " << max_index << std::endl;
    std::cout << std::endl;
  } // end for loop on k
  cublasDestroy(my_handle);
  sdkDeleteTimer(&hTimer);
  cudaFree(d_max_index);
  cudaFree(d_vector);
  delete[] h_vector;
  return 0;
}
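Alternatively, would it be valid for the last block to reset the counter itself, so the kernel can be relaunched without any host-side bookkeeping? Roughly like this at the end of the kernel (a sketch, untested; I believe the threadFenceReduction sample in the CUDA SDK resets its retirementCount this way):

// hypothetical change at the end of max_idx_kernel: thread 0 of the last
// (and only remaining) block zeroes blk_num after writing the result,
// so the next launch starts from a clean count
if (!threadIdx.x){
  *result = idxs[0];
  blk_num = 0;   // reset for the next launch
}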