I'm trying to do an exclusive prefix sum (scan) in CUDA. I am using the CUB library and have decided to try cub::DeviceScan::ExclusiveSum. However, part of my result is NaN, and I can't figure out why.
Code is:
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using std::cout;
using std::endl;
#define DSIZE 512
// Prints a diagnostic and terminates the program if a CUDA runtime or CUB
// call reported anything other than cudaSuccess.
static void cumsum_check( cudaError_t status, const char *what ) {
    if (status != cudaSuccess) {
        fprintf(stderr, "dev_cumsum: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes the exclusive prefix sum of `n` floats with cub::DeviceScan.
//
//   dev_inData  - device pointer to `n` input values (only read).
//   dev_outData - device pointer to `n` output values; element i receives the
//                 sum of inputs [0, i), so element 0 is 0.
//   n           - number of elements to scan; defaults to DSIZE so existing
//                 two-argument callers keep their behavior.
//
// The scratch buffer CUB needs is allocated and released inside this call.
// Any CUDA/CUB failure aborts the program with a message on stderr.
void dev_cumsum( const float *dev_inData, float *dev_outData, int n = DSIZE ) {
    if (n <= 0) {
        return;  // nothing to scan
    }

    // const_cast kept from the original call sites of ExclusiveSum.
    float *dev_in = const_cast<float*>(dev_inData);

    void*  dev_temp_storage   = NULL;
    size_t temp_storage_bytes = 0;

    // First call with a NULL workspace only reports the required size in
    // temp_storage_bytes; no scan is performed.
    cumsum_check(
        cub::DeviceScan::ExclusiveSum(dev_temp_storage, temp_storage_bytes, dev_in, dev_outData, n),
        "ExclusiveSum (size query)");

    cumsum_check(cudaMalloc(&dev_temp_storage, temp_storage_bytes), "cudaMalloc (temp storage)");

    // Second call performs the actual scan using the allocated workspace.
    cumsum_check(
        cub::DeviceScan::ExclusiveSum(dev_temp_storage, temp_storage_bytes, dev_in, dev_outData, n),
        "ExclusiveSum (scan)");

    // Wait for the scan to finish so that execution errors are reported here
    // and the workspace is no longer in use when it is released.
    cumsum_check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // The original never released the workspace, leaking it on every call.
    cumsum_check(cudaFree(dev_temp_storage), "cudaFree (temp storage)");
}
// Prints a diagnostic and terminates the program if a CUDA runtime call
// reported anything other than cudaSuccess.
static void main_check( cudaError_t status, const char *what ) {
    if (status != cudaSuccess) {
        fprintf(stderr, "main: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Test driver: fills DSIZE floats with small random integers, computes the
// exclusive prefix sum on the CPU and on the GPU (via dev_cumsum), then prints
// the last element of each followed by the full GPU result.
int main(){
    const size_t bytes = DSIZE*sizeof(float);

    float h_data[DSIZE];
    float h_result[DSIZE];
    float* d_data   = NULL;
    float* d_result = NULL;

    main_check(cudaMalloc(&d_data, bytes), "cudaMalloc (d_data)");
    main_check(cudaMalloc(&d_result, bytes), "cudaMalloc (d_result)");

    // CPU reference: exclusive scan, so h_result[i] is the sum of h_data[0..i-1].
    h_data[0] = rand()%10;
    h_result[0] = 0;
    for (int i=1; i<DSIZE; i++) {
        h_data[i] = rand()%10;
        h_result[i] = h_data[i-1]+h_result[i-1];
    }

    main_check(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    dev_cumsum(d_data, d_result);

    // Print the CPU value before h_result is overwritten with the GPU output.
    printf("CPU result = %f\n", h_result[DSIZE-1]);
    main_check(cudaMemcpy(h_result, d_result, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    printf("GPU result = %f\n", h_result[DSIZE-1]);

    for( int i = 0; i < DSIZE; i++ ) {cout << h_result[i] << " ";}
    cout << endl;

    // Release device buffers; the original left them allocated until exit.
    main_check(cudaFree(d_data), "cudaFree (d_data)");
    main_check(cudaFree(d_result), "cudaFree (d_result)");
    return 0;
}
This code gives me NaN for the last 8 elements of the device result.
This code is running on a GTX650 Ti Boost in Linux Mint15. I'm using NSight and the console output compile command is:
Invoking: NVCC Compiler
/usr/local/cuda-5.5/bin/nvcc -G -g -O0 -gencode arch=compute_30,code=sm_30 -odir "" -M -o "main.d" "../main.cu"
/usr/local/cuda-5.5/bin/nvcc --device-c -G -O0 -g -gencode arch=compute_30,code=compute_30 -gencode arch=compute_30,code=sm_30 -x cu -o "main.o" "../main.cu"
The CUDA version is 5.5 and the CUB version is 1.0.2.
This was also tested on another computer (CUDA 6, OS X 10.9.2, CUB 1.2.3, GT 750M), which reproduced the error: the last 8 numbers are NaN.
edit: The code works correctly with int and double, but not float.
edit: This question was originally asked about DeviceReduce. With thanks to Robert Crovella: that code worked; it was only producing NaN because earlier code using DeviceScan was feeding it NaN as input. The question has been revised accordingly.