I'm using the CUDA library function `cudppScan` to calculate the prefix sum of an array of elements.
It goes well when input data size is small.
But when data size is greater than about 700,000 some elements of the output turn out to be wrong values.
I'm running `cudppScan` on an array of all ones, so the output should be 1, 2, 3, 4, ...
Here is my code:
// Runs an inclusive forward add-scan (prefix sum) over an array of ones with
// CUDPP and verifies on the host that the result is 1, 2, 3, ...
//
// argc/argv: the program's command-line arguments; a "device" flag selects
//            the CUDA device, otherwise cutGetMaxGflopsDeviceId() picks one.
//
// Every CUDPP call is checked against CUDPP_SUCCESS and reported on failure,
// and the scan plan is destroyed before returning. CUDA runtime calls go
// through CUDA_SAFE_CALL (presumably aborts on error -- confirm against the
// cutil version in use).
void
runTest( int argc, char** argv)
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    const int    num_elements = 670000;
    const size_t num_bytes    = sizeof(int) * num_elements;

    int *h_isCommon   = NULL;
    int *d_isCommon   = NULL;
    int *d_scan_odata = NULL;

    h_isCommon = (int *) malloc(num_bytes);
    if (h_isCommon == NULL) {
        printf("error: could not allocate %lu bytes of host memory\n",
               (unsigned long) num_bytes);
        cudaThreadExit();
        return;
    }

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_isCommon, num_bytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_scan_odata, num_bytes));

    // Input is all ones, so the inclusive scan must produce 1, 2, 3, ...
    for (int i = 0; i < num_elements; i++)
        h_isCommon[i] = 1;

    CUDA_SAFE_CALL(cudaMemcpy(d_isCommon, h_isCommon, num_bytes,
                              cudaMemcpyHostToDevice));

    CUDPPConfiguration config;
    config.op        = CUDPP_ADD;
    config.datatype  = CUDPP_INT;
    config.algorithm = CUDPP_SCAN;
    config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

    // Size the plan from num_elements so the plan capacity can never be
    // smaller than the array actually scanned.
    CUDPPHandle scanplan = 0;
    CUDPPResult result_cudpp = cudppPlan(&scanplan, config, num_elements, 1, 0);

    if (result_cudpp != CUDPP_SUCCESS) {
        printf("error: cudppPlan failed (CUDPPResult %d)\n", (int) result_cudpp);
    } else {
        result_cudpp = cudppScan(scanplan, d_scan_odata, d_isCommon, num_elements);

        if (result_cudpp != CUDPP_SUCCESS) {
            printf("error: cudppScan failed (CUDPPResult %d)\n", (int) result_cudpp);
        } else {
            CUDA_SAFE_CALL(cudaThreadSynchronize());
            CUDA_SAFE_CALL(cudaMemcpy(
                h_isCommon,
                d_scan_odata,
                num_bytes,
                cudaMemcpyDeviceToHost));

            // The neighbour comparison below cannot detect a wrong starting
            // value, so check the first element explicitly.
            if (h_isCommon[0] != 1)
                printf("error: first element is %d, expected 1\n", h_isCommon[0]);

            for (int i = 1; i < num_elements; i++) {
                if (h_isCommon[i] != h_isCommon[i - 1] + 1)
                    printf("error %d, %d\n", h_isCommon[i], h_isCommon[i - 1]);
            }
            printf("\n");
        }

        // Release the plan on both the success and the scan-failure path.
        result_cudpp = cudppDestroyPlan(scanplan);
        if (result_cudpp != CUDPP_SUCCESS)
            printf("error: cudppDestroyPlan failed (CUDPPResult %d)\n", (int) result_cudpp);
    }

    CUDA_SAFE_CALL(cudaFree(d_isCommon));
    CUDA_SAFE_CALL(cudaFree(d_scan_odata));
    free(h_isCommon);
    cudaThreadExit();
}
Please help me find what I did wrong. Thanks in advance.