1

I'm using CUDPP's `cudppScan` to calculate the prefix sum of an array of elements. It works correctly when the input size is small, but when the input size is greater than about 700,000, some elements of the output have wrong values.

I'm running `cudppScan` on an array of all ones, so the output should be 1, 2, 3, 4, ....

Here is my code:

/**
 * Runs an inclusive forward add-scan over an array of ones with CUDPP and
 * verifies the result on the host.
 *
 * With every input element equal to 1, the inclusive prefix sum must be
 * 1, 2, 3, ...; any deviation is reported on stdout as an "error" line.
 *
 * CUDA runtime failures are delegated to the CUDA_SAFE_CALL macro (defined
 * outside this block). CUDPP failures are reported explicitly, because CUDPP
 * returns its own CUDPPResult status that CUDA_SAFE_CALL does not inspect.
 *
 * @param argc  command-line argument count, forwarded to the cutil helpers
 * @param argv  command-line arguments; a "device" flag selects the CUDA device
 */
void
runTest( int argc, char** argv) 
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        CUDA_SAFE_CALL(cudaSetDevice( cutGetMaxGflopsDeviceId() ));

    int num_elements = 670000;
    size_t num_bytes = sizeof(int) * num_elements;

    // Host buffer: holds the input (all ones) and is reused for the scan output.
    int *h_isCommon = (int *) malloc(num_bytes);
    if(h_isCommon == NULL) {
        printf("error: failed to allocate host buffer for %d elements\n", num_elements);
        cudaThreadExit();
        return;
    }

    int *d_isCommon   = NULL;
    int *d_scan_odata = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_isCommon,   num_bytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_scan_odata, num_bytes));

    for(int i = 0; i < num_elements; i++) h_isCommon[i] = 1;
    CUDA_SAFE_CALL(cudaMemcpy(d_isCommon, h_isCommon, num_bytes, 
        cudaMemcpyHostToDevice));

    CUDPPConfiguration config;
    config.op           = CUDPP_ADD;
    config.datatype     = CUDPP_INT;
    config.algorithm    = CUDPP_SCAN;
    config.options      = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

    // The plan is sized for up to 4000000 elements in a single row.
    CUDPPHandle scanplan = 0;
    CUDPPResult result_cudpp = cudppPlan(&scanplan, config, 4000000, 1, 0);

    if(result_cudpp != CUDPP_SUCCESS) {
        // Without a valid plan the scan cannot run; report instead of scanning blindly.
        printf("error: cudppPlan failed with code %d\n", (int)result_cudpp);
    } else {
        result_cudpp = cudppScan(scanplan, d_scan_odata, d_isCommon, num_elements);

        if(result_cudpp != CUDPP_SUCCESS) {
            printf("error: cudppScan failed with code %d\n", (int)result_cudpp);
        } else {
            CUDA_SAFE_CALL(cudaThreadSynchronize());

            CUDA_SAFE_CALL(cudaMemcpy(
                    h_isCommon, 
                    d_scan_odata, 
                    num_bytes, 
                    cudaMemcpyDeviceToHost));

            // The adjacent-difference check below cannot detect a wrong starting
            // value, so anchor the sequence: an inclusive scan of ones starts at 1.
            if(num_elements > 0 && h_isCommon[0] != 1)
                printf("error: first element is %d, expected 1\n", h_isCommon[0]);

            for(int i = 1; i < num_elements; i++) {
                if(h_isCommon[i] != h_isCommon[i - 1] + 1)
                    printf("error %d, %d\n", h_isCommon[i], h_isCommon[i - 1]);
            }
            printf("\n");
        }

        // Release the plan; the original never destroyed it.
        result_cudpp = cudppDestroyPlan(scanplan);
        if(result_cudpp != CUDPP_SUCCESS)
            printf("error: cudppDestroyPlan failed with code %d\n", (int)result_cudpp);
    }

    CUDA_SAFE_CALL(cudaFree(d_isCommon));
    CUDA_SAFE_CALL(cudaFree(d_scan_odata));
    free(h_isCommon);
    cudaThreadExit();
}

Please help me find what I did wrong. Thanks in advance.

user435644
  • 11
  • 1
  • Also, the `cudppScan` prototype I used is a little different from [the one](http://www.gpgpu.org/static/developer/cudpp/rel/rel_gems3/html/group__public_interface.html#gb72e7559d9e22c00ea6412d92b0efe11) published on the official site. – user435644 Jul 25 '11 at 12:12

1 Answers1

1

Please submit your issue at http://code.google.com/p/cudpp/issues/list ASAP. We are getting CUDPP 2.0 ready for release and we'd like to fix the issue if there is one.

Does the problem reproduce if you run "cudpp_testrig -scan -n=670000"?

Also, if you can check out the latest version from the SVN trunk and try with that to see if it still fails that would help us. (If it passes, don't file an issue.)

harrism
  • 26,505
  • 2
  • 57
  • 88
  • `cudpp_testrig -scan -n=` also failed, even with a much smaller data size. I checked out the source code from the [SVN repo](http://code.google.com/p/cudpp/source/checkout), but unfortunately my GPU architecture is not supported. What should I do next? Thanks. – user435644 Jul 28 '11 at 03:00
  • That doesn't make sense. What GPU do you have? There is nothing new in cudppScan() in SVN that would cause it to not work if CUDPP 1.1.1 worked. What error messages do you get? Again, please file an issue at cudpp.googlecode.com instead of here. – harrism Jul 28 '11 at 05:24