8

I'm trying to launch multiple CUDA FFT kernels asynchronously using streams. For that, I'm creating my streams, cuFFT forward and inverse plans as follows:

// Allocate one CUDA stream plus one forward (R2C) and one inverse (C2R)
// cuFFT plan per stream, then bind each plan to its stream.
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)  
{
    cudaStreamCreate(&streams[i]);
    CHECK_ERROR(5)
    // Single-batch 1D real-to-complex plan of length 'ticks'.
    cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
    CHECK_ERROR(5)
    // Single-batch 1D complex-to-real plan of length 'ticks'.
    cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
    CHECK_ERROR(5)
    // cufftSetStream makes subsequent cufftExec* calls on this plan launch
    // their kernels into the associated stream.
    cufftSetStream(plansF[i],streams[i]);
    CHECK_ERROR(5)
    cufftSetStream(plansI[i],streams[i]);
    CHECK_ERROR(5)
}

In the main function, I'm launching forward FFTs as follows:

// Enqueue one forward R2C transform per stream; k and j advance through the
// real input buffer (gpuMem1) and the complex output buffer (gpuMem2).
// NOTE(review): the loop starts at w=1, so plansF[0]/streams[0] is never used
// here — confirm that is intentional.
for(w=1;w<q;w++)
  {
    cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
    CHECK_ERROR(8)
    k += rect_small_real;
    j += rect_small_complex;
  }

I also have other kernels that I launch asynchronously with the same streams.

When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT (both forward and inverse) run in parallel and overlap. FFT kernels do run in different streams, but they do not overlap, as they actually run sequentially. Can anyone tell me what is my problem?

My environment is VS 2008, 64 bit, Windows 7.

Thanks.

Vitality
  • 20,705
  • 4
  • 108
  • 146
Meriko
  • 161
  • 2
  • 11
  • Can you increase the fft size and see if it is still the problem ? – Pavan Yalamanchili Jun 07 '13 at 22:24
  • If the FFT functions fully utilize the machine, you'll see very little if any overlap with other compute operations. – Robert Crovella Jun 09 '13 at 01:15
  • I'm having this problem, even for tiny FFT sizes in cuFFT. I really can't find a size where `cufftSetStream` makes any difference at all in terms of end-to-end computation time. I'm only using `cudaDeviceSynchronize` after launching all the cuFFT calls. Any suggestions? – solvingPuzzles Jul 25 '13 at 06:11
  • I figured out that cufft kernels do not run asynchronously with streams (no matter what size you use in fft). If you want to run cufft kernels asynchronously, create cufftPlan with multiple batches (that's how I was able to run the kernels in parallel and the performance is great). For example, cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,Batch_Num) plan would run Batch_Num cufft kernels of ticks size in parallel. – Meriko Jul 26 '13 at 19:52
  • Thanks for your help! Argh, that's no fun. I'm already using batch mode (`idist>1`), but I have several batches of a particular `idist` to do. Guess I'm out of luck. Without concurrent FFTs, calling FFTW in an OpenMP loop is about 3x faster than cuFFT. All memory is aligned and in multiples of 2. – solvingPuzzles Aug 03 '13 at 06:02
  • @solvingPuzzles: what hardware and CUDA version are you using? I was really hoping to use concurrent streams to increase fft throughput (of small batches) as well; surely it must be possible somehow? The alternative would be to use a single batch size, and queue all fft requests into batches software-matically. But that sure would be hideously ugly... – Eelco Hoogendoorn Dec 30 '14 at 11:17

3 Answers

7

This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture.

Here is the code:

#include <stdio.h>

#include <cufft.h>

#define NUM_STREAMS 3

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string together with its source location and, when
// 'abort' is true (the default), terminates the process with the error code.
// FIX: 'file' is now const-qualified — __FILE__ expands to a string literal,
// and binding a string literal to a non-const char* is ill-formed in modern
// C++ (and at best a deprecation warning in older compilers).
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/********/
/* MAIN */
/********/
// Demonstrates overlapping cuFFT execution and async memcopies using three
// CUDA streams. True overlap only shows up in the profiler for sufficiently
// large N (see the discussion below the code).
int main()
{
    const int N = 5000;

    // --- Host input data initialization
    float2 *h_in1 = new float2[N];
    float2 *h_in2 = new float2[N];
    float2 *h_in3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_in1[i].x = 1.f;
        h_in1[i].y = 0.f;
        h_in2[i].x = 1.f;
        h_in2[i].y = 0.f;
        h_in3[i].x = 1.f;
        h_in3[i].y = 0.f;
    }

    // --- Host output data initialization
    float2 *h_out1 = new float2[N];
    float2 *h_out2 = new float2[N];
    float2 *h_out3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_out1[i].x = 0.f;
        h_out1[i].y = 0.f;
        h_out2[i].x = 0.f;
        h_out2[i].y = 0.f;
        h_out3[i].x = 0.f;
        h_out3[i].y = 0.f;
    }

    // --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
    gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));

    // --- Device input data allocation
    float2 *d_in1;          gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
    float2 *d_in2;          gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
    float2 *d_in3;          gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
    float2 *d_out1;         gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2;         gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3;         gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));

    // --- Creates cuFFT plans and sets them in streams.
    // FIX: cuFFT returns its own cufftResult status type, which gpuErrchk
    // (cudaError_t) cannot check — test it explicitly against CUFFT_SUCCESS.
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int i = 0; i < NUM_STREAMS; i++) {
        if (cufftPlan1d(&plans[i], N, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlan1d failed for stream %d\n", i);
            return 1;
        }
        if (cufftSetStream(plans[i], streams[i]) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftSetStream failed for stream %d\n", i);
            return 1;
        }
    }

    // --- Async memcopies and computations. Operations within one stream are
    // ordered, so each H2D copy -> FFT -> D2H copy chain is correct while the
    // three chains may overlap each other.
    gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
    gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
    gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
    if (cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD) != CUFFT_SUCCESS)
        fprintf(stderr, "cufftExecC2C failed for stream 0\n");
    if (cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD) != CUFFT_SUCCESS)
        fprintf(stderr, "cufftExecC2C failed for stream 1\n");
    if (cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD) != CUFFT_SUCCESS)
        fprintf(stderr, "cufftExecC2C failed for stream 2\n");
    gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
    gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
    gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));

    for(int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));

    // --- Releases resources
    gpuErrchk(cudaHostUnregister(h_in1));
    gpuErrchk(cudaHostUnregister(h_in2));
    gpuErrchk(cudaHostUnregister(h_in3));
    gpuErrchk(cudaHostUnregister(h_out1));
    gpuErrchk(cudaHostUnregister(h_out2));
    gpuErrchk(cudaHostUnregister(h_out3));
    gpuErrchk(cudaFree(d_in1));
    gpuErrchk(cudaFree(d_in2));
    gpuErrchk(cudaFree(d_in3));
    gpuErrchk(cudaFree(d_out1));
    gpuErrchk(cudaFree(d_out2));
    gpuErrchk(cudaFree(d_out3));

    // FIX: destroy the cuFFT plans and free the handle array — the original
    // leaked both.
    for (int i = 0; i < NUM_STREAMS; i++) cufftDestroy(plans[i]);
    free(plans);

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in1;
    delete[] h_in2;
    delete[] h_in3;
    delete[] h_out1;
    delete[] h_out2;
    delete[] h_out3;

    cudaDeviceReset();  

    return 0;
}

Please, add cuFFT error check according to CUFFT error handling.

Below, some profiling information when testing the above algorithm on a Kepler K20c card is provided. As you will see, you will achieve a true overlap between computation and memory transfers only provided that you have a sufficiently large N.

N = 5000

enter image description here

N = 50000

enter image description here

N = 500000

enter image description here

Community
  • 1
  • 1
Vitality
  • 20,705
  • 4
  • 108
  • 146
2

The problem is in the hardware you use.

All CUDA capable GPUs are capable of executing a kernel and copying data in both ways concurrently. However, only devices with Compute Capability 3.5 have the feature named Hyper-Q.

Briefly, in these GPUs several (16, I suppose) hardware kernel queues are implemented. In previous GPUs, only one hardware queue is available.

This means that cudaStreams are only virtual and their usage for old hardware makes sense only in case of overlapping computations and memory copying. Of course this is valid not only for cuFFT but also for your own kernels too!

Please look closely at the output of the Visual Profiler. It is tempting to read the timeline visualization as exact data about GPU execution, but it is not that simple. Several lines show data that refers to the point in time at which the kernel launch call was executed on the host (usually the orange ones), while the actual execution of the corresponding kernel on the GPU is shown separately (blue rectangles). The same holds for memory transfers (the exact time is shown as light brown rectangles).

Hope, I helped you to solve your problem.

Alex
  • 21
  • 3
  • "This means that cudaStreams are only virtual and their usage for old hardware makes sense only in case of overlapping computations and memory copying. " This is not true. It's easy to demonstrate concurrent kernel execution on cc 2.0 hardware. In fact, the OP even stated they were able to see concurrent kernel execution in the question: "all kernels except the CUDA FFT (both forward and inverse) run in parallel and overlap" – Robert Crovella Aug 04 '14 at 22:57
  • Has this been resolved yet? Does the execution of different fft plans in different streams overlap, assuming execution of any single plan does not saturate the GPU? I would be highly surprised (and disappointed) if it was not the case... – Eelco Hoogendoorn Dec 25 '14 at 22:39
0

Here's a riff on @JackOLantern's code that allows easy variation of the number of FFTs, FFT length, and stream count to experiment with GPU utilization in nvvp.

// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cufft.h>

// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val)  __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )

// Generic status check: any non-zero status code is treated as an error,
// reported with its source location and stringified call, and the process is
// terminated after resetting the device.
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
    if (!code) return;  // zero means success for both cudaError_t and cufftResult
    std::cout << "CUDA error at "
          << file << ":" << line << std::endl
          << "error code: " << (unsigned int) code
          << " type: \""  << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
          << "func: \"" << func << "\""
          << std::endl;
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}

// Distributes NUM_DATA FFTs of length N round-robin over NUM_STREAMS streams,
// overlapping H2D copies, FFT kernels, and D2H copies. Vary the three
// constants to experiment with GPU utilization in nvvp.
int main(int argc, char *argv[]) {

    // Number of FFTs to compute.
    const int NUM_DATA = 64;

    // Length of each FFT.
    const int N = 1048576;

    // Number of GPU streams across which to distribute the FFTs.
    const int NUM_STREAMS = 4;

    // Allocate and initialize host input data.
    float2 **h_in = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_in[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_in[ii][jj].x = 1.f;
            h_in[ii][jj].y = 0.f;
        }
    }

    // Allocate and initialize host output data.
    float2 **h_out = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_out[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_out[ii][jj].x = 0.f;
            h_out[ii][jj].y = 0.f;
        }
    }

    // Pin host input and output memory for cudaMemcpyAsync.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
        check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
    }

    // Allocate pointers to device input and output arrays.
    float2 **d_in = new float2 *[NUM_STREAMS];
    float2 **d_out = new float2 *[NUM_STREAMS];

    // Allocate input and output arrays on device.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
        check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
    }

    // Create CUDA streams.
    cudaStream_t streams[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamCreate(&streams[ii]));
    }

    // Create cuFFT plans and set them in streams.
    // FIX: cuFFT reports errors through cufftResult (not cudaError_t), so the
    // status is checked explicitly instead of being silently discarded.
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        if (cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1) != CUFFT_SUCCESS ||
            cufftSetStream(plans[ii], streams[ii]) != CUFFT_SUCCESS) {
            std::cout << "cuFFT plan setup failed for stream " << ii << std::endl;
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    }

    // Fill streams with async memcopies and FFTs. Work item ii goes to stream
    // ii % NUM_STREAMS; operations within one stream execute in order, so
    // reusing d_in[jj]/d_out[jj] across iterations of the same stream is safe.
    for (int ii = 0; ii < NUM_DATA; ii++) {
        int jj = ii % NUM_STREAMS;
        check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
        if (cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD) != CUFFT_SUCCESS) {
            std::cout << "cufftExecC2C failed for stream " << jj << std::endl;
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
        check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
    }

    // Wait for calculations to complete.
    for(int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamSynchronize(streams[ii]));
    }

    // Free memory and streams.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostUnregister(h_in[ii]));
        check_cuda_errors(cudaHostUnregister(h_out[ii]));
        check_cuda_errors(cudaFree(d_in[ii]));
        check_cuda_errors(cudaFree(d_out[ii]));
        delete[] h_in[ii];
        delete[] h_out[ii];
        cufftDestroy(plans[ii]);  // FIX: plans were never destroyed
        check_cuda_errors(cudaStreamDestroy(streams[ii]));
    }

    // FIX: the handle array was obtained with malloc(), so release it with
    // free() — the original 'delete plans' was undefined behavior (mismatched
    // allocation/deallocation functions).
    free(plans);

    // FIX: the four pointer arrays themselves were leaked in the original.
    delete[] h_in;
    delete[] h_out;
    delete[] d_in;
    delete[] d_out;

    cudaDeviceReset();  

    return 0;
}