
I'm setting up a one-dimensional fftshift in CUDA. My code is the following:

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    double2 temp;

    if (i < N/2)
    {
        temp.x = u_d[i].x;
        temp.y = u_d[i].y;

        u_d[i].x = u_d[i + N/2].x;
        u_d[i].y = u_d[i + N/2].y;

        u_d[i + N/2].x = temp.x;
        u_d[i + N/2].y = temp.y;
    }
}

Is there any smarter way than the one shown above to perform the fftshift in CUDA?

Thanks in advance.

A PERHAPS BETTER SOLUTION

I found that the following solution could perhaps be a good alternative:

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < N)
    {
        double a = pow(-1.0,i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}

It consists of multiplying the vector to be transformed by a sequence of 1s and -1s, which is equivalent to a multiplication by exp(-j*n*pi) and thus to a shift in the conjugate domain.

You have to call this kernel before and after the application of the cuFFT.
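For even N, a quick derivation (using the standard DFT convention) shows why this works:

$$Y[k] = \sum_{n=0}^{N-1} x[n]\,(-1)^n\, e^{-j 2\pi n k / N} = \sum_{n=0}^{N-1} x[n]\, e^{-j 2\pi n (k + N/2) / N} = X\!\left[\left(k + \tfrac{N}{2}\right) \bmod N\right],$$

since $(-1)^n = e^{-j\pi n} = e^{-j 2\pi n (N/2)/N}$. The circular shift by $N/2$ is exactly what fftshift performs on a sequence of even length.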

One pro is that memory movements/swaps are avoided and the idea can be immediately extended to the 2D case (a sketch is given right below); see CUDA Device To Device transfer expensive.
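As a rough sketch of the 2D extension (the kernel name, the row-major layout and the launch configuration are my own assumptions, not tested code), the chessboard multiplication for an N1 x N2 matrix would read

__global__ void fftshift_2D(double2 *u_d, int N1, int N2)
{
    // --- 2D thread indices: i indexes the rows, j the columns (row-major storage assumed)
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int i = blockDim.y * blockIdx.y + threadIdx.y;

    if (i < N1 && j < N2)
    {
        // --- Chessboard sign (-1)^(i+j), computed without pow
        double a = (double)(1 - 2 * ((i + j) & 1));

        u_d[i * N2 + j].x *= a;
        u_d[i * N2 + j].y *= a;
    }
}

to be launched on a 2D grid, for example with dim3 block(16, 16) and dim3 grid((N2 + 15) / 16, (N1 + 15) / 16).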

CONCERNING SYMMETRIC DATA

This solution does not seem to be limited to symmetric data. Try, for example, the following MATLAB code, which applies the idea to a fully complex random matrix (Gaussian amplitude and uniform phase).

N1 = 512;
N2 = 256;

% --- Complex random matrix: Gaussian amplitude, uniform phase
Phase     = (rand(N1, N2) - 0.5) * 2 * pi;
Magnitude = randn(N1, N2);

Im = Magnitude .* exp(1i * Phase);

% --- Reference: centered transform via ifftshift/fftshift
Transform = fftshift(fft2(ifftshift(Im)));

% --- Chessboard multiplication before and after the FFT
n1 = 0:(N1 - 1);
n2 = 0:(N2 - 1);
[N2, N1] = meshgrid(n2, n1);
Im2 = Im .* (-1).^(N1 + N2);
Im3 = fft2(Im2);
Im4 = Im3 .* (-1).^(N1 + N2);

% --- Percent normalized root mean square error
100 * sqrt(sum(abs(Im4(:) - Transform(:)).^2) / sum(abs(Transform(:)).^2))

The returned normalized root mean square error will be essentially zero (up to numerical precision), confirming that Transform = Im4.

IMPROVEMENT TO THE SPEED

Following a suggestion received on the NVIDIA Forum, improved speed can be achieved by changing the instruction

double a = pow(-1.0,i&1);

to

double a = 1-2*(i&1);

to avoid the use of the slow routine pow.
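With this change, the full kernel becomes (a minimal sketch, identical to the one above except for the sign computation):

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
    {
        // --- 1 - 2*(i & 1) is +1 for even i and -1 for odd i, avoiding pow
        double a = (double)(1 - 2 * (i & 1));
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}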


2 Answers


After much time and the introduction of the callback functionality of cuFFT, I can provide a meaningful answer to my own question.

Above, I proposed a "perhaps better solution". After some testing, I realized that, without the cuFFT callback functionality, that solution is slower because it uses pow. I then explored two alternatives to pow, namely

float a     = (float)(1-2*((int)offset%2));

float2  out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;

and

float2  out = ((float2*)d_in)[offset];

if ((int)offset & 1) {
    out.x = -out.x;
    out.y = -out.y;
}

But, with standard cuFFT, all the above solutions require two separate kernel calls, one for the fftshift and one for the cuFFT execution call. However, with the new cuFFT callback functionality, the above alternative solutions can be embedded in the code as __device__ functions.

So, I finally ended up with the comparison code below.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

#include <cufft.h>
#include <cufftXt.h>

//#define DEBUG

#define BLOCKSIZE 256

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
             return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }

    return "<unknown>";
}
#endif

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, err, _cudaGetErrorEnum(err));
        cudaDeviceReset(); assert(0);
    }
}

/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2)
    {
        float2 temp = d_inout[tid];
        d_inout[tid] = d_inout[tid + (N / 2)];
        d_inout[tid + (N / 2)] = temp;
    }
}

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a     = (float)(1-2*((int)offset%2));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a = pow(-1.,(double)(offset&1));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float2  out = ((float2*)d_in)[offset];

    if ((int)offset & 1) {
        out.x = -out.x;
        out.y = -out.y;
    }
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;

/********/
/* MAIN */
/********/
int main()
{
    const int N = 131072;

    printf("N = %d\n", N);

    // --- Host side input array
    float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
    for (int i=0; i<N; i++) {
        h_vect[i].x = (float)rand() / (float)RAND_MAX;
        h_vect[i].y = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side output arrays
    float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));

    // --- Device side input arrays
    float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
    float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
    float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
    float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
    gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));

    // --- Device side output arrays
    float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
    float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /*******************************************/
    /* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
    /*******************************************/
    cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
    fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Memory movements elapsed time:  %3.3f ms \n", time);
    gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V1 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
    cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
    cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v1 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }

    printf("Chessboard v1 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V2 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
    cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v2 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }

    printf("Chessboard v2 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V3 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
    cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v3 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }

    printf("Chessboard v3 test passed!\n");

    return 0;
}

RESULTS ON A GTX 480 (times in ms)

N         Mem mov   v1      v2      v3 
131072    0.552     0.136   0.354   0.183 
262144    0.536     0.175   0.451   0.237 
524288    0.661     0.283   0.822   0.290 
1048576   0.784     0.565   1.548   0.548 
2097152   1.298     0.952   2.973   0.944 

RESULTS ON A TESLA C2050 (times in ms)

N         Mem mov   v1      v2      v3 
131072    0.278     0.130   0.236   0.132 
262144    0.344     0.202   0.374   0.206 
524288    0.544     0.378   0.696   0.387 
1048576   0.909     0.695   1.294   0.695 
2097152   1.656     1.349   2.531   1.349 

RESULTS ON A KEPLER K20c (times in ms)

N         Mem mov   v1      v2      v3 
131072    0.077     0.076   0.136   0.076 
262144    0.142     0.128   0.202   0.127 
524288    0.268     0.229   0.374   0.230 
1048576   0.516     0.433   0.717   0.435 
2097152   1.019     0.853   1.400   0.855 

Some more details have recently appeared at The 1D fftshift in CUDA by chessboard multiplication and at the GitHub page.


If space is not a concern (and you are using fftshift for only one dimension), create u_d with size 1.5 x N and write the first N/2 elements at the end. You can then move u_d to u_d + N/2.

Here is how you could do it.

double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N/2;    // u_d now points to the shifted data; keep u_d_begin around for cudaFree
  • Thank you for your answer. Basically, you are physically moving the first `N/2` elements to the end (last `N/2` elements) of the `1.5N`-array by a cudaMemcpy DeviceToDevice. This seems clever. But in another post, see [CUDA Device To Device transfer expensive](http://stackoverflow.com/questions/6063619/cuda-device-to-device-transfer-expensive), you yourself discouraged another user from that practice, since it is slower than creating a kernel to do the swaps. Also, this solution seems to occupy more memory. Could you please comment further on this point? – Vitality Jan 07 '13 at 13:50
  • @JackOLantern if the memory is a concern of yours, your solution should be just fine. I used a cudaMemcpy here just for simplicity's sake, and for a single memory copy, I don't think there will be much difference in performance. In the other question, there were multiple memory copies that could have been done by a single kernel. – Pavan Yalamanchili Jan 07 '13 at 14:32
  • I probably have found a new solution. Please, take a look at the revised post. – Vitality Jan 07 '13 at 21:03
  • @JackOLantern The revised version only works for symmetric data. This is not always the case though. So keep that in mind. – Pavan Yalamanchili Jan 07 '13 at 21:26
  • What do you mean by "symmetric data"? The idea seems to apply to random complex matrices. Please, take a look at the re-revised post. – Vitality Jan 07 '13 at 22:37
  • @JackOLantern While I agree that applying the transform before and after calling CUFFT gives the same result as calling fftshift followed by an ifftshift later, I meant to say that multiplying the data by alternating 1s and -1s is not functionally equivalent to fftshift. You could get the same results without applying any transform at all, so I am unsure what you are trying to achieve here. – Pavan Yalamanchili Jan 09 '13 at 00:44
  • What I'm trying to do is the following (in Matlab language): `fftshift(fft2(ifftshift(u)))`. Suppose for the moment that the matrix has even sizes, so that `fftshift` and `ifftshift` commute. The version of the `fftshift` suggested above, although not per se giving the same result as, say, the Matlab one, gives the same result under a twofold application, before and after the fft. – Vitality Jan 09 '13 at 20:12
  • @JackOLantern If you are going to use ifftshift followed by fftshift without doing anything else, you don't need to do them. You are just doing more operations and gaining nothing. – Pavan Yalamanchili Jan 09 '13 at 20:53
  • I'm not doing that. I'm just applying one fftshift before and one after the fft2, in other words `fftshift(fft2(ifftshift(u)))`. – Vitality Jan 10 '13 at 14:11
  • @JackOLantern That does exactly what fft2(u) does. You do not need the fftshift and ifftshift. – Pavan Yalamanchili Jan 10 '13 at 16:29
  • Are you sure? Try comparing `fftshift(fft2(ifftshift(Im)))` with `fft2(Im)` for the code above. – Vitality Jan 12 '13 at 20:47