
I need to calculate the Fourier transform of a 256-element float64 signal. The requirement is that these FFTs are invoked from inside a cuda.jit-compiled section and complete within 25 usec. Unfortunately, cuda.jit-compiled functions cannot call external libraries, so I wrote my own. Alas, my single-core code is still far too slow (~250 usec on a Quadro P4000). Is there a better way?

I created a single-core FFT function that gives correct results, but it is about 10x too slow. I don't understand how to make good use of multiple cores.

---fft.py

from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):    
    n = len(vector) 
    levels = int32(math.log(float32(n))/math.log(float32(2)))

    assert 2**levels==n # error: Length is not a power of 2 

    #uncomment either the Numba CUDA or the NumPy memory allocation (intelligent conditional compilation??)
    exptable = cuda.local.array(1024, dtype=complex128)   
    #exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    coef = complex128((2j if inverse else -2j) * math.pi / n)   
    for i in range(n // 2):                       
        exptable[i] = cmath.exp(i * coef)       

    for i in range(n):
        x = i   
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]      

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]    
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale=float64(n if inverse else 1)
    for i in range(n):
        out[i]=out[i]/scale   # the inverse requires a scaling

# now create the Numba CUDA device-function version, to be called from GPU kernels
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

---test.py

from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit  
import fft

@cuda.jit(void(float64[:],boolean, complex128[:]))    
def fftbench(y, inverse, FT):
  Y  = cuda.local.array(256, dtype=complex128)

  for i in range(len(y)):
    Y[i]=complex128(y[i])    
  fft.gtransform_radix2(Y, False, FT)


str='\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125  ,129.62109375 ,118.6015625
 ,110.2890625  ,106.55078125 ,104.8203125  ,106.1875     ,109.328125
 ,113.5        ,118.6640625  ,125.71875    ,127.625      ,120.890625
 ,114.04296875 ,112.0078125  ,112.71484375 ,110.18359375 ,104.8828125
 ,104.47265625 ,106.65625    ,109.53515625 ,110.73828125 ,111.2421875
 ,112.28125    ,112.38671875 ,112.7734375  ,112.7421875  ,113.1328125
 ,113.24609375 ,113.15625    ,113.66015625 ,114.19921875 ,114.5
 ,114.5546875  ,115.09765625 ,115.2890625  ,115.7265625  ,115.41796875
 ,115.73828125 ,116.         ,116.55078125 ,116.5625     ,116.33984375
 ,116.63671875 ,117.015625   ,117.25       ,117.41015625 ,117.6640625
 ,117.859375   ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
 ,118.80859375 ,118.67578125 ,118.78125    ,118.49609375 ,119.0078125
 ,119.09375    ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
 ,119.890625   ,119.80078125 ,119.69140625 ,119.65625    ,119.83984375
 ,119.9609375  ,120.15625    ,120.2734375  ,120.47265625 ,120.671875
 ,120.796875   ,120.4609375  ,121.1171875  ,121.35546875 ,120.94921875
 ,120.984375   ,121.35546875 ,120.87109375 ,120.8359375  ,121.2265625
 ,121.2109375  ,120.859375   ,121.17578125 ,121.60546875 ,121.84375
 ,121.5859375  ,121.6796875  ,121.671875   ,121.78125    ,121.796875
 ,121.8828125  ,121.9921875  ,121.8984375  ,122.1640625  ,121.9375
 ,122.         ,122.3515625  ,122.359375   ,122.1875     ,122.01171875
 ,121.91015625 ,122.11328125 ,122.1171875  ,122.6484375  ,122.81640625
 ,122.33984375 ,122.265625   ,122.78125    ,122.44921875 ,122.34765625
 ,122.59765625 ,122.63671875 ,122.6796875  ,122.6171875  ,122.34375
 ,122.359375   ,122.7109375  ,122.83984375 ,122.546875   ,122.25390625
 ,122.06640625 ,122.578125   ,122.7109375  ,122.83203125 ,122.5390625
 ,122.2421875  ,122.06640625 ,122.265625   ,122.13671875 ,121.8046875
 ,121.87890625 ,121.88671875 ,122.2265625  ,121.63671875 ,121.14453125
 ,120.84375    ,120.390625   ,119.875      ,119.34765625 ,119.0390625
 ,118.4609375  ,117.828125   ,117.1953125  ,116.9921875  ,116.046875
 ,115.16015625 ,114.359375   ,113.1875     ,110.390625   ,108.41796875
 ,111.90234375 ,117.296875   ,127.0234375  ,147.58984375 ,158.625
 ,129.8515625  ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
 ,143.9296875  ,150.24609375 ,141.         ,117.71484375 ,109.80859375
 ,115.24609375 ,118.44140625 ,120.640625   ,120.9921875  ,111.828125
 ,101.6953125  ,111.21484375 ,114.91015625 ,115.2265625  ,118.21875
 ,125.3359375  ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
 ,141.67578125 ,139.53125    ,136.44921875 ,135.08203125 ,135.7890625
 ,137.58203125 ,138.7265625  ,154.33203125 ,172.01171875 ,152.24609375
 ,129.8046875  ,125.59375    ,125.234375   ,127.32421875 ,132.8984375
 ,147.98828125 ,152.328125   ,153.7734375  ,155.09765625 ,156.66796875
 ,159.0546875  ,151.83203125 ,138.91796875 ,138.0546875  ,140.671875
 ,143.48046875 ,143.99609375 ,146.875      ,146.7578125  ,141.15234375
 ,141.5        ,140.76953125 ,140.8828125  ,145.5625     ,150.78125
 ,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
 ,131.95703125 ,125.40625    ,123.265625   ,123.57421875 ,129.859375
 ,135.6484375  ,144.51171875 ,155.05078125 ,158.4453125  ,140.8125
 ,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875  ,143.38671875
 ,143.69921875 ,137.734375   ,124.48046875 ,116.73828125 ,114.84765625
 ,113.85546875 ,117.45703125 ,122.859375   ,125.8515625  ,133.22265625
 ,139.484375   ,135.75       ,122.69921875 ,115.7734375  ,116.9375
 ,127.57421875] 
y1 =cp.zeros(len(a), cp.float64)   # real input, matching the kernel's float64[:] signature
FT1=cp.zeros(len(a), cp.complex128)

for i in range(len(a)):
  y1[i]=a[i]  # copy the real test signal to the device array

r=1000
series=sorted(timeit.repeat("fftbench(y1, False, FT1)",      number=1, repeat=r, globals=globals()))
series=series[0:r-5]
print(str % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)));



What I'm looking for is a faster implementation: t << 25 usec.

1 Answer


The drawback of your algorithm is that, even on the GPU, it runs on a single thread.

To understand how to design algorithms for Nvidia GPGPUs, I recommend looking at the CUDA C Programming Guide, and at the Numba documentation for how to apply the same ideas in Python.

Moreover, to understand what is wrong with your code, I recommend using the Nvidia profiler.

The following parts of the answer explain how to apply the basics to your example.


Run multiple threads

To improve performance, you first need to launch multiple threads that can run in parallel. CUDA handles threads as follows:

  • Threads are grouped into blocks of n threads (n ≤ 1024).

  • Threads within the same block can be synchronized and have access to a (fast) common memory space called "shared memory".

  • You can run multiple blocks in parallel in a "grid", but you then lose the synchronization mechanism between blocks.

The syntax to run multiple threads is the following:

fftbench[griddim, blockdim](y1, False, FT1)

To simplify, I will use only one block of 256 threads:

fftbench[1, 256](y1, False, FT1)
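
For illustration, here is a minimal sketch (the kernel name who_am_i is invented for this example) of how a thread launched this way finds its own position; cuda.grid(1) is Numba's helper for the global index blockIdx.x * blockDim.x + threadIdx.x:

from numba import cuda
import numpy as np

@cuda.jit
def who_am_i(out):
    i = cuda.grid(1)                 # global index across the whole grid
    if i < out.size:                 # guard against extra threads
        out[i] = cuda.threadIdx.x    # position of this thread inside its block

buf = np.zeros(256, dtype=np.int32)
who_am_i[1, 256](buf)                # one block of 256 threads, as above
print(buf[:4])                       # -> [0 1 2 3]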

Memory

To improve GPU performance it is important to consider where the data are stored; there are three main memory spaces:

  • global memory: the "RAM" of your GPU; it is slow and has high latency. This is where all your arrays are placed when you send them to the GPU.

  • shared memory: a small, fast-access memory; all the threads of a block have access to the same shared memory.

  • local memory: physically it is the same as global memory, but each thread accesses its own private local memory.

Typically, if you use the same data multiple times, you should try to store them in shared memory to avoid the latency of global memory.

In your code, you can store exptable in shared memory:

exptable = cuda.shared.array(1024, dtype=complex128)

and if n is not too big, you may want to use a shared working array instead of writing intermediate results to out:

working = cuda.shared.array(256, dtype=complex128)
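
To make the three memory spaces concrete, here is a minimal sketch (the memory_demo kernel and its arrays are purely illustrative, not part of the FFT); it assumes a launch with one block of exactly 256 threads:

from numba import cuda, float64
import numpy as np

@cuda.jit
def memory_demo(gmem):                            # gmem: an array in global memory
    smem = cuda.shared.array(256, dtype=float64)  # one copy per block, fast access
    lmem = cuda.local.array(4, dtype=float64)     # private scratch space per thread
    idx = cuda.threadIdx.x
    lmem[0] = gmem[idx]            # global -> local  (this thread only)
    smem[idx] = 2.0 * lmem[0]      # local  -> shared (visible to the whole block)
    cuda.syncthreads()             # wait until every thread has written its slot
    gmem[idx] = smem[255 - idx]    # read a slot written by a different thread

data = np.arange(256, dtype=np.float64)
memory_demo[1, 256](data)          # one block of exactly 256 threads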

Assign tasks to each thread

Of course, if you don't change your function, all threads will do the same job and this will only slow down your program.

In this example we will assign each thread to one cell of the array. To do so, we need the unique id of each thread within its block:

idx = cuda.threadIdx.x

Now we can speed up the for loops; let's handle them one by one:

exptable = cuda.shared.array(1024, dtype=complex128)   
...
for i in range(n // 2):                       
    exptable[i] = cmath.exp(i * coef)       

The goal here is that the first n/2 threads fill this array; then all the threads will be able to use it.

So in this case, just replace the for loop with a condition on the thread's idx:

if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)

The last two loops are easier: each thread deals with one cell of the array:

for i in range(n):
    x = i   
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    out[i] = vector[y]   

becomes

x = idx   
y = 0
for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
working[idx] = vector[y]

and

for i in range(n):
    out[i]=out[i]/scale   # the inverse requires a scaling

becomes

out[idx]=working[idx]/scale   # the inverse requires a scaling

I use the shared array working here, but you can replace it with out if you prefer to use global memory.

Now let's look at the while loop. We said that we want each thread to deal with only one cell of the array, so we can try to parallelize the two inner for loops.

    ...
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            temp = out[j + halfsize] * exptable[k]    
            out[j + halfsize] = out[j] - temp
            out[j] += temp
            k += tablestep
    ...

To simplify, I will use only half of the threads: we take the first 128 threads and determine j as follows:

    ...
    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)
    ...

k is:

    k = tablestep*(idx%halfsize)

so we get the loop:

size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)            
        k = tablestep*(idx%halfsize)
        temp = working[j + halfsize] * exptable[k]
        working[j + halfsize] = working[j] - temp
        working[j] += temp
    size *= 2
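
To check that this mapping visits exactly the same (j, k) pairs as the original nested loops, here is a small host-side sanity check (plain Python, no GPU involved):

n = 256
size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size

    serial = set()                     # (j, k) pairs of the original loops
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            serial.add((j, k))
            k += tablestep

    parallel = set()                   # (j, k) pairs of the per-thread mapping
    for idx in range(n // 2):          # the 128 active "threads"
        j = (idx % halfsize) + size * (idx // halfsize)
        k = tablestep * (idx % halfsize)
        parallel.add((j, k))

    assert serial == parallel
    size *= 2
print("index mapping matches at every stage")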

Synchronization

Last but not least, we need to synchronize all these threads; the program will not work if we don't. On the GPU, threads may not run at the same time, so you can get problems when data are produced by one thread and consumed by another, for example:

  • exptable[0] is read by thread 2 before thread 0 has stored its value

  • working[j + halfsize] is modified by another thread before you store it in temp

To prevent this we can use the function:

cuda.syncthreads()

All the threads in the same block must reach this line before any of them executes the rest of the code.

In this example, you need to synchronize at two points: after the working initialization (this barrier also covers the exptable fill, since the table is only read later, inside the while loop) and after each iteration of the while loop.

Then your code looks like:

def _transform_radix2(vector, inverse, out):
  idx = cuda.threadIdx.x   # unique index of this thread within the block
  n = len(vector)
  levels = int32(math.log(float32(n))/math.log(float32(2)))

  assert 2**levels==n # error: Length is not a power of 2 

  exptable = cuda.shared.array(1024, dtype=complex128)
  working = cuda.shared.array(256, dtype=complex128)

  assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

  coef = complex128((2j if inverse else -2j) * math.pi / n)   
  if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)    

  x = idx   
  y = 0
  for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
  working[idx] = vector[y]    
  cuda.syncthreads()

  size = 2
  while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
      j = (idx%halfsize) + size*(idx//halfsize)            
      k = tablestep*(idx%halfsize)
      temp = working[j + halfsize] * exptable[k]
      working[j + halfsize] = working[j] - temp
      working[j] += temp
    size *= 2
    cuda.syncthreads()

  scale=float64(n if inverse else 1)
  out[idx]=working[idx]/scale   # the inverse requires a scaling
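
For completeness, here is a sketch of how this parallel device function could be wrapped and launched, reusing the names from the question and assuming one block of 256 threads; the wrapper copies one input sample per thread instead of looping:

from numba import cuda, void, boolean, float64, complex128

gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

@cuda.jit(void(float64[:], boolean, complex128[:]))
def fftbench(y, inverse, FT):
    idx = cuda.threadIdx.x
    Y = cuda.shared.array(256, dtype=complex128)
    Y[idx] = complex128(y[idx])      # one input sample per thread
    cuda.syncthreads()               # input fully staged before the FFT starts
    gtransform_radix2(Y, inverse, FT)

# one block of 256 threads, e.g.:
# fftbench[1, 256](y1, False, FT1)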

I feel your question is a good way to introduce some basics of GPGPU computing, and I have tried to answer it in a didactic way. The final code is far from perfect and can be optimized a lot; I highly recommend reading the CUDA C Programming Guide if you want to learn more about GPU optimization.
