Julia Threads.@threads slower than single thread performance

Question

I am trying to solve numerically the heat equation in 1d:

I am using finite differences and I have some trouble using the @threads instruction in Julia. In particular below there are two version of the same code: the first one is single thread while the other uses @threads (they are identical apart from the @thread instruction)

function heatSecLoop(;T::Float64)

    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3

    Nt = ceil(Int, T/Δt )
    Nx = ceil(Int,L/Δx + 2)
    u = zeros(Nx)    
    u[round(Int,Nx/2)] = 1
    
    println("starting loop")
    for t=1:Nt-1
        u_old = copy(u)
        for i=2:Nx-1
            u[i] = u_old[i] + ν * Δt/(Δx^2)*(u_old[i.-1]-2u_old[i] + u_old[i.+1])
        end

        if t % round(Int,Nt/10) == 0
            println("time = " * string(round(t*Δt,digits=4)) )
        end
    end
    println("done")
    return u
end

function heatParLoop(;T::Float64)

    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3

    Nt = ceil(Int, T/Δt )
    Nx = ceil(Int,L/Δx + 2)
    u = zeros(Nx)    
    u[round(Int,Nx/2)] = 1
    
    println("starting loop")
    for t=1:Nt-1
        u_old = copy(u)
        Threads.@threads for i=2:Nx-1
            u[i] = u_old[i] + ν * Δt/(Δx^2)*(u_old[i.-1]-2u_old[i] + u_old[i.+1])
        end

        if t % round(Int,Nt/10) == 0
            println("time = " * string(round(t*Δt,digits=4)) )
        end
    end
    println("done")
    return u
end

The issue is that the sequential one is faster than the multi-thread one. Here are the timing (after one run to compile)

julia> Threads.nthreads()
2

julia> @time heatParLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
  5.417182 seconds (12.14 M allocations: 9.125 GiB, 6.59% gc time)

julia> @time heatSecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
  3.892801 seconds (1.00 M allocations: 7.629 GiB, 8.06% gc time)

Of course the heat equation is just an example for a more intricate problem. I also tried using other libraries such as SharedArrays together with Distributed but with worse results.

Any help is appreciated.

Please look [here](https://discourse.julialang.org/t/julia-threads-threads-slower-than-single-thread-performance/46823) for a partial solution — Andrea Fuzzi, Sep 18 '20 at 07:46

cbk · Answer 1 · 2021-07-09T01:06:03.583

This appears to still hold true, likely due to

the overhead of Threads.@threads
perhaps to a lesser degree, the fact that garbage collection in Julia is single-threaded, and the original version here is generating a fair amount of garbage.

Moreover, building on the advice from the linked discourse thread, it may be worth noting that there is now a threaded version of the @avx (now @turbo) macro from LoopVectorization.jl, which uses very lightweight threading from Polyester.jl, and manages to eke out slightly better performance despite the still nontrivial overhead of threading:

function heatSecLoop(;T::Float64)

    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3

    Nt = ceil(Int, T/Δt )
    Nx = ceil(Int,L/Δx + 2)
    u = zeros(Nx)    
    u[round(Int,Nx/2)] = 1
    u_old = similar(u)

    println("starting loop")
    for t=1:Nt-1
        u_old, u = u, u_old
        for i=2:Nx-1
            u[i] = u_old[i] + ν * Δt/(Δx^2)*(u_old[i.-1]-2u_old[i] + u_old[i.+1])
        end

        if t % round(Int,Nt/10) == 0
            println("time = " * string(round(t*Δt,digits=4)) )
        end
    end
    println("done")
    return u
end
function heatVecLoop(;T::Float64)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3

    Nt = ceil(Int, T/Δt )
    Nx = ceil(Int,L/Δx + 2)
    u = zeros(Nx)
    u[round(Int,Nx/2)] = 1
    u_old = similar(u)

    println("starting loop")
    for t=1:Nt-1
       u_old, u = u, u_old
       @tturbo for i=2:Nx-1
           u[i] = u_old[i] + ν * Δt/(Δx^2)*(u_old[i-1]-2u_old[i] + u_old[i+1])
       end

       if t % round(Int,Nt/10) == 0
           println("time = " * string(round(t*Δt,digits=4)) )
       end
    end
    println("done")
    return u
end

function heatTVecLoop(;T::Float64)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3

    Nt = ceil(Int, T/Δt )
    Nx = ceil(Int,L/Δx + 2)
    u = zeros(Nx)
    u[round(Int,Nx/2)] = 1
    u_old = similar(u)

    println("starting loop")
    for t=1:Nt-1
       u_old, u = u, u_old
       @tturbo for i=2:Nx-1
           u[i] = u_old[i] + ν * Δt/(Δx^2)*(u_old[i-1]-2u_old[i] + u_old[i+1])
       end

       if t % round(Int,Nt/10) == 0
           println("time = " * string(round(t*Δt,digits=4)) )
       end
    end
    println("done")
    return u
end

julia> @time heatSecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
  1.786011 seconds (114 allocations: 22.094 KiB)

julia> @time heatVecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
  0.314305 seconds (114 allocations: 22.094 KiB)

julia> @time heatTVecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
  0.300656 seconds (114 allocations: 22.094 KiB)

The performance of the single-threaded @turbo-vectorized version also appears to have improved significantly since this question was first asked, and the performance of the multithreaded @tturbo version would likely continue to improve for larger problem sizes.

Julia Threads.@threads slower than single thread performance

1 Answers1

Linked