Consider the following code, which combines OpenMP with BLAS (dgemm):
Program bench_dgemm

  Use, Intrinsic :: iso_fortran_env, Only : li => int64
  Use            :: omp_lib

  Implicit None

  Integer, Parameter :: dp = selected_real_kind(15, 307)

  Real( dp ), Dimension( :, :    ), Allocatable :: a
  Real( dp ), Dimension( :, :, : ), Allocatable :: b
  Real( dp ), Dimension( :, :, : ), Allocatable :: c

  Integer       :: na, nb, nc, nd, m, m_iter
  Integer( li ) :: start, finish, rate
  Integer       :: numthreads
  Integer       :: ithr, istart, iend
  Real( dp )    :: sum_time

  Write( *, * ) 'numthreads'
  Read ( *, * ) numthreads
  Call omp_set_num_threads( numthreads )

  Write( *, * ) 'na, nb, nc, nd ?'
  Read ( *, * ) na, nb, nc, nd

  Allocate( a( 1:na, 1:nb )       )
  Allocate( b( 1:nb, 1:nc, 1:nd ) )
  Allocate( c( 1:na, 1:nc, 1:nd ) )

  ! Contraction: A[a,b] * B[b,c,d] = C[a,c,d]
  Call Random_number( a )
  Call Random_number( b )
  c = 0.0_dp

  m_iter = 30
  Write( *, * ) 'm_iter average', m_iter
  Write( *, * ) 'numthreads', numthreads

  sum_time = 0.0_dp
  Do m = 1, m_iter
     Call System_clock( start, rate )
     ! Each thread takes a contiguous slab of the d index and calls a sequential
     ! dgemm on B(:, :, istart+1:iend) viewed as an nb x (nc*slab) matrix.
     !$omp parallel private( ithr, istart, iend )
     ithr   = omp_get_thread_num()
     istart = ithr * nd / numthreads
     iend   = ( ithr + 1 ) * nd / numthreads
     Call dgemm( 'N', 'N', na, nc * ( iend - istart ), nb, 1.0_dp, a, na, &
                 b( 1, 1, 1 + istart ), Size( b, Dim = 1 ),               &
                 0.0_dp, c( 1, 1, 1 + istart ), Size( c, Dim = 1 ) )
     !$omp end parallel
     Call System_clock( finish, rate )
     sum_time = sum_time + Real( finish - start, dp ) / rate
  End Do

  Write( *, * ) 'Time for dgemm', sum_time / m_iter

End Program bench_dgemm
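For reference, the whole contraction is also a single dgemm call if B[b,c,d] and C[a,c,d] are viewed as nb x (nc*nd) and na x (nc*nd) matrices; the OpenMP region above only carves that one call into slabs along d. A minimal single-call sketch, using the same arrays as above:

Call dgemm( 'N', 'N', na, nc * nd, nb, 1.0_dp, a, na, &
            b, Size( b, Dim = 1 ),                    &
            0.0_dp, c, Size( c, Dim = 1 ) )

With -mkl=sequential or a sequential OpenBLAS this single call runs on one core, which is why the program splits it by hand across threads.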
Suppose the file is called bench.f90. I compiled it with ifort bench.f90 -o bench -qopenmp -mkl=sequential and then ran bench.
For na = nb = nc = nd = 200, varying numthreads from 1 to 32 gives me

numthreads   Time for dgemm (s)
         1   4.053670000000001E-002
         2   2.087716666666666E-002
         4   1.082136666666667E-002
         8   5.819133333333333E-003
        16   4.304533333333333E-003
        32   5.269366666666666E-003
I also tried gfortran bench.f90 -o bench -fopenmp -lopenblas and got

numthreads   Time for dgemm (s)
         1   0.13534268956666665
         2   6.9672616866666662E-002
         4   3.5927094433333334E-002
         8   1.8583297666666668E-002
        16   1.1969903900000000E-002
        32   1.9136184166666667E-002
It seems the OpenMP version gets much less than ideal speedup at 32 cores (Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 sockets, so 40 cores in total). The index being split is the outermost, non-contracted one: just as one would split the c index in A[a,b]*B[b,c], the code splits the d index of B[b,c,d] into contiguous segments, one per thread. That should be straightforward to parallelize, so why does the performance stop improving around 32 cores? (If the extent of the split index d were only 30, I could understand that 32 cores would not help, but here nd = 200.)
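An equivalent way to write the same decomposition, which I would expect to behave similarly, is to let the OpenMP runtime distribute one dgemm per d slice (untested sketch; assumes an extra Integer :: k is declared in the program above):

!$omp parallel do private( k )
Do k = 1, nd
   ! C(:, :, k) = A * B(:, :, k), one nc-column dgemm per slice of d
   Call dgemm( 'N', 'N', na, nc, nb, 1.0_dp, a, na, &
               b( 1, 1, k ), Size( b, Dim = 1 ),    &
               0.0_dp, c( 1, 1, k ), Size( c, Dim = 1 ) )
End Do
!$omp end parallel do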
Would MPI give better performance than OpenMP here, and how close could either come to ideal scaling?
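For the MPI comparison, what I have in mind is roughly the following (untested sketch; my_nd, b_loc, and c_loc are names I made up here, each rank owns one slab of B and C along d, and nd is assumed divisible by the number of ranks):

Program bench_dgemm_mpi
  Use mpi
  Implicit None
  Integer, Parameter :: dp = selected_real_kind(15, 307)
  Real( dp ), Allocatable :: a( :, : ), b_loc( :, :, : ), c_loc( :, :, : )
  Integer    :: na, nb, nc, nd, my_nd, rank, nprocs, ierr
  Real( dp ) :: t0, t1

  Call MPI_Init( ierr )
  Call MPI_Comm_rank( MPI_COMM_WORLD, rank, ierr )
  Call MPI_Comm_size( MPI_COMM_WORLD, nprocs, ierr )

  na = 200; nb = 200; nc = 200; nd = 200
  my_nd = nd / nprocs                      ! assumes nd is divisible by nprocs

  Allocate( a( na, nb ), b_loc( nb, nc, my_nd ), c_loc( na, nc, my_nd ) )
  Call Random_number( a )                  ! every rank holds the full A ...
  Call Random_number( b_loc )              ! ... but only its own slab of B
  c_loc = 0.0_dp

  Call MPI_Barrier( MPI_COMM_WORLD, ierr ) ! synchronize before timing
  t0 = MPI_Wtime()
  ! Each rank contracts A with its own slab of B, exactly as one OpenMP thread does above.
  Call dgemm( 'N', 'N', na, nc * my_nd, nb, 1.0_dp, a, na, &
              b_loc, nb, 0.0_dp, c_loc, na )
  t1 = MPI_Wtime()

  If ( rank == 0 ) Write( *, * ) 'Time for dgemm', t1 - t0
  ! An MPI_Gather of c_loc would reassemble the full C[a,c,d] if the result were needed.

  Call MPI_Finalize( ierr )
End Program bench_dgemm_mpi

I would build it with something like mpiifort bench_mpi.f90 -mkl=sequential and run it with mpirun -np 32, so that each rank again calls a sequential dgemm on its own slab, which should be directly comparable to the OpenMP numbers above.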