I tried to use openblas to do a matrix multiplication and use its parallel version without writing DGEMM to different threads. The efficiency is not good when the common dimension is large. Is that a limitation of openblas that I should either write parallel dgemm manually or look for mkl?
Here is the Fortran code
Program test
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
integer, parameter :: dp = selected_real_kind(15, 307)
Real( dp ), Dimension( :, : ), Allocatable :: a
Real( dp ), Dimension( :, : ), Allocatable :: b
Real( dp ), Dimension( :, : ), Allocatable :: c
Integer :: na, nb, nc, m_iter
Integer :: numthreads
Integer( li ) :: start, finish, rate
real(dp) :: sum_time1, alpha, beta
Write( *, * ) 'number of threads'
Read( *, * ) numthreads
Write( *, * ) 'na, nb, nc?'
Read( *, * ) na, nb, nc
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc ) )
Allocate( c( 1:na, 1:nc ) )
Call Random_number( a )
Call Random_number( b )
c = 0.0_dp
alpha = 1.0_dp
beta = 0.0_dp
m_iter = 10
write (*,*) 'm_iter average', m_iter
!call omp_set_num_threads(1)
call openblas_set_num_threads(1)
sum_time1 = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nc, nb, alpha, a , na, b , nb, beta, c, nc )
Call System_clock( finish, rate )
sum_time1 = sum_time1 + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for dgemm-1', sum_time1 / m_iter
! call omp_set_num_threads(numthreads)
! call mkl_set_num_threads(numthreads)
call openblas_set_num_threads(numthreads)
sum_time1 = 0.0
do m = 1, m_iter
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nc, nb, alpha, a , na, b , nb, beta, c, nc )
Call System_clock( finish, rate )
sum_time1 = sum_time1 + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for dgemm-auto parallel', sum_time1 / m_iter
deallocate(a, b, c)
End
if I use gfortran-11 -O3 -fopenmp test.f90 -lopenblas
(suppose it is called test.f90
) and I got
number of threads
2
2
na, nb, nc?
6 100000 6
m_iter average 10
Time for dgemm-1 0.15203639000000002
Time for dgemm-auto parallel 0.20141369999999997
parallel one is slower than single processor one. For 1000*1000 matrix, there is an improvement of speed.
(edited for parallel setting for the first dgemm
)