I tried to extend the code in the answer of the above link, to include cross checks and openmp.
Program reshape_for_blas
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Real( wp ), Dimension( :, : ), Allocatable :: a
Real( wp ), Dimension( :, :, : ), Allocatable :: b
Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
Real( wp ), Dimension( :, : ), Allocatable :: d
Real( wp ), Dimension( :, : ), Allocatable :: e
Integer :: na, nb, nc, nd, ne
Integer :: la, lb, lc, ld
Integer( li ) :: start, finish, rate, numthreads
numthreads = 2
call omp_set_num_threads(numthreads)
Write( *, * ) 'na, nb, nc, nd ?'
Read( *, * ) na, nb, nc, nd
ne = nc * nd
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc, 1:nd ) )
Allocate( c1( 1:na, 1:nc, 1:nd ) )
Allocate( c2( 1:na, 1:nc, 1:nd ) )
Allocate( c3( 1:na, 1:nc, 1:nd ) )
Allocate( c4( 1:na, 1:nc, 1:nd ) )
Allocate( c5( 1:na, 1:nc, 1:nd ) )
Allocate( d ( 1:nb, 1:ne ) )
Allocate( e ( 1:na, 1:ne ) )
! Set up some data
Call Random_number( a )
Call Random_number( b )
! With reshapes
Call System_clock( start, rate )
!write (*,*) 'clock', start, rate
d = Reshape( b, Shape( d ) )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
d, Size( d, Dim = 1 ), &
0.0_wp, e, Size( e, Dim = 1 ) )
c1 = Reshape( e, Shape( c1 ) )
Call System_clock( finish, rate )
!write (*,*) 'clock', finish, rate
Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
! Direct
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c2, Size( c2, Dim = 1 ) )
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate
Call System_clock( start, rate )
!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate
!naive
Call System_clock( start, rate )
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate
!naive omp
Call System_clock( start, rate )
!$omp parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
!$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate
do la = 1, na
do lc = 1, nc
do ld = 1, nd
if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
endif
enddo
enddo
enddo
End Program reshape_for_blas
I got two issues:
- there is no significant speed up in neither BLAS or naive loops. E.g,., by
gfortran -std=f2008 -Wall -Wextra -fcheck=all reshape.f90 -lblas -fopenmp
, and input30 30 30 60
, I got
30 30 30 60
Time for reshaping method 2.9443999999999998E-003
Difference between result matrices 12.380937791257775
Time for straight method 1.0016000000000001E-003
Time for straight method omp 2.4878000000000001E-003
Time for loop 6.6072500000000006E-002
Time for loop omp 0.100242600000000002
- when the dimension get larger, e.g.,
60 60 60 60
in the input, the openmp BLAS result can get different value than naive loop, seems I miss some control option.
What would be the problems with OpenMP here?
Edit
I added omp lines in the initialization in the c5
section and commented out two printing lines,
Program reshape_for_blas
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Real( wp ), Dimension( :, : ), Allocatable :: a
Real( wp ), Dimension( :, :, : ), Allocatable :: b
Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
Real( wp ), Dimension( :, : ), Allocatable :: d
Real( wp ), Dimension( :, : ), Allocatable :: e
Integer :: na, nb, nc, nd, ne
Integer :: la, lb, lc, ld
Integer( li ) :: start, finish, rate, numthreads
numthreads = 2
call omp_set_num_threads(numthreads)
Write( *, * ) 'na, nb, nc, nd ?'
Read( *, * ) na, nb, nc, nd
ne = nc * nd
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc, 1:nd ) )
Allocate( c1( 1:na, 1:nc, 1:nd ) )
Allocate( c2( 1:na, 1:nc, 1:nd ) )
Allocate( c3( 1:na, 1:nc, 1:nd ) )
Allocate( c4( 1:na, 1:nc, 1:nd ) )
Allocate( c5( 1:na, 1:nc, 1:nd ) )
Allocate( d ( 1:nb, 1:ne ) )
Allocate( e ( 1:na, 1:ne ) )
! Set up some data
Call Random_number( a )
Call Random_number( b )
! With reshapes
Call System_clock( start, rate )
!write (*,*) 'clock', start, rate
d = Reshape( b, Shape( d ) )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
d, Size( d, Dim = 1 ), &
0.0_wp, e, Size( e, Dim = 1 ) )
c1 = Reshape( e, Shape( c1 ) )
Call System_clock( finish, rate )
!write (*,*) 'clock', finish, rate
Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
! Direct
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c2, Size( c2, Dim = 1 ) )
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate
!naive loop
Call System_clock( start, rate )
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate
!dgemm omp
Call System_clock( start, rate )
!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate
!loop omp
Call System_clock( start, rate )
!$omp parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
!$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate
!single core: c1 c2 c3
! c1 reshape blas
! c2 blas
! c3 naive loop (reference)
! parallel: c4 c5
! c4 dgemm parallel
! c5 naive loop parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
endif
enddo
enddo
enddo
End Program reshape_for_blas
then gfortran reshape.f90 -lblas -fopenmp
, and 30 30 30 30
input lead to
Time for reshaping method 1.3519000000000001E-003
Difference between result matrices 12.380937791257775
Time for straight method 6.2549999999999997E-004
Time for straight method omp 1.2600000000000001E-003
Time for naive loop 1.0008599999999999E-002
Time for naive loop omp 1.6678999999999999E-002
not good speed up though.