I am working on a large program that I have parallelized with OMP, and in several places for the parallelization I have to make a choice between using pointers or copies of arrays of data. I had an intuitive sense that the choice might have an impact on compute time, but I am uncertain. In the example program given below, I am unable to discern a difference.
compile with:
gfortran paralleltest.f90 -fopenmp
run with:
>>> ./a.out 8 10000000
Copies (s): 0.835768998
results: 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032
Pointers (s): 0.837329030
results: 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032 8415334.9722864032
Given this example I am wondering about some questions:
- in general would one expect to see no difference between pointers and copies used in this say?
- what circumstances, if any, would lead to significant differences between the two, assuming that the memory overhead of the copies was not an issue.
- am I measuring this performance difference in the right way? and if not, what is a better way to measure the performance?
Here is the fortran file:
! paralleltest.f90
module m_parallel_test
implicit none
type :: t_data
real, allocatable :: x(:)
end type t_data
type :: t_data_ptr
real, pointer :: x(:)
end type t_data_ptr
contains
real(kind=8) function aggregate_data(x) result(value)
real :: x(:)
integer :: i
do i=1,size(x)
value = value + cos(x(i))
end do
end function aggregate_data
subroutine associate_pointer(parent, ptr)
real, target :: parent(:)
real, pointer :: ptr(:)
ptr => parent
end subroutine associate_pointer
end module m_parallel_test
program main
use m_parallel_test
implicit none
character(len=32) :: arg
integer, parameter :: n_set = 8
integer :: n_threads, n_data, i
type(t_data) :: parent
type(t_data), allocatable :: copies(:)
type(t_data_ptr), allocatable :: pointers(:)
real(kind=8), allocatable :: copies_results(:)
real(kind=8), allocatable :: pointers_results(:)
real :: start_time, stop_time
call get_command_argument(1, arg)
read(arg, *) n_threads
call omp_set_num_threads(n_threads)
allocate(copies(n_set))
allocate(pointers(n_set))
allocate(copies_results(n_set))
allocate(pointers_results(n_set))
call get_command_argument(2, arg)
read(arg, *) n_data
allocate(parent%x(n_data))
call seed_rng(1)
call random_number(parent%x)
do i=1,n_set
copies(i)%x = parent%x
call associate_pointer(parent%x, pointers(i)%x)
end do
call cpu_time(start_time)
!$OMP PARALLEL DO
do i=1,n_set
copies_results(i) = aggregate_data(copies(i)%x)
end do
call cpu_time(stop_time)
print *, "Copies (s): ", stop_time - start_time
print *, "results: ", copies_results
call cpu_time(start_time)
!$OMP PARALLEL DO
do i=1,n_set
pointers_results(i) = aggregate_data(pointers(i)%x)
end do
call cpu_time(stop_time)
print *, "Pointers (s): ", stop_time - start_time
print *, "results: ", pointers_results
end program main
subroutine seed_rng(i)
integer, intent(in) :: i
integer, allocatable :: iseed(:)
integer :: n, j
call random_seed(size=n)
allocate(iseed(n))
iseed= [(i + j, j=1, n)]
call random_seed(put=iseed)
end subroutine seed_rng