CPU time discrepancies between static and dynamic allocation

Question

My objective is to investigate the CPU time discrepancies I observe between static and dynamic allocation, depending whether memory is accessed contiguously or not.

In order to make this investigation as sound as possible, I led it with both C++ and Fortran programs. Those are as simple as possible, the core part consists in computing one matrix multiplication from two randomly filled ones. Here is the C++ code:

#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>

using namespace std;

#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//

class matrix
{
private:
  int n_;
  int m_;
  double *data_;

public:

  matrix();
  ~matrix();

  double* operator[](int i);
  void resize(int n, int m);
  double& operator()(int i, int j);
  const double& operator()(int i, int j) const;
};

matrix::matrix() : n_(0), m_(0), data_(NULL)
{
  return;
}

matrix::~matrix()
{
  if (data_) delete[] data_;
  return;
}

void matrix::resize(int n, int m)
{
  if (data_) delete[] data_;
  n_ = n;
  m_ = m;
  data_ = new double[n_ * m_];
}

inline double& matrix::operator()(int i, int j)
{
  return *(data_ + i * m_ + j);
}

inline const double& matrix::operator()(int i, int j) const
{
  return *(data_ + i * m_ + j);
}
#endif

// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;


//
// Main program.
//


int main(int argc, char *argv[])
{
  cout << "optflag " << optflag;

#ifdef ALLOCATION_DYNAMIC
  int n = N;
  matrix cc1;
  matrix cc2;
  matrix cc3;
#else
  const int n = N;

  // It is necessary to specify the static keyword
  // because the default is "automatic", so that
  // data is entirely put on the stack which quickly
  // get overflowed with greater N values.
  static double cc1[N][N];
  static double cc2[N][N];
  static double cc3[N][N];
#endif

  cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
  cout << "dynamic";
  if (argc > 1)
    {
      istringstream iss(argv[1]);
      iss >> n;
    }

  cc1.resize(n, n);
  cc2.resize(n, n);
  cc3.resize(n, n);
#else
  cout << "static";
#endif
  cout << " N " << n << flush;

  // Init.
  string seed = SEED;
  std::seed_seq seed_sequence (seed.begin(), seed.end());

  // Standard, 64 bit based, Mersenne Twister random engine.
  std::mt19937_64 generator (seed_sequence);

  // Random number between [0, 1].
  std::uniform_real_distribution<double> random_unity(double(0), double(1));

  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
    {
#ifdef ALLOCATION_DYNAMIC
      cc1(i, j) = random_unity(generator);
      cc2(i, j) = random_unity(generator);
      cc3(i, j) = double(0);
#else
      cc1[i][j] = random_unity(generator);
      cc2[i][j] = random_unity(generator);
      cc3[i][j] = double(0);
#endif
    }

  clock_t cpu_begin = clock();
  auto wall_begin = std::chrono::high_resolution_clock::now();

  cout << " transpose ";
#ifdef TRANSPOSE
  cout << "yes";
  // Transpose.

  for (int i = 0; i < n; ++i)
    for (int j = 0; j < i; ++j)
      {
#ifdef ALLOCATION_DYNAMIC
        double tmp = cc2(i, j);
        cc2(i, j) = cc2(j, i);
        cc2(j, i) = tmp;
#else
        double tmp = cc2[i][j];
        cc2[i][j] = cc2[j][i];
        cc2[j][i] = tmp;
#endif
      }
#else
  cout << "no";
#endif
  cout << flush;

  // Work.
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < n; ++k)
        {
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
          cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
          cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
          cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
          cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
        }

  clock_t cpu_end = clock();
  auto wall_end = std::chrono::high_resolution_clock::now();

  double sum(0);
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      {
#ifdef ALLOCATION_DYNAMIC
        sum += cc3(i, j);
#else
        sum += cc3[i][j];
#endif
      }

  sum /= double(n * n);

  cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
       << " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
       << " sum " << setprecision(16) << sum << endl;

  return 0;
}

Here is the Fortran code :

program Test

#ifdef ALLOCATION_DYNAMIC
  integer :: n = N
  double precision, dimension(:,:), allocatable :: cc1
  double precision, dimension(:,:), allocatable :: cc2
  double precision, dimension(:,:), allocatable :: cc3
#else
  integer, parameter :: n = N
  double precision, dimension(n,n) :: cc1
  double precision, dimension(n,n) :: cc2
  double precision, dimension(n,n) :: cc3
#endif

  character(len = 5) :: optflag = OPTFLAG
  character(len = 8)  :: time = SEED

#ifdef ALLOCATION_DYNAMIC
  character(len = 10) :: arg
#endif

#ifdef TRANSPOSE
  double precision :: tmp
#endif

  double precision :: sum
  double precision :: cpu_start, cpu_end, wall_start, wall_end
  integer :: clock_reading, clock_rate, clock_max

  integer :: i, j, k, s
  double precision, dimension(2) :: harvest
  integer, dimension(:), allocatable :: seed

  write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
  write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
  write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
  if (iargc().gt.0) then
     call getarg(1, arg)
     read(arg, '(I8)') n  
  end if
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
  write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n

#ifdef ALLOCATION_DYNAMIC
  allocate(cc1(n, n))
  allocate(cc2(n, n))
  allocate(cc3(n, n))
#endif

  ! Init.
  call random_seed(size = s)
  allocate(seed(s))
  seed = 0
  read(time(1:2), '(I2)') seed(1)
  read(time(4:5), '(I2)') seed(2)
  read(time(7:8), '(I2)') seed(3)

  call random_seed(put = seed)
  deallocate(seed)

  do i = 1, n
     do j = 1, n
        call random_number(harvest)
        cc1(i, j) = harvest(1)
        cc2(i, j) = harvest(2)
        cc3(i, j) = dble(0)
     enddo
  enddo

  write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
  write(6, FMT = '(A)', ADVANCE = 'NO') "yes"

  ! Transpose.
  do j = 1, n
     do i = 1, j - 1
        tmp = cc1(i, j)
        cc1(i, j) = cc1(j, i)
        cc1(j, i) = tmp
     enddo
  enddo
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif

  call cpu_time(cpu_start)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_start = dble(clock_reading) / dble(clock_rate)

  ! Work.
  do j = 1, n
     do i = 1, n
        do k = 1, n
#ifdef TRANSPOSE
           cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
           cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
        enddo
     enddo
  enddo

  sum = dble(0)
  do j = 1, n
     do i = 1, n
        sum = sum + cc3(i, j)
     enddo
  enddo
  sum = sum / (n * n)

  call cpu_time(cpu_end)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_end = dble(clock_reading) / dble(clock_rate)

  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
  write(6, FMT = '(A,F23.16)') " sum ", sum

#ifdef ALLOCATION_DYNAMIC
  deallocate(cc1)
  deallocate(cc2)
  deallocate(cc3)
#endif

end program Test

I tried to make both programs as similar as possible, taking into account that C/C++ is row order major whereas Fortran is column order major.

Whenever possible, matrices are read contiguously, the exception is the matrix multiplication because when performing C = A x B, A is usually read by row whereas B is read by column.

Both programs can be compiled either by letting one of the matrix, A or B depending on the language, being accessed no sequentially, or by transposing matrix A or B so that it is then read contiguously during the matrix multiplication, which is achieved by passing the TRANSPOSE preprocess instruction.

The following lines give all the details for the compilation process ((GCC) 4.8.1 ) :

gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3    -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90

gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90

g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp

g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp

These four lines produce four programs in which A or B matrices are initially transposed.The N preprocess instruction initializes the default matrix size which has to be known at compile time when using static fields. That is to note all programs are compiled with the highest optimization degree (O3) I know so far.

I ran all generated programs for various matrix sizes, from 1000 to 5000. Results are plotted in the following figures, one for each case (transpose or not) :

Test with transpose

Test without transpose

The host system is

Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz

and the stack size is (ulimit -s) 10240.

For each point, I ran several times the same program until the standard deviation of CPU time becomes negligible compared to its average. Squares and circles stand respectively for Fortran and C++, red and green for dynamic or static.

In the transpose test, computation times are very close, especially the main difference comes from the language (Fortran vs. C++), the dynamic vs. static allocation makes nearly no difference. However, static allocation seems faster, especially for C++.

In the no-transpose test, computation times are significantly greater, which was expected as it is slower to access memory not sequentially, but the CPU time behaves differently than before:

there seem to have some "instabilities" between 1600 and 3400 matrix sizes,
the language makes no more differences,
the dynamic vs. static allocation makes one significant discrepancy whatever the language.

I would like to understand what happens in the no-transpose test:

Why moving from static to dynamic allocation makes the CPU time increase by an average of 50% (average on N) for both C++ and Fortran?
Are there ways to overcome this with some compiling options?
Why do we observe some kinds of instabilities compared to the smooth behavior of the transpose test? Indeed, there is a slight increase for some matrix sizes : 1600, 2400, 2800, 3200, 4000, 4800, which are all (except 2800) divisible by 8 (8 x 200, 300, 400, 500, 600). Do you see any reasons for that ?

Your help would be greatly appreciated as the team in which I work faces the same problem : a significant CPU time increase when they switched from static to dynamic allocation in a (much bigger) Fortran program.

I recommend watching [Modern C++: What You Need to Know - Herb Sutter](https://channel9.msdn.com/Events/Build/2014/2-661) to understand caching and prefetching. He has similar data and explains where it comes from. It mostly applies to Fortran as well since it is a hardware thing. — nwp, Jul 20 '15 at 10:58
For multiplying matrix, there are various ways to achieve high performance and reduce cache misses http://stackoverflow.com/q/21503882/995714 http://stackoverflow.com/q/4455645/995714 http://stackoverflow.com/q/16581737/995714 ... You should also consider using [`restrict`](https://en.wikipedia.org/wiki/Restrict) keyword in the C++ code in order for the compiler to optimize more http://stackoverflow.com/q/7298546/995714 — phuclv, Jul 20 '15 at 11:04
Thanks for your link about matrix multiplying. However, my main concern here is to understand discrepancies between static/dynamic and f90/c++. So I just need to perform one calculation which need to be very similar across langages, matrix multiplication seemed to me one good candidate. I will look into the restrict keyword, never used it before, it seems to deal with aliasing, which is new for me, but the optimization flag I used (O2) implies -fstrict-aliasing according to your link. — deb2014, Jul 20 '15 at 11:19
Why two downvotes to this question? It's well written and full of details. If you think it's a duplicate, let moderators (and the OP) know! If you take the answer for granted, maybe the OP doesn't. — Paolo M, Jul 20 '15 at 11:19
@PaoloM The question feels too broad to me. It requires detailed analysis of the data and hardware which we do not have and will result in an answer too specific for other users. But then again I did not downvote. — nwp, Jul 20 '15 at 11:26
@PaoloM I haven't -1d, but I can imagine reasons. There's no detail of code in the question (off-site references); question explicitly repeatedly asks about differences in dynamic/static memory but also says "I see no difference between f90-static and f90-dynamic" of the example given. — francescalus, Jul 20 '15 at 11:26
@francescalus Hello, for better reading I posted the code on codeviewer.org, the links works for me, don't they for you ? I tried to explain, maybe with some repetitions, how I went on doing this test, which is unable to reproduce the issue I mentioned at the beginning, but this test itself gives some strange behavior I would like to understand. I forgot to mention all programs were run on a centOS machine with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz, the stack size is 10240 (ulimit -s). — deb2014, Jul 20 '15 at 11:35
Well a starter for your analysis is the way you perform the operations... This is achieved using the exact same indexing in C++ and Fortran, neglecting the fact that C++ uses row-major, and Fortran uses column-major storage. This way, you will have strided access for writing in Fortran, and contiguous access in C++. My guess is that is is the reason for C++ being faster. — Alexander Vogt, Jul 20 '15 at 11:44
There are some intel tools that allow you to measure the cache misses of your program, possibly also on which memory addresses they occurred. I would expect the cache misses to be strongly correlated with the loss in performance. Also I would like to see your benchmark code. Maybe you dynamically allocate memory and then start the timing and the hardware is smart enough to know that newly allocated memory probably gets used soon and caches it better or something, but only an intel engineer would be able to answer that with confidence. — nwp, Jul 20 '15 at 11:45
BTW: Hager and Wellein examine exactly the same problem [in their book](http://www.amazon.de/dp/143981192X). They show differences due to implementations, effects due to caching on different machines, etc. for the matrix transpose and matrix-matrix multiplication. — Alexander Vogt, Jul 20 '15 at 11:48
@Alexander Vogt : point taken, I did not see this difference (striding/contiguous) between fortran/C++, I will correct this. As for the benchmark code, the CPU time is measured within the source code, only the matrix initialization and multiplication is included, then I cannot control exactly what the hardware does, which tool were you thinking of ? — deb2014, Jul 20 '15 at 11:54
If you can run it under linux, valgrind too has a tool for cache analysis. http://valgrind.org/docs/manual/cg-manual.html — Paolo M, Jul 20 '15 at 12:31
@Vladimir : compiler is g++/gfortran : (GCC) 4.8.1 and f90_compilation_option = ["-cpp", "-Wall", "-pedantic", "-fimplicit-none", "-O2"] cpp_compilation_option = [ "-Wall", "-pedantic", "-std=gnu++0x", "-O2"] — deb2014, Jul 20 '15 at 13:25

Stefan · Answer 1 · 2015-07-22T19:31:22.193

The most important part should be the knowledge, which is available to the compiler.

The static version has a fixed array size which can be used by the compiler to enforce better optimizations. E.g. the distance between the rows of your matrix is fixed (cc3(n,1) is next to cc3(1,2) in Fortran memory), while the dynamic array could have some padding (there could be an element cc3(n+1,1). Actually, taking a look at the output of -fopt-info-optimized we can see, that the loop at l.95 gets only optimized in the static case.

To check this, I modified your program to use linear arrays to represent the matrices. Timing the program, I got no significant time difference between static and dynamic allocation, and the 2d version with optimal loop order performed with the same speed.

program Test

#ifdef ALLOCATION_DYNAMIC
  integer :: n = N
  double precision, dimension(:), allocatable :: cc1
  double precision, dimension(:), allocatable :: cc2
  double precision, dimension(:), allocatable :: cc3
#else
  integer, parameter :: n = N
  double precision, dimension(n**2) :: cc1
  double precision, dimension(n**2) :: cc2
  double precision, dimension(n**2) :: cc3
#endif

  character(len = 5) :: optflag = OPTFLAG
  character(len = 8)  :: time = SEED

#ifdef ALLOCATION_DYNAMIC
  character(len = 10) :: arg
#endif

  double precision :: sum
  double precision :: cpu_start, cpu_end, wall_start, wall_end
  integer :: clock_reading, clock_rate, clock_max

  integer :: i, j, k, s
  double precision, dimension(2) :: harvest
  integer, dimension(:), allocatable :: seed

  write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
  write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
  write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
  if (iargc().gt.0) then
     call getarg(1, arg)
     read(arg, '(I8)') n  
  end if
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
  write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n

#ifdef ALLOCATION_DYNAMIC
  allocate(cc1(n**2))
  allocate(cc2(n**2))
  allocate(cc3(n**2))
#endif

  ! Init.
  call random_seed(size = s)
  allocate(seed(s))
  seed = 0
  read(time(1:2), '(I2)') seed(1)
  read(time(4:5), '(I2)') seed(2)
  read(time(7:8), '(I2)') seed(3)

  call random_seed(put = seed)
  deallocate(seed)

  do i = 1, n**2
    call random_number(harvest)
    cc1(i) = harvest(1)
    cc2(i) = harvest(2)
    cc3(i) = dble(0)
  enddo

  write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
  write(6, FMT = '(A)', ADVANCE = 'NO') "no"

  call cpu_time(cpu_start)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_start = dble(clock_reading) / dble(clock_rate)

  ! Work.
  do j = 1, n
     do i = 1, n
        do k = 1, n
           cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
        enddo
     enddo
  enddo

  sum = dble(0)
  do j = 1, n**2
        sum = sum + cc3(i)
  enddo
  sum = sum / (n * n)

  call cpu_time(cpu_end)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_end = dble(clock_reading) / dble(clock_rate)

  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
  write(6, FMT = '(A,F23.16)') " sum ", sum

#ifdef ALLOCATION_DYNAMIC
  deallocate(cc1)
  deallocate(cc2)
  deallocate(cc3)
#endif

end program Test

score 1 · Answer 2 · edited Jun 20 '20 at 09:12

Clearly, the CPU time discrepancy between static and dynamic allocation in the notranspose test is due to

  do j = 1, n
     do i = 1, n
        do k = 1, n
           cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
        enddo
     enddo
  enddo

for the Fortran 90 program and to

  // Work.
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < n; ++k)
        {
  #if defined(ALLOCATION_DYNAMIC)
          cc3(i, j) += cc1(i, k) * cc2(k, j);
  #else
          cc3[i][j] += cc1[i][k] * cc2[k][j];
  #endif
        }

for the C++ one.

The compiler is able to make deeper optimizations (-fopt-info-optimize) when using static allocation, in this case it outputs (both F90/C++):

Vectorizing loop at src/test.cpp:163

src/test.cpp:163: note: LOOP VECTORIZED.

src/test.cpp:163: note: OUTER LOOP VECTORIZED.

It does nothing for dynamic allocation, I am quite surprised of this, I do not understand why the compiler can optimize this non-contiguous memory access (cc3[k][j]) when statically allocated and not when dynamically allocated (cc3(k, j)).

Could be worth to try different compilers. Gfortrand and g++ share the same base. — Vladimir F Героям слава, Jul 23 '15 at 10:56
Yes, but it brings some precisions, anyway I moved these in my original post. — deb2014, Jul 23 '15 at 16:37
No problem, I didn't even notice you are the same person. It could have something to do with the cache size and pre-fetching. — Vladimir F Героям слава, Jul 23 '15 at 17:04

CPU time discrepancies between static and dynamic allocation

2 Answers2