Why is my template meta code slower than a for loop?

Question

I am trying to sum all elements in position n in a set of std::arrays. The value of the sum is stored in a std::array passed to my function add_rows. The summation is done with recursively "calling" the templated class method to sum the columns with decreasing index for column, until I hit column 0, and the the same for the next row until I hit row 0.

I also have a loop version which does the same and I compare the time it takes to execute both ways of computing the sum. I was expecting to see the templated version to perform better, but it output says is ~25 times slower. Is there anything wrong with the templated version which makes it slower?

Before starting this I got inspired by this article "Using Metaprograms to Unroll Loops"

Output from the program is:

Templated version took: 23 ns.
Loop version took:      0 ns.

Code:

#include <iostream>
#include <array>
#include <numeric>
#include <chrono>

template<size_t num_rows, size_t row_index, size_t num_columns, size_t column_index>
class sumRow;

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRow<num_rows, row_index, num_columns, 0>
{
 public:
  static inline int result(const std::array<std::array<int, num_rows>, num_columns>& arrays) noexcept
  {
    return arrays[0][row_index];
  }
};

template<size_t num_rows, size_t row_index, size_t num_columns, size_t column_index>
class sumRow
{
 public:
  static inline int result(const std::array<std::array<int, num_rows>, num_columns>& arrays) noexcept
  {
    return arrays[column_index][row_index] + sumRow<num_rows, row_index, num_columns, column_index - 1>::result(arrays);
  }
};

// Array of arrays

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRows;

template<size_t num_rows, size_t num_columns>
class sumRows<num_rows, 0, num_columns>
{
 public:
  static inline void result(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result) noexcept
  {
    result[0] = sumRow<num_rows, 0, num_columns, num_columns - 1>::result(arrays);
  }
};

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRows
{
 public:
  static inline void result(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result) noexcept
  {
    result[row_index - 1] = sumRow<num_rows, row_index - 1, num_columns, num_columns - 1>::result(arrays);
    sumRows<num_rows, row_index - 1, num_columns>::result(arrays, result);
  }
};

template<size_t num_rows, size_t num_columns>
inline void sum_rows(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result)
{
  sumRows<num_rows, num_rows, num_columns>::result(arrays, result);
};

template<size_t num_channels, size_t channel_size>
inline void loop_sum(const std::array<std::array<int, channel_size>, num_channels>& channels, std::array<int, channel_size>& results) noexcept
{
  for (size_t sample_index = 0; sample_index < channel_size; ++sample_index)
  {
    int result = 0;
    for (size_t channel_index = 0; channel_index < num_channels; ++channel_index)
    {
      result += channels[channel_index][sample_index];
    }
    results[sample_index] = result;
  }
};

// Inspired by from https://stackoverflow.com/a/21995693/2996272
struct measure_cpu_clock
{
  template<typename F, typename ...Args>
  static clock_t execution(F&& func, Args&&... args)
  {
    auto start = std::clock();
    std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
    return std::clock() - start;
  }
};

const int num_channels = 850;

const int num_samples = 32;
using channel = std::array<int, num_samples>;

int main()
{
  std::array<channel, num_channels> channels{};
  for (auto&& item : channels)
  {
    std::iota(item.begin(), item.end(), 1);
  }
  // Templated version
  channel results = {};

  auto execution_time = measure_cpu_clock::execution(sum_rows<num_samples, num_channels>, channels, results);
  std::cout << "Templated version took: " << execution_time << " ns." << std::endl;

  // Loop version
  channel results2 = {};

  execution_time = measure_cpu_clock::execution(loop_sum<num_channels, num_samples>, channels, results2);
  std::cout << "Loop version took:      " << execution_time << " ns." << std::endl;

}

Run the measurements multiple times (10,000-100,000) and take an average as well. Al sorts of things going on in the background can mess with your micro-benchmark. — NathanOliver, Mar 05 '18 at 13:42
Are you sure the compiler doesn't just optimize the loop version away completely? — Sebastian Redl, Mar 05 '18 at 13:47
Why do you expect the templated version to be faster? Check the assembly on godbolt, by the way. — Vittorio Romeo, Mar 05 '18 at 13:47
First, measurement error. Depending on implementation, resolution of `std::clock()` may be much lower than the measurement results you get. Make a loop that runs for a few seconds, measure the time before and after the loop and divide by number of iterations. Also, to prevent compiler from optimizing loop away you have to output the actual result, not only the execution time. — zett42, Mar 05 '18 at 14:06
I was expecting the templated version to be faster than my initial results because in a way the templates should expand to the statements that is needed, just as if I had written the summation row by row explicitly. Compiler optimizations and assemblers is something I would like to know more about, but for debugging this on Godbolt I would have to start with something more simple (maybe I should do that). — tuple_cat, Mar 05 '18 at 15:01
Your arrays contain `850*32 = 27200` data points, a CPU core can only handle less than five such data points per `ns`. Consequently, the measurement of `23ns` is pure and utter bullshit, you cannot hope to handle even a thousand data points during that time. — cmaster - reinstate monica, Apr 21 '18 at 09:09

score 1 · Accepted Answer · edited Apr 21 '18 at 08:23

After reading comments above I added a loop which performs the sum 10000 times with a print out after each iterations.

Then arrays to be summed are also initialized with random values before each iteration and now the time measurements show that the two methods are almost equal in speed (~15 clocks for both).

#include <iostream>
#include <array>
#include <numeric>
#include <chrono>
#include <functional>
#include <random>

template<size_t num_rows, size_t row_index, size_t num_columns, size_t column_index>
class sumRow;

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRow<num_rows, row_index, num_columns, 0>
{
 public:
  static inline int result(const std::array<std::array<int, num_rows>, num_columns>& arrays) noexcept
  {
    return arrays[0][row_index];
  }
};

template<size_t num_rows, size_t row_index, size_t num_columns, size_t column_index>
class sumRow
{
 public:
  static inline int result(const std::array<std::array<int, num_rows>, num_columns>& arrays) noexcept
  {
    return arrays[column_index][row_index] + sumRow<num_rows, row_index, num_columns, column_index - 1>::result(arrays);
  }
};

// Array of arrays

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRows;

template<size_t num_rows, size_t num_columns>
class sumRows<num_rows, 0, num_columns>
{
 public:
  static inline void result(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result) noexcept
  {
    result[0] = sumRow<num_rows, 0, num_columns, num_columns - 1>::result(arrays);
  }
};

template<size_t num_rows, size_t row_index, size_t num_columns>
class sumRows
{
 public:
  static inline void result(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result) noexcept
  {
    result[row_index - 1] = sumRow<num_rows, row_index - 1, num_columns, num_columns - 1>::result(arrays);
    sumRows<num_rows, row_index - 1, num_columns>::result(arrays, result);
  }
};

template<size_t num_rows, size_t num_columns>
inline void sum_rows(const std::array<std::array<int, num_rows>, num_columns>& arrays, std::array<int, num_rows>& result)
{
  sumRows<num_rows, num_rows, num_columns>::result(arrays, result);
};

template<size_t channel_size, size_t num_channels>
inline void loop_sum(const std::array<std::array<int, channel_size>, num_channels>& channels, std::array<int, channel_size>& results) noexcept
{
  for (size_t sample_index = 0; sample_index < channel_size; ++sample_index)
  {
    int result = 0;
    for (size_t channel_index = 0; channel_index < num_channels; ++channel_index)
    {
      result += channels[channel_index][sample_index];
    }
    results[sample_index] = result;
  }
};

// Inspired by from https://stackoverflow.com/a/21995693/2996272
struct measure_cpu_clock
{
  template<typename F, typename ...Args>
  static clock_t execution(F&& func, Args&&... args)
  {
    auto start = std::clock();
    std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
    return std::clock() - start;
  }
};

template<typename T>
T get_random_int(T min, T max)
{
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution <T> dis(min, max);
  return dis(gen);
}

template<size_t num_values>
void print_values(std::array<int, num_values>& array)
{
  for (auto&& item : array)
  {
    std::cout << item << "<";
  }
  std::cout << std::endl;
}

const int num_columns = 850;
const int num_rows = 32;
using channel = std::array<int, num_rows>;

using func = std::function<void(const std::array<std::array<int, num_rows>, num_columns>&, std::array<int, num_rows>&)>;

clock_t perform_many(const func& f)
{
  clock_t total_execution_time = 0;
  int num_iterations = 10000;
  for (int i = 0; i < num_iterations; ++i)
  {
    std::array<channel, num_columns> channels{};
    for (auto&& item : channels)
    {
      std::iota(item.begin(), item.end(), get_random_int(0, 3));
    }
    channel results = {};

    auto start = std::clock();
    f(channels, results);
    total_execution_time += (std::clock() - start);

    print_values(results);
  }
  return total_execution_time / num_iterations;
}

int main()
{
  // Templated version
  auto template_execution_time = perform_many(sum_rows<num_rows, num_columns>);

  auto loop_execution_time = perform_many(loop_sum<num_rows, num_columns>);

  std::cout << "Templated version took: " << template_execution_time << " clocks" << std::endl;
  std::cout << "Loop version took:      " << loop_execution_time << " clock" << std::endl;

}

Why is my template meta code slower than a for loop?

1 Answers1