
I have a std::vector<std::vector<double>> that I am trying to convert to a single contiguous vector as fast as possible. My vector has a shape of roughly 4000 x 50.

The problem is, sometimes I need my output vector in column-major contiguous order (just concatenating the interior vectors of my 2d input vector), and sometimes I need my output vector in row-major contiguous order, effectively requiring a transpose.

I have found that a naive for loop is quite fast for conversion to a column-major vector:

auto to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t i = 0; i < n_col; ++i)
        for (size_t j = 0; j < n_row; ++j)
            out_vec[i * n_row + j] = vec[i][j];
    return out_vec;
}
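
(Aside: since the column-major case is just a concatenation of the inner vectors, the inner loop can also be written as one bulk copy per inner vector, which compilers typically lower to a memcpy. A minimal sketch, assuming <algorithm> is included; the function name is just for illustration:)

auto to_dense_column_major_copy(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    // Each inner vector is already contiguous, so copy it wholesale
    // into its slot in the output.
    for (size_t i = 0; i < n_col; ++i)
        std::copy(vec[i].begin(), vec[i].end(), out_vec.begin() + i * n_row);
    return out_vec;
}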

But obviously a similar approach is very slow for row-wise conversion, because of all of the cache misses. So for row-wise conversion, I thought a blocking strategy to promote cache locality might be my best bet:

auto to_dense_row_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t block_side = 8;

    for (size_t l = 0; l < n_col; l += block_side) {
        for (size_t k = 0; k < n_row; k += block_side) {
            for (size_t j = l; j < l + block_side && j < n_col; ++j) {
                auto const &column = vec[j];
                for (size_t i = k; i < k + block_side && i < n_row; ++i)
                    out_vec[i * n_col + j] = column[i];
            }
        }
    }
    return out_vec;
}

This is considerably faster than a naive loop for row-major conversion, but still almost an order of magnitude slower than naive column-major looping on my input size.

[QuickBench chart comparing NaiveColumnMajor, NaiveRowMajor, BlockingRowMajor, and BlockingColumnMajor]

My question is, is there a faster approach to converting a (column-major) vector of vectors of doubles to a single contiguous row-major vector? I am struggling to reason about what the limit of speed of this code should be, and am thus questioning whether I'm missing something obvious. My assumption was that blocking would give me a much larger speedup than it actually appears to give.
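
For scale: 4000 × 50 doubles is 200,000 × 8 bytes ≈ 1.6 MB, so a transpose only has to read roughly 1.6 MB and write roughly 1.6 MB. That fits in many L3 caches, which suggests the conversion ought to be bandwidth-bound rather than compute-bound, but I don't know how to turn that into a concrete lower bound.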


The chart was generated using QuickBench (and somewhat verified with GBench locally on my machine) with the following code (Clang 7, C++20, -O3):

auto to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t i = 0; i < n_col; ++i)
        for (size_t j = 0; j < n_row; ++j)
            out_vec[i * n_row + j] = vec[i][j];
    return out_vec;
}

auto to_dense_row_major_naive(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    for (size_t i = 0; i < n_col; ++i)
        for (size_t j = 0; j < n_row; ++j)
            out_vec[j * n_col + i] = vec[i][j];
    return out_vec;
}

auto to_dense_row_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t block_side = 8;

    for (size_t l = 0; l < n_col; l += block_side) {
        for (size_t k = 0; k < n_row; k += block_side) {
            for (size_t j = l; j < l + block_side && j < n_col; ++j) {
                auto const &column = vec[j];
                for (size_t i = k; i < k + block_side && i < n_row; ++i)
                    out_vec[i * n_col + j] = column[i];
            }
        }
    }
    return out_vec;
}

auto to_dense_column_major_blocking(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t block_side = 8;

    for (size_t l = 0; l < n_col; l += block_side) {
        for (size_t k = 0; k < n_row; k += block_side) {
            for (size_t j = l; j < l + block_side && j < n_col; ++j) {
                auto const &column = vec[j];
                for (size_t i = k; i < k + block_side && i < n_row; ++i)
                    out_vec[j * n_row + i] = column[i];
            }
        }
    }
    return out_vec;
}

auto make_vecvec() -> std::vector<std::vector<double>>
{
  std::vector<std::vector<double>> vecvec(50, std::vector<double>(4000));
  std::mt19937 mersenne {2019};
  std::uniform_real_distribution<double> dist(-1000, 1000);
  for (auto &vec: vecvec)
   for (auto &val: vec)
       val = dist(mersenne);
  return vecvec;
}

static void NaiveColumnMajor(benchmark::State& state) {
  // Code before the loop is not measured

  auto vecvec = make_vecvec();
  for (auto _ : state) {
    benchmark::DoNotOptimize(to_dense_column_major_naive(vecvec));
  }
}
BENCHMARK(NaiveColumnMajor);

static void NaiveRowMajor(benchmark::State& state) {
  // Code before the loop is not measured

  auto vecvec = make_vecvec();
  for (auto _ : state) {
    benchmark::DoNotOptimize(to_dense_row_major_naive(vecvec));
  }
}
BENCHMARK(NaiveRowMajor);

static void BlockingRowMajor(benchmark::State& state) {
  // Code before the loop is not measured

  auto vecvec = make_vecvec();
  for (auto _ : state) {
    benchmark::DoNotOptimize(to_dense_row_major_blocking(vecvec));
  }
}
BENCHMARK(BlockingRowMajor);

static void BlockingColumnMajor(benchmark::State& state) {
  // Code before the loop is not measured

  auto vecvec = make_vecvec();
  for (auto _ : state) {
    benchmark::DoNotOptimize(to_dense_column_major_blocking(vecvec));
  }
}
BENCHMARK(BlockingColumnMajor);
Eric Hansen
  • Are you sure you get the right results, i.e. the right values in the right places? As I understand it, the row-major order should be the fast one. The formula for column-major order is n_col*j+i, where vec[i][j] accesses the element in the vector of vectors. – ead Mar 19 '19 at 04:01
  • If you are not averse to parallel programming then you could make use of multi-threaded execution; the problem seems to be quite suitable for a concurrent approach. – J.R. Mar 19 '19 at 04:03
  • @ead Hey there, sorry I should have mentioned that my original 2d vector is effectively in “column-major” order, since the inner vectors are contiguous. – Eric Hansen Mar 19 '19 at 10:07
  • @J.R. I have tried adding OpenMP parallelization (just a pragma around the exterior loop, sketched after these comments) and did not see improvements. You might be right that there is a factor I have to be wary of, though. – Eric Hansen Mar 19 '19 at 10:09
  • @EricHansen Ok, I see. This is a little bit confusing: seeing `vec[i][j]` one would assume `i` means row and not column. – ead Mar 19 '19 at 10:13
  • @ead totally agreed, it’s confusing in principle to have numeric data in a vector of vectors in the first place but unfortunately I can’t do anything about that :( – Eric Hansen Mar 19 '19 at 10:35
  • I would try the tiling (what you called "blocking") version, but using an AVX or AVX2 transpose from a question that covered precisely that: https://stackoverflow.com/q/25622745/179910 (a sketch of such a kernel follows these comments) – Jerry Coffin Mar 20 '19 at 13:32
  • @JerryCoffin Thanks Jerry. My assumption (possibly incorrect) is that AVX transposes rely upon having a contiguous data structure. I might be wrong though, I will look into it. – Eric Hansen Mar 20 '19 at 13:55
  • On my computer, doing the naive row_major with the outer loop over rows (so the output is written contiguously; that seems to be important) and the inner loop unrolled 4 times (sketched after these comments) is only 10% slower than the naive column major. – geza Mar 21 '19 at 19:17
  • Some confusing points in the question: describing the vectors as "`4000 x 50`" to me means a 4000-element vector containing 50-element vectors (4000 first, 50 second). I would describe your *vectors* as `50 x 4000`, even if they logically represent `4000 x 50` *matrices*. Another confusing point is that you swapped the roles of `i` and `j` in your code. In one function, `i` ranges over columns, while in another it ranges over rows. Not a big deal, but something you might want to keep in mind in the future. – JaMiT Mar 22 '19 at 03:17
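
A minimal sketch of the OpenMP variant mentioned in the comments (just a pragma around the exterior loop of the naive row-major conversion). This assumes compilation with -fopenmp and OpenMP 3.0 or later (which permits the unsigned loop index); the function name is illustrative only:

auto to_dense_row_major_openmp(std::vector<std::vector<double>> const & vec)
    -> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    // Each iteration of i touches a disjoint set of output elements,
    // so the threads need no synchronization.
    #pragma omp parallel for
    for (size_t i = 0; i < n_col; ++i)
        for (size_t j = 0; j < n_row; ++j)
            out_vec[j * n_col + i] = vec[i][j];
    return out_vec;
}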
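
And a sketch of the kind of AVX 4x4 double transpose kernel Jerry Coffin's link covers. Note it only needs four contiguous doubles at a time from each inner vector (which std::vector guarantees), not one contiguous block for the whole structure; the helper name is illustrative:

#include <immintrin.h>

// Transpose a 4x4 block of doubles held in four AVX registers.
// in0..in3 are the four rows; out0..out3 receive the four columns.
static inline void transpose_4x4_pd(__m256d in0, __m256d in1,
                                    __m256d in2, __m256d in3,
                                    __m256d &out0, __m256d &out1,
                                    __m256d &out2, __m256d &out3)
{
    __m256d t0 = _mm256_unpacklo_pd(in0, in1);   // a0 b0 a2 b2
    __m256d t1 = _mm256_unpackhi_pd(in0, in1);   // a1 b1 a3 b3
    __m256d t2 = _mm256_unpacklo_pd(in2, in3);   // c0 d0 c2 d2
    __m256d t3 = _mm256_unpackhi_pd(in2, in3);   // c1 d1 c3 d3
    out0 = _mm256_permute2f128_pd(t0, t2, 0x20); // a0 b0 c0 d0
    out1 = _mm256_permute2f128_pd(t1, t3, 0x20); // a1 b1 c1 d1
    out2 = _mm256_permute2f128_pd(t0, t2, 0x31); // a2 b2 c2 d2
    out3 = _mm256_permute2f128_pd(t1, t3, 0x31); // a3 b3 c3 d3
}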
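
Finally, a sketch of what geza describes: the naive row-major loop flipped so the outer loop walks output rows (keeping writes contiguous), with the inner loop manually unrolled four times. Here n_col, n_row, vec, and out_vec are as in the functions above:

for (size_t j = 0; j < n_row; ++j) {      // one output row at a time
    size_t i = 0;
    for (; i + 4 <= n_col; i += 4) {      // inner loop unrolled 4x
        out_vec[j * n_col + i]     = vec[i][j];
        out_vec[j * n_col + i + 1] = vec[i + 1][j];
        out_vec[j * n_col + i + 2] = vec[i + 2][j];
        out_vec[j * n_col + i + 3] = vec[i + 3][j];
    }
    for (; i < n_col; ++i)                // remainder when n_col % 4 != 0
        out_vec[j * n_col + i] = vec[i][j];
}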

2 Answers


First of all, I cringe whenever something is qualified as "obviously". That word is often used to cover up a shortcoming in one's deductions.

But obviously a similar approach is very slow for row-wise conversion, because of all of the cache misses.

I'm not sure which is supposed to be obvious: that the row-wise conversion would be slow, or that it's slow because of cache misses. In either case, I find it not obvious. After all, there are two caching considerations here, aren't there? One for reading and one for writing? Let's look at the code from the reading perspective:

row_major_naive

for (size_t i = 0; i < n_col; ++i)
    for (size_t j = 0; j < n_row; ++j)
        out_vec[j * n_col + i] = vec[i][j];

Successive reads from vec are reads of contiguous memory: vec[i][0] followed by vec[i][1], etc. Very good for caching. So... cache misses? Slow? :) Maybe not so obvious.

Still, there is something to be gleaned from this. The claim is wrong only in its "obviously". There are non-locality issues, but they occur on the writing end. (Successive writes are offset by the space for 50 double values.) And empirical testing confirms the slowness. So maybe the solution is to flip what is considered "obvious"?

row major flipped

for (size_t j = 0; j < n_row; ++j)
    for (size_t i = 0; i < n_col; ++i)
        out_vec[j * n_col + i] = vec[i][j];

All I did here was reverse the loops. Literally swap the order of those two lines of code and then adjust the indentation. Now successive reads are potentially all over the place, as they read from different vectors. However, successive writes are now to contiguous blocks of memory. In one sense, we are in the same situation as before. But just like before, one should measure performance before assuming "fast" or "slow".

NaiveColumnMajor: 3.4 seconds
NaiveRowMajor: 7.7 seconds
FlippedRowMajor: 4.2 seconds
BlockingRowMajor: 4.4 seconds
BlockingColumnMajor: 3.9 seconds

Still slower than the naive column major conversion. However, this approach is not only faster than naive row major, but it's also faster than blocking row major. At least on my computer (using gcc -O3 and obviously :P iterating thousands of times). Mileage may vary. I don't know what the fancy profiling tools would say. The point is that sometimes simpler is better.

For funsies I did a test where the dimensions are swapped (changing from 50 vectors of 4000 elements to 4000 vectors of 50 elements). All methods got hurt this way, but "NaiveRowMajor" took the biggest hit. Worth noting is that "flipped row major" fell behind the blocking version. So, as one might expect, the best tool for the job depends on what exactly the job is.

NaiveColumnMajor: 3.7 seconds
NaiveRowMajor: 16 seconds
FlippedRowMajor: 5.6 seconds
BlockingRowMajor: 4.9 seconds
BlockingColumnMajor: 4.5 seconds

(By the way, I also tried the flipping trick on the blocking version. The change was small -- around 0.2 seconds -- and opposite of flipping the naive version. That is, "flipped blocking" was slower than "blocking" for the question's 50-of-4000 vectors, but faster for my 4000-of-50 variant. Fine tuning might improve the results.)


Update: I did a little more testing with the flipping trick on the blocking version. This version has four loops, so "flipping" is not as straightforward as when there are only two loops. It looks like swapping the order of the outer two loops is bad for performance, while swapping the inner two loops is good. (Initially, I had done both and gotten mixed results.) When I swapped just the inner loops, I measured 3.8 seconds (and 4.1 seconds in the 4000-of-50 scenario), making this the best row-major option in my tests.

row major hybrid

for (size_t l = 0; l < n_col; l += block_side)
    for (size_t i = 0; i < n_row; ++i)
        for (size_t j = l; j < l + block_side && j < n_col; ++j)
            out_vec[i * n_col + j] = vec[j][i];

(After swapping the inner loops, I merged the middle loops.)
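
To spell out the intermediate step: swapping only the inner two loops of the original blocking version (before merging the middle loops) looks like this:

for (size_t l = 0; l < n_col; l += block_side)
    for (size_t k = 0; k < n_row; k += block_side)
        for (size_t i = k; i < k + block_side && i < n_row; ++i)
            for (size_t j = l; j < l + block_side && j < n_col; ++j)
                out_vec[i * n_col + j] = vec[j][i];

Since the i loop then covers each row range exactly once per l, the k and i loops collapse into the single middle loop shown above.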

As for the theory behind this, I would guess that this amounts to trying to write one cache block at a time. Once a block is written, try to re-use vectors (the vec[j]) before they get ejected from the cache. After you exhaust those source vectors, move on to a new group of source vectors, again writing full blocks at a time.

JaMiT
  • @EricHansen Please see the update -- I had missed a good combination. – JaMiT Mar 22 '19 at 17:01
  • I apologize for my use of the word “obviously”, and thanks for your help! – Eric Hansen Mar 22 '19 at 17:24
  • @EricHansen Actually, I would advise you to keep using "obviously" as you have been. Just be aware of when you use it, so that you can jump right to _"Hold on -- if I'm calling it 'obvious' I must be missing something."_ Very handy analysis technique. ;) – JaMiT Mar 22 '19 at 20:39

I have just added parallel versions of the two naive functions, using Microsoft's PPL (concurrency::parallel_for from <ppl.h>):

#include <ppl.h>

auto ppl_to_dense_column_major_naive(std::vector<std::vector<double>> const & vec)
-> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);

    size_t vecLen = out_vec.size();
    concurrency::parallel_for(size_t(0), vecLen, [&](size_t i)
    {
        size_t row = i / n_row;
        size_t column = i % n_row;

        out_vec[i] = vec[row][column];
    });

    return out_vec;
}

auto ppl_to_dense_row_major_naive(std::vector<std::vector<double>> const & vec)
-> std::vector<double>
{
    auto n_col = vec.size();
    auto n_row = vec[0].size();
    std::vector<double> out_vec(n_col * n_row);
    size_t vecLen = out_vec.size();


    concurrency::parallel_for(size_t(0), vecLen, [&](size_t i)
    {
        size_t column = i / n_col;
        size_t row = i % n_col;

        out_vec[i] = vec[row][column];
    });

    return out_vec;
}

and additional benchmarking code for all of them:

#include <chrono>
#include <iostream>
#include <string>
#include <typeinfo>
#include <utility>

template< class Fn, class ... Args >
auto callFncWithPerformance( std::string strFnName,  Fn toCall, Args&& ...args )
{
    auto start = std::chrono::high_resolution_clock::now();
    auto toRet = toCall( std::forward<Args>(args)... );
    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;

    std::cout << strFnName << ": " << diff.count() << " s" << std::endl;

    return toRet;
}

template< class Fn, class ... Args >
auto second_callFncWithPerformance(Fn toCall, Args&& ...args)
{
    std::string strFnName(typeid(toCall).name());

    auto start = std::chrono::high_resolution_clock::now();
    auto toRet = toCall(std::forward<Args>(args)...);
    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;

    std::cout << strFnName << ": " << diff.count() << " s";

    return toRet;
}


#define MAKEVEC( FN, ... ) callFncWithPerformance( std::string( #FN ) , FN  , __VA_ARGS__ )


int main()
{
    //prepare vector
    auto vec = make_vecvec();

    std::vector< double > vecs[]
    {
        std::vector<double>(MAKEVEC(to_dense_column_major_naive, vec)),
        std::vector<double>(MAKEVEC(to_dense_row_major_naive, vec)),
        std::vector<double>(MAKEVEC(ppl_to_dense_column_major_naive, vec)),
        std::vector<double>(MAKEVEC(ppl_to_dense_row_major_naive, vec)),
        std::vector<double>(MAKEVEC(to_dense_row_major_blocking, vec)),
        std::vector<double>(MAKEVEC(to_dense_column_major_blocking, vec)),
    };

    //system("pause");

    return 0;
}

and here are the results:

Debug x64

to_dense_column_major_naive: 0.166859 s
to_dense_row_major_naive: 0.192488 s
ppl_to_dense_column_major_naive: 0.0557423 s
ppl_to_dense_row_major_naive: 0.0514017 s
to_dense_column_major_blocking: 0.118465 s
to_dense_row_major_blocking: 0.117732 s

Debug x86

to_dense_column_major_naive: 0.15242 s
to_dense_row_major_naive: 0.158746 s
ppl_to_dense_column_major_naive: 0.0534966 s
ppl_to_dense_row_major_naive: 0.0484076 s
to_dense_column_major_blocking: 0.111217 s
to_dense_row_major_blocking: 0.107727 s

Release x64

to_dense_column_major_naive: 0.000874 s
to_dense_row_major_naive: 0.0011973 s
ppl_to_dense_column_major_naive: 0.0054639 s
ppl_to_dense_row_major_naive: 0.0012034 s
to_dense_column_major_blocking: 0.0008023 s
to_dense_row_major_blocking: 0.0010282 s

Release x86

to_dense_column_major_naive: 0.0007156 s
to_dense_row_major_naive: 0.0012538 s
ppl_to_dense_column_major_naive: 0.0053351 s
ppl_to_dense_row_major_naive: 0.0013022 s
to_dense_column_major_blocking: 0.0008761 s
to_dense_row_major_blocking: 0.0012404 s

You are quite right that this data set is too small for parallelizing to pay off, and the work per element is too small as well. Still, I am posting these functions so that someone else may use them as a reference.

Dahn Park