
I'm creating an int (32-bit) vector with 1024 * 1024 * 1024 elements like so:

std::vector<int> nums;
for (size_t i = 0; i < 1024 * 1024 * 1024; i++) {
   nums.push_back(rand() % 1024);
}

which holds 4 GB of random data at that point. And then I'm simply summing up all the elements in the vector like so:

uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn++) {
   total += *cn;
}

This takes about 0.18 seconds, which means the data is processed at around 22.2 GB/s. I'm running this on an M1, which has a much higher memory bandwidth of about 60 GB/s. Is there a way to make the above code run faster on a single core?

EDIT: Manual SIMD version:

int32x4_t simd_total = vmovq_n_s32(0);
for (auto cn = nums.begin(); cn < nums.end() - 3; cn += 4) {
    const int32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
    simd_total = vaddq_s32(simd_total, vld1q_s32(v));
}
return vaddvq_s32(simd_total);

The SIMD version has the same performance as the non-manual-SIMD version.

EDIT 2: Alright, so I changed the vector elements to uint32_t and also changed the result type to uint32_t (as suggested by @Peter Cordes):

uint32_t sum_ints_32(const std::vector<uint32_t>& nums) {
    uint32_t total = 0;
    for (auto cn = nums.begin(); cn < nums.end(); cn++) {
        total += *cn;
    }
    return total;
}

This runs much faster (~45 GB/s). This is the disassembly:

0000000100002218 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
   100002218:   a940200c    ldp x12, x8, [x0]
   10000221c:   eb08019f    cmp x12, x8
   100002220:   54000102    b.cs    100002240 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28>  // b.hs, b.nlast
   100002224:   aa2c03e9    mvn x9, x12
   100002228:   8b090109    add x9, x8, x9
   10000222c:   f1006d3f    cmp x9, #0x1b
   100002230:   540000c8    b.hi    100002248 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30>  // b.pmore
   100002234:   52800000    mov w0, #0x0                    // #0
   100002238:   aa0c03e9    mov x9, x12
   10000223c:   14000016    b   100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c>
   100002240:   52800000    mov w0, #0x0                    // #0
   100002244:   d65f03c0    ret
   100002248:   d342fd29    lsr x9, x9, #2
   10000224c:   9100052a    add x10, x9, #0x1
   100002250:   927ded4b    and x11, x10, #0x7ffffffffffffff8
   100002254:   8b0b0989    add x9, x12, x11, lsl #2
   100002258:   9100418c    add x12, x12, #0x10
   10000225c:   6f00e400    movi    v0.2d, #0x0
   100002260:   aa0b03ed    mov x13, x11
   100002264:   6f00e401    movi    v1.2d, #0x0
   100002268:   ad7f8d82    ldp q2, q3, [x12, #-16]
   10000226c:   4ea08440    add v0.4s, v2.4s, v0.4s
   100002270:   4ea18461    add v1.4s, v3.4s, v1.4s
   100002274:   9100818c    add x12, x12, #0x20
   100002278:   f10021ad    subs    x13, x13, #0x8
   10000227c:   54ffff61    b.ne    100002268 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x50>  // b.any
   100002280:   4ea08420    add v0.4s, v1.4s, v0.4s
   100002284:   4eb1b800    addv    s0, v0.4s
   100002288:   1e260000    fmov    w0, s0
   10000228c:   eb0b015f    cmp x10, x11
   100002290:   540000a0    b.eq    1000022a4 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x8c>  // b.none
   100002294:   b840452a    ldr w10, [x9], #4
   100002298:   0b000140    add w0, w10, w0
   10000229c:   eb08013f    cmp x9, x8
   1000022a0:   54ffffa3    b.cc    100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c>  // b.lo, b.ul, b.last
   1000022a4:   d65f03c0    ret

I also rewrote the Manual-SIMD version:

uint32_t sum_ints_simd_2(const std::vector<uint32_t>& nums) {
    uint32x4_t  simd_total = vmovq_n_u32(0);
    for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) {
        const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
        simd_total = vaddq_u32(simd_total, vld1q_u32(v));
    }
    return vaddvq_u32(simd_total);
}

which still runs 2x slower than the non-manual-SIMD version and results in the following disassembly:

0000000100002464 <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
   100002464:   a9402408    ldp x8, x9, [x0]
   100002468:   d1003129    sub x9, x9, #0xc
   10000246c:   6f00e400    movi    v0.2d, #0x0
   100002470:   eb09011f    cmp x8, x9
   100002474:   540000c2    b.cs    10000248c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28>  // b.hs, b.nlast
   100002478:   6f00e400    movi    v0.2d, #0x0
   10000247c:   3cc10501    ldr q1, [x8], #16
   100002480:   4ea08420    add v0.4s, v1.4s, v0.4s
   100002484:   eb09011f    cmp x8, x9
   100002488:   54ffffa3    b.cc    10000247c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x18>  // b.lo, b.ul, b.last
   10000248c:   4eb1b800    addv    s0, v0.4s
   100002490:   1e260000    fmov    w0, s0
   100002494:   d65f03c0    ret

To reach the same speed as the auto-vectorized version, we can use a uint32x4x2_t instead of a uint32x4_t for our manual-SIMD version:

uint32_t sum_ints_simd_3(const std::vector<uint32_t>& nums) {
    uint32x4x2_t simd_total;
    simd_total.val[0] = vmovq_n_u32(0);
    simd_total.val[1] = vmovq_n_u32(0);
    for (auto cn = nums.begin(); cn < nums.end()-7; cn +=8) {
        const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
        const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
        simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
        simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
    }
    return vaddvq_u32(simd_total.val[0]) + vaddvq_u32(simd_total.val[1]);
}

And to gain even more speed, we can leverage uint32x4x4_t (which gets us about 53 GB/s):

uint32_t sum_ints_simd_4(const std::vector<uint32_t>& nums) {
    uint32x4x4_t simd_total;
    simd_total.val[0] = vmovq_n_u32(0);
    simd_total.val[1] = vmovq_n_u32(0);
    simd_total.val[2] = vmovq_n_u32(0);
    simd_total.val[3] = vmovq_n_u32(0);
    for (auto cn = nums.begin(); cn < nums.end()-15; cn +=16) {
        const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
        const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
        const uint32_t v3[4] = { cn[8], cn[9], cn[10], cn[11] };
        const uint32_t v4[4] = { cn[12], cn[13], cn[14], cn[15] };
        simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
        simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
        simd_total.val[2] = vaddq_u32(simd_total.val[2], vld1q_u32(v3));
        simd_total.val[3] = vaddq_u32(simd_total.val[3], vld1q_u32(v4));
    }
    return vaddvq_u32(simd_total.val[0])
        + vaddvq_u32(simd_total.val[1])
        + vaddvq_u32(simd_total.val[2])
        + vaddvq_u32(simd_total.val[3]);
}

which gets us the following disassembly:

0000000100005e34 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
   100005e34:   a9402408    ldp x8, x9, [x0]
   100005e38:   d100f129    sub x9, x9, #0x3c
   100005e3c:   6f00e403    movi    v3.2d, #0x0
   100005e40:   6f00e402    movi    v2.2d, #0x0
   100005e44:   6f00e401    movi    v1.2d, #0x0
   100005e48:   6f00e400    movi    v0.2d, #0x0
   100005e4c:   eb09011f    cmp x8, x9
   100005e50:   540001c2    b.cs    100005e88 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x54>  // b.hs, b.nlast
   100005e54:   6f00e400    movi    v0.2d, #0x0
   100005e58:   6f00e401    movi    v1.2d, #0x0
   100005e5c:   6f00e402    movi    v2.2d, #0x0
   100005e60:   6f00e403    movi    v3.2d, #0x0
   100005e64:   ad401504    ldp q4, q5, [x8]
   100005e68:   ad411d06    ldp q6, q7, [x8, #32]
   100005e6c:   4ea38483    add v3.4s, v4.4s, v3.4s
   100005e70:   4ea284a2    add v2.4s, v5.4s, v2.4s
   100005e74:   4ea184c1    add v1.4s, v6.4s, v1.4s
   100005e78:   4ea084e0    add v0.4s, v7.4s, v0.4s
   100005e7c:   91010108    add x8, x8, #0x40
   100005e80:   eb09011f    cmp x8, x9
   100005e84:   54ffff03    b.cc    100005e64 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30>  // b.lo, b.ul, b.last
   100005e88:   4eb1b863    addv    s3, v3.4s
   100005e8c:   1e260068    fmov    w8, s3
   100005e90:   4eb1b842    addv    s2, v2.4s
   100005e94:   1e260049    fmov    w9, s2
   100005e98:   0b080128    add w8, w9, w8
   100005e9c:   4eb1b821    addv    s1, v1.4s
   100005ea0:   1e260029    fmov    w9, s1
   100005ea4:   0b090108    add w8, w8, w9
   100005ea8:   4eb1b800    addv    s0, v0.4s
   100005eac:   1e260009    fmov    w9, s0
   100005eb0:   0b090100    add w0, w8, w9
   100005eb4:   d65f03c0    ret

Crazy stuff

user2403221
  • You're probably limited more by latency than bandwidth, it sounds. So: not with standard C++ – Mooing Duck Jun 14 '21 at 16:46
  • You have to remember that other tasks and hardware items need to share the data bus and address bus. A CPU fetch may have to wait while other devices are using the data and address buses. – Thomas Matthews Jun 14 '21 at 16:46
  • What compiler are you using? – Mooing Duck Jun 14 '21 at 16:48
  • You could try unrolling your `for` loop. Processors don't like branch statements, so the more branches you eliminate, the happier the processor will be. For example, you could have 4, 16 or more additions in the loop before branching around again. Some compilers may perform this at higher optimization levels. – Thomas Matthews Jun 14 '21 at 16:49
  • GCC has `__builtin_prefetch` for this. I can't find anything for MSVC – Mooing Duck Jun 14 '21 at 16:51
  • Another technique is to use multiple registers. For example, have 4 variables, and read them. Then sum using the 4 variables. This will help the cache access because 4 accesses will be right next to each other without a branch statement. – Thomas Matthews Jun 14 '21 at 16:51
  • `auto total = std::reduce(std::execution::par_unseq, nums.cbegin(), nums.cend(), uint64_t(0));` – EOF Jun 14 '21 at 16:56
  • @MooingDuck I'm using clang version 12.0.5 and I'm compiling with the -O3 optimization flag – user2403221 Jun 14 '21 at 17:12
  • Can you go faster if you use `uint32_t` sums, so the compiler doesn't have to widen each element before adding? That means each SIMD instruction can only handle half as much data from memory as with same-sized accumulators. – Peter Cordes Jun 14 '21 at 17:42
  • @PeterCordes thanks a lot for your ideas, summing int32_t instead of int64_t values does indeed boost performance quite a bit. It now reaches about 45 GB/s. I also tried to use neon intrinsics, which oddly enough reverted back to the original 22GB/s speed: ```` int32x4_t simd_total = vmovq_n_s32(0); for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) { const int32_t v[4] = {cn[0], cn[1], cn[2], cn[3]} simd_total = vaddq_s32(simd_total, vld1q_s32(v)); } return vaddvq_s32(simd_total); ```` – user2403221 Jun 14 '21 at 18:01
  • (I reposted that as an answer before you commented, including a paragraph about how mainline (not Apple) clang auto-vectorizes. can you repost your comment there to move the discussion there?) – Peter Cordes Jun 14 '21 at 18:03
  • whoops my bad, I added the code with correct formatting as an edit to my question – user2403221 Jun 14 '21 at 18:03
  • If you're going to include that code in the question, don't forget to say what results you got from it :P Did you check the asm to see if clang optimized away that `const int32_t v[4]` and actually loaded directly from the array? Although I'd expect it to have slowed down even more if it actually did scalar copies to stack memory. But it may not have unrolled your manual-vectorized loop for you, but with a `uint32_t` sum it would likely unroll as well as auto-vectorize the simple code. – Peter Cordes Jun 14 '21 at 18:04
  • @Peter Cordes, yes you're right, I wanted to stay consistent but the first version already auto-vectorizes, so it's not really accurate. I mean the non-manual-SIMD version ;) Btw, using a uint32x4x4_t manual SIMD version is even faster than the auto-vectorized version (~53 GB/s) – user2403221 Jun 14 '21 at 19:57

3 Answers


Does -march=native help? IDK if there are any SIMD features that Apple clang won't already take advantage of on the first generation of AArch64 macOS CPUs, but clang might just be targeting baseline AArch64 in general.

Can you go faster if you use uint32_t sums, so the compiler doesn't have to widen each element before adding? That means each SIMD instruction can only handle half as much data from memory as with same-sized accumulators.
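
To make that concrete, here is a rough NEON sketch of why 64-bit accumulators cost more per element. This is my illustration, not the compiler's actual output; the function name sum_widening and the multiple-of-4 element count are assumptions of the sketch:

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Illustrative only: summing int32 elements into 64-bit lanes takes two
// widening adds per 16-byte vector load, i.e. twice the vector-ALU work
// per byte of memory compared to 32-bit accumulators.
int64_t sum_widening(const int32_t* p, std::size_t n) {  // n assumed to be a multiple of 4
    int64x2_t acc_lo = vdupq_n_s64(0);
    int64x2_t acc_hi = vdupq_n_s64(0);
    for (std::size_t i = 0; i < n; i += 4) {
        int32x4_t v = vld1q_s32(p + i);                  // load 4 x int32
        acc_lo = vaddw_s32(acc_lo, vget_low_s32(v));     // widen lanes 0-1 and add
        acc_hi = vaddw_s32(acc_hi, vget_high_s32(v));    // widen lanes 2-3 and add
    }
    return vaddvq_s64(vaddq_s64(acc_lo, acc_hi));        // horizontal sum
}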

https://godbolt.org/z/7c19913jE shows that Thomas Matthews' unrolling suggestion does actually get clang11 -O3 -march=apple-a13 to unroll the SIMD-vectorized asm loops it makes. That source change is not a win in general, e.g. much worse for x86-64 clang -O3 -march=haswell, but it does help here.


Another possibility is that a single core can't saturate memory bandwidth. But benchmark results published by Anandtech, for example, seem to rule that out: they found that even a single core can achieve 59 GB/s, although that was probably running an optimized memcpy function.

(They say "The fact that a single Firestorm core can almost saturate the memory controllers is astounding and something we’ve never seen in a design before." That sounds a bit weird; desktop/laptop Intel CPUs come pretty close, unlike their "server" chips. Maybe not as close as Apple?)

M1 has pretty low memory latency compared to modern x86, so that probably helps a single core be able to track the incoming loads to keep the necessary latency x bandwidth product in flight, even with its high memory bandwidth.
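
(For a rough sense of scale, using illustrative numbers rather than measurements: sustaining 60 GB/s with, say, 100 ns of DRAM latency means keeping roughly 60 GB/s × 100 ns ≈ 6 KB of loads in flight at any moment, i.e. dozens of outstanding cache lines. The lower the latency, the fewer outstanding requests a single core's load and prefetch machinery has to track to reach the same bandwidth.)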

Peter Cordes
  • Ok let's move the discussion here, I updated the question again. – user2403221 Jun 14 '21 at 19:21
  • @user2403221: You talk about "the non-SIMD version" in your edit. But actually that's the auto-vectorized version! Look at the asm: `ldp` loading two 16-byte q registers, for two `add v0.4s ...` instructions for the uint32_t loop, vs. just one per iteration for your manual loop. (With the copy to a local array optimized away, instead doing a vector load from the `std::vector`, otherwise it'd be much slower). – Peter Cordes Jun 14 '21 at 19:54
  • Yea that wasn't clear, I edited again! Thanks a lot, I think that's all the speed we can get for today! – user2403221 Jun 14 '21 at 20:19
  • Might be worth adding: [Optimizing AMD Opteron Memory Bandwidth](http://sites.utexas.edu/jdm4372/2010/11/) is a good read. The extra parallelism from DRAM page interleaving has some significant effects for reads, where the memory channel runs at a higher frequency than any single bank can produce on newer DRAMs. – Noah Jun 15 '21 at 14:12
  • @Noah: Reposting here a link you shared: [Apple M1 microarchitecture reverse engineering (PDF)](https://drive.google.com/file/d/1WrMYCZMnhsGP4o3H33ioAUKL_bjuJSPt/view) by Maynard Handley. Includes some details on experiments done to figure out how things worked. And some good general computer-architecture stuff. ([reddit thread](https://www.reddit.com/r/hardware/comments/po7hi6/maynard_handleys_m1_exploration/) where someone linked it with credit to Maynard for the majority of the work, plus contributions from various others including Travis Downs (BeeOnRope), Dougall J, Andrei Frumusanu.) – Peter Cordes Nov 21 '21 at 16:54

Here are some techniques.

Loop Unrolling

uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
    total += cn[0];
    total += cn[1];
    total += cn[2];
    total += cn[3];
}

Register Prefetch

uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
    const uint64_t n0 = cn[0];
    const uint64_t n1 = cn[1];
    const uint64_t n2 = cn[2];
    const uint64_t n3 = cn[3];
    total += n0;
    total += n1;
    total += n2;
    total += n3;
}
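
A third variant worth comparing is unrolling with independent accumulators, so the scalar additions don't all feed one serial dependency chain. This is only a sketch added for illustration (it assumes nums is the std::vector<int> from the question and hasn't been benchmarked here):

// Sketch only: independent partial sums; the "end - cn >= 4" test also avoids
// ever computing nums.end() - 3 on very small vectors.
uint64_t total0 = 0, total1 = 0, total2 = 0, total3 = 0;
auto cn = nums.begin();
for (; nums.end() - cn >= 4; cn += 4)
{
    total0 += cn[0];
    total1 += cn[1];
    total2 += cn[2];
    total3 += cn[3];
}
uint64_t total = total0 + total1 + total2 + total3;
for (; cn != nums.end(); ++cn)   // scalar tail for the last 0-3 elements
    total += *cn;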

You should print the assembly language for each of these at a high optimization level and compare them.

Also, your processor may have some specialized instructions that you could use. For example, the ARM processor can load multiple registers from memory with one instruction.

Also, look up SIMD instructions or search the internet for "C++ SIMD read memory".

I've argued with compilers (on embedded systems) and found out that the compiler's optimization strategies may be better than or equal to instruction specialization or other techniques (timings were performed using test points and an oscilloscope).

You'll have to remember that your task, on a one-core machine, will most likely be swapped out more often than on a system with multiple cores or a specialized (embedded) system.

Thomas Matthews
  • Not my DV, but your "register prefetch" version should compile to the same asm as the first version with modern C++ compilers. (And does with `clang -O3 -mcpu=apple-a13` https://godbolt.org/z/7c19913jE, presumably similar to Apple clang on MacOS on M1). If it didn't, it would be a missed optimization for whichever version isn't optimal. (And in practice, this is the kind of optimization compilers are already good at; they already compile your C++ source into an SSA form, where it doesn't matter whether the value had a C++ variable name or not.) – Peter Cordes Jun 14 '21 at 17:22
  • Doing loads early can be useful if your loop contains assignment through a pointer: that can save the compiler having to check for aliasing to maintain the exact C++ semantics if you re-read what you just stored. But here you're not taking the address of `n0..3` so they will fully optimize away pretty easily given the usual design of compiler internals. Interestingly, though, clang didn't unroll the original source for you when auto-vectorizing. If it wasn't for using a wider sum it probably would have, though. clang likes to unroll, at least for x86. Maybe not AArch64. – Peter Cordes Jun 14 '21 at 17:25
  • Note that scalar unrolling by hand isn't always a good thing! With this same code for x86 with clang, the unrolled sources defeat auto-vectorization with SSE2 (where sign-extension to 64-bit vector elements is a pain). https://godbolt.org/z/oo31sYYeh shows clang auto-vectorizing (and unrolling) the simple loop, but only using scalar (unrolled by 4) for your loops. Or with AVX2 available, https://godbolt.org/z/TGP6sxj6E, doing n0..3 as the elements of one vector, and horizontal summing that inside the loop!! vs. much better asm for the simple source, keeping 4 vector accumulators. – Peter Cordes Jun 14 '21 at 17:32
  • It can be helpful to unroll in the source using multiple accumulators (separate `total0` ... `total3` variables). But usually only for floating-point, where the compiler can't do that for you (without `-ffast-math`, or at least `-fassociative-math` and some other options.) But that's usually not a factor with integer because it's associative so compilers can invent more vector accumulators to hide SIMD integer add latency if that's useful. – Peter Cordes Jun 14 '21 at 17:36
  • And BTW, your code has a correctness problem: you need `cn < nums.end() - 3` to make sure `cn[3]` doesn't read past the end. But of course it would be UB to evaluate `nums.end() - 3` in C++, and avoiding that is a pain in the ass. (Even though in practice you will run your code on systems where the 0 page isn't mapped, so `ptr - 3` will never wrap to a high unsigned address, if you want to follow strict C++ rules you could do `if( size >= 4) for()...` or something annoying like that, or even a do/while to make sure the compiler doesn't do 2 redundant checks before entering the loop.) – Peter Cordes Jun 14 '21 at 17:40
  • I copied the OP's `for` loop without thinking about that. Normally, I go by element quantity and index. – Thomas Matthews Jun 14 '21 at 17:43
  • HI @ThomasMatthews thanks a lot for your inputs as well! It looks as if the compiler is already applying these optimizations on its own (I tested it just now). – user2403221 Jun 14 '21 at 18:10

Consider precalculating as much as you can and using built-in STL functions; this will get the code as close to optimal as possible before resorting to SIMD or assembly approaches. If it's still too slow, then try the SIMD/assembly versions:

Avoid calling push_back on unreserved std::vectors: this causes the system to allocate more space whenever the capacity limit is reached. Since you know the size of the array beforehand, reserve the space ahead of time (for non-built-in types, consider emplace_back as well).

Additionally, the STL functions can reduce the boilerplate code down to two function calls.

Also, avoid rand().

const std::size_t GB = 1024 * 1024 * 1024;
std::vector<int> nums(GB);   // 2^30 int elements = 4 GB of data, as in the question

std::mt19937 gen(std::random_device{}());
std::uniform_int_distribution<int> dist(0, 1023);
std::generate(std::begin(nums), std::end(nums), [&]() { return dist(gen); });

//...

// accumulate into a 64-bit total so the sum of 2^30 values can't overflow an int
const auto sum = std::accumulate(std::begin(nums), std::end(nums), uint64_t{0});
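
On C++17 and later, std::reduce from <numeric> is another option worth comparing (a sketch, not part of the original answer): unlike std::accumulate, it doesn't guarantee left-to-right evaluation, which can give the compiler more freedom to vectorize the reduction:

// C++17: reduction order is unspecified, so the compiler may use SIMD partial sums.
const auto sum2 = std::reduce(std::begin(nums), std::end(nums), uint64_t{0});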

Casey