
Consider the following C program.

#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>

static void do_stuff(void)
{
    const int n = 256;
    int *ar = malloc(n * sizeof(int));
    for (int i = 0; i < n; i++)
        ar[i] = random();
}

int main(void)
{
    do_stuff();

    __m256i sm = _mm256_setzero_si256();
    int sum = 0;
    int *vcadd = (int*)&sm;
    for (size_t l = 0; l < 8; l++)
        sum += vcadd[l];
    printf("sum = %d\n", sum);

    return 0;
}

I expected this program to print sum = 0, but when I compile it with gcc -mavx2 src.c -O2, it sometimes prints sum = 0, sometimes sum = 18.

When compiled with -O1 or -O0, the program works as expected. It also seems to work fine with -O2 when the do_stuff(); call is commented out.

Assembly generated for main with -O1 (plus my comments on what I think the instructions do):

main:
.LFB5513:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %r12
    pushq   %rbx
    andq    $-32, %rsp
    subq    $64, %rsp
    .cfi_offset 12, -24
    .cfi_offset 3, -32
    movq    %fs:40, %rax
    movq    %rax, 56(%rsp)
    xorl    %eax, %eax
    movl    $1024, %edi
    call    malloc@PLT
    movq    %rax, %rbx
    leaq    1024(%rax), %r12
.L2:
    call    random@PLT
    movl    %eax, (%rbx)
    addq    $4, %rbx
    cmpq    %r12, %rbx
    jne .L2
    vpxor   %xmm0, %xmm0, %xmm0 ; zero out %ymm0
    vmovdqa %ymm0, (%rsp)       ; store these zeros at %rsp
    movq    %rsp, %rax          ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
    leaq    32(%rsp), %rcx      ; ^
    movl    $0, %edx            ; ^
.L3:                            ; ^
    addl    (%rax), %edx        ; ^
    addq    $4, %rax            ; ^
    cmpq    %rcx, %rax          ; ^
    jne .L3                     ; ^
    leaq    .LC0(%rip), %rsi
    movl    $1, %edi
    movl    $0, %eax
    call    __printf_chk@PLT
    movq    56(%rsp), %rax
    subq    %fs:40, %rax
    jne .L8
    movl    $0, %eax
    leaq    -16(%rbp), %rsp
    popq    %rbx
    popq    %r12
    popq    %rbp
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret
.L8:
    .cfi_restore_state
    call    __stack_chk_fail@PLT
    .cfi_endproc

and with -O2:

main:
.LFB5513:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movl    $1024, %edi
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %r12
    pushq   %rbx
    andq    $-32, %rsp
    subq    $64, %rsp
    .cfi_offset 12, -24
    .cfi_offset 3, -32
    movq    %fs:40, %rax
    movq    %rax, 56(%rsp)
    xorl    %eax, %eax
    call    malloc@PLT
    movq    %rax, %rbx
    leaq    1024(%rax), %r12
    .p2align 4,,10
    .p2align 3
.L2:
    call    random@PLT
    addq    $4, %rbx
    movl    %eax, -4(%rbx)
    cmpq    %r12, %rbx
    jne .L2
    movq    %rsp, %rax      ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
    leaq    32(%rsp), %rcx  ; ^
    xorl    %edx, %edx      ; ^
    .p2align 4,,10          ; ^
    .p2align 3              ; ^
.L3:                        ; ^
    addl    (%rax), %edx    ; ^
    addq    $4, %rax        ; ^
    cmpq    %rcx, %rax      ; ^
    jne .L3                 ; ^
    xorl    %eax, %eax
    leaq    .LC0(%rip), %rsi
    movl    $1, %edi
    call    __printf_chk@PLT
    movq    56(%rsp), %rax
    subq    %fs:40, %rax
    jne .L9
    leaq    -16(%rbp), %rsp
    xorl    %eax, %eax
    popq    %rbx
    popq    %r12
    popq    %rbp
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret
.L9:
    .cfi_restore_state
    call    __stack_chk_fail@PLT
    .cfi_endproc

So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0?

I'm using

gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0

Solution based on comments

(all below compiled with -O2)

Using memcpy as

    __m256i sm = _mm256_setzero_si256();
    int ar[8];
    memcpy(ar, &sm, 32);

copies the data, although in a somewhat convoluted way:

    vpxor   %xmm0, %xmm0, %xmm0
    leaq    48(%rsp), %rax
    leaq    80(%rsp), %rcx
    xorl    %edx, %edx
    vmovdqa %ymm0, (%rsp)
    vmovdqa 16(%rsp), %xmm2
    vmovdqa %xmm0, 48(%rsp)
    vmovdqa %xmm2, 64(%rsp)
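
Put together, a complete fixed program using the memcpy approach might look like this (a minimal sketch based on the comments; note that memcpy requires string.h):

    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        __m256i sm = _mm256_setzero_si256();
        int ar[8];
        memcpy(ar, &sm, sizeof ar); // well-defined: copies the 32 bytes

        int sum = 0;
        for (size_t l = 0; l < 8; l++)
            sum += ar[l];
        printf("sum = %d\n", sum);
        return 0;
    }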

A union

union conv
{
    __m256i val;
    int ar[8];
};
    union conv c;
    c.val = _mm256_setzero_si256();
    // access c.ar

works too by producing

    vpxor   %xmm0, %xmm0, %xmm0
    leaq    4(%rsp), %rax
    leaq    32(%rsp), %rsi
    xorl    %ecx, %ecx
    vmovdqa %ymm0, (%rsp)
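
Filled out, the // access c.ar part could look like this (a sketch):

    union conv c;
    c.val = _mm256_setzero_si256();
    int sum = 0;
    for (size_t l = 0; l < 8; l++)
        sum += c.ar[l]; // type punning through a union is allowed in C
    printf("sum = %d\n", sum);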

Another option is to compile with -fno-strict-aliasing. In that case, the original code works as I expected.
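
That is, something like

    gcc -mavx2 -fno-strict-aliasing -O2 src.c

which disables the type-based aliasing assumptions that let the optimizer drop the store.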

  • You're violating the C strict aliasing rules -- you have a `__m256i` that you are accessing as `int`. The compiler is entitled to assume they don't alias, so it can eliminate the initialization of `sm` as dead (nothing legally reads it). – Chris Dodd Dec 11 '21 at 08:05
  • You violated the strict aliasing rule. Try compiling with `-fno-strict-aliasing`. – user3386109 Dec 11 '21 at 08:07
  • It works with `-fno-strict-aliasing`. If I don't compile with `-fno-strict-aliasing`, should I instead use a union to read the values from `__m256i` or how would I do that? – mimo31 Dec 11 '21 at 08:10
  • You can `memcpy` into an `int` array. I expect that the compiler will optimize out the `memcpy` and do what you want. – user3386109 Dec 11 '21 at 08:21
  • You can use a GNU C native vector of `int` so you can index it with `[]` (see the sketch after this comment thread), or possibly even alias with `int*`. (`__m256i` is defined as a vector of `long long`). Or portably, there's an intrinsic, [`_mm256_extract_epi32(__m256i, const int index)`](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm256_extr&ig_expand=2980), but neither of those are particularly efficient. (Especially if the index is 4..7, so it can't be done with one `vpextrd r/m32, xmm, imm8` - note the xmm source operand, [not ymm](https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq).) – Peter Cordes Dec 11 '21 at 18:54
  • Anyway, it's safe to point a `__m256i*` at anything, but not vice versa: it's defined as a `may_alias` type in GNU C: [Is \`reinterpret\_cast\`ing between hardware SIMD vector pointer and the corresponding type an undefined behavior?](https://stackoverflow.com/a/52117639) – Peter Cordes Dec 11 '21 at 18:55
  • `gcc` (version 11.2.1 20211203) explicitly warns about the line `__m256i sm = _mm256_setzero_si256();`: `warning: ‘sm’ is used uninitialized [-Wuninitialized]`. Also, that intrinsic isn't even in the Intel Software Developer's Manual. – EOF Dec 11 '21 at 21:43
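
A sketch of the native-vector approach from the comments above (GCC/Clang vector extensions; the typedef name v8si and the function are mine):

    #include <immintrin.h>

    typedef int v8si __attribute__((vector_size(32)));

    static int sum_lanes(__m256i vec)
    {
        v8si v = (v8si)vec; // GNU C allows casting between same-size vector types
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += v[i];    // element type is int, so indexing yields int
        return sum;
    }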

1 Answer


If you have 8 integers in a __m256i variable and you want a horizontal sum, the best way is probably to use intrinsics.

Here’s an example, untested:

#include <immintrin.h>

// Horizontal sum of all 8 lanes of an int32 SIMD vector
inline int hadd_epi32( __m256i vec )
{
    // Add 8 lanes into 4
    __m128i r = _mm256_extracti128_si256( vec, 1 );
    r = _mm_add_epi32( r, _mm256_castsi256_si128( vec ) );
    // Add 4 lanes into 2
    r = _mm_add_epi32( r, _mm_unpackhi_epi64( r, r ) );
    // Extract 2 lowest lanes from the vector into scalar registers, return their sum
    const int i1 = _mm_extract_epi32( r, 1 );
    const int i0 = _mm_cvtsi128_si32( r );
    return i1 + i0;
}
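
A hypothetical usage (compiled with -mavx2; printf needs stdio.h):

__m256i v = _mm256_setr_epi32( 1, 2, 3, 4, 5, 6, 7, 8 );
printf( "sum = %d\n", hadd_epi32( v ) ); // prints sum = 36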
Soonts
  • [Fastest method to calculate sum of all packed 32-bit integers using AVX512 or AVX2](https://stackoverflow.com/q/60108658) has other examples. You probably want to either do the final sum in vectors (see the sketch after this thread), or use scalar 64-bit extract. (`movq` / `mov`+`shr` or `rorx` / `add` with 32-bit operand-size). Your way ends with `movd` / `pextrd` / `add` which is p0 + p5+p0 + p0156 on Intel, vs. pshufd/padd/movd would be p5 + p015 + p0 on Skylake vs. scalar extract starting with movq being p0 + (eliminated except on IceLake)+p06 + p0156. Your way isn't terrible, though, and does have some ILP. – Peter Cordes Dec 13 '21 at 21:12
  • @PeterCordes I think the code I’ve posted saves a cycle of latency on modern processors. The `vpextrd` and `vmovd` instructions don’t have data dependencies, they will run in parallel, so two of them won’t take much longer than a single one. – Soonts Dec 13 '21 at 21:19
  • You're forgetting resource conflicts (ports) and/or that `vpextrd` isn't single-uop on Intel or AMD. `vpextrd` is a shuffle uop (Intel port 5) feeding a `vmovd` uop for the xmm->integer data copy (Intel port 0). Or if port 0 had been busy with older work so the shuffle happened before `vmovd` could dispatch to a port, you'd have two separate `vmovd` uops competing for the same port, so the available instruction-level parallelism couldn't be exploited by current CPUs because of the resource conflict for the one port that can run xmm->integer uops (Intel p0 ALU, Zen2 FP2 competing w. stores) – Peter Cordes Dec 13 '21 at 21:43
  • For latency, your way has a critical path of shuffle -> movd -> add. My suggestions are shuffle -> paddd -> movd which is the same latency (but no uops off the critical path so better for throughput). Or movq -> rorx -> add, also the same latency and also only 3 uops. You're right that movq -> mov -> shr -> add would be worse latency on Ice Lake, where Intel broke mov-elimination in a microcode update for an erratum workaround, but it would be equal to your way and same number of uops on CPUs with working scalar mov-elim. And less demand for vector ALU ports. – Peter Cordes Dec 13 '21 at 21:48
  • @PeterCordes On AMD Zen 2 and Zen 3, extract uses P12 + P45, movd uses P45. Both will run in parallel, one of them on P4 another one on P5. – Soonts Dec 13 '21 at 21:52
  • That's not what I'm seeing on https://uops.info/ for Zen 2. (https://uops.info/html-tp/ZEN2/PEXTRD_R32_XMM_I8-Measurements.html). Its table is showing `FP1/2, FP2` for `vpextrd r32, xmm, imm`, with 1/clock throughput. Same measured throughput and ports for Zen 3. But yes, maybe not competing with `vmovd` on Zen3; they're showing FP45 for ports there. (But still only 1/clock measured throughput for `vmovd`, so that's weird, possibly a bug in their testing. https://uops.info/html-tp/ZEN3/VMOVD_R32_XMM-Measurements.html. Or not? They also list a "documented" throughput of 1.00c) – Peter Cordes Dec 13 '21 at 21:55
  • @PeterCordes Hmm, you’re correct about Zen 2. But not 3: https://www.uops.info/html-ports/ZEN3/PEXTRD_R32_XMM_I8-Measurements.html – Soonts Dec 13 '21 at 22:02
  • In Zen 3, they apparently introduced two more EUs, FP45, dedicated to moving values from vectors to RAM, and from vectors to scalar registers. Used by movd, extract, movemask, and some of the vector store instructions. – Soonts Dec 13 '21 at 22:09
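
For completeness, here is a sketch of the all-vector reduction suggested in the thread above (shuffle -> paddd -> movd, so only one xmm->integer transfer at the end; untested, the function name is mine):

// Horizontal sum keeping the reduction in vector registers
static inline int hadd_epi32_vec( __m256i vec )
{
    // Add the high 128-bit lane to the low one
    __m128i r = _mm_add_epi32( _mm256_castsi256_si128( vec ),
                               _mm256_extracti128_si256( vec, 1 ) );
    // Add the upper 64 bits to the lower 64 bits
    r = _mm_add_epi32( r, _mm_shuffle_epi32( r, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
    // Add the two remaining adjacent lanes
    r = _mm_add_epi32( r, _mm_shuffle_epi32( r, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
    // Single xmm->integer transfer at the end
    return _mm_cvtsi128_si32( r );
}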