8

I'm compiling the program:

#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>

// Scoring parameters consumed by init_dp_matrix(). main() aggregate-initialises
// them positionally as {open, extend}.
// NOTE(review): the names suggest gap-open / gap-extend scores of an alignment
// model, but the post never says so -- confirm before relying on that reading.
struct Model
{
    int open;    // value written into the first border cell
    int extend;  // amount added for every further step along the border
};

// One matrix entry holding two independent integers. Value-initialisation
// (Cell()) sets both members to zero.
struct Cell
{
    int a;  // written by init_dp_matrix() for cells result[0][j], j >= 1
    int b;  // written by init_dp_matrix() for cells result[i][0], i >= 1
};

// Matrix of cells stored as a vector of vectors and indexed as matrix[i][j].
using DPMatrix = std::vector<std::vector<Cell>>;

void print(const DPMatrix& matrix)
{
    for (std::size_t i = 0; i < matrix.size(); ++i) {
        for (std::size_t j = 0; j < matrix[i].size(); ++j) {
            std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
        }
        std::cout << std::endl;
    }
}

// Builds a zero-filled matrix of num_cols outer elements, each holding
// num_rows cells, and then fills its two borders:
//   result[i][0].b = model.open + (i - 1) * model.extend   for 1 <= i < num_cols
//   result[0][j].a = model.open + (j - 1) * model.extend   for 1 <= j < num_rows
// Every other member stays 0 (cells are value-initialised).
//
// Note that, despite its name, num_cols is the size of the OUTER vector, i.e.
// the number of lines print() emits.
//
// If either dimension is 0 there is no border to fill and the (empty, or
// all-empty-rows) matrix is returned unchanged; without that guard the border
// loops would index into an empty vector.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
    DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
    if (num_cols == 0 || num_rows == 0) {
        return result;
    }
    // Indices are std::size_t to match the bounds (no signed/unsigned
    // comparison); the step count is cast back to int so the arithmetic stays
    // signed, since model.open / model.extend may be negative.
    for (std::size_t i = 1; i < num_cols; ++i) {
        result[i][0].b = model.open + static_cast<int>(i - 1) * model.extend;
    }
    for (std::size_t j = 1; j < num_rows; ++j) {
        result[0][j].a = model.open + static_cast<int>(j - 1) * model.extend;
    }
    return result;
}

int main()
{
    const Model model = {-8, -1};
    const DPMatrix matrix = init_dp_matrix(10, 2, model);
    print(matrix);
}

With GCC 9.2.0:

$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)

with -march=native:

$ g++-9 -O3 -march=native -o bug bug.cpp

On an Ubuntu machine with Intel chips:

$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 18.04.3 LTS
Release:        18.04
Codename:       bionic

$ grep model /proc/cpuinfo | head -2
model           : 85
model name      : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz

Running the program I get bogus output:

$ ./bug 
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

If I compile without -march=native I get the correct output:

$ g++-9 -O3 -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

The assembly for the -march=native version is:

$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
    .file   "bug.cpp"
    .text
    .section    .text._ZNKSt5ctypeIcE8do_widenEc,"axG",@progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
    .align 2
    .p2align 4
    .weak   _ZNKSt5ctypeIcE8do_widenEc
    .type   _ZNKSt5ctypeIcE8do_widenEc, @function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
    .cfi_startproc
    movl    %esi, %eax
    ret
    .cfi_endproc
.LFE1303:
    .size   _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "} "
    .text
    .p2align 4
    .globl  _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .type   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
    .cfi_startproc
    movq    (%rdi), %rdx
    cmpq    %rdx, 8(%rdi)
    je  .L23
    pushq   %r15
    .cfi_def_cfa_offset 16
    .cfi_offset 15, -16
    pushq   %r14
    .cfi_def_cfa_offset 24
    .cfi_offset 14, -24
    pushq   %r13
    .cfi_def_cfa_offset 32
    .cfi_offset 13, -32
    movabsq $-6148914691236517205, %r13
    pushq   %r12
    .cfi_def_cfa_offset 40
    .cfi_offset 12, -40
    xorl    %r12d, %r12d
    pushq   %rbp
    .cfi_def_cfa_offset 48
    .cfi_offset 6, -48
    movq    %rdi, %rbp
    pushq   %rbx
    .cfi_def_cfa_offset 56
    .cfi_offset 3, -56
    subq    $24, %rsp
    .cfi_def_cfa_offset 80
    .p2align 4,,10
    .p2align 3
.L4:
    leaq    (%r12,%r12,2), %rbx
    salq    $3, %rbx
    addq    %rbx, %rdx
    movq    8(%rdx), %rax
    xorl    %r14d, %r14d
    cmpq    %rax, (%rdx)
    je  .L8
    .p2align 4,,10
    .p2align 3
.L5:
    movl    $1, %edx
    leaq    15(%rsp), %rsi
    movl    $_ZSt4cout, %edi
    movb    $123, 15(%rsp)
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    %rax, %rdi
    movq    0(%rbp), %rax
    leaq    0(,%r14,8), %r15
    movq    (%rax,%rbx), %rax
    movl    (%rax,%r14,8), %esi
    incq    %r14
    call    _ZNSolsEi
    movq    %rax, %rdi
    movl    $1, %edx
    leaq    15(%rsp), %rsi
    movb    $32, 15(%rsp)
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    %rax, %rdi
    movq    0(%rbp), %rax
    movq    (%rax,%rbx), %rax
    movl    4(%rax,%r15), %esi
    call    _ZNSolsEi
    movq    %rax, %rdi
    movl    $2, %edx
    movl    $.LC0, %esi
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    0(%rbp), %rdx
    addq    %rbx, %rdx
    movq    8(%rdx), %rax
    subq    (%rdx), %rax
    sarq    $3, %rax
    cmpq    %rax, %r14
    jb  .L5
.L8:
    movq    _ZSt4cout(%rip), %rax
    movq    -24(%rax), %rax
    movq    _ZSt4cout+240(%rax), %r14
    testq   %r14, %r14
    je  .L26
    cmpb    $0, 56(%r14)
    je  .L9
    movsbl  67(%r14), %esi
.L10:
    movl    $_ZSt4cout, %edi
    call    _ZNSo3putEc
    movq    %rax, %rdi
    call    _ZNSo5flushEv
    movq    0(%rbp), %rdx
    movq    8(%rbp), %rax
    incq    %r12
    subq    %rdx, %rax
    sarq    $3, %rax
    imulq   %r13, %rax
    cmpq    %r12, %rax
    ja  .L4
    addq    $24, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 56
    popq    %rbx
    .cfi_def_cfa_offset 48
    popq    %rbp
    .cfi_def_cfa_offset 40
    popq    %r12
    .cfi_def_cfa_offset 32
    popq    %r13
    .cfi_def_cfa_offset 24
    popq    %r14
    .cfi_def_cfa_offset 16
    popq    %r15
    .cfi_def_cfa_offset 8
    ret
    .p2align 4,,10
    .p2align 3
.L9:
    .cfi_restore_state
    movq    %r14, %rdi
    call    _ZNKSt5ctypeIcE13_M_widen_initEv
    movq    (%r14), %rax
    movl    $10, %esi
    movq    48(%rax), %rax
    cmpq    $_ZNKSt5ctypeIcE8do_widenEc, %rax
    je  .L10
    movq    %r14, %rdi
    call    *%rax
    movsbl  %al, %esi
    jmp .L10
.L23:
    .cfi_def_cfa_offset 8
    .cfi_restore 3
    .cfi_restore 6
    .cfi_restore 12
    .cfi_restore 13
    .cfi_restore 14
    .cfi_restore 15
    ret
.L26:
    .cfi_def_cfa_offset 80
    .cfi_offset 3, -56
    .cfi_offset 6, -48
    .cfi_offset 12, -40
    .cfi_offset 13, -32
    .cfi_offset 14, -24
    .cfi_offset 15, -16
    call    _ZSt16__throw_bad_castv
    .cfi_endproc
.LFE2359:
    .size   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .section    .rodata.str1.8,"aMS",@progbits,1
    .align 8
.LC2:
    .string "cannot create std::vector larger than max_size()"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB6:
    .text
.LHOTB6:
    .p2align 4
    .globl  _Z14init_dp_matrixmmRK5Model
    .type   _Z14init_dp_matrixmmRK5Model, @function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDA2360
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movabsq $1152921504606846975, %rax
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %r15
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbx
    andq    $-32, %rsp
    subq    $64, %rsp
    .cfi_offset 15, -24
    .cfi_offset 14, -32
    .cfi_offset 13, -40
    .cfi_offset 12, -48
    .cfi_offset 3, -56
    movq    %rdi, 24(%rsp)
    movq    %rsi, 40(%rsp)
    movq    %rcx, 16(%rsp)
    cmpq    %rax, %rdx
    ja  .L103
    movq    %rdx, %r15
    testq   %rdx, %rdx
    je  .L71
    leaq    0(,%rdx,8), %rbx
    movq    %rbx, %rdi
.LEHB0:
    call    _Znwm
.LEHE0:
    movq    %rax, %r13
    leaq    -1(%r15), %rax
    cmpq    $3, %rax
    movq    %r15, %rdx
    movq    %r13, %rax
    jbe .L30
    shrq    $2, %rdx
    salq    $5, %rdx
    addq    %r13, %rdx
    vpxor   %xmm0, %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L32:
    vmovdqu32   %ymm0, (%rax)
    addq    $32, %rax
    cmpq    %rdx, %rax
    jne .L32
    movq    %r15, %rcx
    andq    $-4, %rcx
    movq    %r15, %rdx
    andl    $3, %edx
    leaq    0(%r13,%rcx,8), %rax
    cmpq    %rcx, %r15
    je  .L33
.L30:
    movq    $0, (%rax)
    cmpq    $1, %rdx
    je  .L33
    movq    $0, 8(%rax)
    cmpq    $2, %rdx
    je  .L33
    movq    $0, 16(%rax)
    cmpq    $3, %rdx
    je  .L33
    movq    $0, 24(%rax)
.L33:
    leaq    0(%r13,%rbx), %rax
    movq    %rax, 56(%rsp)
.L29:
    movabsq $384307168202282325, %rax
    cmpq    %rax, 40(%rsp)
    ja  .L104
    movq    40(%rsp), %rax
    movq    24(%rsp), %r12
    leaq    (%rax,%rax,2), %rbx
    movq    $0, (%r12)
    movq    $0, 8(%r12)
    movq    $0, 16(%r12)
    salq    $3, %rbx
    testq   %rax, %rax
    je  .L35
    movq    %rbx, %rdi
    vzeroupper
.LEHB1:
    call    _Znwm
.LEHE1:
    addq    %rax, %rbx
    movq    %rax, (%r12)
    movq    %rax, 8(%r12)
    movq    %rbx, 16(%r12)
    movq    56(%rsp), %r12
    movq    %rax, %r14
    subq    %r13, %r12
    movq    %r12, %rax
    sarq    $3, %rax
    je  .L40
    movabsq $1152921504606846975, %rdx
    cmpq    %rdx, %rax
    ja  .L41
    movq    40(%rsp), %rax
    movq    %r14, %rbx
    movq    %rax, 48(%rsp)
    .p2align 4,,10
    .p2align 3
.L46:
    movq    $0, (%rbx)
    movq    $0, 8(%rbx)
    movq    $0, 16(%rbx)
    movq    %r12, %rdi
.LEHB2:
    call    _Znwm
.LEHE2:
    leaq    (%rax,%r12), %rcx
    movq    %rax, (%rbx)
    movq    %rcx, 16(%rbx)
    movq    %rax, %rdi
    cmpq    %r13, 56(%rsp)
    je  .L42
    movq    %r12, %rdx
    movq    %r13, %rsi
    movq    %rcx, 32(%rsp)
    call    memcpy
    movq    32(%rsp), %rcx
    addq    $24, %rbx
    movq    %rcx, -16(%rbx)
    decq    48(%rsp)
    jne .L46
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
.L47:
    movq    %r13, %rdi
    call    _ZdlPv
.L48:
    movq    16(%rsp), %rax
    cmpq    $1, 40(%rsp)
    movl    (%rax), %edx
    jbe .L62
    movl    4(%rax), %edi
    movq    24(%rsp), %rax
    movq    (%rax), %rsi
    movq    40(%rsp), %rax
    leaq    -2(%rax), %rcx
    cmpq    $7, %rcx
    jbe .L73
    movq    %rcx, %r8
    shrq    $3, %r8
    leaq    (%r8,%r8,2), %r8
    salq    $6, %r8
    vmovdqa64   .LC1(%rip), %ymm3
    vmovdqa64   .LC3(%rip), %ymm4
    vmovdqa64   .LC4(%rip), %ymm6
    vmovdqa64   .LC5(%rip), %ymm5
    vpbroadcastd    %edi, %ymm10
    vpbroadcastd    %edx, %ymm9
    leaq    24(%rsi), %rax
    leaq    24(%rsi,%r8), %r8
    vpcmpeqd    %ymm8, %ymm8, %ymm8
    kxnorb  %k1, %k1, %k1
    .p2align 4,,10
    .p2align 3
.L61:
    vmovdqa64   %ymm3, %ymm0
    vpaddd  %ymm8, %ymm0, %ymm0
    vpmulld %ymm10, %ymm0, %ymm0
    vmovdqu64   (%rax), %ymm2
    vmovdqu64   96(%rax), %ymm1
    vpermt2q    32(%rax), %ymm6, %ymm2
    vpermt2q    128(%rax), %ymm6, %ymm1
    vpermt2q    64(%rax), %ymm5, %ymm2
    vpaddd  %ymm9, %ymm0, %ymm0
    vpermt2q    160(%rax), %ymm5, %ymm1
    kmovb   %k1, %k2
    addq    $192, %rax
    vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
    vperm2i128  $17, %ymm0, %ymm0, %ymm0
    kmovb   %k1, %k3
    vpaddd  %ymm4, %ymm3, %ymm3
    vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
    cmpq    %r8, %rax
    jne .L61
    andq    $-8, %rcx
    leaq    1(%rcx), %r8
    leal    1(%rcx), %eax
.L59:
    leaq    (%r8,%r8,2), %rcx
    movq    (%rsi,%rcx,8), %r8
    leal    -1(%rax), %ecx
    imull   %edi, %ecx
    movq    40(%rsp), %rbx
    addl    %edx, %ecx
    movl    %ecx, 4(%r8)
    leal    1(%rax), %ecx
    movslq  %ecx, %r8
    cmpq    %r8, %rbx
    jbe .L62
    leaq    (%r8,%r8,2), %r8
    movq    (%rsi,%r8,8), %r9
    movl    %edi, %r8d
    imull   %eax, %r8d
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    2(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %ecx
    movl    %ecx, 4(%r9)
    leal    3(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %r8d
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    4(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %ecx
    movl    %ecx, 4(%r9)
    leal    5(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %r8d
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    6(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    $7, %eax
    addl    %edx, %ecx
    cltq
    movl    %ecx, 4(%r9)
    cmpq    %rax, %rbx
    jbe .L62
    imull   %r8d, %edi
    leaq    (%rax,%rax,2), %rax
    movq    (%rsi,%rax,8), %rax
    leal    (%rdi,%rdx), %r8d
    movl    %r8d, 4(%rax)
.L62:
    cmpq    $1, %r15
    jbe .L27
    movq    16(%rsp), %rax
    leaq    -1(%r15), %r8
    movl    4(%rax), %edi
    movq    24(%rsp), %rax
    movq    (%rax), %rax
    movq    (%rax), %rsi
    leaq    -2(%r15), %rax
    cmpq    $6, %rax
    jbe .L74
    movq    %r8, %rcx
    shrq    $3, %rcx
    salq    $6, %rcx
    vmovdqa64   .LC1(%rip), %ymm2
    vmovdqa64   .LC3(%rip), %ymm4
    vpbroadcastd    %edi, %ymm6
    vpbroadcastd    %edx, %ymm5
    movq    %rsi, %rax
    addq    %rsi, %rcx
    vpcmpeqd    %ymm3, %ymm3, %ymm3
    .p2align 4,,10
    .p2align 3
.L66:
    vmovdqa64   %ymm2, %ymm0
    vpaddd  %ymm3, %ymm0, %ymm0
    vpmulld %ymm6, %ymm0, %ymm0
    addq    $64, %rax
    vpaddd  %ymm4, %ymm2, %ymm2
    vpaddd  %ymm5, %ymm0, %ymm0
    vmovd   %xmm0, -56(%rax)
    vpextrd $1, %xmm0, -48(%rax)
    vpextrd $2, %xmm0, -40(%rax)
    vpextrd $3, %xmm0, -32(%rax)
    vextracti128    $0x1, %ymm0, %xmm0
    vmovd   %xmm0, -24(%rax)
    vpextrd $1, %xmm0, -16(%rax)
    vpextrd $2, %xmm0, -8(%rax)
    vpextrd $3, %xmm0, (%rax)
    cmpq    %rcx, %rax
    jne .L66
    movq    %r8, %rcx
    andq    $-8, %rcx
    leaq    1(%rcx), %r9
    leal    1(%rcx), %eax
    cmpq    %r8, %rcx
    je  .L27
.L64:
    leal    -1(%rax), %ecx
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    1(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r15, %r9
    jnb .L27
    movl    %edi, %r8d
    imull   %eax, %r8d
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    leal    2(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    3(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r15, %r9
    jnb .L27
    imull   %edi, %r8d
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    leal    4(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    5(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %r8d
    addl    $6, %eax
    cltq
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    cmpq    %rax, %r15
    jbe .L27
    imull   %ecx, %edi
    addl    %edi, %edx
    movl    %edx, (%rsi,%rax,8)
.L27:
    movq    24(%rsp), %rax
    vzeroupper
    leaq    -40(%rbp), %rsp
    popq    %rbx
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
    popq    %rbp
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret
    .p2align 4,,10
    .p2align 3
.L37:
    .cfi_restore_state
    movq    %r12, 8(%r14)
    addq    $24, %r14
    cmpq    %r14, %rbx
    je  .L45
.L40:
    movq    $0, (%r14)
    movq    %r12, 16(%r14)
    cmpq    %r13, 56(%rsp)
    je  .L37
    movq    %r12, %rdx
    movq    %r13, %rsi
    xorl    %edi, %edi
    call    memcpy
    addq    $24, %r14
    movq    %r12, -16(%r14)
    cmpq    %r14, %rbx
    jne .L40
.L45:
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
    testq   %r13, %r13
    je  .L48
.L105:
    movq    %r13, %rdi
    call    _ZdlPv
    jmp .L48
    .p2align 4,,10
    .p2align 3
.L42:
    movq    %rcx, 8(%rbx)
    addq    $24, %rbx
    decq    48(%rsp)
    jne .L46
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
    testq   %r13, %r13
    je  .L48
    jmp .L105
    .p2align 4,,10
    .p2align 3
.L71:
    movq    $0, 56(%rsp)
    xorl    %r13d, %r13d
    jmp .L29
    .p2align 4,,10
    .p2align 3
.L35:
    testq   %r13, %r13
    je  .L106
    vzeroupper
    jmp .L47
.L73:
    movl    $1, %eax
    movl    $1, %r8d
    jmp .L59
.L74:
    movl    $1, %eax
    movl    $1, %r9d
    jmp .L64
.L106:
    movq    16(%rsp), %rax
    movl    (%rax), %edx
    jmp .L62
.L41:
    movq    $0, (%r14)
    movq    $0, 8(%r14)
    movq    $0, 16(%r14)
.LEHB3:
    call    _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
    movl    $.LC2, %edi
    vzeroupper
.LEHB4:
    call    _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
    movl    $.LC2, %edi
.LEHB5:
    call    _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
    movq    %rax, %rdi
    jmp .L49
.L77:
    movq    %rax, %rdi
    jmp .L50
.L75:
    movq    %rax, %r12
    vzeroupper
    jmp .L56
    .globl  __gxx_personality_v0
    .section    .gcc_except_table,"a",@progbits
    .align 4
.LLSDA2360:
    .byte   0xff
    .byte   0x3
    .uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
    .byte   0x1
    .uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
    .uleb128 .LEHB0-.LFB2360
    .uleb128 .LEHE0-.LEHB0
    .uleb128 0
    .uleb128 0
    .uleb128 .LEHB1-.LFB2360
    .uleb128 .LEHE1-.LEHB1
    .uleb128 .L75-.LFB2360
    .uleb128 0
    .uleb128 .LEHB2-.LFB2360
    .uleb128 .LEHE2-.LEHB2
    .uleb128 .L77-.LFB2360
    .uleb128 0x1
    .uleb128 .LEHB3-.LFB2360
    .uleb128 .LEHE3-.LEHB3
    .uleb128 .L78-.LFB2360
    .uleb128 0x1
    .uleb128 .LEHB4-.LFB2360
    .uleb128 .LEHE4-.LEHB4
    .uleb128 .L75-.LFB2360
    .uleb128 0
    .uleb128 .LEHB5-.LFB2360
    .uleb128 .LEHE5-.LEHB5
    .uleb128 0
    .uleb128 0
.LLSDACSE2360:
    .byte   0x1
    .byte   0
    .align 4
    .long   0

.LLSDATT2360:
    .text
    .cfi_endproc
    .section    .text.unlikely
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDAC2360
    .type   _Z14init_dp_matrixmmRK5Model.cold, @function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
    .cfi_def_cfa 6, 16
    .cfi_offset 3, -56
    .cfi_offset 6, -16
    .cfi_offset 12, -48
    .cfi_offset 13, -40
    .cfi_offset 14, -32
    .cfi_offset 15, -24
    movq    %r14, %rbx
.L50:
    vzeroupper
    call    __cxa_begin_catch
.L53:
    cmpq    %rbx, %r14
    jne .L107
.LEHB6:
    call    __cxa_rethrow
.LEHE6:
.L76:
    movq    %rax, %r12
    vzeroupper
    call    __cxa_end_catch
    movq    24(%rsp), %rax
    movq    (%rax), %rdi
    testq   %rdi, %rdi
    je  .L56
    call    _ZdlPv
.L56:
    testq   %r13, %r13
    je  .L69
    movq    %r13, %rdi
    call    _ZdlPv
.L69:
    movq    %r12, %rdi
.LEHB7:
    call    _Unwind_Resume
.LEHE7:
.L107:
    movq    (%r14), %rdi
    testq   %rdi, %rdi
    je  .L52
    call    _ZdlPv
.L52:
    addq    $24, %r14
    jmp .L53
    .cfi_endproc
.LFE2360:
    .section    .gcc_except_table
    .align 4
.LLSDAC2360:
    .byte   0xff
    .byte   0x3
    .uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
    .byte   0x1
    .uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
    .uleb128 .LEHB6-.LCOLDB6
    .uleb128 .LEHE6-.LEHB6
    .uleb128 .L76-.LCOLDB6
    .uleb128 0
    .uleb128 .LEHB7-.LCOLDB6
    .uleb128 .LEHE7-.LEHB7
    .uleb128 0
    .uleb128 0
.LLSDACSEC2360:
    .byte   0x1
    .byte   0
    .align 4
    .long   0

.LLSDATTC2360:
    .section    .text.unlikely
    .text
    .size   _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
    .section    .text.unlikely
    .size   _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
    .text
.LHOTE6:
    .section    .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",@progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
    .align 2
    .p2align 4
    .weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .type   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, @function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
    .cfi_startproc
    pushq   %r12
    .cfi_def_cfa_offset 16
    .cfi_offset 12, -16
    movq    %rdi, %r12
    pushq   %rbp
    .cfi_def_cfa_offset 24
    .cfi_offset 6, -24
    pushq   %rbx
    .cfi_def_cfa_offset 32
    .cfi_offset 3, -32
    movq    8(%rdi), %rbx
    movq    (%rdi), %rbp
    cmpq    %rbp, %rbx
    je  .L109
    .p2align 4,,10
    .p2align 3
.L113:
    movq    0(%rbp), %rdi
    testq   %rdi, %rdi
    je  .L110
    addq    $24, %rbp
    call    _ZdlPv
    cmpq    %rbp, %rbx
    jne .L113
.L111:
    movq    (%r12), %rbp
.L109:
    testq   %rbp, %rbp
    je  .L115
    popq    %rbx
    .cfi_remember_state
    .cfi_def_cfa_offset 24
    movq    %rbp, %rdi
    popq    %rbp
    .cfi_def_cfa_offset 16
    popq    %r12
    .cfi_def_cfa_offset 8
    jmp _ZdlPv
    .p2align 4,,10
    .p2align 3
.L110:
    .cfi_restore_state
    addq    $24, %rbp
    cmpq    %rbp, %rbx
    jne .L113
    jmp .L111
    .p2align 4,,10
    .p2align 3
.L115:
    popq    %rbx
    .cfi_def_cfa_offset 24
    popq    %rbp
    .cfi_def_cfa_offset 16
    popq    %r12
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE2637:
    .size   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    .set    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .section    .text.unlikely
.LCOLDB7:
    .section    .text.startup,"ax",@progbits
.LHOTB7:
    .p2align 4
    .globl  main
    .type   main, @function
main:
.LFB2371:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDA2371
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movl    $2, %edx
    movl    $10, %esi
    subq    $48, %rsp
    .cfi_def_cfa_offset 64
    leaq    16(%rsp), %rdi
    leaq    8(%rsp), %rcx
    movq    $-8, 8(%rsp)
.LEHB8:
    call    _Z14init_dp_matrixmmRK5Model
.LEHE8:
    leaq    16(%rsp), %rdi
.LEHB9:
    call    _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
    leaq    16(%rsp), %rdi
    call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    addq    $48, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 16
    xorl    %eax, %eax
    popq    %rbp
    .cfi_def_cfa_offset 8
    ret
.L119:
    .cfi_restore_state
    movq    %rax, %rbp
    jmp .L118
    .section    .gcc_except_table
.LLSDA2371:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
    .uleb128 .LEHB8-.LFB2371
    .uleb128 .LEHE8-.LEHB8
    .uleb128 0
    .uleb128 0
    .uleb128 .LEHB9-.LFB2371
    .uleb128 .LEHE9-.LEHB9
    .uleb128 .L119-.LFB2371
    .uleb128 0
.LLSDACSE2371:
    .section    .text.startup
    .cfi_endproc
    .section    .text.unlikely
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDAC2371
    .type   main.cold, @function
main.cold:
.LFSB2371:
.L118:
    .cfi_def_cfa_offset 64
    .cfi_offset 6, -16
    leaq    16(%rsp), %rdi
    vzeroupper
    call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    movq    %rbp, %rdi
.LEHB10:
    call    _Unwind_Resume
.LEHE10:
    .cfi_endproc
.LFE2371:
    .section    .gcc_except_table
.LLSDAC2371:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
    .uleb128 .LEHB10-.LCOLDB7
    .uleb128 .LEHE10-.LEHB10
    .uleb128 0
    .uleb128 0
.LLSDACSEC2371:
    .section    .text.unlikely
    .section    .text.startup
    .size   main, .-main
    .section    .text.unlikely
    .size   main.cold, .-main.cold
.LCOLDE7:
    .section    .text.startup
.LHOTE7:
    .p2align 4
    .type   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
    .cfi_startproc
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    movl    $_ZStL8__ioinit, %edi
    call    _ZNSt8ios_base4InitC1Ev
    movl    $__dso_handle, %edx
    movl    $_ZStL8__ioinit, %esi
    movl    $_ZNSt8ios_base4InitD1Ev, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 8
    jmp __cxa_atexit
    .cfi_endproc
.LFE3017:
    .size   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .section    .init_array,"aw"
    .align 8
    .quad   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .local  _ZStL8__ioinit
    .comm   _ZStL8__ioinit,1,1
    .section    .rodata.cst32,"aM",@progbits,32
    .align 32
.LC1:
    .long   1
    .long   2
    .long   3
    .long   4
    .long   5
    .long   6
    .long   7
    .long   8
    .align 32
.LC3:
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .align 32
.LC4:
    .quad   0
    .quad   3
    .quad   6
    .quad   0
    .align 32
.LC5:
    .quad   0
    .quad   1
    .quad   2
    .quad   5
    .hidden __dso_handle
    .ident  "GCC: (Homebrew GCC 9.2.0) 9.2.0"
    .section    .note.GNU-stack,"",@progbits

The assembly for the non -march=native version is available on godbolt.

What is going wrong? Is this a compiler bug, or is my program ill-formed? If it is a compiler bug, how can I mitigate it?

Additional info

Compiling with -v:

$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
 /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
        compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP

Compiling with -O2 or less makes the problem go away:

$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

I tried building on a different machine with Intel chips:

$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64

$ grep model /proc/cpuinfo | head -2
model       : 85
model name  : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
 /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
    compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP

$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0} 

The correct output...

-ftree-loop-vectorize is the culprit:

$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

None of the other O3 flags result in this behaviour.

Daniel
  • 8,179
  • 6
  • 31
  • 56
  • Can you compile with the `-v` option and check whether `-march` matches your system. Then, you could try to compare/diff the output with/without `-march=native` flag. – Tom Sep 20 '19 at 10:20
  • @PeterT Yes, but the OP's code contains value-initialisation of all `Cell` objects. – Angew is no longer proud of SO Sep 20 '19 at 10:22
  • 1
    Cannot reproduce this, could be a Xeon-specific bug. Try -O2 instead of -O3. -O3 should rarely be used as it is rarely an improvement over -O2, and the potential for bugs is higher. – n. m. could be an AI Sep 20 '19 at 10:25
  • 1
    @n.m. Is that so? I use `-O3` everywhere. The improvements over `-O2` are small but significant. Haven’t been able to observe any specific bugs yet. As far as I can tell that reputation stems from pre-GCC 4 times, and is no longer deserved. – Konrad Rudolph Sep 20 '19 at 10:39
  • @Tom Output with `-v` added. – Daniel Sep 20 '19 at 10:54
  • @KonradRudolph Looks like it's still deserved. – Lightness Races in Orbit Sep 20 '19 at 11:04
  • @Konrad https://stackoverflow.com/questions/11546075 I have not seen an actual bug in -O3 yet, this question may or may not represent one. – n. m. could be an AI Sep 20 '19 at 11:16
  • See update: `-ftree-loop-vectorize` is the culprit. – Daniel Sep 20 '19 at 11:24
  • @n.m. So it does, apparently. – Konrad Rudolph Sep 20 '19 at 11:37
  • `-march=native` can have two sorts of bugs: either it fails to deduce the native architecture, or it incorrectly generates code for that architecture. What happens if you manually specify the right `-march`? – MSalters Sep 20 '19 at 11:39
  • On another skylake with avx512 (xeon W-2145, should be close enough), using debian's g++ 9.2.1-8, I get the correct output. I don't know if a relevant bug was fixed just after 9.2 or if there is another difference. AVX512 is still quite young and not so widely used, so bugs are less rare than with older instructions. – Marc Glisse Sep 20 '19 at 11:40
  • If I set `-march=skylake` (rather than `-march=skylake-avx512`) then I get the correct output, so it would appear the problem is with AVX512. – Daniel Sep 20 '19 at 11:46
  • @MarcGlisse I also get the correct output on another Skylake with AVX512 machine, so appears the problem is specific to this chip. – Daniel Sep 20 '19 at 11:53
  • 1
    I've [reported](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91837) this as a GCC bug. – Daniel Sep 20 '19 at 12:20
  • @Daniel They're probably not going to accept that in its current form. As on SO, you should post a self-contained MCVE and accurate description of the problem. Read https://www.gnu.org/software/gcc/bugs – Lightness Races in Orbit Sep 20 '19 at 12:28

1 Answer

3

This turned out to be due to a bug in the binutils assembler (gas). The solution was to upgrade my binutils to 2.32.

Daniel
  • 8,179
  • 6
  • 31
  • 56