
I have implemented a program for a[i] = a[i-1] + c and I present it here. I use begin_rdtsc and end_rdtsc macros to read and store the rdtsc value so I can measure the speedup.

The program is as follows (it uses x86intrin.h):

#define MAX1 512
#define LEN MAX1*MAX1  //array size for time measurements
int __attribute__(( aligned(32))) a[LEN];

int main(){

    singleCore // It's a macro to assign the program to a single core of the processor
    int i, b, c;

    begin_rdtsc

    // b=1 and c=2 in this case
    b = 1;
    c = 2;
    i = 0;

    a[i++] = b;//0 --> a[0] = 1
    //step 1:
    //solving dependencies vectorization factor is 8
    a[i++] = a[0] + 1*c; //1  --> a[1] = 1 + 2  = 3
    a[i++] = a[0] + 2*c; //2  --> a[2] = 1 + 4  = 5
    a[i++] = a[0] + 3*c; //3  --> a[3] = 1 + 6  = 7
    a[i++] = a[0] + 4*c; //4  --> a[4] = 1 + 8  = 9
    a[i++] = a[0] + 5*c; //5  --> a[5] = 1 + 10 = 11
    a[i++] = a[0] + 6*c; //6  --> a[6] = 1 + 12 = 13
    a[i++] = a[0] + 7*c; //7  --> a[7] = 1 + 14 = 15
    // vectorization factor reached
    // 8 *c will work for all 
    //loading the results into a vector
    __m256i dep1;
    //__m256i  dep2; //  dep = { 1,   3,  5, 7,  9,  11, 13, 15 }
    __m256i coeff = _mm256_set1_epi32(8*c); //coeff = { 16, 16, 16, 16, 16, 16, 16, 16 }
    //step2
    for(; i<LEN-1; i+=8){

        dep1 = _mm256_load_si256((__m256i *) &a[i-8]);
        dep1 = _mm256_add_epi32(dep1, coeff);
        _mm256_store_si256((__m256i *) &a[i], dep1);    

    }
    end_rdtsc
    return 0;
}
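
The singleCore and begin_rdtsc / end_rdtsc macros come from my own header and are not shown here. A minimal sketch of what they do is below (the core number in the sketch is arbitrary, and my real macros also keep the best/total time over many repetitions, as the asm further down shows):

// minimal sketch, assuming -D _GNU_SOURCE (for sched_setaffinity) and x86intrin.h (for __rdtsc)
#include <sched.h>
#include <x86intrin.h>

unsigned long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc;
cpu_set_t mask;

// pin the process to a single core
#define singleCore  { CPU_ZERO(&mask); CPU_SET(2, &mask); \
                      sched_setaffinity(0, sizeof(mask), &mask); }

// read the time-stamp counter before and after the measured region
#define begin_rdtsc t1_rdtsc = __rdtsc();
#define end_rdtsc   { t2_rdtsc = __rdtsc(); ttotal_rdtsc = t2_rdtsc - t1_rdtsc; }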

I compiled this program with different compilers: icc 18, gcc 7.2 and clang 4.

The OS is Fedora 27.

The CPU is a Core i7-6700HQ (Skylake).

The scalar implementation, compiled with icc -D _GNU_SOURCE -O3 -no-vec -march=native, is the baseline for the speedup measurements.
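
For reference, the scalar baseline is just the plain recurrence inside the same timing harness (a sketch; b and c are the same constants as above):

begin_rdtsc
a[0] = b;
for(i = 1; i < LEN; i++)
    a[i] = a[i-1] + c;
end_rdtsc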

The asm output for each compiler is as follows. Because the behavior of ICC is not what I expected, I copied its entire output; the timed section is marked in the C program ("mm...mm1/2").

  1. ICC

    # mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 18.0.1.163 Build 20171018";
    # mark_description "-D _GNU_SOURCE -O3 -no-vec -march=native -c -S -o AIC3iccnovec";
        .file "AIC3.c"
        .text
    ..TXTST0:
    .L_2__routine_start_main_0:
    # -- Begin  main
        .text
    # mark_begin;
           .align    16,0x90
        .globl main
    # --- main()
    main:
    ..B1.1:                         # Preds ..B1.0
                                    # Execution count [1.00e+00]
        .cfi_startproc
    ..___tag_value_main.1:
    ..L2:
                                                              #7.11
            pushq     %rbp                                          #7.11
        .cfi_def_cfa_offset 16
            movq      %rsp, %rbp                                    #7.11
        .cfi_def_cfa 6, 16
        .cfi_offset 6, -16
            andq      $-128, %rsp                                   #7.11
            subq      $128, %rsp                                    #7.11
            xorl      %esi, %esi                                    #7.11
            movl      $3, %edi                                      #7.11
            call      __intel_new_feature_proc_init                 #7.11
                                    # LOE rbx r12 r13 r14 r15
    ..B1.21:                        # Preds ..B1.1
                                    # Execution count [1.00e+00]
            vstmxcsr  (%rsp)                                        #7.11
            vpxor     %ymm0, %ymm0, %ymm0                           #9.2
            orl       $32832, (%rsp)                                #7.11
            vldmxcsr  (%rsp)                                        #7.11
            vmovups   %ymm0, mask(%rip)                             #9.2
            vmovups   %ymm0, 32+mask(%rip)                          #9.2
            vmovups   %ymm0, 64+mask(%rip)                          #9.2
            vmovups   %ymm0, 96+mask(%rip)                          #9.2
                                    # LOE rbx r12 r13 r14 r15
    ..B1.2:                         # Preds ..B1.21
                                    # Execution count [5.00e-01]
            xorl      %edi, %edi                                    #9.2
            movl      $128, %esi                                    #9.2
            movl      $mask, %edx                                   #9.2
            orq       $12, mask(%rip)                               #9.2
            vzeroupper                                              #9.2
    ..___tag_value_main.6:
    #       sched_setaffinity(__pid_t, size_t, const cpu_set_t *)
            call      sched_setaffinity                             #9.2
    ..___tag_value_main.7:
                                    # LOE rbx r12 r13 r14 r15
    ..B1.3:                         # Preds ..B1.2
                                    # Execution count [1.72e+00]
            movq      $0xdf84757ff, %rax                            #12.5
            movq      $.L_2__STRING.1, programName(%rip)            #10.2
            movq      $100000000, elapsed_rdtsc(%rip)               #12.5
            movq      %rax, overal_time(%rip)                       #12.5
            movq      $0, ttime(%rip)                               #12.5
            vmovdqu   .L_2il0floatpacket.2(%rip), %ymm0             #33.21
                                    # LOE rbx r12 r13 r14 r15
    ..B1.4:                         # Preds ..B1.12 ..B1.3
                                    # Execution count [2.91e+00]
    # Begin ASM
    # #mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm1
    # End ASM
                                    # LOE rbx r12 r13 r14 r15
    ..B1.23:                        # Preds ..B1.4
                                    # Execution count [2.91e+00]
            vzeroupper                                              #12.5
            rdtsc                                                   #12.5
            shlq      $32, %rdx                                     #12.5
            orq       %rdx, %rax                                    #12.5
                                    # LOE rax rbx r12 r13 r14 r15
    ..B1.5:                         # Preds ..B1.23
                                    # Execution count [2.62e+00]
            movq      %rax, t1_rdtsc(%rip)                          #12.5
            xorl      %edx, %edx                                    #35.5
            movl      $1, a(%rip)                                   #18.5
            xorl      %eax, %eax                                    #35.5
            movl      $3, 4+a(%rip)                                 #21.5
            movl      $5, 8+a(%rip)                                 #21.5
            movl      $7, 12+a(%rip)                                #21.5
            movl      $9, 16+a(%rip)                                #21.5
            movl      $11, 20+a(%rip)                               #21.5
            movl      $13, 24+a(%rip)                               #21.5
            movl      $15, 28+a(%rip)                               #21.5
            vmovdqu   .L_2il0floatpacket.2(%rip), %ymm1             #35.5
                                    # LOE rax rbx r12 r13 r14 r15 edx ymm1
    ..B1.6:                         # Preds ..B1.6 ..B1.5
                                    # Execution count [4.29e+04]
            vpaddd    a(%rax), %ymm1, %ymm0                         #38.16
            incl      %edx                                          #35.5
            vmovdqu   %ymm0, 32+a(%rax)                             #39.41
            addq      $32, %rax                                     #35.5
            cmpl      $2047, %edx                                   #35.5
            jb        ..B1.6        # Prob 99%                      #35.5
                                    # LOE rax rbx r12 r13 r14 r15 edx ymm1
    ..B1.7:                         # Preds ..B1.6
                                    # Execution count [2.91e+00]
            vzeroupper                                              #46.5
            rdtsc                                                   #46.5
            shlq      $32, %rdx                                     #46.5
            orq       %rdx, %rax                                    #46.5
                                    # LOE rax rbx r12 r13 r14 r15
    ..B1.8:                         # Preds ..B1.7
                                    # Execution count [2.91e+00]
            movq      %rax, t2_rdtsc(%rip)                          #46.5
                                    # LOE rbx r12 r13 r14 r15
    ..B1.26:                        # Preds ..B1.8
                                    # Execution count [2.91e+00]
    # Begin ASM
    # #mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm2
    # End ASM
                                    # LOE rbx r12 r13 r14 r15
    ..B1.25:                        # Preds ..B1.26
                                    # Execution count [2.91e+00]
            movq      t2_rdtsc(%rip), %rdx                          #46.5
            subq      t1_rdtsc(%rip), %rdx                          #46.5
            movq      ttbest_rdtsc(%rip), %rsi                      #46.5
            movq      %rdx, ttotal_rdtsc(%rip)                      #46.5
            cmpq      %rsi, %rdx                                    #46.5
            jge       ..B1.10       # Prob 50%                      #46.5
                                    # LOE rdx rbx rsi r12 r13 r14 r15
    ..B1.9:                         # Preds ..B1.25
                                    # Execution count [1.45e+00]
            movq      elapsed_rdtsc(%rip), %rcx                     #46.5
            movq      %rcx, %rax                                    #46.5
            negq      %rax                                          #46.5
            movq      %rdx, %rsi                                    #46.5
            addq      $100000000, %rax                              #46.5
            movq      %rdx, ttbest_rdtsc(%rip)                      #46.5
            movq      %rax, elapsed(%rip)                           #46.5
            jmp       ..B1.11       # Prob 100%                     #46.5
                                    # LOE rdx rcx rbx rsi r12 r13 r14 r15
    ..B1.10:                        # Preds ..B1.25
                                    # Execution count [1.45e+00]
            movq      elapsed_rdtsc(%rip), %rcx                     #46.5
                                    # LOE rdx rcx rbx rsi r12 r13 r14 r15
    ..B1.11:                        # Preds ..B1.9 ..B1.10
                                    # Execution count [2.91e+00]
            movq      ttime(%rip), %rax                             #46.5
            addq      %rdx, %rax                                    #46.5
            movq      %rax, ttime(%rip)                             #46.5
            testq     %rcx, %rcx                                    #46.5
            je        ..B1.14       # Prob 50%                      #46.5
                                    # LOE rax rcx rbx rsi r12 r13 r14 r15
    ..B1.12:                        # Preds ..B1.11
                                    # Execution count [1.45e+00]
            decq      %rcx                                          #46.5
            movq      %rcx, elapsed_rdtsc(%rip)                     #46.5
            cmpq      overal_time(%rip), %rax                       #46.5
            jl        ..B1.4        # Prob 82%                      #46.5
            jmp       ..B1.15       # Prob 100%                     #46.5
                                    # LOE rcx rbx rsi r12 r13 r14 r15
    ..B1.14:                        # Preds ..B1.11
                                    # Execution count [1.45e+00]
            movq      $-1, elapsed_rdtsc(%rip)                      #46.5
            movq      $-1, %rcx                                     #46.5
                                    # LOE rcx rbx rsi r12 r13 r14 r15
    ..B1.15:                        # Preds ..B1.12 ..B1.14
                                    # Execution count [1.00e+00]
            negq      %rcx                                          #46.5
            movl      $.L_2__STRING.2, %edi                         #46.5
            addq      $100000000, %rcx                              #46.5
            xorl      %eax, %eax                                    #46.5
            movq      elapsed(%rip), %rdx                           #46.5
    ..___tag_value_main.8:
    #       printf(const char *__restrict__, ...)
            call      printf                                        #46.5
    ..___tag_value_main.9:
                                    # LOE rbx r12 r13 r14 r15
    ..B1.16:                        # Preds ..B1.15
                                    # Execution count [1.00e+00]
            movl      $.L_2__STRING.3, %edi                         #46.5
            movl      $.L_2__STRING.4, %esi                         #46.5
    #       fopen(const char *__restrict__, const char *__restrict__)
            call      fopen                                         #46.5
                                    # LOE rax rbx r12 r13 r14 r15
    ..B1.17:                        # Preds ..B1.16
                                    # Execution count [1.00e+00]
            movl      $128, %ecx                                    #46.5
            movq      %rax, %rdi                                    #46.5
            movq      %rax, fileForSpeedups(%rip)                   #46.5
            movl      $.L_2__STRING.5, %esi                         #46.5
            movl      %ecx, %r8d                                    #46.5
            xorl      %eax, %eax                                    #46.5
            movq      programName(%rip), %rdx                       #46.5
            movq      ttbest_rdtsc(%rip), %r9                       #46.5
    #       fprintf(FILE *__restrict__, const char *__restrict__, ...)
            call      fprintf                                       #46.5
                                    # LOE rbx r12 r13 r14 r15
    ..B1.18:                        # Preds ..B1.17
                                    # Execution count [1.00e+00]
            xorl      %eax, %eax                                    #47.9
            movq      %rbp, %rsp                                    #47.9
            popq      %rbp                                          #47.9
        .cfi_def_cfa 7, 8
        .cfi_restore 6
            ret                                                     #47.9
            .align    16,0x90
                                    # LOE
        .cfi_endproc
    # mark_end;
        .type   main,@function
        .size   main,.-main
    ..LNmain.0:
        .data
    # -- End  main
        .bss
        .align 8
        .align 8
        .globl fileForSpeedups
    fileForSpeedups:
        .type   fileForSpeedups,@object
        .size   fileForSpeedups,8
        .space 8    # pad
        .align 8
        .globl ttime
    ttime:
        .type   ttime,@object
        .size   ttime,8
        .space 8    # pad
        .data
        .align 8
        .align 8
        .globl programName
    programName:
        .quad   .L_2__STRING.0
        .type   programName,@object
        .size   programName,8
        .align 8
        .globl ttbest_rdtsc
    ttbest_rdtsc:
        .long   0x5d89ffff,0x01634578
        .type   ttbest_rdtsc,@object
        .size   ttbest_rdtsc,8
        .align 8
        .globl elapsed_rdtsc
    elapsed_rdtsc:
        .long   0x05f5e100,0x00000000
        .type   elapsed_rdtsc,@object
        .size   elapsed_rdtsc,8
        .align 8
        .globl overal_time
    overal_time:
        .long   0xf84757ff,0x0000000d
        .type   overal_time,@object
        .size   overal_time,8
        .section .rodata, "a"
        .align 32
        .align 32
    .L_2il0floatpacket.2:
        .long   0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010
        .type   .L_2il0floatpacket.2,@object
        .size   .L_2il0floatpacket.2,32
        .section .rodata.str1.4, "aMS",@progbits,1
        .align 4
        .align 4
    .L_2__STRING.1:
        .long   860047681
        .byte   0
        .type   .L_2__STRING.1,@object
        .size   .L_2__STRING.1,5
        .space 3, 0x00  # pad
        .align 4
    .L_2__STRING.2:
        .long   1701344266
        .long   1936024096
        .long   1936269428
        .long   1819026720
        .long   1852383332
        .long   1819026720
        .long   543716452
        .long   1919251561
        .long   1869182049
        .long   1851859054
        .long   1814372452
        .long   1914725484
        .long   1952804965
        .long   1869182057
        .long   684910
        .type   .L_2__STRING.2,@object
        .size   .L_2__STRING.2,60
        .align 4
    .L_2__STRING.3:
        .long   1701603686
        .long   1400008518
        .long   1684366704
        .long   7565429
        .type   .L_2__STRING.3,@object
        .size   .L_2__STRING.3,16
        .align 4
    .L_2__STRING.4:
        .word   97
        .type   .L_2__STRING.4,@object
        .size   .L_2__STRING.4,2
        .space 2, 0x00  # pad
        .align 4
    .L_2__STRING.5:
        .long   539783973
        .long   628646949
        .long   622865508
        .long   174353516
        .byte   0
        .type   .L_2__STRING.5,@object
        .size   .L_2__STRING.5,17
        .space 3, 0x00  # pad
        .align 4
    .L_2__STRING.0:
        .word   32
        .type   .L_2__STRING.0,@object
        .size   .L_2__STRING.0,2
        .data
        .comm mask1,128,32
        .comm t1_rdtsc,8,8
        .comm t2_rdtsc,8,8
        .comm ttotal_rdtsc,8,8
        .comm elapsed,8,8
        .comm mask,128,32
        .comm a,65536,32
        .section .note.GNU-stack, ""
    // -- Begin DWARF2 SEGMENT .eh_frame
        .section .eh_frame,"a",@progbits
    .eh_frame_seg:
        .align 8
    # End
    
  2. GCC

    //gcc  -D _GNU_SOURCE -O3 -fno-tree-vectorize -fno-tree-slp-vectorize -march=native -c -S -o "AIC3" "AIC3.c"
    rdtsc
    salq    $32, %rdx
    movq    %r10, a(%rip)
    
    
    orq %rdx, %rax
    movq    %r9, a+8(%rip)
    movq    %r8, a+16(%rip)
    movq    %rdi, a+24(%rip)
    
    
    vmovdqa a(%rip), %ymm1
    movq    %rax, t1_rdtsc(%rip)
    movl    $a+32, %eax
    .p2align 4,,10
    .p2align 3
    .L2:
        vpaddd  %ymm1, %ymm2, %ymm0
        addq    $32, %rax
        vmovdqa %ymm0, -32(%rax)
        vmovdqa %ymm0, %ymm1
        cmpq    %rax, %rcx
        jne .L2
    rdtsc
    
  3. Clang

    //clang -D _GNU_SOURCE -O3 -fno-vectorize -fno-slp-vectorize -march=native -c -S -o "AIC3clang" "AIC3.c"
    rdtsc
    shlq    $32, %rdx
    orq %rax, %rdx
    movq    %rdx, t1_rdtsc(%rip)
    movq    %r8, a(%rip)
    movq    %r9, a+8(%rip)
    movq    %r10, a+16(%rip)
    movq    %rcx, a+24(%rip)
    vmovdqa a(%rip), %ymm8
    movl    $64, %eax
    jmp .LBB0_2
    .p2align    4, 0x90
    .LBB0_9:                                #   in Loop: Header=BB0_2 Depth=2
        vpaddd  %ymm7, %ymm8, %ymm8
        vmovdqa %ymm8, a(,%rax,4)
        addq    $64, %rax
    .LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vpaddd  %ymm0, %ymm8, %ymm9
        vmovdqa %ymm9, a-224(,%rax,4)
        vpaddd  %ymm1, %ymm8, %ymm9
        vmovdqa %ymm9, a-192(,%rax,4)
        vpaddd  %ymm2, %ymm8, %ymm9
        vmovdqa %ymm9, a-160(,%rax,4)
        vpaddd  %ymm3, %ymm8, %ymm9
        vmovdqa %ymm9, a-128(,%rax,4)
        vpaddd  %ymm4, %ymm8, %ymm9
        vmovdqa %ymm9, a-96(,%rax,4)
        vpaddd  %ymm5, %ymm8, %ymm9
        vmovdqa %ymm9, a-64(,%rax,4)
        vpaddd  %ymm6, %ymm8, %ymm9
        vmovdqa %ymm9, a-32(,%rax,4)
        cmpq    $16383, %rax            # imm = 0x3FFF
        jl  .LBB0_9
    # BB#3:                                 #   in Loop: Header=BB0_1 Depth=1
    rdtsc
    

The speedups are ~1.30, ~4.10 and 4.00 using icc, gcc and clang, respectively.

As I mentioned, I've compiled the same code with the different compilers and recorded the rdtsc values. The speedup for ICC is not what I expected. I used IACA to analyze the inner loop; the summarized output is:

-----------------------------------------------------
|  compilers  |    icc    |     gcc    |    clang    |
------------------------------------------------------
|  Throughput |1.49 cycle |1.00 cycle  |1.49 cycle   |
------------------------------------------------------
|  bottleneck | Front End | dependency | Front End   |
------------------------------------------------------

UPDATE-0: I've compared the code generated with and without the IACA markers. The reason IACA does not help in this case is that the outputs are not the same: injecting the IACA marks seems to force the compilers to stop optimizing across them, so GCC generates the same kind of code as ICC and Clang do. However, GCC's address calculation is still more efficient from a throughput point of view. In summary, IACA cannot help for this code.
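
For reference, the markers were injected around the inner loop with the usual macros from iacaMarks.h, roughly like this; they expand to inline-asm marker bytes, which is presumably why the compilers stop optimizing across them:

#include "iacaMarks.h"   // provides IACA_START / IACA_END

for(; i<LEN-1; i+=8){
    IACA_START           // start-of-analysis marker (inline asm)
    dep1 = _mm256_load_si256((__m256i *) &a[i-8]);
    dep1 = _mm256_add_epi32(dep1, coeff);
    _mm256_store_si256((__m256i *) &a[i], dep1);
}
IACA_END                 // end-of-analysis marker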

UPDATE-1: The perf outputs are as follows:

For array size 512*512:
ICC:

 86.06 │loop:  vpaddd 0x604580(%rax),%ymm1,%ymm0
  0.17 │       inc    %edx
  4.73 │       vmovdq %ymm0,0x6045a0(%rax)
       │       add    $0x20,%rax
       │       cmp    $0x7fff,%edx
  8.98 │       jb     loop


GCC:

 30.62 │loop:  vpaddd %ymm1,%ymm2,%ymm0
 15.12 │       add    $0x20,%rax
 46.03 │       vmovdq %ymm0,-0x20(%rax)
  2.40 │       vmovdq %ymm0,%ymm1
  0.01 │       cmp    %rax,%rcx
  5.62 │       jne    loop


LLVM:

  3.00 │loop:  vpaddd %ymm0,%ymm7,%ymm8                                                  
  6.61 │       vmovdq %ymm8,0x6020e0(,%rax,4)                                            
 15.96 │       vpaddd %ymm1,%ymm7,%ymm8                                                  
  5.19 │       vmovdq %ymm8,0x602100(,%rax,4)                                            
  1.89 │       vpaddd %ymm2,%ymm7,%ymm8                                                  
  6.16 │       vmovdq %ymm8,0x602120(,%rax,4)                                            
 13.25 │       vpaddd %ymm3,%ymm7,%ymm8                                                 
  8.01 │       vmovdq %ymm8,0x602140(,%rax,4)                                            
  2.10 │       vpaddd %ymm4,%ymm7,%ymm8                                                  
  5.37 │       vmovdq %ymm8,0x602160(,%rax,4)                                            
 13.92 │       vpaddd %ymm5,%ymm7,%ymm8                                                  
  7.95 │       vmovdq %ymm8,0x602180(,%rax,4)                                            
  0.89 │       vpaddd %ymm6,%ymm7,%ymm7                                                  
  4.34 │       vmovdq %ymm7,0x6021a0(,%rax,4)                                            
  2.82 │       add    $0x38,%rax                                                         
       │       cmp    $0x3ffff,%rax                                                      
  2.24 │       jl     loop                                                                

The ICC assembly output shows that there are some SIMD instructions between the two rdtsc reads. If I have missed something, or something is wrong, I really have no idea what it is. I have spent a lot of time trying to pin down the problem, with no success. If somebody knows the reason, please help. Thanks in advance.

Amiri
  • Do you expect every compiler to produce near-optimal code in all circumstances? If so, I have a really nice shiny bridge to sell you, almost new. – n. m. could be an AI Dec 30 '17 at 19:03
  • @n.m. I expect ICC to be a good boy. What is your offer?:) – Amiri Dec 30 '17 at 20:44
  • ICC is the closed-source product of a single company and stands no real chance against OSS behemoths like gcc and clang. The only thing giving it somewhat of an edge is that it comes from the same company that produces the target CPU, so they can exploit the HW knowledge early. Then again, their subtle changes that produce slightly suboptimal code in exchange for degrading performance on AMD almost cancel that advantage. But gcc and clang are now mature enough to make any single-company effort hopeless; you can't gather that amount of expertise under a single roof. – Ped7g Dec 30 '17 at 22:03
  • @Ped7g: ICC is better at auto-vectorizing some things than gcc/clang. It only targets x86, while gcc vectorizes in an arch-independent internal representation which doesn't even know which shuffles the target can do efficiently. One major example is that ICC can auto-vectorize search loops (e.g. a pure-C implementation of `memchr`), but gcc / clang can only auto-vectorize loops with a trip-count that's determined before loop-entry. Clang is often very good, though, when it does auto-vectorize. – Peter Cordes Dec 31 '17 at 13:30
  • @PeterCordes, hi Peter, missed you. In a benchmark paper I read (I can't remember where), ICC was able to auto-vectorize 90% of loops while Clang and GCC were able to auto-vectorize 60% and 50%. I think Clang is going to win the competition. As far as I know, most researchers implement their ideas in LLVM, which Clang uses. – Amiri Dec 31 '17 at 13:34
  • In my experience Clang performs better when you unroll the loop in the source code. – Amiri Dec 31 '17 at 13:36
  • Well... it is, sometimes. – n. m. could be an AI Dec 31 '17 at 18:30
  • What was the reason for the downvote? If there is a problem, please say so and suggest a solution. – Amiri Jan 01 '18 at 19:28

1 Answer


The different compilers actually use fairly different implementation strategies here.

GCC notices that it never has to reload a[i-8], which was calculated in the previous iteration and can therefore be kept in a register. This relies somewhat on mov-elimination; otherwise the reg-reg move would still add some latency, though even without mov-elimination it would be a lot faster than reloading every time.

ICC's codegen is very naive: it does it exactly the way you wrote it. Each iteration's load reads back the vector that the previous iteration just stored, so the loop-carried dependency includes the store-forwarding latency on top of the add, which adds quite a lot of latency.

Clang does approximately the same thing as GCC, but unrolls by 8 (minus the first iteration). Clang often likes to unroll more. I'm not sure why it's slightly worse than what GCC does.
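
In intrinsics, clang's structure corresponds roughly to this (an untested sketch; the offN vectors and the tail loop are my own illustration, clang materializes the equivalent itself). The loop-carried chain becomes a single vpaddd per 64 elements:

__m256i base = _mm256_load_si256((__m256i *) &a[0]);   // a[0..7] from the scalar prologue
__m256i off1 = _mm256_set1_epi32(1*8*c);
__m256i off2 = _mm256_set1_epi32(2*8*c);
__m256i off3 = _mm256_set1_epi32(3*8*c);
__m256i off4 = _mm256_set1_epi32(4*8*c);
__m256i off5 = _mm256_set1_epi32(5*8*c);
__m256i off6 = _mm256_set1_epi32(6*8*c);
__m256i off7 = _mm256_set1_epi32(7*8*c);
__m256i off8 = _mm256_set1_epi32(8*8*c);
for(i = 8; i <= LEN-64; i += 64){
    _mm256_store_si256((__m256i *) &a[i   ], _mm256_add_epi32(base, off1));
    _mm256_store_si256((__m256i *) &a[i+ 8], _mm256_add_epi32(base, off2));
    _mm256_store_si256((__m256i *) &a[i+16], _mm256_add_epi32(base, off3));
    _mm256_store_si256((__m256i *) &a[i+24], _mm256_add_epi32(base, off4));
    _mm256_store_si256((__m256i *) &a[i+32], _mm256_add_epi32(base, off5));
    _mm256_store_si256((__m256i *) &a[i+40], _mm256_add_epi32(base, off6));
    _mm256_store_si256((__m256i *) &a[i+48], _mm256_add_epi32(base, off7));
    base = _mm256_add_epi32(base, off8);                // the only loop-carried add
    _mm256_store_si256((__m256i *) &a[i+56], base);
}
for(; i < LEN-1; i += 8){                               // tail: (LEN-8) is not a multiple of 64
    base = _mm256_add_epi32(base, off1);
    _mm256_store_si256((__m256i *) &a[i], base);
}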

You can avoid the reloading by simply not doing it in the first place (not tested):

dep1 = _mm256_load_si256((__m256i *) &a[0]);
for(; i<LEN-1; i+=8){

    dep1 = _mm256_add_epi32(dep1, coeff);
    _mm256_store_si256((__m256i *) &a[i], dep1);    

}
harold
  • I've added the `perf` outputs; they show why `gcc` is faster than `Clang`. I think it's because of the different address calculation, and gcc's is more efficient. – Amiri Dec 31 '17 at 13:30
  • @Martin: that `perf` output doesn't explain anything. [Skylake can keep indexed-addressing-mode stores micro-fused](https://stackoverflow.com/questions/26046634/micro-fusion-and-addressing-modes); IACA is wrong about this and still applies the Sandybridge rules. It looks to me like clang should be much faster: unrolling with more registers so the loop-carried dep chain is only every 8th `vpaddd`. But this is integer, not FP, so `vpaddd` latency is only 1c, and it should bottleneck on store throughput (1 per clock, or less if it's not hot in L1D). 3 add/c without the store bottleneck, though. – Peter Cordes Dec 31 '17 at 13:43
  • I use `IACA 3.0`; it supports `SKL` by default and you can change it to `SKLX`. – Amiri Dec 31 '17 at 13:45
  • It would make more sense for clang to use a non-indexed addressing mode, though. But anyway, gcc will bottleneck on the front-end, rather than store throughput. – Peter Cordes Dec 31 '17 at 13:45
  • @Martin: That's not what I meant. IACA for Haswell and Skylake is wrong because it models micro-fusion for indexed addressing modes on SKL the same as on SnB. But on real hardware it changed with HSW, which you can see if you use perf counters for `uops_issued`. – Peter Cordes Dec 31 '17 at 13:47
  • @PeterCordes, I got it. – Amiri Dec 31 '17 at 13:48