1

I am working on Ubuntu 18.04.6 LTS with Intel Core(TM) 3.20GHz. In GCC is written the following on the -Os flag:

-Os
Optimize for size. -Os enables all -O2 optimizations except those that often increase code size:

-falign-functions  -falign-jumps 
-falign-labels  -falign-loops 
-fprefetch-loop-arrays  -freorder-blocks-algorithm=stc
It also enables -finline-functions, causes the compiler to tune for code size rather than execution speed, and performs further optimizations designed to reduce code size.

I complied my program with two kinds of flags:

  1. The flags "-O2 -fno-align-functions -fno-align-loops -fno-align-labels -fno-align-jumps -fno-prefetch-loop-arrays -freorder-blocks-algorithm=simple -finline-functions".

  2. The flag "-Os"

I got different assembly files, running in different speed.

Am I missing something here? What can possibly make the two programs compile differently? (it seems from GCC that both options should be equivalent).

Here is a sample of the critical code

#include <stdio.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <x86intrin.h>
#include <stddef.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <sys/mman.h>
#include <pthread.h>


#include <stdalign.h>
#define alignas _Alignas


#include <sys/resource.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include<pthread.h>

#ifdef DEBUG
  #define INLINE
#else
  #define INLINE inline
#endif


FILE *file;
#define BUFFER_BLOCK_SIZE_TARGET   4096
#define BUFFER_BLOCK_SIZE_SOURCE   4096

#ifdef DEBUG
  #define INLINE
#else
  #define INLINE inline
#endif


struct mmap_buf_st {
   void alignas(BUFFER_BLOCK_SIZE_TARGET)   *ptr;
   uint64_t   num_elem;
   uint64_t   len_B;
   int is_allocated;
};
typedef struct mmap_buf_st * mmap_buf;


struct Block_st {
   mmap_buf data_ptr;
   uint64_t max_size_BAM;
   uint64_t size_no_MD;

   uint64_t object_size;
   uint64_t BAMID;
   uint64_t max_num_elem;
   uint64_t actual_num_elem;
   double valid_precentage;
};
typedef struct Block_st *BAM ;

#define page_size_B 4096
#define page_metadata_size_B 2
#define object_metadata_B 20

#define MAX_NUM_SOURCES 6
#define  page_data_B 4094


int use_buffer_target=1;
int use_buffer_source=0;

int copy_MD_to_buff=1;
int copy_as_it_is=0;
int no_copy_to_target=0;



typedef struct tstruct_st {


uint32_t obj_size:21 ;

uint32_t prev_struct:20 ;
uint32_t prev_obj:24 ;
uint64_t off_source:34 ;
uint32_t key[4] ;
}__attribute__((__packed__)) tstruct_t;


typedef tstruct_t * tsruct_ptr;




struct GYN_st{
    BAM source_BAMs[MAX_NUM_SOURCES];
    BAM target;
    uint64_t object_size_B;
    uint16_t BAM_size_GB;
};

typedef struct GYN_st * GYN;

typedef struct lazy_accumulator_st{
    pthread_t thread;
    uint64_t thread_num;
    uint64_t *values_accum;
    tsruct_ptr my_struct; 
    uint64_t start;
    uint64_t end;//not including end
    uint64_t sum_of_obj_size_per_thread;
    uint64_t offset_target_start_no_MD;
    GYN GYN;
    uint64_t num_threads;


}lazy_accumulator_t;

#define MILLION 1000000L

#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))


uint64_t get_clock_wall_time_microseconds(){
    struct timespec current_time;
    clock_gettime(CLOCK_MONOTONIC,&current_time);
    return current_time.tv_sec*MILLION+current_time.tv_nsec/1000;
}

static void * copy_data_per_thread(void * lazy_accumu){
    assert(no_copy_to_target<=use_buffer_target*copy_MD_to_buff);
    lazy_accumulator_t *lazy_accumulator=(lazy_accumulator_t *) lazy_accumu;
    if(copy_as_it_is){
        char *target=lazy_accumulator->GYN->target->data_ptr->ptr;
        char *source=lazy_accumulator->GYN->source_BAMs[lazy_accumulator->thread_num% MAX_NUM_SOURCES]->data_ptr->ptr;
        uint64_t size=lazy_accumulator->GYN->target->max_size_BAM/lazy_accumulator->num_threads;
        uint64_t start=size*lazy_accumulator->thread_num;
        memcpy(target+start,source+start,size);
        return NULL;
    }

    uint64_t start=lazy_accumulator->start;
    uint64_t end=lazy_accumulator->end;

    uint64_t start_time=get_clock_wall_time_microseconds(),end_time;

    uint64_t num_my_struct_elements=end-start;
    tsruct_ptr my_struct=lazy_accumulator->my_struct;
    uint64_t curr_offset_target_start;
    GYN GYN=lazy_accumulator->GYN;


    uint64_t page_MD_offset=page_metadata_size_B; //page offset of first obj

    char  alignas(BUFFER_BLOCK_SIZE_TARGET)  page_traget_buff[BUFFER_BLOCK_SIZE_TARGET];
    __builtin_assume_aligned(page_traget_buff,BUFFER_BLOCK_SIZE_TARGET);

    uint64_t curr_offset_target_start_no_MD;
    uint64_t offset_page_target_buff;

    if(use_buffer_target){
        memset(page_traget_buff,0,BUFFER_BLOCK_SIZE_TARGET);
        offset_page_target_buff=0;
    }



    uint64_t pages_till_now_traget;
    char *ptr_target_data=lazy_accumulator->GYN->target->data_ptr->ptr;
    uint8_t end_segment; 
    uint64_t * accum=lazy_accumulator->values_accum;
    uint64_t curr_segments=0;
    uint8_t init_local_buff=0;



;
    if(start==0){
       if(use_buffer_target){
        init_local_buff=1;
       }

       if(use_buffer_target&& copy_MD_to_buff){
            assert(offset_page_target_buff==0);
            memcpy(page_traget_buff+offset_page_target_buff,&page_MD_offset,page_metadata_size_B);
            offset_page_target_buff+=page_metadata_size_B;

       }
       else{
        memcpy(ptr_target_data+curr_offset_target_start,&page_MD_offset,page_metadata_size_B);

       }



    }


    for(uint64_t i=0;i<num_my_struct_elements;i++){

        tstruct_t my_struct_elem=my_struct[i+start];
        
        curr_offset_target_start_no_MD=accum[i]+lazy_accumulator->offset_target_start_no_MD;
        pages_till_now_traget=curr_offset_target_start_no_MD/page_data_B+1;
        curr_offset_target_start=curr_offset_target_start_no_MD+pages_till_now_traget*page_metadata_size_B;

        assert(curr_offset_target_start<GYN->target->max_size_BAM);
        uint64_t curr_offset_target_in_page=curr_offset_target_start%page_size_B;
        assert(curr_offset_target_start==(i+start)*(object_metadata_B+GYN->object_size_B)+pages_till_now_traget*page_metadata_size_B); 
        uint64_t data_left_to_copy_obj=my_struct_elem.obj_size+object_metadata_B;
        uint64_t curr_offset_source_start=my_struct_elem.off_source;
        int64_t curr_offset_source_start_in_block=curr_offset_source_start%BUFFER_BLOCK_SIZE_SOURCE;

        uint64_t source_num=my_struct_elem.prev_struct;
        char *ptr_source_data=GYN->source_BAMs[source_num]->data_ptr->ptr;
        
        



        uint8_t end_segment=0;

        while(!end_segment){


            uint64_t curr_offset_target_in_page=curr_offset_target_start%page_size_B;

            assert(curr_offset_source_start% page_size_B>=page_metadata_size_B);
            uint64_t data_left_to_copy_source=page_size_B-curr_offset_source_start%page_size_B;
            uint64_t data_left_to_copy_target=page_size_B-curr_offset_target_start%page_size_B;
            
            uint64_t data_to_copy=MIN(MIN(data_left_to_copy_source,data_left_to_copy_target),data_left_to_copy_obj);

         //   printf("%lu\n",curr_offset_target_start);
            if(!use_buffer_target || !init_local_buff){
                memcpy(ptr_target_data+curr_offset_target_start,ptr_source_data+curr_offset_source_start,data_to_copy);
            }
            else{
                assert(data_to_copy+offset_page_target_buff<=BUFFER_BLOCK_SIZE_TARGET);
                memcpy(page_traget_buff+offset_page_target_buff,ptr_source_data+curr_offset_source_start,data_to_copy);
            }
            if(data_to_copy==data_left_to_copy_obj){
                assert(curr_offset_target_start% page_size_B>=page_metadata_size_B);
                assert((curr_offset_target_start+data_left_to_copy_obj-1)% page_size_B>=page_metadata_size_B);
                assert((curr_offset_source_start+data_left_to_copy_obj-1)% page_size_B>=page_metadata_size_B);
                end_segment=1;
            }

            curr_offset_target_start+=data_to_copy;
            curr_offset_source_start+=data_to_copy;
            data_left_to_copy_obj-=data_to_copy;
           
           if(use_buffer_target && init_local_buff){
                offset_page_target_buff+=data_to_copy;
                assert(offset_page_target_buff<=BUFFER_BLOCK_SIZE_TARGET);
            }
            

            if(data_to_copy==data_left_to_copy_source){
                assert((curr_offset_source_start)% page_size_B==0);
                curr_offset_source_start+=page_metadata_size_B;
            }
            
            if(use_buffer_target&&init_local_buff&& offset_page_target_buff%BUFFER_BLOCK_SIZE_TARGET==0){
                assert(data_to_copy==data_left_to_copy_target);
               if(copy_MD_to_buff){
                                assert((curr_offset_target_start)% page_size_B==0);

               }
               if(!no_copy_to_target){
                    if(copy_MD_to_buff){
                        memcpy(ptr_target_data+(curr_offset_target_start-BUFFER_BLOCK_SIZE_TARGET),page_traget_buff,BUFFER_BLOCK_SIZE_TARGET);

                    }
                    else{
                       memcpy(ptr_target_data+(curr_offset_target_start-BUFFER_BLOCK_SIZE_TARGET+page_metadata_size_B),page_traget_buff+page_metadata_size_B,BUFFER_BLOCK_SIZE_TARGET-page_metadata_size_B);

                    }
               }
                offset_page_target_buff=0;

            }
            

            if(data_to_copy==data_left_to_copy_target){
                assert((curr_offset_target_start)% page_size_B==0);
                page_MD_offset=data_left_to_copy_obj+page_metadata_size_B;
                if(page_MD_offset>=page_size_B){
                    page_MD_offset=0;
                }
            

               if(use_buffer_target&& copy_MD_to_buff){
                    memcpy(page_traget_buff,&page_MD_offset,page_metadata_size_B);
               }
               else{
                    memcpy(ptr_target_data+curr_offset_target_start,&page_MD_offset,page_metadata_size_B);
               }
               if(use_buffer_target){
                    offset_page_target_buff=page_metadata_size_B;

               }

                curr_offset_target_start+=page_metadata_size_B;
                pages_till_now_traget++;
                assert(curr_offset_target_start% page_size_B==page_metadata_size_B);
                init_local_buff=1;

            }
        }

    }

    
}


int main(){
    return 1;
}

I tried to do

gcc sample.c -Os -o sample_Os.s -Os -lm -lpthread -S

and

gcc sample.c -O2 -fno-align-functions -fno-align-loops -fno-align-labels -fno-align-jumps -fno-prefetch-loop-arrays -freorder-blocks-algorithm=simple -finline-functions  -o sample_O2.s -lm -lpthread -S

And I finally did

diff sample_Os.s sample_O2.s

Which resulted in

< .LFB5059:
---
> .LFB5086:
11c11
<   leaq    8(%rsp), %rsi
---
>   movq    %rsp, %rsi
16,26c16,27
<   movq    16(%rsp), %rax
<   movl    $1000, %esi
<   imulq   $1000000, 8(%rsp), %rcx
<   cqto
<   idivq   %rsi
<   addq    %rcx, %rax
<   movq    24(%rsp), %rdx
<   xorq    %fs:40, %rdx
<   je  .L2
<   call    __stack_chk_fail@PLT
< .L2:
---
>   movq    8(%rsp), %rcx
>   movabsq $2361183241434822607, %rdx
>   imulq   $1000000, (%rsp), %rsi
>   movq    %rcx, %rax
>   sarq    $63, %rcx
>   imulq   %rdx
>   sarq    $7, %rdx
>   subq    %rcx, %rdx
>   movq    24(%rsp), %rdi
>   xorq    %fs:40, %rdi
>   leaq    (%rsi,%rdx), %rax
>   jne .L5
27a29
>   .cfi_remember_state
29a32,34
> .L5:
>   .cfi_restore_state
>   call    __stack_chk_fail@PLT
31c36
< .LFE5059:
---
> .LFE5086:
37c42
< .LFB5061:
---
> .LFB5088:
42c47
< .LFE5061:
user3563894
  • 331
  • 3
  • 13
  • Please share the target code used to spot the issue (or a simpler one) so people can reproduce the problem. – Jérôme Richard Feb 02 '23 at 11:01
  • This is explained in the quote you posted: "It also -- causes the compiler to tune for code size rather than execution speed, and performs further optimizations designed to reduce code size." – VLL Feb 02 '23 at 11:35
  • @VLL but what kind of addional operation it does? (except the mentioned flags, which are disabled). – user3563894 Feb 02 '23 at 11:38
  • -Os uses `idiv` instead of [Why does GCC use multiplication by a strange number in implementing integer division?](https://stackoverflow.com/q/41183935) , even though it's a lot slower and not much smaller for 32-bit operand-size. – Peter Cordes Feb 02 '23 at 15:03
  • @PeterCordes in the full code (not the sample one) -Os was signifintly faster than -O2. Maybe there is another explanstion? – user3563894 Feb 02 '23 at 17:13
  • @PeterCordes also, why gcc does not mentioned this difference? – user3563894 Feb 02 '23 at 17:13
  • Mention what difference? GCC says `-Os` optimizes for size more than speed, with `-Oz` optimizing aggressively for size without concern for speed. The choices it makes for any given ISA at different optimization levels are based on heuristics / tuning choices that can change with GCC versions, and with different `-mtune=` options for different microarchitectures for that target ISA. So a future GCC version could change that `idiv` choice without needing to update the docs. – Peter Cordes Feb 02 '23 at 17:17
  • If your actual test found `-Os` was faster, this is not a [mcve] of it. I'm not shocked that `-fno-align-functions -fno-align-loops` would hurt performance on some CPUs, there's a reason they're on by default. (But with modern CPUs having a uop cache, it's often less important.) Depending what Intel CPU you have (you didn't say what generation), differences in code alignment might fall afoul of [How can I mitigate the impact of the Intel jcc erratum on gcc?](https://stackoverflow.com/q/61256646) since you omitted `-Wa,-mbranches-within-32B-boundaries` – Peter Cordes Feb 02 '23 at 17:19
  • Hmmm ok. Where can i find these heuristics per microarchitrctures? – user3563894 Feb 02 '23 at 17:20
  • Using -O2 without -fno flags is still slower than -Os. I will read these gcc flags. Thanks! – user3563894 Feb 02 '23 at 17:23

0 Answers0