I am working on Ubuntu 18.04.6 LTS with Intel Core(TM) 3.20GHz. In GCC is written the following on the -Os flag:
-Os
Optimize for size. -Os enables all -O2 optimizations except those that often increase code size:
-falign-functions -falign-jumps
-falign-labels -falign-loops
-fprefetch-loop-arrays -freorder-blocks-algorithm=stc
It also enables -finline-functions, causes the compiler to tune for code size rather than execution speed, and performs further optimizations designed to reduce code size.
I complied my program with two kinds of flags:
The flags "-O2 -fno-align-functions -fno-align-loops -fno-align-labels -fno-align-jumps -fno-prefetch-loop-arrays -freorder-blocks-algorithm=simple -finline-functions".
The flag "-Os"
I got different assembly files, running in different speed.
Am I missing something here? What can possibly make the two programs compile differently? (it seems from GCC that both options should be equivalent).
Here is a sample of the critical code
#include <stdio.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <x86intrin.h>
#include <stddef.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <sys/mman.h>
#include <pthread.h>
#include <stdalign.h>
#define alignas _Alignas
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include<pthread.h>
#ifdef DEBUG
#define INLINE
#else
#define INLINE inline
#endif
FILE *file;
#define BUFFER_BLOCK_SIZE_TARGET 4096
#define BUFFER_BLOCK_SIZE_SOURCE 4096
#ifdef DEBUG
#define INLINE
#else
#define INLINE inline
#endif
struct mmap_buf_st {
void alignas(BUFFER_BLOCK_SIZE_TARGET) *ptr;
uint64_t num_elem;
uint64_t len_B;
int is_allocated;
};
typedef struct mmap_buf_st * mmap_buf;
struct Block_st {
mmap_buf data_ptr;
uint64_t max_size_BAM;
uint64_t size_no_MD;
uint64_t object_size;
uint64_t BAMID;
uint64_t max_num_elem;
uint64_t actual_num_elem;
double valid_precentage;
};
typedef struct Block_st *BAM ;
#define page_size_B 4096
#define page_metadata_size_B 2
#define object_metadata_B 20
#define MAX_NUM_SOURCES 6
#define page_data_B 4094
int use_buffer_target=1;
int use_buffer_source=0;
int copy_MD_to_buff=1;
int copy_as_it_is=0;
int no_copy_to_target=0;
typedef struct tstruct_st {
uint32_t obj_size:21 ;
uint32_t prev_struct:20 ;
uint32_t prev_obj:24 ;
uint64_t off_source:34 ;
uint32_t key[4] ;
}__attribute__((__packed__)) tstruct_t;
typedef tstruct_t * tsruct_ptr;
struct GYN_st{
BAM source_BAMs[MAX_NUM_SOURCES];
BAM target;
uint64_t object_size_B;
uint16_t BAM_size_GB;
};
typedef struct GYN_st * GYN;
typedef struct lazy_accumulator_st{
pthread_t thread;
uint64_t thread_num;
uint64_t *values_accum;
tsruct_ptr my_struct;
uint64_t start;
uint64_t end;//not including end
uint64_t sum_of_obj_size_per_thread;
uint64_t offset_target_start_no_MD;
GYN GYN;
uint64_t num_threads;
}lazy_accumulator_t;
#define MILLION 1000000L
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
uint64_t get_clock_wall_time_microseconds(){
struct timespec current_time;
clock_gettime(CLOCK_MONOTONIC,¤t_time);
return current_time.tv_sec*MILLION+current_time.tv_nsec/1000;
}
static void * copy_data_per_thread(void * lazy_accumu){
assert(no_copy_to_target<=use_buffer_target*copy_MD_to_buff);
lazy_accumulator_t *lazy_accumulator=(lazy_accumulator_t *) lazy_accumu;
if(copy_as_it_is){
char *target=lazy_accumulator->GYN->target->data_ptr->ptr;
char *source=lazy_accumulator->GYN->source_BAMs[lazy_accumulator->thread_num% MAX_NUM_SOURCES]->data_ptr->ptr;
uint64_t size=lazy_accumulator->GYN->target->max_size_BAM/lazy_accumulator->num_threads;
uint64_t start=size*lazy_accumulator->thread_num;
memcpy(target+start,source+start,size);
return NULL;
}
uint64_t start=lazy_accumulator->start;
uint64_t end=lazy_accumulator->end;
uint64_t start_time=get_clock_wall_time_microseconds(),end_time;
uint64_t num_my_struct_elements=end-start;
tsruct_ptr my_struct=lazy_accumulator->my_struct;
uint64_t curr_offset_target_start;
GYN GYN=lazy_accumulator->GYN;
uint64_t page_MD_offset=page_metadata_size_B; //page offset of first obj
char alignas(BUFFER_BLOCK_SIZE_TARGET) page_traget_buff[BUFFER_BLOCK_SIZE_TARGET];
__builtin_assume_aligned(page_traget_buff,BUFFER_BLOCK_SIZE_TARGET);
uint64_t curr_offset_target_start_no_MD;
uint64_t offset_page_target_buff;
if(use_buffer_target){
memset(page_traget_buff,0,BUFFER_BLOCK_SIZE_TARGET);
offset_page_target_buff=0;
}
uint64_t pages_till_now_traget;
char *ptr_target_data=lazy_accumulator->GYN->target->data_ptr->ptr;
uint8_t end_segment;
uint64_t * accum=lazy_accumulator->values_accum;
uint64_t curr_segments=0;
uint8_t init_local_buff=0;
;
if(start==0){
if(use_buffer_target){
init_local_buff=1;
}
if(use_buffer_target&& copy_MD_to_buff){
assert(offset_page_target_buff==0);
memcpy(page_traget_buff+offset_page_target_buff,&page_MD_offset,page_metadata_size_B);
offset_page_target_buff+=page_metadata_size_B;
}
else{
memcpy(ptr_target_data+curr_offset_target_start,&page_MD_offset,page_metadata_size_B);
}
}
for(uint64_t i=0;i<num_my_struct_elements;i++){
tstruct_t my_struct_elem=my_struct[i+start];
curr_offset_target_start_no_MD=accum[i]+lazy_accumulator->offset_target_start_no_MD;
pages_till_now_traget=curr_offset_target_start_no_MD/page_data_B+1;
curr_offset_target_start=curr_offset_target_start_no_MD+pages_till_now_traget*page_metadata_size_B;
assert(curr_offset_target_start<GYN->target->max_size_BAM);
uint64_t curr_offset_target_in_page=curr_offset_target_start%page_size_B;
assert(curr_offset_target_start==(i+start)*(object_metadata_B+GYN->object_size_B)+pages_till_now_traget*page_metadata_size_B);
uint64_t data_left_to_copy_obj=my_struct_elem.obj_size+object_metadata_B;
uint64_t curr_offset_source_start=my_struct_elem.off_source;
int64_t curr_offset_source_start_in_block=curr_offset_source_start%BUFFER_BLOCK_SIZE_SOURCE;
uint64_t source_num=my_struct_elem.prev_struct;
char *ptr_source_data=GYN->source_BAMs[source_num]->data_ptr->ptr;
uint8_t end_segment=0;
while(!end_segment){
uint64_t curr_offset_target_in_page=curr_offset_target_start%page_size_B;
assert(curr_offset_source_start% page_size_B>=page_metadata_size_B);
uint64_t data_left_to_copy_source=page_size_B-curr_offset_source_start%page_size_B;
uint64_t data_left_to_copy_target=page_size_B-curr_offset_target_start%page_size_B;
uint64_t data_to_copy=MIN(MIN(data_left_to_copy_source,data_left_to_copy_target),data_left_to_copy_obj);
// printf("%lu\n",curr_offset_target_start);
if(!use_buffer_target || !init_local_buff){
memcpy(ptr_target_data+curr_offset_target_start,ptr_source_data+curr_offset_source_start,data_to_copy);
}
else{
assert(data_to_copy+offset_page_target_buff<=BUFFER_BLOCK_SIZE_TARGET);
memcpy(page_traget_buff+offset_page_target_buff,ptr_source_data+curr_offset_source_start,data_to_copy);
}
if(data_to_copy==data_left_to_copy_obj){
assert(curr_offset_target_start% page_size_B>=page_metadata_size_B);
assert((curr_offset_target_start+data_left_to_copy_obj-1)% page_size_B>=page_metadata_size_B);
assert((curr_offset_source_start+data_left_to_copy_obj-1)% page_size_B>=page_metadata_size_B);
end_segment=1;
}
curr_offset_target_start+=data_to_copy;
curr_offset_source_start+=data_to_copy;
data_left_to_copy_obj-=data_to_copy;
if(use_buffer_target && init_local_buff){
offset_page_target_buff+=data_to_copy;
assert(offset_page_target_buff<=BUFFER_BLOCK_SIZE_TARGET);
}
if(data_to_copy==data_left_to_copy_source){
assert((curr_offset_source_start)% page_size_B==0);
curr_offset_source_start+=page_metadata_size_B;
}
if(use_buffer_target&&init_local_buff&& offset_page_target_buff%BUFFER_BLOCK_SIZE_TARGET==0){
assert(data_to_copy==data_left_to_copy_target);
if(copy_MD_to_buff){
assert((curr_offset_target_start)% page_size_B==0);
}
if(!no_copy_to_target){
if(copy_MD_to_buff){
memcpy(ptr_target_data+(curr_offset_target_start-BUFFER_BLOCK_SIZE_TARGET),page_traget_buff,BUFFER_BLOCK_SIZE_TARGET);
}
else{
memcpy(ptr_target_data+(curr_offset_target_start-BUFFER_BLOCK_SIZE_TARGET+page_metadata_size_B),page_traget_buff+page_metadata_size_B,BUFFER_BLOCK_SIZE_TARGET-page_metadata_size_B);
}
}
offset_page_target_buff=0;
}
if(data_to_copy==data_left_to_copy_target){
assert((curr_offset_target_start)% page_size_B==0);
page_MD_offset=data_left_to_copy_obj+page_metadata_size_B;
if(page_MD_offset>=page_size_B){
page_MD_offset=0;
}
if(use_buffer_target&& copy_MD_to_buff){
memcpy(page_traget_buff,&page_MD_offset,page_metadata_size_B);
}
else{
memcpy(ptr_target_data+curr_offset_target_start,&page_MD_offset,page_metadata_size_B);
}
if(use_buffer_target){
offset_page_target_buff=page_metadata_size_B;
}
curr_offset_target_start+=page_metadata_size_B;
pages_till_now_traget++;
assert(curr_offset_target_start% page_size_B==page_metadata_size_B);
init_local_buff=1;
}
}
}
}
int main(){
return 1;
}
I tried to do
gcc sample.c -Os -o sample_Os.s -Os -lm -lpthread -S
and
gcc sample.c -O2 -fno-align-functions -fno-align-loops -fno-align-labels -fno-align-jumps -fno-prefetch-loop-arrays -freorder-blocks-algorithm=simple -finline-functions -o sample_O2.s -lm -lpthread -S
And I finally did
diff sample_Os.s sample_O2.s
Which resulted in
< .LFB5059:
---
> .LFB5086:
11c11
< leaq 8(%rsp), %rsi
---
> movq %rsp, %rsi
16,26c16,27
< movq 16(%rsp), %rax
< movl $1000, %esi
< imulq $1000000, 8(%rsp), %rcx
< cqto
< idivq %rsi
< addq %rcx, %rax
< movq 24(%rsp), %rdx
< xorq %fs:40, %rdx
< je .L2
< call __stack_chk_fail@PLT
< .L2:
---
> movq 8(%rsp), %rcx
> movabsq $2361183241434822607, %rdx
> imulq $1000000, (%rsp), %rsi
> movq %rcx, %rax
> sarq $63, %rcx
> imulq %rdx
> sarq $7, %rdx
> subq %rcx, %rdx
> movq 24(%rsp), %rdi
> xorq %fs:40, %rdi
> leaq (%rsi,%rdx), %rax
> jne .L5
27a29
> .cfi_remember_state
29a32,34
> .L5:
> .cfi_restore_state
> call __stack_chk_fail@PLT
31c36
< .LFE5059:
---
> .LFE5086:
37c42
< .LFB5061:
---
> .LFB5088:
42c47
< .LFE5061: