I wrote a test program for an x86 system. The loop contains four different store statements, of which I uncomment one at a time. With statement 1 uncommented, the result is 3.2 ns per iteration; the results for the other three statements are 2.2 ns, 3.7 ns and 2.6 ns respectively. I can't understand these results. I expected statement 1 to be the fastest, because it stores an immediate value and, unlike the other statements, doesn't need to load a value first.
Why do these four statements run at different speeds? Could anyone explain this? Thanks.
$ ./a.out 0
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#define BUF_SIZE 8192      /* bytes requested from malloc for the test buffer */
#define ROUND 100000000UL  /* number of iterations of the timed loop */

/*
 * Store micro-benchmark.
 *
 * Usage: ./a.out OFFSET
 *
 * Allocates a BUF_SIZE-byte buffer, finds the first 4096-byte-aligned address
 * inside it (buf_pageend), and times ROUND iterations of a loop whose body is
 * one 8-byte store to buf_pageend + OFFSET.  Exactly one of the four store
 * statements in the loop is meant to be uncommented per build.
 *
 * OFFSET may be negative (to store just before, or straddling, the aligned
 * address).  It is validated so the 8-byte store always stays inside the
 * buffer.
 *
 * Prints the addresses of `i` and `offset`, then the average time per
 * iteration in nanoseconds.  Returns 0 on success; exits with -1 on bad
 * arguments or allocation failure.
 */
int main(int argc, char **argv)
{
    enum { PAGE_BYTES = 4096, MIN_HEADROOM = 1024 };
    /* buf_pageend is at most PAGE_BYTES - 1 bytes past buf, so this bound keeps
     * the whole 8-byte store below buf + BUF_SIZE. */
    const long max_offset = BUF_SIZE - PAGE_BYTES - (long)sizeof(unsigned long);

    char *buf, *buf_pageend;
    unsigned long i __attribute__((aligned(64)));
    unsigned long offset __attribute__((aligned(64)));
    struct timespec start = {0, 0}, end = {0, 0};
    double start_ns, end_ns;
    char *arg_end;
    long parsed;

    if (argc != 2) {
        printf("missing args\n");
        exit(-1);
    }

    /* strtol instead of atoi: rejects non-numeric input, and an out-of-range
     * value saturates to LONG_MIN/LONG_MAX, which the range check catches. */
    parsed = strtol(argv[1], &arg_end, 10);
    if (arg_end == argv[1] || *arg_end != '\0' ||
        parsed < -MIN_HEADROOM || parsed > max_offset) {
        printf("offset must be an integer in [%d, %ld]\n", -MIN_HEADROOM, max_offset);
        exit(-1);
    }
    /* Negative values wrap; adding the wrapped value to the pointer below has
     * the effect of subtracting, as in the original program. */
    offset = (unsigned long)parsed;

    for (;;) {
        buf = malloc(BUF_SIZE);
        if (buf == NULL) {
            /* Without this check a failed malloc would loop forever. */
            printf("malloc failed\n");
            exit(-1);
        }
        /* Round buf up to the next 4096-byte boundary. */
        buf_pageend = (char *)(((unsigned long)buf + (PAGE_BYTES - 1)) &
                               ~(unsigned long)(PAGE_BYTES - 1));
        if (buf_pageend - buf >= MIN_HEADROOM) {
            break; /* enough room before the boundary for a negative offset */
        }
        /* Deliberately not freed: keeping the rejected block allocated forces
         * the next malloc to return a different address. */
    }

    /* Write the whole buffer once before timing starts. */
    memset(buf, 0, BUF_SIZE);

    /* %p with a void * argument is the only portable way to print an address. */
    printf("&i = %p, &offset=%p\n", (void *)&i, (void *)&offset);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (i = 0; i < ROUND; i++) {
        /* Uncomment exactly one statement; timings are those reported by the author. */
        //*((unsigned long *)(buf_pageend + offset)) = 0; // 3.2ns
        //*((unsigned long *)(buf_pageend + offset)) = (unsigned long)(buf_pageend + offset); // 2.2ns
        //*((unsigned long *)(buf_pageend + offset)) = i; // 3.7ns
        //*((unsigned long *)(buf_pageend + offset)) = offset; // 2.6ns
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /* Convert in double so the product cannot overflow a 32-bit integer type. */
    start_ns = start.tv_sec * 1e9 + start.tv_nsec;
    end_ns = end.tv_sec * 1e9 + end.tv_nsec;
    printf("ns: %lf\n", (end_ns - start_ns) / ROUND);

    free(buf);
    return 0;
}
EDIT 2022-10-30 17:43 for discussion in comments:
The asm for the second assignment statement is:
// Compute the value to store: the sum of the stack slots at -176(%rbp) and -64(%rbp)
// (presumably buf_pageend and offset; which slot holds which is not shown here).
movq -176(%rbp), %rdx
movq -64(%rbp), %rax
leaq (%rdx,%rax), %rcx
// Reload the same two slots and add them again, this time to form the store address.
movq -176(%rbp), %rdx // delete this line
movq -64(%rbp), %rax // delete this line
addq %rdx, %rax
// Store the 64-bit value in %rcx to the address in %rax.
movq %rcx, (%rax)
// Load, increment by 1, and write back the counter at -112(%rbp)
// (presumably the loop variable i).
movq -112(%rbp), %rax
addq $1, %rax
movq %rax, -112(%rbp)
If I delete the two lines marked with `// delete this line`, the result changes from 2.2 ns to 3.6 ns.