I have this code (Some instructions are added for benchmark fairness):
# Make the four benchmark entry points visible to the linker so that code in
# other object files can call them.
.global count_forloop
.global count_addloop
.global count_mulloop
.global count_divloop
# count_forloop: baseline loop that contains no instruction under test.
# Takes no arguments. Counts %rax from 0 up to 10000000, then returns.
# %rsi and %rcx are saved on entry and restored on exit.
count_forloop:
push %rsi
push %rcx
# %rsi = 0xFFFFFFFF, then doubled to 0x1FFFFFFFE. The loop below never reads
# %rsi; presumably this setup is kept only so that the prologue matches the
# other benchmark routines.
mov $0xFFFFFFFF, %rsi
add %rsi, %rsi
# A write to %eax also clears the upper half of %rax, so the counter is 0.
xor %eax, %eax
_count_forloop1:
inc %rax
# Leave the loop once the counter equals 10000000 (10,000,000 passes).
cmp $10000000, %rax
jne _count_forloop1
pop %rcx
pop %rsi
ret
# count_addloop: same counting loop as count_forloop, plus one add per pass.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the running sum kept in %rcx is discarded when the routine returns.
count_addloop:
push %rsi
push %rcx
# %rsi = 0xFFFFFFFF, then doubled to 0x1FFFFFFFE. The loop below never reads
# %rsi; presumably this setup is kept only so that the prologue matches the
# other benchmark routines.
mov $0xFFFFFFFF, %rsi
add %rsi, %rsi
# Zero the loop counter (%rax) and the accumulator (%rcx); a 32-bit write
# also clears the upper half of the 64-bit register.
xor %eax, %eax
xor %ecx, %ecx
_count_addloop1:
inc %rax
# Instruction under test: %rcx += 3. It does not depend on the counter.
add $3, %rcx # Benchmark this instruction
# Leave the loop once the counter equals 10000000 (10,000,000 passes).
cmp $10000000, %rax
jne _count_addloop1
pop %rcx
pop %rsi
ret
# count_mulloop: same counting loop as count_forloop, plus one imul per pass.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit,
# so the running product kept in %rcx is discarded when the routine returns.
count_mulloop:
push %rsi
push %rcx
# %rsi = 0xFFFFFFFF, then doubled to 0x1FFFFFFFE. The loop below never reads
# %rsi; presumably this setup is kept only so that the prologue matches the
# other benchmark routines.
mov $0xFFFFFFFF, %rsi
add %rsi, %rsi
# Zero the loop counter (%rax), then set the multiplicand %rcx to 0 + 4 = 4.
xor %eax, %eax
xor %ecx, %ecx
add $4, %ecx
_count_mulloop1:
inc %rax
# Instruction under test: %rcx = %rcx * 3. Each multiply consumes the result
# of the previous one.
imul $3, %rcx # Benchmark this instruction
# Leave the loop once the counter equals 10000000 (10,000,000 passes).
cmp $10000000, %rax
jne _count_mulloop1
pop %rcx
pop %rsi
ret
# count_divloop: counting loop with one 64-bit unsigned div per pass.
# Takes no arguments. %rsi and %rcx are saved on entry and restored on exit.
# %rax and %rdx are overwritten (quotient and remainder of the last div).
count_divloop:
push %rsi
push %rcx
# Divisor: %rsi = 0xFFFFFFFF, then doubled to 0x1FFFFFFFE.
mov $0xFFFFFFFF, %rsi
add %rsi, %rsi
# Loop counter starts at 0 so that the loop makes exactly 10,000,000 passes,
# the same number as the other benchmark routines.
xor %ecx, %ecx
# div %rsi divides the 128-bit value %rdx:%rax by %rsi and raises a divide
# error when the quotient does not fit in 64 bits, which happens whenever
# %rdx >= %rsi. Both halves of the dividend are therefore set to a known
# value instead of using whatever the caller left in them.
# NOTE(review): with a zero dividend every pass computes 0 / %rsi. On some
# CPUs the time taken by div depends on the operand values, so this may not
# be the slowest case - confirm against the target CPU if that matters.
xor %eax, %eax
xor %edx, %edx
_count_divloop1:
inc %rcx
# Instruction under test. The remainder left in %rdx is always smaller than
# %rsi, so the following pass cannot overflow either.
div %rsi # Benchmark this instruction
# Leave the loop once the counter equals 10000000.
cmp $10000000, %rcx
jne _count_divloop1
pop %rcx
pop %rsi
ret
and
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
/* Number of times main() calls each benchmark routine per measurement. */
#define N 1000
/*
 * Benchmark routines. They are only declared here; the definitions are not in
 * this translation unit.
 */
void count_forloop(void);
void count_addloop(void);
void count_mulloop(void);
void count_divloop(void);
/* Current clock reading in nanoseconds; defined after main() below. */
uint64_t ns(void);
int main(int argc, char** argv) {
uint64_t start_time_for = ns();
for (int i = 0; i < N; i++)
count_forloop();
uint64_t end_time_for = ns();
uint64_t diff_for = (end_time_for - start_time_for) / N;
printf("10.000.000 iterations of empty forloop: %" PRIu64 "ns\n", diff_for);
uint64_t start_time_add = ns();
for (int i = 0; i < N; i++)
count_addloop();
uint64_t end_time_add = ns();
uint64_t diff_add = (end_time_add - start_time_add) / N;
printf("10.000.000 iterations of addloop: %" PRIu64 "ns\n", diff_add);
printf("10.000.000 iterations of addloop: %" PRIu64 "ns (Excluding for-loop)\n", diff_add - diff_for);
uint64_t start_time_mul = ns();
for (int i = 0; i < N; i++)
count_mulloop();
uint64_t end_time_mul = ns();
uint64_t diff_mul = (end_time_mul - start_time_mul) / N;
printf("10.000.000 iterations of mulloop: %" PRIu64 "ns\n", diff_mul);
printf("10.000.000 iterations of mulloop: %" PRIu64 "ns (Excluding for-loop)\n", diff_mul - diff_for);
uint64_t start_time_div = ns();
for (int i = 0; i < N; i++) {
count_divloop();
}
uint64_t end_time_div = ns();
uint64_t diff_div = (end_time_div - start_time_div) / N;
printf("10.000.000 iterations of divloop: %" PRIu64 "ns\n", diff_div);
printf("10.000.000 iterations of divloop: %" PRIu64 "ns (Excluding for-loop)\n", diff_div - diff_for);
double real_add = (diff_add - diff_for) / 10000000.0;
double real_mul = (diff_mul - diff_for) / 10000000.0;
double real_div = (diff_div - diff_for) / 10000000.0;
printf ("Add: %lfns\n", real_add);
printf ("Mul: %lfns\n", real_mul);
printf ("Div: %lfns\n", real_div);
printf ("Mul/Add = %lf\n", real_mul / real_add);
printf ("Div/Add = %lf\n", real_div / real_add);
}
/*
 * Return the current reading of the monotonic clock in nanoseconds.
 *
 * CLOCK_MONOTONIC is used because the callers only subtract two readings to
 * get an elapsed time, and a wall clock may be adjusted between the two
 * readings. The absolute value has no calendar meaning.
 *
 * If the clock cannot be read, an error is reported on stderr and 0 is
 * returned.
 */
uint64_t ns(void) {
    struct timespec now = {0};
    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        perror("clock_gettime");
        return 0;
    }
    return (uint64_t)now.tv_sec * UINT64_C(1000000000) + (uint64_t)now.tv_nsec;
}
I wanted to benchmark and compare how long operations such as adding, multiplying and dividing take.
10.000.000 iterations of empty forloop: 2415544ns
10.000.000 iterations of addloop: 3074961ns
10.000.000 iterations of addloop: 659417ns (Excluding for-loop)
10.000.000 iterations of mulloop: 7177428ns
10.000.000 iterations of mulloop: 4761884ns (Excluding for-loop)
10.000.000 iterations of divloop: 43092662ns
10.000.000 iterations of divloop: 40677118ns (Excluding for-loop)
Add: 0.065942ns # Here, this is weird
Mul: 0.476188ns
Div: 4.067712ns
Mul/Add = 7.221355
Div/Add = 61.686487
First I benchmark the calls to count_forloop to get the cost of all instructions except the one I want to measure. Then I measure the time for each of the other functions, subtract the time needed for the empty for-loop, and divide by the number of iterations to get the time for one execution of the instruction.
My processor can reach 4.2GHz, and while this program is running, one core gets close to that frequency.
Let's assume it actually runs at 4.2GHz. That means each cycle takes 2.381e-10
seconds, or around 0.24 nanoseconds.
But as you can see, the add
instruction only takes 0.065942ns, so only around one third of one cycle.
I tried running the program multiple times, but I always get the same result: add
appears to take less time than a single clock cycle of the processor.
I can't find any error in my calculations.