I'm pretty new to performance measurement. I came across this question and decided to check it myself. Here is what my benchmarks look like:
For stack:
; Stack-load benchmark (Linux x86-64, NASM, freestanding — no libc).
; Repeatedly loads one qword from the stack so the load is served
; from L1d after the first iteration.
SYS_exit    equ 60                      ; exit(2) syscall number
ITERATIONS  equ 0xFFFFFFFF              ; loop trip count

section .text
global _start
_start:
        mov     r12, ITERATIONS         ; r12 = remaining iterations
        push    0xFFFFFF                ; put a qword on the stack to load from
mov_loop:
        mov     rax, [rsp]              ; the measured load
        dec     r12
        jnz     mov_loop
        mov     rax, SYS_exit
        xor     edi, edi                ; exit status 0 (rdi was uninitialized before)
        syscall
For heap:
; Heap-load benchmark (Linux x86-64, NASM, freestanding — no libc).
; Grows the program break by 8 bytes to "allocate" one qword, loads it
; in a loop, then restores the break and exits.
SYS_brk     equ 0x0C                    ; brk(2) syscall number
SYS_exit    equ 60                      ; exit(2) syscall number
ITERATIONS  equ 0xFFFFFFFF              ; loop trip count

section .text
global _start
_start:
        ; brk(0) returns the current program break in rax
        mov     rax, SYS_brk
        xor     edi, edi
        syscall
        ; allocate 8 bytes: raise the break to old_break + 8
        mov     r10, rax                ; r10 = old break = our buffer address
        mov     rax, SYS_brk
        lea     rdi, [r10 + 8]          ; rdi = new break
        syscall
        ; initialize the low dword; the upper 4 bytes of the qword read
        ; below are zero because brk-extended pages are zero-filled
        mov     dword [r10], 0xFFFFFF
        mov     rcx, ITERATIONS         ; rcx = remaining iterations
                                        ; (safe: no syscall inside the loop,
                                        ; and syscall clobbers rcx/r11)
heap_loop:
        mov     rax, [r10]              ; the measured load
        dec     rcx
        jnz     heap_loop
        ; release memory: restore the break to its original value
        mov     rax, SYS_brk
        mov     rdi, r10
        syscall
        mov     rax, SYS_exit
        xor     edi, edi                ; exit status 0 (rdi still held the brk arg)
        syscall
Running the benchmarks with perf stat -d -r 10
showed that in both cases I was actually measuring L1-cache loads.
4,295,747,868 L1-dcache-loads # 2996.483 M/sec ( +- 0.00% )
48,316 L1-dcache-load-misses # 0.00% of all L1-dcache hits ( +- 18.42% )
Is there a way to invalidate the cache line before each iteration starts?