I wrote a small program to benchmark CPU cache latencies. The idea is to fill a buffer with random values and let the CPU load from it with strides that depend on the loaded values themselves.
Depending on the buffer size, I expect the latencies to fall into well-defined clusters. I must say that I do see a clear pattern, and that's a success. The sequential-read programs from figures 3.10 and 3.11 in this classic do not work anywhere near as neatly for me, presumably because prefetching has improved a lot since 2007.
What's strange is that the pattern I get is consistent with a cache 8 times as big as mine. Hence my question.
My system (2-core MacBook Pro, mid 2014) has a 32 KiB L1d cache, a 256 KiB L2 cache, and a 3 MiB shared L3 cache; the architecture is Intel Haswell.
The pattern I see is the following:
Size 1024 Loops 10240000 Cycles 7.60162
Size 2048 Loops 10240000 Cycles 7.14387
Size 4096 Loops 10240000 Cycles 7.74612
Size 8192 Loops 10240000 Cycles 6.93018
Size 16384 Loops 10240000 Cycles 7.32189
Size 32768 Loops 10240000 Cycles 7.84709
Size 65536 Loops 10240000 Cycles 8.32192 # <- L1 cache is 32k so I expect a step here
Size 131072 Loops 10240000 Cycles 7.51579
Size 262144 Loops 10240000 Cycles 9.07455
Size 524288 Loops 10240000 Cycles 16.1824 # <- L1 step is here instead / here I would expect a step associated with L2
Size 1048576 Loops 10240000 Cycles 19.0783
Size 2097152 Loops 10240000 Cycles 11.633
Size 4194304 Loops 10240000 Cycles 23.773 # <- L2 step is here instead
Size 8388608 Loops 10240000 Cycles 24.2754
Size 16777216 Loops 10240000 Cycles 61.0624 # <- L3 step is here, apparently (makes sense, since L3 is shared, that it comes a bit earlier than expected)
Size 33554432 Loops 10240000 Cycles 57.5953
Size 67108864 Loops 10240000 Cycles 44.3678
Now, admittedly, it could be that I am simply unable to measure L1 and am mistaking the L2 step for the L1 one. But if I look at a previous answer, from the 32-bit era I guess, I see a telling line (note the * 4):
test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));
So for some reason the person who answered also tested latency in a buffer 4 times the nominal size of the cache.
I am puzzled - what is the reason for that?
Here is my code (it runs on macOS with clang but can easily be adapted to other systems).
mwe.c
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>

#define N (134217728)
#define START_N (1024)

extern uint64_t access_random_place_to_place_memory_dep(uint64_t *, size_t, size_t);

int main(int argc, char **argv) {
    unsigned long long n = N, ta, tb;
    unsigned long long loops = 10240000;
    // create buffer of random memory
    uint64_t *p = malloc(n);
    uint64_t res = 0;
    int randomData = open("/dev/urandom", O_RDONLY);
    read(randomData, p, n); // NB: may return fewer than n bytes, see the note below
    // result arrays
    double results[64];
    size_t memories[64];
    int i = 0;
    for (size_t working_memory = START_N; working_memory < n; working_memory <<= 1) {
        ta = _rdtsc();
        res = access_random_place_to_place_memory_dep(p, working_memory, loops);
        tb = _rdtsc();
        memories[i] = working_memory;
        results[i] = (double)(tb - ta) / loops;
        i++;
    }
    free(p);
    for (int j = 0; j < i; j++) {
        printf("Size %zu Loops %llu Cycles %g\n", memories[j], loops, results[j]);
    }
    return res;
}
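One caveat about the fill (my note, not from the original answers): read() is not guaranteed to return all n bytes in a single call, especially for a request this large, so the tail of the buffer may stay uninitialized. A sketch of a robust fill loop, using the headers already included above; the name fill_random is my own:
// Fill buf with nbytes of random data; loop because a single
// read() may return fewer bytes than requested.
static void fill_random(int fd, void *buf, size_t nbytes) {
    size_t done = 0;
    while (done < nbytes) {
        ssize_t r = read(fd, (char *)buf + done, nbytes - done);
        if (r <= 0) {
            perror("read");
            exit(1);
        }
        done += (size_t)r;
    }
}
This would replace the bare read() call in main.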
mwe.s
.intel_syntax noprefix
.global _access_random_place_to_place_memory_dep
.section __DATA,__data
.section __TEXT,__text
/*
Access memory randomly, by pointer chasing;
each stride depends on the last value read.
*/
// (rdi, rsi, rdx) ->
// (rdi) pointer to buffer
// (rsi) size of buffer (power of 2)
// (rdx) iterations
// no need to pad the stack since there is no function call
// note: only caller-saved registers are used (rbx is callee-saved
// in the System V ABI and must not be clobbered without saving it)
_access_random_place_to_place_memory_dep:
mov rax, [rdi]
xor rcx, rcx
// will use rsi as an AND mask
dec rsi
beginz:
cmp rdx, rcx
jbe endz
inc rcx
and rax, rsi        // reduce to an in-bounds byte offset
lea rax, [rdi + rax] // absolute address of the next load
mov r8, [rax]
add rax, r8         // next address depends on the value just loaded
jmp beginz
endz:
ret
To compile: clang mwe.c mwe.s -o mwe
and run with: ./mwe
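For reference, here is a C restatement of the assembly loop (my own sketch, not part of the build; a compiler is free to transform such C code, which is why the measured loop is kept in assembly). The name chase_c is hypothetical:
#include <stdint.h>
#include <stddef.h>

// Each load's address depends on the previously loaded value,
// so the loads form a serial dependency chain whose latency
// cannot be hidden by out-of-order execution.
uint64_t chase_c(uint64_t *buf, size_t size, size_t iters) {
    uintptr_t cur = (uintptr_t)buf[0];
    size_t mask = size - 1;            // size must be a power of 2
    for (size_t i = 0; i < iters; i++) {
        cur &= mask;                   // reduce to an in-bounds byte offset
        uint64_t *addr = (uint64_t *)((char *)buf + cur);
        cur = (uintptr_t)addr + *addr; // next address depends on this load
    }
    return (uint64_t)cur;
}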
EDIT (2022/3/23)
Thanks for the corrections! Here is the current version and its output. I hope it can help somebody, as I could not easily find code like this myself.
mwe.c
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>

#define N (134217728)
#define START_N (128)

extern uint64_t access_random_place_to_place_memory_dep(uint64_t *, size_t, size_t);

void place_random_shuffle(uint64_t *p, uint64_t max_offset) {
    uint64_t max_offset_q = max_offset / 8;
    // start by writing into each qword its own byte offset
    for (uint64_t i = 0; i < max_offset_q; i++) {
        p[i] = 8 * i;
    }
    // then shuffle (Fisher-Yates)
    for (uint64_t i = 0; i < max_offset_q - 1; i++) {
        uint64_t t;
        uint64_t j = (rand() % (max_offset_q - i)) + i;
        t = p[i];
        p[i] = p[j];
        p[j] = t;
    }
}

int main(int argc, char **argv) {
    unsigned long long n = N, ta, tb;
    unsigned long long loops = 10240000;
    // create buffer holding a random permutation of offsets
    uint64_t *p = malloc(n);
    uint64_t res;
    // result arrays
    double results[64];
    size_t memories[64];
    int i = 0;
    for (size_t working_memory = START_N; working_memory < n; working_memory <<= 1) {
        place_random_shuffle(p, working_memory);
        ta = _rdtsc();
        res = access_random_place_to_place_memory_dep(p, working_memory, loops);
        tb = _rdtsc();
        memories[i] = working_memory;
        results[i] = (double)(tb - ta) / loops;
        i++;
    }
    free(p);
    for (int j = 0; j < i; j++) {
        printf("Size %zu Loops %llu Cycles %g\n", memories[j], loops, results[j]);
    }
    return res;
}
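One subtlety worth noting (my addition): a uniform Fisher-Yates shuffle yields a permutation that may decompose into several short cycles, so the chase starting at offset 0 can end up visiting only part of the buffer. Sattolo's variant of the shuffle forces the permutation into a single cycle that visits every qword; a sketch, with place_random_cycle as a hypothetical drop-in replacement for place_random_shuffle:
// Sattolo's algorithm: like Fisher-Yates, but j is drawn
// strictly greater than i, which makes the permutation a
// single cycle covering all max_offset_q elements.
void place_random_cycle(uint64_t *p, uint64_t max_offset) {
    uint64_t max_offset_q = max_offset / 8;
    for (uint64_t i = 0; i < max_offset_q; i++) {
        p[i] = 8 * i;
    }
    for (uint64_t i = 0; i < max_offset_q - 1; i++) {
        uint64_t j = i + 1 + (rand() % (max_offset_q - i - 1)); // j > i
        uint64_t t = p[i];
        p[i] = p[j];
        p[j] = t;
    }
}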
mwe.s
.intel_syntax noprefix
.global _access_random_place_to_place_memory_dep
.section __DATA,__data
.section __TEXT,__text
/*
Access memory randomly, by pointer chasing;
each stride depends on the last value read.
*/
// (rdi, rsi, rdx) ->
// (rdi) pointer to buffer
// (rsi) size of buffer (unused: the shuffled offsets are in bounds by construction)
// (rdx) iterations (must be >= 1)
// no need to pad the stack since there is no function call
_access_random_place_to_place_memory_dep:
xor rax, rax
xor r8, r8
beginz:
mov rax, [rdi+rax] // the next offset is the value just loaded
add r8, rax        // accumulate so the result depends on every load
dec rdx
jnz beginz
mov rax, r8
ret
Output is much better now!
Size 128 Loops 10240000 Cycles 4.51077
Size 256 Loops 10240000 Cycles 4.67502
Size 512 Loops 10240000 Cycles 4.46404
Size 1024 Loops 10240000 Cycles 4.47518
Size 2048 Loops 10240000 Cycles 4.67881
Size 4096 Loops 10240000 Cycles 4.5293
Size 8192 Loops 10240000 Cycles 4.36537
Size 16384 Loops 10240000 Cycles 4.56763
Size 32768 Loops 10240000 Cycles 4.59288
Size 65536 Loops 10240000 Cycles 8.66269 # <- L1 step
Size 131072 Loops 10240000 Cycles 9.48717
Size 262144 Loops 10240000 Cycles 15.2417
Size 524288 Loops 10240000 Cycles 27.2223 # <- L2 (?) step
Size 1048576 Loops 10240000 Cycles 47.3154
Size 2097152 Loops 10240000 Cycles 104.716 # <- we start to be out of L3
Size 4194304 Loops 10240000 Cycles 177.333
Size 8388608 Loops 10240000 Cycles 218.087
Size 16777216 Loops 10240000 Cycles 234.883
Size 33554432 Loops 10240000 Cycles 242.916
Size 67108864 Loops 10240000 Cycles 260.416
And the "cycles" are actually "psudocycles" because of turbo mode, but I don't know how to do take that into account.