The following code reads an array so that it loads one element per cache line (assuming a line is 64 bytes), then executes clflush
on each line and reads the array once again. Nevertheless, the second read is timed as faster than the first. I wonder why: it seems that clflush does not invalidate the cache lines.
By the way, do cache lines consist of exactly 64 bytes?
I ask because I tried changing the step from 16 ints
to 32 and even 64, but the second read was still faster.
#include <time.h>
#include <cstdlib>
#include <cstdio>
/* Number of int elements in the test buffer (2^14 = 16384). */
#define ARR1_LEN (1 << 14)
/*
 * Prints the two clock() readings t2 and t1, their difference in ticks, and
 * that difference converted to seconds via CLOCKS_PER_SEC.
 *
 * Expects clock_t variables named t1 and t2 in the enclosing scope.
 * Expands to a braced block, so call sites need no trailing semicolon.
 * The values are cast to long because the type behind clock_t is
 * implementation-defined and must match the %ld conversions exactly.
 */
#define PRINT_DUR {\
    printf("%ld - %ld = %ld\n%.20Lf\n", (long)t2, (long)t1, (long)(t2 - t1),\
           ((long double)(t2 - t1)) / CLOCKS_PER_SEC);\
}
/*
 * Flushes the whole arr1 buffer from the cache, issuing one clflush per
 * 64-byte line, then fences.
 *
 * Expects in the enclosing scope: `arr1`, a pointer to ARR1_LEN ints whose
 * address is 64-byte aligned. x86 only (uses clflush/mfence).
 * Expands to a braced block, so call sites need no trailing semicolon.
 *
 * The loop bound is the full byte size of the buffer: ARR1_LEN ints occupy
 * sizeof(int) * ARR1_LEN / 64 lines, and every one of them must be flushed
 * for a later timed read to start cold.
 */
#define CLEAR_CACHE {\
    const char *flush_line_ = (const char *)arr1;\
    const char *const flush_end_ = flush_line_ + sizeof(int) * ARR1_LEN;\
    for (; flush_line_ < flush_end_; flush_line_ += 64) {\
        __asm__ __volatile__("clflush (%0)" : : "r"(flush_line_) : "memory");\
    }\
    /* Fence so that later loads are not executed ahead of the flushes. */\
    __asm__ __volatile__("mfence" : : : "memory");\
}
int main() {
int *arr1_ = (int*)malloc(sizeof(int) * ARR1_LEN + 64);
int temp;
if (!arr1_) {
fprintf(stderr, "Memory allocation error\n");
return 0;
}
int *arr1 = (int*)((((size_t)arr1_)+63)&0xffffffffffffffc0);
clock_t t1, t2;
t1 = clock();
for (int i = 0; i < (ARR1_LEN>>4); i++) {
temp = arr1[i<<4];
}
t2 = clock();
// __builtin___clear_cache(arr1, arr1 + ARR1_LEN -1); // It compiles into nothing at all
CLEAR_CACHE
PRINT_DUR
t1 = clock();
for (int i = 0; i < (ARR1_LEN>>4); i++) {
temp = arr1[(i<<4) + 32];
}
t2 = clock();
PRINT_DUR
free(arr1_);
return 0;
}