I have been trying to measure cache-hit and cache-miss timings. I am working on a quad-core Cortex-A72 (ARMv8) 64-bit SoC @ 1.5 GHz.
My C code to measure a cache hit is:
#define _GNU_SOURCE
#include <assert.h>
#include <sched.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/random.h>
#include <stdbool.h>
#include <fcntl.h>
#include <sys/time.h>
char *chunk;
const size_t chunk_size = 1<<30;
/* FUNCTIONS */
struct timeval start_time;
double get_diff(){
    struct timeval end_time;
    int rc = gettimeofday(&end_time, NULL);
    assert(rc == 0);
    return (end_time.tv_sec - start_time.tv_sec) + (double)(end_time.tv_usec - start_time.tv_usec) / 1e6;
}
void print_affinity(){
    cpu_set_t mask;
    long nproc, i;
    if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1){
        perror("sched_getaffinity");
        assert(false);
    }
    nproc = sysconf(_SC_NPROCESSORS_ONLN);
    printf("sched_getaffinity = ");
    for (i = 0; i < nproc; i++)
        printf("%d ", CPU_ISSET(i, &mask));
}
void bind_to_cpu(){
    cpu_set_t mask;
    print_affinity();
    printf("\n");
    printf("sched_getcpu = %d\n", sched_getcpu());
    CPU_ZERO(&mask);
    CPU_SET(0, &mask);
    if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
        perror("sched_setaffinity");
        assert(false);
    }
    print_affinity();
    printf("\nsched_getcpu = %d\n", sched_getcpu());
}
void reset_mem(){
    memset(chunk, -1, chunk_size);
}
void initialize(size_t chunk_size){
    /* MAP_POPULATE is a mapping flag, not a protection bit, so it goes in
       the flags argument rather than in prot */
    chunk = (char *) mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
                          MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);
    assert(chunk != MAP_FAILED);
    // initialize all bytes to -1
    printf("Initializing memory...\n\n");
    reset_mem();
}
int main(int argc, char** argv){
    bind_to_cpu(); // pin the process to CPU 0
    initialize(chunk_size);
    uint64_t temp = 0; // store source for the STR variant below
    // pick two random page-aligned offsets; cast before shifting so
    // rand() << 12 does not overflow a signed int
    size_t offset1 = ((size_t) rand() << 12) % chunk_size;
    size_t offset2 = ((size_t) rand() << 12) % chunk_size;
    uint64_t *addr1 = (uint64_t*) (chunk + offset1);
    uint64_t *addr2 = (uint64_t*) (chunk + offset2);
    double time_result = 0.0; // must be initialized before +=
    sched_yield();
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
    for(int i = 0; i < 5000; i++){
        gettimeofday(&start_time, NULL);
        volatile uint64_t value;
        asm volatile ("LDR %0, [%1]\n\t"
            : "=r" (value)
            : "r" (addr1)
        );
        asm volatile ("LDR %0, [%1]\n\t"
            : "=r" (value)
            : "r" (addr2)
        );
        time_result += get_diff();
        //__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
        //__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
    }
    sched_yield();
    printf("Total Time: %f\n\n", time_result);
    return 0;
}
The code to measure a cache miss is the same, except that the two flush instructions commented out above are enabled inside the loop:
__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) :"memory");
__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) :"memory");
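For clarity, this is what the miss-measurement loop looks like with those lines uncommented (same addr1/addr2 setup as above); the flushes run after the timed loads, so the loads in the next iteration should miss:

for(int i = 0; i < 5000; i++){
    gettimeofday(&start_time, NULL);
    volatile uint64_t value;
    asm volatile ("LDR %0, [%1]\n\t" : "=r" (value) : "r" (addr1));
    asm volatile ("LDR %0, [%1]\n\t" : "=r" (value) : "r" (addr2));
    time_result += get_diff();
    /* evict both lines so the next iteration's loads miss */
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
}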
When I use the LDR instruction, everything seems to be OK, and I get the following outputs:
Cache hit:    Cache miss:
0.000522      0.001503
0.000558      0.001696
0.000584      0.001977
0.000712      0.002032
0.000683      0.001137
When I use the STR instruction:
for(int i = 0; i < 5000; i++){
    gettimeofday(&start_time, NULL);
    asm volatile("str %x1, %x0" : "=m" (*addr1) : "r" (temp));
    asm volatile("str %x1, %x0" : "=m" (*addr2) : "r" (temp));
    time_result += get_diff();
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
}
I get these outputs:
Cache hit:    Cache miss:
0.000603      0.000299
0.000287      0.000311
0.000376      0.000290
0.000311      0.000305
0.000518      0.000297
The differences between a cache hit and a cache miss are very small.
Why? Am I not flushing the cache in the right way?
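For reference, my current flushes use DC CIVAC with no barrier; as far as I understand the ARMv8 documentation, a cache-maintenance instruction is only guaranteed to have completed once a subsequent DSB executes. Is a flush helper like this sketch (my assumption, not what my code above does) what is required instead?

static inline void flush_line(void *p) {
    /* clean & invalidate the cache line holding p ... */
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (p) : "memory");
    /* ... and wait for the maintenance operation to complete
       before any later load/store */
    __asm__ __volatile__("dsb sy\n\t" : : : "memory");
}

Calling flush_line(addr1) and flush_line(addr2) in place of the bare dc civac lines would be the variant with a completion barrier.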