I am playing with code I found online, and I want to try different branch-prediction experiments to get a better understanding of branch predictors.
The CPU is an AMD Ryzen 3600.
Basically, I am trying to measure the misprediction rate of a given function/code segment. As pseudo/shortened code, here is what I do:
int measurement(int len, int* arr){
    r1 = papi_read();               // read the misprediction counter
    for(int i = 0; i < len; i++){
        if(arr[i]){                 // data-dependent branch under test
            do_sth();
        }
    }
    r2 = papi_read();
    return r2 - r1;
}
void warmup(){
    for(volatile int i = 0; i < 10000; i++){
        for(volatile int j = 0; j < 100; j++){} // important line
    }
}
int main() {
    init_papi();
    init_others(); // creates arrays, initializes them, etc.
    warmup();
    for(int i = 0; i < 20; i++){
        results[i] = measurement(128, array);
        usleep(1200); // 2nd important line
    }
    print_mispredictions();
}
My setup
- I have isolated the core I am working on so that no other user process runs on it. I have also isolated its sibling (SMT) core, so I am fully in charge of both cores, apart from the occasional interrupt or kernel routine.
- Previously, I have seen that if I use sleep between iterations (as in the main function), the CPU enters a deeper C-state, so the branch prediction structures (the BHT in this case) are reset. This is the explanation of the 2nd important line in the code; a small sketch for testing this C-state hypothesis follows this list.
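To make the C-state explanation testable, here is a minimal sketch (my assumption of one way to do it, not part of the measurement code): holding /dev/cpu_dma_latency open with a 0 µs limit uses the kernel's PM QoS interface to keep cores out of deep C-states for as long as the file descriptor stays open. If the per-iteration misprediction counts stop resetting while this is active, that supports the C-state explanation.
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
// Sketch: forbid deep C-states via the PM QoS interface.
// Requires permission to open /dev/cpu_dma_latency for writing.
static int forbid_deep_cstates(void){
    int fd = open("/dev/cpu_dma_latency", O_WRONLY);
    if(fd < 0){
        perror("open /dev/cpu_dma_latency");
        return -1;
    }
    int32_t max_latency_us = 0; // 0 us -> stay in the shallowest idle state
    if(write(fd, &max_latency_us, sizeof(max_latency_us)) != sizeof(max_latency_us)){
        perror("write cpu_dma_latency");
        close(fd);
        return -1;
    }
    return fd; // keep this fd open for the whole run; closing it releases the request
}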
What I want to see
Without the sleep line, I see lower and lower misprediction counts at each iteration, because the branch predictor is learning the pattern in the array.
With the sleep line, what I want to achieve is that every iteration shows a similar misprediction count, since the BPU entries are being reset.
What is the problem
The problem is that when I change the warmup function from
void warmup(){
    for(volatile int i = 0; i < 10000; i++){
        for(volatile int j = 0; j < 100; j++){}
    }
}
to
void warmup(){
    for(volatile int i = 0; i < 1000000; i++){ // note: the total number of
                                               // iterations is the same
    }
}
then the measurements are messed up. I experienced this kind of issue in my previous question, which was never answered: changing a line that is irrelevant to the measurement changes the measurement behavior.
These are my results:
# With 1 for loop: expected behavior. At every iteration, the count resets to ~60.
# For 128 if statements, ~60 mispredictions corresponds to ~50% guessing.
$ ./exp
0:73 #iteration count, misprediction count
1:62
2:63
3:21
4:63
...
# With 2 for loops: unexpected behavior. It should always reset to ~60, but it keeps decreasing.
./exp
0:66
1:18
2:4
3:4
4:1
5:0
6:0
...
Putting an mfence or lfence instruction after the warmup doesn't change the result either.
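For reference, the placement I have in mind looks roughly like the sketch below; the fence intrinsics and their exact position are my illustration here, not a copy of the code I ran:
#include <x86intrin.h>
void warmup_with_fences(){
    for(volatile int i = 0; i < 100000; i++){
        for(volatile int j = 0; j < 100; j++){}
    }
    _mm_mfence(); // order all earlier loads/stores before anything later
    _mm_lfence(); // serialize: later instructions wait for earlier ones to complete locally
}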
Below, I am putting the whole code in case someone wants to try it and/or has an answer for this behavior.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
#include <time.h>
#include "papi.h"
#define stick_this_thread_to_core(retval, core_id){ \
    int num_cores = sysconf(_SC_NPROCESSORS_ONLN); \
    if (core_id < 0 || core_id >= num_cores) \
        retval = EINVAL; \
    cpu_set_t cpuset; \
    CPU_ZERO(&cpuset); \
    CPU_SET(core_id, &cpuset); \
    retval = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); \
}
#define ERROR_RETURN(retval) { \
    fprintf(stderr, "Error %d, (%s) %s:line %d: \n", retval, PAPI_strerror(retval), __FILE__, __LINE__); \
    exit(retval); \
}
void papi_my_native_add_event(int* EventSet1, char* eventname, int *native){
    int retval;
    //printf("native add\n");
    if((retval = PAPI_event_name_to_code(eventname, native)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("native add to_code is successful\n");
    if((retval = PAPI_add_event(*EventSet1, *native)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("native add add_event is successful\n");
    int number = 0;
    if((retval = PAPI_list_events(*EventSet1, NULL, &number)) != PAPI_OK)
        ERROR_RETURN(retval);
    //fprintf(stderr, "Added %d events.\n", number);
}
void papi_my_native_add_start_event(int* EventSet1, char* eventname, int *native){
    papi_my_native_add_event(EventSet1, eventname, native);
    int retval = 0;
    if((retval = PAPI_start(*EventSet1)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("START %s\n", eventname);
}
int RNG_SIZE = 128;
uint64_t* rng_arr;
uint64_t dummy;
// 12th core
int cpuid = 11;
// Code from
// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2019/11/05
// acts like a random number generator, but it is deterministic.
static inline uint64_t rng(uint64_t h) {
    h ^= h >> 33;
    h *= UINT64_C(0xff51afd7ed558ccd);
    h ^= h >> 33;
    h *= UINT64_C(0xc4ceb9fe1a85ec53);
    h ^= h >> 33;
    return h;
}
uint64_t measurement(int* EventSet, uint64_t howmany, uint64_t* arr){
    long long reads[2] = {0};
    PAPI_read(*EventSet, &reads[0]);        // counter value before the loop
    for(uint64_t j = 0; j < howmany; j++){
        if(arr[j]){                         // branch whose mispredictions are counted
            dummy &= arr[j];
        }
    }
    PAPI_read(*EventSet, &reads[1]);        // counter value after the loop
    return (reads[1] - reads[0]);
}
void precompute_rng(){
    int howmany = RNG_SIZE;
    for(int i = 0; i < RNG_SIZE; i++){
        rng_arr[i] = rng(howmany) & 0x1;    // keep only the low bit -> 0/1 pattern
        howmany--;
    }
}
int stick_to_core(){
    int retval = 0;
    stick_this_thread_to_core(retval, cpuid);
    if(retval){
        printf("Affinity error: %s\n", strerror(errno));
        return 1;
    }
    return 0;
}
void init_papi(int* EventSet, int cpuid){
    int retval = 0;
    // papi init
    if((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
        ERROR_RETURN(retval);
    PAPI_option_t opts1;
    opts1.cpu.cpu_num = cpuid;
    if((retval = PAPI_create_eventset(EventSet)) != PAPI_OK)
        ERROR_RETURN(retval);
    if((retval = PAPI_assign_eventset_component(*EventSet, 0)) != PAPI_OK)
        ERROR_RETURN(retval);
    opts1.cpu.eventset = *EventSet;
    if((retval = PAPI_set_opt(PAPI_CPU_ATTACH, &opts1)) != PAPI_OK)
        ERROR_RETURN(retval);
    char* eventname = "RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED";
    int native = 0x0;
    papi_my_native_add_start_event(EventSet, eventname, &native);
}
void warmup(){
    for(volatile int i = 0; i < 100000; i++){
        for(volatile int j = 0; j < 100; j++){} // important line
    }
}
int main() {
    if(stick_to_core()){
        printf("Error on sticking to the core\n");
        return 1;
    }
    int EventSet = PAPI_NULL;
    int* EventSetPtr = &EventSet;
    init_papi(EventSetPtr, cpuid);
    rng_arr = (uint64_t*) malloc(RNG_SIZE * sizeof(uint64_t));
    precompute_rng();
    int iter = 4096;
    uint64_t* results = (uint64_t*) malloc(iter * sizeof(uint64_t));
    for(int i = 0; i < iter; i++)
        results[i] = 0;
    warmup();
    for(int i = 0; i < 20; i++){
        results[i] = measurement(&EventSet, RNG_SIZE, rng_arr);
        usleep(1200);
    }
    // prints
    for(int i = 0; i < 20; i++){
        printf("%d:%lu\n", i, results[i]);
    }
    printf("\n");
    free(results);
    free(rng_arr);
    return 0;
}
Compile with
gcc -O0 main.c -lpthread -lpapi -o exp
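As a sanity check (not shown above), the PAPI numbers can be cross-checked against perf, e.g. perf stat -e branch-misses ./exp, assuming perf is installed. That counts the whole process, including the warmup and the PAPI calls, so it will be much larger than the per-iteration values printed by the program, but it confirms that branch-miss events are being counted at all.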