There are no 'direct' PMU events in perf that will help you measure dTLB hits.
There are, however, separate dTLB access and miss events for memory loads and stores,
which you can see when you run the command below:
sudo perf list | grep 'Hardware cache'
dTLB-load-misses [Hardware cache event]
dTLB-loads [Hardware cache event]
dTLB-store-misses [Hardware cache event]
dTLB-stores [Hardware cache event]
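The meaning of each of these events has already been described here. They depend on the microarchitecture you are using, and this matters when computing dTLB hits: for example, if dTLB-loads counts every load access on your CPU, the load hits can be estimated as dTLB-loads minus dTLB-load-misses.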
Say, for example, you are looking to sample the occurrences of the event dTLB-load-misses:
pe.type = PERF_TYPE_HW_CACHE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CACHE_DTLB << 0 | PERF_COUNT_HW_CACHE_OP_READ << 8 | PERF_COUNT_HW_CACHE_RESULT_MISS << 16;
And if you are looking to measure the occurrences of the event dTLB-loads:
pe.type = PERF_TYPE_HW_CACHE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CACHE_DTLB << 0 | PERF_COUNT_HW_CACHE_OP_READ << 8 | PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16;
For measuring dTLB-store-misses and dTLB-stores, you need to replace PERF_COUNT_HW_CACHE_OP_READ
with PERF_COUNT_HW_CACHE_OP_WRITE in the above configs.
When measuring any of the hardware cache events, the config should always be of the form:
pe.config = (perf_hw_cache_id << 0) | (perf_hw_cache_op_id << 8) | (perf_hw_cache_op_result_id << 16)
where the meanings and the different enumerated values of perf_hw_cache_id, perf_hw_cache_op_id
and perf_hw_cache_op_result_id are mentioned here.
Ideally, depending on your requirements, you'd want to measure all four of the above events together for a single workload. An example of how you can measure dTLB-load-misses and dTLB-loads together is shown below:
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/unistd.h>
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>
/*
 * Shape of the buffer that main() fills with read() on the group leader,
 * whose read_format is PERF_FORMAT_GROUP | PERF_FORMAT_ID: a count followed
 * by that many (value, id) pairs.
 */
struct read_format {
    uint64_t nr;            /* number of entries in values[] */
    struct {
        uint64_t value;     /* counter reading for one event */
        uint64_t id;        /* event ID, matched against PERF_EVENT_IOC_ID results */
    } values[];             /* flexible array member; sized by nr at run time */
};
/*
 * Open one counter for data-TLB read events on the calling process (pid 0),
 * on any CPU (cpu -1), counting user space only.
 *
 * result_id: PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS.
 * group_fd:  -1 to create a new group leader, or the leader's fd to join its group.
 * id_out:    receives the event ID reported by PERF_EVENT_IOC_ID; it is used
 *            later to tell the counters apart in the group read buffer.
 *
 * Returns the new file descriptor, or -1 after reporting the error on stderr.
 */
static int open_dtlb_read_counter(uint64_t result_id, int group_fd, uint64_t *id_out)
{
    struct perf_event_attr attr;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HW_CACHE;
    attr.size = sizeof(attr);
    /* config = cache id | (operation << 8) | (result << 16) */
    attr.config = (PERF_COUNT_HW_CACHE_DTLB << 0) |
                  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                  (result_id << 16);
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;

    fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
    if (fd == -1) {
        perror("perf_event_open");
        return -1;
    }
    if (ioctl(fd, PERF_EVENT_IOC_ID, id_out) == -1) {
        perror("ioctl(PERF_EVENT_IOC_ID)");
        close(fd);
        return -1;
    }
    return fd;
}

/*
 * Count dTLB-loads and dTLB-load-misses together as one event group over a
 * 10 second window and print both totals.
 *
 * Returns EXIT_SUCCESS (0) when both counters were read, EXIT_FAILURE when any
 * perf syscall fails or the kernel returns a truncated/inconsistent buffer.
 */
int main(int argc, char* argv[]) {
    /* uint64_t storage keeps the buffer correctly aligned for struct read_format. */
    uint64_t buf[4096 / sizeof(uint64_t)];
    struct read_format *rf = (struct read_format *)buf;
    uint64_t id1 = 0, id2 = 0;
    /* Start at zero so a counter missing from the buffer is never read uninitialized. */
    uint64_t val1 = 0, val2 = 0;
    int fd1 = -1, fd2 = -1;
    int rc = EXIT_FAILURE;
    ssize_t nread;
    size_t max_entries;
    uint64_t i;

    (void)argc;
    (void)argv;

    /* Group leader: dTLB-loads. */
    fd1 = open_dtlb_read_counter(PERF_COUNT_HW_CACHE_RESULT_ACCESS, -1, &id1);
    if (fd1 == -1)
        goto out;

    /* Group member: dTLB-load-misses; passing fd1 attaches it to the leader. */
    fd2 = open_dtlb_read_counter(PERF_COUNT_HW_CACHE_RESULT_MISS, fd1, &id2);
    if (fd2 == -1)
        goto out;

    /* PERF_IOC_FLAG_GROUP applies each request to every event in the group. */
    if (ioctl(fd1, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
        perror("ioctl(PERF_EVENT_IOC_RESET)");
        goto out;
    }
    if (ioctl(fd1, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
        perror("ioctl(PERF_EVENT_IOC_ENABLE)");
        goto out;
    }

    /* Measurement window; the counters are attached to this process (pid 0). */
    sleep(10);

    if (ioctl(fd1, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
        perror("ioctl(PERF_EVENT_IOC_DISABLE)");
        goto out;
    }

    /* One read on the leader returns the values of the whole group. */
    nread = read(fd1, buf, sizeof(buf));
    if (nread == -1) {
        perror("read");
        goto out;
    }
    if ((size_t)nread < sizeof(rf->nr)) {
        fprintf(stderr, "short read from perf event group\n");
        goto out;
    }

    /* Never walk past what read() actually delivered. */
    max_entries = ((size_t)nread - sizeof(rf->nr)) / sizeof(rf->values[0]);
    if (rf->nr > max_entries) {
        fprintf(stderr, "perf event group buffer is truncated\n");
        goto out;
    }

    for (i = 0; i < rf->nr; i++) {
        if (rf->values[i].id == id1) {
            val1 = rf->values[i].value;
        } else if (rf->values[i].id == id2) {
            val2 = rf->values[i].value;
        }
    }

    printf("dTLB-loads: %"PRIu64"\n", val1);
    printf("dTLB-load-misses: %"PRIu64"\n", val2);
    rc = EXIT_SUCCESS;

out:
    if (fd2 != -1)
        close(fd2);
    if (fd1 != -1)
        close(fd1);
    return rc;
}
Some of the ideas involved in monitoring multiple events using perf_event_open
are mentioned here; the above program has been copied from there.