1

I want to measure the cache miss rate and dtlb miss rate. I have done the first part.

But I can't find how to set the config to count dTLB misses and dTLB hits. When I measured cache misses, I did it like this:

    pe.type = PERF_TYPE_HARDWARE;
    pe.size = sizeof(struct perf_event_attr);
    pe.config = PERF_COUNT_HW_CACHE_MISSES;
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
zhujiaxin
  • 11
  • 2

1 Answer

2

There are no 'direct' PMU events in perf, that will help you measure the dTLB hits. There are separate dTLB miss events for memory loads and stores, which you can see when you run the below command,

sudo perf list | grep 'Hardware cache'

dTLB-load-misses                                   [Hardware cache event]
dTLB-loads                                         [Hardware cache event]
dTLB-store-misses                                  [Hardware cache event]
dTLB-stores                                        [Hardware cache event]

The meanings of each of these events have already been mentioned here. They are dependent on the microarchitecture you are using, and this will matter in the computation of dTLB-hits.

Say, for example, you want to count the occurrences of the event dTLB-load-misses,

       pe.type = PERF_TYPE_HW_CACHE;
       pe.size = sizeof(struct perf_event_attr);
       pe.config = PERF_COUNT_HW_CACHE_DTLB <<  0 | PERF_COUNT_HW_CACHE_OP_READ <<  8 | PERF_COUNT_HW_CACHE_RESULT_MISS << 16;

and if you are looking to measure the occurrences of event dTLB-loads,

       pe.type = PERF_TYPE_HW_CACHE;
       pe.size = sizeof(struct perf_event_attr);
       pe.config = PERF_COUNT_HW_CACHE_DTLB <<  0 | PERF_COUNT_HW_CACHE_OP_READ <<  8 | PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16;

For measuring dTLB-store-misses and dTLB-stores, you need to replace PERF_COUNT_HW_CACHE_OP_READ with PERF_COUNT_HW_CACHE_OP_WRITE in the above configs.

When measuring any of the hardware cache events, the config should always be of the form -

pe.config = (perf_hw_cache_id << 0) | (perf_hw_cache_op_id << 8) | (perf_hw_cache_op_result_id << 16) 

where meanings and different 'enumerated' values of perf_hw_cache_id, perf_hw_cache_op_id and perf_hw_cache_op_result_id are mentioned here.

Ideally, depending on your requirement, you'd want to be measuring all of the above four events together for a single workload, so an example of how you can measure dTLB-load-misses and dTLB-loads together is shown below -

#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/unistd.h>
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>


/*
 * Overlay for the bytes returned by read() on the group leader's file
 * descriptor. Both events in this program are opened with
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, and main() casts its
 * read buffer to this type.
 * NOTE(review): the field order is assumed to match the kernel's layout for
 * that read_format combination - confirm against perf_event_open(2).
 */
struct read_format {
  uint64_t nr;          /* number of entries in values[]; main() loops up to it */
  struct {
    uint64_t value;     /* counter value for one event of the group */
    uint64_t id;        /* event id; compared with the PERF_EVENT_IOC_ID results */
  } values[];           /* flexible array member: one entry per event */
};

/*
 * Open one dTLB read (load) counter.
 *
 * result_id: PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS.
 * group_fd:  -1 to create a new group leader, or the leader's fd to join
 *            its group.
 *
 * The event is created disabled, excludes kernel and hypervisor activity,
 * and uses the grouped read format with ids so both counters can be read
 * in one read() on the leader.
 *
 * Returns the new file descriptor, or -1 with errno set by the syscall.
 */
static int open_dtlb_read_counter(uint64_t result_id, int group_fd) {
  struct perf_event_attr pea;

  memset(&pea, 0, sizeof(pea));
  pea.type = PERF_TYPE_HW_CACHE;
  pea.size = sizeof(pea);
  /* Hardware cache config: cache id | (op id << 8) | (result id << 16). */
  pea.config = ((uint64_t) PERF_COUNT_HW_CACHE_DTLB << 0) |
               ((uint64_t) PERF_COUNT_HW_CACHE_OP_READ << 8) |
               (result_id << 16);
  pea.disabled = 1;
  pea.exclude_kernel = 1;
  pea.exclude_hv = 1;
  pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;

  /* pid = 0, cpu = -1, flags = 0 - same arguments as the original calls. */
  return (int) syscall(__NR_perf_event_open, &pea, 0, -1, group_fd, 0);
}

/*
 * Count dTLB-loads and dTLB-load-misses as one event group over a
 * 10 second window and print both values.
 *
 * Returns EXIT_SUCCESS when both counters were read, EXIT_FAILURE (after
 * printing a diagnostic to stderr) when any perf call fails.
 */
int main(int argc, char* argv[]) {
  int rc = EXIT_FAILURE;
  int fd1 = -1, fd2 = -1;
  uint64_t id1 = 0, id2 = 0;
  uint64_t val1 = 0, val2 = 0;
  /* uint64_t storage keeps the buffer correctly aligned for read_format. */
  uint64_t buf[4096 / sizeof(uint64_t)];
  struct read_format* rf = (struct read_format*) buf;
  ssize_t nread;
  size_t payload;
  uint64_t i;

  (void) argc;
  (void) argv;

  /* Group leader: dTLB-loads. */
  fd1 = open_dtlb_read_counter(PERF_COUNT_HW_CACHE_RESULT_ACCESS, -1);
  if (fd1 < 0) {
    perror("perf_event_open(dTLB-loads)");
    goto out;
  }
  if (ioctl(fd1, PERF_EVENT_IOC_ID, &id1) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ID, dTLB-loads)");
    goto out;
  }

  /* Group member: dTLB-load-misses, attached to the leader via group_fd. */
  fd2 = open_dtlb_read_counter(PERF_COUNT_HW_CACHE_RESULT_MISS, fd1);
  if (fd2 < 0) {
    perror("perf_event_open(dTLB-load-misses)");
    goto out;
  }
  if (ioctl(fd2, PERF_EVENT_IOC_ID, &id2) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ID, dTLB-load-misses)");
    goto out;
  }

  /* PERF_IOC_FLAG_GROUP applies each operation to the whole group. */
  if (ioctl(fd1, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) < 0) {
    perror("ioctl(PERF_EVENT_IOC_RESET)");
    goto out;
  }
  if (ioctl(fd1, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto out;
  }

  /* Placeholder for the workload to be measured. */
  sleep(10);

  if (ioctl(fd1, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) < 0) {
    perror("ioctl(PERF_EVENT_IOC_DISABLE)");
    goto out;
  }

  nread = read(fd1, buf, sizeof(buf));
  if (nread < 0) {
    perror("read(perf group)");
    goto out;
  }
  /* Never trust rf->nr beyond what read() actually delivered. */
  if ((size_t) nread < sizeof(rf->nr)) {
    fprintf(stderr, "read(perf group): short read (%zd bytes)\n", nread);
    goto out;
  }
  payload = (size_t) nread - sizeof(rf->nr);
  if (rf->nr > payload / sizeof(rf->values[0])) {
    fprintf(stderr, "read(perf group): truncated group data\n");
    goto out;
  }

  for (i = 0; i < rf->nr; i++) {
    if (rf->values[i].id == id1) {
      val1 = rf->values[i].value;
    } else if (rf->values[i].id == id2) {
      val2 = rf->values[i].value;
    }
  }

  printf("dTLB-loads: %" PRIu64 "\n", val1);
  printf("dTLB-load-misses: %" PRIu64 "\n", val2);
  rc = EXIT_SUCCESS;

out:
  if (fd2 >= 0) {
    close(fd2);
  }
  if (fd1 >= 0) {
    close(fd1);
  }
  return rc;
}

Some ideas involved while monitoring multiple events using perf_event_open are mentioned here, from which the above program has been copied.

Arnabjyoti Kalita
  • 2,325
  • 1
  • 18
  • 31