I am following this post's solution and trying to get all my performance counters read at once.
I was aware that you could not push more than 3 PMCs at once in a group. However, it seems that you also cannot push more than 3 hardware cache counters or more than 3 hardware counters. From playing with the code below (full example), it looks like software counters do not count toward the limit.
So what is the actual limit? Is it
- hardware cache counters
- hardware counters
- hardware + cache counters
- total counters
If I know the correct limit then I can count and create new group leaders as necessary.
I have added a Godbolt link but obviously performance counters won't work on a virtual machine. But it is there for better copy/paste.
A typical output would be like below. In this case after the 3rd hardware cache is inserted in the group, everything becomes zero.
The big issue here is that all the system calls succeed and there is no warning. Only with 6 hardware counters does perf_event_open
return -1.
...
>>>>>>>>>>>>>>>>>>>>>>>>> Args: 12
Sum:-1683.48
Read index 836467 value 45545
Read index 836468 value 3895765
Read index 836469 value 171556074
Read index 836470 value 31866306
Read index 836471 value 4726754
Read index 836472 value 15740215
Read index 836473 value 0
Read index 836474 value 0
Read index 836475 value 0
Read index 836476 value 0
Read index 836477 value 45256961
Read index 836478 value 45245212
>>>>>>>>>>>>>>>>>>>>>>>>> Args: 13
Sum:-145982
Read index 836479 value 0
Read index 836480 value 0
Read index 836481 value 0
Read index 836482 value 0
Read index 836483 value 0
Read index 836484 value 0
Read index 836485 value 0
Read index 836486 value 0
Read index 836487 value 0
Read index 836488 value 0
Read index 836489 value 14029
Read index 836490 value 1730
Read index 836491 value 0
>>>>>>>>>>>>>>>>>>>>>>>>> Args: 14
Sum:-116814
Read index 836492 value 0
Read index 836493 value 0
Read index 836494 value 0
Read index 836495 value 0
Read index 836496 value 0
Read index 836497 value 0
Read index 836498 value 0
Read index 836499 value 0
Read index 836500 value 0
Read index 836501 value 0
Read index 836502 value 13408
Read index 836503 value 1730
Read index 836504 value 0
Read index 836505 value 0
// System headers
#include <sys/ioctl.h>
#include <unistd.h>
// C++ standard library
#include <algorithm>
#include <cerrno>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <vector>
// Third-party: libpfm4 wrapper around perf_event_open(2)
#include <perfmon/pfmlib_perf_event.h>
// One per-event entry in the buffer returned by read() on a group leader
// opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID.
struct MessageValue {
uint64_t value; // counter reading for this event
uint64_t id; // kernel-assigned event id (same id PERF_EVENT_IOC_ID returns)
};
// Layout of a PERF_FORMAT_GROUP read: `u64 nr` followed by nr
// {value, id} entries, one per group member.
// NOTE(review): the flexible array member is a C idiom accepted by
// GCC/Clang as an extension; it is not standard C++.
struct Message {
uint64_t nr; // number of MessageValue entries that follow
MessageValue values[];
};
// Busy-work payload measured by the counters: fill a large vector with
// pseudo-random values, sort it, then print the mean so the computation
// cannot be optimized away.
void do_something() {
std::vector<double> data(500000);
for (size_t i = 0; i < data.size(); ++i) {
data[i] = sin(i) * rand();
}
std::sort(data.begin(), data.end());
double total = 0;
for (double v : data) {
total += v;
}
std::cout << "Sum:" << total / data.size() << std::endl;
}
// One perf event: its static identity (type/config, set in the
// initializer below) plus per-run state filled in by main().
struct Descriptor {
unsigned type; // PERF_TYPE_HARDWARE / PERF_TYPE_SOFTWARE / PERF_TYPE_HW_CACHE
long long config; // PERF_COUNT_* selector interpreted according to `type`
int fd; // file descriptor returned by perf_event_open()
uint64_t id; // kernel event id obtained via PERF_EVENT_IOC_ID
uint64_t value; // last counter value read back from the group
};
// Events to open. fd/id/value are filled in at runtime by main().
//
// For PERF_TYPE_HW_CACHE the config is a *packed* value, per
// perf_event_open(2):
//   (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
//   (perf_hw_cache_op_result_id << 16)
// The previous code OR-ed the raw enum values together without the
// shifts, which collapses distinct events onto wrong configs — e.g.
// L1D|OP_READ|RESULT_MISS encoded as 0|0|1 == 1, which is actually
// "L1I read access", not an L1D miss.
std::vector<Descriptor> descriptors = {
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES},
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES},
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES},
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND},
{PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK},
{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_DTLB |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1D |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1D |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1D |
                         (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1I |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1I |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_BPU |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)},
{PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_BPU |
                         (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                         (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)}};
// Evaluate a syscall-style expression; on a non-zero result, log the
// failing expression text and strerror(errno) to stderr. Purely
// diagnostic — it never aborts or propagates the error.
// NOTE(review): the expansion ends in a stray ';' after while(0), so a
// braceless if/else around TRY(...) would mis-parse. It cannot be
// removed independently: the call at the PERF_EVENT_IOC_ID site omits
// its own trailing semicolon and relies on this one.
#define TRY(stmt) \
do { \
int res = (stmt); \
if (res != 0) { \
std::cerr << #stmt << " " << strerror(errno) << std::endl; \
} \
} while (0);
// Driver: for an increasing number of events, open them all in ONE
// perf_event group, run a workload, and read the group back with a
// single read() on the leader. This reproduces the question's
// observation: every syscall succeeds even when the group exceeds the
// PMU's counter capacity, but such a group is never scheduled and the
// counters silently read back as zero.
int main() {
// vector value-initialization zeroes the attrs — perf_event_open(2)
// requires unused perf_event_attr fields to be zero.
std::vector<perf_event_attr_t> attributes(descriptors.size());
// NOTE(review): bound stops one short — the full descriptor set
// (numargs == descriptors.size()) is never exercised.
for (size_t numargs = 1; numargs < descriptors.size(); ++numargs) {
std::cerr << ">>>>>>>>>>>>>>>>>>>>>>>>> Args: " << numargs << std::endl;
pid_t pid = 0; // getpid(); (0 == measure the calling process)
int cpu = -1; // any CPU
int leader = -1; // -1 on the first open makes that event the group leader
int flags = 0;
for (size_t j = 0; j < numargs; ++j) {
perf_event_attr_t &pea(attributes[j]);
pea.type = descriptors[j].type;
pea.config = descriptors[j].config;
pea.size = sizeof(perf_event_attr_t);
pea.disabled = 1; // start stopped; enabled below via ioctl on the leader
pea.exclude_kernel = 1;
pea.exclude_hv = 1;
// One read() on the leader returns every member's {value, id}.
pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
int fd = perf_event_open(&pea, pid, cpu, leader, flags);
if (fd < 0) {
int err = errno;
std::cerr << "Index:" << j << " errno:" << err << " " << strerror(err)
<< std::endl;
return 1;
}
if (leader < 0) {
leader = fd; // first successfully opened fd becomes the leader
}
// Kernel-assigned id; used to label entries in the group read below.
uint64_t id = 0;
TRY(ioctl(fd, PERF_EVENT_IOC_ID, &id))
// std::cerr << " fd:" << fd << " id:" << id << std::endl;
descriptors[j].fd = fd;
descriptors[j].id = id;
}
// Reset + enable the whole group through the leader in one ioctl each.
TRY(ioctl(leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP));
TRY(ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP));
do_something();
TRY(ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
size_t n = descriptors.size();
// Sentinel so never-read entries are distinguishable from a real 0.
for (Descriptor &d : descriptors) d.value = std::numeric_limits<uint64_t>::max();
// 2x over-allocation of the PERF_FORMAT_GROUP layout: u64 nr, then
// one {value, id} pair per group member.
size_t bufsize = 2 * (sizeof(Message) + n * sizeof(MessageValue));
std::vector<uint8_t> buf(bufsize);
std::fill(buf.begin(), buf.end(), 0);
// NOTE(review): nb is never checked — a failed or short read would go
// unnoticed and the zero-filled buffer would be parsed as msg->nr == 0.
ssize_t nb = ::read(leader, buf.data(), bufsize);
Message *msg = (Message *)buf.data();
bool allzero = true;
for (uint64_t i = 0; i < msg->nr; i++) {
uint64_t id = msg->values[i].id;
uint64_t value = msg->values[i].value;
std::cerr << "Read index " << id << " value " << value << std::endl;
// NOTE(review): assumes the kernel reports members in creation order;
// matching msg->values[i].id against descriptors[].id would be robust.
descriptors[i].value = value;
if (value != 0) allzero = false;
}
// if (allzero) break;
for (size_t j = 0; j < numargs; ++j) {
close(descriptors[j].fd);
}
}
return 0;
}