I'm trying to profile multiple threads within a given process using perf
. It does appear though with the code below that even though pid
argument to perf_event_open
is 0
(which should result in profiling of the process as a whole ?), the HW counter values correspond just to the thread of execution (vs. perf counter summed across threads within a process) with the inf_loop
not being counted.
My question is: How does one profile all threads in a process vs. just the thread that executed perf_event_open
as below? Is there some other config in the perf_event_attr
that needs to be set to enable process-wide profiling?
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
static int fd;
void setup()
{
struct perf_event_attr pe;
memset(&pe, 0, sizeof(pe));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(pe);
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
pid_t pid = 0;
int cpu = -1;
fd = perf_event_open(&pe, pid, cpu, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}
int64_t read()
{
int64_t count;
read(fd, &count, sizeof(count));
return count;
}
std::size_t k;
// The instruction counts do not reflect the cycles consumed here
void inf_loop()
{
k = 2;
volatile size_t *p = &k;
while (*p)
{
++k;
}
}
int main(int argc, char **argv)
{
setup();
thread t1(inf_loop);
int count = 0;
for (uint64_t idx = 0; idx < (1ULL << 54); ++idx)
{
if (idx % (1ULL << 32) == 0)
{
cout << "Cycles: " << read() << endl;
}
}
}