4

I'm trying to profile multiple threads within a given process using perf. It does appear though with the code below that even though pid argument to perf_event_open is 0 (which should result in profiling of the process as a whole ?), the HW counter values correspond just to the thread of execution (vs. perf counter summed across threads within a process) with the inf_loop not being counted.

My question is: How does one profile all threads in a process vs. just the thread that executed perf_event_open as below? Is there some other config in the perf_event_attr that needs to be set to enable process-wide profiling?

static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
        int cpu, int group_fd, unsigned long flags)
{
    int ret;
    ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
            group_fd, flags);
    return ret;
}

static int fd;

void setup()
{
   struct perf_event_attr pe;
   memset(&pe, 0, sizeof(pe));
   pe.type     = PERF_TYPE_HARDWARE;
   pe.size     = sizeof(pe);
   pe.config   = PERF_COUNT_HW_INSTRUCTIONS;
   pe.disabled = 1;
   pe.exclude_kernel = 1;
   pe.exclude_hv     = 1;
   pid_t pid = 0;
   int cpu   = -1;
   fd = perf_event_open(&pe, pid, cpu, -1, 0);
   if (fd == -1) {
       fprintf(stderr, "Error opening leader %llx\n", pe.config);
       exit(EXIT_FAILURE);
   }
   ioctl(fd, PERF_EVENT_IOC_RESET, 0);
   ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}

int64_t read()
{
    int64_t count;
    read(fd, &count, sizeof(count));
    return count;
}

std::size_t k;

// The instruction counts do not reflect the cycles consumed here
void inf_loop()
{
    k = 2;
    volatile size_t *p = &k;
    while (*p)
    {
        ++k;
    }
}

int main(int argc, char **argv)
{
   setup();
   thread t1(inf_loop);
   int count = 0;
   for (uint64_t idx = 0; idx < (1ULL << 54); ++idx)
   {
       if (idx % (1ULL << 32) == 0)
       {
           cout << "Cycles: " << read() << endl;
       }
   }
}
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
user3882729
  • 1,339
  • 8
  • 11

1 Answers1

1

What you want is perf_event_attr's inherit flag, which incorporates child thread counts into the parent.

Jeff Trull
  • 1,236
  • 11
  • 16
  • 1
    Is there any way to do it without having to call `perf_event_open` in the parent thread i.e. in a child thread? – SuibianP Jul 22 '22 at 10:35