How to use the ARM PMU in GEM5?

Question

I had a problem initializing the PMU in gem5 for an ARM full-system simulation using starter_fs.py with --cpu hpi. I followed the instructions in the post "Using perf_event with the ARM PMU inside gem5" and managed to solve that problem: I applied the patch and configured the system. I am not using perf; I try to access the PMU registers directly and read them. As far as I can see, gem5 implements only some of the PMU events. Can the others be added as well? For example, EXC_TAKEN is not implemented. Is the following the right way to add it?

self.addEvent(ProbeEvent(self,0x09, cpu, "EXC_TAKEN"))

#0x09: EXC_TAKEN ???

Also, I can read the PMU event counter registers and extract the event counts, but the PMCCNTR cycle counter register always returns zero. How does gem5 increment this register? What are the steps to read the cycle register?

The code that I use to read the counter through perf_event_open is the following:

#include <errno.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>
    
    
    
    /* Number of graph nodes: dimension of AdjMatrix and length of rgnNodes. */
    #define NUM_NODES                          100
    /* Sentinel value: marks a missing edge in AdjMatrix, an unset distance,
       or the absence of a predecessor node. */
    #define NONE                               9999
    
    
    
    
    /* Per-node result record filled in by dijkstra(). */
    typedef struct _NODE
    {
      int iDist;  /* distance recorded for this node (NONE when unset) */
      int iPrev;  /* index of the predecessor node (NONE when unset) */
    } NODE;
    
    /* One entry of the singly linked work queue used by enqueue()/dequeue(). */
    typedef struct _QITEM
    {
      int iNode;              /* node index carried by this entry */
      int iDist;              /* distance value carried by this entry */
      int iPrev;              /* predecessor index carried by this entry */
      struct _QITEM *qNext;   /* next entry in the queue, or a null pointer */
    } QITEM;
    
    /* Head of the singly linked work queue; NULL while the queue is empty. */
    QITEM *qHead = NULL;
    
                 
                 
                 
    /* Edge costs, indexed [from][to]; entries equal to NONE are skipped by
       dijkstra(). */
    int AdjMatrix[NUM_NODES][NUM_NODES];
    
    /* Count of items currently in the queue; adjusted by enqueue() and
       dequeue(). */
    int g_qCount = 0;
    /* Per-node distance/predecessor table, reset at the start of every
       dijkstra() call. */
    NODE rgnNodes[NUM_NODES];
    /* File-scope scratch variables used by dijkstra() as loop indices and
       temporaries. */
    int ch;
    int iPrev, iNode;
    int i, iCost, iDist;
    
    
    /*
     * Path-printing hook.  The actual output statements are disabled, so the
     * only visible effect of a call is flushing stdout.
     *
     * rgnNodes: node table to inspect.
     * chNode:   index of the node whose path would be printed.
     */
    void print_path (NODE *rgnNodes, int chNode)
    {
      /* The predecessor is still inspected, but nothing is done with it. */
      int hasPrev = (rgnNodes[chNode].iPrev != NONE);

      (void) hasPrev;
      fflush(stdout);
    }
    
    
    /*
     * Append a new item to the tail of the global work queue (qHead) and
     * increment g_qCount.
     *
     * iNode: node index to store in the item.
     * iDist: distance value to store in the item.
     * iPrev: predecessor index to store in the item.
     *
     * On allocation failure a message is written to stderr and the process
     * exits with status 1.  The item is released again by dequeue().
     */
    void enqueue (int iNode, int iDist, int iPrev)
    {
      /* No cast on malloc: a cast would hide a missing <stdlib.h> prototype. */
      QITEM *qNew = malloc(sizeof *qNew);
      QITEM *qLast = qHead;

      if (!qNew)
        {
          fprintf(stderr, "Out of memory.\n");
          exit(1);
        }
      qNew->iNode = iNode;
      qNew->iDist = iDist;
      qNew->iPrev = iPrev;
      qNew->qNext = NULL;

      if (!qLast)
        {
          /* Empty queue: the new item becomes the head. */
          qHead = qNew;
        }
      else
        {
          /* Walk to the current tail and link the new item behind it. */
          while (qLast->qNext)
            {
              qLast = qLast->qNext;
            }
          qLast->qNext = qNew;
        }
      g_qCount++;
    }
    
    
    void dequeue (int *piNode, int *piDist, int *piPrev)
    {
      QITEM *qKill = qHead;
      
      if (qHead)
        {
          //                 ASSERT(g_qCount);
          *piNode = qHead->iNode;
          *piDist = qHead->iDist;
          *piPrev = qHead->iPrev;
          qHead = qHead->qNext;
          free(qKill);
          g_qCount--;
        }
    }
    
    
    /* Report how many items the work queue currently holds (g_qCount). */
    int qcount (void)
    {
      int pending = g_qCount;

      return pending;
    }
    
    /*
     * Compute shortest-path distances from chStart over AdjMatrix and record
     * them (with predecessors) in rgnNodes.
     *
     * The search is driven by the FIFO work queue: a node is (re-)enqueued
     * every time a cheaper distance to it is found, and the loop runs until
     * the queue is empty.
     *
     * chStart: index of the source node.
     * chEnd:   index of the destination node.
     *
     * Returns the cost of the path from chStart to chEnd: 0 when both are the
     * same node, NONE when chEnd was not reached or an index is outside
     * [0, NUM_NODES).  rgnNodes is reset on every call.
     */
    int dijkstra(int chStart, int chEnd)
    {
      /* Forget the results of any previous run. */
      for (ch = 0; ch < NUM_NODES; ch++)
        {
          rgnNodes[ch].iDist = NONE;
          rgnNodes[ch].iPrev = NONE;
        }

      /* Both arguments are used as array indices below; reject bad ones. */
      if (chStart < 0 || chStart >= NUM_NODES ||
          chEnd < 0 || chEnd >= NUM_NODES)
        {
          return NONE;
        }

      if (chStart == chEnd)
        {
          /* Nothing to search: staying in place costs nothing. */
          return 0;
        }

      rgnNodes[chStart].iDist = 0;
      rgnNodes[chStart].iPrev = NONE;

      enqueue (chStart, 0, NONE);

      while (qcount() > 0)
        {
          dequeue (&iNode, &iDist, &iPrev);
          for (i = 0; i < NUM_NODES; i++)
            {
              iCost = AdjMatrix[iNode][i];
              if (iCost == NONE)
                {
                  /* No edge from iNode to i. */
                  continue;
                }
              if ((NONE == rgnNodes[i].iDist) ||
                  (rgnNodes[i].iDist > (iCost + iDist)))
                {
                  /* Found a first or a cheaper route to node i. */
                  rgnNodes[i].iDist = iDist + iCost;
                  rgnNodes[i].iPrev = iNode;
                  enqueue (i, iDist + iCost, iNode);
                }
            }
        }

      return rgnNodes[chEnd].iDist;
    }
    
    /*
     * Read the current 64-bit value of the perf counter behind fd.
     * Returns 0 and reports to stderr when the read fails or is short.
     */
    static uint64_t read_counter(int fd)
    {
      uint64_t value = 0;
      ssize_t got = read(fd, &value, sizeof(value));

      if (got != (ssize_t) sizeof(value))
        {
          fprintf(stderr, "perf counter read fail: %s\n",
                  got < 0 ? strerror(errno) : "short read");
          return 0;
        }
      return value;
    }

    /*
     * Benchmark driver.
     *
     * Opens a raw perf event (config 0x11, used here as the CPU cycle event),
     * loads a NUM_NODES x NUM_NODES adjacency matrix from the file named by
     * argv[1], runs dijkstra() for 100 start/end pairs and prints the counter
     * value sampled before and after the runs.
     *
     * Returns 0 on success, 1 on a usage error, an unreadable input file or a
     * malformed matrix.  A failing perf_event_open is reported but not fatal;
     * the counter values then read as 0.
     */
    int main(int argc, char *argv[]) {
      uint64_t num_cycles_nominal = 0;
      uint64_t num_cycles_attack = 0;
      uint64_t counter_cpu_cycles = 0;
      int i, j, k;
      FILE *fp;
      int perf_fd_cpu_cycles;
      struct perf_event_attr attr_cpu_cycles;

      /* All fields not set explicitly below must be zero. */
      memset(&attr_cpu_cycles, 0, sizeof(attr_cpu_cycles));
      attr_cpu_cycles.size = sizeof(attr_cpu_cycles);
      attr_cpu_cycles.exclude_kernel = 1;
      attr_cpu_cycles.exclude_hv = 1;
      attr_cpu_cycles.exclude_callchain_kernel = 1;
      attr_cpu_cycles.type = PERF_TYPE_RAW;
      attr_cpu_cycles.config = 0x11;

      /* Open the file descriptor corresponding to this counter. The counter
         should start at this moment. */
      perf_fd_cpu_cycles =
        (int) syscall(__NR_perf_event_open, &attr_cpu_cycles, 0, -1, -1, 0);
      if (perf_fd_cpu_cycles == -1)
        fprintf(stderr, "perf_event_open fail %d %d: %s\n",
                perf_fd_cpu_cycles, errno, strerror(errno));

      if (argc < 2) {
        fprintf(stderr, "Usage: dijkstra <filename>\n");
        fprintf(stderr, "Only supports matrix size is #define'd.\n");
        return 1;
      }

      /* open the adjacency matrix file */
      fp = fopen(argv[1], "r");
      if (!fp) {
        fprintf(stderr, "Cannot open %s: %s\n", argv[1], strerror(errno));
        return 1;
      }

      /* fill the adjacency matrix from the file, row by row */
      for (i = 0; i < NUM_NODES; i++) {
        for (j = 0; j < NUM_NODES; j++) {
          if (fscanf(fp, "%d", &k) != 1) {
            fprintf(stderr, "Bad or truncated matrix in %s at [%d][%d]\n",
                    argv[1], i, j);
            fclose(fp);
            return 1;
          }
          AdjMatrix[i][j] = k;
        }
      }
      fclose(fp);

      /* First sample of the counter. */
      counter_cpu_cycles = read_counter(perf_fd_cpu_cycles);
      printf("Number of cpu_cycles before: %" PRIu64 "\n", counter_cpu_cycles);
      num_cycles_nominal = counter_cpu_cycles;

      /* Second sample. Nothing runs between the two samples, so the
         difference only covers the read/print overhead itself. */
      counter_cpu_cycles = read_counter(perf_fd_cpu_cycles);
      printf("Number of cpu_cycles after attack: %" PRIu64 "\n", counter_cpu_cycles);
      num_cycles_attack = counter_cpu_cycles - num_cycles_nominal;

      /* finds 100 shortest paths between nodes; i stays below NUM_NODES
         only as long as NUM_NODES is at least 100 */
      for (i = 0, j = NUM_NODES / 2; i < 100; i++, j++) {
        j = j % NUM_NODES;
        dijkstra(i, j);
      }

      /* Final sample, then release the counter. */
      counter_cpu_cycles = read_counter(perf_fd_cpu_cycles);
      if (perf_fd_cpu_cycles != -1)
        close(perf_fd_cpu_cycles);
      printf("Number of cpu_cycles end: %" PRIu64 "\n", counter_cpu_cycles);
      num_cycles_nominal = counter_cpu_cycles - num_cycles_attack;
      printf("Number of cpu_cycles nominal: %" PRIu64 "\n", num_cycles_nominal);
      printf("Number of cpu_cycles attack: %" PRIu64 "\n", num_cycles_attack);

      return 0;
    }

The problem is that I can read the branch misses with perf using 0x10 instead of 0x11 (the cycle counter raw event in gem5), but when I use 0x11 to read the cycles I get zero. When I try to reverse engineer how the cycle counter is incremented, I observe the following: with the simple/atomic or simple/timing CPU models, updateCycleCounter is called from base.hh, and likewise for the O3 CPU model. With HPI, considering that HPI is a MinorCPU model, I see that the cycle counter is only updated in the ON state (POWER_STATE_ON), but I did not find in the code a corresponding call to updateCycleCounter(CPU_STATE_ON) that would update the cycle counter. Please help me verify this assumption.

***** The problem was that in the MinorCPU updateCycleCounter wasn't called for CPU_STATE_ON, which updates the ActiveCycles. It was fixed by the following patch: https://gem5-review.googlesource.com/c/public/gem5/+/38095 .

For the missing events, a ticket was added BTW: https://gem5.atlassian.net/browse/GEM5-791 I will try to implement some soon hopefully. Like for the implemented ones, you will have to find the correct location in the CPU source code where those events need to be incremented and add a probe point there in the first place. The ones that were implemented, I can already view as in the `man perf_event_open` example through the kernel API, which indirectly calls the PMU. It is likely that the kernel is doing some needed initialization. Just use `perf_event_open` if possible. — Ciro Santilli, Nov 18 '20 at 22:54
Hello and thank you for your answer. My problem is that some counters are not incrementing, i.e. cycles and instructions are implemented but not incrementing. I will give an example: Performance counter stats for './dijkstra_small input.dat': 0 armv8_pmuv3/br_immed_retired/ 0 cpu-cycles 0 cache-misses 244128 branch-misses 0 armv8_pmuv3/st_retired/ 0 armv8_pmuv3/st_retired/ 0 instructions 0.011671384 seconds time elapsed — Nfpol, Nov 20 '20 at 10:55
i use this command perf stat -e armv8_pmuv3/br_immed_retired/,cpu-cycles,cache-misses,branch-misses,armv8_pmuv3/st_retired/,armv8_pmuv3/st_retired/,instructions ./dijkstra_small input.dat Performance counter stats for './dijkstra_small input.dat': i run perf list and i take the events. The same problem happen when i directly read the registers. I thought i dont configure correctly the module to enable userspace access so i try from EL1 with the same result CPU CYCLES =0 . — Nfpol, Nov 20 '20 at 11:00
If possible, provide a minimal C program for reproduction (you are manually instrumenting and recompiling an executable, is that correct?). Any reason not to use `perf_event_open` directly? I got that to work for instruction and cycle counters as mentioned on the ticket. Doing it manually without the syscall requires a lot of care on the configuration setup. Doing it manually also won't allow you to exclude kernel instructions/instructions from other userland processes running in parallel. — Ciro Santilli, Nov 20 '20 at 15:50
To read the cycle counter from my main I just follow the steps from this post https://stackoverflow.com/questions/63988672/using-perf-event-with-the-arm-pmu-inside-gem5 by using event 0x11, and it also returns zero. For the example with the branch misses it works perfectly. By the way, I use starter_fs.py with HPI. I tried to reverse engineer the way the counter is incremented, and I see that if HPI is a TimingSimpleCPU then it is incremented only in CPU_STATE_ON. I am not sure that updateCycleCounter is called for this state. — Nfpol, Nov 21 '20 at 18:50

How to use the ARM PMU in GEM5?

0 Answers0