If you want really fine-grained measurements of cache misses, you should use Intel's architectural performance counters, which can be accessed from userspace with the rdpmc
instruction. The kernel module source I wrote in this answer will enable rdpmc
in userspace for older CPUs.
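Enabling rdpmc in ring 3 boils down to setting the PCE bit (bit 8) of CR4 on every CPU. Here is a minimal sketch of such a module (the function names are mine; on recent kernels you can instead write to /sys/bus/event_source/devices/cpu/rdpmc and skip the module entirely):
#include <linux/module.h>
#include <linux/smp.h>

/* Set CR4.PCE (bit 8) so that ring-3 code may execute rdpmc. */
static void enable_rdpmc(void *info)
{
    unsigned long cr4;
    __asm __volatile("mov %%cr4, %0" : "=r" (cr4));
    cr4 |= (1UL << 8);
    __asm __volatile("mov %0, %%cr4" : : "r" (cr4));
}

static int __init pce_init(void)
{
    on_each_cpu(enable_rdpmc, NULL, 1);
    return 0;
}
module_init(pce_init);
MODULE_LICENSE("GPL");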
Here is another kernel module that configures the counters to measure last-level cache misses and last-level cache references. Note that I have hardcoded 8
cores, because that happened to be what my machine had.
#include <linux/module.h>      /* Needed by all modules */
#include <linux/kernel.h>      /* Needed for KERN_INFO */
#include <linux/types.h>       /* uint32_t, uint64_t */
#include <linux/smp.h>         /* on_each_cpu(), smp_processor_id() */
#include <linux/spinlock.h>
#include <linux/moduleparam.h>
#define PERFEVTSELx_MSR_BASE 0x00000186
#define PMCx_MSR_BASE 0x000000c1 /* NB: write when evt disabled*/
#define PERFEVTSELx_USR (1U << 16) /* count in rings 1, 2, or 3 */
#define PERFEVTSELx_OS (1U << 17) /* count in ring 0 */
#define PERFEVTSELx_EN (1U << 22) /* enable counter */
static void
write_msr(uint32_t msr, uint64_t val)
{
uint32_t lo = val & 0xffffffff;
uint32_t hi = val >> 32;
__asm __volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi));
}
static uint64_t
read_msr(uint32_t msr)
{
uint32_t hi, lo;
__asm __volatile("rdmsr" : "=d" (hi), "=a" (lo) : "c" (msr));
return ((uint64_t) lo) | (((uint64_t) hi) << 32);
}
/* Load with clear=1 to zero the counters without programming new events. */
static int clear = 0;
module_param(clear, int, 0);
static uint64_t old_value_perfsel0[8];
static uint64_t old_value_perfsel1[8];
static DEFINE_SPINLOCK(mr_lock);  /* SPIN_LOCK_UNLOCKED is long gone */
static void wrapper(void* ptr) {
int id;
uint64_t value;
unsigned long flags;  /* must be per-CPU local: wrapper runs on every core */
spin_lock_irqsave(&mr_lock, flags);
id = smp_processor_id();
// Save the old values before we do something stupid.
old_value_perfsel0[id] = read_msr(PERFEVTSELx_MSR_BASE);
old_value_perfsel1[id] = read_msr(PERFEVTSELx_MSR_BASE+1);
// Clear out the existing counters
write_msr(PERFEVTSELx_MSR_BASE, 0);
write_msr(PERFEVTSELx_MSR_BASE + 1, 0);
write_msr(PMCx_MSR_BASE, 0);
write_msr(PMCx_MSR_BASE + 1, 0);
if (clear){
spin_unlock_irqrestore(&mr_lock, flags);
return;
}
// Table 19-1 in the most recent Intel Manual - Architectural
// Last Level Cache References Event select 2EH, Umask 4FH
value = 0x2E | (0x4F << 8) | PERFEVTSELx_EN | PERFEVTSELx_OS | PERFEVTSELx_USR;
write_msr(PERFEVTSELx_MSR_BASE, value);
// Table 19-1 in the most recent Intel Manual - Architectural
// Last Level Cache Misses Event select 2EH, Umask 41H
value = 0x2E | (0x41 << 8) | PERFEVTSELx_EN | PERFEVTSELx_OS | PERFEVTSELx_USR;
write_msr(PERFEVTSELx_MSR_BASE + 1, value);
spin_unlock_irqrestore(&mr_lock, flags);
}
static void restore_wrapper(void* ptr) {
int id = smp_processor_id();
if (clear) return;
write_msr(PERFEVTSELx_MSR_BASE, old_value_perfsel0[id]);
write_msr(PERFEVTSELx_MSR_BASE+1, old_value_perfsel1[id]);
}
int init_module(void)
{
printk(KERN_INFO "Entering write-msr!\n");
/* wait == 1: don't return until every core has programmed its counters */
on_each_cpu(wrapper, NULL, 1);
/*
 * A non 0 return means init_module failed; module can't be loaded.
 */
return 0;
}
void cleanup_module(void)
{
/* Must wait here: restore_wrapper lives in this module's text. */
on_each_cpu(restore_wrapper, NULL, 1);
printk(KERN_INFO "Exiting write-msr!\n");
}
MODULE_LICENSE("GPL");
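To build and load it, the usual Kbuild Makefile works; the file names here are placeholders, and the recipe line must start with a tab:
obj-m += msr_mod.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
Then:
$ make
$ sudo insmod msr_mod.ko    # program both counters on every core
$ sudo rmmod msr_mod        # restore the saved PERFEVTSEL values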
Here is a user-space wrapper around rdpmc.
#include <stdint.h>

uint64_t
read_pmc(int ecx)
{
/* rdpmc takes the counter index in %ecx and returns the count
 * split across %edx:%eax. */
unsigned int a, d;
__asm __volatile("rdpmc" : "=a"(a), "=d"(d) : "c"(ecx));
return ((uint64_t)a) | (((uint64_t)d) << 32);
}
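With the module above loaded, counter 0 holds LLC references and counter 1 holds LLC misses, so measuring a region of code might look like the sketch below. Pin the process to one core first (e.g. with taskset), since the counts are per-CPU, and remember that rdpmc faults if CR4.PCE was never set:
#include <stdio.h>

int main(void)
{
    uint64_t refs0 = read_pmc(0), miss0 = read_pmc(1);

    /* ... code under measurement ... */

    uint64_t refs1 = read_pmc(0), miss1 = read_pmc(1);
    printf("LLC references: %llu, LLC misses: %llu\n",
           (unsigned long long)(refs1 - refs0),
           (unsigned long long)(miss1 - miss0));
    return 0;
}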