1

Regarding the MSR IA32_TIME_STAMP_COUNTER (10h): which serialization rules does reading it follow — those of rdtsc, those of rdtscp, or something else?

If the read is not serialized, should I issue a cpuid "barrier" before doing any computation on the value?

-- Edit --

So far I have implemented two kinds of barriers: cpuid and fences.

With cpuid :

/*
 * RDCOUNTER(_val, _cnt) - read a model-specific register, bracketed by CPUID.
 *
 * _val: 64-bit lvalue that receives the result; it is written through a
 *       memory operand ("=m").
 * _cnt: MSR number; must be a compile-time constant (constraint "i").
 *
 * Sequence: CPUID with EAX=0, load _cnt into RCX, RDMSR, save RAX/RDX on the
 * stack, CPUID with EAX=0 again, restore RDX/RAX, then store
 * (RDX << 32) | RAX into _val.  The push/pop pair is needed because the
 * second CPUID overwrites RAX and RDX.
 *
 * Declares RAX, RBX, RCX, RDX and "memory" as clobbered.
 * NOTE(review): the push/pop temporarily moves the stack pointer inside the
 * asm; presumably safe only when the code is built without a red zone
 * (as kernel code is) - confirm for any other use.
 */
#define RDCOUNTER(_val,  _cnt)                      \
asm volatile                                        \
(                                                   \
    "xorq   %%rax, %%rax    \n\t"                   \
    "cpuid                  \n\t"                   \
    "movq   %1, %%rcx       \n\t"                   \
    "rdmsr                  \n\t"                   \
    "push   %%rax           \n\t"                   \
    "push   %%rdx           \n\t"                   \
    "xorq   %%rax, %%rax    \n\t"                   \
    "cpuid                  \n\t"                   \
    "pop    %%rdx           \n\t"                   \
    "pop    %%rax           \n\t"                   \
    "shlq   $32, %%rdx      \n\t"                   \
    "orq    %%rdx, %%rax    \n\t"                   \
    "movq   %%rax, %0"                              \
    : "=m" (_val)                                   \
    : "i" (_cnt)                                    \
    : "%rax", "%rbx", "%rcx", "%rdx", "memory"      \
)

With fence :

/*
 * RDCOUNTER(_val, _cnt) - read a model-specific register, fenced on both sides.
 *
 * _val: 64-bit lvalue that receives the result; it is written through a
 *       memory operand ("=m").
 * _cnt: MSR number; must be a compile-time constant (constraint "i").
 *
 * RDMSR is not a serializing instruction.  MFENCE only orders memory
 * accesses and does not stop RDMSR from being executed early or late, so
 * LFENCE is used instead: on Intel processors it does not execute until all
 * prior instructions have completed locally, and no later instruction starts
 * until it completes.
 *
 * The result is assembled as (RDX << 32) | RAX and stored into _val.
 * Only RAX, RCX and RDX are modified (no CPUID here, so RBX is untouched);
 * SHL/OR modify the flags, hence the explicit "cc".
 */
#define RDCOUNTER(_val,  _cnt)                      \
asm volatile                                        \
(                                                   \
    "movq   %1, %%rcx       \n\t"                   \
    "lfence                 \n\t"                   \
    "rdmsr                  \n\t"                   \
    "lfence                 \n\t"                   \
    "shlq   $32, %%rdx      \n\t"                   \
    "orq    %%rdx, %%rax    \n\t"                   \
    "movq   %%rax, %0"                              \
    : "=m" (_val)                                   \
    : "i" (_cnt)                                    \
    : "%rax", "%rcx", "%rdx", "cc", "memory"        \
)

The part of my project shown below tries to estimate the processor's external clock frequency (FSB or BCLK).

  • The algorithm allocates an array of structures used to read the Time Stamp Counter and to measure its deltas.
  • This slab of memory is allocated so that it stays resident in the processor cache.
  • The thread is given CPU affinity with the BSP; the scheduler and interrupts are suspended for the duration of the computation.
  • Several loops of TSC reads are performed to force cache residency, and the result that occurs most often is declared the best frequency.

What I expect is to get a constant frequency after several runs.

Unfortunately, I still see variance whether or not a barrier instruction is used.

The results are very close to each other — they agree to at least three decimal places — but they are never constant.

(this is tested on a Core 2 and Core i7)

/* Completion that Compute_Clock() passes to complete_and_exit() when it finishes. */
DECLARE_COMPLETION(bclk_job_complete);

/* One measurement slot: a pair of raw counter readings plus their delta. */
typedef struct {
    unsigned long long V[2];    /* counter read before [0] and after [1] the delay */
    unsigned long long D;       /* V[1] - V[0], less the measured read overhead */
} TSC_STRUCT;

/* Number of TSC_STRUCT slots allocated and of timed samples taken by Compute_Clock(). */
#define OCCURENCES 32
signed int Compute_Clock(void *arg)
{
CLOCK *clock=(CLOCK *) arg;
unsigned int ratio=clock->Q;
unsigned long long overhead=0;
struct kmem_cache *hardwareCache=kmem_cache_create(
            "IntelClockCache",
            OCCURENCES * sizeof(TSC_STRUCT), 0,
            SLAB_HWCACHE_ALIGN, NULL);
TSC_STRUCT *TSC=kmem_cache_alloc(hardwareCache, GFP_KERNEL);
unsigned int loop=0, best=0, top=0;

// No preemption, no interrupt.
unsigned long flags;
preempt_disable();
raw_local_irq_save(flags);
// Warm-up
RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
// Overhead
RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
overhead=TSC[loop].V[1] - TSC[loop].V[0];
// Pick-up
for(loop=0; loop < OCCURENCES; loop++)
{
    RDCOUNTER(TSC[loop].V[0], MSR_IA32_TSC);
    udelay(100);
    RDCOUNTER(TSC[loop].V[1], MSR_IA32_TSC);
}
// Restore interrupt and preemption.
raw_local_irq_restore(flags);
preempt_enable();

for(loop=0; loop < OCCURENCES; loop++)
    TSC[loop].D=TSC[loop].V[1] - TSC[loop].V[0] - overhead;
for(loop=0; loop < OCCURENCES; loop++) {
    unsigned int inner=0, count=0;
    for(inner=loop; inner < OCCURENCES; inner++)
        if(TSC[loop].D == TSC[inner].D)
            count++;
    if((count > top)
    ||((count == top) && (TSC[loop].D < TSC[best].D))) {
        top=count;
        best=loop;
    }
/*  printk("%3u x D[%02u]=%llu\t%llu - %llu\n",
    count, loop, TSC[loop].D, TSC[loop].V[1], TSC[loop].V[0]); */
}
printk("Overhead=%llu\tBest=%llu\n", overhead, TSC[best].D);

clock->Q=TSC[best].D / (ratio * PRECISION);
clock->R=TSC[best].D % (ratio * PRECISION);

kmem_cache_free(hardwareCache, TSC);
kmem_cache_destroy(hardwareCache);

complete_and_exit(&bclk_job_complete, 0);
}
CyrIng
  • 343
  • 3
  • 8
  • The RDMSR instruction is non-serializing. Yes. – Hans Passant Aug 01 '15 at 16:06
  • Great. By extension, if reading a bunch of MSR fixed counters, such as UCC, URC, C3, C6, C7, and lastly TSC , a cpuid is required before doing counter calculations ? – CyrIng Aug 01 '15 at 16:16
  • See also https://stackoverflow.com/questions/35819821/how-to-ensure-that-rdtsc-is-accurate and https://www.intel.de/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf – Claudio Oct 08 '18 at 13:14

0 Answers0