How to check for TSX support?

Question

My current attempt:

/**simplified from
 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 **/

#include <stdio.h>
#include <stdint.h>

#if defined(_MSC_VER)
#   include <intrin.h>
#endif

/**
 * Execute the CPUID instruction and hand back the four result registers.
 *
 * @param eax  CPUID leaf, loaded into EAX before the instruction runs.
 * @param ecx  CPUID sub-leaf, loaded into ECX before the instruction runs.
 * @param abcd Output array of four words, filled as {EAX, EBX, ECX, EDX}.
 */
void get_cpuid(uint32_t eax, uint32_t ecx, uint32_t *abcd){
    #if defined(_MSC_VER)
        /* __cpuidex works on int, so go through an int buffer rather than
           passing the uint32_t pointer as an incompatible pointer type */
        int regs[4];
        __cpuidex(regs,(int)eax,(int)ecx);
        abcd[0]=(uint32_t)regs[0];
        abcd[1]=(uint32_t)regs[1];
        abcd[2]=(uint32_t)regs[2];
        abcd[3]=(uint32_t)regs[3];
    #else
        uint32_t ebx=0,edx=0;
        #if defined( __i386__ ) && defined ( __PIC__ )
            /* in case of PIC, under 32-bit EBX cannot be clobbered:
               park EBX in EDI, run CPUID, then swap back so that EBX is
               restored and EDI carries the EBX value CPUID produced */
            __asm__( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
                     : "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
        #else
            /* EBX and EDX are pure outputs; EAX/ECX carry leaf/sub-leaf in
               and the corresponding result registers out */
            __asm__( "cpuid"
                     : "=b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
        #endif

        abcd[0]=eax;abcd[1]=ebx;abcd[2]=ecx;abcd[3]=edx;
    #endif
}

/**
 * Report whether CPUID advertises RTM (Restricted Transactional Memory).
 *
 * The answer reflects what CPUID reports to the environment this code runs
 * in; when running as a guest, the value is whatever the hypervisor exposes.
 *
 * @return 1 if CPUID.07H.EBX.RTM [bit 11] is set, 0 if it is clear or if
 *         leaf 07H is not available at all.
 */
int has_RTM_support(void){
    uint32_t abcd[4];

    /*leaf 0 returns the highest supported basic leaf in EAX; querying a leaf
      beyond it yields data that does not describe leaf 07H, so bit 11 of
      that answer must not be interpreted*/
    get_cpuid(0x0,0x0,abcd);
    if(abcd[0] < 0x7){
        return 0;
    }

    /*processor supports RTM execution if CPUID.07H.EBX.RTM [bit 11] = 1*/
    get_cpuid(0x7,0x0,abcd);
    return (abcd[1] & (1u << 11)) != 0;
}


/* Entry point: print a single message stating whether has_RTM_support()
   reports RTM, then exit with status 0. Command-line arguments are unused. */
int main(int argc, char **argv){
    const char *verdict;

    (void)argc;
    (void)argv;

    verdict = has_RTM_support()
        ? "This CPU supports RTM."
        : "This CPU does NOT support RTM.";

    /* the message is written as-is, without a trailing newline */
    fputs(verdict, stdout);
    return 0;
}

I have an Intel® Core™ i7-7600U (cpuinfo below), and according to its entry on Intel ARK, it's supposed to support TSX-NI.

Still, the check above prints:

This CPU does NOT support RTM.

And the has_tsx implementation from the tsx-tools agrees:

RTM: No

HLE: No

Yet at the same time, I can execute this snippet just fine...

#include <stdio.h>

/* Probe program: loops until i reaches 100000000, wrapping each increment
 * between an xbegin and an xend instruction, then prints the final value
 * of i and returns 0. */
int main()
{
    /* volatile: every read and write of i is a real memory access */
    volatile int i = 0;
    while (i < 100000000) {
        /* start a transaction; the operand names the ABORT label that the
         * third asm statement below emits */
        /* NOTE(review): none of these asm statements declares outputs or
         * clobbers, and control may leave this one for a label in another
         * asm statement -- the compiler is told about neither. Presumably
         * fine for an unoptimised one-off test; confirm before reuse. */
        __asm__ ("xbegin ABORT");
        i++;
        /* end of the transactional region opened by xbegin */
        __asm__ ("xend");
        /* label placed right after xend; arriving here through the xbegin
         * operand skips the increment above, and the loop condition is
         * then re-tested */
        /* NOTE(review): ABORT is an ordinary assembler symbol, so this
         * statement must end up in the output exactly once -- verify if
         * the code is ever inlined or duplicated. */
        __asm__ ("ABORT:");
    }

    printf("%d\n", i);
    return 0;
}

My understanding was that these instructions "will generate a #UD exception when used on a processor that does not support RTM" — or at least that's what the Intel manual says on the matter (page 387).

I checked the asm code, too, and these instructions are still there (see below for the content of the .s file).

So since these instructions appear to be executed, are these checks simply wrong?

If so, how would you properly test for RTM support?

ASM Code of the snippet

    .file   "rtm_simple.c"
# GNU C11 (Ubuntu 6.3.0-12ubuntu2) version 6.3.0 20170406 (x86_64-linux-gnu)
#   compiled by GNU C version 6.3.0 20170406, GMP version 6.1.2, MPFR version 3.1.5, MPC version 1.0.3, isl version 0.15
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -imultiarch x86_64-linux-gnu rtm_simple.c -mtune=generic
# -march=x86-64 -fverbose-asm -fstack-protector-strong -Wformat
# -Wformat-security
# options enabled:  -fPIC -fPIE -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fchkp-check-incomplete-type
# -fchkp-check-read -fchkp-check-write -fchkp-instrument-calls
# -fchkp-narrow-bounds -fchkp-optimize -fchkp-store-bounds
# -fchkp-use-static-bounds -fchkp-use-static-const-bounds
# -fchkp-use-wrappers -fcommon -fdelete-null-pointer-checks
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -ffunction-cse -fgcse-lm -fgnu-runtime -fgnu-unique -fident
# -finline-atomics -fira-hoist-pressure -fira-share-save-slots
# -fira-share-spill-slots -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -flto-odr-type-merging -fmath-errno
# -fmerge-debug-strings -fpeephole -fplt -fprefetch-loop-arrays
# -freg-struct-return -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fsemantic-interposition -fshow-column -fsigned-zeros
# -fsplit-ivs-in-unroller -fssa-backprop -fstack-protector-strong
# -fstdarg-opt -fstrict-volatile-bitfields -fsync-libcalls -ftrapping-math
# -ftree-cselim -ftree-forwprop -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-reassoc -ftree-scev-cprop -funit-at-a-time
# -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -malign-stringops
# -mavx256-split-unaligned-load -mavx256-split-unaligned-store
# -mfancy-math-387 -mfp-ret-in-387 -mfxsr -mglibc -mieee-fp
# -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone -msse -msse2
# -mstv -mtls-direct-seg-refs -mvzeroupper

    .section    .rodata
.LC0:
    .string "%d\n"
    .text
    # Compiler output for main() of rtm_simple.c. The counter i is kept in
    # the stack slot -4(%rbp); text between #APP and #NO_APP is the inline
    # asm from the C source, copied through as written.
    .globl  main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    pushq   %rbp    #
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp  #,
    .cfi_def_cfa_register 6
    subq    $16, %rsp   #,
    movl    $0, -4(%rbp)    #, i
    # enter the loop at its condition check (.L2)
    jmp .L2 #
.L3:
    # loop body: xbegin, then load/add/store of i, then xend and the ABORT label
#APP
# 7 "rtm_simple.c" 1
    xbegin ABORT
# 0 "" 2
#NO_APP
    movl    -4(%rbp), %eax  # i, i.0_5
    addl    $1, %eax    #, i.1_6
    movl    %eax, -4(%rbp)  # i.1_6, i
#APP
# 9 "rtm_simple.c" 1
    xend
# 0 "" 2
# 10 "rtm_simple.c" 1
    ABORT:
# 0 "" 2
#NO_APP
.L2:
    # loop condition: go back to .L3 while i <= 99999999
    movl    -4(%rbp), %eax  # i, i.2_4
    cmpl    $99999999, %eax #, i.2_4
    jle .L3 #,
    # after the loop: call printf with the format string .LC0 and the value of i
    movl    -4(%rbp), %eax  # i, i.3_8
    movl    %eax, %esi  # i.3_8,
    leaq    .LC0(%rip), %rdi    #,
    movl    $0, %eax    #,
    call    printf@PLT  #
    # return value 0
    movl    $0, %eax    #, _10
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
    .ident  "GCC: (Ubuntu 6.3.0-12ubuntu2) 6.3.0 20170406"
    .section    .note.GNU-stack,"",@progbits

CPUINFO

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 142
model name      : Intel(R) Core(TM) i7-7600U CPU @ 2.80GHz
stepping        : 9
cpu MHz         : 2904.004
cache size      : 4096 KB
physical id     : 0
siblings        : 2
core id         : 0
cpu cores       : 2
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 22
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc pni pclmulqdq ssse3 cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx rdrand hypervisor lahf_lm abm 3dnowprefetch rdseed clflushopt
bugs            :
bogomips        : 5808.00
clflush size    : 64
cache_alignment : 64
address sizes   : 39 bits physical, 48 bits virtual
power management:

processor       : 1
vendor_id       : GenuineIntel
cpu family      : 6
model           : 142
model name      : Intel(R) Core(TM) i7-7600U CPU @ 2.80GHz
stepping        : 9
cpu MHz         : 2904.004
cache size      : 4096 KB
physical id     : 0
siblings        : 2
core id         : 1
cpu cores       : 2
apicid          : 1
initial apicid  : 1
fpu             : yes
fpu_exception   : yes
cpuid level     : 22
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc pni pclmulqdq ssse3 cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx rdrand hypervisor lahf_lm abm 3dnowprefetch rdseed clflushopt
bugs            :
bogomips        : 5808.00
clflush size    : 64
cache_alignment : 64
address sizes   : 39 bits physical, 48 bits virtual
power management:

Hm, from the output of /proc/cpuinfo I would guess, that you are in some sort of virtual container; this could explain the effect you observe. Try it again without a virtualized environment. — Ctx, Nov 07 '17 at 09:26
@Ctx Virtualbox, yes. Ubuntu guest on a windows host. (Sadly, I cannot set up dual-boot on this machine.) And yes, that seems to be the issue. Executing the check on the windows host directly takes a much longer time, but it does yield a "supports RTM". Thank you. Want to make an answer out of it? — User1291, Nov 07 '17 at 09:48
@Ctx Out of curiosity, how did you guess it was a Virtual environment? — Margaret Bloom, Nov 07 '17 at 10:01
@MargaretBloom 1. I wondered, why there were only two cores visible (should be 4 with ht). 2. the cpu flags are way too few for a real core i7 (especially the power management flags missing indicate virtualization). 3. I discovered the "hypervisor" flag among the cpu flags, which denotes that the system runs under a hypervisor — Ctx, Nov 07 '17 at 10:04
Your `#ifdef` is broken for 32-bit PIC. You need to save/restore `%ebx` around CPUID, rather than running `xchg` *instead of* CPUID. But really you should use gcc's CPUID functions instead of writing your own: https://stackoverflow.com/questions/14266772/how-do-i-call-cpuid-in-linux — Peter Cordes, Nov 07 '17 at 14:23

How to check for TSX support?

0 Answers0