74

Is the following code valid to check if a CPU supports the SSE3 instruction set?

Using the IsProcessorFeaturePresent() function apparently does not work on Windows XP.

bool CheckSSE3()
{
    int regs[4] = {-1};

    //-- Leaf 0: regs[0] holds the number of valid info ids
    __cpuid(regs, 0);
    const int maxLeaf = regs[0];

    //-- Without leaf 1 there is no feature information to inspect
    if (maxLeaf < 1)
        return false;

    //-- Leaf 1: bit 0 of regs[2] is the SSE3 flag
    __cpuid(regs, 1);
    return (regs[2] & 0x1) != 0;
}
phuclv
  • 37,963
  • 15
  • 156
  • 475
Stiefel
  • 2,677
  • 3
  • 31
  • 42
  • 1
    It seems correct, as far as I can tell from reading Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2 (2A & 2B): Instruction Set Reference, A-Z, page 284. Also, bit 9 of CPUInfo[2] signals supplemental SSE3 instructions. – Norbert P. May 25 '11 at 15:19
  • 1
    `SSE3` and `AVX` (and `CLMUL` and `MOVD`) are different features, and they are tested separately. From the Intel manual (cited by Norbert), page 3-189: *"Software must confirm that a processor feature is present using feature flags returned by CPUID prior to using the feature. Software should not depend on future offerings retaining all features."* So don't depend upon `AVX` availability just because `SSE3` is present. – jww Aug 19 '15 at 05:37
  • Also, CPU support is different than OS support. See Andy's answer below. – jww Aug 19 '15 at 17:05
  • 2
    On `bool bSSE3NewInstructions = (CPUInfo[2] & 0x1) || false;`, you don't need the `|| false` portion: `bool bSSE3NewInstructions = (CPUInfo[2] & 0x1);` And then you can get rid of the `bSSE3NewInstructions` variable as well: `return (CPUInfo[2] & 0x1);` – Remy Lebeau Mar 03 '18 at 02:29

7 Answers7

110

I've created a GitHub repo that will detect CPU and OS support for all the major x86 ISA extensions: https://github.com/Mysticial/FeatureDetector

Here's a shorter version:


First you need to access the CPUID instruction:

#ifdef _WIN32

//  Windows: the __cpuidex intrinsic is declared in <intrin.h>
#include <intrin.h>
//  cpuid(info, x): query leaf x, sub-leaf 0, into the int[4] array info.
#define cpuid(info, x)    __cpuidex(info, x, 0)

#else

//  GCC Intrinsics
#include <cpuid.h>
//  Runs CPUID for leaf InfoType (sub-leaf 0) and stores the four result
//  registers EAX, EBX, ECX, EDX in info[0], info[1], info[2], info[3].
void cpuid(int info[4], int InfoType){
    __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
}

#endif

Then you can run the following code:

//  Feature flags. Every flag starts out false and is only set when the
//  CPUID leaf that carries it is available and reports the bit as set.

//  Misc.
bool HW_MMX = false;
bool HW_x64 = false;
bool HW_ABM = false;      // Advanced Bit Manipulation
bool HW_RDRAND = false;
bool HW_BMI1 = false;
bool HW_BMI2 = false;
bool HW_ADX = false;
bool HW_PREFETCHWT1 = false;

//  SIMD: 128-bit
bool HW_SSE = false;
bool HW_SSE2 = false;
bool HW_SSE3 = false;
bool HW_SSSE3 = false;
bool HW_SSE41 = false;
bool HW_SSE42 = false;
bool HW_SSE4a = false;
bool HW_AES = false;
bool HW_SHA = false;

//  SIMD: 256-bit
bool HW_AVX = false;
bool HW_XOP = false;
bool HW_FMA3 = false;
bool HW_FMA4 = false;
bool HW_AVX2 = false;

//  SIMD: 512-bit
bool HW_AVX512F = false;    //  AVX512 Foundation
bool HW_AVX512CD = false;   //  AVX512 Conflict Detection
bool HW_AVX512PF = false;   //  AVX512 Prefetch
bool HW_AVX512ER = false;   //  AVX512 Exponential + Reciprocal
bool HW_AVX512VL = false;   //  AVX512 Vector Length Extensions
bool HW_AVX512BW = false;   //  AVX512 Byte + Word
bool HW_AVX512DQ = false;   //  AVX512 Doubleword + Quadword
bool HW_AVX512IFMA = false; //  AVX512 Integer 52-bit Fused Multiply-Add
bool HW_AVX512VBMI = false; //  AVX512 Vector Byte Manipulation Instructions

//  info[0..3] receive EAX, EBX, ECX, EDX of the most recent cpuid() call.
int info[4];

//  Leaf 0: EAX is the highest supported standard leaf.
cpuid(info, 0);
int nIds = info[0];

//  Leaf 0x80000000: EAX is the highest supported extended leaf.
cpuid(info, 0x80000000);
unsigned nExIds = info[0];

//  Detect Features
//  All masks are built from 1u: shifting a signed 1 into bit 31 is
//  undefined behaviour in C, and the unsigned mask also makes the
//  bitwise AND operate on unsigned values.
if (nIds >= 0x00000001){
    //  Leaf 1: info[3] = EDX, info[2] = ECX
    cpuid(info,0x00000001);
    HW_MMX    = (info[3] & (1u << 23)) != 0;
    HW_SSE    = (info[3] & (1u << 25)) != 0;
    HW_SSE2   = (info[3] & (1u << 26)) != 0;
    HW_SSE3   = (info[2] & (1u <<  0)) != 0;

    HW_SSSE3  = (info[2] & (1u <<  9)) != 0;
    HW_SSE41  = (info[2] & (1u << 19)) != 0;
    HW_SSE42  = (info[2] & (1u << 20)) != 0;
    HW_AES    = (info[2] & (1u << 25)) != 0;

    HW_AVX    = (info[2] & (1u << 28)) != 0;
    HW_FMA3   = (info[2] & (1u << 12)) != 0;

    HW_RDRAND = (info[2] & (1u << 30)) != 0;
}
if (nIds >= 0x00000007){
    //  Leaf 7: info[1] = EBX, info[2] = ECX
    cpuid(info,0x00000007);
    HW_AVX2   = (info[1] & (1u <<  5)) != 0;

    HW_BMI1        = (info[1] & (1u <<  3)) != 0;
    HW_BMI2        = (info[1] & (1u <<  8)) != 0;
    HW_ADX         = (info[1] & (1u << 19)) != 0;
    HW_SHA         = (info[1] & (1u << 29)) != 0;
    HW_PREFETCHWT1 = (info[2] & (1u <<  0)) != 0;

    HW_AVX512F     = (info[1] & (1u << 16)) != 0;
    HW_AVX512CD    = (info[1] & (1u << 28)) != 0;
    HW_AVX512PF    = (info[1] & (1u << 26)) != 0;
    HW_AVX512ER    = (info[1] & (1u << 27)) != 0;
    HW_AVX512VL    = (info[1] & (1u << 31)) != 0;
    HW_AVX512BW    = (info[1] & (1u << 30)) != 0;
    HW_AVX512DQ    = (info[1] & (1u << 17)) != 0;
    HW_AVX512IFMA  = (info[1] & (1u << 21)) != 0;
    HW_AVX512VBMI  = (info[2] & (1u <<  1)) != 0;
}
if (nExIds >= 0x80000001){
    //  Extended leaf 0x80000001: info[3] = EDX, info[2] = ECX
    cpuid(info,0x80000001);
    HW_x64   = (info[3] & (1u << 29)) != 0;
    HW_ABM   = (info[2] & (1u <<  5)) != 0;
    HW_SSE4a = (info[2] & (1u <<  6)) != 0;
    HW_FMA4  = (info[2] & (1u << 16)) != 0;
    HW_XOP   = (info[2] & (1u << 11)) != 0;
}

Note that this only detects whether the CPU supports the instructions. To actually run them, you also need to have operating system support.

Specifically, operating system support is required for:

  • x64 instructions. (You need a 64-bit OS.)
  • Instructions that use the (AVX) 256-bit ymm registers. See Andy Lutomirski's answer for how to detect this.
  • Instructions that use the (AVX512) 512-bit zmm and mask registers. Detecting OS support for AVX512 is the same as with AVX, but using the flag 0xe6 instead of 0x6.
Community
  • 1
  • 1
Mysticial
  • 464,885
  • 45
  • 335
  • 332
  • 3
    Note for others like me: Read the question carefully - the `__cpuid` intrinsic is MSVC only. – squidpickles Feb 21 '13 at 21:09
  • @slugchewer Good point. In GCC, I believe you need to use inline assembly. Lemme see if I can find an already existing solution. – Mysticial Feb 21 '13 at 21:11
  • @slugchewer I've added an inline assembly version that should work for GCC, ICC, and possibly Clang as well. I haven't tested it yet. So let me know if it breaks. – Mysticial Feb 21 '13 at 21:29
  • This answer is incorrect. You are, at best, checking whether the CPU supports AVX, XOP, etc. You are failing to use xgetbv to check whether the OS has enabled the required CPU state. Your code will crash if you run it on a new CPU with an old OS. – Andy Lutomirski Mar 20 '14 at 01:13
  • @AndyLutomirski Good point. I'll update the answer when I get the chance. Though IIRC, you can still use 128-bit AVX and XOP without OS support. – Mysticial Mar 20 '14 at 01:15
  • Hmm. I'm not really sure. You certainly can't touch YMM state without OS support, but I don't know about 128-bit VEX-encoded instructions. – Andy Lutomirski Mar 20 '14 at 01:24
  • @AndyLutomirski Yes, I was referring to the VEX-encoded 128-bit instructions on XMM registers only. That said, I'm still not 100% sure, but I did read about it somewhere. – Mysticial Mar 20 '14 at 01:25
  • Nice addition! I was using [InstructionSet.cpp](https://msdn.microsoft.com/en-us/library/hskdteyh.aspx) from the Visual Studio docs, but it's nice to have a less platform dependent option. – chappjc Apr 21 '15 at 20:50
  • `cpuid` blows away the high 32-bits of `RAX`, `RBX`, `RCX` and `RDX` according to [Intel® 64 and IA-32 Architectures Software Developer Manual](https://www-ssl.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html), Vol 2A, page 3-177. That could cause trouble under Linux with PIC because the Global Offset Table (GOT) is held in EBX. Also see this answer on the ABI: [GCC not saving/restoring reserved registers on function calls](http://stackoverflow.com/a/6863420/608639). – jww Aug 19 '15 at 05:59
  • @jww Are you sure that the compiler doesn't automatically save/restore what's in e/rax, bx, cx, and dx? Looking at the inline asm docs, output operands are implicitly part of the clobber list. Or does that only apply to the bottom 32 bits? – Mysticial Aug 19 '15 at 14:12
  • @Mystical - yes, certain. Also see [Proper use of x86/x86_64 CPUID instruction with extended assembler](https://gcc.gnu.org/ml/gcc-help/2015-08/msg00088.html) on the GCC Help mailing list. – jww Aug 19 '15 at 15:07
  • @jww Oh wow, that's a very recent thread. I'll have to take a look when I get the time and see how to best fix this. I'm surprised this has lasted so long without any issues. – Mysticial Aug 19 '15 at 15:18
  • @Mystical - Yeah, that's me suffering it. So there's at least two of us in the same boat (likely more). I came across it while auditing code that took the ABI into account. It looks like the safe strategy is to simply save `EBX` and `RBX`. – jww Aug 19 '15 at 15:41
  • 1
    @jww I'm probably gonna change it to [this instead](http://stackoverflow.com/a/14266932/922184). Then I dump the problem on the GCC folks. :P (Which they've solved already?) Since we're using GCC-specific inline asm, there shouldn't be any loss portability. ICC supports both MSVC intrinsics on Windows and I believe GCC builtins on Linux (heck, they even support GCC inline asm on Windows). So it shouldn't be a problem. I'm not sure about Clang though. – Mysticial Aug 19 '15 at 15:49
58

Mysticial's answer is a bit dangerous -- it explains how to detect CPU support but not OS support. You need to use _xgetbv to check whether the OS has enabled the required CPU extended state. See here for another source. Even gcc has made the same mistake. The meat of the code is:

bool avxSupported = false;

// CPUID leaf 1; both bits tested below are read from cpuInfo[2].
int cpuInfo[4];
__cpuid(cpuInfo, 1);

const bool osxsaveEnabled = (cpuInfo[2] & (1 << 27)) != 0;  // OS uses XSAVE/XRSTOR
const bool cpuHasAvx      = (cpuInfo[2] & (1 << 28)) != 0;  // CPU reports AVX

// Only read the extended control register when both bits are set.
if (osxsaveEnabled && cpuHasAvx)
{
    const unsigned long long enabledStateMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    avxSupported = (enabledStateMask & 0x6) == 0x6;
}
Mysticial
  • 464,885
  • 45
  • 335
  • 332
Andy Lutomirski
  • 1,343
  • 12
  • 15
  • 1
    +1, so I didn't have to look up and test it myself. I'll keep my answer specific to whether the CPU supports it and point to yours about proper OS support for 256-bit AVX. – Mysticial Mar 20 '14 at 01:33
  • 3
    Regarding `_xgetbv`: using MSVC, you need `#include <immintrin.h>` and using GCC you need `#include <x86intrin.h>`. Also in GCC, `_XCR_XFEATURE_ENABLED_MASK` doesn't exist so you should just use `_xgetbv(0)`. Thanks for the code :) – antou Nov 24 '20 at 11:38
12

After quite a bit of googling, I also found the solutions from Intel:

Link: https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family

    /*
     * Executes CPUID with leaf `eax` and sub-leaf `ecx` and stores the
     * resulting EAX, EBX, ECX, EDX in abcd[0], abcd[1], abcd[2], abcd[3].
     * `abcd` must point to at least four uint32_t values.
     */
    void cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) {
#if defined(_MSC_VER)
        __cpuidex((int*)abcd, eax, ecx);
#else
        uint32_t ebx = 0, edx = 0;
# if defined( __i386__ ) && defined ( __PIC__ )
        /* in case of PIC under 32-bit EBX cannot be clobbered:
           park it in EDI around the instruction and hand the result back
           through EDI */
        __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
                : "=D" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx));
# else
        /* EBX is a pure output of CPUID, so it is bound write-only ("=b");
           a read-write binding would read the local before it is set */
        __asm__("cpuid"
                : "=b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx));
# endif
        abcd[0] = eax;
        abcd[1] = ebx;
        abcd[2] = ecx;
        abcd[3] = edx;
#endif
    }

    /*
     * Reads the low 32 bits of XCR0 and returns 1 when both the xmm and
     * ymm state bits (mask 6) are enabled, 0 otherwise.
     */
    int check_xcr0_ymm()
    {
        const uint32_t xmm_ymm_state_mask = 6;
        uint32_t xcr0_low;

#if defined(_MSC_VER)
        /* min VS2010 SP1 compiler is required */
        xcr0_low = (uint32_t)_xgetbv(0);
#else
        /* xgetbv with ECX = 0 returns XCR0 in EDX:EAX; only EAX is kept */
        __asm__("xgetbv" : "=a" (xcr0_low) : "c" (0) : "%edx");
#endif

        return (xcr0_low & xmm_ymm_state_mask) == xmm_ymm_state_mask;
    }

Also note that GCC has some special intrinsics that you can use (see: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html ):

    if (__builtin_cpu_supports("avx2"))
    // ...

If you put this together with the information above, it'll all work out fine.

atlaste
  • 30,418
  • 3
  • 57
  • 87
  • 1
    Modern GNU C compilers know how to save/restore EBX around an inline-asm statement in PIC code. This workaround isn't needed anymore, but doesn't cause harm beyond possibly a tiny bit of code-size overhead in a function that probably only runs a couple times in your whole program. (Cache your CPUID results; it's not a fast instruction.) – Peter Cordes Jul 10 '19 at 04:23
8

To add to Abhiroop's answer: On linux, you can run this shell command to find out the features supported by your CPU

cat /proc/cpuinfo | grep flags | uniq

On my machine this prints

flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single retpoline kaiser fsgsbase bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx xsaveopt

rahul003
  • 155
  • 1
  • 6
7

On macOS this works:

sysctl -a | grep machdep.cpu.features

In my machine it outputs this:

machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C

As you can see with the instructions written in bold, SSE3 and a bunch of other SIMD instructions are supported.

Abhiroop Sarkar
  • 2,251
  • 1
  • 26
  • 45
  • You forgot to bold SSSE3, which includes important stuff like `pshufb`. You also left out FMA, which is very important for some uses. (Weird that you have FMA but not AVX2. Is that an AMD Piledriver or Steamroller CPU in your Mac?) – Peter Cordes Apr 30 '19 at 17:05
  • 1
    For some reason AVX2 support is listed in `machdep.cpu.leaf7_features` instead of `machdep.cpu.features` – porglezomp Aug 21 '19 at 07:56
3

Alternatively, on Linux or WSL2, the lscpu command from the util-linux package will do the job.

E.g:

# Linux lists SSE3 under the flag name "pni"; the pattern "sse3" only matches as part of "ssse3".
lscpu | grep -w pni
abu_bua
  • 1,361
  • 17
  • 25
0

Or the "POSIX way":

# Linux lists SSE3 under the flag name "pni"; the pattern "sse3" only matches as part of "ssse3".
grep -qw pni /proc/cpuinfo 2>/dev/null && echo "ESS3=TRUE" || echo "ESS3=FALSE"

Results:

ESS3=TRUE
Tweeks
  • 19
  • 4