SSE shifting instruction causes weird output (-1.#IND00) in subsequent instruction(s)?

Question

This error may not actually show up on all machines but on mine I ran the code below and got the output (notice the value -1.#IND00)?

values int:: 4 2
shifts:: 4 2
result: : 64 32
input 1 HADDPD:: 10.000000 -1.#IND00
input 2 HADDPD:: 13.000000 10.000000
result of HADDPD:: -1.#IND00 23.000000

If I comment out

__m64 PSLLDm64_IN = _mm_set_pi32(2,4);
    __m64 PSLLDm64_C = _mm_set_pi32(2,4);//could this be the culprit?
    __m64 PSLLDm64_r  =  PSLLD(PSLLDm64_IN, PSLLDm64_C);

    print_2_32_bit_int("values int:" , PSLLDm64_IN);
    print_2_32_bit_int("shifts:", PSLLDm64_C);
    print_2_32_bit_int("result: ", PSLLDm64_r);

I get...

input 1 HADDPD:: 10.000000 100.000000
input 2 HADDPD:: 13.000000 10.000000
result of HADDPD:: 110.000000 23.000000

I'm wondering if line 32 where __m64 PSLLDm64_C = _mm_set_pi32(2,4); could be screwed up?

Heres the complete code (it runs with -msse3 -mmmx using g++) not all the headers are really necessary though.

#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <iostream>

void print_2_64_bit_doubles(const char * label, __m128d m64_r)
{
    double *val = (double *) &m64_r;
    printf("%s: %f %f\n",
       label, val[0], val[1]);
}
void print_2_32_bit_int(const char * label, __m64 m32_r)
{
    int *val = (int *) &m32_r;
    printf("%s: %d %d\n",
       label, val[0], val[1]);
}
__m128d HADDPD(__m128d __X, __m128d __Y)
{
    return _mm_hadd_pd ( __X, __Y);
}
__m64 PSLLD(__m64 __m, __m64 __count)
{
    return _mm_sll_pi32 ( __m,  __count);
}
int main()
{
    //PSLLD-------------------------------------------------------------------
    __m64 PSLLDm64_IN = _mm_set_pi32(2,4);
    __m64 PSLLDm64_C = _mm_set_pi32(2,4);
    __m64 PSLLDm64_r  =  PSLLD(PSLLDm64_IN, PSLLDm64_C);

    print_2_32_bit_int("values int:" , PSLLDm64_IN);
    print_2_32_bit_int("shifts:", PSLLDm64_C);
    print_2_32_bit_int("result: ", PSLLDm64_r);
    //HADDPD------------------------------------------------------------------
    double C1 = 10;
    double D = C1*C1;
    double x = 10;
    double y = 13;

    __m128d HADDPDm64_1 = _mm_set_pd(D,C1);
    __m128d HADDPDm64_2 = _mm_set_pd(x,y);
    __m128d HADDPDm64_r = HADDPD( HADDPDm64_1, HADDPDm64_2);

    print_2_64_bit_doubles("input 1 HADDPD:", HADDPDm64_1);
    print_2_64_bit_doubles("input 2 HADDPD:", HADDPDm64_2);
    print_2_64_bit_doubles("result of HADDPD:", HADDPDm64_r);

    return 0;
}

EDIT: This is the updated code with the new shifting instructions compiled with g++ 4.4.1 -msse -msse2 -msse3 -msse4

#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <mmintrin.h>
#include <stdio.h>
#include <stdint.h>


void print_2_64_bit_doubles(const char * label, __m128d m64_r)
{
    double *val = (double *) &m64_r;
    printf("%s: %f %f\n",
       label, val[0], val[1]);
}
void print_2_32_bit_int(const char * label, __m64 m32_r)
{
    int *val = (int *) &m32_r;
    printf("%s: %d %d\n",
       label, val[0], val[1]);
}
void print_1_32_bit_int(const char * label, __m64 m32_r)
{
    int *val = (int *) &m32_r;
    printf("%s: %d \n",
       label, val[0]);
}
__m128d HADDPD(__m128d __X, __m128d __Y)
{
    return _mm_hadd_pd ( __X, __Y);
}
__m64 PSLLD(__m64 __m, __m64 __count)
{
    return _mm_sll_pi32 ( __m,  __count);
}
int main()
{
    //PSLLD-------------------------------------------------------------------
    __m64 PSLLDm64_IN = _mm_set_pi32(2,4);
    long long __i = 2;
    __m64 PSLLDm64_C = (__m64)(__i);
    __m64 PSLLDm64_r  =  PSLLD(PSLLDm64_IN, PSLLDm64_C);
    _mm_empty();

    print_2_32_bit_int("values int:" , PSLLDm64_IN);
    print_1_32_bit_int("shifts:", PSLLDm64_C);
    print_2_32_bit_int("result: ", PSLLDm64_r);
    //HADDPD------------------------------------------------------------------
    double C1 = 10;
    double D = C1*C1;
    double x = 10;
    double y = 13;

    __m128d HADDPDm64_1 = _mm_set_pd(D,C1);
    __m128d HADDPDm64_2 = _mm_set_pd(x,y);
    __m128d HADDPDm64_r = HADDPD( HADDPDm64_1, HADDPDm64_2);

    print_2_64_bit_doubles("input 1 HADDPD:", HADDPDm64_1);
    print_2_64_bit_doubles("input 2 HADDPD:", HADDPDm64_2);
    print_2_64_bit_doubles("result of HADDPD:", HADDPDm64_r);

    return 0;
}

And the output

values int:: 4 2
shifts:: 2
result: : 16 8
input 1 HADDPD:: 10.000000 -1.#IND00
input 2 HADDPD:: 13.000000 10.000000
result of HADDPD:: -1.#IND00 23.000000

Works fine for me with gcc 4.2.1 - what compiler are you using ? — Paul R, Jul 04 '13 at 06:33
I wonder if you need an `_mm_empty()` after the 64 bit SIMD stuff ? — Paul R, Jul 04 '13 at 06:42
When I type in `g++ -v` into the command line I get `gcc 4.4.1 tdm-2 mingw 32`. I placed `_mm_empty();` right after `__m64 PSLLDm64_r = PSLLD(PSLLDm64_IN, PSLLDm64_C);` and same result with the output? — pandoragami, Jul 04 '13 at 14:23
Could it be that I'm using cygwin mmintrin headers with mingw? — pandoragami, Jul 04 '13 at 14:26
Beats me then - may be a compiler problem. The only code error I see is `__m64 PSLLDm64_C = _mm_set_pi32(2,4);` which should be e.g. `__m64 PSLLDm64_C = _mm_set_pi64x(2);` but that should not cause your particular problem. — Paul R, Jul 04 '13 at 14:27
What header do I use for `_mm_set_pi64x` ? I can't seem to find it any of them. — pandoragami, Jul 04 '13 at 14:34
It should be in `` but see also http://stackoverflow.com/questions/9061293/how-to-convert-long-long-or-int64-to-m64/9061485#9061485 - you can also try just `__m64 PSLLDm64_C = (__m64)(2);` — Paul R, Jul 04 '13 at 14:36
For this `__m64 PSLLDm64_C = (__m64)(2);` I'm getting a compliler error `cannot convert between vector values of different size` (might be c++ problem) and for the other `__m64 PSLLDm64_C = _mm_set_pi64x(2);` I have the `mmintrin.h` included but it says `_mm_set_pi64x` is not in scope. Thanks for your time though. — pandoragami, Jul 04 '13 at 14:49
I tried fixing the above compiler errors by using GCC for a C program (removed the header) and I get the same two compiler errors. Actually I get `error: incompatible types when initializing type '__m64' using type 'int'` for this `__m64 PSLLDm64_C = _mm_set_pi64x(2);` — pandoragami, Jul 04 '13 at 14:59
@ScottD Are you saying I should use 4.8.1 sse instrinsics for mingw? Do you have a hyperlink btw? — pandoragami, Jul 04 '13 at 15:13
@ Paul R I managed to get it going with the code I edited into the post above but I still get the output wrong. See edit above. I know its not your fault but I figure just for reference it should be known somewhere on the internet. — pandoragami, Jul 04 '13 at 21:13
I switched on the `-O3` flag and it works??? The output is normal now. Should I be worried? — pandoragami, Jul 04 '13 at 21:27

score 1 · Accepted Answer · 2013-07-07T01:08:59.077

Testing with the Windows x64 ports of gcc and g++ 4.8.1 from http://www.drangon.org/mingw/ both give expected results. Just unzip the archive and set the path to mingw64\bin. Use a compiler option such as -msse4 to tell the compiler your hardware supports these instructions.

07/05/2013: Sorry about the incomplete initial comment. Also, the above answer was intended to be a comment and not an answer.

Microsoft VS2010 gets the same incorrect result you report from cygwin, and the cause is easy to find with the Microsoft debugger. In fact, a compile warning also points out the problem:

warning C4730: 'main' : mixing _m64 and floating point expressions may result in incorrect code

The problem you report occurs when a compiler generates a mix of MMX and x87 FPU instructions. Compilers use MMX registers for _m64 data, and compilers use either x87 FPU registers or the newer XMM or YMM registers for floating point data type double. When Intel designed MMX, a decision to reuse the x87 registers for MMX register data. This was done so operating systems would not need any update in order to support MMX use. The drawback of this decision is that MMX and x87 FPU instructions cannot be mixed. To help prevent accidental mixing of FPU and MMX instructions, Intel made MMX register loads mark the tag word bits of the corresponding FPU register as SNAN (signalling NAN). That is what leads to the unexpected output you see. Some compiler and build option combinations may allow this code to function correctly. Possible reasons this code may work in some cases: 1) compiler uses XMM or YMM registers for double precision data. 2) compiler keeps all x87 FPU values in memory and does not rely on the FPU register state across MMX instructions. Bottom line is that it is up to the coder to avoid situations that allow the compiler to generate code that mixes MMX and x87 FPU instructions. Take warnings such as "function 'print_2_32_bit_int' has no EMMS instruction" or "mixing _m64 and floating point expressions may result in incorrect code" seriously. One approach that may be workable is to avoid the _m64 data type altogether.

Paul R's suggestion about using _mm_empty() solves the problem for Microsoft VS2010. I added it before 'double C1 = 10' and the problem went away. _mm_empty is explained here http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/intref_cls/common/intref_mmx_emms_usage.htm.

For your other questions, I am using command line only for gcc, no IDE. The older version of gcc should work OK if you add the _mm_empty() or avoid mixing MMX and x87 FPU code.

I'm just wondering if you're using an IDE such as codeblocks (which is what I'm on). I can change my toolchain to mingw64/bin but I cannot get it to compile anything (e.g. it stops dead saying the compilation failed). What IDE are you using by the way? — pandoragami, Jul 04 '13 at 16:05

SSE shifting instruction causes weird output (-1.#IND00) in subsequent instruction(s)?

1 Answers1