3

Today I stumbled across a weird problem. Consider this simple program where I try to emulate MMX's PADDW instruction:

#include <cstdint>
#include <cstdio>

// Demonstration program: emulates MMX's PADDW (lane-wise add of four 16-bit
// words packed in a 64-bit value) and prints the operands and the result.
//
// NOTE: a, b and c are uint64_t objects but are read and written through
// uint16_t pointers. That breaks the strict-aliasing rule and is undefined
// behaviour, so an optimizing compiler may assume the stores through c_words
// never modify c.
int main()
{
    uint64_t a = 0;
    uint64_t b = 0x1234123412341234;

    uint64_t c = 0;
    // Reinterpret each 64-bit object as an array of four 16-bit lanes.
    // This type punning is the source of the undefined behaviour.
    uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
    uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
    uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);

    // Per-lane add: the sum is computed after integer promotion and converted
    // back to uint16_t on assignment, so each lane wraps independently.
    for (size_t i = 0; i < 4; i ++)
        c_words[i] = a_words[i] + b_words[i];

    // Lanes are read back through the uint16_t pointers.
    printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
    printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
    printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
    // Here c is read as a uint64_t again; in the reported -O2 runs this printed
    // all zeros even though the lanes printed above were non-zero.
    // NOTE(review): %llx expects unsigned long long, while uint64_t may be a
    // different type on some platforms; PRIx64 from <cinttypes> is the portable
    // specifier.
    printf("%016llx\n", c);
    return 0;
}

Compiling and running this with g++ -std=c++11 test.cpp -o test && ./test results in the following:

0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
1234123412341234

However, if I enable -O2, the last line shows the wrong value for c (with -O1 it still works):

0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
0000000000000000

Why is that?


Other observations:

  1. If I unroll the loop, compiling with -O2 works (!!):

    #include <cstdint>
    #include <cstdio>
    
    // Variant of the PADDW emulation with the four lane additions written out
    // by hand instead of in a loop.
    //
    // NOTE: this still accesses uint64_t objects through uint16_t pointers, so
    // it has the same strict-aliasing violation (undefined behaviour). That it
    // printed the expected value under -O2 in the reported runs is not guaranteed.
    int main()
    {
        uint64_t a = 0;
        uint64_t b = 0x1234123412341234;
    
        uint64_t c = 0;
        // Reinterpret each 64-bit object as four 16-bit lanes (type punning).
        uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
        uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
        uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);
    
        // Manually unrolled per-lane add; each sum is converted back to
        // uint16_t on assignment, so lanes wrap independently.
        c_words[0] = a_words[0] + b_words[0];
        c_words[1] = a_words[1] + b_words[1];
        c_words[2] = a_words[2] + b_words[2];
        c_words[3] = a_words[3] + b_words[3];
    
        // Lanes are read back through the uint16_t pointers.
        printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
        printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
        printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
        // c is read here as a uint64_t again.
        // NOTE(review): %llx expects unsigned long long; PRIx64 from <cinttypes>
        // is the portable specifier for uint64_t.
        printf("%016llx\n", c);
        return 0;
    }
    
  2. If I work with a very similar problem, but with 32-bit integers split into bytes instead of 64-bit integers split into 16-bit words, it works as well:

    #include <cstdint>
    #include <cstdio>
    
    // Variant of the packed-add emulation using 32-bit values split into four
    // 8-bit lanes.
    //
    // NOTE(review): uint8_t is usually a typedef for unsigned char, and access
    // through unsigned char is exempt from the strict-aliasing rule; that is
    // presumably why this version is unaffected by -O2. Confirm for the
    // target platform.
    int main()
    {
        uint32_t a = 0;
        uint32_t b = 0x12121212;
    
        uint32_t c = 0;
        // View each 32-bit object as four individual bytes.
        uint8_t *a_words = reinterpret_cast<uint8_t*>(&a);
        uint8_t *b_words = reinterpret_cast<uint8_t*>(&b);
        uint8_t *c_words = reinterpret_cast<uint8_t*>(&c);
    
        // Per-byte add; each sum is converted back to uint8_t on assignment,
        // so lanes wrap independently.
        for (size_t i = 0; i < 4; i ++)
            c_words[i] = a_words[i] + b_words[i];
    
        // Bytes are read back through the uint8_t pointers.
        printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
        printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
        printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
        // c is read here as a uint32_t again.
        // NOTE(review): %x expects unsigned int; this assumes uint32_t is
        // unsigned int on the target (PRIx32 is the portable specifier).
        printf("%08x\n", c);
        return 0;
    }
    

The problem recurs on both 32-bit and 64-bit machines. Tried g++ (GCC) 4.9.2 on Cygwin and g++ (Debian 4.9.1-19) 4.9.1 on GNU/Linux.

rr-
  • 14,303
  • 6
  • 45
  • 67
  • 4
    You violate strict aliasing, which results in undefined behaviour, which your compiler exploits. – milleniumbug Feb 15 '15 at 13:48
  • 2
    Have you tried fno-strict-aliasing? – harold Feb 15 '15 at 13:49
  • 1
    http://stackoverflow.com/questions/2958633/gcc-strict-aliasing-and-horror-stories – Hans Passant Feb 15 '15 at 13:49
  • It works with `-fno-strict-aliasing`, thank you! However, judging from the tone of your comments I feel I'm doing the whole thing totally wrong. Mind giving me a clue how could I tackle the problem in a more elegant way? – rr- Feb 15 '15 at 13:51
  • @rr- Yes, breaking strict aliasing is totally wrong. Consider `memcpy()` or better yet, `std::copy()` for extracting bytes from an object and putting it into an object of incompatible type. – The Paramagnetic Croissant Feb 15 '15 at 13:54
  • 1
    In this case, you could use the good old SWAR addition: `((a & 0x7FFF7FFF7FFF7FFF) + (b & 0x7FFF7FFF7FFF7FFF)) ^ ((a ^ b) & 0x8000800080008000)` – harold Feb 15 '15 at 13:55

1 Answers1

4

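This is a strict aliasing violation. You access an object of one type (uint64_t) through a pointer to an unrelated type (uint16_t). The C++ standard says you can't do that; the only exception to this rule is access through char and its unsigned and signed variants. Because the behaviour is undefined, the optimizer is allowed to assume that writes through a uint16_t* never modify a uint64_t object.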

The code is non-portable in any case, but if you still want to do this legally, what are your options?

  • copy from uint64_t to uint16_t array (by memcpy or std::copy), modify the values, copy it back.
  • OR use compiler intrinsics which translate directly to vectorized instructions
  • OR disable strict aliasing.
milleniumbug
  • 15,379
  • 3
  • 47
  • 71