Today I stumbled across a weird problem. Consider this simple program where I try to emulate MMX's PADDW
instruction:
#include <cstdint>
#include <cstdio>
// Adds two 64-bit values as four independent 16-bit lanes (a packed-word add),
// writing the lane sums into c, then prints the lanes of a, b and c followed
// by c as a single 64-bit hexadecimal number.
int main()
{
uint64_t a = 0;
uint64_t b = 0x1234123412341234;
uint64_t c = 0;
// NOTE(review): the three pointers below view uint64_t objects as uint16_t.
// uint16_t is not among the types the C++ aliasing rules ([basic.lval]) permit
// for accessing a uint64_t object, so every read/write through them is
// undefined behavior. An optimizer is allowed to assume the stores through
// c_words never modify c.
uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);
// Lane-wise add: the operands are promoted to int for the addition and the
// result is narrowed back to uint16_t on assignment, so no carry crosses
// into the neighbouring lane.
for (size_t i = 0; i < 4; i ++)
c_words[i] = a_words[i] + b_words[i];
// Print the four lanes of each operand and of the result, one value per row.
printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
// Reads c directly (not through c_words).
// NOTE(review): %llx expects unsigned long long; uint64_t may be a different
// type on some platforms - PRIx64 from <cinttypes> is the portable specifier.
printf("%016llx\n", c);
return 0;
}
Compiling this and running with g++ -std=c++11 test.cpp -o test && ./test
results in the following:
0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
1234123412341234
However, if I enable -O2, it displays the wrong value (with -O1 it still works):
0 0 0 0
4660 4660 4660 4660
4660 4660 4660 4660
0000000000000000
Why is that?
Other observations:
If I unroll the loop, compiling with
-O2
works (!!):
#include <cstdint>
#include <cstdio>
// Manually unrolled variant of the 16-bit lane-wise add: the four lane sums
// are written as four separate statements instead of a loop.
int main()
{
    uint64_t a = 0;
    uint64_t b = 0x1234123412341234;
    uint64_t c = 0;
    // NOTE(review): accessing uint64_t objects through uint16_t pointers is
    // not permitted by the C++ aliasing rules ([basic.lval]); the behavior is
    // undefined even when the printed output happens to look correct.
    uint16_t *a_words = reinterpret_cast<uint16_t*>(&a);
    uint16_t *b_words = reinterpret_cast<uint16_t*>(&b);
    uint16_t *c_words = reinterpret_cast<uint16_t*>(&c);
    // One statement per 16-bit lane; each sum is narrowed back to uint16_t.
    c_words[0] = a_words[0] + b_words[0];
    c_words[1] = a_words[1] + b_words[1];
    c_words[2] = a_words[2] + b_words[2];
    c_words[3] = a_words[3] + b_words[3];
    // Print the four lanes of each operand and of the result.
    printf("%d %d %d %d\n", a_words[0], a_words[1], a_words[2], a_words[3]);
    printf("%d %d %d %d\n", b_words[0], b_words[1], b_words[2], b_words[3]);
    printf("%d %d %d %d\n", c_words[0], c_words[1], c_words[2], c_words[3]);
    // Reads c directly (not through c_words).
    // NOTE(review): %llx expects unsigned long long; PRIx64 from <cinttypes>
    // is the portable specifier for uint64_t.
    printf("%016llx\n", c);
    return 0;
}
If I work on a very similar problem, but with 32-bit integers instead of 64-bit ones, it works as well:
#include <cstdint>
#include <cstdio>

// 32-bit variant of the experiment: adds two 32-bit values as four
// independent 8-bit lanes, then prints the lanes of both operands and of the
// result, followed by the result as one 32-bit hexadecimal number.
int main()
{
    uint32_t lhs = 0;
    uint32_t rhs = 0x12121212;
    uint32_t sum = 0;

    // Byte-wise views of the three 32-bit objects.
    uint8_t *lhs_bytes = reinterpret_cast<uint8_t*>(&lhs);
    uint8_t *rhs_bytes = reinterpret_cast<uint8_t*>(&rhs);
    uint8_t *sum_bytes = reinterpret_cast<uint8_t*>(&sum);

    // Lane-wise add; each sum is narrowed back to uint8_t on assignment.
    for (std::size_t lane = 0; lane != 4; ++lane)
    {
        sum_bytes[lane] = lhs_bytes[lane] + rhs_bytes[lane];
    }

    // Prints the four byte lanes of one value on a single row.
    auto print_lanes = [](const uint8_t *lanes)
    {
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    };
    print_lanes(lhs_bytes);
    print_lanes(rhs_bytes);
    print_lanes(sum_bytes);

    printf("%08x\n", sum);
    return 0;
}
The problem occurs on both 32-bit and 64-bit machines. I tried g++ (GCC) 4.9.2
on Cygwin and g++ (Debian 4.9.1-19) 4.9.1
on GNU/Linux.