I'm using a library that uses Intel's MMX single instruction, multiple data (SIMD) instruction set to speed up the multiplication of integer arrays. The function I am using contains in-line assembly to use the MMX SIMD registers in Intel processors and perform the multiplication.
After multiplying two integer arrays with the function, I receive an array that contains incorrect values for integers that should be negative. However, when converting these values to binary, I notice the integers are representing the correct value in 2's complement. The integers should be, and are, 16 bits long.
Even stranger is when two negative integers are multiplied, as opposed to one positive one negative, the function returns an integer value that, when converted to binary, adds an extra bit as the most significant bit (tags additional bit onto left hand side of the binary number). That bit's value is 1, but if you ignore that bit, the rest of the bits correctly display the expected value.
It's hard to say in words so let me give an example:
I have three int arrays A, B, and C.
A = {-1, 4, 1, -1, 1, -2, -3, 7},
B = {-1, -1, -1, -1, -1, -1, -1, 1}
C = {0, 0, 0, 0, 0, 0, 0, 0}
When A and B are multiplied together, I would expect
{1, -4, -1, 1, -1, 2, 3, 7}
to be stored in C.
However after using the library's function, I get
{65537, 65532, 65535, 65537, 65535, 65538, 65539, 7}
as my values for C.
The first value, 65537, in binary is 10000000000000001. Without the extra 17th bit, this would equal 1, but even so, the value should just be 1, not 65537. The second value, 65532, in binary is 1111111111111100 which is 2's complement for -4. This is well and good but why isn't that value just -4. Also notice the last value, 7. When negative signs are not involved, the function gives a value in the expected form.
The in-line assembly is written for compilation on Microsoft Visual Studio but I'm using intel's c/c++ compiler with -use-msasm flag.
Here is the function code:
void mmx_mul(void *A, void *B, void *C, int cnt)
{
int cnt1;
int cnt2;
int cnt3;
cnt1 = cnt / 32;
cnt2 = (cnt - (32*cnt1)) / 4;
cnt3 = (cnt - (32*cnt1) - (4*cnt2));
__asm
{
//; Set up for loop
mov edi, A; // Address of A source1
mov esi, B; // Address of B source2
mov ebx, C; // Address of C dest
mov ecx, cnt1; // Counter
jecxz ZERO;
L1:
movq mm0, [edi]; //Load from A
movq mm1, [edi+8]; //Load from A
movq mm2, [edi+16]; //Load from A
movq mm3, [edi+24]; //Load from A
movq mm4, [edi+32]; //Load from A
movq mm5, [edi+40]; //Load from A
movq mm6, [edi+48]; //Load from A
movq mm7, [edi+56]; //Load from A
pmullw mm0, [esi]; //Load from B & multiply B * (A*C)
pmullw mm1, [esi+8]; //Load from B & multiply B * (A*C)
pmullw mm2, [esi+16]; //Load from B & multiply B * (A*C)
pmullw mm3, [esi+24]; //Load from B & multiply B * (A*C)
pmullw mm4, [esi+32]; //Load from B & multiply B * (A*C)
pmullw mm5, [esi+40]; //Load from B & multiply B * (A*C)
pmullw mm6, [esi+48]; //Load from B & multiply B * (A*C)
pmullw mm7, [esi+56]; //Load from B & multiply B * (A*C)
movq [ebx], mm0; //Store C = A*B
movq [ebx+8], mm1; //Store C = A*B
movq [ebx+16], mm2; //Store C = A*B
movq [ebx+24], mm3; //Store C = A*B
movq [ebx+32], mm4; //Store C = A*B
movq [ebx+40], mm5; //Store C = A*B
movq [ebx+48], mm6; //Store C = A*B
movq [ebx+56], mm7; //Store C = A*B
add edi, 64;
add esi, 64;
add ebx, 64;
loop L1; // Loop if not done
ZERO:
mov ecx, cnt2;
jecxz ZERO1;
L2:
movq mm1, [edi]; //Load from A
pmullw mm1, [esi]; //Load from B & multiply B * (A*C)
movq [ebx], mm1;
add edi, 8;
add esi, 8;
add ebx, 8;
loop L2;
ZERO1:
mov ecx, cnt3;
jecxz ZERO2;
mov eax, 0;
L3: //Really finish off loop with non SIMD instructions
mov eax, [edi];
imul eax, [esi];
mov [ebx], ax;
add esi, 2;
add edi, 2;
add ebx, 2;
loop L3;
ZERO2:
EMMS;
}
}
and an example of me calling it.
int A[8] = {-1, 4, 1, -1, 1, -2, -3, 7};
int B[8] = {-1, -1, -1, -1, -1, -1, -1, 1};
int C[8];
mmx_mul(A, B, C, 16);
The last argument 16 is the number of total elements in A and B combined.
The library I am using is free-use and can be found at https://www.ngs.noaa.gov/gps-toolbox/Heckler.htm