EDIT partial solution below (EDIT 2), but I have still one question (see at the end)
I am trying to compile the following C program with gcc-4.9.2, on Windows 7, 32 bits, running on a Pentium G3220 (according to Windows System Information). If I understand correctly, this processor does not have AVX extensions, so it's quite natural that something happens, I am just unsure about what exactly. Initially, I was playing with optimizations with gcc, and I tried -mavx rather "accidentally".
The following program computes permutations of numbers 0 ... n-1 (with n given as an argument) in lexicographic order, and also rank of each permutation (its position in this sequential order), and "unrank" (recover permutation from rank), and checks that all of these are correct. It should only print "OK" or "Error" in the end.
- With
gcc -O3
, the program runs correctly with all integer input I checked (1 <= n <= 11). - With
gcc -O3 -mavx
, it runs correctly for 1 <= n <= 7, and for n >= 8, it prints nothing, and actually it does nothing (almost no delay before exiting). I get no message from the program or from Windows (I would have expected maybe a crash with an unknown instruction, but it didn't happen).
(On another computer with Windows 7 64 bits, on a core-i5, and the same gcc-4.9.2, the program seems to run fine with of without -mavx, when compiled either in 32 or 64 bits)
What I don't understand is why it runs correctly for some input values, and fails for other ones. Does anybody have some hint about this?
Here is the full program, followed by a shorter one with the same problem.
#include <stdlib.h>
#include <stdio.h>
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
int next_perm(int n, int a[n]) {
int i, j, k;
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0;
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
void copyvec(int n, int dst[n], int src[n]) {
int i;
for(i = 0; i < n; i++) {
dst[i] = src[i];
}
}
int eqvec(int n, int a[n], int b[n]) {
int i;
for(i = 0; i < n; i++) {
if(a[i] != b[i]) return 0;
}
return 1;
}
int rank(int n, int a[n]) {
int v[n], i, j, r;
v[n - 1] = 1;
for(j = n - 2; j >= 0; j--) v[j] = v[j + 1]*(n - 1 - j);
for(r = i = 0; ; i++) {
for(j = i; j < n; j++) {
if(a[j] > j) goto cont;
}
return r;
cont:
i = j;
r += v[i]*(a[i] - i);
for(j = i + 1; j < n; j++) {
if(a[j] < a[i]) a[j]++;
}
}
}
void unrank(int n, int a[n], int p) {
int v[n], i, j, r, s;
v[n - 1] = 1;
for(i = n - 2; i >= 0; i--) v[i] = v[i + 1]*(n - 1 - i);
p %= n*v[0];
for(i = 0; i < n; i++) a[i] = i;
for(i = 0; p > 0; i++) {
for(; v[i] > p; i++);
r = p/v[i];
p %= v[i];
for(s = a[j = i + r]; j >= i; j--) a[j] = a[j - 1];
a[i] = s;
}
}
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0;
int *a = NULL, *b = NULL, *c = NULL;
if(argc == 2 && (n = strtol(argv[1], NULL, 0)) > 0) {
a = malloc(n*sizeof(int));
b = malloc(n*sizeof(int));
c = malloc(n*sizeof(int));
if(!a || !b || !c) {
puts("Unable to allocate memory");
goto end;
} else {
for(i = 0; i < n; i++) a[i] = i;
do {
copyvec(n, b, a);
r = rank(n, b);
unrank(n, c, r);
q |= s++ != r || !eqvec(n, a, c);
} while(next_perm(n, a));
puts(q?"Error":"OK");
}
} else {
puts("perm n - Check all permutations of {0 ... n - 1}, with n > 0");
}
end:
if(a) free(a);
if(b) free(b);
if(c) free(c);
return 0;
}
EDIT
Following Brian Cain's comment, here is a shorter program with the same problem. I removed all checks on input value, all the rank/unrank stuff, and I replaced the malloc/free with an array of size 20 (only one now, since b and c are not used anymore). Now the program only computes the permutations with the while(next_perm(n, a));
loop, and does nothing with them. It should still print "OK" in the end, though, because the value of q does not change after the initial q=0.
#include <stdlib.h>
#include <stdio.h>
#define SWAP(a,b) {int c; c = a; a = b; b = c;}
int next_perm(int n, int a[n]) {
int i, j, k;
for(i = n - 1; i > 0 && a[i - 1] > a[i]; i--);
for(j = i, k = n - 1; j < k; j++, k--) SWAP(a[j], a[k]);
if(i == 0) return 0;
for(j = i--; a[j] < a[i]; j++);
SWAP(a[i], a[j]);
return 1;
}
#undef SWAP
int main(int argc, char **argv) {
int n, i, r, s = 0, q = 0, a[20];
n = strtol(argv[1], NULL, 0);
for(i = 0; i < n; i++) a[i] = i;
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
EDIT 2: explanation of the assembly output
I add also the disassembly output of gcc (in Intel syntax), found with gcc -O3 -mavx -S -masm=intel
and gcc-4.9.2 (see link above for the actual binary files of the compiler). However, it needs some work, because as is, gcc will inline the call to next_perm, and it's less readable. I also remove the CFI directives and alignment and actually all other directives, to improve readability:
_next_perm:
LFB0:
push ebp
push edi
push esi
push ebx
mov ecx, DWORD PTR [esp+20]
mov edx, DWORD PTR [esp+24]
lea eax, [ecx-1]
test eax, eax
jle L12
mov edi, DWORD PTR [edx-4+ecx*4]
cmp DWORD PTR [edx-8+ecx*4], edi
mov ecx, eax
jg L5
jmp L11
L28:
mov esi, DWORD PTR [edx+ecx*4]
cmp DWORD PTR [edx-4+ecx*4], esi
jle L27
L5:
sub ecx, 1
jne L28
L4:
mov ebx, ecx
L7:
mov esi, DWORD PTR [edx+ebx*4]
mov edi, DWORD PTR [edx+eax*4]
mov DWORD PTR [edx+ebx*4], edi
mov DWORD PTR [edx+eax*4], esi
add ebx, 1
sub eax, 1
cmp ebx, eax
jl L7
L2:
xor eax, eax
test ecx, ecx
je L23
L11:
sal ecx, 2
lea esi, [edx+ecx]
lea ebp, [edx-4+ecx]
mov ebx, DWORD PTR [esi]
mov edi, DWORD PTR [ebp+0]
cmp edi, ebx
jle L9
lea eax, [edx+4+ecx]
L10:
mov esi, eax
add eax, 4
mov ebx, DWORD PTR [eax-4]
cmp ebx, edi
jl L10
L9:
mov DWORD PTR [ebp+0], ebx
mov eax, 1
mov DWORD PTR [esi], edi
L23:
pop ebx
pop esi
pop edi
pop ebp
ret
L27:
cmp eax, ecx
jg L4
jmp L11
L12:
mov ecx, eax
jmp L2
The assembly output is the same with or without -mavx, apart from label numbers: there is no AVX instruction, which means the problem actually lies in main
.
This can be checked by adding some puts
in main:
int main(int argc, char **argv) {
int n, i, q = 0, a[20];
puts("X");
n = strtol(argv[1], NULL, 0);
puts("Y");
for(i = 0; i < n; i++) a[i] = i;
puts("Z");
while(next_perm(n, a));
puts(q?"Error":"OK");
return 0;
}
Then, the programs prints only X and Y when it fails, hence the problem comes from the AVX instructions used to build 'a' in the for loop between Y and Z.
Here is the assembly output of main
, again without directives (LC2 points to "Y", and LC3 to "Z"). The only AVX instructions in the assembly ouptut of main are between those two puts
, and they are used for the for
loop that builds the initial 'a', that is the array {0, 1, ..., n-1}. What happens actually, is that AVX instructions are used to build several elements of 'a' at a time (4 I guess), and if the length of 'a' is not a multiple of 4, then there is an additional step (between L4 and L9), before calling the puts("Z") at L9, then the while(next_perm(n, a)); at L3. Thus, the problem is very simple: if n
is small enough, then the AVX loop is actually not run, and there is no error. Here the maximum valid n
is 4, but it varies between differents runs of gcc, it's a bit randomized it seems (I got 8 yesterday).
The LC0 and LC4 labels point to two arrays of 4 elements that are used by the AVX instructions: LC0 is {0,1,2,3}, and LC4 is {4,4,4,4}. No wonder why they are here, even without deep knowledge of AVX, it smells like an unrolled loop :-)
_main:
push ebp
mov ebp, esp
push edi
push esi
push ebx
and esp, -16
sub esp, 96
call ___main
mov DWORD PTR [esp], OFFSET FLAT:LC1
call _puts
mov eax, DWORD PTR [ebp+12]
mov DWORD PTR [esp+8], 0
mov DWORD PTR [esp+4], 0
mov eax, DWORD PTR [eax+4]
mov DWORD PTR [esp], eax
call _strtol
mov DWORD PTR [esp], OFFSET FLAT:LC2
mov ebx, eax
call _puts
test ebx, ebx
jle L17
lea edx, [ebx-4]
lea ecx, [ebx-1]
shr edx, 2
add edx, 1
cmp ecx, 3
lea eax, [0+edx*4]
jbe L10
vmovdqa xmm1, XMMWORD PTR LC4
lea esi, [esp+16]
xor ecx, ecx
vmovdqa xmm0, XMMWORD PTR LC0
L5:
mov edi, ecx
add ecx, 1
sal edi, 4
cmp edx, ecx
vmovaps XMMWORD PTR [esi+edi], xmm0
vpaddd xmm0, xmm0, xmm1
ja L5
cmp ebx, eax
je L9
L4:
lea edx, [eax+1]
mov DWORD PTR [esp+16+eax*4], eax
cmp ebx, edx
jle L9
mov DWORD PTR [esp+16+edx*4], edx
lea edx, [eax+2]
cmp ebx, edx
jle L9
add eax, 3
mov DWORD PTR [esp+16+edx*4], edx
cmp ebx, eax
jle L9
mov DWORD PTR [esp+16+eax*4], eax
L9:
mov DWORD PTR [esp], OFFSET FLAT:LC3
call _puts
L3:
mov DWORD PTR [esp+4], esi
mov DWORD PTR [esp], ebx
call _next_perm
test eax, eax
jne L3
mov DWORD PTR [esp], OFFSET FLAT:LC5
call _puts
lea esp, [ebp-12]
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
L10:
xor eax, eax
lea esi, [esp+16]
jmp L4
L17:
lea esi, [esp+16]
jmp L9
Now, I understand what actually happens, but one question remains: why is there no error message whatsoever when the program tries to run an AVX instruction? It simply exits, or it's killed, but without any hint that something went wrong.