When implementing different sort algorithms for my computer science class I stumbled upon a curious optimization case with GCC(version 12.2).
I tried programming selection sort with two different variants: once storing the value of the current minimum (selection_sort) and once with only storing the index of the current minimum value (selection_nostore_sort)
links to godbolt and quick-bench (code & relevant assembly is also down below):
https://gcc.godbolt.org/z/Gdab1xenf
https://quick-bench.com/q/5xh06k8DD9qi1RCFvFMVBdbkDuw
I was expecting the nostore version to be slower because it has to reach out into the array for comparisons.
This was the case up to -O2
, however with -O3
nostore runs about 10% faster (both on my local machine i7-10750H and quick-bench)
Testing out all the extra optimizations that -O3
enables I was able to figure out that -O2 -ftree-partial-pre
produces the same result.
When checking out the generated assembly in both cases I noticed that -ftree-partial-pre
actually produces more assembler code yet runs faster.
I dont see any obvious reason why the -ftree-partial-pre
version runs faster, what could be the causes here?
Thanks a lot for your help! My professors were sadly unable to give me an explanation.
#include <vector>
#include <random>
#include <functional>
using std::size_t;
const size_t DATA_ELEMS = 10'000;
const size_t RAND_RANGE = 1'000'000;
std::vector<size_t> random_data()
{
std::srand(42);
std::vector<size_t> result(DATA_ELEMS);
const auto rand_size_t_range = [] () -> size_t {
return std::rand() % RAND_RANGE;
};
std::generate(result.begin(), result.end(), rand_size_t_range);
return result;
}
inline void swap(std::vector<size_t> &a, size_t i1, size_t i2)
{
size_t h = a[i1];
a[i1] = a[i2];
a[i2] = h;
}
void selection_sort(std::vector<size_t> &a)
{
for (size_t k = 0; k < a.size(); k++)
{
size_t min = a[k];
size_t i_min = k;
for (size_t i = k + 1; i < a.size(); i++)
{
if (a[i] < min)
{
min = a[i];
i_min = i;
}
}
if (i_min != k)
{
swap(a, i_min, k);
}
}
}
/*
This is normally quite a bit slower than selection_sort
However compiling with -02 -ftree-partial-pre seems to make this function
quite a bit faster than selection_sort
*/
void selection_nostore_sort(std::vector<size_t> &a)
{
for (size_t k = 0; k < a.size(); k++)
{
size_t i_min = k;
for (size_t i = k + 1; i < a.size(); i++)
{
if (a[i] < a[i_min])
{
i_min = i;
}
}
if (i_min != k)
{
swap(a, i_min, k);
}
}
}
The -O2
version
selection_nostore_sort(std::vector<unsigned long, std::allocator<unsigned long> >&):
mov rax, QWORD PTR [rdi+8]
mov rcx, QWORD PTR [rdi]
mov rsi, rax
sub rsi, rcx
sar rsi, 3
cmp rcx, rax
je .L10
xor r8d, r8d
lea rdi, [r8+1]
cmp rdi, rsi
jnb .L10
.L18:
mov rax, rdi
mov rdx, r8
.L13:
mov r9, QWORD PTR [rcx+rdx*8]
cmp QWORD PTR [rcx+rax*8], r9
cmovb rdx, rax
add rax, 1
cmp rsi, rax
jne .L13
cmp rdx, r8
je .L14
lea rax, [rcx+rdx*8]
mov r8, QWORD PTR [rcx-8+rdi*8]
mov rdx, QWORD PTR [rax]
mov QWORD PTR [rax], r8
mov QWORD PTR [rcx-8+rdi*8], rdx
.L14:
mov r8, rdi
lea rdi, [r8+1]
cmp rdi, rsi
jb .L18
.L10:
ret
The -O3
/ -O2 -ftree-partial-pre
version
selection_nostore_sort(std::vector<unsigned long, std::allocator<unsigned long> >&):
mov rax, QWORD PTR [rdi+8]
mov r10, QWORD PTR [rdi]
mov r9, rax
sub r9, r10
sar r9, 3
cmp r10, rax
je .L21
push r12
lea r11, [r10+8]
push rbp
push rbx
xor ebx, ebx
lea rbp, [rbx+1]
cmp rbp, r9
jnb .L10
.L26:
mov r12, QWORD PTR [r11-8]
mov rax, r11
mov rdx, rbp
mov r8, rbx
mov rsi, r12
jmp .L14
.L25:
add rdx, 1
lea rdi, [r10+r8*8]
add rax, 8
cmp rdx, r9
je .L24
.L14:
mov rcx, QWORD PTR [rax]
mov rdi, rax
cmp rcx, rsi
jnb .L25
mov r8, rdx
add rdx, 1
mov rsi, rcx
add rax, 8
cmp rdx, r9
jne .L14
.L24:
cmp r8, rbx
je .L15
mov QWORD PTR [rdi], r12
mov QWORD PTR [r11-8], rsi
.L15:
mov rbx, rbp
add r11, 8
lea rbp, [rbx+1]
cmp rbp, r9
jb .L26
.L10:
pop rbx
pop rbp
pop r12
ret
.L21:
ret