Context
When implementing generic functions in C that work with a range of types, void*
is regularly used. The libc
function qsort()
is a classic example. Internally qsort()
and many other algorithms require a swap()
function.
A simple but typical implementation of a generic swap looks like this:
void swap(void* x, void* y, size_t size) {
char t[size];
memcpy(t, x, size);
memcpy(x, y, size);
memcpy(y, t, size);
}
For larger types a byte-by-byte swap can be used, or malloc
which would be slow, but the focus here is on what happens when this generic swap()
is used for small types.
A better generic swap?
It turns out that if we match for some common type sizes (int and long for 4 and 8 bytes on x86_64) which also cover float, double, pointer etc, that we can get a surprising performance improvement:
void swap(void* x, void* y, size_t size) {
if (size == sizeof(int)) {
int t = *((int*)x);
*((int*)x) = *((int*)y);
*((int*)y) = t;
} else if (size == sizeof(long)) {
long t = *((long*)x);
*((long*)x) = *((long*)y);
*((long*)y) = t;
} else {
char t[size];
memcpy(t, x, size);
memcpy(x, y, size);
memcpy(y, t, size);
}
}
NB: This could clearly be refined to using #if
rather than if/else
and for more types.
In the context of the below generic quicksort()
implementation, the above swap gives ~2x performance improvement for a 10,000,000 random int sort, compared with the more standard memcpy()
only swap at top. This is using gcc-9 or clang-10 on ubuntu 20.04 with -O3
.
Question
This seems like a remarkable result.
- Does this break any standards?
- Can anyone validate this?
- What makes this gain possible? Is it simply copying in "wider words" or is the some compiler optimization / inlining at play?
- If it is indeed valid, why is it not being done already? Or is it?
NB: I have not examined the generated assembly code - yet.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
typedef bool (*cmp)(const void*, const void*);
bool cmp_ints_less(const void* a, const void* b) {
return *(const int*)a < *(const int*)b;
}
bool cmp_ints_greater(const void* a, const void* b) {
return *(const int*)a > *(const int*)b;
}
bool cmp_floats_less(const void* a, const void* b) {
return *(const float*)a < *(const float*)b;
}
bool cmp_floats_greater(const void* a, const void* b) {
return *(const float*)a > *(const float*)b;
}
bool cmp_doubles_less(const void* a, const void* b) {
return *(const double*)a < *(const double*)b;
}
bool cmp_doubles_greater(const void* a, const void* b) {
return *(const double*)a > *(const double*)b;
}
bool cmp_strs_less(const void* a, const void* b) {
return strcmp(*((const char**)a), *((const char**)b)) < 0;
}
bool cmp_strs_greater(const void* a, const void* b) {
return strcmp(*((const char**)a), *((const char**)b)) > 0;
}
void swap(void* x, void* y, size_t size) {
if (size == sizeof(int)) {
int t = *((int*)x);
*((int*)x) = *((int*)y);
*((int*)y) = t;
} else if (size == sizeof(long)) {
long t = *((long*)x);
*((long*)x) = *((long*)y);
*((long*)y) = t;
} else {
char t[size];
memcpy(t, x, size);
memcpy(x, y, size);
memcpy(y, t, size);
}
}
void* partition(void* start, void* end, size_t size, cmp predicate) {
if (start == NULL || end == NULL || start == end) return start;
char* storage = (char*)start;
char* last = (char*)end - size; // used as pivot
for (char* current = start; current != last; current += size) {
if (predicate(current, last)) {
swap(current, storage, size);
storage += size;
}
}
swap(storage, last, size);
return storage; // returns position of pivot
}
void quicksort(void* start, void* end, size_t size, cmp predicate) {
if (start == end) return;
void* middle = partition(start, end, size, predicate);
quicksort(start, middle, size, predicate);
quicksort((char*)middle + size, end, size, predicate);
}
void print(const int* start, int size) {
for (int i = 0; i < size; ++i) printf("%3d", start[i]);
printf("\n");
}
void rand_seed() {
int seed = 0;
FILE* fp = fopen("/dev/urandom", "re");
if (!fp) {
fprintf(stderr, "Warning: couldn't open source of randomness, falling back to time(NULL)");
srand(time(NULL));
return;
}
if (fread(&seed, sizeof(int), 1, fp) < 1) {
fprintf(stderr, "Warning: couldn't read random seed, falling back to time(NULL)");
fclose(fp);
srand(time(NULL));
return;
}
fclose(fp);
srand(seed); // nice seed for rand()
}
int rand_range(int start, int end) {
return start + rand() / (RAND_MAX / (end - start + 1) + 1);
}
int main() {
// int demo
rand_seed();
#define int_count 20
int* ints = malloc(int_count * sizeof(int));
if (!ints) {
fprintf(stderr, "couldn't allocate memory");
exit(EXIT_FAILURE);
}
for (int i = 0; i < int_count; ++i) ints[i] = rand_range(1, int_count / 2);
print(ints, int_count);
quicksort(ints, ints + int_count, sizeof(int), &cmp_ints_less);
print(ints, int_count);
free(ints);
// string demo
const char* strings[] = {
"material", "rare", "fade", "aloof", "way", "torpid",
"men", "purring", "abhorrent", "unpack", "zinc", "unsightly",
};
const int str_count = sizeof(strings) / sizeof(strings[0]);
quicksort(strings, strings + str_count, sizeof(char*), &cmp_strs_greater);
for (int i = 0; i < str_count; ++i) printf("%s\n", strings[i]);
// double demo
#define dbl_count 20
double doubles[dbl_count];
for (int i = 0; i < dbl_count; ++i) doubles[i] = rand() / (RAND_MAX / 100.0);
quicksort(doubles, doubles + dbl_count, sizeof(char*), &cmp_doubles_less);
for (int i = 0; i < dbl_count; ++i) printf("%20.16f\n", doubles[i]);
return EXIT_SUCCESS;
}
EDIT:
Just for reference Compiler Explorer reports the very obvious following assembly for alternative generic swap()
:
The sample main()
there is:
int main() {
int two = 2;
int three = 3;
swap(&two, &three, sizeof(int));
swap2(&two, &three, sizeof(int));
return two - three;
}
full assembler for swap2()
below, but it is noteworthy that the compiler has inlined swap2()
but not swap()
which of course containes further calls to memcopy
. This might be some (all?) of the difference?
swap2:
push rbp
mov rbp, rsp
push r14
mov r14, rdi
push r13
mov r13, rsi
push r12
push rbx
cmp rdx, 4
je .L9
mov r12, rdx
cmp rdx, 8
jne .L7
mov rax, QWORD PTR [rdi]
mov rdx, QWORD PTR [rsi]
mov QWORD PTR [rdi], rdx
mov QWORD PTR [rsi], rax
lea rsp, [rbp-32]
pop rbx
pop r12
pop r13
pop r14
pop rbp
ret
.L7:
lea rax, [rdx+15]
mov rbx, rsp
mov rsi, rdi
and rax, -16
sub rsp, rax
mov rdi, rsp
call memcpy
mov rdx, r12
mov rsi, r13
mov rdi, r14
call memcpy
mov rdx, r12
mov rsi, rsp
mov rdi, r13
call memcpy
mov rsp, rbx
lea rsp, [rbp-32]
pop rbx
pop r12
pop r13
pop r14
pop rbp
ret
.L9:
mov eax, DWORD PTR [rdi]
mov edx, DWORD PTR [rsi]
mov DWORD PTR [rdi], edx
mov DWORD PTR [rsi], eax
lea rsp, [rbp-32]
pop rbx
pop r12
pop r13
pop r14
pop rbp
ret