I'm making a small game engine for personal use. The target architecture is x86_64 preferably with SSE2.
The sine/cosine function is one of the core parts, and it's implemented as a precomputed table of 1024 cosine values for the input range [0, π / 2]
.
The scalar implementation is quite straightforward.
typedef unsigned uns;
typedef float flt;
enum {COS_TABLE_SIZE = 1 << 10};
extern flt COS_TABLE[COS_TABLE_SIZE];
flt f(uns i) {
flt *t = COS_TABLE;
uns z = COS_TABLE_SIZE;
switch (i / z) {
case 0:
return +t[+(i - z * 0) + 0];
case 1:
return -t[-(i - z * 1) + z];
case 2:
return -t[+(i - z * 2) + 0];
case 3:
return +t[-(i - z * 3) + z];
default:
__builtin_unreachable();
}
}
The code isn't tested yet, so there could be an error in math.
Modern compilers aren't sophisticated enough to generate good code for the naivest approach to vectorization.
typedef uns u32;
typedef u32 vec_u32 __attribute__((vector_size(16)));
typedef flt vec_flt __attribute__((vector_size(16)));
vec_flt fv(vec_u32 i) {
vec_flt r;
for (uns j = 0; j < 4; ++j) {
r[j] = f(i[j]);
}
return r;
}
Both GCC and Clang produce horrible code for fv
. So I decided to do the vectorization manually.
When you have a look below, the code above /*---*/
isn't related much to this question. The branching in the scalar version is converted to a branchless vectorized version. Please do comment if there is room for improvement in that part.
Anyway, this question is about the lines below /*---*/
. The given problem is to create a vector from a vector of indices by doing table look-ups with those indices. In C, the upper part looks more complex, but in machine level the lower part is more expensive. Extracting each index value separately from a vector and then reconstructing a vector with the results doesn't seem to be a simple task.
What is an efficient way to deal with such problem? It is a personal project, and any kind of restructuring is always possible.
I prefer SSE2 for portability, but if there is a better solution available in later extensions, it would be good to know.
static uns uns_log2(uns x) {
if (__builtin_constant_p(x)) {
return 31 - __builtin_clz(x);
}
uns r = 0;
__asm__ ("bsr\t%0, %1" : "+r"(r) : "r"(x));
return r;
}
static u32 flt_reint_u32(flt x) {
u32 r;
memcpy(&r, &x, sizeof(x));
return r;
}
static flt u32_reint_flt(u32 x) {
flt r;
memcpy(&r, &x, sizeof(x));
return r;
}
static vec_u32 vec_u32_fill(u32 x) {
return (vec_u32){x, x, x, x};
}
vec_flt fv2(vec_u32 i) {
flt *t = COS_TABLE;
uns z = COS_TABLE_SIZE;
vec_u32 q = i >> uns_log2(z);
i -= q << uns_log2(z);
vec_u32 c = q == 1 | q == 3;
i = i & ~c | z - i & c;
vec_u32 s = vec_u32_fill(0x80000000);
s &= ~(q == 0 | q == 3);
/*---*/
vec_flt r;
for (uns j = 0; j < 4; ++j) {
r[j] = u32_reint_flt(flt_reint_u32(t[i[j]]) ^ s[j]);
}
return r;
}