Edit: the last method described below is by far the fastest after all, giving a 5+× speedup on smaller datasets; the problem was the benchmark itself, which incurred page faults and worked on far more data than fits in cache.
My problem is the following: I have a large piece of data in memory. I want to generate the result of spreading out that data by a factor of three (what I'll call triplification, for lack of a better word), with the intermediate bits being 0. To be explicit, I want
11011011…
to be converted into
100100000100100000100100…
(where the convention is little-endian). What I’ve tried:
- Straightforwardly iterating over the bytes and using a 256-entry 8-bit lookup table, generating 24 bits at a time. Pretty fast since the table is small and probably hot in L1d. (~7.2 cycles / byte)
- Using a 65536-entry 16-bit lookup table and unrolling the loop somewhat. (~5.01 c/byte)
- Using a series of pdep (bit deposit) instructions, grabbing sets of 21 or 22 bits and performing bit deposition into a mask, each time generating 64 bits. (~4.63 c/byte)
I expected the last method to be significantly faster, but apparently it isn't. Even though the loop is heavily unrolled, the pdep instructions should execute in parallel (throughput of 1 per cycle, latency of 3 cycles), and shifts are fairly fast, it's bottlenecking somewhere I can't identify. Given the fairly regular form of the output, I thought there could be a clever bithack for this (especially on AVX2, where we have vpsllvq and brethren), but I couldn't think of a particularly good one. What is the fastest way to perform this operation (on large datasets, and assuming 32-byte alignment)?
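For concreteness, here is a minimal standalone sanity check (not one of the timed methods) of the mapping on the example byte above: 0b11011011 read little-endian is 0xDB, and depositing it into the same mask the pdep version below uses should put bits at positions 0, 3, 9, 12, 18, 21:
// Sanity check only; compile with -mbmi2 (or -march=haswell).
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
int main(void) {
    // Bits 0,1,3,4,6,7 of 0xDB land at output positions 0,3,9,12,18,21.
    uint64_t spread = _pdep_u64(0xDBu, 0x9249249249249249ULL); // same constant as BIT_MASK_1 below
    assert(spread == 0x241209u);
    return 0;
}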
I’m on Haswell (a 2.7 GHz quad-core Intel Core i7), so I have access to AVX2 and BMI2, but nothing newer. My code:
// gcc triple.c -o triple -march=haswell -g -O2
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // aligned_alloc
#include <time.h>
#include <limits.h>
#ifndef __AVX2__
#error "AVX2 needed"
#endif
#ifndef __BMI2__
#error "BMI2 needed"
#endif
const size_t SRC_SIZE = 1ULL << 30; // bytes
uint32_t lut[256];
// Probably too big to fit in L1d...
uint64_t lut16[65536];
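// (lut is 256 x 4 bytes = 1 KiB, easily L1d-resident; lut16 is 65536 x 8 bytes
//  = 512 KiB, which exceeds both the 32 KiB L1d and the 256 KiB L2 on Haswell.)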
// Reference implementation, working byte by byte (~7.2 cycles / byte)
void basic_stretch_data(void* src, size_t bytes, void* dst) {
    uint8_t* src_c = src;
    uint8_t* dst_c = dst;
    for (size_t i = 0; i < bytes; ++i) {
        uint32_t t = lut[src_c[i]]; // lut filled below
        dst_c[3*i] = (t & 0xff);
        dst_c[3*i+1] = (t & 0xff00) >> 8;
        dst_c[3*i+2] = (t & 0xff0000) >> 16;
    }
}
// Attempts to be a bit more clever, working in 16-bit chunks and writing in 64-bit chunks (~5.01 cycles / byte).
// Expects 8-byte alignment.
void improved_stretch_data(void* src, size_t bytes, void* dst) {
    uint16_t* src_c = src;
    uint64_t* dst_c = dst;
    for (size_t i = 0; i < bytes / 2; i += 4) {
        // Each lut16 entry holds 48 significant bits; four of them are stitched
        // into three 64-bit stores, i.e. 24 output bytes per iteration.
        uint64_t t1 = lut16[src_c[i]]; // lut16 filled below
        uint64_t t2 = lut16[src_c[i+1]];
        uint64_t t3 = lut16[src_c[i+2]];
        uint64_t t4 = lut16[src_c[i+3]];
        dst_c[3*i/4]   = t1 | ((t2 & 0xffff) << 48);
        dst_c[3*i/4+1] = ((t2 & 0xffffffff0000) >> 16) | ((t3 & 0xffffffff) << 32);
        dst_c[3*i/4+2] = ((t3 & 0xffff00000000) >> 32) | (t4 << 16);
    }
}
// Every three bits toggled, in one of three locations
#define BIT_MASK_1 0x9249249249249249ULL
#define BIT_MASK_2 (BIT_MASK_1 >> 1)
#define BIT_MASK_3 (BIT_MASK_1 >> 2)
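// BIT_MASK_1 has bits 0, 3, 6, ..., 63 set (22 deposit positions); BIT_MASK_2
// has bits 2, 5, ..., 62 (21 positions) and BIT_MASK_3 has bits 1, 4, ..., 61
// (21 positions). The first output word holds input bits 0-21, the second bits
// 22-42, the third bits 43-63 (matching the >> 22 and >> 43 shifts below), so
// the three words continue the every-third-bit pattern across all 192 output bits.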
// Expects 32-byte alignment. ~3.02 cycles / byte
void pdep_stretch_data(void* src, size_t bytes, void* dst) {
    uint64_t* src_c = src;
    uint64_t* dst_c = dst;
    for (size_t i = 0; i < bytes / 8; ) {
        // Output 96 bytes (4 x 24) per loop iteration
        uint64_t k;
#define WRITE_PARTS \
        k = src_c[i]; \
        dst_c[3*i]   = _pdep_u64(k,       BIT_MASK_1); /* first 22 bits (little endian) */ \
        dst_c[3*i+1] = _pdep_u64(k >> 22, BIT_MASK_2); /* middle 21 bits */ \
        dst_c[3*i+2] = _pdep_u64(k >> 43, BIT_MASK_3); /* last 21 bits */ \
        i += 1;
        WRITE_PARTS
        WRITE_PARTS
        WRITE_PARTS
        WRITE_PARTS
    }
}
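// Reference bit-spreader used only to fill the lookup tables: bit i of the
// input ends up at bit 3*i of the output, because (b & (1 << i)) already sits
// at position i and is then shifted left by a further 2*i.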
#define DEFINE_TRIPLIFY(name, IntType, Out) Out name(IntType b) { \
    Out s = 0; \
    for (int i = 0; i < sizeof(IntType) * CHAR_BIT; ++i) { \
        s |= (Out)(b & (1 << i)) << (2 * i); \
    } \
    return s; \
}
DEFINE_TRIPLIFY(triplify8, uint8_t, uint32_t)
DEFINE_TRIPLIFY(triplify16, uint16_t, uint64_t)
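// The fill loops below rely on unsigned wrap-around: the do/while body runs
// once for every possible 8-bit (resp. 16-bit) value and stops when the
// counter wraps back to 0.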
void fill_triplify_lut() {
    uint8_t b = 0;
    do {
        lut[b] = triplify8(b);
    } while (++b != 0);
}
void fill_triplify_lut16() {
    uint16_t b = 0;
    do {
        lut16[b] = triplify16(b);
    } while (++b != 0);
}
clock_t c;
void clock_start() {
    c = clock();
}
void clock_end(const char* msg) {
    printf("Clock %s took %f s\n", msg, (double)(clock() - c) / CLOCKS_PER_SEC);
}
int main() {
    fill_triplify_lut();
    fill_triplify_lut16();
    char* src = aligned_alloc(32, SRC_SIZE);
    char* dst = aligned_alloc(32, 3 * SRC_SIZE);
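    // 1 GiB of source plus a 3 GiB destination (and another 3 GiB for dst2
    // below): far larger than any cache level, and the freshly mapped pages
    // mean the timed loops also absorb page faults (see the edit at the top).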
printf("Initializing data\n");
for (size_t i = 0; i < SRC_SIZE; ++i) {
src[i] = i;
}
printf("Initialized data\n");
// Stretch data so that each bit becomes three bits in the result, with two of them 0
clock_start();
pdep_stretch_data(src, SRC_SIZE, dst);
clock_end("pdep finished");
char* dst2 = aligned_alloc(32, 3 * SRC_SIZE);
clock_start();
basic_stretch_data(src, SRC_SIZE, dst2);
clock_end("basic finished");
// Validate
for (size_t i = 0; i < 3 * SRC_SIZE; ++i) {
if (dst[i] != dst2[i]) {
printf("Results do not match at byte %zu. Expected: %i; got: %i.\n", i, dst2[i], dst[i]);
return 0;
}
}
}
Disassembly
Second method:
improved_stretch_data: # @improved_stretch_data
cmp rsi, 2
jb .LBB1_3
shr rsi
xor r9d, r9d
vmovdqa xmm0, xmmword ptr [rip + .LCPI1_0] # xmm0 = [16,32]
vpxor xmm1, xmm1, xmm1
vmovdqa xmm2, xmmword ptr [rip + .LCPI1_1] # xmm2 = [32,16]
xor ecx, ecx
.LBB1_2: # =>This Inner Loop Header: Depth=1
movzx r8d, word ptr [rdi + 2*rcx]
movzx eax, word ptr [rdi + 2*rcx + 2]
mov rax, qword ptr [8*rax + lut16]
movzx r10d, word ptr [rdi + 2*rcx + 4]
vmovq xmm3, rax
shl rax, 48
or rax, qword ptr [8*r8 + lut16]
movzx r8d, word ptr [rdi + 2*rcx + 6]
vmovq xmm4, qword ptr [8*r10 + lut16] # xmm4 = mem[0],zero
vmovq xmm5, qword ptr [8*r8 + lut16] # xmm5 = mem[0],zero
mov qword ptr [rdx + 2*r9], rax
vpunpcklqdq xmm3, xmm3, xmm4 # xmm3 = xmm3[0],xmm4[0]
vpsrlvq xmm3, xmm3, xmm0
vpblendw xmm3, xmm1, xmm3, 19 # xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
vpunpcklqdq xmm4, xmm4, xmm5 # xmm4 = xmm4[0],xmm5[0]
vpsllvq xmm4, xmm4, xmm2
vpor xmm3, xmm4, xmm3
vmovdqu xmmword ptr [rdx + 2*r9 + 8], xmm3
add rcx, 4
add r9, 12
cmp rcx, rsi
jb .LBB1_2
.LBB1_3:
ret
Third method:
pdep_stretch_data: # @pdep_stretch_data
push r15
push r14
push rbx
cmp rsi, 8
jb .LBB2_3
shr rsi, 3
mov eax, 9
xor r15d, r15d
movabs r8, -7905747460161236407
movabs r9, 5270498306774157604
movabs r10, 2635249153387078802
.LBB2_2: # =>This Inner Loop Header: Depth=1
mov rcx, qword ptr [rdi + 8*r15]
pdep r14, rcx, r8
lea r11, [8*rax]
mov qword ptr [rdx + 8*rax - 72], r14
mov rbx, rcx
shr rbx, 22
pdep rbx, rbx, r9
mov qword ptr [rdx + 8*rax - 64], rbx
shr rcx, 43
pdep rcx, rcx, r10
mov qword ptr [rdx + 8*rax - 56], rcx
mov rcx, qword ptr [rdi + 8*r15 + 8]
pdep rbx, rcx, r8
mov qword ptr [rdx + 8*rax - 48], rbx
mov rbx, rcx
shr rbx, 22
pdep rbx, rbx, r9
mov qword ptr [rdx + 8*rax - 40], rbx
shr rcx, 43
pdep rcx, rcx, r10
mov qword ptr [rdx + 8*rax - 32], rcx
mov rcx, qword ptr [rdi + 8*r15 + 16]
pdep rbx, rcx, r8
mov qword ptr [rdx + 8*rax - 24], rbx
mov rbx, rcx
shr rbx, 22
pdep rbx, rbx, r9
mov qword ptr [rdx + 8*rax - 16], rbx
shr rcx, 43
pdep rcx, rcx, r10
mov qword ptr [rdx + 8*rax - 8], rcx
mov rcx, qword ptr [rdi + 8*r15 + 24]
pdep rbx, rcx, r8
mov qword ptr [rdx + 8*rax], rbx
mov rbx, rcx
shr rbx, 22
pdep rbx, rbx, r9
mov qword ptr [rdx + 8*rax + 8], rbx
shr rcx, 43
pdep rcx, rcx, r10
or r11, 16
mov qword ptr [rdx + r11], rcx
add r15, 4
add rax, 12
cmp r15, rsi
jb .LBB2_2
.LBB2_3:
pop rbx
pop r14
pop r15
ret