Varint64 is a variable-length data format.
Varint64: a uint64 is split into 7-bit groups, least-significant group first, and each group is stored in one byte (up to 10 bytes for a full 64-bit value). Each byte holds a flag bit in its highest bit and 7 data bits in its lower bits. When the flag bit is 1, the next byte still belongs to this uint64; otherwise this byte is the last (most significant) byte of this uint64. https://developers.google.com/protocol-buffers/docs/encoding
The data stream looks like this:
// Example byte stream: each row is one varint64; a byte below 128 (flag bit
// clear) terminates the value. 31 initialisers are given for 32 slots, so the
// final byte is implicitly 0.
uint8_t p[32] = {
    128, 129, 130, 131, 132, 133, 0,       // varint 0: p[0]..p[6]
    128, 129, 130, 131, 132, 134, 0,       // varint 1: p[7]..p[13]
    131, 132, 133, 134, 0,                 // varint 2: p[14]..p[18]
    128, 129, 130, 131, 132, 133, 134, 0,  // varint 3: p[19]..p[26]
    128, 129, 130, 131                     // start of the next varint
};
The desired __m256i looks like this:
The __m256i is split into four 64-bit elements (m64i):
m64i[0] = p[0]:p[6]
m64i[1] = p[7]:p[13]
m64i[2] = p[14]:p[18]
m64i[3] = p[19]:p[26]
I want to use AVX2/SSE to parse varint64. First, I need to align the raw stream into an __m256i so that I can do the next step. I find that aligning the data is very time-consuming. Is there a good way to quickly get the data for the first four varint64 elements from the byte stream into the 64-bit elements of an __m256i?
My fastest alignment code so far:
// SetBitMask(x): 64-bit mask with the low x bytes (8*x bits) set.
// For x >= 8 every bit is set, which also avoids an out-of-range 64-bit shift.
// Note: being a macro, the argument may be evaluated more than once.
#define SetBitMask(x) \
    (((x) < 8) ? ((1ULL << ((x) << 3)) - 1ULL) : ~0ULL)
inline __m256i _mm256_align_epi64_2(const uint8_t* p) {
auto b = _mm256_loadu_si256((__m256i*)(p));
auto bitmask = _mm256_movemask_epi8(b);
auto bm_not = ~bitmask;
auto first_len = __builtin_ctz(bm_not) + 1;
bm_not = bm_not >> first_len;
auto second_len = __builtin_ctz(bm_not) + 1;
bm_not = bm_not >> second_len;
auto third_len = __builtin_ctz(bm_not) + 1;
bm_not = bm_not >> third_len;
auto fourth_len = __builtin_ctz(bm_not) + 1;
auto n1 = (*(uint64_t*)(p+=0)) & SetBitMask(first_len);
auto n2 = (*(uint64_t*)(p+=first_len)) & SetBitMask(second_len);
auto n3 = (*(uint64_t*)(p+=second_len)) & SetBitMask(third_len);
auto n4 = (*(uint64_t*)(p+=third_len)) & SetBitMask(fourth_len);
return _mm256_set_epi64x(n1, n2, n3, n4);
}
cat /proc/cpuinfo
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca
model name : AMD EPYC 7642 48-Core Processor