#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

void ascii_pack_neon(uint8_t *pBin, uint8_t *pAscii, intptr_t len)
{
    assert(len >= 64);
    assert((len & 63) == 0);

    uint8x8x4_t ina, inb, outa;
    uint8x8x3_t outb;
    uint8x8_t row1, row2, row3, row4, row5, row6, row7;

    do {
        len -= 64;
        // ld4 deinterleaves: after the two loads the input bytes are split
        // by position modulo 4
        ina = vld4_u8(pAscii); pAscii += 32;
        inb = vld4_u8(pAscii); pAscii += 32;

        // finish transposing: rowN now holds byte N of each of the eight
        // 8-byte groups, one group per lane
        outa.val[0] = vuzp1_u8(ina.val[0], inb.val[0]);
        row1 = vuzp1_u8(ina.val[1], inb.val[1]);
        row2 = vuzp1_u8(ina.val[2], inb.val[2]);
        row3 = vuzp1_u8(ina.val[3], inb.val[3]);
        row4 = vuzp2_u8(ina.val[0], inb.val[0]);
        row5 = vuzp2_u8(ina.val[1], inb.val[1]);
        row6 = vuzp2_u8(ina.val[2], inb.val[2]);
        row7 = vuzp2_u8(ina.val[3], inb.val[3]);

        // shift each row's leftover high bits down, then splice in the low
        // bits of the next row (sli keeps the low bits of the destination),
        // yielding 7 packed bytes per lane across outa and outb
        outa.val[1] = vshr_n_u8(row1, 1);
        outa.val[2] = vshr_n_u8(row2, 2);
        outa.val[3] = vshr_n_u8(row3, 3);
        outb.val[0] = vshr_n_u8(row4, 4);
        outb.val[1] = vshr_n_u8(row5, 5);
        outb.val[2] = vshr_n_u8(row6, 6);

        outa.val[0] = vsli_n_u8(outa.val[0], row1, 7);
        outa.val[1] = vsli_n_u8(outa.val[1], row2, 6);
        outa.val[2] = vsli_n_u8(outa.val[2], row3, 5);
        outa.val[3] = vsli_n_u8(outa.val[3], row4, 4);
        outb.val[0] = vsli_n_u8(outb.val[0], row5, 3);
        outb.val[1] = vsli_n_u8(outb.val[1], row6, 2);
        outb.val[2] = vsli_n_u8(outb.val[2], row7, 1);

        // store the 7 packed bytes of each group: 4 via st4_lane + 3 via st3_lane
        vst4_lane_u8(pBin, outa, 0); pBin += 4;
        vst3_lane_u8(pBin, outb, 0); pBin += 3;
        vst4_lane_u8(pBin, outa, 1); pBin += 4;
        vst3_lane_u8(pBin, outb, 1); pBin += 3;
        vst4_lane_u8(pBin, outa, 2); pBin += 4;
        vst3_lane_u8(pBin, outb, 2); pBin += 3;
        vst4_lane_u8(pBin, outa, 3); pBin += 4;
        vst3_lane_u8(pBin, outb, 3); pBin += 3;
        vst4_lane_u8(pBin, outa, 4); pBin += 4;
        vst3_lane_u8(pBin, outb, 4); pBin += 3;
        vst4_lane_u8(pBin, outa, 5); pBin += 4;
        vst3_lane_u8(pBin, outb, 5); pBin += 3;
        vst4_lane_u8(pBin, outa, 6); pBin += 4;
        vst3_lane_u8(pBin, outb, 6); pBin += 3;
        vst4_lane_u8(pBin, outa, 7); pBin += 4;
        vst3_lane_u8(pBin, outb, 7); pBin += 3;
    } while (len);
}
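For reference, here is a plain scalar sketch of the packing both NEON routines implement: every group of 8 ASCII bytes (7 significant bits each) becomes 7 output bytes, low bits first. The function name and loop are mine, for illustration only; it is handy as a known-good baseline.

#include <stdint.h>

// Scalar reference (illustrative, not optimized). len must be a positive
// multiple of 8; the output buffer needs len / 8 * 7 bytes.
static void ascii_pack_scalar(uint8_t *pBin, const uint8_t *pAscii, intptr_t len)
{
    for (intptr_t i = 0; i < len; i += 8) {
        uint64_t v = 0;
        for (int j = 0; j < 8; j++)           // gather 8 x 7 bits, LSB first
            v |= (uint64_t)(pAscii[i + j] & 0x7f) << (7 * j);
        for (int j = 0; j < 7; j++)           // emit the low 56 bits as 7 bytes
            *pBin++ = (uint8_t)(v >> (8 * j));
    }
}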
Below is the conventional version without transposing, which ends up much longer than the transposing one:
static inline uint64x1_t pack8(uint64x1_t in)
{
    // Pack eight 7-bit values into the low 56 bits of the lane:
    // byte pairs -> 14-bit fields, then 28-bit fields, then one 56-bit field.
    // The top 8 bits are left as don't-care and are overwritten by the caller.
    const uint64x1_t mask1 = vdup_n_u64(0x007f007f007f007f);
    const uint64x1_t mask2 = vdup_n_u64(0x00003fff00003fff);
    const uint64x1_t mask4 = vdup_n_u64(0x000000000fffffff);

    in = vbsl_u64(mask1, in, vshr_n_u64(in, 1));
    in = vbsl_u64(mask2, in, vshr_n_u64(in, 2));
    in = vbsl_u64(mask4, in, vshr_n_u64(in, 4));
    return in;
}
void ascii_pack_neon_conventional(uint8_t *pBin, uint8_t *pAscii, intptr_t len)
{
    // assert(len >= 64);
    // assert((len & 63) == 0);
    uint64x1x4_t ina, inb, outa;
    uint64x1x3_t outb;
    uint64x1_t row1, row2, row3, row4, row5, row6, row7;

    do {
        len -= 64;
        ina = vld1_u64_x4((uint64_t *)pAscii); pAscii += 32;
        inb = vld1_u64_x4((uint64_t *)pAscii); pAscii += 32;

        // pack each 8-byte group into 56 bits
        outa.val[0] = pack8(ina.val[0]);
        row1 = pack8(ina.val[1]);
        row2 = pack8(ina.val[2]);
        row3 = pack8(ina.val[3]);
        row4 = pack8(inb.val[0]);
        row5 = pack8(inb.val[1]);
        row6 = pack8(inb.val[2]);
        row7 = pack8(inb.val[3]);

        // funnel the eight 56-bit groups into seven contiguous 64-bit words
        outa.val[1] = vshr_n_u64(row1, 8);
        outa.val[2] = vshr_n_u64(row2, 16);
        outa.val[3] = vshr_n_u64(row3, 24);
        outb.val[0] = vshr_n_u64(row4, 32);
        outb.val[1] = vshr_n_u64(row5, 40);
        outb.val[2] = vshr_n_u64(row6, 48);

        outa.val[0] = vsli_n_u64(outa.val[0], row1, 56);
        outa.val[1] = vsli_n_u64(outa.val[1], row2, 48);
        outa.val[2] = vsli_n_u64(outa.val[2], row3, 40);
        outa.val[3] = vsli_n_u64(outa.val[3], row4, 32);
        outb.val[0] = vsli_n_u64(outb.val[0], row5, 24);
        outb.val[1] = vsli_n_u64(outb.val[1], row6, 16);
        outb.val[2] = vsli_n_u64(outb.val[2], row7, 8);

        vst1_u64_x4((uint64_t *)pBin, outa); pBin += 32;
        vst1_u64_x3((uint64_t *)pBin, outb); pBin += 24;
    } while (len);
}
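To sanity-check the intrinsics against the scalar baseline above, a minimal test driver could look like the following (the buffer sizes and test pattern are mine; note that every 64 input bytes produce 56 output bytes):

#include <stdio.h>
#include <string.h>

// Minimal check of ascii_pack_neon against the scalar reference.
int main(void)
{
    enum { LEN = 256 };                        // must be a multiple of 64
    static uint8_t ascii[LEN], neon_out[LEN / 8 * 7], ref_out[LEN / 8 * 7];

    for (int i = 0; i < LEN; i++)
        ascii[i] = (uint8_t)(' ' + i % 95);    // printable ASCII test pattern

    ascii_pack_neon(neon_out, ascii, LEN);
    ascii_pack_scalar(ref_out, ascii, LEN);

    puts(memcmp(neon_out, ref_out, sizeof ref_out) ? "MISMATCH" : "match");
    return 0;
}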
It seems that GCC is the culprit here: godbolt link (transposing). And GCC keeps being a disaster even in the conventional version. Conclusion: ditch GCC. Use Clang instead, or better yet, write in assembly:
    .arch armv8-a
    .global ascii_pack_asm_transpose, ascii_pack_asm_conventional
    .text

pBin    .req    x0
pAscii  .req    x1
len     .req    w2

    .balign 64
    .func
ascii_pack_asm_transpose:
1:
    ld4     {v16.8b, v17.8b, v18.8b, v19.8b}, [pAscii], #32
    ld4     {v20.8b, v21.8b, v22.8b, v23.8b}, [pAscii], #32
    subs    len, len, #64

    // ld4 + uzp1/uzp2 transpose the block: v0, v24..v30 hold byte 0..7
    // of each 8-byte group, one group per lane
    uzp1    v0.8b, v16.8b, v20.8b
    uzp1    v24.8b, v17.8b, v21.8b
    uzp1    v25.8b, v18.8b, v22.8b
    uzp1    v26.8b, v19.8b, v23.8b
    uzp2    v27.8b, v16.8b, v20.8b
    uzp2    v28.8b, v17.8b, v21.8b
    uzp2    v29.8b, v18.8b, v22.8b
    uzp2    v30.8b, v19.8b, v23.8b

    ushr    v1.8b, v24.8b, #1
    ushr    v2.8b, v25.8b, #2
    ushr    v3.8b, v26.8b, #3
    ushr    v4.8b, v27.8b, #4
    ushr    v5.8b, v28.8b, #5
    ushr    v6.8b, v29.8b, #6

    sli     v0.8b, v24.8b, #7
    sli     v1.8b, v25.8b, #6
    sli     v2.8b, v26.8b, #5
    sli     v3.8b, v27.8b, #4
    sli     v4.8b, v28.8b, #3
    sli     v5.8b, v29.8b, #2
    sli     v6.8b, v30.8b, #1

    // store 7 packed bytes per group (st4 + st3 per lane)
    st4     {v0.b, v1.b, v2.b, v3.b}[0], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[0], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[1], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[1], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[2], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[2], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[3], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[3], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[4], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[4], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[5], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[5], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[6], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[6], [pBin], #3
    st4     {v0.b, v1.b, v2.b, v3.b}[7], [pBin], #4
    st3     {v4.b, v5.b, v6.b}[7], [pBin], #3

    b.gt    1b

    .balign 16
    ret
    .endfunc
/////////////////////////////////////////////////////////////
    .balign 64
    .func
ascii_pack_asm_conventional:
    adr     x3, 2f
    sub     pAscii, pAscii, #16
    sub     pBin, pBin, #8
    movi    v0.4h, #0x007f              // mask1
    ldp     d1, d2, [x3]                // mask2, mask4
    b       1f

    .balign 16
2:
    .long   0x00003fff, 0x00003fff
    .long   0x0fffffff, 0x00000000

    .balign 64
1:
    ldp     d16, d17, [pAscii, #16]
    ldp     d18, d19, [pAscii, #32]
    ldp     d20, d21, [pAscii, #48]
    ldp     d22, d23, [pAscii, #64]!
    subs    len, len, #64

    // stage 1: merge byte pairs into 14-bit fields
    ushr    d24, d16, #1
    ushr    d25, d17, #1
    ushr    d26, d18, #1
    ushr    d27, d19, #1
    ushr    d28, d20, #1
    ushr    d29, d21, #1
    ushr    d30, d22, #1
    ushr    d31, d23, #1
    bif     v16.8b, v24.8b, v0.8b
    bif     v17.8b, v25.8b, v0.8b
    bif     v18.8b, v26.8b, v0.8b
    bif     v19.8b, v27.8b, v0.8b
    bif     v20.8b, v28.8b, v0.8b
    bif     v21.8b, v29.8b, v0.8b
    bif     v22.8b, v30.8b, v0.8b
    bif     v23.8b, v31.8b, v0.8b

    // stage 2: 14-bit fields -> 28-bit fields
    ushr    d24, d16, #2
    ushr    d25, d17, #2
    ushr    d26, d18, #2
    ushr    d27, d19, #2
    ushr    d28, d20, #2
    ushr    d29, d21, #2
    ushr    d30, d22, #2
    ushr    d31, d23, #2
    bif     v16.8b, v24.8b, v1.8b
    bif     v17.8b, v25.8b, v1.8b
    bif     v18.8b, v26.8b, v1.8b
    bif     v19.8b, v27.8b, v1.8b
    bif     v20.8b, v28.8b, v1.8b
    bif     v21.8b, v29.8b, v1.8b
    bif     v22.8b, v30.8b, v1.8b
    bif     v23.8b, v31.8b, v1.8b

    // stage 3: 28-bit fields -> 56 packed bits per register
    ushr    d24, d16, #4
    ushr    d25, d17, #4
    ushr    d26, d18, #4
    ushr    d27, d19, #4
    ushr    d28, d20, #4
    ushr    d29, d21, #4
    ushr    d30, d22, #4
    ushr    d31, d23, #4
    bif     v16.8b, v24.8b, v2.8b
    bif     v17.8b, v25.8b, v2.8b
    bif     v18.8b, v26.8b, v2.8b
    bif     v19.8b, v27.8b, v2.8b
    bif     v20.8b, v28.8b, v2.8b
    bif     v21.8b, v29.8b, v2.8b
    bif     v22.8b, v30.8b, v2.8b
    bif     v23.8b, v31.8b, v2.8b

    // funnel the eight 56-bit groups into seven output doublewords
    ushr    d24, d17, #8
    ushr    d25, d18, #16
    ushr    d26, d19, #24
    ushr    d27, d20, #32
    ushr    d28, d21, #40
    ushr    d29, d22, #48
    sli     d16, d17, #56
    sli     d24, d18, #48
    sli     d25, d19, #40
    sli     d26, d20, #32
    sli     d27, d21, #24
    sli     d28, d22, #16
    sli     d29, d23, #8

    stp     d16, d24, [pBin, #8]
    stp     d25, d26, [pBin, #24]
    stp     d27, d28, [pBin, #40]
    str     d29, [pBin, #56]!
    b.gt    1b

    .balign 16
    ret
    .endfunc
    .end
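For completeness, calling the assembly versions from C only needs matching prototypes; this is my sketch, not part of the original code. Note that the length is read from w2 (the low 32 bits of x2), so pass a positive multiple of 64 that fits in 32 bits:

#include <stdint.h>

// Prototypes for the assembly routines above (sketch; same contract as the
// intrinsics versions: len is a positive multiple of 64, the output buffer
// needs len / 8 * 7 bytes, and only the low 32 bits of len are used).
extern void ascii_pack_asm_transpose(uint8_t *pBin, uint8_t *pAscii, intptr_t len);
extern void ascii_pack_asm_conventional(uint8_t *pBin, uint8_t *pAscii, intptr_t len);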
Now you can see clearly that the transposing version is vastly superior, provided the chip doesn't mind unaligned stores much (most armv8-a ones don't).
You may ask why I don't use quad registers instead of double ones: on armv8, most instructions on quad registers have half the throughput of their double-register counterparts. There is hardly any gain, if any at all, while being less flexible. This might be different on more advanced cores.
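If you want to reproduce the comparison on your own core, a rough timing harness along these lines is enough (my sketch, assuming POSIX clock_gettime; feed it each of the routines above over the same warm buffers):

#include <stdint.h>
#include <time.h>

// Rough benchmark sketch: returns seconds spent running fn() reps times.
static double bench(void (*fn)(uint8_t *, uint8_t *, intptr_t),
                    uint8_t *pBin, uint8_t *pAscii, intptr_t len, int reps)
{
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < reps; i++)
        fn(pBin, pAscii, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (double)(t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

Throughput is then len * reps / elapsed bytes per second; keep reps high enough that each run lasts at least a few hundred milliseconds so clock granularity doesn't dominate.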