How do I convert _mm_clmulepi64_si128 to vmull_{high}_p64?
Here are results from the sample program below. The conversions are:
_mm_clmulepi64_si128(a, b, 0x00)
→ vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))
_mm_clmulepi64_si128(a, b, 0x01)
→ vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 0))
_mm_clmulepi64_si128(a, b, 0x10)
→ vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 1))
_mm_clmulepi64_si128(a, b, 0x11)
→ vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))
For case (4), _mm_clmulepi64_si128(a, b, 0x11), the following also holds (call it case (5)):
_mm_clmulepi64_si128(a, b, 0x11)
→ vmull_high_p64((poly64x2_t)a, (poly64x2_t)b)
I'm guessing that cases (1) through (4) can spill out into memory if you are not careful, because vgetq_lane_u64 returns a scalar (non-vector) type. I'm also guessing that case (5) has a propensity to stay in the Q registers, because it operates on vector types.
x86_64 and _mm_clmulepi64_si128:
$ ./mul-sse-neon.exe
IS_X86: true
****************************************
clmulepi64(a, b, 0x00)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0x606060606060606, r[1]: 0x606060606060606
****************************************
clmulepi64(a, b, 0x01)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0xc0c0c0c0c0c0c0c, r[1]: 0xc0c0c0c0c0c0c0c
****************************************
clmulepi64(a, b, 0x10)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0xa0a0a0a0a0a0a0a, r[1]: 0xa0a0a0a0a0a0a0a
****************************************
clmulepi64(a, b, 0x11)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0x1414141414141414, r[1]: 0x1414141414141414
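ARM64 and vmull_p64 (note: this run multiplied a[0] by b[1] under the 0x01 heading and a[1] by b[0] under the 0x10 heading, so those two results appear exchanged relative to the x86 output above and to the conversion list):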
$ ./mul-sse-neon.exe
IS_ARM: true
****************************************
vmull_p64(a, b, 0x00)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0x606060606060606, r[1]: 0x606060606060606
****************************************
vmull_p64(a, b, 0x01)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0xa0a0a0a0a0a0a0a, r[1]: 0xa0a0a0a0a0a0a0a
****************************************
vmull_p64(a, b, 0x10)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0xc0c0c0c0c0c0c0c, r[1]: 0xc0c0c0c0c0c0c0c
****************************************
vmull_p64(a, b, 0x11)
a[0]: 0x2222222222222222, a[1]: 0x4444444444444444
b[0]: 0x3333333333333333, b[1]: 0x5555555555555555
r[0]: 0x1414141414141414, r[1]: 0x1414141414141414
The sample program mul-sse-neon.cc:
/*
 * Platform detection. Each macro expands to an expression over compiler
 * predefined macros; inside #if, any name that is not defined evaluates to 0,
 * so the expression is non-zero only when at least one listed macro is set.
 */
#define IS_ARM (__arm__ || __arm32__ || __aarch32__ || __arm64__ || __aarch64__)
#define IS_X86 (__i386__ || __i586__ || __i686__ || __amd64__ || __x86_64__)
/* ARM: NEON intrinsics (vmull_p64, vgetq_lane_u64), plus ACLE when available. */
#if (IS_ARM)
# include <arm_neon.h>
# if defined(__ARM_ACLE) || defined(__GNUC__)
# include <arm_acle.h>
# endif
#endif
/*
 * x86: <emmintrin.h> is the SSE2 header. NOTE(review): _mm_clmulepi64_si128 is
 * presumably picked up through <x86intrin.h> on GNU-compatible compilers;
 * other compilers may need <wmmintrin.h> -- confirm.
 */
#if (IS_X86)
# include <emmintrin.h>
# if defined(__GNUC__)
# include <x86intrin.h>
# endif
#endif
/* word128: the native 128-bit vector type of the target, two 64-bit lanes. */
#if (IS_ARM)
typedef uint64x2_t word128;
#elif (IS_X86)
typedef __m128i word128;
#else
# error "Need a word128"
#endif
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
/* Prints both 64-bit lanes of *value in hex, each prefixed with label. */
void print_val(const word128* value, const char* label);
/* Build command used on x86: */
/* gcc -DNDEBUG -g3 -O0 -march=native mul-sse-neon.cc -o mul-sse-neon.exe */
/* Build command used on ARM64 (the +crypto extension enables vmull_p64): */
/* gcc -DNDEBUG -g3 -O0 -march=armv8-a+crc+crypto mul-sse-neon.cc -o mul-sse-neon.exe */
/*
 * Runs the four 64x64 -> 128-bit carry-less multiplications selectable by the
 * immediate of _mm_clmulepi64_si128 and prints the operands and product for
 * each, using PCLMULQDQ on x86 and vmull_p64 on ARM.
 *
 * Immediate encoding (x86): bit 0 selects the 64-bit lane of `a`, bit 4
 * selects the 64-bit lane of `b`. The ARM branch mirrors that encoding so the
 * output under each heading is directly comparable across platforms:
 *
 *   0x00 -> a[0] * b[0]      0x01 -> a[1] * b[0]
 *   0x10 -> a[0] * b[1]      0x11 -> a[1] * b[1]
 *
 * argc/argv are unused. Always returns 0.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

#if (IS_ARM)
    printf("IS_ARM: true\n");
#elif (IS_X86)
    printf("IS_X86: true\n");
#endif

    /* Lane subscripting relies on the GCC/Clang vector extension. */
    word128 a, b, r;
    a[0] = 0x2222222222222222, a[1] = 0x4444444444444444;
    b[0] = 0x3333333333333333, b[1] = 0x5555555555555555;

#if (IS_ARM)
    /* 0x00: low lane of a, low lane of b. */
    printf("****************************************\n");
    printf("vmull_p64(a, b, 0x00)\n");
    r = (uint64x2_t)vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0));
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    /* 0x01: bit 0 set -> high lane of a, low lane of b. */
    printf("****************************************\n");
    printf("vmull_p64(a, b, 0x01)\n");
    r = (uint64x2_t)vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 0));
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    /* 0x10: bit 4 set -> low lane of a, high lane of b. */
    printf("****************************************\n");
    printf("vmull_p64(a, b, 0x10)\n");
    r = (uint64x2_t)vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 1));
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    /* 0x11: high lane of a, high lane of b. */
    printf("****************************************\n");
    printf("vmull_p64(a, b, 0x11)\n");
    r = (uint64x2_t)vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1));
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");
#elif (IS_X86)
    printf("****************************************\n");
    printf("clmulepi64(a, b, 0x00)\n");
    r = _mm_clmulepi64_si128(a, b, 0x00);
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    printf("****************************************\n");
    printf("clmulepi64(a, b, 0x01)\n");
    r = _mm_clmulepi64_si128(a, b, 0x01);
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    printf("****************************************\n");
    printf("clmulepi64(a, b, 0x10)\n");
    r = _mm_clmulepi64_si128(a, b, 0x10);
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");

    printf("****************************************\n");
    printf("clmulepi64(a, b, 0x11)\n");
    r = _mm_clmulepi64_si128(a, b, 0x11);
    print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");
#endif

    return 0;
}
/* Fallbacks substituted when print_val() is handed a NULL argument. */
static const word128 s_v = {0,0};
static const char s_l[] = "";

/*
 * Print the two 64-bit lanes of a 128-bit vector in hexadecimal as
 * "<label>[0]: 0x..., <label>[1]: 0x...\n".
 *
 * value  vector to print; NULL prints an all-zero vector instead.
 * label  prefix for each lane; NULL is treated as the empty string.
 */
void print_val(const word128* value, const char* label)
{
    const word128* v = (value ? value : &s_v);
    const char* l = (label ? label : s_l);

#if (IS_ARM) || (IS_X86)
    /*
     * Lane subscripting is a GCC/Clang vector extension. On x86 the lanes of
     * __m128i are signed (long long), so convert explicitly to uint64_t to
     * match the PRIx64 conversion specifier on every platform.
     */
    const uint64_t lo = (uint64_t)(*v)[0];
    const uint64_t hi = (uint64_t)(*v)[1];
    printf("%s[0]: 0x%" PRIx64 ", %s[1]: 0x%" PRIx64 "\n", l, lo, l, hi);
#endif
}
The code for vmull_high_p64 is as follows. It always produces the same result because it's always taking the same high words:
/* High-lane variant: multiplies a[1] by b[1] without extracting scalars.
 * The heading names the intrinsic that is actually called. */
printf("****************************************\n");
printf("vmull_high_p64(a, b)\n");
r = (uint64x2_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b);
print_val(&a, "a"); print_val(&b, "b"); print_val(&r, "r");
For completeness, switching the data to:
/* Second data set: each 64-bit lane has distinct upper and lower 32-bit halves. */
word128 a, b, r;
a[0] = 0x2222222233333333;
a[1] = 0x4444444455555555;
b[0] = 0x6666666677777777;
b[1] = 0x8888888899999999;
Produces the following results:
$ ./mul-sse-neon.exe
IS_X86: true
****************************************
clmulepi64(a, b, 0x00)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0xd0d0d0d09090909, r[1]: 0xc0c0c0c08080808
****************************************
clmulepi64(a, b, 0x01)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x191919191b1b1b1b, r[1]: 0x181818181a1a1a1a
****************************************
clmulepi64(a, b, 0x10)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x111111111b1b1b1b, r[1]: 0x101010101a1a1a1a
****************************************
clmulepi64(a, b, 0x11)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x212121212d2d2d2d, r[1]: 0x202020202c2c2c2c
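And on ARM64 (note: this run multiplied a[0] by b[1] under the 0x01 heading and a[1] by b[0] under the 0x10 heading, so those two results appear exchanged relative to the x86 output):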
$ ./mul-sse-neon.exe
IS_ARM: true
****************************************
vmull_p64(a, b, 0x00)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0xd0d0d0d09090909, r[1]: 0xc0c0c0c08080808
****************************************
vmull_p64(a, b, 0x01)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x111111111b1b1b1b, r[1]: 0x101010101a1a1a1a
****************************************
vmull_p64(a, b, 0x10)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x191919191b1b1b1b, r[1]: 0x181818181a1a1a1a
****************************************
vmull_p64(a, b, 0x11)
a[0]: 0x2222222233333333, a[1]: 0x4444444455555555
b[0]: 0x6666666677777777, b[1]: 0x8888888899999999
r[0]: 0x212121212d2d2d2d, r[1]: 0x202020202c2c2c2c