I am new to writing AVX intrinsics based code, so I need some help understanding whether my observations are expected. I have two methods implementing distance computations; both take two float arrays and their dimension and return a float distance. The first method computes a Euclidean (squared L2) distance:
static float
compute_l2Square(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    float __attribute__((aligned(32))) TmpRes[8];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);

    __m256 diff, v1, v2;
    __m256 sum = _mm256_set1_ps(0);

    // Unrolled twice: each iteration processes 16 floats (2 x 8-wide AVX loads per input).
    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
    }

    // Horizontal sum of the 8 partial sums.
    _mm256_store_ps(TmpRes, sum);
    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
}
The second method computes a bitwise XOR and then counts the number of set bits, i.e. a Hamming distance:
static float compute_hamming(const void* __restrict pVect1v,
                             const void* __restrict pVect2v,
                             const void* __restrict qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);
    uint64_t __attribute__((aligned(32))) TmpRes[4];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);
    int res = 0;

    __m256 diff, v1, v2;
    // Unrolled twice: each iteration processes 16 floats (512 bits); each 8-float
    // chunk is XORed, stored to memory, and popcounted as four 64-bit words.
    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i *) TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i *) TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);
    }
    return res;
}
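For context, both methods are called the same way, with the dimension passed through qty_ptr; a 512-bit vector means 16 floats (16 * 32 = 512 bits). Roughly like this (array names and data are just illustrative):

size_t dim = 16;                            // 16 floats = 512 bits per vector
float a[16], b[16];                         // filled with test data
float l2  = compute_l2Square(a, b, &dim);   // squared L2 distance
float ham = compute_hamming(a, b, &dim);    // Hamming distance over the raw float bits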
For the same number of bits, the L2 squared distance computation is much faster than the Hamming one, roughly 2x-4x (i.e. computing the L2 distance over 512 bits, which is 16 floats, is faster than computing the Hamming distance over the same 16 floats). I am not sure whether this is expected.
To me it seems that the popcount and storing the results to the temporary array are causing the slowdown, because when I modify the L2 distance computation to do an XOR instead of a subtraction, i.e. replace _mm256_sub_ps with _mm256_xor_ps, the L2 computation gets even faster.
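Concretely, the only change is inside the loop body of compute_l2Square; everything else stays the same (the numeric result is of course meaningless, this variant is only for comparing instruction cost):

// XOR variant of the loop body, used purely for timing comparison.
v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;
v2 = _mm256_loadu_ps(pVect2);
pVect2 += 8;
diff = _mm256_xor_ps(v1, v2);   // was: _mm256_sub_ps(v1, v2)
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));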
I am benchmarking on macOS on a CPU with AVX support. Another observation: a non-AVX implementation of the Hamming distance using a plain loop, sum += popcount(vec_a[i] ^ vec_b[i]), gives numbers similar to the AVX implementation. As a sanity check I also verified that the AVX instructions and methods are actually being invoked.
The non-vectorized implementation:
static float compute_hamming(const void* __restrict pVect1,
                             const void* __restrict pVect2,
                             const void* __restrict qty_ptr) {
    size_t qty = *((size_t *) qty_ptr);
    int res = 0;
    const float *pVect1LL = (const float *) pVect1;
    const float *pVect2LL = (const float *) pVect2;
    // Process two floats (64 bits) per iteration; handle a trailing single
    // float (32 bits) when qty is odd.
    for (unsigned i = 0; i < qty; i = i + 2) {
        if (i + 1 == qty) {
            unsigned int v1;
            unsigned int v2;
            memcpy(&v1, &pVect1LL[i], sizeof(float));
            memcpy(&v2, &pVect2LL[i], sizeof(float));
            res += __builtin_popcount(v1 ^ v2);
            break;
        }
        uint64_t v1;
        uint64_t v2;
        memcpy(&v1, &pVect1LL[i], sizeof(float) * 2);
        memcpy(&v2, &pVect2LL[i], sizeof(float) * 2);
        res += __builtin_popcountll(v1 ^ v2);
    }
    return res;
}
I need some help and recommendations on improving the performance, since the distance computation method is the bottleneck.