The OP didn't originally specify what hardware support could be assumed, and in this case the hardware makes a big difference. To support older hardware, here is a function that does the job with only SSE2 and no hardware popcount instruction. It uses the popcnt8() function I found at Fast counting the number of set bits in __m128i register.
static inline __m128i popcnt8(__m128i x) {
    const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
    const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
    __m128i n;
    // Count bits in each 4-bit field.
    n = _mm_srli_epi64(x, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    // Fold the high nibble of each byte onto the low nibble to get per-byte counts.
    x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
    x = _mm_and_si128(popcount_mask2, x);
    return x;
}
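If you want to see exactly what popcnt8() produces, a throwaway check like the following compares it against a scalar per-byte popcount (the test values and the check_popcnt8 name are arbitrary, just for illustration):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

// Throwaway sanity check: popcnt8() should return the popcount of each byte.
static void check_popcnt8(void) {
    union { __m128i v; uint8_t u[16]; } in, out;
    for (int i = 0; i < 16; i++) in.u[i] = (uint8_t)(37*i + 11); // arbitrary bytes
    out.v = popcnt8(in.v);
    for (int i = 0; i < 16; i++) {
        if (out.u[i] != __builtin_popcount(in.u[i]))
            printf("mismatch at byte %d\n", i);
    }
}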
The following function is much slower than the OP's function except when hardware popcount is unavailable. On my Nehalem i5 CPU I measured the OP's function at around 8000-9000 rdtsc 'cycles' with hardware popcount enabled and around 40000 without; the SSE2-only function (gcc -msse2) came in at about 19000 cycles. It also works for different values of N without needing changes.
int f6(
    const uint16_t * __restrict A,
    const uint16_t * __restrict B,
    const uint16_t * __restrict C) {
    int r, s, E = 0;
    __m128i ABCr, ABCs, ABC[N];
    union {
        __m128i v;
        uint8_t u[16];
    } popcounts;
    // Pack A[r], B[r] and C[r] into the low three 16-bit lanes of one vector.
    for (r = 0; r < N; ++r) {
        ABC[r] = _mm_setr_epi16(A[r], B[r], C[r], 0, 0, 0, 0, 0);
    }
    for (r = 0; r < N; ++r) {
        ABCr = ABC[r];
        ABCs = popcnt8(ABCr);
        // Shift-and-add folds the two byte counts of each 16-bit lane into the
        // odd bytes, so u[1], u[3], u[5] hold the A, B and C popcounts.
        popcounts.v = _mm_bslli_si128(ABCs, 1);
        popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
        E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
        for (s = r+1; s < N; s++) {
            ABCs = ABC[s];
            ABCs = _mm_and_si128(ABCs, ABCr);
            ABCs = popcnt8(ABCs);
            popcounts.v = _mm_bslli_si128(ABCs, 1);
            popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
            E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
        }
    }
    return E;
}
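For what it's worth, the cycle figures above are raw rdtsc differences around the calls, along these general lines (a minimal sketch rather than the exact harness; __rdtsc() comes from x86intrin.h with gcc):

#include <x86intrin.h>  // __rdtsc() with gcc/clang
#include <stdio.h>

// Very rough timing sketch; treat the numbers as relative only.
// A, B and C are assumed to already hold N uint16_t values each.
static void time_f6(const uint16_t *A, const uint16_t *B, const uint16_t *C) {
    unsigned long long t0 = __rdtsc();
    int E = f6(A, B, C);
    unsigned long long t1 = __rdtsc();
    printf("f6: E = %d, ~%llu rdtsc 'cycles'\n", E, t1 - t0);
}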
While this function's performance won't impress anyone, I found writing and benchmarking it interesting and thought others might too. Since SSE2 support is commonly assumed as a baseline and coding for multiple architectures can get complex, there seemed to be some value in sharing it. If the OP had asked for widely compatible code assuming no more than SSE2 support, this could have been a worthwhile improvement I think.
EDIT:
I made a faster SSE2 version of the function by reordering the calculation. It runs slightly faster than the hardware-popcount-enabled functions, at about 5900 cycles. I know the OP wanted AVX2, but I think this approach is of interest to anyone writing a manually vectorised version in AVX2 or AVX512.
SSE2 + AVX512:
_mm_popcnt_epi16 is an AVX512 intrinsic (it needs the AVX512-BITALG and AVX512VL extensions) which, if substituted for X_mm_popcnt_epi16, should give a decent performance gain I think, but I don't have any AVX512-capable hardware to benchmark it on.
static inline __m128i X_mm_popcnt_epi16(__m128i v) {
    // Taken from https://stackoverflow.com/questions/6431692/tweaking-mits-bitcount-algorithm-to-count-words-in-parallel
    v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x5555)), _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi16(0x5555)));
    v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x3333)), _mm_and_si128(_mm_srli_epi16(v, 2), _mm_set1_epi16(0x3333)));
    v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x0f0f)), _mm_and_si128(_mm_srli_epi16(v, 4), _mm_set1_epi16(0x0f0f)));
    v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x00ff)), _mm_srli_epi16(v, 8));
    return v;
}
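One way to make that substitution automatic, untested since I can't run it, is to dispatch on the compiler's target feature macros and have getBrs() and f8() call POPCNT_EPI16() instead of X_mm_popcnt_epi16() directly (the macro name is just an example):

// Untested: use the native AVX512 per-word popcount when the compiler targets
// AVX512-BITALG + AVX512VL, otherwise fall back to the SSE2 emulation above.
#if defined(__AVX512BITALG__) && defined(__AVX512VL__)
#include <immintrin.h>
#define POPCNT_EPI16(v) _mm_popcnt_epi16(v)
#else
#define POPCNT_EPI16(v) X_mm_popcnt_epi16(v)
#endif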
static inline __m128i getBrs(
    const uint16_t * __restrict A,
    const uint16_t * __restrict B,
    const uint16_t * __restrict C, uint32_t r, uint32_t s) {
    uint32_t i;
    __m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr, As, Bs, Cs;
    Evec = _mm_setzero_si128();
    /* getBrs does something like...
    uint32_t EB = 0;
    uint32_t j;
    for (i = r; i<r+8; i++) {
        for (j = s; j<s+8; j++) {
            EB += __builtin_popcount(A[i] & A[j])*__builtin_popcount(B[i] & B[j])*__builtin_popcount(C[i] & C[j]);
        }
    }
    */
    Ar = _mm_loadu_si128( (const __m128i*)&A[r]);
    Br = _mm_loadu_si128( (const __m128i*)&B[r]);
    Cr = _mm_loadu_si128( (const __m128i*)&C[r]);
    As = _mm_loadu_si128( (const __m128i*)&A[s]);
    Bs = _mm_loadu_si128( (const __m128i*)&B[s]);
    Cs = _mm_loadu_si128( (const __m128i*)&C[s]);
    for (i=0; i<8; i++) {
        // Rotate the s-block by one 16-bit element per iteration so that over
        // 8 iterations every element of the r-block meets every element of
        // the s-block exactly once.
        As = _mm_bsrli_si128(As, 2);
        As = _mm_insert_epi16(As, A[s], 7);
        Bs = _mm_bsrli_si128(Bs, 2);
        Bs = _mm_insert_epi16(Bs, B[s], 7);
        Cs = _mm_bsrli_si128(Cs, 2);
        Cs = _mm_insert_epi16(Cs, C[s], 7);
        temp = _mm_and_si128(Ar, As);
        popA = X_mm_popcnt_epi16(temp);
        temp = _mm_and_si128(Br, Bs);
        popB = X_mm_popcnt_epi16(temp);
        temp = _mm_and_si128(Cr, Cs);
        popC = X_mm_popcnt_epi16(temp);
        // Multiply the three per-lane popcounts and accumulate in 16-bit lanes.
        temp = _mm_mullo_epi16(popA, popB);
        temp2 = _mm_mullo_epi16(temp, popC);
        Evec = _mm_add_epi16(Evec, temp2);
        s++;
    }
    // Widen the eight 16-bit partial sums to four 32-bit sums.
    return _mm_madd_epi16(Evec, _mm_set1_epi16(1));
}
int f8(
    const uint16_t * __restrict A,
    const uint16_t * __restrict B,
    const uint16_t * __restrict C) {
    uint32_t r, i, j, Ediag = 0, E = 0;
    __m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr;
    Evec = _mm_setzero_si128();
    union {
        __m128i v;
        uint32_t u32[4];
    } popcounts;
    /* The diagonal terms, i.e.
    for (i = 0; i<N; i++) {
        Ediag += __builtin_popcount(A[i] & A[i])*__builtin_popcount(B[i] & B[i])*__builtin_popcount(C[i] & C[i]);
    }
    */
    for (r = 0; r < N; r += 8) {    // N is assumed to be a multiple of 8 (48 in the question)
        Ar = _mm_loadu_si128( (const __m128i*)&A[r]);
        Br = _mm_loadu_si128( (const __m128i*)&B[r]);
        Cr = _mm_loadu_si128( (const __m128i*)&C[r]);
        popA = X_mm_popcnt_epi16(Ar);
        popB = X_mm_popcnt_epi16(Br);
        popC = X_mm_popcnt_epi16(Cr);
        temp = _mm_mullo_epi16(popA, popB);
        temp2 = _mm_mullo_epi16(temp, popC);
        Evec = _mm_add_epi16(Evec, temp2);
    }
    popcounts.v = _mm_madd_epi16(Evec, _mm_set1_epi16(1));
    Ediag = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
    // E sums every (r,s) pair in both directions (off-diagonal blocks are added
    // twice); adding Ediag and halving leaves each unordered pair counted once,
    // the same sum as f6().
    Evec = _mm_setzero_si128();
    for (i = 0; i < N; i += 8) {
        Evec = _mm_add_epi32(Evec, getBrs(A, B, C, i, i));
        for (j = i+8; j < N; j += 8) {
            temp = getBrs(A, B, C, i, j);
            temp = _mm_add_epi32(temp, temp);
            Evec = _mm_add_epi32(Evec, temp);
        }
    }
    popcounts.v = Evec;
    E = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
    return (Ediag + E)/2;
}
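To check the vectorised versions against each other, a throwaway harness along these lines can be used (f_ref and the random fill are just illustrative; N is assumed to be a compile-time constant that's a multiple of 8, 48 in the question):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Scalar reference: diagonal plus upper triangle, the same sum f6() and f8() return.
static int f_ref(const uint16_t *A, const uint16_t *B, const uint16_t *C) {
    int E = 0;
    for (int r = 0; r < N; r++)
        for (int s = r; s < N; s++)
            E += __builtin_popcount(A[r] & A[s])
               * __builtin_popcount(B[r] & B[s])
               * __builtin_popcount(C[r] & C[s]);
    return E;
}

int main(void) {
    uint16_t A[N], B[N], C[N];
    srand(1);
    for (int i = 0; i < N; i++) {
        A[i] = (uint16_t)rand();
        B[i] = (uint16_t)rand();
        C[i] = (uint16_t)rand();
    }
    printf("ref = %d, f6 = %d, f8 = %d\n",
           f_ref(A, B, C), f6(A, B, C), f8(A, B, C));
    return 0;
}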