Here is a function which takes an array of 64 bit integers and counts how many 1 bits are in each position. Using AVX2, it should be possible to do this for 16 bits simultaneously, but then each counter can only go up to 65536. The following code would count each block of 65500 values, but I am missing one critical operation: I cannot find a way to individually shift each number by a different count. Am I missing something? The comment in the code shows the spot. Ideally each number is loaded from memory, and split into 4 16-bit chunks. Each chunk can be processed on one 256-bit register. In order to count more than 64k, the counts would be stored in memory and added to a wider register.
void countHistBits4(const uint64_t p[], uint32_t n, uint32_t hist[64]) {
uint16_t shifts[16] = {0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15};
uint16_t maskss[16] = {1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1};
__m256i shift = _mm256_load_si256((__m256*)shifts);
__m256i mask1 = _mm256_load_si256((__m256*)masks);
for (uint32_t j = 0; j < n; j += 65500) {
__m256i count1 = _mm256_setzero_si256();
__m256i count2 = _mm256_setzero_si256();
__m256i count3 = _mm256_setzero_si256();
__m256i count4 = _mm256_setzero_si256();
for (uint32_t i = 0; i < 65500; i++) {
__m256i v1 = _mm256_set1_epi16(p[i] & 0xFFFF);
__m256i v2 = _mm256_set1_epi16((p[i] >> 16) & 0xFFFF);
__m256i v3 = _mm256_set1_epi16((p[i] >> 32) & 0xFFFF);
__m256i v4 = _mm256_set1_epi16((p[i] >> 48) & 0xFFFF);
// for each bit, right shift to the 1 position, and with 1
// and add to the count
// this isn't right. How to shift each 16 bit value by different const?
// if that isn't possible, what is the approach?
v1 = _mm256_srl_epi16(v1, shift);
v2 = _mm256_srl_epi16(v2, shift);
v3 = _mm256_srl_epi16(v3, shift);
v4 = _mm256_srl_epi16(v4, shift);
v1 = _mm256_and_si256(v1, mask1);
count1 = _mm256_adds_epi16 (count1, v1);
v2 = _mm256_and_si256(v2, mask1);
count2 = _mm256_adds_epi16 (count2, v2);
v3 = _mm256_and_si256(v3, mask1);
count3 = _mm256_adds_epi16 (count3, v3);
v4 = _mm256_and_si256(v4, mask1);
count4 = _mm256_adds_epi16 (count4, v4);
}
// store and add into larger counters...
}
}