I'm trying to load a binary file into memory. It has a specific encoding and limited alphabet by which I mean every two bit represent an integer value (00 -> 0, 01 -> 1, 10 -> 2, and 11 -> 3) and that's it.
Now 11 or basically 3 represent missing value. I can use BMI2 to unpack and set missing to zero and populate the main matrix (full data without missing) but would like to create a map or basically sparse array to store index of missing values.
Below is the demo code for how I'm doing it currently (with BMI2) as I didn't find avx2 or other approaches beneficial compared to 2byte unpacking. Let me know if you disagree!
So the real question is: How can I use the result_and_mask
variable here to store the index of bytes in an array without using a simple loop?
Just to add more context, this is happening in a loop that is reading a col of data with memmap and doing unpacking, and storing.
#include <immintrin.h>
#include <stdio.h>
#include <inttypes.h>
void printBits(size_t const size, void const * const ptr, int put)
{
unsigned char *b = (unsigned char*) ptr;
unsigned char byte;
int i, j;
for (i = size-1; i >= 0; i--) {
for (j = 7; j >= 0; j--) {
byte = (b[i] >> j) & 1;
if (byte == 0) {
printf(".");
} else{
printf("1");
}
// printf("%u", byte);
if (j % 2 == 0){
printf(" ");
}
}
if (put != 0) printf("|");
}
if (put != 0) puts("");
}
int main() {
uint16_t ww = 0xCA07;
uint64_t result;
const int PRINT_LINE = 1;
# define S_CAST(type, val) ((type)(val))
static const uintptr_t kMask0303 = (~S_CAST(uintptr_t, 0)) / 85;
static const uintptr_t kMask1111 = (~S_CAST(uintptr_t, 0)) / 15;
printf("ww : ");
printBits(sizeof(ww), &ww, PRINT_LINE);
printf("mask03: ");
printBits(sizeof(kMask0303), &kMask0303, PRINT_LINE);
printf("result: ");
result = _pdep_u64(ww, kMask0303);
printBits(sizeof(result), &result, PRINT_LINE);
uint64_t result_shift = result >> 1;
printf("shift : ");
printBits(sizeof(result_shift), &result_shift, PRINT_LINE);
uint64_t result_and = result & result_shift;
printf("and : ");
printBits(sizeof(result_and), &result_and, PRINT_LINE);
printf("mask01: ");
printBits(sizeof(kMask1111), &kMask1111, PRINT_LINE);
uint64_t result_and_mask = result_and & kMask1111;
printf("miss : ");
printBits(sizeof(result_and_mask), &result_and_mask, PRINT_LINE);
// printf("sub : ");
// uint64_t result_and_mask_mult = result_and_mask * 3;
// printBits(sizeof(result_and_mask_mult), &result_and_mask_mult, PRINT_LINE);
// printf("fin : ");
// uint64_t final = result - result_and_mask_mult;
// printBits(sizeof(final), &final, PRINT_LINE);
}
Example output:
ww : 11 .. 1. 1. |.. .. .1 11 |
mask03: .. .. .. 11 |.. .. .. 11 |.. .. .. 11 |.. .. .. 11 |.. .. .. 11 |.. .. .. 11 |.. .. .. 11 |.. .. .. 11 |
result: .. .. .. 11 |.. .. .. .. |.. .. .. 1. |.. .. .. 1. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |.. .. .. 11 |
shift : .. .. .. .1 |1. .. .. .. |.. .. .. .1 |.. .. .. .1 |.. .. .. .. |.. .. .. .. |.. .. .. .. |1. .. .. .1 |
and : .. .. .. .1 |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
mask01: .. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |.. .1 .. .1 |
miss : .. .. .. .1 |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
Update: Further Clarifications
- The desired output is a left packed list of indices. In the example above, I need 0,7. Bascially, I have a pointer to the next empty spot in array to store index of missing values, say that is pointing to element 100 (so far 99 missing values found), and right now I'm reading elements of col at index 1000, so what I would like to do is to add (1000 + 7, 1000 + 0, order doesn't matter) to the missing array. Now the pointer will point to 102.
- The platform would have minimum avx2, this is a scientific program and used mostly in cloud and or HPC and the CPUs are mostly intel and Skylake upwards (avx-512 is common) so I'm assuming avx2 to be present at least.
Update: AVX2 Demo for Unpacking
This is my first try to do the unpacking with avx2:
#include <immintrin.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdint.h>
#include <string.h> // for memset
void printBits(size_t const size, void const * const ptr, int put)
{
unsigned char *b = (unsigned char*) ptr;
unsigned char byte;
int i, j;
for (i = size-1; i >= 0; i--) {
for (j = 7; j >= 0; j--) {
byte = (b[i] >> j) & 1;
if (byte == 0) {
printf(".");
} else{
printf("1");
}
// printf("%u", byte);
if (j % 2 == 0){
printf(" ");
}
}
if (put != 0) printf("|");
}
if (put != 0) puts("");
}
int main() {
uint64_t x = 0x1360EA787654321;
const int PRINT_LINE = 1;
// mask: where to put each byte
static const char mask1a[32] = {
0x00, 0x00, 0x00, 0x00,
0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02,
0x03, 0x03, 0x03, 0x03,
0x04, 0x04, 0x04, 0x04,
0x05, 0x05, 0x05, 0x05,
0x06, 0x06, 0x06, 0x06,
0x07, 0x07, 0x07, 0x07
};
static const char mask2a[32] = {
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
0x03, 0x0c, 0x30, 0xC0,
};
static const char mask3a[32] = {
0x00, 0x04, 0x08, 0x0C,
0x01, 0x05, 0x09, 0x0D,
0x02, 0x06, 0x0A, 0x0E,
0x03, 0x07, 0x0B, 0x0F,
0x00, 0x04, 0x08, 0x0C,
0x01, 0x05, 0x09, 0x0D,
0x02, 0x06, 0x0A, 0x0E,
0x03, 0x07, 0x0B, 0x0F,
};
static const char mask4a[32] = {
0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00,
};
unsigned char out[32];
__m256i mask2 = _mm256_loadu_si256((__m256i*)mask2a);
__m256i mask1 = _mm256_loadu_si256((__m256i*)mask1a);
__m256i mask3 = _mm256_loadu_si256((__m256i*)mask3a);
__m256i mask4 = _mm256_loadu_si256((__m256i*)mask4a);
union combine { __m128i reg; uint8_t value; };
const union combine THREE = {.value = 3};
__m256i mult3 = _mm256_broadcastb_epi8(THREE.reg);
__m256i y = _mm256_set1_epi64x(x);
__m256i z = _mm256_shuffle_epi8(y,mask1);
z = _mm256_and_si256(z,mask2);
_mm256_storeu_si256((__m256i*)out,z);
printf("\nX : ");
printBits(sizeof(x), &x, PRINT_LINE);
for(int i=28; i>=0; i-=4) {
uint8_t orig = *((uint8_t *)&(x) + (i / 4));
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(orig), &orig, 0);
printf(" ---> ");
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
printf("Result with Missing: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
z = _mm256_shuffle_epi8(z, mask3);
_mm256_storeu_si256((__m256i*)out,z);
printf("Transpose: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
z = _mm256_srlv_epi32(z, mask4);
_mm256_storeu_si256((__m256i*)out,z);
printf("Shift: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
z = _mm256_shuffle_epi8(z, mask3);
_mm256_storeu_si256((__m256i*)out,z);
printf("Transpose: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
__m256i z_shifted = _mm256_srli_epi32(z, 1);
_mm256_storeu_si256((__m256i*)out,z_shifted);
printf("Shifted: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
__m256i z_and = _mm256_and_si256(z, z_shifted);
_mm256_storeu_si256((__m256i*)out,z_and);
printf("Bytes with Missing value (11): \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
__m256i z_and_shift_back = _mm256_slli_epi32(z_and, 1);
_mm256_storeu_si256((__m256i*)out,z_and_shift_back);
printf("SHIFT BACK: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
__m256i z_and_shift_back_or = _mm256_or_si256(z_and, z_and_shift_back);
_mm256_storeu_si256((__m256i*)out,z_and_shift_back_or);
printf("OR: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
__m256i z_sub = _mm256_sub_epi8(z, z_and_shift_back_or);
_mm256_storeu_si256((__m256i*)out,z_sub);
printf("Result with Missing set to zero: \n");
for(int i=28; i>=0; i-=4) {
uint32_t num = *(uint32_t *)&out[i];
printBits(sizeof(num), &num, PRINT_LINE);
} printf("\n");
}
The output:
X : .. .. .. .1 |.. 11 .1 1. |.. .. 11 1. |1. 1. .1 11 |1. .. .1 11 |.1 1. .1 .1 |.1 .. .. 11 |.. 1. .. .1 |
.. .. .. .1 ---> .. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. 11 .1 1. ---> .. .. .. .. |.. 11 .. .. |.. .. .1 .. |.. .. .. 1. |
.. .. 11 1. ---> .. .. .. .. |.. .. .. .. |.. .. 11 .. |.. .. .. 1. |
1. 1. .1 11 ---> 1. .. .. .. |.. 1. .. .. |.. .. .1 .. |.. .. .. 11 |
1. .. .1 11 ---> 1. .. .. .. |.. .. .. .. |.. .. .1 .. |.. .. .. 11 |
.1 1. .1 .1 ---> .1 .. .. .. |.. 1. .. .. |.. .. .1 .. |.. .. .. .1 |
.1 .. .. 11 ---> .1 .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. 1. .. .1 ---> .. .. .. .. |.. 1. .. .. |.. .. .. .. |.. .. .. .1 |
Result with Missing:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. 11 .. .. |.. .. .1 .. |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. 11 .. |.. .. .. 1. |
1. .. .. .. |.. 1. .. .. |.. .. .1 .. |.. .. .. 11 |
1. .. .. .. |.. .. .. .. |.. .. .1 .. |.. .. .. 11 |
.1 .. .. .. |.. 1. .. .. |.. .. .1 .. |.. .. .. .1 |
.1 .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. .. .. .. |.. 1. .. .. |.. .. .. .. |.. .. .. .1 |
Transpose:
.. .. .. .. |.. .. .. .. |.. .. .. .. |1. .. .. .. |
.. .. .. .. |.. 11 .. .. |.. .. .. .. |.. 1. .. .. |
.. .. .. .. |.. .. .1 .. |.. .. 11 .. |.. .. .1 .. |
.. .. .. .1 |.. .. .. 1. |.. .. .. 1. |.. .. .. 11 |
1. .. .. .. |.1 .. .. .. |.1 .. .. .. |.. .. .. .. |
.. .. .. .. |.. 1. .. .. |.. .. .. .. |.. 1. .. .. |
.. .. .1 .. |.. .. .1 .. |.. .. .. .. |.. .. .. .. |
.. .. .. 11 |.. .. .. .1 |.. .. .. 11 |.. .. .. .1 |
Shift:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 1. |
.. .. .. .. |.. .. .. 11 |.. .. .. .. |.. .. .. 1. |
.. .. .. .. |.. .. .. .1 |.. .. .. 11 |.. .. .. .1 |
.. .. .. .1 |.. .. .. 1. |.. .. .. 1. |.. .. .. 11 |
.. .. .. 1. |.. .. .. .1 |.. .. .. .1 |.. .. .. .. |
.. .. .. .. |.. .. .. 1. |.. .. .. .. |.. .. .. 1. |
.. .. .. .1 |.. .. .. .1 |.. .. .. .. |.. .. .. .. |
.. .. .. 11 |.. .. .. .1 |.. .. .. 11 |.. .. .. .1 |
Transpose:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. 11 |.. .. .. .1 |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. .. 11 |.. .. .. 1. |
.. .. .. 1. |.. .. .. 1. |.. .. .. .1 |.. .. .. 11 |
.. .. .. 1. |.. .. .. .. |.. .. .. .1 |.. .. .. 11 |
.. .. .. .1 |.. .. .. 1. |.. .. .. .1 |.. .. .. .1 |
.. .. .. .1 |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. .. .. .. |.. .. .. 1. |.. .. .. .. |.. .. .. .1 |
Shifted:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .1 |1. .. .. .. |1. .. .. .1 |
.. .. .. .. |.. .. .. .. |.. .. .. .1 |1. .. .. .1 |
.. .. .. .1 |.. .. .. .1 |.. .. .. .. |1. .. .. .1 |
.. .. .. .1 |.. .. .. .. |.. .. .. .. |1. .. .. .1 |
.. .. .. .. |1. .. .. .1 |.. .. .. .. |1. .. .. .. |
.. .. .. .. |1. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. .1 |.. .. .. .. |.. .. .. .. |
Bytes with Missing value (11):
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .1 |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .1 |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
SHIFT BACK:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. 1. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. 1. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
OR:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. 11 |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. 11 |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 11 |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .. |
Result with Missing set to zero:
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. .1 |
.. .. .. .. |.. .. .. .. |.. .. .. .1 |.. .. .. 1. |
.. .. .. .. |.. .. .. .. |.. .. .. .. |.. .. .. 1. |
.. .. .. 1. |.. .. .. 1. |.. .. .. .1 |.. .. .. .. |
.. .. .. 1. |.. .. .. .. |.. .. .. .1 |.. .. .. .. |
.. .. .. .1 |.. .. .. 1. |.. .. .. .1 |.. .. .. .1 |
.. .. .. .1 |.. .. .. .. |.. .. .. .. |.. .. .. .. |
.. .. .. .. |.. .. .. 1. |.. .. .. .. |.. .. .. .1 |
I'm using the result without missing to store the full array. Now I'd like to use the "Bytes with Missing value (11)" to get a left packed indices to store the indices to a sparse array.