Python solution:
import numpy as np
import time
def compute(A, AU):
    """Pack each group of 8 adjacent 0/1 columns of AU into one uint8 column of A.

    Column ``8*j + k`` of ``AU`` supplies bit ``k`` (LSB first) of column ``j``
    of ``A``. ``A`` is zeroed first and mutated in place.

    Parameters:
        A:  (rows, colm) uint8 output array, overwritten.
        AU: (rows, colm*8) array of 0/1 values.
    """
    A[:, :] = 0
    # BUG FIX: iterate over ALL columns of AU (== 8 * A.shape[1]). The original
    # looped over range(A.shape[1]) and therefore packed only the first eighth
    # of the input bits, leaving the rest of A zero.
    for i in range(AU.shape[1]):
        # Cast to uint8 before shifting: AU is int8 in the driver, and
        # 1 << 7 overflows a signed 8-bit value.
        shifted = np.left_shift(AU[:, i].astype(np.uint8), i % 8)
        A[:, i // 8] = np.bitwise_or(A[:, i // 8], shifted)
# Problem sizes: A holds colm packed uint8 columns, AU holds 8x as many 0/1 columns.
colm = 20000 // 8          # integer division; int(20000/8) round-trips via float
rows = 10000
cols = colm * 8
AU = np.random.randint(2, size=(rows, cols), dtype=np.int8)
# np.empty is fine here: compute() zeroes A before packing into it.
A = np.empty((rows, colm), dtype=np.uint8)
# Time only the packing itself (the original set start_time twice; the first
# assignment was dead code, immediately overwritten).
start_time = time.time()
compute(A, AU)
end_time = time.time()
print(end_time - start_time)
This packs the bits in about 0.5 seconds.
Same code in C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char* argv[]) {
    /* A holds colm packed bytes per row; AU holds cols = colm*8 0/1 bytes per row. */
    int colm = 200000 / 8;
    int rows = 10000;
    int cols = colm * 8;

    /* sizeof(unsigned char) is 1 by definition; compute element counts in size_t
     * so the ~2 GB AU allocation size cannot overflow int. */
    unsigned char *A  = (unsigned char *)malloc((size_t)rows * (size_t)colm);
    unsigned char *AU = (unsigned char *)malloc((size_t)rows * (size_t)cols);
    if (A == NULL || AU == NULL) {
        /* ~2 GB total is requested: allocation failure is a realistic outcome. */
        fprintf(stderr, "allocation failed\n");
        free(A);
        free(AU);
        return 1;
    }

    int i, j;
    clock_t begin;
    clock_t end;
    double time_spent;

    begin = clock();
    /* Create AU: one random 0/1 byte per element. rand() dominates runtime. */
    for (i = 0; i < rows; i++)
        for (j = 0; j < cols; j++)
            /* i*cols+j in int is within one bit of INT_MAX at these sizes and
             * overflows for larger colm; index in size_t instead. */
            AU[(size_t)i * cols + j] = (unsigned char)(rand() & 0x01);
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%lf seconds to create AU\n", time_spent);

    begin = clock();
    /* Create a zeroed out A (|= below requires a clean starting value). */
    for (i = 0; i < rows; i++)
        for (j = 0; j < colm; j++)
            A[(size_t)i * colm + j] = (unsigned char)0;
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%lf seconds to create A\n", time_spent);

    begin = clock();
    /* Pack into bits: byte j of a row collects 8 consecutive AU bytes, LSB first. */
    for (i = 0; i < rows; i++)
        for (j = 0; j < colm; j++) {
            size_t au_idx = (size_t)i * cols + (size_t)j * 8;
            for (int k = 0; k < 8; k++)
                A[(size_t)i * colm + j] |= AU[au_idx + k] << k;
        }
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%lf seconds to pack\n", time_spent);

    free(A);
    free(AU);
    return 0;
}
Tested with colm=200,000. Bit packing takes 0.27 seconds against 0.64 seconds for the optimized Python version provided by Jérôme Richard. Calls to rand() are expensive and greatly increase overall runtime. In terms of memory, the C version peaks at 2GB against Python's 4.2GB. Further code optimization and parallelization would certainly reduce runtime.
Julia version:
using Random
# Problem sizes: A gets colm packed UInt8 columns; AU holds eight 0/1 columns
# for every packed column.
colm = 200000 ÷ 8
rows = 30000
cols = 8colm

# Fill AU with random bytes, then mask everything but the low bit so each
# entry is 0x00 or 0x01.
AU = zeros(UInt8, rows, cols)
rand!(AU)
AU .&= 0x01

# Packed output, zero-initialised.
A = zeros(UInt8, rows, colm)
"""
    compute(A, AU)

Pack each group of 8 adjacent 0/1 columns of `AU` into one `UInt8` column of
`A`: column `8(i-1)+k` of `AU` supplies bit `k-1` of column `i` of `A`
(LSB first). Assumes `A` starts zeroed (the caller allocates it with `zeros`).
Mutates `A` in place and returns it.
"""
function compute(A, AU)
    # axes(A, 2) instead of 1:size(A)[2]: idiomatic, and avoids indexing the
    # size tuple. Column-wise views keep every access contiguous (column-major).
    for i in axes(A, 2)
        base = (i - 1) << 3  # first AU column of this group, minus one
        @views A[:, i] .= AU[:, base + 1] .|
                          (AU[:, base + 2] .<< 1) .|
                          (AU[:, base + 3] .<< 2) .|
                          (AU[:, base + 4] .<< 3) .|
                          (AU[:, base + 5] .<< 4) .|
                          (AU[:, base + 6] .<< 5) .|
                          (AU[:, base + 7] .<< 6) .|
                          (AU[:, base + 8] .<< 7)
    end
    return A
end
# NOTE: the first call to @time includes JIT compilation; run it a second time
# (or use BenchmarkTools.@btime) for a pure runtime figure.
@time compute(A, AU)
Julia scales well in terms of performance. Results with colm=25,000 and rows=30,000:
| Language | Total Run Time (secs) | Bit Packing Time (secs) | Peak Memory (GB) |
|----------|-----------------------|-------------------------|------------------|
| Python   | 22.1                  | 3.0                     | 6                |
| Julia    | 11.7                  | 1.2                     | 6                |