You can write a cffi wrapper around OpenSSL's MD5() function that accepts NumPy arrays
and hashes each 8-byte element separately. Our baseline will be a pure Python implementation based on hashlib.
First, create a build script:
# build.py
# Builds the `_md5` extension module: a thin C loop around OpenSSL's MD5()
# that hashes every 8-byte element of an array separately.
import cffi

ffi = cffi.FFI()

# Declaration exposed to Python through `_md5.lib`; it must match the
# definition in `source` below.
header = r"""
void md5_array(uint64_t* buffer, int len, unsigned char* out);
"""

# C implementation compiled into the extension.  `out` must have room for
# `len` digests of MD5_DIGEST_LENGTH (16) bytes each.
source = r"""
#include <stddef.h>
#include <stdint.h>
#include <openssl/md5.h>

void md5_array(uint64_t * buffer, int len, unsigned char * out) {
    int i;
    for (i = 0; i < len; i++) {
        /* Compute the output offset in size_t: `i * 16` evaluated as int
           overflows once the array holds more than 2^27 elements. */
        MD5((const unsigned char *) &buffer[i], sizeof(buffer[i]),
            out + (size_t) i * MD5_DIGEST_LENGTH);
    }
}
"""

# MD5() is exported by libcrypto, not libssl.  Linking only against 'ssl'
# works solely where the linker resolves libssl's own dependencies
# transitively, so name the library that actually provides the symbol.
ffi.set_source("_md5", source, libraries=['crypto'])
ffi.cdef(header)

if __name__ == "__main__":
    # Generates the C wrapper and compiles it next to this script.
    ffi.compile()
and a wrapper
# md5.py
import numpy as np
import _md5
def md5_array(data):
    """Return the MD5 digest of every 8-byte element of ``data``.

    Each element is hashed on its own by the compiled ``_md5`` extension,
    which reads exactly 8 bytes per element.

    Parameters
    ----------
    data : array_like
        Values whose dtype is 8 bytes wide (e.g. ``np.uint64``).
        Non-contiguous input is copied into a C-contiguous array first.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``'|S16'`` with the same shape as ``data``; element
        ``i`` holds the 16-byte digest of element ``i``.  NumPy strips
        trailing NUL bytes when an ``S16`` element is read as ``bytes``;
        view the result as ``np.uint8`` to get all 16 raw bytes.

    Raises
    ------
    TypeError
        If the elements are not plain 8-byte values.  The C routine would
        otherwise read past the end of a narrower buffer.
    """
    # order="C" copies only when needed and, unlike np.ascontiguousarray,
    # leaves 0-d input 0-d so the output shape matches the input shape.
    values = np.asarray(data, order="C")
    if values.dtype.itemsize != 8 or values.dtype.hasobject:
        raise TypeError(
            "md5_array expects 8-byte elements such as uint64, got dtype %s"
            % values.dtype
        )

    out = np.zeros(values.shape, dtype='|S16')
    if values.size == 0:
        # Nothing to hash; skip the call into C entirely.
        return out

    ffi = _md5.ffi
    # from_buffer() yields a `char[]` view, so cast both pointers to the
    # exact types the C signature declares.  `values` and `out` stay
    # referenced here, which keeps the memory alive for the whole call.
    _md5.lib.md5_array(
        ffi.cast("uint64_t *", ffi.from_buffer(values)),
        values.size,
        ffi.cast("unsigned char *", ffi.from_buffer(out)),
    )
    return out
and compare the two:
# run.py
import numpy as np
import hashlib
import md5
data = np.arange(16, dtype=np.uint64)

# Baseline: hash each 8-byte element separately in pure Python.
out = []
for value in data:
    out.append(hashlib.md5(value).digest())

# The same digests, computed in a single call through the C extension.
out2 = md5.md5_array(data)

print(data)
# [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
print(out)
# [b'}\xea6+?\xac\x8e\x00\x95jIR\xa3\xd4\xf4t', ... , b'w)\r\xf2^\x84\x11w\xbb\xa1\x94\xc1\x8c8XS']
print(out2)
# [b'}\xea6+?\xac\x8e\x00\x95jIR\xa3\xd4\xf4t', ... , b'w)\r\xf2^\x84\x11w\xbb\xa1\x94\xc1\x8c8XS']

# Element-wise comparison of the two results; every entry must agree.
matches = out == out2
print(all(matches))
# True
To compile the bindings and run the script, run
python build.py
python run.py
For large arrays it's about 15x faster (I am a bit disappointed by that honestly...)
data = np.arange(100000, dtype=np.uint64)
%timeit [hashlib.md5(i).digest() for i in data]
169 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit md5.md5_array(data)
12.1 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)