UPDATE Added comparison and benchmarks at the end
Here's another take, based on a perfect hash. The perfect hash was generated using gperf
(like described here: Is it possible to map string to int faster than using hashmap?).
I've further optimized by moving function local statics out of the way and marking hexdigit()
and hash()
as constexpr
. This removes unnecessary any initialization overhead and gives the compiler full room for optimization/
I don't expect things to get much faster than this.
You could try reading e.g. 1024 nibbles at a time if possible, and give the compiler a chance to vectorize the operations using AVX/SSE instruction sets. (I have not inspected the generated code to see whether this would happen.)
The full sample code to translate std::cin
to std::cout
in streaming mode is:
#include <iostream>
int main()
{
char buffer[4096];
while (std::cin.read(buffer, sizeof(buffer)), std::cin.gcount())
{
size_t got = std::cin.gcount();
char* out = buffer;
for (auto it = buffer; it < buffer+got; it += 4)
*out++ = Perfect_Hash::hexchar(it);
std::cout.write(buffer, got/4);
}
}
Here's the Perfect_Hash
class, slightly redacted and extended with the hexchar
lookup. Note that it does validate input in DEBUG
builds using the assert
:
Live On Coliru
#include <array>
#include <algorithm>
#include <cassert>
class Perfect_Hash {
/* C++ code produced by gperf version 3.0.4 */
/* Command-line: gperf -L C++ -7 -C -E -m 100 table */
/* Computed positions: -k'1-4' */
/* maximum key range = 16, duplicates = 0 */
private:
static constexpr unsigned char asso_values[] = {
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15, 7, 3, 1, 0, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
template <typename It>
static constexpr unsigned int hash(It str)
{
return
asso_values[(unsigned char)str[3] + 2] + asso_values[(unsigned char)str[2] + 1] +
asso_values[(unsigned char)str[1] + 3] + asso_values[(unsigned char)str[0]];
}
static constexpr char hex_lut[] = "???????????fbead9c873625140";
public:
#ifdef DEBUG
template <typename It>
static char hexchar(It binary_nibble)
{
assert(Perfect_Hash::validate(binary_nibble)); // for DEBUG only
return hex_lut[hash(binary_nibble)]; // no validation!
}
#else
template <typename It>
static constexpr char hexchar(It binary_nibble)
{
return hex_lut[hash(binary_nibble)]; // no validation!
}
#endif
template <typename It>
static bool validate(It str)
{
static constexpr std::array<char, 4> vocab[] = {
{{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}},
{{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}},
{{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}},
{{'?', '?', '?', '?'}}, {{'?', '?', '?', '?'}},
{{'1', '1', '1', '1'}}, {{'1', '0', '1', '1'}},
{{'1', '1', '1', '0'}}, {{'1', '0', '1', '0'}},
{{'1', '1', '0', '1'}}, {{'1', '0', '0', '1'}},
{{'1', '1', '0', '0'}}, {{'1', '0', '0', '0'}},
{{'0', '1', '1', '1'}}, {{'0', '0', '1', '1'}},
{{'0', '1', '1', '0'}}, {{'0', '0', '1', '0'}},
{{'0', '1', '0', '1'}}, {{'0', '0', '0', '1'}},
{{'0', '1', '0', '0'}}, {{'0', '0', '0', '0'}},
};
int key = hash(str);
if (key <= 26 && key >= 0)
return std::equal(str, str+4, vocab[key].begin());
else
return false;
}
};
constexpr unsigned char Perfect_Hash::asso_values[];
constexpr char Perfect_Hash::hex_lut[];
#include <iostream>
int main()
{
char buffer[4096];
while (std::cin.read(buffer, sizeof(buffer)), std::cin.gcount())
{
size_t got = std::cin.gcount();
char* out = buffer;
for (auto it = buffer; it < buffer+got; it += 4)
*out++ = Perfect_Hash::hexchar(it);
std::cout.write(buffer, got/4);
}
}
Demo output for e.g. od -A none -t o /dev/urandom | tr -cd '01' | dd bs=1 count=4096 | ./test
03bef5fb79c7da917e3ebffdd8c41488d2b841dac86572cf7672d22f1f727627a2c4a48b15ef27eb0854dd99756b24c678e3b50022d695cc5f5c8aefaced2a39241bfd5deedcfa0a89060598c6b056d934719eba9ccf29e430d2def5751640ff17860dcb287df8a94089ade0283ee3d76b9fefcce3f3006b8c71399119423e780cef81e9752657e97c7629a9644be1e7c96b5d0324ab16d20902b55bb142c0451e675973489ae4891ec170663823f9c1c9b2a11fcb1c39452aff76120b21421069af337d14e89e48ee802b1cecd8d0886a9a0e90dea5437198d8d0d7ef59c46f9a069a83835286a9a8292d2d7adb4e7fb0ef42ad4734467063d181745aaa6694215af7430f95e854b7cad813efbbae0d2eb099523f215cff6d9c45e3edcaf63f78a485af8f2bfc2e27d46d61561b155d619450623b7aa8ca085c6eedfcc19209066033180d8ce1715e8ec9086a7c28df6e4202ee29705802f0c2872fbf06323366cf64ecfc5ea6f15ba6467730a8856a1c9ebf8cc188e889e783c50b85824803ed7d7505152b891cb2ac2d6f4d1329e100a2e3b2bdd50809b48f0024af1b5092b35779c863cd9c6b0b8e278f5bec966dd0e5c4756064cca010130acf24071d02de39ef8ba8bd1b6e9681066be3804d36ca83e7032274e4c8e8cacf520e8078f8fa80eb8e70af40367f53e53a7d7f7afe8704c46f58339d660b8151c91bddf82b4096
BENCHMARKS
I came up with three different approaches:
- naive.cpp (no hacks, no libraries); Live disassembly on Godbolt
- spirit.cpp (Trie);
Live disassembly on pastebin
- and this answer: perfect.cpp hash based; Live disassembly on Godbolt
In order to do some comparisons, I've
- compiled them all with the same compiler (GCC 4.9) and flags (
-O3 -march=native -g0 -DNDEBUG
)
- optimized input/output so it doesn't read by 4 chars/write single characters
- created a large input file (1 Gigabyte)
Here are the results:

- Surprisingly, the
naive
approach from the first answer does rather well
- Spirit does really badly here; it nets 3.4MB/s so that the whole file would take at 294 seconds (!!!). We've left it off the charts
- The average throughputs are ~720MB/s for naive.cpp and ~1.14GB/s forperfect.cpp
- This makes the perfect hash approach roughly 50% faster than the naive approach.
*Summary I'd say the naive approach was plenty good as I posted it on whim 10 hours ago. If you really want high throughput, the perfect hash is a nice start, but consider hand-rolling a SIMD based solution