#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;
const int p[9] = {1, 10, 100,
1000, 10000, 100000,
1000000, 10000000, 100000000};
class MyTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
}
};
int convert1(const char *a) {
int res = 0;
for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
return res;
}
int convert2(const char *a) {
return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
+ (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
+ (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}
int convert3(const char *a) {
return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
+ a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
- 533333328;
}
const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
100000000};
int convert4u(const char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
+ a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}
int convert5(const char* a) {
int val = 0;
for(size_t k =0;k <9;++k) {
val = (val << 3) + (val << 1) + (a[k]-'0');
}
return val;
}
const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};
int convert6u(const char *a) {
return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3] * pu2[3] + a[4] * pu2[4] + a[5] * pu2[5] + a[6] * pu2[6] + a[7] * pu2[7] + a[8] - (unsigned) 5333333328u;
}
constexpr std::uint64_t zeros(char z) {
std::uint64_t result = 0;
for (int i = 0; i < sizeof(result); ++i) {
result = result*256 + z;
}
return result;
}
int convertX(const char *a) {
constexpr std::uint64_t offset = zeros('0');
constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;
std::uint64_t buffer;
std::memcpy(&buffer, a, sizeof(buffer));
const auto bytes = buffer - offset;
const auto b1 = (bytes & o1) >> 8;
const auto words = (bytes & ~o1) + 10*b1;
const auto w1 = (words & o2) >> 16;
const auto dwords = (words & ~o2) + 100*w1;
const auto d1 = (dwords & o3) >> 32;
const auto qwords = (dwords & ~o3) + 1000*d1;
const auto final = 10*static_cast<unsigned>(qwords) + (a[9] - '0');
return static_cast<int>(final);
}
//######################## ACCEPTED ANSWER
//########################
//########################
typedef struct { // for output into memory
alignas(16) unsigned hours;
unsigned minutes, seconds, nanos;
} hmsn;
void str2hmsn(hmsn *out, const char str[15]) // HHMMSSXXXXXXXXX 15 total, with 9-digit nanoseconds.
{ // 15 not including the terminating 0 (if any) which we don't read
//hmsn retval;
__m128i digs = _mm_loadu_si128((const __m128i*)str);
digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
__m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) )); // SSSE3 pairs of digits => 10s, 1s places.
__m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words); // SSE4.1 hours, minutes, seconds unpack from uint16_t to uint32
//_mm_storeu_si128((__m128i*)&retval, hms_unpacked); // store first 3 struct members; last to be written separately
_mm_storeu_si128((__m128i*)out, hms_unpacked);
// or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx
__m128i xwords = _mm_bsrli_si128(hms_x_words, 6); // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
// 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
// could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.
__m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1, 0,0,0,0)); // low/high uint32 chunks, discard the 9th x digit.
uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
uint32_t msd = 100*100 * (uint32_t)pair32; // most significant dword was at lower address (in printing order), so low half on little-endian x86. encourage compilers to use 32-bit operand-size for imul
uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0'); // total*10 + lowest digit
out->nanos = nanos;
//retval.nanos = nanos;
//return retval;
// returning the struct by value encourages compilers in the wrong direction
// into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;
int convertSIMD(const char* a)
{
str2hmsn(&mystruct, a);
return mystruct.nanos;
}
//########################
//########################
using ConvertFunc = int(const char*);
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
MyTimer timer;
const int N = 100000;
char *a = new char[9*N + 17];
int64_t runtime = 0;
for (int t=1; t<=numTest; t++) {
// change something to prevent unexpected optimization
for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0';
timer.startCounter();
for (int i=0; i<9*N; i+= 9) result = converter(a+i);
runtime += timer.getCounterNs();
}
cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
delete[] a;
}
int main() {
benchmark(convert1, "slow");
benchmark(convert2, "normal");
benchmark(convert3, "fast");
benchmark(convert4u, "unsigned");
benchmark(convert5, "shifting");
benchmark(convert6u, "reverse");
benchmark(convertX, "swar64");
benchmark(convertSIMD, "manualSIMD");
return 0;
}
I want to find the fastest way turn char a[9]
into an int
. The full problem is convert char a[15]
with form HHMMSSxxxxxxxxx timestamp to nanosecond, where ~50 bytes after the x
are allocated and can be safely read (but not write). We only care about the last 9 digits in this question.
Version 1 is basic, version 2,3 try to save some computation. I compile with -O3 flag, and storing power of 10s in array is fine because it is optimized away (checked using Godbolt).
How can I make this faster? Yes I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.
Version 3 is decidedly faster, while version 2 is slower due to code size. But can we do better than version 3? Invalid benchmark
Edit 2: the function can return unsigned int
instead of int
(i.e
unsigned convert1(char *a);
Edit 3: I noticed that the new code is an invalid benchmark, since convert(a) is only executed once. Using the original code, the difference is only ~1%.
Edit 4: New benchmark. using unsigned (convert4u, convert6u) is consistently 3-5% faster than using int. I will run a long (10+ min) benchmark to see if there's a winner. I've edited the code to use a new benchmark. It generates a large amount of data, then run the converter functions.
Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72
seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I still hope there's a crazier solution, though
Edit 6: benchmark result on AMD Ryzen 4350G, gcc version 10.3, compile command gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native
slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average
The accepted answer does even more than the question require and compute HH/MM/SS/nanosec
, so it's even faster than this benchmark shows.