#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
#include <vector>
#include <time.h>
using namespace std;
class MyTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
}
};
//########################
//########################
using ConvertFunc = uint64_t(const char *);
inline uint64_t parse1(const char *s) {
struct tm tm;
memset(&tm, 0, sizeof(tm));
if (strptime(s, "%Y%m%d %H%M%S", &tm)) {
return 1000000UL * ((1900 + tm.tm_year) * 10000 + (tm.tm_mon + 1) * 100 + tm.tm_mday) +
tm.tm_hour * 10000 + tm.tm_min * 100 + tm.tm_sec;
}
return std::numeric_limits<uint64_t>::max();
}
inline uint64_t parse2(const char *s) {
static const uint64_t mul[16] = {
10'000'000'000'000UL, 1'000'000'000'000UL, 100'000'000'000UL, 10'000'000'000UL, 1'000'000'000UL, 100'000'000, 10'000'000, 1'000'000,
0,
100'000, 10'000, 1'000, 100, 10, 1,
0
};
uint64_t res = 0;
for (int i = 0; i < 16; i++) {
res += mul[i] * (s[i] - '0');
}
return res;
}
inline uint64_t parse3(const char *s) {
static const int date_mul[] = { 10'000'000, 1'000'000, 100'000, 10'000, 1'000, 100, 10, 1 };
static const int time_mul[] = { 100'000, 10'000, 1'000, 100, 10, 1 };
int date_val = 0;
for (int i = 0; i < 8; i++) date_val += date_mul[i] * (s[i] - '0');
int time_val = 0;
for (int i = 0; i < 6; i++) time_val += time_mul[i] * (s[9 + i] - '0');
return uint64_t(date_val) * 1'000'000UL + time_val;
}
inline uint64_t parse3a(const char *s) {
static const int left_mul[] = { 10'000'000, 1'000'000, 100'000, 10'000, 1'000, 100, 10, 1 };
static const int right_mul[] = { 0, 100'000, 10'000, 1'000, 100, 10, 1, 0 };
int date_val = 0;
for (int i = 0; i < 8; i++) date_val += left_mul[i] * (s[i] - '0');
int time_val = 0;
for (int i = 0; i < 8; i++) time_val += right_mul[i] * (s[8 + i] - '0');
return uint64_t(date_val) * 1'000'000UL + time_val;
}
inline uint64_t parse4(const char *s) {
__m128i chunk = _mm_lddqu_si128(
reinterpret_cast<const __m128i*>(s)
);
__m128i zeros = _mm_set1_epi8('0');
chunk = _mm_sub_epi8(chunk, zeros);
{
const auto mult = _mm_set_epi8(
0, 1, 1, 10, 1, 10, 1, 0, 1, 10, 1, 10, 1, 10, 1, 10
);
chunk = _mm_maddubs_epi16(chunk, mult);
}
{
//const __m128i mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
const auto mult = _mm_set_epi16(1, 10, 1, 100, 1, 100, 1, 100);
chunk = _mm_madd_epi16(chunk, mult);
}
{
chunk = _mm_packus_epi32(chunk, chunk);
const auto mult = _mm_set_epi16(0, 0, 0, 0, 1, 1000, 1, 10000);
chunk = _mm_madd_epi16(chunk, mult);
}
return ((chunk[0] & 0xffffffff) * 1000000) + (chunk[0] >> 32);
}
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
template <ConvertFunc converter>
void benchmark(string name, int numTest=1000) {
MyTimer timer;
const int N = 100000;
char *a = new char[16*N + 64];
int64_t runtime = 0;
int warmup = 100;
for (int t = 1; t <= warmup + numTest; t++) {
// change input data to prevent unexpected optimization
for (int i = 0; i < 16 * N; i += 16) {
for (int j = 0; j < 8; j++) a[i + j] = rand() % 10 + '0';
a[i + 8] = ' ';
for (int j = 0; j < 6; j++) a[i + 9 + j] = rand() % 10 + '0';
a[i + 15] = '\0';
}
if (t > warmup) timer.startCounter();
for (int i = 0; i < 16 * N; i += 16) result = converter(a+i);
if (t > warmup) runtime += timer.getCounterNs();
}
cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
delete[] a;
}
void correct_test()
{
//20141103 012910
//20141103 012910
string s = "20141103 012910";
vector<uint64_t> result;
result.push_back(parse1(s.c_str()));
result.push_back(parse2(s.c_str()));
result.push_back(parse3(s.c_str()));
result.push_back(parse3a(s.c_str()));
result.push_back(parse4(s.c_str()));
for (int i = 0; i < result.size() - 1; i++)
if (result[i] != result[i + 1]) {
cout << "Wrong at i = " << i << "\n";
cout << result[i] << "|" << result[i+1] << "|\n";
exit(1);
}
}
int main() {
correct_test();
cout << "test correct\n";
benchmark<parse1>("slow");
benchmark<parse2>("unroll");
benchmark<parse3>("dual_unroll");
benchmark<parse3a>("dual_unroll2");
benchmark<parse4>("manual_sse");
return 0;
}
program to test in detail the number of cycles per iteration below.
// Run command: taskset -c 7 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,idq.mite_uops,idq_uops_not_delivered.cycles_fe_was_ok -r1 ./bench
#include <stdlib.h>
#ifndef __cplusplus
#include <stdalign.h>
#endif
#include <stdint.h>
#if 1 && defined(__GNUC__)
#define LLVM_MCA_BEGIN asm("# LLVM-MCA-BEGIN")
#define LLVM_MCA_END asm("# LLVM-MCA-END")
#else
#define LLVM_MCA_BEGIN
#define LLVM_MCA_END
#endif
#include <cstring>
#include <time.h>
#include <immintrin.h>
#include <iostream>
// copy the function you want to test here: parse4
inline uint64_t parse4(const char *s) {
__m128i chunk = _mm_lddqu_si128(
reinterpret_cast<const __m128i*>(s)
);
__m128i zeros = _mm_set1_epi8('0');
chunk = _mm_sub_epi8(chunk, zeros);
{
const auto mult = _mm_set_epi8(
0, 1, 1, 10, 1, 10, 1, 0, 1, 10, 1, 10, 1, 10, 1, 10
);
chunk = _mm_maddubs_epi16(chunk, mult);
}
{
//const __m128i mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
const auto mult = _mm_set_epi16(1, 10, 1, 100, 1, 100, 1, 100);
chunk = _mm_madd_epi16(chunk, mult);
}
{
chunk = _mm_packus_epi32(chunk, chunk);
const auto mult = _mm_set_epi16(0, 0, 0, 0, 1, 1000, 1, 10000);
chunk = _mm_madd_epi16(chunk, mult);
}
return ((chunk[0] & 0xffffffff) * 1000000) + (chunk[0] >> 32);
}
#if defined(__cplusplus)
#include <atomic>
using std::atomic_thread_fence, std::memory_order_acq_rel;
#else
#include <stdatomic.h>
#endif
uint64_t testloop(const char str[16]) {
uint64_t result = 0;
for (int i=0 ; i<1000000000 ; i++){
LLVM_MCA_BEGIN;
result = parse4(str);
// compiler memory barrier
// force materializing the result, and forget about the input string being the same
#ifdef __GNUC__
asm volatile("" ::"m"(result): "memory");
#else
//#warning happens to be enough with current MSVC
atomic_thread_fence(memory_order_acq_rel); // strongest barrier that doesn't require any asm instructions on x86; MSVC defeats signal_fence.
#endif
}
LLVM_MCA_END;
volatile uint64_t dummy = result; // make sure both halves are really used, else MSVC optimizes.
return dummy;
}
int main(int argc, char *argv[])
{
// performance isn't data-dependent, so just use a handy string.
// alignas(16) static char str[] = "235959123456789";
// uintptr_t p = (uintptr_t)argv[0];
// p &= -16;
alignas(16) static const char str[] = "20141103 012910";
std::cout << testloop(str) << "\n";
return 0;
// return testloop((char*)p); // argv[0] apparently has a cache-line split within 16 bytes on my system, worsening from 5c throughput to 6.12c
}
The function needs to convert 15-digit string (has 2 parts: 8 chars, empty space, 6 chars) into a number. Example: "20141103 012910"
-> 20141103012910
. Basically, it's just string2int
after removing the middle white space.
The string is guaranteed to have length 15 and is null terminated, so we can access s[15]
without worrying about segfault. But it's not allowed to read/write on s[16]
and later. We also ignore the case where the timestamp string is invalid (containing non-digit char, for example).
The current solutions, commands to run, and benchmark results are described below. How can I make it faster? Related question: Most insanely fast way to convert 9 char digits into an int or unsigned int
Solution 1: use strptime
from C library, this one is slow because it performs a lot of checking, I think
Solution 2: assuming the input is always valid, we can "unroll" it.
Solution 3: the output is uint64_t
, but it's made up of 2 uint32_t
part. So we can compute them separately using uint32_t
, and only convert to uint64_t
at the end.
Solution 3a: the second half has 6 char, which is ugly because it doesn't divide by 4. So we include the middle white space and the null terminating character in computation in hope that the compiler can do some SIMD magic and makes it faster (it doesn't).
Solution 4: manual SSE, modified from here: https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html . I'm not sure if we can go faster than this.
Fastest fast way to parse date time timestamp SIMD (I added this to make the question easier to Google)
Command to run (change 7
to whichever idle CPU):
g++ -o main main.cpp -O3 -std=c++17 -mavx2
numactl --membind=0 taskset --cpu-list 7 ./main
Result on AMD EPYC 75F3 32-Core Processor, 2950MHz
test correct
slow: 46.3384ns average
unroll: 6.25358ns average
dual_unroll: 5.22005ns average
dual_unroll2: 5.24976ns average
manual_sse: 0.891766ns average