#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;

const int p[9] =   {1, 10, 100, 
                    1000, 10000, 100000, 
                    1000000, 10000000, 100000000};
                    
class MyTimer {
 private:
  std::chrono::time_point<std::chrono::steady_clock> starter;

 public:
  void startCounter() {
    starter = std::chrono::steady_clock::now();
  }

  int64_t getCounterNs() {    
    return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
  }
};
                    
int convert1(const char *a) {
    int res = 0;
    for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
    return res;
}

int convert2(const char *a) {
    return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
            + (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
            + (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}

int convert3(const char *a) {
    return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
            + a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
            - 533333328;
}

const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
    100000000};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
      + a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}

int convert5(const char* a) {
    int val = 0;
    for (size_t k = 0; k < 9; ++k) {
        val = (val << 3) + (val << 1) + (a[k] - '0');
    }
    return val;
}

const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};

int convert6u(const char *a) {
  return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3]*pu2[3] + a[4]*pu2[4]
       + a[5]*pu2[5] + a[6]*pu2[6] + a[7]*pu2[7] + a[8] - (unsigned) 5333333328u;
}

constexpr std::uint64_t zeros(char z) {
    std::uint64_t result = 0;
    for (int i = 0; i < sizeof(result); ++i) {
        result = result*256 + z;
    }
    return result;
}

int convertX(const char *a) {
    constexpr std::uint64_t offset = zeros('0');
    constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
    constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
    constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;

    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));
    const auto bytes = buffer - offset;
    const auto b1 = (bytes & o1) >> 8;
    const auto words = (bytes & ~o1)*10 + b1;       // each 16-bit word: a 2-digit value in printing order
    const auto w1 = (words & o2) >> 16;
    const auto dwords = (words & ~o2)*100 + w1;     // each 32-bit dword: a 4-digit value
    const auto d1 = (dwords & o3) >> 32;
    const auto qwords = (dwords & ~o3)*10000 + d1;  // 8-digit value from the first 8 chars

    const auto final = 10*static_cast<unsigned>(qwords) + (a[8] - '0');  // append the 9th digit
    return static_cast<int>(final);
}

//########################  ACCEPTED ANSWER
//########################
//########################
typedef struct {             // for output into memory
    alignas(16) unsigned hours;
    unsigned minutes, seconds, nanos;
} hmsn;

void str2hmsn(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    //hmsn retval;
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    __m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places.

    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);                           // SSE4.1  hours, minutes, seconds unpack from uint16_t to uint32
    //_mm_storeu_si128((__m128i*)&retval, hms_unpacked);                                  // store first 3 struct members; last to be written separately
    _mm_storeu_si128((__m128i*)out, hms_unpacked);
    // or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

    __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
    uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
    uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
    //retval.nanos = nanos;
    //return retval;

  // returning the struct by value encourages compilers in the wrong direction
  // into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;

int convertSIMD(const char* a)
{
    str2hmsn(&mystruct, a);
    return mystruct.nanos;
}


//########################
//########################
using ConvertFunc = int(const char*);

volatile int result = 0; // consume each conversion's result so the compiler can't optimize the calls away
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
    MyTimer timer;
    const int N = 100000;
    char *a = new char[9*N + 17];
    int64_t runtime = 0;    

    for (int t=1; t<=numTest; t++) {        
        // change something to prevent unexpected optimization
        for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0'; 

        timer.startCounter();
        for (int i=0; i<9*N; i+= 9) result = converter(a+i);
        runtime += timer.getCounterNs();
    }
    cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
    delete[] a;
}

int main() {
    benchmark(convert1, "slow");
    benchmark(convert2, "normal");    
    benchmark(convert3, "fast");
    benchmark(convert4u, "unsigned");
    benchmark(convert5, "shifting");
    benchmark(convert6u, "reverse");
    benchmark(convertX, "swar64");
    benchmark(convertSIMD, "manualSIMD");

    return 0;
}

I want to find the fastest way to turn a char a[9] into an int. The full problem is to convert a char a[15] timestamp of the form HHMMSSxxxxxxxxx to nanoseconds, where ~50 bytes after the x's are allocated and can be safely read (but not written). We only care about the last 9 digits in this question.

Version 1 is basic; versions 2 and 3 try to save some computation. I compile with the -O3 flag, and storing the powers of 10 in an array is fine because the accesses are optimized away (checked using Godbolt).

How can I make this faster? Yes I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.

**Big edit:** I've replaced the code to reduce the effect of std::chrono on the measured time. The results are very different: 2700ms, 810ms, 670ms. On my laptop with an i7 8750H, gcc 9.3.0 with the -O3 flag, the results are: 355, 387, 320ms.

Version 3 is decidedly faster, while version 2 is slower due to code size. But can we do better than version 3? (This turned out to be an invalid benchmark; see Edit 3.)

Edit 2: the function can return unsigned int instead of int, i.e.

unsigned convert1(char *a);

Edit 3: I noticed that the new code is an invalid benchmark, since convert(a) is only executed once. Using the original code, the difference is only ~1%.

Edit 4: New benchmark. Using unsigned (convert4u, convert6u) is consistently 3-5% faster than using int. I will run a long (10+ min) benchmark to see if there's a winner. I've edited the code to use the new benchmark: it generates a large amount of data, then runs the converter functions.

Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72 seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I still hope there's a crazier solution, though.

Edit 6: benchmark result on AMD Ryzen 4350G, gcc version 10.3, compile command gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native

slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average

The accepted answer does even more than the question requires and computes HH/MM/SS/nanoseconds, so it's even faster than this benchmark shows.

Huy Le
    You are compiling your benchmarks with optimization flags on I hope... right? Also, yeah, this is the definition of premature optimization. The operation is instantaneous. The overhead of calling `std::chrono::whatever` twice is probably more than the calculation itself. Unless you need to achieve a throughput of one trillion of integer conversions per second, I wouldn't worry about it. Anyway, using a global array for storing powers of 10 is pretty bad for performance. Just inline them in the formula. – Marco Bonelli Dec 20 '21 at 11:15
  • 3
    what compiler flags did you use? Optimizations turned on? I doubt it, because you aren't using `result` and the compiler should optimize such that `converter` isn't called at all – 463035818_is_not_an_ai Dec 20 '21 at 11:15
  • Why is it important that the number have exactly nine digits? Otherwise I'm pretty sure this is a dupe... have you had a look at [this question](https://stackoverflow.com/q/16826422/1593077) for example? – einpoklum Dec 20 '21 at 11:16
  • see what happens once you make use of optimizations: https://godbolt.org/z/xsx46v98a (though that doesnt tell anything about how efficient the different ways to convert actually are) – 463035818_is_not_an_ai Dec 20 '21 at 11:16
  • 1
    Can it be a `char a[16]` with 9 real chars and 7 chars of padding? Dealing with 9 chars is possible but a pain. – harold Dec 20 '21 at 11:19
  • 4
    Have you compared against [`std::from_chars()`](https://en.cppreference.com/w/cpp/utility/from_chars)? – einpoklum Dec 20 '21 at 11:21
  • @MarcoBonelli optimization is -O3. 9 digits because I have string HHMMSSxxxxxxxxx (timestamp). I already looked at godbolt compiler explorer. – Huy Le Dec 20 '21 at 11:33
  • @harold unfortunately no, since my string has format HHMMSSxxxxxxxxx (9 last digits). But can you provide a solution in case of char a[16]? I'll still accept it if there's no better – Huy Le Dec 20 '21 at 11:34
  • @MarcoBonelli with -O3, the compiler will replace const array access by const number, so that part is fine. I checked it using Godbolt assembly explorer – Huy Le Dec 20 '21 at 11:35
  • I've replaced the code very differently. Can you check and run on your machine? – Huy Le Dec 20 '21 at 11:39
  • Micro op candidate: replace `(a[8] - 48) * p[0]` with `(a[8] - 48)`. – chux - Reinstate Monica Dec 20 '21 at 11:47
  • 1
    @HuyLe I could have it parse those 9 chars and ignore the rest, the main concern is being able to use a 16-byte load instruction without too much effort spent on edge cases – harold Dec 20 '21 at 11:54
  • @chux-ReinstateMonica that doesn't matter because compiler will optimize away multiply by 1. It also replaces const array access with const number – Huy Le Dec 20 '21 at 11:54
  • @HuyLe Why fill `a[i]` with random characters `rand()` instead of random digits `rand() % 10 + '0'`? Are you certain that it will not make a difference? – chux - Reinstate Monica Dec 20 '21 at 11:58
  • @463035818_is_not_a_number "compiler should optimize such that ... isn't called": unfortunately that's false, because the "volatile" qualifier will force the compiler to write to result, even if it isn't used anywhere else. – Huy Le Dec 20 '21 at 11:58
  • @chux-ReinstateMonica I tried both and it doesn't make a difference – Huy Le Dec 20 '21 at 12:00
  • sorry I missed the `volatile`, then turning on optimizations will show you meaningful results – 463035818_is_not_an_ai Dec 20 '21 at 12:01
  • @harold okay I noticed that you can treat it as a[16] (~50 bytes after &a[0] is allocated, so accessing them will not cause segfault). But you must not change those last 7 digits (only read). Can you provide a solution in that case? – Huy Le Dec 20 '21 at 12:05
  • I can't comment so I had to make this an answer. Here is a [link](https://stackoverflow.com/questions/16826422/c-most-efficient-way-to-convert-string-to-int-faster-than-atoi/16826908) – Yasser CHENIK Dec 20 '21 at 12:06
  • 2
    If code used `unsigned` math, then `(a[0] - 48) * p[8] ... - 533333328` could become `a[0] * p[8] ... - (unsigned)5333333328u`. (Hope I got the constant correct) – chux - Reinstate Monica Dec 20 '21 at 12:08
  • @chux-ReinstateMonica unsigned int is allowed, thank you – Huy Le Dec 20 '21 at 12:10
  • 4
    Your benchmark is *completely* invalid. The compiler (or at least [one compiler](https://godbolt.org/z/6ezrqhxzn)) is able to see right through your `for (int i=1; i<= 1000000; i++) result = converter(a);` loop and understand that the function always returns the same result for the same input. Note the instruction `mov dword ptr [rip + result], eax` repeated 10 times in a row and then a jump right back to the start of the block to achieve a 1000000 repetition. The actual conversion is only performed once. (And the timing is x10 faster than with older compilers). – n. m. could be an AI Dec 20 '21 at 12:14
  • 1
    @n.1.8e9-where's-my-sharem. Perhaps `char a[9];` --> `char aa[9]; char * volatile a = aa;` would solve that so the call `converter(a);` is not assume-able to always result in the same value. – chux - Reinstate Monica Dec 20 '21 at 12:18
  • @n.1.8e9-where's-my-sharem. sorry I fixed that code in a rush, the original code didn't use it that way. Using the original code the difference is ~1% – Huy Le Dec 20 '21 at 12:31
  • 1
    Note also how clang 13 compiles convert2 and convert3 to [exactly the same machine code](https://godbolt.org/z/9Tsf1c1P1). – n. m. could be an AI Dec 20 '21 at 12:31
  • 4
    Maybe you should try some kind of SIMD implementation of atoi function. References: https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd http://0x80.pl/articles/simd-parsing-int-sequences.html – Zuljin Dec 20 '21 at 12:50
  • @Zuljin: Yeah, [How to implement atoi using SIMD?](https://stackoverflow.com/q/35127060) is relevant, but it has to deal with variable-length numbers while this is padded with leading zeros so we can blindly use SIMD without masking, making it even better. – Peter Cordes Dec 20 '21 at 12:52
  • Before even thinking of optimizing anything here, I'd highly suggest you work a bit on your benchmark. the results are highly dependent of the ordering of the single benchmarks. You should probably generate the test data in advance and measure the execution time of the whole loop. – florestan Dec 20 '21 at 13:23
  • I highly recommend this video: https://www.youtube.com/watch?v=r-TLSBdHe1A – florestan Dec 20 '21 at 13:28
  • And now it is many many times slower and cannot be checked on godbolt. You allocate and initialize 9 times more memory for `a` than you actually use... – n. m. could be an AI Dec 20 '21 at 14:23
  • whoops, I saw that. I've fixed it and am rerunning the code – Huy Le Dec 20 '21 at 14:53
  • 1
    **Don't use** `a[i] - 48`. Write that as `a[i] - '0'`. That works for all character encodings. – Pete Becker Dec 20 '21 at 15:32
  • To get a better idea of timing overhead, try `int convert_null(char* a) { return 0; }`. I doubt it is 0.0. – chux - Reinstate Monica Dec 20 '21 at 18:46
  • 1
    @PeteBecker A downside to `a[i] - '0'` (and `a[i] - 48`) is that the subtraction is an `int` one. To encourage a lesser compiler into _unsigned_ math, it may be useful to banish signed operations and use `a[i] - (unsigned) '0'`. Still code should avoid magic numbers, even `533333328`, which could be made into `#define` as `'0' + 10*'0' + 100*'0' ...`. – chux - Reinstate Monica Dec 20 '21 at 18:58
  • @chux-ReinstateMonica -- nevertheless, `ch - '0'` is the canonical way to convert a digit character to the value that it represents. `ch - 48` is simply wrong. Sure, if you have additional constraints, you might want to add some bells and whistles, but that has nothing to do with what I said. – Pete Becker Dec 20 '21 at 19:59
  • 2
    @PeteBecker True that your comment was certainly about clarity and portability, yet OP's issue here is sensitive to sign-ness of types and so I commented on how to take that good idea, extend it and relate it to types.. – chux - Reinstate Monica Dec 20 '21 at 20:10
  • With Visual C++ (v19.31.31107), I was getting results about 10x worse than you, each benchmark returned approximately 4000 ms. By simply changing the convert functions to take `const` pointers, all of the times dropped to approximately 400 ms. I haven't checked with Compiler Explorer, but my guess is that using non-const pointers blocked some compiler optimization because it couldn't prove there was no pointer aliasing. – Adrian McCarthy Apr 28 '22 at 18:04
  • Thanks for updating with your benchmark results. `autoSIMD` is not an appropriate name for Adrian's answer, though: it's [manual SWAR](https://en.wikipedia.org/wiki/SWAR). (SIMD Within A Register). So maybe SWAR64bit as a name. – Peter Cordes Apr 29 '22 at 02:46
  • @AdrianMcCarthy: You didn't mention what CPU you tested on. My version with intrinsics should be good on most CPUs, I think, even if you compile without AVX. (Which costs some extra instructions.) Been a while since I looked at it. Your portable SWAR version is interesting. If stuff like `const` makes a big difference, that makes me wonder if the benchmark might be getting partially optimized away, or if inlining details matter. But IIRC, 1.5 ns on OP's Zen (several clock cycles) seems reasonable throughput for mine, so if it's worse that might be an MSVC missed optimization. – Peter Cordes Apr 29 '22 at 02:51
  • 1
    @AdrianMcCarthy: I checked on Godbolt by copy/pasting the current code in the question. (https://godbolt.org/z/8E8719MvK) x64 MSVC 19.31 -O2 and x86-64 GCC 11.3 -O3 -msse4.1, neither `-march=znver1` or `/arch:AVX2` options. Both compile without inlining the `benchmark()` function calls or constant-propagating function pointers into it, so they don't have any opportunity to hoist redundant work out of the loop, or even hoist *constants* out of loops like you might hope a normal use-case would allow. So anyway, the benchmark looks fine here. Our actual functions compile basically the same too – Peter Cordes Apr 29 '22 at 03:32
  • @AdrianMcCarthy: I also don't see anything weird (or even different) when removing `const` from the `const char*` args (https://godbolt.org/z/GhvY89hhq), but I didn't try actually running the code. (I don't have a convenient way to run MSVC output, and didn't port the asm to run on Linux.) The critical parts, the `benchmark` loop itself, and the `convert...` functions, look to have compiled about the same. You didn't maybe test a debug build in your version without `const`? The `const` version still avoids any optimizations that would defeat the benchmark, same as non-const. – Peter Cordes Apr 29 '22 at 03:39

3 Answers


Yes, SIMD is possible, as mentioned in comments. You can take advantage of it to parse the HH, MM, and SS parts of the string at the same time.

Since you have a 100% fixed format with leading 0s where necessary, this is easier than How to implement atoi using SIMD? - Place-values are fixed and we don't need any compare / bit-scan or pcmpistri to look up a shuffle control mask or scale-factor. Also SIMD string to unsigned int parsing in C# performance improvement has some good ideas, like tweaking the place-value multipliers to avoid a step at the end (see the BMI2 version at the end of this answer which also uses that trick.)

9 decimal digits is two dwords of input and one leftover byte that's probably best to grab separately.

Assuming you care about throughput (ability to overlap this with surrounding code, or do this in a loop on independent elements) moreso than critical path latency in cycles from input pointer and data in memory being ready to nanoseconds integer being ready, SSSE3 SIMD should be very good on modern x86. (With SSE4.1 being useful if you want to unpack your hours, minutes, seconds into contiguous uint32_t elements e.g. in a struct). It might be competitive on latency, too, vs. scalar.

Fun fact: clang auto-vectorizes your convert2 / convert3 functions, widening to 8x dword in a YMM register for vpmulld (2 uops), then a chain of shuffle/add.

The strategy is to use pmaddubsw and pmaddwd to multiply-and-add pairs horizontally, in a way that gets each digit multiplied by its place value. e.g. 10 and 1 for pairs of digits, then 100 and 1 for the pairs of integers that come from those double-digit values. Then extract to scalar for the last pair: multiply the most-significant part by 100 * 100, and add to the least-significant part. I'm pretty sure overflow is impossible at any step for inputs that are actually '0'..'9'. This runs and compiles to the asm I expected, but I didn't verify the numeric results.

// See also an updated version using RORX as discussed in comments
#include <immintrin.h>

typedef struct {             // for output into memory
    alignas(16) unsigned hours;
    unsigned minutes, seconds, nanos;
} hmsn;

void str2hmsn(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    //hmsn retval;
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    __m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places.

    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);                           // SSE4.1  hours, minutes, seconds unpack from uint16_t to uint32
    //_mm_storeu_si128((__m128i*)&retval, hms_unpacked);                                  // store first 3 struct members; last to be written separately
    _mm_storeu_si128((__m128i*)out, hms_unpacked);
    // or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

    __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
    uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
    uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
    //retval.nanos = nanos;
    //return retval;

  // returning the struct by value encourages compilers in the wrong direction
  // into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}

On Godbolt with a test loop that uses asm("" ::"m"(sink): "memory" ) to make the compiler redo the work in a loop. Or a std::atomic_thread_fence(acq_rel) hack that gets MSVC to not optimize away the loop either. On my i7-6700k with GCC 11.1, x86-64 GNU/Linux, energy_performance_preference = performance, I got this to run at one iteration per 5 cycles.

IDK why it doesn't run at one per 4c; I tweaked GCC options to avoid the JCC erratum slowdown without padding, and to have the loop in hopefully 4 uop cache lines. (6 uops, 1 uop ended by a 32B boundary, 6 uops, 2 uops ended by the dec/jnz). Perf counters say the front-end was "ok", and uops_dispatched_port shows all 4 ALU ports at less than 4 uops per iteration, highest being port0 at 3.34. Manually padding the early instructions gets it down to 3 total lines, of 3, 6, 6 uops but still no improvement from 5c per iter, so I guess the front-end really is ok.

LLVM-MCA seems very ambitious in projecting 3c per iter, apparently based on a wrong model of Skylake with a "dispatch" (front-end rename I think) width of 6. Even with -mcpu=haswell with a proper 4-wide model it projects 4.5c. (I used asm("# LLVM-MCA-BEGIN") etc. macros on Godbolt and included an LLVM-MCA output window for the test loop.) It doesn't have fully accurate uop->port mapping, apparently not knowing about slow-LEA running only on port 1, but IDK if that's significant.

Throughput may be limited by the ability to find instruction-level parallelism and overlap across several iterations, as in Understanding the impact of lfence on a loop with two long dependency chains, for increasing lengths

The test loop is:

#include <stdlib.h>
#ifndef __cplusplus
#include <stdalign.h>
#endif
#include <stdint.h>

#if 1 && defined(__GNUC__)
#define LLVM_MCA_BEGIN  asm("# LLVM-MCA-BEGIN")
#define LLVM_MCA_END  asm("# LLVM-MCA-END")
#else
#define LLVM_MCA_BEGIN
#define LLVM_MCA_END
#endif


#if defined(__cplusplus)
    #include <atomic>
    using std::atomic_thread_fence, std::memory_order_acq_rel;
#else
    #include <stdatomic.h>
#endif

unsigned testloop(const char str[15]){
    hmsn sink;
    for (int i=0 ; i<1000000000 ; i++){
        LLVM_MCA_BEGIN;
        str2hmsn(&sink, str);
        // compiler memory barrier 
        // force materializing the result, and forget about the input string being the same
#ifdef __GNUC__
        asm volatile("" ::"m"(sink): "memory");
#else
  //#warning happens to be enough with current MSVC
        atomic_thread_fence(memory_order_acq_rel); // strongest barrier that doesn't require any asm instructions on x86; MSVC defeats signal_fence.
#endif
    }
    LLVM_MCA_END;
    volatile unsigned dummy = sink.hours + sink.nanos;  // make sure both halves are really used, else MSVC optimizes.
    return dummy;
}



int main(int argc, char *argv[])
{
    // performance isn't data-dependent, so just use a handy string.
    // alignas(16) static char str[] = "235959123456789";
    uintptr_t p = (uintptr_t)argv[0];
    p &= -16;
    return testloop((char*)p);   // argv[0] apparently has a cache-line split within 16 bytes on my system, worsening from 5c throughput to 6.12c
}

I compiled as follows, to squeeze the loop in so it ends before the 32-byte boundary it's almost hitting. Note that -march=haswell allows it to use AVX encodings, saving an instruction or two.

$ g++ -fno-omit-frame-pointer -fno-stack-protector -falign-loops=16 -O3 -march=haswell foo.c -masm=intel
$ objdump -drwC -Mintel a.out | less

...
0000000000001190 <testloop(char const*)>:
  1190:   55                    push   rbp
  1191:   b9 00 ca 9a 3b        mov    ecx,0x3b9aca00
  1196:   48 89 e5              mov    rbp,rsp
  1199:   c5 f9 6f 25 6f 0e 00 00    vmovdqa xmm4,[rip+0xe6f]        # 2010
  11a1:   c5 f9 6f 15 77 0e 00 00    vmovdqa xmm2, [rip+0xe77]        # 2020  # vector constants hoisted
  11a9:   c5 f9 6f 0d 7f 0e 00 00    vmovdqa xmm1, [rip+0xe7f]        # 2030
  11b1:   66 66 2e 0f 1f 84 00 00 00 00 00      data16 cs nop WORD PTR [rax+rax*1+0x0]
  11bc:   0f 1f 40 00        nop    DWORD PTR [rax+0x0]
### Top of loop is 16-byte aligned here, instead of ending up with 8 byte default
  11c0:   c5 d9 fc 07        vpaddb xmm0,xmm4, [rdi]
  11c4:   c4 e2 79 04 c2     vpmaddubsw xmm0,xmm0,xmm2
  11c9:   c4 e2 79 33 d8     vpmovzxwd xmm3,xmm0
  11ce:   c5 f9 73 d8 06     vpsrldq xmm0,xmm0,0x6
  11d3:   c5 f9 f5 c1        vpmaddwd xmm0,xmm0,xmm1
  11d7:   c5 f9 7f 5d f0     vmovdqa [rbp-0x10],xmm3
  11dc:   c4 e1 f9 7e c0     vmovq  rax,xmm0
  11e1:   69 d0 10 27 00 00  imul   edx,eax,0x2710
  11e7:   48 c1 e8 20        shr    rax,0x20
  11eb:   01 d0              add    eax,edx
  11ed:   8d 14 80           lea    edx,[rax+rax*4]
  11f0:   0f b6 47 0e        movzx  eax,BYTE PTR [rdi+0xe]
  11f4:   8d 44 50 d0        lea    eax,[rax+rdx*2-0x30]
  11f8:   89 45 fc           mov    DWORD PTR [rbp-0x4],eax
  11fb:   ff c9              dec    ecx
  11fd:   75 c1              jne    11c0 <testloop(char const*)+0x30>
  # loop ends 1 byte before it would be a problem for the JCC erratum workaround
  11ff:   8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]

So GCC made the asm I had planned by hand before writing the intrinsics this way, using as few instructions as possible to optimize for throughput. (Clang favours latency in this loop, using a separate add instead of a 3-component LEA).

This is faster than any of the scalar versions that just parse X, and it's parsing HH, MM, and SS as well. Clang's auto-vectorization of convert3 may give this a run for its money in that department, but it strangely doesn't do that when inlining.

GCC's scalar convert3 takes 8 cycles per iteration. clang's scalar convert3 in a loop takes 7, running at 4.0 fused-domain uops/clock, maxing out the front-end bandwidth and saturating port 1 with one imul uop per cycle. (This is reloading each byte with movzx and storing the scalar result to a stack local every iteration. But not touching the HHMMSS bytes.)

$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,idq.mite_uops,idq_uops_not_delivered.cycles_fe_was_ok -r1 ./a.out

 Performance counter stats for './a.out':

          1,221.82 msec task-clock                #    1.000 CPUs utilized          
                 0      context-switches          #    0.000 /sec                   
                 0      cpu-migrations            #    0.000 /sec                   
               105      page-faults               #   85.937 /sec                   
     5,079,784,301      cycles                    #    4.158 GHz                    
    16,002,910,115      instructions              #    3.15  insn per cycle         
    15,004,354,053      uops_issued.any           #   12.280 G/sec                  
    18,003,922,693      uops_executed.thread      #   14.735 G/sec                  
         1,484,567      idq.mite_uops             #    1.215 M/sec                  
     5,079,431,697      idq_uops_not_delivered.cycles_fe_was_ok #    4.157 G/sec                  

       1.222107519 seconds time elapsed

       1.221794000 seconds user
       0.000000000 seconds sys

Note that this is for 1G iterations, so 5.08G cycles means 5.08 cycles per iteration average throughput.

Removing the extra work to produce the HHMMSS part of the output (vpsrldq, vpmovzxwd, and the vmovdqa store), leaving just the 9-digit integer part, it runs at 4.0 cycles per iteration on Skylake. Or 3.5 without the scalar store at the end. (I edited GCC's asm output to comment out that instruction, so I know it's still doing all the work.)

The fact that there's some kind of back-end bottleneck here (rather than front-end) is probably a good thing for overlapping this with independent work.
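
For reference, a source-level sketch of such a nanoseconds-only variant (an untested sketch, not what was measured above, which came from editing the asm; it assumes the same headers and at least SSSE3) could load just the 8 leading nanosecond digits from str+6 and handle the 9th as a scalar:

unsigned nanos_only(const char str[15])   // sketch: parse only the 9 X digits
{
    __m128i digs = _mm_loadl_epi64((const __m128i*)(str + 6));   // 8 digit bytes; upper half of the register zeroed
    digs = _mm_sub_epi8(digs, _mm_set1_epi8('0'));
    __m128i pairs  = _mm_maddubs_epi16(digs, _mm_set1_epi16(10U + (1U<<8)));          // 4 words of 2-digit values
    __m128i dwords = _mm_madd_epi16(pairs, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0)); // 2 dwords of 4-digit values; the zero multipliers wipe the junk from the zeroed upper half
    uint64_t pair32 = _mm_cvtsi128_si64(dwords);
    uint32_t first8 = 10000 * (uint32_t)pair32 + (uint32_t)(pair32 >> 32);            // first 8 digits
    return first8 * 10 + ((unsigned char)str[14] - '0');                              // append the 9th digit
}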


Alternate version using BMI2 rorx

@aqrit's answer on SIMD string to unsigned int parsing in C# performance improvement inspired a version that allows the remaining high * 2 part to be done as part of an LEA instead of a scalar ADD, using the movq strategy instead of pshufd/paddd. After coaxing GCC into emitting RORX to copy-and-extract instead of a braindead 2x vmovq r64, xmm0, that gets us down to 14 front-end uops (down from 16) and 17 unfused-domain uops (down from 18). (clang deoptimizes to mov+shr). Godbolt

// BMI2 version, compiles to efficient asm with GCC11
void str2hmsn_rorx(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    const __m128i mul1 = _mm_set_epi16(0, 0x010A, 0x0A64, 0x14C8, 0x14C8 /* nanos 7 .. 0 */, 0x010A, 0x010A, 0x010A /* SS, MM, HH */);
    const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8);  // extra scaling for the more-significant half baked in to save an imul

    //__m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places in printing order.
    __m128i hms_x_words = _mm_maddubs_epi16(mul1, digs);    // mul1 as the unsigned operand (first)

    // or scalar extract with _mm_cvtsi128_si64 (movq) instead of unpack, and shift / movzx
    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);             // SSE4.1 pmovxzwd   hours, minutes, seconds unpack from u16 to u32
    _mm_storeu_si128((__m128i*)out, hms_unpacked);

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

//  __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    __m128i xdwords = _mm_madd_epi16(xwords, mul2);   // low/high uint32 chunks, without the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
//  uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
//  uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
//  uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit

    uint32_t msd = 2 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low bits of qword on little-endian x86.
//  uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t first8_x = msd + (uint32_t)_lrotr(pair32, 32);  // try to get it to use rorx to copy-and-extract the high half
    // FIXME: _rotr64 not available everywhere, but _lrotr is 32-bit on Windows.

    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
}

(_lrotr requires GCC11 or later. And on Windows it's a 32-bit rotate. But _rotr64 isn't available everywhere. On earlier GCC, look for a different intrinsic or rotate idiom that convinces the compiler to use rorx dst, src, 32 instead of mov+shr.)
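
For earlier compilers, one portable rotate idiom (a sketch; GCC and clang recognize this shift/or pattern as a rotate, and with BMI2 enabled they can use rorx to copy-and-extract the high half):

static inline uint64_t rotr64_portable(uint64_t x, unsigned n) {
    n &= 63;                                    // keep both shift counts in range
    return (x >> n) | (x << ((64 - n) & 63));   // well-defined even for n == 0
}
// usage in place of _lrotr(pair32, 32):
//   uint32_t first8_x = msd + (uint32_t)rotr64_portable(pair32, 32);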

Inlined into testloop() in the Godbolt link (which can hoist the constants out of the loop, but forces the work to happen repeatedly), uiCA (https://uica.uops.info/) predicts that Skylake could run it at one iteration per approximately 3.78 cycles, including a dec/jnz at the bottom of the loop and a store of the result, but no pointer increment. (uiCA is significantly more accurate than LLVM-MCA)

Ice Lake / Rocket Lake might run this at one iter per 3.14 cycles.

Peter Cordes
  • 2
    This is the type of insane solution that I expect, thanks! What kind of book do I need to start learning things like these? – Huy Le Dec 21 '21 at 10:01
  • 1
    I only care about assembly/intrinsic for x86. Is there a practical optimization book on that topic? – Huy Le Dec 21 '21 at 10:17
  • I guess I need to start with computer architecture of modern x86-64 cpu first? – Huy Le Dec 21 '21 at 10:34
  • 4
    @HuyLe: https://agner.org/optimize/ is where I learned about microarchitecture details like how the uop cache works in Sandybridge-family, and https://www.realworldtech.com/haswell-cpu/. Also https://uops.info/ is a better version of Agner's instruction tables, with automated result gathering that avoids Agner's occasional typos and omissions. (But Agner's hand testing has found performance effects that Andreas Abel had to update uops.info to look for.) Anyway, more stuff in https://stackoverflow.com/tags/x86/info, and the [tag:x86] / [tag:cpu-architecture] tags on SO, like JCC erratum stuff – Peter Cordes Dec 21 '21 at 16:20
  • 1
    Better multipliers could replace the `100*100` imul with an add. See [here](https://stackoverflow.com/a/66430672). – aqrit Dec 21 '21 at 21:22
  • 2
    @aqrit: Thanks, that actually allows the remaining `high * 2` part to be done as part of an LEA instead of scalar ADD, using that movq strategy instead of pshufd/paddd. After coaxing GCC into emitting RORX to copy-and-extract instead of a braindead 2x `vmovq r64, xmm0`, that gets us down to 4.58c per iter (14 front-end uops, down from 16, and unfused domain uops 17 down from 18). (clang deoptimizes to mov+shr). https://godbolt.org/z/6oxzY4EWK Will update the answer at some point. – Peter Cordes Dec 22 '21 at 02:40
  • @PeterCordes Is there a reason the nano conversion needs to be dependent on the computation for HMS? Why can you independently start the `xwords` portion with `vmovq` with either a reload from memory or cast from initial load? – Noah Dec 25 '21 at 20:02
  • @Noah: we need byte pairs -> words, and word pairs -> dwords. HMS can fork off from that chain after bytes -> words with `pmaddubsw`, but the 8-digit part of nanoseconds has to continue. The only thing it's costing us is a `psrldq` as part of the dep chain, as opposed to doing another separate `pmaddubsw`. (Two shorter dep chains might be better throughput for a loop that does only this; I think we're limited by ROB / RS capacity to find ILP, not by front-end or port01 throughput, but haswell has lower `pmadd` throughput, only one port, and I figured this was more balanced for ports.) – Peter Cordes Dec 26 '21 at 03:10
  • @PeterCordes For some reason this code crashes on an AMD CPU. It says "illegal instruction"? I forgot to update you, sorry – Huy Le Jan 11 '22 at 02:18
  • 1
    @HuyLe: On *which* AMD CPU? If compiled with `-march=haswell`, this uses AVX1 and BMI2 (RORX), so it requires at least Zen1 or Haswell. But it can be compiled for just SSE4.1, or with a small change SSSE3, if you want, so obviously compile with `-march=native` to make asm that can run on your CPU. The only hint in the question about what ISA extensions to target was your mention of a 8750H laptop, which is Skylake-family, so I didn't worry much about obsolete microarchitectures when talking about how it will compile. – Peter Cordes Jan 11 '22 at 02:30
  • I think it's ryzen 4750G pro, but the code is run inside a virtual machine. I will try to test it on a native machine instead of VM – Huy Le Jan 11 '22 at 08:25
  • 2
    @HuyLe: Or fix your VM config to pass through the instruction-set extensions you compiled for. Many default to disabling a lot of stuff to enable migration of VMs to older CPUs. – Peter Cordes Jan 11 '22 at 09:04
  • `taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any -r1 ./insane` So this command gives `event syntax error: '..structions,uops_issued.any'\___ parser error` when I add `uops_issued.any` and later. I'm using `perf 5.13.19`; `uops_issued` doesn't show up in `perf list` – Huy Le Apr 29 '22 at 03:07
  • 1
    @HuyLe: That event is specific to Intel CPUs. It does show up in `perf list` on my Skylake. I wasn't trying to choose a set of events that would be portable, just interesting ones on my CPU. A bit strange that perf reports a *parser* error, rather than something about the event simply not being found, but that's probably it, especially since it's not in your `perf list` output. Anyway, pick some useful / relevant events for your Zen. `uops_issued.any` on Intel counts issue slots, so micro-fused `add eax, [mem]` counts as one there, but two uops for `uops_executed.thread` in the back-end. – Peter Cordes Apr 29 '22 at 03:09
  • 1
    The IDQ is the instruction decode queue that feeds the issue stage, MITE is the legacy-decode pathway (not the uop cache). Skylake has some microcode workarounds for errata that can defeat the uop cache, so it's always good to check while benchmarking that your code isn't falling back to legacy decode. `idq_uops_not_delivered.cycles_fe_was_ok` counts cycles where the front-end had uops ready for the back-end, but the back-end didn't take them. For a short loop, it confirms the front-end is keeping up, so the bottleneck is in the back-end. Large progs can have a mix of FE/BE bottlenecks. – Peter Cordes Apr 29 '22 at 03:12
  • `error: ‘_lrotr’ was not declared in this scope` I get this on gcc 10.2.0, Ubuntu 20.04. Is it Windows-only? – Huy Le Jun 02 '23 at 11:15
  • 1
    @HuyLe: No, I use (Arch) Linux on my desktop. And it compiles in the Godbolt link in my answer which is also on Linux. IDK, maybe some older version of GCC doesn't define it in `immintrin.h`? Apparently it was new in GCC11, according to https://godbolt.org/z/14q6T44P1 . Pick any 64-bit rotate idiom you like, as long as it gets your compiler to use `rorx` to get the high half of one 64-bit register into the low 32 of another; see [Best practices for circular shift (rotate) operations in C++](https://stackoverflow.com/q/776508) – Peter Cordes Jun 02 '23 at 15:25
  • 1
    @HuyLe: I had that `_lrotr` version sitting around in an editor since probably 2021 when I replied to aqrit. I finally got around to editing it into the answer, at least as an update instead of rewriting the whole answer to be about it initially. I'd forgotten about the `FIXME` comment in the code about `_lrotr` not being fully portable (e.g. to Windows). Maybe try `_rotr64` if your GCC has it. – Peter Cordes Jun 02 '23 at 15:58

An alternative candidate

Use unsigned math to avoid the UB of int overflow, and to allow taking all the - 48 adjustments out and folding them into a single constant.

const unsigned p[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
    100000000};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0] * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5] + a[4] * p[4]
      + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8] - (unsigned) 5333333328u;
}
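
Where the magic constant comes from: each of the nine digit bytes carries an ASCII bias of '0' (48) scaled by its place value, so the total bias is 48 * 111111111 = 5333333328, subtracted once at the end. A compile-time check of that (a sketch using a hypothetical helper, not part of the code above) could look like:

constexpr unsigned long long ascii_bias_9digits() {
    unsigned long long bias = 0, place = 1;
    for (int i = 0; i < 9; ++i) { bias += 48ull * place; place *= 10; }  // 48 at each place value
    return bias;                                                          // 48 * 111111111
}
static_assert(ascii_bias_9digits() == 5333333328ull, "bias matches the magic constant");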

Also try ordering p[] to match a[] (most significant place first). Perhaps it is easier to calculate in parallel. I see no down-side to re-ordering.

const unsigned p[9] = {100000000, 10000000, ..., 1};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0]*p[0] + a[1]*p[1] + ... + a[7]*p[7] + a[8] - (unsigned) 5333333328u;
}
chux - Reinstate Monica
  • I've benchmarked it and unfortunately it has the same speed as the regular version. – Huy Le Dec 20 '21 at 12:56
  • @HuyLe Maybe the 2nd idea about ordering helps? IAC, an interesting problem. Hope you squeeze out a few % improvement somehow. – chux - Reinstate Monica Dec 20 '21 at 13:02
  • I just tried it and re-ordering gives the same speed. This is because the assembly replaces the const array accesses with actual numbers (i.e. it replaces a[0] * p[0] with a[0] * 10^8 directly). So the order does not matter – Huy Le Dec 20 '21 at 13:08
  • 2
    @HuyLe To be fair, the order does not make a difference with your compiler/implementation used today. This applies to [using characters or digits](https://stackoverflow.com/questions/70420948/most-insanely-fastest-way-to-convert-9-char-digits-into-an-int-or-unsigned-int/70422026?noredirect=1#comment124483156_70420948) too. That is the trouble with such [micro-optimizations](https://softwareengineering.stackexchange.com/questions/80084/is-premature-optimization-really-the-root-of-all-evil). With next year's processor/compiler, the results may differ. – chux - Reinstate Monica Dec 20 '21 at 13:12
  • 2
    Okay I've changed the benchmark a bit and the unsigned version is 1-2% faster than convert3() most of the time. I think I need to set up a good benchmark first. – Huy Le Dec 20 '21 at 13:13
  • @HuyLe "good benchmark first" --> Perhaps only test random _digits_ and not _characters_. Maybe as you say, it makes no difference in past tests, but if a new approach came along where it did make a difference, it really is only the random _digit_ one that is important. – chux - Reinstate Monica Dec 20 '21 at 13:16
  • a[i] = rand() was a mistake on my part. Even my code assumes a[i] must be in the range '0' to '9', else it'll overflow. That has been fixed and I'm running a long benchmark. – Huy Le Dec 20 '21 at 13:26
  • `(unsigned) 5333333328u` -- belt and suspenders? – Pete Becker Dec 20 '21 at 19:36
  • @PeteBecker Does look like it. The cast in `(unsigned) 5333333328u` is to narrow the 33-bit constant to an `unsigned` when `unsigned` is 32-bit and so avoid wider math. `5333333328` comes from `'0' + 10*'0' + 100*'0' .... 100000000*'0'`. – chux - Reinstate Monica Dec 20 '21 at 20:06
  • @chux-ReinstateMonica: you might be interested in my answer. I got around to trying SIMD, and AVX on Skylake lets us parse the whole HHMMSSXX...XX string at one per 5 cycles throughput, storing results to memory. Latency is probably worse, but I was optimizing for throughput since timestamps are usually just data. – Peter Cordes Dec 21 '21 at 07:49

You don't necessarily need to use special SIMD instructions to do computation in parallel. By using a 64-bit unsigned integer, we can process eight of the nine bytes in parallel and then treat the ninth as a one-off at the end.

constexpr std::uint64_t zeros(char z) {
    std::uint64_t result = 0;
    for (int i = 0; i < sizeof(result); ++i) {
        result = result*256 + z;
    }
    return result;
}

unsigned convertX(const char *a) {
    constexpr std::uint64_t offset = zeros('0');
    constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
    constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
    constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;

    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));
    const auto bytes = buffer - offset;
    const auto b1 = (bytes & o1) >> 8;
    const auto words = (bytes & ~o1)*10 + b1;       // each 16-bit word: a 2-digit value in printing order
    const auto w1 = (words & o2) >> 16;
    const auto dwords = (words & ~o2)*100 + w1;     // each 32-bit dword: a 4-digit value
    const auto d1 = (dwords & o3) >> 32;
    const auto qwords = (dwords & ~o3)*10000 + d1;  // 8-digit value from the first 8 chars

    const auto final = 10*static_cast<unsigned>(qwords) + (a[8] - '0');  // append the 9th digit
    return static_cast<unsigned>(final);
}

I tested with MS Visual C++ (64-bit), and the benchmark time for this solution was just over 200 ms versus all of the others which came in right at 400 ms. This makes sense since it uses about half of the multiply and add instructions that the "normal" solution does.

I know the memcpy looks wasteful, but it avoids undefined behavior and alignment problems.
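
On that last point: with optimizations on, mainstream compilers turn a fixed-size 8-byte memcpy into a single 64-bit load, so the safety costs nothing. A minimal sketch of the idiom in isolation (an illustration, not additional code from the benchmark):

std::uint64_t load8(const char* a) {           // defined-behavior type-punned load
    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));   // optimizers emit a single 64-bit load here
    return buffer;
    // By contrast, *reinterpret_cast<const std::uint64_t*>(a) would violate strict
    // aliasing and could be a misaligned access.
}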

Adrian McCarthy
  • I just benchmarked again and updated the post. The manual SIMD solution is still much faster (and actually does more things) – Huy Le Apr 29 '22 at 02:43