#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <limits>  // std::numeric_limits, used by parse1
#include <cstring>
#include <immintrin.h>
#include <vector>
#include <time.h>
using namespace std;
                    
class MyTimer {
 private:
  std::chrono::time_point<std::chrono::steady_clock> starter;

 public:
  void startCounter() {
    starter = std::chrono::steady_clock::now();
  }

  int64_t getCounterNs() {    
    return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
  }
};
                    
//########################
//########################
using ConvertFunc = uint64_t(const char *);

inline uint64_t parse1(const char *s) {
  struct tm tm;
  memset(&tm, 0, sizeof(tm));
  if (strptime(s, "%Y%m%d %H%M%S", &tm)) {
    return 1000000UL * ((1900 + tm.tm_year) * 10000 + (tm.tm_mon + 1) * 100 + tm.tm_mday) +
        tm.tm_hour * 10000 + tm.tm_min * 100 + tm.tm_sec;
  }
  
  return std::numeric_limits<uint64_t>::max();
}

inline uint64_t parse2(const char *s) {
  static const uint64_t mul[16] = {
    // one weight per byte of "YYYYMMDD HHMMSS\0": the space (index 8) and
    // the '\0' (index 15) get weight 0, so they drop out of the sum
    10'000'000'000'000UL, 1'000'000'000'000UL, 100'000'000'000UL, 10'000'000'000UL, 1'000'000'000UL, 100'000'000, 10'000'000, 1'000'000,
    0,
    100'000, 10'000, 1'000, 100, 10, 1,
    0
  };
  
  uint64_t res = 0;
  for (int i = 0; i < 16; i++) {
      res += mul[i] * (s[i] - '0');
  }
  
  return res;
}

inline uint64_t parse3(const char *s) {
  static const int date_mul[] = { 10'000'000, 1'000'000, 100'000, 10'000, 1'000, 100, 10, 1 };
  static const int time_mul[] = { 100'000, 10'000, 1'000, 100, 10, 1 };
  
  int date_val = 0;
  for (int i = 0; i < 8; i++) date_val += date_mul[i] * (s[i] - '0');
  
  int time_val = 0;
  for (int i = 0; i < 6; i++) time_val += time_mul[i] * (s[9 + i] - '0');
  
  return uint64_t(date_val) * 1'000'000UL + time_val;
}

inline uint64_t parse3a(const char *s) {
  static const int left_mul[] = { 10'000'000, 1'000'000, 100'000, 10'000, 1'000, 100, 10, 1 };
  static const int right_mul[] = { 0, 100'000, 10'000, 1'000, 100, 10, 1, 0 };
  
  int date_val = 0;
  for (int i = 0; i < 8; i++) date_val += left_mul[i] * (s[i] - '0');
  
  int time_val = 0;
  for (int i = 0; i < 8; i++) time_val += right_mul[i] * (s[8 + i] - '0');
  
  return uint64_t(date_val) * 1'000'000UL + time_val;
}

inline uint64_t parse4(const char *s) {
  // load all 16 bytes "YYYYMMDD HHMMSS\0" at once (reading s[15] is legal
  // per the problem statement)
  __m128i chunk = _mm_lddqu_si128(
    reinterpret_cast<const __m128i*>(s)
  );
  
  // turn ASCII digits into their values; the space and '\0' become junk
  // bytes, but the weight-0 multipliers below zero them out
  __m128i zeros =  _mm_set1_epi8('0');
  chunk = _mm_sub_epi8(chunk, zeros);

  {
    // multiply each byte by its weight and add adjacent pairs:
    // each u16 lane now holds a 1- or 2-digit chunk
    const auto mult = _mm_set_epi8(
      0, 1, 1, 10, 1, 10, 1, 0, 1, 10, 1, 10, 1, 10, 1, 10
    );
    chunk = _mm_maddubs_epi16(chunk, mult);
  }

  {
    // combine adjacent lanes: the date becomes {YYYY, MMDD}, the time
    // becomes its first and last three digits {HHM, MSS}
    //const __m128i mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
    const auto mult = _mm_set_epi16(1, 10, 1, 100, 1, 100, 1, 100);
    chunk = _mm_madd_epi16(chunk, mult);
  }

  {
    // pack the 32-bit chunks down to 16 bits, then combine:
    // YYYY*10000 + MMDD = YYYYMMDD and HHM*1000 + MSS = HHMMSS
    chunk = _mm_packus_epi32(chunk, chunk);
    const auto mult = _mm_set_epi16(0, 0, 0, 0, 1, 1000, 1, 10000);
    chunk = _mm_madd_epi16(chunk, mult);
  }

  // low 32 bits = YYYYMMDD, next 32 bits = HHMMSS
  return ((chunk[0] & 0xffffffff) * 1000000) + (chunk[0] >> 32);
}
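// Note: indexing chunk[0] on a __m128i is a GNU C extension (GCC/clang only).
// If MSVC portability mattered, a sketch of an equivalent final line would use
// the SSE2 low-64-bit extract (an untested alternative, not the code benchmarked here):
//   uint64_t t = uint64_t(_mm_cvtsi128_si64(chunk));
//   return (t & 0xffffffff) * 1000000 + (t >> 32);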

volatile int result = 0; // consume each function result so the compiler can't optimize the calls away
template <ConvertFunc converter>
void benchmark(string name, int numTest=1000) {
    MyTimer timer;
    const int N = 100000;
    char *a = new char[16*N + 64];
    int64_t runtime = 0;
        
    int warmup = 100;

    for (int t = 1; t <= warmup + numTest; t++) {
        // change input data to prevent unexpected optimization
        for (int i = 0; i < 16 * N; i += 16) {
          for (int j = 0; j < 8; j++) a[i + j] = rand() % 10 + '0';
          a[i + 8] = ' ';
          for (int j = 0; j < 6; j++) a[i + 9 + j] = rand() % 10 + '0';
          a[i + 15] = '\0';
        }

        if (t > warmup) timer.startCounter();
        for (int i = 0; i < 16 * N; i += 16) result = converter(a+i);
        if (t > warmup) runtime += timer.getCounterNs();
    }
    cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
    delete[] a;
}

void correct_test()
{
  //20141103 012910
  string s = "20141103 012910";
  vector<uint64_t> result;
  result.push_back(parse1(s.c_str()));
  result.push_back(parse2(s.c_str()));
  result.push_back(parse3(s.c_str()));
  result.push_back(parse3a(s.c_str()));
  result.push_back(parse4(s.c_str()));
  
  for (size_t i = 0; i + 1 < result.size(); i++)
    if (result[i] != result[i + 1]) {
      cout << "Wrong at i = " << i << "\n";
      cout << result[i] << "|" << result[i+1] << "|\n";
      exit(1);
    }
}

int main() {
    correct_test();
    cout << "test correct\n";
    benchmark<parse1>("slow");
    benchmark<parse2>("unroll");    
    benchmark<parse3>("dual_unroll");
    benchmark<parse3a>("dual_unroll2");
    benchmark<parse4>("manual_sse");

    return 0;
}

Below is the program to measure in detail the number of cycles per iteration.

// Run command: taskset -c 7 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,idq.mite_uops,idq_uops_not_delivered.cycles_fe_was_ok -r1 ./bench
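// The LLVM_MCA_BEGIN/END markers below are meant for llvm-mca. A plausible
// invocation (the file name and -mcpu value are assumptions; adjust for your CPU):
//   clang++ -O3 -mavx2 -std=c++17 -S -o - bench.cpp | llvm-mca -mcpu=znver3 -timeline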

#include <stdlib.h>
#ifndef __cplusplus
#include <stdalign.h>
#endif
#include <stdint.h>

#if 1 && defined(__GNUC__)
#define LLVM_MCA_BEGIN  asm("# LLVM-MCA-BEGIN")
#define LLVM_MCA_END  asm("# LLVM-MCA-END")
#else
#define LLVM_MCA_BEGIN
#define LLVM_MCA_END
#endif

#include <cstring>
#include <time.h>
#include <immintrin.h>
#include <iostream>

// copy the function you want to test here: parse4
inline uint64_t parse4(const char *s) {
  __m128i chunk = _mm_lddqu_si128(
    reinterpret_cast<const __m128i*>(s)
  );
  
  __m128i zeros =  _mm_set1_epi8('0');
  chunk = _mm_sub_epi8(chunk, zeros);

  {
    const auto mult = _mm_set_epi8(
      0, 1, 1, 10, 1, 10, 1, 0, 1, 10, 1, 10, 1, 10, 1, 10
    );
    chunk = _mm_maddubs_epi16(chunk, mult);
  }

  {
    //const __m128i mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
    const auto mult = _mm_set_epi16(1, 10, 1, 100, 1, 100, 1, 100);
    chunk = _mm_madd_epi16(chunk, mult);
  }

  {
    chunk = _mm_packus_epi32(chunk, chunk);
    const auto mult = _mm_set_epi16(0, 0, 0, 0, 1, 1000, 1, 10000);
    chunk = _mm_madd_epi16(chunk, mult);
  }

  return ((chunk[0] & 0xffffffff) * 1000000) + (chunk[0] >> 32);
}

#if defined(__cplusplus)
    #include <atomic>
    using std::atomic_thread_fence, std::memory_order_acq_rel;
#else
    #include <stdatomic.h>
#endif

uint64_t testloop(const char str[16]) {
    uint64_t result = 0;
    for (int i=0 ; i<1000000000 ; i++){
        LLVM_MCA_BEGIN;
        result = parse4(str);
        // compiler memory barrier 
        // force materializing the result, and forget about the input string being the same
#ifdef __GNUC__
        asm volatile("" ::"m"(result): "memory");
#else
  //#warning happens to be enough with current MSVC
        atomic_thread_fence(memory_order_acq_rel); // strongest barrier that doesn't require any asm instructions on x86; MSVC defeats signal_fence.
#endif
    }
    LLVM_MCA_END;
    volatile uint64_t dummy = result;  // make sure both halves are really used, else MSVC optimizes.
    return dummy;
}

int main(int argc, char *argv[])
{
    // performance isn't data-dependent, so just use a handy string.
    // alignas(16) static char str[] = "235959123456789";
//    uintptr_t p = (uintptr_t)argv[0];
//    p &= -16;
    alignas(16) static const char str[] = "20141103 012910";
    std::cout << testloop(str) << "\n";
    return 0;
//    return testloop((char*)p);   // argv[0] apparently has a cache-line split within 16 bytes on my system, worsening from 5c throughput to 6.12c
}


The function needs to convert a 15-character string (two parts: 8 digit chars, a space, 6 digit chars) into a number. Example: "20141103 012910" -> 20141103012910. Basically, it's just string-to-int after removing the middle space.

The string is guaranteed to have length 15 and to be null-terminated, so we can access s[15] without worrying about a segfault (this is also what makes a single 16-byte SSE load legal). But we're not allowed to read/write s[16] and beyond. We also ignore the case where the timestamp string is invalid (containing a non-digit char, for example).

The current solutions, commands to run, and benchmark results are described below. How can I make it faster? Related question: Most insanely fast way to convert 9 char digits into an int or unsigned int

Solution 1: use strptime from the C library. This one is slow, I think because it performs a lot of validation.

Solution 2: assuming the input is always valid, we can "unroll" the conversion into one weighted sum.

Solution 3: the output is uint64_t, but it's made up of two uint32_t-sized parts. So we can compute them separately using uint32_t and only widen to uint64_t at the end.

Solution 3a: the second half has 6 chars, which is awkward because 6 doesn't divide by 4. So we include the middle space and the null terminator in the computation, in the hope that the compiler can do some SIMD magic and make it faster (it doesn't).

Solution 4: manual SSE, modified from here: https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html . I'm not sure whether we can go faster than this.

Fastest fast way to parse date time timestamp SIMD (I added this to make the question easier to Google)


Command to run (change 7 to whichever CPU core is idle):

g++ -o main main.cpp -O3 -std=c++17 -mavx2
numactl --membind=0 taskset --cpu-list 7 ./main

Result on AMD EPYC 75F3 32-Core Processor, 2950 MHz:

test correct
slow: 46.3384ns average
unroll: 6.25358ns average
dual_unroll: 5.22005ns average
dual_unroll2: 5.24976ns average
manual_sse: 0.891766ns average
Huy Le
  • Have you considered just replacing the space with a `'0'` and using `strtol`? – dbush Mar 09 '23 at 03:54
  • This reminds me of some [testing I did](https://stackoverflow.com/a/16826908/1553090) a while back, which surprised me. I tried all kinds of tricks, hacks and clever ideas, but nothing beat the dumbest approach. I would recommend giving up on your table of multipliers and adapt my naive solution to this problem. You can unroll the loops yourself. Maybe it's possible to do all this with SIMD. You could try loading the string into a 16-byte register and subtracting the `'0'` in one instruction, but you still have to unpack that and do multiplies / adds. Try the simplest way and see if it helps. – paddy Mar 09 '23 at 03:57
  • input is `const char*`, unfortunately. But I just tested `strtol`, and the result is `23.3559ns average`, so still much slower than other solutions. – Huy Le Mar 09 '23 at 04:07
  • I tried a full unroll out of interest, and it's almost exactly the same as the two dual_unroll methods in an optimised build. I guess the compiler is clever enough to unroll the loops itself. – Jonathan Potter Mar 09 '23 at 04:15
  • Are you interested in AVX2 or SSE intrinsics? Similar problem to your previous question [Most insanely fastest way to convert 9 char digits into an int or unsigned int](https://stackoverflow.com/a/70432059) (where my answer uses SIMD as @paddy suspected was possible). Surprised you didn't link it as a related question in this post. – Peter Cordes Mar 09 '23 at 04:27
  • Wait, what? Small tens of *nanoseconds* is not fast enough? What are you trying to accomplish? – Steve Friedl Mar 09 '23 at 04:44
  • Yeah, with one second resolution, why bother about speed? – Martin James Mar 09 '23 at 05:27
  • @SteveFriedl: Presumably parsing large amounts of text with many of these timestamps. This is the same sort of goal as libsimdjson, spending less CPU time (and less energy) parsing the same amount of text. https://github.com/simdjson/simdjson and Daniel Lemire's blog about it: https://lemire.me/blog/2021/09/25/new-release-of-the-simdjson-library-version-1-0/. Also https://lemire.me/blog/2022/05/25/parsing-json-faster-with-intel-avx-512/ - AVX-512 gave a big 30% speedup over an already fast previous version. If you do something billions of times, nanoseconds add up. – Peter Cordes Mar 09 '23 at 06:03
  • I imagine that the intended purpose is to process tons of time-stamped data. It can make a difference, but once the date parsing is orders of magnitude faster than I/O routines, database insertion or whatever, it passes the point of diminishing returns. – paddy Mar 09 '23 at 06:03
  • @SteveFriedl it's for parsing data, like SIMD JSON library. When the data is big enough (I've benchmarked), any improvement becomes noticeable – Huy Le Mar 09 '23 at 06:21
  • @PeterCordes yes the questions are on the same topic. But it looks different enough to be separate questions I think? – Huy Le Mar 09 '23 at 06:37
  • Well yeah of course it's a separate question, not an edit to or duplicate of the old one, but linking a *related* question often makes sense; a future reader might not know both questions exist, and their searching might land them at the one, even if their actual problem is more similar to the other. Might also give potential answerers a starting point for some ideas to apply here. – Peter Cordes Mar 09 '23 at 06:52
  • @PeterCordes oh I agree. I've edited the post to mention the other question – Huy Le Mar 09 '23 at 06:56
  • @HuyLe Just out of curiosity... Where will data come from in your real application? And what is the calculated result used for? Of course, I can't be sure, but I have a strong feeling that you are optimizing on something that has little significance in the overall picture. – Support Ukraine Mar 09 '23 at 08:15
  • If your goal in converting to integer is to be able to use numerical comparison, then you could save a lot of computation by encoding as [BCD](https://en.wikipedia.org/wiki/Binary-coded_decimal) instead, which preserves the ordering. In essence, instead of converting `"20141103 012910"` to the number `20141103012910`, you convert it to `0x20141103012910`. This requires no multiplication at all. – Nate Eldredge Mar 14 '23 at 04:30
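A minimal sketch of the BCD encoding Nate Eldredge suggests above (a hypothetical helper, not from the question; it assumes the fixed "YYYYMMDD HHMMSS" layout and packs each digit into one hex nibble, so ordering is preserved with no multiplies):

inline uint64_t parse_bcd(const char *s) {
  uint64_t res = 0;
  for (int i = 0; i < 15; i++) {
    if (i == 8) continue;                    // skip the middle space
    res = (res << 4) | uint64_t(s[i] - '0'); // one digit -> one nibble
  }
  return res;                                // "20141103 012910" -> 0x20141103012910
}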

2 Answers


clang is able to auto-vectorize parse2 starting from version 13.0.0, and parse3 and parse3a from version 14.0.0. Additionally, clang 15.0.0 seems to bring further code-generation changes to these auto-vectorizations.

At first, I could not get clang to auto-vectorize your benchmarking code (only the functions themselves), and neither -Rpass-missed=loop-vectorize nor -Rpass-analysis=loop-vectorize helped me find the reason. Thankfully, gcc gives much more verbose information with -fopt-info-vec-missed than anything clang provides. With that I was able to find that the auto-vectorizer did not like the volatile int result variable:

<source>:108:53: missed: not vectorized: volatile type: result ={v} _24;

I removed volatile and replaced it with the more idiomatic asm volatile("" ::"r,m"(result): "memory"), aka google benchmark's benchmark::DoNotOptimize (a slightly different variation is discussed in the linked answer, written by Peter Cordes). With these changes clang was able to auto-vectorize the functions mentioned earlier: godbolt link here.
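A minimal sketch of that change, applied to the inner loop of the question's benchmark() (the asm constraint is the one quoted above; GCC/clang only):

    uint64_t result;  // plain local instead of the global volatile int
    for (int i = 0; i < 16 * N; i += 16) {
        result = converter(a + i);
        // benchmark::DoNotOptimize-style barrier: forces the compiler to
        // materialize result without blocking vectorization of converter
        asm volatile("" ::"r,m"(result) : "memory");
    }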

I am not proficient enough with vectorized assembly to understand what clang is doing, but I did note a noticeable performance improvement, at least compared to gcc. Also, clang seems to auto-vectorize parse2 best: parse2 always beats parse3 and parse3a.

Results on WSL (virtual environment) with AMD Ryzen 5 5600 6-Core Processor, 3.5/4.4 GHz:

gcc-11.3.0 -O3 -march=native:

test correct
slow: 41.3296ns average
unroll: 5.94719ns average
dual_unroll: 4.58844ns average
dual_unroll2: 4.57958ns average

clang-14.0.0 -O3 -march=native:

test correct
slow: 40.9391ns average
unroll: 1.43401ns average
dual_unroll: 2.00796ns average
dual_unroll2: 2.14979ns average

clang-15.0.6 -O3 -march=native:

test correct
slow: 40.9575ns average
unroll: 1.42839ns average
dual_unroll: 2.88576ns average
dual_unroll2: 2.91396ns average

Unfortunately, the auto-vectorization changes in clang 15 were a small regression for parse3 and parse3a (at least on my CPU). Nevertheless, any clang version >=13.0.0 seems to provide a rather fast auto-vectorized conversion with parse2, without having to start playing around with AVX intrinsics.


Edit: Added results with manual_sse and perf outputs:

First, the manual_sse results. As many of perf's hardware metrics are unsupported under WSL, I had to switch to a physical Linux installation, which explains the sudden improvement compared to the previous results.

clang-14.0.0 -O3 -march=native:

test correct
slow: 39.3491ns average
unroll: 1.36997ns average
dual_unroll: 1.91694ns average
dual_unroll2: 2.03751ns average
manual_sse: 0.597174ns average

Impressive find with the parse4 algorithm. The auto-vectorized unroll was already quite fast, and still manual_sse manages to double the throughput.

perf output for unroll with clang-14.0.0 -O3 -march=native:

unroll: 1.36152ns average

Performance counter stats for './vectorize':

          8 081,53 msec task-clock                #    1,000 CPUs utilized          
                 0      context-switches          #    0,000 /sec                   
                 0      cpu-migrations            #    0,000 /sec                   
               517      page-faults               #   63,973 /sec                   
    35 836 824 430      cycles                    #    4,434 GHz                      (83,32%)
     1 543 320 914      stalled-cycles-frontend   #    4,31% frontend cycles idle     (83,32%)
           293 174      stalled-cycles-backend    #    0,00% backend cycles idle      (83,32%)
   112 663 502 633      instructions              #    3,14  insn per cycle         
                                                  #    0,01  stalled cycles per insn  (83,33%)
    24 710 457 304      branches                  #    3,058 G/sec                    (83,37%)
            17 924      branch-misses             #    0,00% of all branches          (83,34%)

       8,081971368 seconds time elapsed

       8,081756000 seconds user
       0,000000000 seconds sys

It is fascinating to see that the auto-vectorized parse2 code is equivalent to Peter Cordes's hand-written str2hmsn in terms of instructions per cycle.

perf output for manual_sse with clang-14.0.0 -O3 -march=native:

 manual_sse: 0.593577ns average

 Performance counter stats for './vectorize':

          7 969,91 msec task-clock                #    1,000 CPUs utilized          
                 0      context-switches          #    0,000 /sec                   
                 0      cpu-migrations            #    0,000 /sec                   
               517      page-faults               #   64,869 /sec                   
    35 342 715 779      cycles                    #    4,435 GHz                      (83,34%)
     1 543 347 968      stalled-cycles-frontend   #    4,37% frontend cycles idle     (83,34%)
            48 008      stalled-cycles-backend    #    0,00% backend cycles idle      (83,34%)
   111 231 696 104      instructions              #    3,15  insn per cycle         
                                                  #    0,01  stalled cycles per insn  (83,34%)
    24 710 482 257      branches                  #    3,100 G/sec                    (83,34%)
            16 718      branch-misses             #    0,00% of all branches          (83,31%)

       7,970375126 seconds time elapsed

       7,966130000 seconds user
       0,004001000 seconds sys

No real improvement in instructions per cycle, but double the performance. I guess this makes sense, as the generated code for parse4 has only around half as many instructions as parse2.

It should be noted that my experience with perf is very limited. I tried playing around with additional events, but the selection felt quite limited on my AMD CPU compared to what many Intel CPUs offer. Maybe someone more knowledgeable about AMD and perf could help here. I also messed around with AMD μProf, but I have nothing interesting to report from that.

Rane
  • Wow, in case of `clang-14` and `unroll` version, it's almost as fast as the manual SIMD solution in the related question (~5 CPU cycles). Can you run the `perf` command (using the second test program) to print more detailed information? I will also update the post after installing new perf/clang version to test – Huy Le Mar 09 '23 at 07:29
  • I've just updated with a new answer using manual SSE. Could you test again with clang on your hardware? Thanks! – Huy Le Mar 09 '23 at 12:36
  • @HuyLe Updated the answer! Had to throw away WSL and dig up my old Ubuntu installation and upgrade it. Having not used `perf` all that much before, it seems like an amazingly powerful tool. – Rane Mar 10 '23 at 00:22

This answer is modified from here; the problem is extremely similar: https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html

Result: manual_sse: 0.891766ns average

Compile the program below with -O0 to see how chunk changes after each step, to visualize how each step works. You have to use -O0 to see the effects of steps 5 and 6, because with -O3 the last few operations happen purely in registers (I think?), so the chunk variable in memory isn't updated.

#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
#include <vector>
#include <time.h>
using namespace std;

template <typename T>
void print_m128i(__m128i data)
{
  alignas(16) T arr[16 / sizeof(T)];
  _mm_store_si128((__m128i*)arr, data);
  for (size_t i = 0; i < 16 / sizeof(T); i++) cout << uint64_t(arr[i]) << " ";
  cout << "\n";
}

inline std::uint64_t parse4(const char* str) noexcept
{
  __m128i chunk = _mm_lddqu_si128(
    reinterpret_cast<const __m128i*>(str)
  );
  cout << "1: ";
  print_m128i<unsigned char>(chunk);
  
  __m128i zeros =  _mm_set1_epi8('0');
  chunk = _mm_sub_epi8(chunk, zeros);
  cout << "2: ";
  print_m128i<unsigned char>(chunk);

  {
    const auto mult = _mm_set_epi8(
      0, 1, 1, 10, 1, 10, 1, 0, 1, 10, 1, 10, 1, 10, 1, 10
    );
    chunk = _mm_maddubs_epi16(chunk, mult);
    cout << "3: ";
    print_m128i<unsigned char>(chunk);
  }

  {
    const auto mult = _mm_set_epi16(1, 10, 1, 100, 1, 100, 1, 100);
    chunk = _mm_madd_epi16(chunk, mult);
    cout << "4: ";
    print_m128i<uint16_t>(chunk);
  }

  {
    chunk = _mm_packus_epi32(chunk, chunk);
    cout << "5: ";
    print_m128i<uint16_t>(chunk);

    const auto mult = _mm_set_epi16(0, 0, 0, 0, 1, 1000, 1, 10000);
    chunk = _mm_madd_epi16(chunk, mult);
    cout << "6: ";
    print_m128i<uint32_t>(chunk);
  }

  cout << "left = " << (chunk[0] & 0xffffffff) << "\n";
  cout << "right = " << (chunk[0] >> 32) << "\n";
  return ((chunk[0] & 0xffffffff) * 1000000) + (chunk[0] >> 32);
}


int main()
{
  string s = "20141103 112910";
  cout << parse4(s.c_str()) << "\n";
  return 0;
}

Output:

1: 50 48 49 52 49 49 48 51 32 49 49 50 57 49 48 0
2: 2 0 1 4 1 1 0 3 240 1 1 2 9 1 0 208
3: 20 0 14 0 11 0 3 0 1 0 12 0 91 0 0 0
4: 2014 0 1103 0 112 0 910 0
5: 2014 1103 112 910 2014 1103 112 910
6: 20141103 112910 0 0
left = 20141103
right = 112910
20141103112910
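To follow the arithmetic through the steps above (numbers taken from the printed output): step 3 combines digit pairs, e.g. 2*10+0 = 20 and 1*10+4 = 14; step 4 combines those with weight 100 (or 10 for the last time pair), e.g. 20*100+14 = 2014 and 91*10+0 = 910; step 6 does the final combine, 2014*10000+1103 = 20141103 and 112*1000+910 = 112910. The return statement then computes 20141103*1000000 + 112910 = 20141103112910.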
Huy Le
  • `T* a = reinterpret_cast<T*>(&data);` is undefined behaviour for `T` other than `char` or `unsigned char`. Unless you use `__attribute__((may_alias))`, it's a violation of `-fstrict-aliasing` which is on by default in GCC/clang. See [print a __m128i variable](https://stackoverflow.com/q/13257166) - use `alignas(16) T [16/sizeof(T)];` and `_mm_store_si128` – Peter Cordes Jul 02 '23 at 15:05
  • Thanks. I've changed the answer to use memcpy instead. But is it possible to print the array without costing a small stack memory allocation (`T a[16]`) and a `memcpy`? – Huy Le Jul 03 '23 at 02:46
  • `T a[16]` over-allocates when `sizeof(T) > 1`; you only need 16 bytes, not 16 elements. Of course you don't need `memcpy` to store a `__m128i` to an array, that's what `_mm_store_si128( (__m128i*)a, data)` is for. Without using any array at all, yes, like I said, you could use `typedef T aliasing_T __attribute__((may_alias))` and `auto p = (aliasing_T*)&data;` in GNU C, or your original code in MSVC, or with `gcc -fno-strict-aliasing` but that could make your whole program less efficient. Or you could use `memcpy(&tmp, i*sizeof(T) + (const char*)&data, sizeof(T))`. – Peter Cordes Jul 03 '23 at 03:05
  • But if you want to make more than one `printf` call and reference the original `data` before each one, it's going to have to spill / reload your vector across calls, so you might as well just spill it manually to an aligned array of `T`. That's *more* efficient; it doesn't tempt the compiler into making bad code that reloads a whole `__m128i` into an XMM and then extracts a chunk of it like your original. Compilers can optimize that `_mm_store_si128` (or `memcpy` if you want) when appropriate, e.g. into a shuffle or `movd` or `movq`. – Peter Cordes Jul 03 '23 at 03:06
  • See also [Is `reinterpret_cast`ing between hardware SIMD vector pointer and the corresponding type an undefined behavior?](https://stackoverflow.com/q/52112605) re: why it's safe to use `_mm_store_si128` without strict-aliasing violations, and [Why does unaligned access to mmap'ed memory sometimes segfault on AMD64?](https://stackoverflow.com/q/47510783) re: using `__attribute__((may_alias))`. In GNU C, you could define your own vector like `typedef T __attribute__((vector_size(16)))` and assign from it, then you can index it. Might compile less efficiently than `_mm_store_si128` + reload – Peter Cordes Jul 03 '23 at 03:09
  • Ok, that works, but now your code isn't portable to MSVC since it depends on GNU extensions. And uses features most readers won't know about, and compiles to asm that's no more efficient than you'd get from simple portable `alignas(16) T arr[16/sizeof(T)]` and store + index the array. – Peter Cordes Jul 03 '23 at 03:27
  • @PeterCordes I've fixed that. Thanks! – Huy Le Jul 03 '23 at 08:43