
How do I clear the 16 - i upper bytes of a __m128i?

I've tried this; it works, but I'm wondering if there is a better (shorter, faster) way:

int i = ...  //  0 < i < 16

__m128i x = ...

__m128i mask = _mm_set_epi8(
    0,
    (i > 14) ? -1 : 0,
    (i > 13) ? -1 : 0,
    (i > 12) ? -1 : 0,
    (i > 11) ? -1 : 0,
    (i > 10) ? -1 : 0,
    (i >  9) ? -1 : 0,
    (i >  8) ? -1 : 0,
    (i >  7) ? -1 : 0,
    (i >  6) ? -1 : 0,
    (i >  5) ? -1 : 0,
    (i >  4) ? -1 : 0,
    (i >  3) ? -1 : 0,
    (i >  2) ? -1 : 0,
    (i >  1) ? -1 : 0,
    -1);

x = _mm_and_si128(x, mask);
dtb
  • Does not sound like something that deserves the C++ tag. – László Papp Sep 07 '13 at 14:21
  • If `i` is not known until run-time then I think the best bet is a lookup table. – Paul R Sep 07 '13 at 15:23
  • @LaszloPapp: why not, exactly? Which language *should* it be tagged with? – jalf Sep 09 '13 at 07:19
  • @jalf: I would have used 'C'. – László Papp Sep 09 '13 at 07:33
  • But if he's writing C++ code which happens to use SSE, why should he *not* tag it as C++? What if, by some crazy leap of the imagination, he wants to be sure that the solution he gets compiles as C++? Are you saying that SSE is a part of the C language, and not part of C++? – jalf Sep 09 '13 at 12:38

2 Answers


I tried a few different ways of implementing this and benchmarked them with a couple of different compilers on an early Core i7 @ 2.67 GHz and a recent Haswell @ 3.6 GHz:

//
// mask_shift_0
//
// use PSHUFB (note: SSSE3 required) - adding n pushes the shuffle indices
// for the low n bytes past 0x7f, and PSHUFB zeroes any byte whose index has
// its high bit set; the remaining bytes read 0xff from vmask
//

inline __m128i mask_shift_0(uint32_t n)
{
  const __m128i vmask = _mm_set1_epi8(255);
  const __m128i vperm = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
  __m128i vp = _mm_add_epi8(vperm, _mm_set1_epi8(n));
  return _mm_shuffle_epi8(vmask, vp);
}

//
// mask_shift_1
//
// use 16 element LUT
//

inline __m128i mask_shift_1(uint32_t n)
{
  static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = {
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 }
  };
  return _mm_load_si128((__m128i *)&mask_lut[n]);
}

//
// mask_shift_2
//
// use misaligned load from 2 vector LUT
//

inline __m128i mask_shift_2(uint32_t n)
{
  static const int8_t mask_lut[32] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };
  return _mm_loadu_si128((__m128i *)(mask_lut + 16 - n));
}

//
// mask_shift_3
//
// use compare and single vector LUT
//

inline __m128i mask_shift_3(uint32_t n)
{
  const __m128i vm = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
  __m128i vn = _mm_set1_epi8(n);
  return _mm_cmpgt_epi8(vm, vn);
}

//
// mask_shift_4
//
// use jump table and immediate shifts
//

inline __m128i mask_shift_4(uint32_t n)
{
  const __m128i vmask = _mm_set1_epi8(-1);
  switch (n)
  {
    case 0:
      return vmask;
    case 1:
      return _mm_slli_si128(vmask, 1);
    case 2:
      return _mm_slli_si128(vmask, 2);
    case 3:
      return _mm_slli_si128(vmask, 3);
    case 4:
      return _mm_slli_si128(vmask, 4);
    case 5:
      return _mm_slli_si128(vmask, 5);
    case 6:
      return _mm_slli_si128(vmask, 6);
    case 7:
      return _mm_slli_si128(vmask, 7);
    case 8:
      return _mm_slli_si128(vmask, 8);
    case 9:
      return _mm_slli_si128(vmask, 9);
    case 10:
      return _mm_slli_si128(vmask, 10);
    case 11:
      return _mm_slli_si128(vmask, 11);
    case 12:
      return _mm_slli_si128(vmask, 12);
    case 13:
      return _mm_slli_si128(vmask, 13);
    case 14:
      return _mm_slli_si128(vmask, 14);
    case 15:
      return _mm_slli_si128(vmask, 15);
    default:
      return _mm_setzero_si128(); // n >= 16: all bytes cleared (also avoids a missing-return warning)
  }
}

//
// lsb_mask_0
//
// Contributed by @Leeor/@dtb
//
// uses _mm_set_epi64x
//

inline __m128i lsb_mask_0(int n)
{
  if (n >= 8)
    return _mm_set_epi64x(~(-1LL << (n - 8) * 8), -1);
  else
    return _mm_set_epi64x(0, ~(-1LL << (n - 0) * 8));
}

//
// lsb_mask_1
//
// Contributed by @Leeor/@dtb
//
// same as lsb_mask_0 but uses conditional operator instead of if/else
//

inline __m128i lsb_mask_1(int n)
{
  return _mm_set_epi64x(n >= 8 ? ~(-1LL << (n - 8) * 8) : 0, n >= 8 ? -1 : ~(-1LL << (n - 0) * 8));
}

Results were interesting:

Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc -O3)

mask_shift_0: 2.23377 ns
mask_shift_1: 2.14724 ns
mask_shift_2: 2.14270 ns
mask_shift_3: 2.15063 ns
mask_shift_4: 2.98304 ns
lsb_mask_0:   2.15782 ns
lsb_mask_1:   2.96628 ns

Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)

mask_shift_0: 1.35014 ns
mask_shift_1: 1.12789 ns
mask_shift_2: 1.04329 ns
mask_shift_3: 1.09258 ns
mask_shift_4: 2.01478 ns
lsb_mask_0:   1.70573 ns
lsb_mask_1:   1.84337 ns

Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc -O2)

mask_shift_0: 0.851416 ns
mask_shift_1: 0.575245 ns
mask_shift_2: 0.577746 ns
mask_shift_3: 0.850086 ns
mask_shift_4: 1.398270 ns
lsb_mask_0:   1.359660 ns
lsb_mask_1:   1.709720 ns

So mask_shift_4 (switch/case) seems to be the slowest method in all cases, whereas the others are pretty similar. The LUT-based methods seem to be consistently the fastest overall.
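
For completeness, here is a minimal sketch of how one of these would be wired up to the original question (keep the low n bytes of x, clear the rest); the wrapper name clear_upper is just illustrative. Note that the mask_shift variants return the bitwise complement of lsb_mask (first n bytes zeroed), as noted in the comments below:

inline __m128i clear_upper(__m128i x, int n)   // 0 < n < 16: keep the low n bytes of x
{
  // lsb_mask_0(n) has the low n bytes set to 0xff and the rest zero,
  // so the AND clears the upper 16 - n bytes of x
  return _mm_and_si128(x, lsb_mask_0(n));
}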

NB: I get some suspiciously fast numbers with clang -O3 and gcc -O3 (gcc 4.7.2 only) - I need to look at the generated assembly for these cases to see what the compiler is doing, and make sure it is not doing anything "clever", such as optimising away some part of the timing test harness.
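
For the curious, a timing loop along these lines might look like the sketch below. This is only an illustration, not the actual harness used for the numbers above; the bench wrapper, the volatile sink and the use of clock_gettime are my assumptions:

#include <stdint.h>
#include <time.h>
#include <emmintrin.h>

volatile uint32_t sink;   // keeps the compiler from discarding the results

double bench(__m128i (*f)(uint32_t), uint64_t iters)
{
  struct timespec t0, t1;
  uint32_t acc = 0;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  for (uint64_t k = 0; k < iters; ++k)
    acc += (uint32_t)_mm_cvtsi128_si32(f((uint32_t)(k & 15)));   // vary n to defeat constant folding
  clock_gettime(CLOCK_MONOTONIC, &t1);
  sink = acc;
  double ns = (double)(t1.tv_sec - t0.tv_sec) * 1e9 + (double)(t1.tv_nsec - t0.tv_nsec);
  return ns / (double)iters;   // average ns per call
}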

If anyone else has any further ideas on this, or has another mask_shift implementation they'd like to try, I would be happy to add it to the test suite and update the results.

Paul R
  • Awesome comparison, thanks. Here's what I'm currently using based on the answer by @Leeor: inline __m128i lsb_mask(int n) { if (n >= 8) return _mm_set_epi64x(~(-1LL << (n - 8) * 8), -1); else return _mm_set_epi64x(0, ~(-1LL << (n - 0) * 8)); } – dtb Sep 08 '13 at 17:16
  • I get 1.64, 0.59, 0.75, 1.64, and 3.79 seconds for 50000000 iterations using vs2012, win8, x64, release build, intel ivy bridge. The function in the above comment takes 1.31 seconds. So mask_shift_1 is, while not particularly concise, the fastest so far. – dtb Sep 08 '13 at 17:49
  • Great - I'll add your function to the test suite and post an update later. The LUT method in mask_shift_1 is the method I arrived at some years back after looking at this exact problem in the pre Core i7 days - it's interesting to see that it still wins on newer CPUs (although I haven't tried this test on Sandy Bridge/Ivy Bridge/Haswell yet). – Paul R Sep 08 '13 at 18:24
  • @dtb: I've now added `lsb_mask` into the mix - see latest edit above. – Paul R Sep 08 '13 at 18:56
  • Hey guys, nice work with the benchmarking, I humbly admit to being beaten :) - would still like to know if the native 128b shift could be made to work though, and how fast it could get compared to these – Leeor Sep 08 '13 at 20:55
  • @Leeor: well there are older and newer x86 CPUs that we haven't tested, and the fastest solution on one may not be the fastest on another, so it's always good to have multiple implementations available. :-) – Paul R Sep 08 '13 at 20:57
  • @Leeor: A `switch` with `case n: return _mm_srli_si128(_mm_set1_epi32(-1), 16 - n)` for each possible value of `n` takes 2.10 seconds on my machine. – dtb Sep 08 '13 at 21:04
  • It might be noteworthy that `mask_shift_*` return the bitwise negation of `lsb_mask`. – dtb Sep 08 '13 at 21:06
  • The original code posted in the question takes 8.29 seconds. So every solution that has been posted is a vast improvement. Thanks again, to both of you! – dtb Sep 08 '13 at 21:10
  • btw, looking from the uarch perspective, a switch statement is probably the worst solution on any CPU because the control flow is complicated and hard to predict since it's data dependent. Mispredicted branches are expensive. Conditional moves would be faster but would probably clog the execution and waste registers (btw, the above implementation still has if/else instead). The lookup table would be the simplest and fastest in terms of control flow and execution, but it would have to sit in the cache (not an issue in the benchmark, but it may introduce new problems). – Leeor Sep 08 '13 at 21:33
  • checked now - by making lsb mask use cond moves I get a 2x speedup (gcc 4.6.3 on linux, with no other operation in the loop): return _mm_set_epi64x(n >= 8 ? ~(-1LL << (n - 8) * 8) : 0, n >= 8 ? -1 : ~(-1LL << (n - 0) * 8)); – Leeor Sep 08 '13 at 21:58
  • @Leeor: Interesting. Maybe Paul can add it to the benchmark. A problem is that _mm_set_epi64x seems to be available only on x64. – dtb Sep 08 '13 at 22:07
  • @Leeor: re `switch` statement - most compilers will generate a jump table if you have a nice sequential range of case values, so you should just get one pipeline bubble, rather than a sequence of branch mispredictions. I haven't looked at the generated code yet though to confirm that this is actually what is happening. – Paul R Sep 09 '13 at 06:19
  • @dtb: I *think* `_mm_set_epi64x` is available for both 32 and 64 bit, but it's a relatively recent addition and not all platforms/compilers/headers have it - some just have `_mm_set_epi64` and you then need to roll your own `_mm_set_epi64x`. I think there's an SO question on this very subject but a quick search didn't turn it up. – Paul R Sep 09 '13 at 06:23
  • @dtb: good catch on the inverted output of `mask_shift` versus `lsb_mask` - I hadn't noticed this - it should be fairly easy to fix though. BTW, my particular use case for this function is masking out elements at the start of a row, so I want a mask that starts with `n` 0s - I wrongly assumed that was what you were doing too. – Paul R Sep 09 '13 at 06:27
  • @Leeor/dtb: I've added the conditional version of `lsb_mask` to the tests now - the original is now `lsb_mask_0` and the new version with the conditional operator is `lsb_mask_1`. For the two compilers I've been using the first version is actually faster, which is a useful reminder of the need to test optimisations with the compiler and platform that you intend to deliver on and not assume that you'll get the same benefits with other compilers/platforms! (P.S. I was noticing some variability in the measured times so I've increased the benchmark time and re-run all the tests.) – Paul R Sep 09 '13 at 06:51
  • I've now added timing for a Haswell CPU with gcc 4.7.2. I had to use gcc -O2 in this case as the numbers with gcc -O3 were suspiciously fast. Further investigation needed. – Paul R Sep 09 '13 at 07:15

If this were a normal 64-bit value, I'd use something like -

    mask = (1ULL << (i * 8)) - 1;  // sets the low i bytes

But take care when generalizing this to 128 bits - the built-in shift operators don't necessarily work at those widths.

For 128 bits, you could either just build the upper and lower halves of the mask separately, e.g. -

    __m128i mask = _mm_set_epi64x(
       i > 7 ? (1LL << ((i - 8) * 8)) - 1 : 0,   // upper 64 bits
       i > 7 ? -1LL : (1LL << (i * 8)) - 1       // lower 64 bits
    );

(Assuming I didn't swap the order - check me on this one, I'm not very familiar with these intrinsics.) Alternatively, you can do this with a 2-element uint64 array and load the 128-bit mask directly from memory using its address.
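
A rough sketch of that array-based idea might look like this (the helper name lsb_mask_mem and the details are only illustrative, and it assumes a little-endian target so that tmp[0] becomes the low 64 bits of the vector):

    #include <stdint.h>
    #include <emmintrin.h>

    static inline __m128i lsb_mask_mem(int i)   // 0 < i < 16
    {
        uint64_t tmp[2];
        tmp[0] = (i > 7) ? ~0ULL : (1ULL << (i * 8)) - 1;     // low 64 bits
        tmp[1] = (i > 7) ? (1ULL << ((i - 8) * 8)) - 1 : 0;   // high 64 bits
        return _mm_loadu_si128((const __m128i *)tmp);         // little-endian load
    }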

However, neither of these methods feels as natural as the original one: they just extend the element size from 1 to 8 bytes but still work on the register in parts. It would be much nicer to do a proper shift on a single 128-bit variable.

I just came across this topic regarding 128-bit shifts -

Looking for sse 128 bit shift operation for non-immediate shift value

It looks like it's possible, but I've never used it. You could try the above one-liner with the appropriate SSE intrinsic from there. I'd give this one a shot -

    mask = _mm_slli_si128(_mm_cvtsi32_si128(1), i); // emmintrin.h shows the second argument is in bytes already

And then just subtract one in whatever way you prefer (I'd be surprised if this type supported a plain old operator-).

Leeor
  • Sounds good. Can you expand on how to create the upper and lower masks? – dtb Sep 07 '13 at 17:47
  • Edited my post to what I believe is a slightly better approach – Leeor Sep 07 '13 at 19:51
  • Thanks for the update. _mm_slli_si128 needs a compile-time constant; that's why the code you linked is needed for variable operands. I'm using a slightly modified version of your code based on _mm_set_epi64x code, and some equivalent code based on _mm_set_epi32 on 32-bit platforms. Comparing it to the functions posted by Paul now... – dtb Sep 08 '13 at 17:35