I'm trying to validate a string that must contain only visible ASCII characters, spaces and tabs (\t).
But it seems that ASCII table lookups are faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs. I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type, and only on the last one is the vectorized version faster.
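Written out as a plain C predicate (just for reference; the helper name is mine and it is not part of the benchmark below), the accepted set is:
static inline int is_valid_byte(unsigned char c)
{
    // tab, or any visible ASCII character including space
    return c == '\t' || (c >= ' ' && c <= '~');
}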
My test code is here:
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#include <stdlib.h>
#define MIN(a,b) (((a)<(b))?(a):(b))
#define ALIGNED16 alignas(16)
#define MEASURE(msg,stmt) { \
    struct timeval tv; \
    gettimeofday(&tv, NULL); \
    uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
    stmt; \
    gettimeofday(&tv, NULL); \
    uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
    printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
ALIGNED16 static const int vis_ws_chars[256] = {
    // NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
    NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
    // DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
    NO16
    // SP ! " # $ % & ' ( ) * + , - . /
    YES16
    // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    YES16
    // @ A B C D E F G H I J K L M N O
    YES16
    // P Q R S T U V W X Y Z [ \ ] ^ _
    YES16
    // ` a b c d e f g h i j k l m n o
    YES16
    // p q r s t u v w x y z { | } ~ DEL
    YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
    // Non-ASCII characters
    NO128
};
size_t search_logic(const char* data, size_t len) {
    __m128i ht = _mm_set1_epi8('\t');
    //__m128i del = _mm_set1_epi8(0x7f);
    __m128i td = _mm_set1_epi8('~');
    __m128i sp_m1 = _mm_set1_epi8(' ' - 1);
    size_t i = 0;
    while (len - i >= 16) {
        __m128i c = _mm_loadu_si128((const __m128i *) (data + i));
        // (!((c < del) && (c >= sp)) && (c != ht)) == 0
        //if(!_mm_testc_si128(_mm_and_si128(_mm_cmpgt_epi8(c, sp_m1), _mm_cmplt_epi8(c, del)), _mm_xor_si128(c, ht)))
        //break;
        // !(c == del) && ((c == ht) || (c >= sp)) == 1
        //if(!_mm_test_all_ones(_mm_andnot_si128(_mm_cmpeq_epi8(c, del), _mm_or_si128(_mm_cmpeq_epi8(c, ht), _mm_cmpgt_epi8(c, sp_m1)))))
        //break;
        // (((c != ht) && (c >= sp)) && (c > td)) == 0
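        // Note: _mm_cmpgt_epi8 is a signed compare and (c > td) already implies
        // (c > sp_m1), so this test only catches 0x7F (DEL). Control characters
        // below ' ' (other than tab) and bytes >= 0x80 slip through the vector
        // loop; the '\n' planted by the benchmark happens to fall into the
        // scalar tail, so the result is still reported as correct.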
        if(!_mm_test_all_zeros(_mm_and_si128(_mm_xor_si128(c, ht), _mm_cmpgt_epi8(c, sp_m1)), _mm_cmpgt_epi8(c, td)))
            break;
        i += 16;
    }
    // Check the remaining bytes (at most 15)
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
size_t search_table(const char* data, size_t len)
{
    // Search for a non-matching character via table lookups
    size_t i = 0;
    while (len - i >= 16) {
        if (!VWSCHAR(data[i + 0])) break;
        if (!VWSCHAR(data[i + 1])) break;
        if (!VWSCHAR(data[i + 2])) break;
        if (!VWSCHAR(data[i + 3])) break;
        if (!VWSCHAR(data[i + 4])) break;
        if (!VWSCHAR(data[i + 5])) break;
        if (!VWSCHAR(data[i + 6])) break;
        if (!VWSCHAR(data[i + 7])) break;
        if (!VWSCHAR(data[i + 8])) break;
        if (!VWSCHAR(data[i + 9])) break;
        if (!VWSCHAR(data[i + 10])) break;
        if (!VWSCHAR(data[i + 11])) break;
        if (!VWSCHAR(data[i + 12])) break;
        if (!VWSCHAR(data[i + 13])) break;
        if (!VWSCHAR(data[i + 14])) break;
        if (!VWSCHAR(data[i + 15])) break;
        i += 16;
    }
    // Check the remaining bytes (at most 15)
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
size_t search_sse4cmpstr(const char* data, size_t len)
{
    static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    __m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
    size_t i = 0;
    while (len - i >= 16) {
        __m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
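        // la = 4: only the first two range pairs (4 bytes) of legal_ranges are
        // used; lb = 16: all 16 data bytes are checked. With _SIDD_CMP_RANGES
        // and _SIDD_NEGATIVE_POLARITY the result is the index of the first
        // byte outside the ranges, or 16 if every byte matched.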
        unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
        i += consumed;
        if (consumed < 16) {
            return i;
        }
    }
    // Check the remaining bytes (at most 15)
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
size_t search_sse4cmpstr_implicit(const char* data, size_t len)
{
    static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    __m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
    size_t i = 0;
    while (len - i >= 16) {
        __m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
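        // Implicit-length variant: the zero bytes that pad legal_ranges
        // terminate the range list after the two pairs, and a zero byte in the
        // data would be reported as a mismatch (none occur in this test data).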
        unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
        i += consumed;
        if (consumed < 16) {
            return i;
        }
    }
    // Check the remaining bytes (at most 15)
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
int main()
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    srand(0);
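    // Fill with random valid bytes: '\t' when v == 95, otherwise a visible
    // character ' '..'~' (v = 0..94).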
    for (size_t i = 0; i < len; ++i) {
        const char v = rand() % 96;
        data[i] = v == 95 ? '\t' : ' ' + v;
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found
    MEASURE("table lookup", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("cmpestr ranges", {
        size_t i = search_sse4cmpstr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("cmpistr ranges", {
        size_t i = search_sse4cmpstr_implicit(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("logic ranges", {
        size_t i = search_logic(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
}
Compiled with gcc -O3 -march=native -pedantic -Wall -Wextra main2.cpp
it gives me these results:
Setting up 1GB of data...
table lookup - 476.4820ms
cmpestr ranges - 519.3350ms
cmpistr ranges - 497.5770ms
logic ranges - 153.2650ms
I've also checked the assembly output: search_sse4cmpstr uses vpcmpestri, while search_table is not vectorized.
Am I using it wrong? Or why does this instruction exist at all?
EDIT: As pointed out in the comments, cmpistr (the implicit-length variant with fewer parameters) is slightly faster than cmpestr and sometimes faster than the table lookup.
However, doing the range checks with ordinary SSE compare and bitwise operations (as in search_logic) seems to be even faster.
EDIT2: Peter Cordes found the right answer. I've added the revised program in a new answer, so please look at that one if you are interested in cmpstr.
DO NOT USE THE CODE ABOVE!