C standard library function "strtok" runs vastly slower when compiled on Windows versus Linux on same machine. Any insight why that might be the case?

Question

I sporadically chose to try writing my own strtok() function (named mystrtok()) and compare it to C's strtok() from <string.h> and came across this strange phenomenon:

When compiled on Linux with gcc, strtok() performs faster than my version as expected. However, when compiled on Windows with gcc or cl, strtok() is significantly slower than my version.

Confused by this, I wondered what might happen if I tried pasting in the source code for strtok() (renaming it to stdstrtok() to avoid name collision) and benchmarking it separately. It gave inconsistent but close results. This is where I got the code from

Picture showing strtok() running very slowly when benchmarked on Windows versus Linux

Does anyone have an idea why strtok() is running so slowly when compiled on Windows? Here's the code I am using:

#include <stdio.h>
#include <string.h>
#include <time.h>

#ifdef _MSC_VER 
#define RESTRICT __restrict
void print_compiler() {
  printf("This was compiled with CL\n");
}
#else
#define RESTRICT restrict
void print_compiler() {
  printf("This was compiled with GCC\n");
}
#endif


typedef struct {
  char string[100]; // string to be processed
  char* delim;      // delimeter for tokens
  char size;        // number of characters in string
} to_parse;


void show(func,name,target) // Show how the strtok processes and alters the target string
  char* RESTRICT (*func)(char*,const char*); // Function pointer to strtok variants
  const char* RESTRICT name; // Name of strtok variant for display
  to_parse target; // Information about the string and its delimeter
{
  printf("%s\tgives the output: ",name);
  char* output = func(target.string,target.delim);
  while (output != NULL) {
    printf("{%s}",output);
    output = func(NULL,target.delim);
  };
  printf("\n%s\tchanged the string to: ",name);
  int x;
  for (x=0;x<target.size;x++) {
    if (target.string[x] == '\0') printf("%s","\\0");
    else printf("%c",target.string[x]);
  }
  printf("\n");
}


void bench(func,name,target,iterations) // Benchmark a strtok variant by measuring execution seconds
  char* RESTRICT (*func)(char*,const char*); // Function pointer to strtok variants
  const char* RESTRICT name; // Name of strtok variant for display
  const to_parse* target; // Information about the string and its delimeter
  unsigned int iterations; // Number of times to execute strtok on the target
{
  time_t start_time = time(NULL);
  unsigned int x;
  to_parse retarget;
  for (x=0; x<iterations; x++) {
    retarget = *target;
    func(retarget.string,retarget.delim);
    while(func(NULL,retarget.delim));
  }
  printf("%s\ttook %d seconds to iterate %d times\n",name,(int)(time(NULL)-start_time),iterations);
}

/*
This is my version of strtok, written for random practice. It operates
differently than strtok, returning null when nothing is between delimeters.
Also, it processes multichar delimeters correctly, unlike strtok. Why?
*/

char* mystrtok(line, delim)
  register char* RESTRICT line;
  register const char* RESTRICT delim;
  {
  static char* fline = NULL;
  if (line != NULL) fline = line;
  else if (fline == NULL) return NULL;
  else line = fline;
  register const char* fdelim = delim;
  while (*fline != '\0') {
    if (*fline == *fdelim) fdelim++;
    else if (*fdelim == '\0') {
      *(fline-(fdelim-delim)) = '\0';
      return line;
    }
    else fdelim = delim;
    fline++;
  }
  if (*fdelim == '\0') {
    *(fline-(fdelim-delim)) = '\0';
    return line;
  }
  fline = NULL;
  return line;
}



/*
I copied this strtok function from
https://opensource.apple.com/source/Libc/Libc-167/string.subproj/strtok.c.auto.html
and renamed it to "stdstrtok" to avoid name collision with "strtok" from <string.h>
*/

char *
stdstrtok(s, delim)
    register char *s;
    register const char *delim;
{
    register char *spanp;
    register int c, sc;
    char *tok;
    static char *last;


    if (s == NULL && (s = last) == NULL)
        return (NULL);

    /*
     * Skip (span) leading delimiters (s += strspn(s, delim), sort of).
     */
cont:
    c = *s++;
    for (spanp = (char *)delim; (sc = *spanp++) != 0;) {
        if (c == sc)
            goto cont;
    }

    if (c == 0) {       /* no non-delimiter characters */
        last = NULL;
        return (NULL);
    }
    tok = s - 1;

    /*
     * Scan token (scan for delimiters: s += strcspn(s, delim), sort of).
     * Note that delim must have one NUL; we stop if we see that, too.
     */
    for (;;) {
        c = *s++;
        spanp = (char *)delim;
        do {
            if ((sc = *spanp++) == c) {
                if (c == 0)
                    s = NULL;
                else
                    s[-1] = 0;
                last = s;
                return (tok);
            }
        } while (sc != 0);
    }
    /* NOTREACHED */
}



void benchem(const to_parse* target,unsigned int iterations) {
  bench(mystrtok,"mystrtok",&*target,iterations);
  bench(stdstrtok,"stdstrtok",&*target,iterations);
  bench(strtok,"strtok  ",&*target,iterations);
}

void showem(const to_parse* target) {
  show(mystrtok,"mystrtok",*target);
  show(stdstrtok,"stdstrtok",*target);
  show(strtok,"strtok  ",*target);
}


int main()
{ 
  print_compiler();
  const to_parse parse_me = { .string = "delimdelimDATAdelimdatadelimdelimDATAdelimdatadelimDATA", 
                              .delim = "delim", 
                              .size = 56
                            };
  showem(&parse_me);
  benchem(&parse_me,40000000);
  // showem(&((to_parse){"duhduhduhtestduh","duh",17})); // this works too!

}

I'd also appreciate anyone compiling and running this on Windows and Linux (I used WSL) and confirm whether the issue is reproducible on their system.

This was all on the same hardware? What CPU? What compiler options and versions? Targeting a recent x86-64 with `gcc -O3 -march=native`? If it's a Skylake, did you use [How can I mitigate the impact of the Intel jcc erratum on gcc?](https://stackoverflow.com/q/61256646) — Peter Cordes, Feb 06 '23 at 01:51
Also, I'm curious why your function declarations are using K&R style for the args, not proper ISO C declarations like `void foo(char *name, char *xyz){ ... }`. I could imagine using K&R style for legacy code you copy/pasted, but `bench` is a new function. Also use of the `register` keyword in new code is pointless unless you care about `-O0` unoptimized builds. — Peter Cordes, Feb 06 '23 at 01:56
@PeterCordes Yes, it was on the same system. My CPU is an Intel i3-2120. I used gcc (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0 and gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0 and Microsoft (R) C/C++ Optimizing Compiler Version 19.16.27045 for x86 I didn't use any options. I just used the K&R style (didn't know that's what it was) just because it was new to me and I wanted to try it out. I think it looks good. — John Schock, Feb 06 '23 at 02:21
So the only surprising thing here is that MS's standard-library `strtok` is slow, much slower than the glibc version on Linux. Your code is about the same speed on GCC regardless of OS, and the Apple / FreeBSD code compiled without optimization is a bit slower (and gives results that match the library version, unlike yours). Compiling with optimization disabled (the default, compile fast instead of making fast code) is of course a performance disaster in general, somewhat mitigated by `register`. Different compilers will suffer differently from the default `-O0`, with GCC often less. — Peter Cordes, Feb 06 '23 at 02:30
When compiling "I didn't use any options" means you're probably testing unoptimized code potentially with additional debug code or heap protection. — Retired Ninja, Feb 06 '23 at 02:33
For a comparison, on my machine using Visual Studio 2022 the timings are 2/4/6 in release x64 mode, and 4/16/17 in debug x64 mode. — Retired Ninja, Feb 06 '23 at 02:44
I didn't actually test anything, just looking at your benchmark numbers again. Perhaps @RetiredNinja is on to something, if MSVC debug builds add a bunch of overhead even to the loop that calls standard-library `strtok`. As usual, don't benchmark unoptimized code, it's not interesting or useful. I thought maybe we were getting away with it here since a lot of the work is in the pre-compiled library, and a lot of your code is using `register`, but that could easily *not* be the case; something could create a big store/reload latency bottleneck. — Peter Cordes, Feb 06 '23 at 02:47
[Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?](https://stackoverflow.com/q/53366394) / [Idiomatic way of performance evaluation?](https://stackoverflow.com/q/60291987) / [How to optimize these loops (with compiler optimization disabled)?](https://stackoverflow.com/a/32001196) — Peter Cordes, Feb 06 '23 at 02:48
Thanks @RetiredNinja for checking the performance on your system. I've tried optimizing the code for speed on GCC (-O3) and CL (/Ot) and, while it does indeed perform better, the strtok() function is still absurdly slow on Windows. — John Schock, Feb 06 '23 at 03:00

Craig Estey · Answer 1 · 2023-02-06T03:54:35.173

The strtok source you have is rudimentary--it only processes a single char at a time.

The source is ancient as evidenced by the K&R prototypes instead of ANSI (which were available before the copyright on the file).

The strtok implementation depends on the quality/speed of the libc implementation that is used.

Modern libc implementations use special tricks to process more than a single char at a time.

The glibc version [which is what linux uses] is tuned/tweaked for speed. Below are some of the source code fragments, taken from the glibc git repository: https://www.gnu.org/software/libc/sources.html

Note that glibc also has arch-specific versions that are coded using SIMD/assembler that can be found in the source packages for the particular linux distro you are using. I've included one example at the bottom.

FILE: strtok.c

char *
__strtok_r (char *s, const char *delim, char **save_ptr)
{
  char *end;

  if (s == NULL)
    s = *save_ptr;

  if (*s == '\0')
    {
      *save_ptr = s;
      return NULL;
    }

  /* Scan leading delimiters.  */
  s += strspn (s, delim);
  if (*s == '\0')
    {
      *save_ptr = s;
      return NULL;
    }

  /* Find the end of the token.  */
  end = s + strcspn (s, delim);
  if (*end == '\0')
    {
      *save_ptr = end;
      return s;
    }

  /* Terminate the token and make *SAVE_PTR point past it.  */
  *end = '\0';
  *save_ptr = end + 1;
  return s;
}

FILE: strcspn.c

/* Return the length of the maximum initial segment of S
   which contains no characters from REJECT.  */
size_t
strcspn (const char *str, const char *reject)
{
  if (__glibc_unlikely (reject[0] == '\0')
      || __glibc_unlikely (reject[1] == '\0'))
    return __strchrnul (str, reject [0]) - str;

  /* Use multiple small memsets to enable inlining on most targets.  */
  unsigned char table[256];
  unsigned char *p = memset (table, 0, 64);
  memset (p + 64, 0, 64);
  memset (p + 128, 0, 64);
  memset (p + 192, 0, 64);

  unsigned char *s = (unsigned char*) reject;
  unsigned char tmp;
  do
    p[tmp = *s++] = 1;
  while (tmp);

  s = (unsigned char*) str;
  if (p[s[0]]) return 0;
  if (p[s[1]]) return 1;
  if (p[s[2]]) return 2;
  if (p[s[3]]) return 3;

  s = (unsigned char *) PTR_ALIGN_DOWN (s, 4);

  unsigned int c0, c1, c2, c3;
  do
    {
      s += 4;
      c0 = p[s[0]];
      c1 = p[s[1]];
      c2 = p[s[2]];
      c3 = p[s[3]];
    }
  while ((c0 | c1 | c2 | c3) == 0);

  size_t count = s - (unsigned char *) str;
  return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3;
}

FILE: strspn.c

/* Return the length of the maximum initial segment
   of S which contains only characters in ACCEPT.  */
size_t
strspn (const char *str, const char *accept)
{
  if (accept[0] == '\0')
    return 0;
  if (__glibc_unlikely (accept[1] == '\0'))
    {
      const char *a = str;
      for (; *str == *accept; str++);
      return str - a;
    }

  /* Use multiple small memsets to enable inlining on most targets.  */
  unsigned char table[256];
  unsigned char *p = memset (table, 0, 64);
  memset (p + 64, 0, 64);
  memset (p + 128, 0, 64);
  memset (p + 192, 0, 64);

  unsigned char *s = (unsigned char*) accept;
  /* Different from strcspn it does not add the NULL on the table
     so can avoid check if str[i] is NULL, since table['\0'] will
     be 0 and thus stopping the loop check.  */
  do
    p[*s++] = 1;
  while (*s);

  s = (unsigned char*) str;
  if (!p[s[0]]) return 0;
  if (!p[s[1]]) return 1;
  if (!p[s[2]]) return 2;
  if (!p[s[3]]) return 3;

  s = (unsigned char *) PTR_ALIGN_DOWN (s, 4);

  unsigned int c0, c1, c2, c3;
  do {
      s += 4;
      c0 = p[s[0]];
      c1 = p[s[1]];
      c2 = p[s[2]];
      c3 = p[s[3]];
  } while ((c0 & c1 & c2 & c3) != 0);

  size_t count = s - (unsigned char *) str;
  return (c0 & c1) == 0 ? count + c0 : count + c2 + 2;
}

FILE: strchrnul.c

/* Find the first occurrence of C in S or the final NUL byte.  */
char *
STRCHRNUL (const char *s, int c_in)
{
  const unsigned char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, magic_bits, charmask;
  unsigned char c;

  c = (unsigned char) c_in;

  /* Handle the first few characters by reading one character at a time.
     Do this until CHAR_PTR is aligned on a longword boundary.  */
  for (char_ptr = (const unsigned char *) s;
       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == c || *char_ptr == '\0')
      return (void *) char_ptr;

  /* All these elucidatory comments refer to 4-byte longwords,
     but the theory applies equally well to 8-byte longwords.  */

  longword_ptr = (unsigned long int *) char_ptr;

  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
     the "holes."  Note that there is a hole just to the left of
     each byte, with an extra at the end:

     bits:  01111110 11111110 11111110 11111111
     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD

     The 1-bits make sure that carries propagate to the next 0-bit.
     The 0-bits provide holes for carries to fall into.  */
  magic_bits = -1;
  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;

  /* Set up a longword, each of whose bytes is C.  */
  charmask = c | (c << 8);
  charmask |= charmask << 16;
  if (sizeof (longword) > 4)
    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
    charmask |= (charmask << 16) << 16;
  if (sizeof (longword) > 8)
    abort ();

  /* Instead of the traditional loop which tests each character,
     we will test a longword at a time.  The tricky part is testing
     if *any of the four* bytes in the longword in question are zero.  */
  for (;;)
    {
      /* We tentatively exit the loop if adding MAGIC_BITS to
     LONGWORD fails to change any of the hole bits of LONGWORD.

     1) Is this safe?  Will it catch all the zero bytes?
     Suppose there is a byte with all zeros.  Any carry bits
     propagating from its left will fall into the hole at its
     least significant bit and stop.  Since there will be no
     carry from its most significant bit, the LSB of the
     byte to the left will be unchanged, and the zero will be
     detected.

     2) Is this worthwhile?  Will it ignore everything except
     zero bytes?  Suppose every byte of LONGWORD has a bit set
     somewhere.  There will be a carry into bit 8.  If bit 8
     is set, this will carry into bit 16.  If bit 8 is clear,
     one of bits 9-15 must be set, so there will be a carry
     into bit 16.  Similarly, there will be a carry into bit
     24.  If one of bits 24-30 is set, there will be a carry
     into bit 31, so all of the hole bits will be changed.

     The one misfire occurs when bits 24-30 are clear and bit
     31 is set; in this case, the hole at bit 31 is not
     changed.  If we had access to the processor carry flag,
     we could close this loophole by putting the fourth hole
     at bit 32!

     So it ignores everything except 128's, when they're aligned
     properly.

     3) But wait!  Aren't we looking for C as well as zero?
     Good point.  So what we do is XOR LONGWORD with a longword,
     each of whose bytes is C.  This turns each byte that is C
     into a zero.  */

      longword = *longword_ptr++;

      /* Add MAGIC_BITS to LONGWORD.  */
      if ((((longword + magic_bits)

        /* Set those bits that were unchanged by the addition.  */
        ^ ~longword)

       /* Look at only the hole bits.  If any of the hole bits
          are unchanged, most likely one of the bytes was a
          zero.  */
       & ~magic_bits) != 0

      /* That caught zeroes.  Now test for C.  */
      || ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
          & ~magic_bits) != 0)
    {
      /* Which of the bytes was C or zero?
         If none of them were, it was a misfire; continue the search.  */

      const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);

      if (*cp == c || *cp == '\0')
        return (char *) cp;
      if (*++cp == c || *cp == '\0')
        return (char *) cp;
      if (*++cp == c || *cp == '\0')
        return (char *) cp;
      if (*++cp == c || *cp == '\0')
        return (char *) cp;
      if (sizeof (longword) > 4)
        {
          if (*++cp == c || *cp == '\0')
        return (char *) cp;
          if (*++cp == c || *cp == '\0')
        return (char *) cp;
          if (*++cp == c || *cp == '\0')
        return (char *) cp;
          if (*++cp == c || *cp == '\0')
        return (char *) cp;
        }
    }
    }

  /* This should never happen.  */
  return NULL;
}

weak_alias (__strchrnul, strchrnul)

FILE: strcspn.S

    .text
ENTRY (strcspn)

    movq %rdi, %rdx     /* Save SRC.  */

    /* First we create a table with flags for all possible characters.
       For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
       supported by the C string functions we have 256 characters.
       Before inserting marks for the stop characters we clear the whole
       table.  */
    movq %rdi, %r8          /* Save value.  */
    subq $256, %rsp         /* Make space for 256 bytes.  */
    cfi_adjust_cfa_offset(256)
    movl $32,  %ecx         /* 32*8 bytes = 256 bytes.  */
    movq %rsp, %rdi
    xorl %eax, %eax         /* We store 0s.  */
    cld
    rep
    stosq

    movq %rsi, %rax         /* Setup skipset.  */

/* For understanding the following code remember that %rcx == 0 now.
   Although all the following instruction only modify %cl we always
   have a correct zero-extended 64-bit value in %rcx.  */

    .p2align 4
L(2):   movb (%rax), %cl    /* get byte from skipset */
    testb %cl, %cl      /* is NUL char? */
    jz L(1)         /* yes => start compare loop */
    movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */

    movb 1(%rax), %cl   /* get byte from skipset */
    testb $0xff, %cl    /* is NUL char? */
    jz L(1)         /* yes => start compare loop */
    movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */

    movb 2(%rax), %cl   /* get byte from skipset */
    testb $0xff, %cl    /* is NUL char? */
    jz L(1)         /* yes => start compare loop */
    movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */

    movb 3(%rax), %cl   /* get byte from skipset */
    addq $4, %rax       /* increment skipset pointer */
    movb %cl, (%rsp,%rcx)   /* set corresponding byte in skipset table */
    testb $0xff, %cl    /* is NUL char? */
    jnz L(2)        /* no => process next dword from skipset */

L(1):   leaq -4(%rdx), %rax /* prepare loop */

    /* We use a neat trick for the following loop.  Normally we would
       have to test for two termination conditions
       1. a character in the skipset was found
       and
       2. the end of the string was found
       But as a sign that the character is in the skipset we store its
       value in the table.  But the value of NUL is NUL so the loop
       terminates for NUL in every case.  */

    .p2align 4
L(3):   addq $4, %rax       /* adjust pointer for full loop round */

    movb (%rax), %cl    /* get byte from string */
    cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
    je L(4)         /* yes => return */

    movb 1(%rax), %cl   /* get byte from string */
    cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
    je L(5)         /* yes => return */

    movb 2(%rax), %cl   /* get byte from string */
    cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
    jz L(6)         /* yes => return */

    movb 3(%rax), %cl   /* get byte from string */
    cmpb %cl, (%rsp,%rcx)   /* is it contained in skipset? */
    jne L(3)        /* no => start loop again */

    incq %rax       /* adjust pointer */
L(6):   incq %rax
L(5):   incq %rax

L(4):   addq $256, %rsp     /* remove skipset */
    cfi_adjust_cfa_offset(-256)
#ifdef USE_AS_STRPBRK
    xorl %edx,%edx
    orb %cl, %cl        /* was last character NUL? */
    cmovzq %rdx, %rax   /* Yes: return NULL */
#else
    subq %rdx, %rax     /* we have to return the number of valid
                   characters, so compute distance to first
                   non-valid character */
#endif
    ret
END (strcspn)
libc_hidden_builtin_def (strcspn)

C standard library function "strtok" runs vastly slower when compiled on Windows versus Linux on same machine. Any insight why that might be the case?

1 Answers1