TL;DR: I tested it on 2 systems and no, it was not faster on both systems. But you can make the code faster by removing the call to strlen()
.
I tested both functions and a lot more. I tested what happens when I replace -('A'-'a')
with +0x20
, replace +
by |
and replace the for
loop with a while
loop. The result is that none of these changes affects performance.
However, moving strlen()
to the beginning and calling it only once improved performance. A way to improve performance even more was to remove the call to strlen()
and check for the '\0'
-byte inside the same loop. This way we only have to go through the string once, which probably reduces cache misses on longer strings.
Testprogram
I tested it by creating an array of random strings, copying the array and lowering all copies with one method while measuring the lowering time. Then I did the same, with the same random array of strings, with all the other methods. And I repeated this multiple times.
The code should work on POSIX compatible systems, but you probably have to replace the getTime()
function for other systems such as Windows. I compiled it with GCC and the -O3
flag.
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
//#define DEBUG
/*
 * Benchmark dimensions.
 *   N - number of test strings (tiny when DEBUG is defined so that the
 *       before/after dump stays readable).
 *   M - size in bytes of each string buffer, terminating NUL included.
 *
 * N is parenthesised so that it expands as a single value inside any
 * expression (e.g. x / N or x % N).
 * M must remain a bare integer literal: STR(M) is pasted into a printf
 * format as a field width ("%20s").
 */
#ifdef DEBUG
#define N 10
#else
#define N (1000UL*100)
#endif
#define M 20
/* Two-level stringification so that the argument is macro-expanded first. */
#define STR_(x) #x
#define STR(x) STR_(x)
/*
 * lower1 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: indexed for-loop with strlen(s) written in the loop
 * condition, and the case shift expressed as "-= ('A' - 'a')".
 *
 * s: NUL-terminated, writable string.
 */
void lower1(char *s)
{
size_t i;
// strlen(s) is part of the loop condition, i.e. it is evaluated again for
// every iteration unless the compiler manages to hoist it.
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// 'A' - 'a' is negative, so subtracting it moves the letter to lower case.
s[i] -= ('A' - 'a');
}
}
}
/*
 * lower2 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: indexed for-loop, strlen(s) called once before the
 * loop, case shift expressed as "-= ('A' - 'a')".
 *
 * s: NUL-terminated, writable string.
 */
void lower2(char *s)
{
size_t i;
// Length is computed a single time, up front.
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// 'A' - 'a' is negative, so subtracting it moves the letter to lower case.
s[i] -= ('A' - 'a');
}
}
}
/*
 * lower3 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: like lower2 (strlen(s) called once, indexed for-loop),
 * but the case offset is first stored in a local variable.
 *
 * s: NUL-terminated, writable string.
 */
void lower3(char *s)
{
size_t i;
size_t len = strlen(s);
// Case offset kept in a variable instead of being written inline.
int d='A'-'a';
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// d is negative, so subtracting it moves the letter to lower case.
s[i] -= d;
}
}
}
/*
 * lower4 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: strlen(s) called once, indexed for-loop, case shift
 * written as the literal "+= 0x20".
 *
 * s: NUL-terminated, writable string.
 */
void lower4(char *s)
{
size_t i;
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// 0x20 is the distance between upper and lower case letters in ASCII.
s[i] += 0x20;
}
}
}
/*
 * lower5 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: strlen(s) written in the loop condition (as in lower1),
 * case shift expressed as an addition, "+= ('a' - 'A')".
 *
 * s: NUL-terminated, writable string.
 */
void lower5(char *s)
{
size_t i;
// strlen(s) is part of the loop condition, i.e. it is evaluated again for
// every iteration unless the compiler manages to hoist it.
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// 'a' - 'A' is the positive distance from upper to lower case.
s[i] += ('a' - 'A');
}
}
}
/*
 * lower6 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: strlen(s) written in the loop condition (as in lower1),
 * case shift done with a bitwise OR, "|= 0x20".
 *
 * s: NUL-terminated, writable string.
 */
void lower6(char *s)
{
size_t i;
// strlen(s) is part of the loop condition, i.e. it is evaluated again for
// every iteration unless the compiler manages to hoist it.
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
s[i] |= 0x20;
}
}
}
/*
 * lower7 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: strlen(s) called once, indexed for-loop, case shift
 * done with a bitwise OR, "|= 0x20".
 *
 * s: NUL-terminated, writable string.
 */
void lower7(char *s)
{
size_t i;
// Length is computed a single time, up front.
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
// Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
s[i] |= 0x20;
}
}
}
/*
 * lower8 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: strlen(s) called once, then a while-loop that counts
 * the remaining length down and walks the string through the pointer s
 * instead of an index.
 *
 * s: NUL-terminated, writable string.
 */
void lower8(char *s)
{
size_t len = strlen(s);
// Runs exactly len times; len is decremented after being tested.
while(len--)
{
if (*s >= 'A' && *s <= 'Z')
{
// Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
*s |= 0x20;
}
s++;
}
}
/*
 * lower9 - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: no strlen() call at all; an endless loop with an
 * explicit break when the terminating NUL byte is reached, so the string
 * is traversed only once.
 *
 * s: NUL-terminated, writable string.
 */
void lower9(char *s)
{
while(1)
{
// Stop at the terminating NUL byte.
if (!*s)
{
break;
}
if (*s >= 'A' && *s <= 'Z')
{
// Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
*s |= 0x20;
}
s++;
}
}
/*
 * lowerA - convert the upper-case letters 'A'..'Z' in s to lower case, in place.
 *
 * Benchmark variant: no strlen() call at all; the NUL test is the
 * while-loop condition itself, so the string is traversed only once.
 *
 * s: NUL-terminated, writable string.
 */
void lowerA(char *s)
{
// Loop until the terminating NUL byte is reached.
while(*s)
{
if (*s >= 'A' && *s <= 'Z')
{
// Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
*s |= 0x20;
}
s++;
}
}
/*
 * die - print a fatal error message and terminate the program.
 *
 * Writes "die: <msg> : <strerror(errno)>" to stderr and exits with status 1.
 * The function never returns; the uint64_t return type is nominal only.
 *
 * msg: short description of what failed.
 */
uint64_t die(const char *msg)
{
    /* Resolve the errno text first, then emit the whole line in one call. */
    const char *reason = strerror(errno);

    fprintf(stderr, "die: %s : %s\n", msg, reason);
    exit(1);
}
uint64_t getTime(void)
{
uint64_t time;
struct timespec t_v;
if(clock_gettime(CLOCK_BOOTTIME,&t_v)<0)
{
die("cant get time");
}
time=t_v.tv_sec*1000000000ULL;
time+=t_v.tv_nsec;
return time;
}
/*
 * test - time one lowering function over a private copy of the test strings.
 *
 * Duplicates the N input strings, calls fp on every copy while measuring the
 * elapsed time with getTime(), and prints "time <nanoseconds> <name>" to
 * stdout. With DEBUG defined, each original string is additionally printed
 * next to its processed copy. The inputs in s are left untouched.
 *
 * fp:   function under test; modifies its argument in place.
 * s:    array of N NUL-terminated strings, M bytes each.
 * name: label printed next to the measured time.
 *
 * Terminates the program via die() if the copy cannot be allocated.
 */
void test(void (*fp)(char *),char (*s)[M],const char *name)
{
    /* One byte count shared by the allocation and the copy. */
    const size_t bytes = (size_t)(N) * sizeof *s;
    char (*copy)[M] = malloc(bytes);

    if (!copy)
    {
        die("can't alloc memory");
    }
    memcpy(copy, s, bytes);

    /* Only the calls to fp are inside the timed region. */
    uint64_t start = getTime();
    for (size_t u = 0; u < N; u++)
    {
        fp(copy[u]);
    }
    uint64_t end = getTime();

    printf("time %13"PRIu64" %s\n", end - start, name);
#ifdef DEBUG
    for (size_t u = 0; u < N; u++)
    {
        printf("%3zu %"STR(M)"s %"STR(M)"s\n", u, s[u], copy[u]);
    }
#endif
    free(copy);
}
/*
 * runTest - generate one set of random strings and benchmark every variant.
 *
 * Seeds rand() from getTime(), fills N buffers with random strings of length
 * 0..M-1 and then passes the same set to test() once per lowering function,
 * in the order lower1 .. lowerA.
 */
void runTest(void)
{
    static char pool[N][M];

    srand(getTime());

    for (size_t row = 0; row < N; row++)
    {
        char *str = pool[row];
        const size_t len = rand() % M;
        size_t pos = 0;

        /* Characters are drawn from '/' up to, but not including, 'z'. */
        while (pos < len)
        {
            str[pos++] = rand() % ('z' - '/') + '/';
        }
        str[len] = '\0';
    }

    /* Run one variant; the function's own name is used as its label. */
#define TEST(s) test(s,pool,STR(s))
    TEST(lower1);
    TEST(lower2);
    TEST(lower3);
    TEST(lower4);
    TEST(lower5);
    TEST(lower6);
    TEST(lower7);
    TEST(lower8);
    TEST(lower9);
    TEST(lowerA);
}
/*
 * main - run the complete benchmark eight times, each round with a freshly
 * generated set of random strings.
 *
 * Returns 0 so that a completed run is reported to the caller as success.
 */
int main(void)
{
    for (unsigned round = 0; round < 8; round++)
    {
        runTest();
    }
    return 0;
}
The disassembly on AMD64 shows that functions lower1()
, lower5()
and lower6()
(functions that call strlen()
in every loop iteration; the compiler did not optimize that call away) are almost identical, with the exception of addresses and that an add
instruction was replaced by the or
instruction. lower2()
, lower3()
, lower4()
and lower7()
(functions where strlen()
is only called once and for
is used) are also almost identical. lower8()
is different from all the others (it uses strlen()
once and a while
-loop). lower9()
and lowerA()
are almost identical and do not call strlen()
.
Results
On my Debian 9 Stretch ARM running on a Raspberry Pi, the functions lower9()
and lowerA()
are equally fast and faster than all other tested functions. lower2()
, lower3()
, lower4()
, lower7()
and lower8()
took about 58-66% more time but are equal to each other. Despite the different assembly for lower8()
the execution time did not differ significantly. lower1()
and lower6()
took about 297-348% longer than lower9()
and lowerA()
, interestingly lower5()
took even longer (consistent in multiple measurements) with 324%-375%. I do not know why lower5()
took longer since it uses the same machine code except for different addresses (this is also true for the ARM code).
On my Debian 10 Buster AMD64, the function lowerA()
is the fastest, faster than lower9()
by about 3%-6%. I don't know why. But lower5()
is here as fast as lower1()
and lower6()
.