I decide to create a string-length function in Assembly (using FASM
).
My function takes a string (no matter aligned at 8 bytes or not) and checks if it's aligned at 8 bytes. If it's aligned, the main process (loop) will be begun. Otherwise, first 8 characters will be checked one-by-one, then the string will be aligned at 8 bytes and continue ...
There will be no "end of the memory page" problem since the string will be aligned at 8 bytes boundary anyway and by this alignment, it will never face the end of memory page problem.
But the problem is that I decided to implement its C version too, and I compiled it, and now I have 2 assembly codes, the one I wrote it and the one is written in C and compiled to assembly. The problem is the C version is up to 1.5x faster than my handwritten assembly !!!!!!! In my code, everything is just fine, and I even aligned the jump-points to 16 bytes and there is no nop
running (except one, out of the loop which is kinda nothing (.align8
to .loop
)) !!!
I can't find why my pure assembly code is 1.5x slower than the GCC version !!!
My Assembly source-code :
align 16
slen:
mov r8, rcx
test cl, 7
jz .loop
xor eax, eax
cmp BYTE [rcx], al
je SHORT .ret
cmp BYTE [rcx+1], al
je SHORT .ret1
cmp BYTE [rcx+2], al
je SHORT .ret2
cmp BYTE [rcx+3], al
je SHORT .ret3
cmp BYTE [rcx+4], al
je SHORT .ret4
cmp BYTE [rcx+5], al
je SHORT .ret5
cmp BYTE [rcx+6], al
je SHORT .ret6
cmp BYTE [rcx+7], al
jne SHORT .align8
mov al, 7
ret
align 16
.ret: ret
align 16
.ret1: mov al, 1
ret
align 16
.ret2: mov al, 2
ret
align 16
.ret3: mov al, 3
ret
align 16
.ret4: mov al, 4
ret
align 16
.ret5: mov al, 5
ret
align 16
.ret6: mov al, 6
ret
align 16
.align8:
lea rcx, [rcx+7]
and rcx, (-8)
align 16
.loop: mov rax, QWORD [rcx]
test al, al
jz SHORT .end
test ah, ah
jz SHORT .end.1
test eax, 0x00ff0000
jz SHORT .end.2
test eax, 0xff000000
jz SHORT .end.3
shr rax, 32
test al, al
jz SHORT .end.4
test ah, ah
jz SHORT .end.5
test eax, 0x00ff0000
jz SHORT .end.6
test eax, 0xff000000
jz SHORT .end.7
add rcx, 8
jmp SHORT .loop
align 16
.end: mov rax, rcx
sub rax, r8
ret
align 16
.end.1:
lea rax, [rcx+1]
sub rax, r8
ret
.end.2:
lea rax, [rcx+2]
sub rax, r8
ret
.end.3:
lea rax, [rcx+3]
sub rax, r8
ret
.end.4:
lea rax, [rcx+4]
sub rax, r8
ret
.end.5:
lea rax, [rcx+5]
sub rax, r8
ret
.end.6:
lea rax, [rcx+6]
sub rax, r8
ret
.end.7:
lea rax, [rcx+7]
sub rax, r8
ret
The GCC version :
align 16
slen:
test cl, 7
je .L18
xor eax, eax
cmp BYTE [rcx], 0
je .L1
cmp BYTE [rcx+1], 0
mov eax, 1
je .L1
cmp BYTE [rcx+2], 0
mov eax, 2
je .L1
cmp BYTE [rcx+3], 0
mov eax, 3
je .L1
cmp BYTE [rcx+4], 0
mov eax, 4
je .L1
cmp BYTE [rcx+5], 0
mov eax, 5
je .L1
cmp BYTE [rcx+6], 0
mov eax, 6
je .L1
cmp BYTE [rcx+7], 0
mov eax, 7
je .L1
lea rax, [rcx+7]
and rax, -8
jmp .L47
align 16
.L18:
mov rax, rcx
jmp .L47
align 16
.L40:
test dh, dh
je .L49
test edx, 16711680
je .L50
test edx, 4278190080
je .L51
shr rdx, 32
test dl, dl
je .L52
test dh, dh
je .L53
test edx, 16711680
je .L54
test edx, 4278190080
je .L55
add rax, 8
.L47:
mov rdx, QWORD [rax]
test dl, dl
jne .L40
sub eax, ecx
.L1:
ret
align 16
.L49:
sub rax, rcx
add eax, 1
ret
align 16
.L50:
sub rax, rcx
add eax, 2
ret
align 16
.L51:
sub rax, rcx
add eax, 3
ret
align 16
.L52:
sub rax, rcx
add eax, 4
ret
align 16
.L53:
sub rax, rcx
add eax, 5
ret
align 16
.L54:
sub rax, rcx
add eax, 6
ret
align 16
.L55:
sub rax, rcx
add eax, 7
ret
My function test result :
string length => 336
loop execution times => 10000000
total execution time => 0.772015
GCC function test result :
string length => 336
loop execution times => 10000000
total execution time => 0.522015
What is the problem ? Why my function is 1.5x slower when everything is kinda looks fine? My string is aligned at 8 bytes, so you can skip the first one-by-one process and alignment.
Is there any problem with my label aligning ? Or the problem is from somewhere else?
ABI -> x64 (Windows)
CPU (Test) => i7-7800X
My C test application source-code :
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
unsigned int
slen_by_me(const char *);
unsigned int
slen_gcc(const char *);
int main() {
static const char *str="WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW";
LARGE_INTEGER frequency;
LARGE_INTEGER start;
LARGE_INTEGER end;
double interval;
unsigned int l = 0;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&start);
for (int i = 0; i < 10000000; i++) {
l += slen_gcc(str);
}
QueryPerformanceCounter(&end);
interval = (double) (end.QuadPart - start.QuadPart) / frequency.QuadPart;
printf("%f\n%u\n", interval, l);
return 0;
}
My object file (with these 2 slen
functions to link to that C tester) creator in FASM :
format MS64 COFF
public slen_gcc
public slen_by_me
section '.text' code readable executable align 64
align 16
slen_gcc:
test cl, 7
je .L18
xor eax, eax
cmp BYTE [rcx], 0
je .L1
cmp BYTE [rcx+1], 0
mov eax, 1
je .L1
cmp BYTE [rcx+2], 0
mov eax, 2
je .L1
cmp BYTE [rcx+3], 0
mov eax, 3
je .L1
cmp BYTE [rcx+4], 0
mov eax, 4
je .L1
cmp BYTE [rcx+5], 0
mov eax, 5
je .L1
cmp BYTE [rcx+6], 0
mov eax, 6
je .L1
cmp BYTE [rcx+7], 0
mov eax, 7
je .L1
lea rax, [rcx+7]
and rax, -8
jmp .L47
align 16
.L18:
mov rax, rcx
jmp .L47
align 16
.L40:
test dh, dh
je .L49
test edx, 16711680
je .L50
test edx, 4278190080
je .L51
shr rdx, 32
test dl, dl
je .L52
test dh, dh
je .L53
test edx, 16711680
je .L54
test edx, 4278190080
je .L55
add rax, 8
.L47:
mov rdx, QWORD [rax]
test dl, dl
jne .L40
sub eax, ecx
.L1:
ret
align 16
.L49:
sub rax, rcx
add eax, 1
ret
align 16
.L50:
sub rax, rcx
add eax, 2
ret
align 16
.L51:
sub rax, rcx
add eax, 3
ret
align 16
.L52:
sub rax, rcx
add eax, 4
ret
align 16
.L53:
sub rax, rcx
add eax, 5
ret
align 16
.L54:
sub rax, rcx
add eax, 6
ret
align 16
.L55:
sub rax, rcx
add eax, 7
ret
align 16
slen_by_me:
mov r8, rcx
test cl, 7
jz .loop
xor eax, eax
cmp BYTE [rcx], al
je SHORT .ret
cmp BYTE [rcx+1], al
je SHORT .ret1
cmp BYTE [rcx+2], al
je SHORT .ret2
cmp BYTE [rcx+3], al
je SHORT .ret3
cmp BYTE [rcx+4], al
je SHORT .ret4
cmp BYTE [rcx+5], al
je SHORT .ret5
cmp BYTE [rcx+6], al
je SHORT .ret6
cmp BYTE [rcx+7], al
jne SHORT .align8
mov al, 7
ret
align 16
.ret: ret
align 16
.ret1: mov al, 1
ret
align 16
.ret2: mov al, 2
ret
align 16
.ret3: mov al, 3
ret
align 16
.ret4: mov al, 4
ret
align 16
.ret5: mov al, 5
ret
align 16
.ret6: mov al, 6
ret
align 16
.align8:
lea rcx, [rcx+7]
and rcx, (-8)
align 16
.loop: mov rax, QWORD [rcx]
test al, al
jz SHORT .end
test ah, ah
jz SHORT .end.1
test eax, 0x00ff0000
jz SHORT .end.2
test eax, 0xff000000
jz SHORT .end.3
shr rax, 32
test al, al
jz SHORT .end.4
test ah, ah
jz SHORT .end.5
test eax, 0x00ff0000
jz SHORT .end.6
test eax, 0xff000000
jz SHORT .end.7
add rcx, 8
jmp SHORT .loop
align 16
.end: mov rax, rcx
sub rax, r8
ret
align 16
.end.1:
lea rax, [rcx+1]
sub rax, r8
ret
.end.2:
lea rax, [rcx+2]
sub rax, r8
ret
.end.3:
lea rax, [rcx+3]
sub rax, r8
ret
.end.4:
lea rax, [rcx+4]
sub rax, r8
ret
.end.5:
lea rax, [rcx+5]
sub rax, r8
ret
.end.6:
lea rax, [rcx+6]
sub rax, r8
ret
.end.7:
lea rax, [rcx+7]
sub rax, r8
ret
Also the C version of slen
int
slen(const char *str) {
const char *start=str;
if(((unsigned long long)str & 7) != 0) {
if(str[0] == 0x00)
return 0;
if(str[1] == 0x00)
return 1;
if(str[2] == 0x00)
return 2;
if(str[3] == 0x00)
return 3;
if(str[4] == 0x00)
return 4;
if(str[5] == 0x00)
return 5;
if(str[6] == 0x00)
return 6;
if(str[7] == 0x00)
return 7;
str=(const char *)(((unsigned long long)str + 7) & (-8));
}
do {
unsigned long long bytes=(*(unsigned long long*)(str));
if((unsigned char)bytes==0x00)
return (int)(str-start);
if((bytes & 0x0000ff00)==0)
return (int)(str-start+1);
if((bytes & 0x00ff0000)==0)
return (int)(str-start+2);
if((bytes & 0xff000000)==0)
return (int)(str-start+3);
bytes >>= 32;
if((unsigned char)bytes==0x00)
return (int)(str-start+4);
if((bytes & 0x0000ff00)==0)
return (int)(str-start+5);
if((bytes & 0x00ff0000)==0)
return (int)(str-start+6);
if((bytes & 0xff000000)==0)
return (int)(str-start+7);
str+=8;
} while (1);
}