The correct asm version may be:
unit MyStrings; // do not overlap Strings.pas unit

interface

/// return the length of a Delphi string, as stored in the string header
// - texto is declared as const to avoid a reference count change at calling
// - returns 0 for an empty '' string
function StringLen(const texto : string) : integer;

implementation

function StringLen(const texto : string) : integer;
{$ifdef CPU386}
// 32-bit x86 register calling convention:
//   In:  eax = texto (pointer to the first character, nil for '')
//   Out: eax = length
// no other register and no stack slot is used
asm
        test    eax, eax                  // nil pointer stands for ''
        jz      @done                     // ... so eax = 0 is already the result
        mov     eax, dword ptr [eax - 4]  // length is stored just before the chars
@done:
end;
{$else}
// the asm version above expects its parameter in eax, which only holds for
// the 32-bit x86 register convention: use the built-in Length() elsewhere
begin
  Result := Length(texto);
end;
{$endif}

end.
Note that:
- I used MyStrings as the unit name, since it is a very bad idea to overlap the official RTL unit names, like Strings.pas;
- I wrote
(const texto: string)
instead of (texto: string)
, to avoid a reference count change at calling;
- The Delphi string type already has its length stored as an integer just before the character memory buffer;
- In Delphi asm calling conventions, the input parameters are passed in the eax, edx and ecx registers, and the integer result of a function is returned in the eax register - see this reference article - for Win32 only;
- I tested for texto to be nil (eax=0), which stands for an empty '' string;
- This would work only under Win32 - asm code under Win64 would be different;
- Built-in
length()
function would be faster than an asm sub-function, since it is inlined in new versions of Delphi;
- Be aware of potential name collisions: there is already a well-known StrLen() function, which expects a PChar as input parameter - so I renamed your function to StringLen().
Since you want to learn asm, here are some reference implementations of this function.
A fast PChar-oriented version may be:
function StrLen(S: PAnsiChar): integer;
// byte-per-byte scan, unrolled four times
//   In:  eax = S (may be nil)
//   Out: eax = number of bytes before the first #0, or 0 if S = nil
//   Uses edx as scratch; no stack slot is touched
asm
        mov     edx, eax                  // edx = S, eax becomes the running index
        test    eax, eax
        jz      @done                     // S = nil: eax is already 0
        xor     eax, eax
@next4: cmp     byte ptr [edx + eax], 0
        je      @done
        cmp     byte ptr [edx + eax + 1], 0
        je      @plus1
        cmp     byte ptr [edx + eax + 2], 0
        je      @plus2
        cmp     byte ptr [edx + eax + 3], 0
        je      @plus3
        add     eax, 4                    // no #0 in these 4 bytes: move on
        jmp     @next4
@plus1: inc     eax
@done:  ret
@plus2: add     eax, 2
        ret
@plus3: add     eax, 3
end;
A more optimized version:
function StrLen(S: PAnsiChar): integer;
// pure x86 function (if SSE2 not available) - faster than SysUtils' version
//   In:  eax = S (may be nil)
//   Out: eax = number of bytes before the first #0, or 0 if S = nil
//   Uses ecx and edx as scratch, and one stack slot to remember S
// - the main loop reads whole aligned dwords, so it may read a few bytes
//   located after the terminating #0
asm
test eax,eax
jz @@z // S = nil: return the 0 already in eax
// check the first 4 bytes one by one, so that the dword loop below can start
// at the next 4-byte boundary without leaving any byte unchecked
cmp byte ptr [eax+0],0; je @@0
cmp byte ptr [eax+1],0; je @@1
cmp byte ptr [eax+2],0; je @@2
cmp byte ptr [eax+3],0; je @@3
push eax // keep S for the final subtraction
and eax,-4 { DWORD Align Reads }
@@Loop:
add eax,4
mov edx,[eax] { 4 Chars per Loop }
lea ecx,[edx-$01010101] { ecx = edx - $01010101 }
not edx
and edx,ecx
and edx,$80808080 { Lowest $80 marks the first #0 - a byte above it may be flagged too }
jz @@Loop { Loop until any #0 Found }
@@SetResult:
pop ecx { ecx = S }
bsf edx,edx { Find First #0 Position - only the lowest set bit matters }
shr edx,3 { Byte Offset of First #0 }
add eax,edx { Address of First #0 }
sub eax,ecx { Returns Length }
@@z: ret
// early exits: the #0 was found within the first 4 bytes
@@0: xor eax,eax; ret
@@1: mov eax,1; ret
@@2: mov eax,2; ret
@@3: mov eax,3
end;
An SSE2 optimized version:
function StrLen(S: PAnsiChar): integer;
// from GPL strlen32.asm by Agner Fog - www.agner.org/optimize
//   In:  eax = S (may be nil)
//   Out: eax = number of bytes before the first #0, or 0 if S = nil
//   Uses ecx, edx, xmm0, xmm1 as scratch, and one stack slot to remember S
// - every read is a whole 16-byte aligned block (movdqa)
asm
        mov     ecx, eax                  // copy pointer
        test    eax, eax
        jz      @nil                      // returns 0 if S=nil
        push    eax                       // save start address
        pxor    xmm0, xmm0                // xmm0 = 16 zero bytes to compare with
        and     ecx, 15                   // lower 4 bits indicate misalignment
        and     eax, -16                  // align pointer by 16
        movdqa  xmm1, [eax]               // read from nearest preceding boundary
        pcmpeqb xmm1, xmm0                // compare 16 bytes with zero
        pmovmskb edx, xmm1                // get one bit for each byte result
        shr     edx, cl                   // discard the bits of bytes located before S
        shl     edx, cl                   // ... while keeping block-relative positions
        bsf     edx, edx                  // find first 1-bit
        jnz     @found                    // #0 is in the first block
        // main loop, search 16 bytes at a time
@scan:  add     eax, 16                   // next aligned block
        movdqa  xmm1, [eax]
        pcmpeqb xmm1, xmm0
        pmovmskb edx, xmm1
        bsf     edx, edx                  // find first 1-bit
        // (moving the bsf out of the loop and using test here would be faster
        // for long strings on old processors, but we are assuming that most
        // strings are short, and newer processors have higher priority)
        jz      @scan                     // loop if not found
@found: // zero-byte found: length = block address - S + index in block
        pop     ecx                       // restore start address
        sub     eax, ecx
        add     eax, edx
@nil:
end;
Or even an SSE4.2 optimized version:
function StrLen(S: PAnsiChar): integer;
// warning: may read up to 15 bytes beyond the string itself
//   In:  eax = S (may be nil)
//   Out: eax = length, or 0 if S = nil
//   Uses ecx, edx, xmm0 as scratch; no stack slot is touched
// - pcmpistri is emitted as raw opcode bytes when HASAESNI is not defined
asm
        mov     edx, eax                  // copy pointer
        test    eax, eax
        jz      @exit                     // returns 0 if S=nil
        xor     eax, eax                  // eax = offset of the current 16-byte chunk
        pxor    xmm0, xmm0
        {$ifdef HASAESNI}
        pcmpistri xmm0, dqword [edx], EQUAL_EACH  // comparison result in ecx
        {$else}
        db      $66, $0F, $3A, $63, $02, EQUAL_EACH
        {$endif}
        jnz     @next16                   // terminator not in the first chunk
        mov     eax, ecx                  // first chunk: ecx is returned as the length
@exit:  ret
@next16:
        add     eax, 16
        {$ifdef HASAESNI}
        pcmpistri xmm0, dqword [edx + eax], EQUAL_EACH  // comparison result in ecx
        {$else}
        db      $66, $0F, $3A, $63, $04, $10, EQUAL_EACH
        {$endif}
        jnz     @next16
        add     eax, ecx                  // chunk offset + index inside the chunk
end;
You will find all those functions, including Win64 versions, in our very optimized SynCommons.pas unit, which is shared by almost all our Open Source projects.