Learning assembly. how to make code faster

Question

I started to learn assembly some days ago and i write my first ever piece of code using user input, string functions, passing arguments by stack or by register etc... I have some questions. Do you have some advices to make my code faster. For example, in my atoi function, i know that imul is time consuming. Maybe, there are enormous mistakes but as far as i know, many things to improve for sure. So my main question is : are there fatal errors in this first code and my second is : any type to refactoring code with faster instructions

SYS_READ    equ 3
SYS_WRITE   equ 4
STDIN       equ 0
STDOUT      equ 1

%macro printm 2
    mov eax, SYS_WRITE
    mov ebx, STDOUT
    mov ecx, %1
    mov edx, %2
    int 0x80
%endmacro

%macro prolog 0
    push ebp,
    mov ebp, esp
%endmacro

%macro epilog 0    
    mov esp, ebp
    pop ebp
%endmacro

section .text

global _start

_start:

    ; first check if our strlen proc works
    push dword msgbegin
    call strlen
    add esp, byte 4
    cmp eax, lenbegin
    je .suite       ; it works, we continue

    ; exiting prog if the len computed in rax != lenbegin
    mov eax, 1
    int 0x80

.suite:
    
    ; check if strcpy works printing res (msgbegin -> srcdst)
    push dword lenbegin
    push dword msgbegin
    push dword strdst
    call strcpy
    add esp, byte 12
    
    push dword lenbegin
    push dword strdst
    call print
    add esp, byte 8

    ; first input
    printm msgbinp1, leninp1

    mov eax, SYS_READ
    mov ebx, STDIN
    mov ecx, num1
    mov edx, 2
    int 0x80

    printm msgbinp2, leninp2

    mov eax, SYS_READ
    mov ebx, STDIN
    mov ecx, num2
    mov edx, 2
    int 0x80

    printm msgbinp3, leninp3

    mov eax, SYS_READ
    mov ebx, STDIN
    mov ecx, bignum
    mov edx, 4
    int 0x80

    mov edx, bignum
    call atoi
    cmp eax, 123
    je .success     ; exit if bignum != 123

    mov eax, 1
    int 0x80 

.success:

    ; need to strip line feed from bignum
    printm bignum, 4
    printm msgoutp, lenoutp

    ; now we compute the sum
    mov eax, [num1]
    sub eax, '0'
    mov ebx, [num2]
    sub ebx, '0'
    add eax, ebx
    add eax, '0'

    mov [sum], eax

    printm msgres, lenres
    ; we print it
    printm sum, 1

    ; exiting the programm
    mov     eax, 1
    int     0x80
    
print:
    push ebp
    mov ebp, esp
    mov eax, 4
    mov ebx, 1
    mov ecx, [ebp + 8]
    mov edx, [ebp + 12]
    int 0x80
    mov esp, ebp
    pop ebp
    ret
    
strcpy:
    push ebp
    mov ebp, esp
    mov ecx, [ebp + 16]
    mov esi, [ebp + 12]
    mov edi, [ebp + 8]
    rep movsb
    mov esp, ebp
    pop ebp
    ret

strlen:
    push ebp
    mov ebp, esp
    push edi
    push ecx
    mov edi, [ebp + 8]
    sub ecx, ecx
    sub al, al
    not ecx
    cld
    repne scasb
    not ecx
    lea eax, [ecx] ; keep null term in size
    pop ecx
    pop edi
    mov esp, ebp
    pop ebp
    ret

atoi:
    xor eax, eax            ; zero a "result so far"
.top:
    movzx ecx, byte [edx]   ; get a character
    inc edx                 ; ready for next one
    cmp ecx, '0'            ; valid?
    jb .done
    cmp ecx, '9'
    ja .done
    sub ecx, '0'            ; "convert" character to number
    imul eax, 10            ; multiply "result so far" by ten
    add eax, ecx            ; add in current digit
    jmp .top                ; until done
.done:
    ret    

section .data

    msgbegin db  "hello everyone !", 0xa, 0
    lenbegin equ $ - msgbegin
    msgbinp1 db  "Enter a digit : ", 0xa, 0
    leninp1 equ $ - msgbinp1
    msgbinp2 db  "Enter second digit : ", 0xa, 0
    leninp2 equ $ - msgbinp2
    msgbinp3 db  "Enter third digit : ", 0xa, 0
    leninp3 equ $ - msgbinp3
    msgoutp db  "is equal to 123 !", 0xa, 0
    lenoutp equ $ - msgoutp
    msgres db  "sum of x and y is ", 0xa,  0
    lenres equ $ - msgres
    strdst times lenbegin db 0

segment .bss

    sum     resb 1
    num1    resb 2
    num2    resb 2
    bignum  resd 4

Thanks you. I started reading the doc but i'm not sure that i understood key concepts.

Cases are rare where string-to-int conversion is a bottleneck, but one of the basics is that `c >= '0' && c <= '9'` can be optimized to `c - 0x30 <= 9`, and `add r32, r32; lea r32, [r32 + r32 * 4]` is *probably* faster than `imul r32, r32, 10`. — xiver77, Jun 23 '22 at 13:04
For example, your `now we compute the sum` block uses 32 bit registers when it should be using 8 bits. As for optimization, subtracting `0` twice then adding it once is the same as subtracting it once. — Jester, Jun 23 '22 at 13:08
Oh also, always align the critical loops. Some machines (like my Tigerlake laptop) are *very* sensitive to loop alignment. Aligning to 16-byte-boundary is an Intel recommendation. Some assemblers provide convenience macros to generate multibyte `nop`s to match the alignment. — xiver77, Jun 23 '22 at 13:11
In `strlen` the `sub ecx, ecx` and the `not ecx` together are really just a `mov ecx, -1`. The `lea eax, [ecx]` is just a `mov eax, ecx`. You don't need to set up a stack frame there either and you don't need to preserve `ecx` per standard convention. `esi` and `edi` are supposed to be preserved but you don't do that in `strcpy` so it's unclear whether you want to follow convention or not. By the way, there is a [separate code review site](https://codereview.stackexchange.com/). — Jester, Jun 23 '22 at 13:13
Maybe, for positive integers, since each integer string is max 10 bytes, you can load it to a `ymm` vector -> `vpmovzxbw` -> `vpmullw` with `[1, 10, 100, ...]` -> sum all the needed elements. It's untested, and I don't know if it's faster at all. — xiver77, Jun 23 '22 at 13:21
For a more efficient `atoi`, see [NASM Assembly convert input to integer?](https://stackoverflow.com/a/49548057). Or for @xiver77's suggestion to use SIMD, see [Is there a fast way to convert a string of 8 ASCII decimal digits into a binary number?](https://stackoverflow.com/q/71570932) and [Most insanely fastest way to convert 9 char digits into an int or unsigned int](https://stackoverflow.com/q/70420948) for fixed-width using `pmaddubsw` / `pmaddwd` to avoid overflowing a 16-bit element, and keep density higher. — Peter Cordes, Jun 23 '22 at 13:39
@xiver77: For variable-width it's harder, but perhaps compute a load address so the LSD will be at the same place. And see [How to implement atoi using SIMD?](https://stackoverflow.com/q/35127060). Probably also a good idea to look at what modern SIMD-JSON is doing; they may have refined those techniques. (https://github.com/simdjson/simdjson) — Peter Cordes, Jun 23 '22 at 13:41
`rep movsb` isn't `strcpy`, it's `memcpy`. It's decently efficient for non-tiny copies on modern x86, although only Ice Lake has the "fast short rep movs" feature which should reduce startup overhead. (https://www.phoronix.com/scan.php?page=news_item&px=Intel-5.6-FSRM-Memmove). But `repne scasb` is *not* particularly efficient; it only goes 1 byte at a time on current CPUs. (**https://agner.org/optimize/**). You can go *much* faster with SSE2 `pcmpeqb` / `pmovmskb`. See [Is it safe to read past the end of a buffer within the same page on x86 and x64?](//stackoverflow.com/q/37800739) — Peter Cordes, Jun 23 '22 at 13:47
Of course, all of these things are trivial compared to the cost of one `int 0x80` system call unless your strings are *many* KiB. So you could get a big speedup by reading command line args, which would let your program make only one system call. — Peter Cordes, Jun 23 '22 at 13:52
I'd say the first thing you need to do is realize that complying with C calling conventions (rather than not having any prologue or epilogue and passing parameters in registers) does nothing more than add worthless bloat. For an example, look at your `strcpy:` - it's a single useful instruction surrounded in 7 instructions of useless bloat (which could and should be "inlined" to also get rid of the `call` and `ret` instructions). — Brendan, Jun 23 '22 at 14:45
Thanks you all for great advices. I take note. I don't understand yet the idea of 16 bytes boundary stack alignment. Could you show me an example with my code ? After the prologue in main, is is possible to make `and esp, 0xFFFFFFF0` to make sure the stack is aligned ? And yes i will learn the C calling conventions. It is needed. — Etienne Armangau, Jun 23 '22 at 15:47
@EtienneArmangau It's not stack alignment. I'm talking about the instruction alignment so that the start of each label of a loop is at a 16-byte boundary. [This](https://stackoverflow.com/a/45391788/17665807) is a nice read. — xiver77, Jun 23 '22 at 17:00
Thanks a lot. Now i understand the concept. I see that any of my label is correctly on 16b boundaries. My question now is, except add nops instructions to pad it, is there a way or rewriting code smartly, to reach this goal ? or the lazy way : a compiler option ? Thanks. — Etienne Armangau, Jun 23 '22 at 18:04
Unfortunately current assemblers handle an `align 16` directive by expanding it to NOPs, without trying to [lengthen previous instructions](//stackoverflow.com/questions/48046814/what-methods-can-be-used-to-efficiently-extend-instruction-length-on-modern-x86). (Except for GAS's `-mbranches-within-32B-boundaries` workaround for [How can I mitigate the impact of the Intel jcc erratum on gcc?](//stackoverflow.com/q/61256646)). Even worse, NASM's default behaviour for `align` is single-byte NOPs; you need `%use smartalign` to get long NOPs, important inside a function where they'll execute. — Peter Cordes, Jun 23 '22 at 19:11
I’m voting to close this question because it belongs to https://codereview.stackexchange.com/ — Mike Nakis, Jun 23 '22 at 19:15
Thanks you Peter for `%use smartalign`. I did not know this macro before. — Etienne Armangau, Jun 23 '22 at 20:29

Learning assembly. how to make code faster

0 Answers0