0

I am new to assembly programming, currently taking online course.

Original problem was to count number of paths from top left corner to bottom right corner. But I found a good solution to that here:

https://www.geeksforgeeks.org/count-possible-paths-top-left-bottom-right-nxm-matrix/

Based on the combinatorics solution I should be able to find all paths in a binary manner. First question, do you know a faster way to count paths?

Searched for the solution to print all paths in:

https://www.geeksforgeeks.org/print-all-possible-paths-from-top-left-to-bottom-right-of-a-mxn-matrix/

But I did not notice any using the binary approach, which seemed adequate for assembly. Searching a bit more online I found:

https://www.baeldung.com/cs/generate-k-combinations

The revolving-door algorithm was well detailed, and I calculate it to be O(number of combinations) * O(width or height of matrix (for printing) - 1) * O(branching loops) in time complexity and O(width or height + 1) in space. Second question: is this a correct assumption? If not, what is the correct complexity? Is it faster than the other solutions posted for finding all paths to this problem? Those are stated to be O(2^(width*height)).

Third question: Who wrote this algorithm? Where can I find more like it?

And lastly, I will post my newbie 32-bit assembly spaghetti code for fasm. It should work on matrices larger than 3 x 3 and smaller than 32 x 32 (going above 16 x 16 is not recommended — that is already a lot of combinations even when omitting the print instructions). Any improvements are more than welcome. Thank you.

    format PE console
entry start

; -----------------------------------------------------------------------
; Print every monotone lattice path (right = 1 bit, down = 0 bit) from the
; top-left to the bottom-right corner of an h x w matrix, then print how
; many paths there were.  A path is a (w-1)-combination of bit positions
; out of the h+w-2 total moves; combinations are generated with the
; revolving-door (Gray-code) algorithm, so consecutive paths differ in at
; most two positions.  read_hex / print_eax / print_eax_binary are helper
; routines provided by training.inc (not shown in this file).
; -----------------------------------------------------------------------

include 'win32a.inc' 

; ===============================================
struct MAT
    h   db  ?       ; matrix height (number of rows)
    w   db  ?       ; matrix width (number of columns)
ends



; ===============================================
section '.bss' readable writeable
    ; Declare the uninitialized table in memory:
    path_matrix     MAT  ?
    count           dd  ?       ; running number of paths printed
    ; indices[1..w-1] holds the current combination in increasing order;
    ; indices[w] is a sentinel holding h+w-2 (the total number of moves).
    ; NOTE(review): `path_matrix.w - 1` is evaluated at ASSEMBLY time as
    ; (address of the w field) - 1, not as the runtime width minus one.
    ; It happens to reserve plenty of space, but a fixed maximum (e.g. 31
    ; for the stated 32x32 limit) would say what is meant -- TODO confirm.
    indices         db path_matrix.w - 1 dup ?
    end_indices:
    

; ===============================================
section '.text' code readable executable

start:

    ; Read the two matrix dimensions (hex) from the console.
    call    read_hex
    mov     [path_matrix.h], al ; down will be 0
    call    read_hex
    mov     [path_matrix.w], al ; right will be 1
    
    dec     eax                 ; eax = w-1 = combination size (# of "right" moves)
    mov     ecx, eax
    
; Build the first combination {0,1,...,w-2} into indices[1..w-1].
initialize: 
    mov     ebx, ecx
    dec     ebx
    mov     byte[indices+ecx], bl   ; indices[i] = i-1
    loop    initialize
    movzx   ebx, [path_matrix.h]
    dec     ebx
    add     ebx, eax                ; ebx = (h-1) + (w-1) = total number of moves
    mov     byte[indices+eax+1], bl ; sentinel at indices[w]
    mov     eax, ebx
    
    

; Emit the current combination as a bit pattern and bump the counter.
print_combination:
    inc     [count]
    ; NOTE(review): this LOADS a byte from the constant address
    ; (end_indices - indices) rather than using the constant itself;
    ; `mov ebx, end_indices - indices` looks like what was meant -- TODO confirm.
    movzx   ebx, [end_indices - indices]
    dec     ebx
    xor     eax, eax
    
    
; Set bit indices[i] in eax for i = high..1 (xor is safe: positions are distinct).
print_loop:
    xor     esi, esi
    inc     esi                     ; esi = 1
    mov     cl, byte[indices + ebx ]
    shl     esi, cl                 ; esi = 1 << indices[ebx]
    xor     eax, esi
    dec     ebx
    cmp     ebx, 0
    jnz     print_loop
    call    print_eax_binary


    
; Revolving-door successor step.  Which first move is tried depends on the
; parity of the combination size w-1, read from the low bit of w.
branch:
    lea     edi, [indices +1]       ; edi -> smallest index of the combination
    movzx   eax, [path_matrix.w] ; shift low bit of w into CF: CF clear => w even => size w-1 odd
    shr     eax, 1
    jnc     odd
    

; Even-size case: first try to decrement the smallest index toward zero.
eaven:
    movzx   eax, byte[edi]
    cmp     eax, 0
    jle     eaven_indice            ; already zero -- cascade to the next index
    dec     eax
    mov     byte[edi], al
    jmp     print_combination 
    

eaven_indice:
    inc     edi
        

; Try to raise indices[j] by one; it must stay below its right neighbour
; (or the sentinel) to keep the combination strictly increasing.
try_to_increase:
    movzx   ebx, byte[edi]
    inc     ebx
    cmp     bl, [edi+1]
    jl      increase
    lea     ecx, [edi-indices+1]    ; ecx = position j+1 within the array
    cmp     cl, [path_matrix.w]
    jl      increase_indice
    jmp     fin                     ; walked past the last index: enumeration done
    
    
; The "door revolves": indices[j] += 1 and indices[j-1] takes the old value.
increase:
    mov     byte[edi], bl
    dec     ebx
    mov     byte[edi-1], bl
    jmp     print_combination

    
    
; Odd-size case: first try to increment the smallest index.
odd:
    movzx   eax, byte[edi]
    inc     eax
    cmp     al, [edi+1]
    jge     increase_indice         ; blocked by the neighbour -- cascade right
    mov     byte[edi], al
    jmp     print_combination

    
increase_indice:
    inc     edi
    

; indices[j] can only be lowered while it sits above its own position j.
try_decrease:
    lea     eax, [edi - indices]    ; eax = position j of this index (1-based)
    cmp     byte[edi], al
    jl      eaven_indice    

    
; indices[j] = indices[j-1] and indices[j-1] = j-2 (its minimum value).
decrease:
    movzx   ebx, byte[edi-1]
    mov     byte[edi], bl
    sub     eax, 2
    mov     byte[edi-1], al
    jmp     print_combination
    
    
fin:
    mov     eax, [count]
    call    print_eax               ; total number of paths found

    ; Exit the process:
    push    0
    call    [ExitProcess]

include 'training.inc'
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847

1 Answer

2

The solution is not binary because paths from the top or left can overlap - creating duplicates. Working backward the solution is additive - sum of paths from left and paths from top.

Which leads to a simple solution:

; Dynamic-programming path count: dp[c] = number of monotone paths from
; cell (r, c) to the bottom-right corner.  Sweeping one row from right to
; left replaces dp with its suffix sums, which is exactly the recurrence
;   dp[c] = dp_below[c] + dp[c+1].
; After _m sweeps, RAX = dp[0] = number of paths in an _m x _n grid.
; Assumes ddp starts zero-filled (reserved data in a PE image is).
_m      = 16                            ; rows
_n      = 16                            ; columns

ddp     rq _n                           ; one DP row: _n qwords (was _m -- only safe while _m >= _n)

        ; seed: exactly one path from the bottom-right cell to itself
        mov     qword [ddp + (_n-1)*8], 1   ; explicit size: fasm rejects a size-less mem,imm move
        mov     r8d, _m                 ; outer counter in r8, so the inner loop owns rcx (no push/pop)
outer:
        mov     ecx, _n
        xor     eax, eax                ; rax = running suffix sum for this sweep
inner:
        add     rax, [ddp + (rcx-1)*8]
        mov     [ddp + (rcx-1)*8], rax
        dec     ecx                     ; dec/jnz: the LOOP instruction is microcoded/slow on Intel
        jnz     inner
        dec     r8d
        jnz     outer
        ; RAX = number of paths

There is a closed form solution, but it's a little tricky to reduce the expression in such a way as to cover a large number of inputs:

; number of paths from one corner to opposite diagonal corner of M x N matrix
; RCX : N, RDX : M
; Computes the binomial coefficient C(N+M-2, M-1) as
;   (N * (N+1) * ... * (N+M-2)) / (1 * 2 * ... * (M-1)),
; multiplying BEFORE each division: a product of i consecutive integers is
; divisible by i!, so every intermediate quotient is exact.  MUL/DIV operate
; on the 128-bit pair RDX:RAX, so the running product may exceed 64 bits
; between the multiply and the divide without overflowing.
; Returns: RAX = path count (1 when M or N is 1).  Clobbers: RCX, RDX, R8, R9, flags.
; NOTE(review): `inc ecx` zero-extends into RCX, so N must fit in 32 bits;
; fine for any realistic input -- confirm the intended range.
numberOfPaths:
        mov eax,1                       ; rax = running product / quotient
        mov r9,1                        ; r9 = next denominator factor 1, 2, ..., M-1
        lea r8,[rcx+rdx-1]              ; r8 = N+M-1, exclusive loop bound
        jmp .try
    .more:
        mul rcx                         ; rdx:rax = rax * k   (k = rcx, runs N..N+M-2)
        inc ecx
        div r9                          ; rax = rdx:rax / r9  (exact, see header)
        inc r9
    .try:
        cmp rcx,r8
        jc .more                        ; unsigned: while rcx < N+M-1
        retn

It can be reduced further with more code.

bitRAKE
  • 391
  • 2
  • 9
  • 1
    It would be more convenient to use a different register for the inner loop counter. [`loop` is slow](https://stackoverflow.com/questions/35742570/why-is-the-loop-instruction-slow-couldnt-intel-have-implemented-it-efficiently), so it's better not to use it for either loop, especially where writing FLAGS would be fine with `dec ecx` / `jnz` – Peter Cordes Jun 24 '22 at 20:47
  • I like CISC and use it creatively. – bitRAKE Jun 25 '22 at 16:57
  • [loop is getting faster all the time](https://www.uops.info/table.html?search=loop&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_ZEN3=on&cb_measurements=on&cb_doc=on&cb_base=on) – bitRAKE Jun 25 '22 at 18:17
  • It's been fast on AMD since Bulldozer-family, but it's not getting faster on Intel. It's garbage even on Alder Lake P-cores, and on Tremont and Alder Lake E-cores, at about 5 cycles / iteration. (Note that uops.info's testing on Zen showing 0.56 or 0.50 cycles/loop throughput is for the not-taken case, `mov rcx,1` / `loop`. In the taken case, RCX=0 or 2, it looks like the best case measured throughput is more like 2.15 cycles, but they only measured non-looping uses of it :P Agner Fog's timing is I assume measuring it as a loop branch, so as expected can run it 1/clock on AMD.) – Peter Cordes Jun 25 '22 at 19:43
  • Thanks, for the lesson on StackOverflow culture. Considering this in my future replies might be difficult as I have no gauge for "most humans". So, they might remain weird. – bitRAKE Jul 22 '22 at 16:05