2

I have written a Assembly program to display the factorial of a number following AT&T syntax. But it's not working. Here is my code

.text 

.globl _start

_start:
movq $5,%rcx
movq $5,%rax


Repeat:                     #function to calculate factorial
   decq %rcx
   cmp $0,%rcx
   je print
   imul %rcx,%rax
   cmp $1,%rcx
   jne Repeat
# Now result of factorial stored in rax
print:
     xorq %rsi, %rsi

  # function to print integer result digit by digit by pushing in 
       #stack
  loop:
    movq $0, %rdx
    movq $10, %rbx
    divq %rbx
    addq $48, %rdx
    pushq %rdx
    incq %rsi
    cmpq $0, %rax
    jz   next
    jmp loop

  next:
    cmpq $0, %rsi
    jz   bye
    popq %rcx
    decq %rsi
    movq $4, %rax
    movq $1, %rbx
    movq $1, %rdx
    int  $0x80
    addq $4, %rsp
    jmp  next
bye:
movq $1,%rax
movq $0, %rbx
int  $0x80


.data
   num : .byte 5

This program is printing nothing, I also used gdb to visualize it work fine until loop function but when it comes in next some random value start entering in various register. Help me to debug so that it could print factorial.

MattDMo
  • 100,794
  • 21
  • 241
  • 231
am10
  • 449
  • 1
  • 6
  • 17
  • 1
    `int 0x80` in 64-bit code: https://stackoverflow.com/questions/46087730/what-happens-if-you-use-the-32-bit-int-0x80-linux-abi-in-64-bit-code – Peter Cordes Sep 19 '17 at 13:44

2 Answers2

5

As @ped7g points out, you're doing several things wrong: using the int 0x80 32-bit ABI in 64-bit code, and passing character values instead of pointers to the write() system call.

Here's how to print an integer in x8-64 Linux, the simple and somewhat-efficient1 way, using the same repeated division / modulo by 10.

System calls are expensive (probably thousands of cycles for write(1, buf, 1)), and doing a syscall inside the loop steps on registers so it's inconvenient and clunky as well as inefficient. We should write the characters into a small buffer, in printing order (most-significant digit at the lowest address), and make a single write() system call on that.

But then we need a buffer. The maximum length of a 64-bit integer is only 20 decimal digits, so we can just use some stack space. In x86-64 Linux, we can use stack space below RSP (up to 128B) without "reserving" it by modifying RSP. This is called the . If you wanted to pass the buffer to another function instead of a syscall, you would have to reserve space with sub $24, %rsp or something.

Instead of hard-coding system-call numbers, using GAS makes it easy to use the constants defined in .h files. Note the mov $__NR_write, %eax near the end of the function. The x86-64 SystemV ABI passes system-call arguments in similar registers to the function-calling convention. (So it's totally different from the 32-bit int 0x80 ABI, which you shouldn't use in 64-bit code.)

// building with  gcc foo.S  will use CPP before GAS so we can use headers
#include <asm/unistd.h>    // This is a standard Linux / glibc header file
      // includes unistd_64.h or unistd_32.h depending on current mode
      // Contains only #define constants (no C prototypes) so we can include it from asm without syntax errors.

.p2align 4
.globl print_integer            #void print_uint64(uint64_t value)
print_uint64:
    lea   -1(%rsp), %rsi        # We use the 128B red-zone as a buffer to hold the string
                                # a 64-bit integer is at most 20 digits long in base 10, so it fits.

    movb  $'\n', (%rsi)         # store the trailing newline byte.  (Right below the return address).
    # If you need a null-terminated string, leave an extra byte of room and store '\n\0'.  Or  push $'\n'

    mov    $10, %ecx            # same as  mov $10, %rcx  but 2 bytes shorter
    # note that newline (\n) has ASCII code 10, so we could actually have stored the newline with  movb %cl, (%rsi) to save code size.

    mov    %rdi, %rax           # function arg arrives in RDI; we need it in RAX for div
.Ltoascii_digit:                # do{
    xor    %edx, %edx
    div    %rcx                  #  rax = rdx:rax / 10.  rdx = remainder

                                 # store digits in MSD-first printing order, working backwards from the end of the string
    add    $'0', %edx            # integer to ASCII.  %dl would work, too, since we know this is 0-9
    dec    %rsi
    mov    %dl, (%rsi)           # *--p = (value%10) + '0';

    test   %rax, %rax
    jnz  .Ltoascii_digit        # } while(value != 0)
    # If we used a loop-counter to print a fixed number of digits, we would get leading zeros
    # The do{}while() loop structure means the loop runs at least once, so we get "0\n" for input=0

    # Then print the whole string with one system call
    mov   $__NR_write, %eax     # call number from asm/unistd_64.h
    mov   $1, %edi              # fd=1
    # %rsi = start of the buffer
    mov   %rsp, %rdx
    sub   %rsi, %rdx            # length = one_past_end - start
    syscall                     # write(fd=1 /*rdi*/, buf /*rsi*/, length /*rdx*/); 64-bit ABI
    # rax = return value (or -errno)
    # rcx and r11 = garbage (destroyed by syscall/sysret)
    # all other registers = unmodified (saved/restored by the kernel)

    # we don't need to restore any registers, and we didn't modify RSP.
    ret

To test this function, I put this in the same file to call it and exit:

.p2align 4
.globl _start
_start:
    mov    $10120123425329922, %rdi
#    mov    $0, %edi    # Yes, it does work with input = 0
    call   print_uint64

    xor    %edi, %edi
    mov    $__NR_exit, %eax
    syscall                             # sys_exit(0)

I built this into a static binary (with no libc):

$ gcc -Wall -static -nostdlib print-integer.S && ./a.out 
10120123425329922
$ strace ./a.out  > /dev/null
execve("./a.out", ["./a.out"], 0x7fffcb097340 /* 51 vars */) = 0
write(1, "10120123425329922\n", 18)     = 18
exit(0)                                 = ?
+++ exited with 0 +++
$ file ./a.out 
./a.out: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), statically linked, BuildID[sha1]=69b865d1e535d5b174004ce08736e78fade37d84, not stripped

Footnote 1: See Why does GCC use multiplication by a strange number in implementing integer division? for avoiding div r64 for division by 10, because that's very slow (21 to 83 cycles on Intel Skylake). A multiplicative inverse would make this function actually efficient, not just "somewhat". (But of course there'd still be room for optimizations...)



Related: Linux x86-32 extended-precision loop that prints 9 decimal digits from each 32-bit "limb": see .toascii_digit: in my Extreme Fibonacci code-golf answer. It's optimized for code-size (even at the expense of speed), but well-commented.

It uses div like you do, because that's smaller than using a fast multiplicative inverse). It uses loop for the outer loop (over multiple integer for extended precision), again for code-size at the cost of speed.

It uses the 32-bit int 0x80 ABI, and prints into a buffer that was holding the "old" Fibonacci value, not the current.


Another way to get efficient asm is from a C compiler. For just the loop over digits, look at what gcc or clang produce for this C source (which is basically what the asm is doing). The Godbolt Compiler explorer makes it easy to try with different options and different compiler versions.

See gcc7.2 -O3 asm output which is nearly a drop-in replacement for the loop in print_uint64 (because I chose the args to go in the same registers):

void itoa_end(unsigned long val, char *p_end) {
  const unsigned base = 10;
  do {
    *--p_end = (val % base) + '0';
    val /= base;
  } while(val);

  // write(1, p_end, orig-current);
}

I tested performance on a Skylake i7-6700k by commenting out the syscall instruction and putting a repeat loop around the function call. The version with mul %rcx / shr $3, %rdx is about 5 times faster than the version with div %rcx for storing a long number-string (10120123425329922) into a buffer. The div version ran at 0.25 instructions per clock, while the mul version ran at 2.65 instructions per clock (although requiring many more instructions).

It might be worth unrolling by 2, and doing a divide by 100 and splitting up the remainder of that into 2 digits. That would give a lot better instruction-level parallelism, in case the simpler version bottlenecks on mul + shr latency. The chain of multiply/shift operations that brings val to zero would be half as long, with more work in each short independent dependency chain to handle a 0-99 remainder.


Related:

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
3

Several things:

0) I guess this is 64b linux environment, but you should have stated so (if it is not, some of my points will be invalid)

1) int 0x80 is 32b call, but you are using 64b registers, so you should use syscall (and different arguments)

2) int 0x80, eax=4 requires the ecx to contain address of memory, where the content is stored, while you give it the ASCII character in ecx = illegal memory access (the first call should return error, i.e. eax is negative value). Or using strace <your binary> should reveal the wrong arguments + error returned.

3) why addq $4, %rsp? Makes no sense to me, you are damaging rsp, so the next pop rcx will pop wrong value, and in the end you will run way "up" into the stack.

... maybe some more, I didn't debug it, this list is just by reading the source (so I may be even wrong about something, although that would be rare).

BTW your code is working. It just doesn't do what you expected. But work fine, precisely as the CPU is designed and precisely what you wrote in the code. Whether that does achieve what you wanted, or makes sense, that's different topic, but don't blame the HW or assembler.

... I can do a quick guess how the routine may be fixed (just partial hack-fix, still needs rewrite for syscall under 64b linux):

  next:
    cmpq $0, %rsi
    jz   bye
    movq %rsp,%rcx    ; make ecx to point to stack memory (with stored char)
      ; this will work if you are lucky enough that rsp fits into 32b
      ; if it is beyond 4GiB logical address, then you have bad luck (syscall needed)
    decq %rsi
    movq $4, %rax
    movq $1, %rbx
    movq $1, %rdx
    int  $0x80
    addq $8, %rsp     ; now rsp += 8; is needed, because there's no POP
    jmp  next

Again didn't try myself, just writing it from head, so let me know how it changed situation.

Ped7g
  • 16,236
  • 3
  • 26
  • 63
  • Thanks for your valuable time.yes i am getting the factorial result in my rax register but the problem is that it is an integer and we can only print string so i am pushing each digit in stack and finally trying to print it..but this is not working.Now trying to incorporate your code changes.. – am10 Aug 23 '17 at 09:41
  • 1
    @AyushMishra sure, I see you convert the single digit of remainder into ASCII by `addq $48, %rdx` ... BTW, isn't gcc/as capable to compile `addq $'0', %rdx`? I prefer this style to make it "obvious" that it is the quick single-digit-to-ASCII thing. But I think your stack ahead of `next:` contains all the correct ASCII digits, only print-out is not working? You can check in gdb also stack memory, to see yourself. You still should verify that `rsp` fits into 32b, so run it in gdb. – Ped7g Aug 23 '17 at 09:45
  • Nothing working any idea how to convert int to string @Ped7g – am10 Aug 23 '17 at 09:53
  • 1
    it can't be "nothing working", it must be doing SOMETHING. Explain what it is doing, as I said, from a *look* at the source it looks like your conversion of integer should produce all correct digits on the stack, and `rsi` should contain amount of stored ASCII digits in stack memory. Can you verify at least that, to be sure that only printing is wrong? Then you can instead of stack store those digits into some buffer. But again you may hit 32b vs 64b address problem with `int 0x80`. – Ped7g Aug 23 '17 at 10:00
  • next () at Factorial.s:33 33 cmpq $0, %rsi (gdb) info registers rax rcx rdx rsi rbx (rax 0x0 0 ) (rcx 0x1 1) (rdx 0x31 49) ( rsi 0x3 3) ( rbx 0xa 10) value in each register when it first time hit the next label and i think it's correct – am10 Aug 23 '17 at 10:13
  • in next : as you suggested movq %rsp,%rcx changes rcx to (rcx 0x7fffffffddb8 140737488346552 ) which is garbage i think – am10 Aug 23 '17 at 10:18
  • 1
    That's not garbage, that's address of stack memory, if you will print content in that memory, there should be three digits stored there (5! = 120, so at top of stack value `0x31 == '1'` is stored as eight bytes at address `0x7fffffffddb8: 31 00 00 00 00 00 00 00` .. continuing with the other two digits... `32 00 00 00 00 00 00 00 30 00 00 00 00 00 00 00` (that's what your `push rdx` inside `loop:` produced).... the problem is, that that address is not 32b value, so `int 0x80` will probably fail to reach there for that `0x31`. – Ped7g Aug 23 '17 at 11:03
  • Thanks for the insight..Can you suggest how to print the content then ? – am10 Aug 23 '17 at 11:16
  • 2
    Use the correct API (`syscall` for 64b mode). Or rewrite the code to use only 32b registers, and compile it as 32b ELF binary (`-m32` I think). Google search first hit was this http://blog.rchapman.org/posts/Linux_System_Call_Table_for_x86_64/ (looks usable to me) – Ped7g Aug 23 '17 at 11:18
  • 1
    @Ped7g: I added an answer with a nice and clean function that prints the whole digit string with one `write()` call, as an attempt at a canonical answer for printing integers questions. Fixing the OP's code with small tweaks may be possible, but it's ugly. (and BTW, Linux always puts `%rsp` above 4G for regular 64-bit binaries, leaving the whole below-4G for static code/data. The x32 ABI uses 32-bit pointers in long mode, so building with `-mx32` will get Linux to start your binary with `%rsp` in the low 4G. A few syscalls have x32-specific versions.) – Peter Cordes Aug 24 '17 at 01:21