Since I have a simulator handy...
.thumb
.globl _start
_start:
.word 0x20001000
.word reset
.word hang
.word hang
.word hang
.word hang
.word hang
.word hang
.thumb_func
reset:
mov r0,#5
bl test
b hang
.thumb_func
hang:
swi 0xFF
b hang
test:
cmp r0,#0
bne test1
bx lr
test1:
sub r0,#1
push {r3,lr}
bl test
pop {r3,pc}
Build:
08000000 <_start>:
8000000: 20001000 andcs r1, r0, r0
8000004: 08000021 stmdaeq r0, {r0, r5}
8000008: 08000029 stmdaeq r0, {r0, r3, r5}
800000c: 08000029 stmdaeq r0, {r0, r3, r5}
8000010: 08000029 stmdaeq r0, {r0, r3, r5}
8000014: 08000029 stmdaeq r0, {r0, r3, r5}
8000018: 08000029 stmdaeq r0, {r0, r3, r5}
800001c: 08000029 stmdaeq r0, {r0, r3, r5}
08000020 <reset>:
8000020: 2005 movs r0, #5
8000022: f000 f803 bl 800002c <test>
8000026: e7ff b.n 8000028 <hang>
08000028 <hang>:
8000028: dfff svc 255 ; 0xff
800002a: e7fd b.n 8000028 <hang>
0800002c <test>:
800002c: 2800 cmp r0, #0
800002e: d100 bne.n 8000032 <test1>
8000030: 4770 bx lr
08000032 <test1>:
8000032: 3801 subs r0, #1
8000034: b508 push {r3, lr}
8000036: f7ff fff9 bl 800002c <test>
800003a: bd08 pop {r3, pc}
And run it showing disassembly in execution order and memory accesses:
--- 0x08000020: 0x2005 movs r0,#0x05
--- 0x08000022: 0xF000
--- 0x08000024: 0xF803 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000032: 0x3801 subs r0,#0x01
--- 0x08000034: 0xB508 push {r3,lr}
write16(0x20000FF8,0x0000)
write16(0x20000FFA,0x0000)
write16(0x20000FFC,0x0027)
write16(0x20000FFE,0x0800)
--- 0x08000036: 0xF7FF
--- 0x08000038: 0xFFF9 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000032: 0x3801 subs r0,#0x01
--- 0x08000034: 0xB508 push {r3,lr}
write16(0x20000FF0,0x0000)
write16(0x20000FF2,0x0000)
write16(0x20000FF4,0x003B)
write16(0x20000FF6,0x0800)
--- 0x08000036: 0xF7FF
--- 0x08000038: 0xFFF9 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000032: 0x3801 subs r0,#0x01
--- 0x08000034: 0xB508 push {r3,lr}
write16(0x20000FE8,0x0000)
write16(0x20000FEA,0x0000)
write16(0x20000FEC,0x003B)
write16(0x20000FEE,0x0800)
--- 0x08000036: 0xF7FF
--- 0x08000038: 0xFFF9 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000032: 0x3801 subs r0,#0x01
--- 0x08000034: 0xB508 push {r3,lr}
write16(0x20000FE0,0x0000)
write16(0x20000FE2,0x0000)
write16(0x20000FE4,0x003B)
write16(0x20000FE6,0x0800)
--- 0x08000036: 0xF7FF
--- 0x08000038: 0xFFF9 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000032: 0x3801 subs r0,#0x01
--- 0x08000034: 0xB508 push {r3,lr}
write16(0x20000FD8,0x0000)
write16(0x20000FDA,0x0000)
write16(0x20000FDC,0x003B)
write16(0x20000FDE,0x0800)
--- 0x08000036: 0xF7FF
--- 0x08000038: 0xFFF9 bl 0x0800002B
--- 0x0800002C: 0x2800 cmp r0,#0x00
--- 0x0800002E: 0xD100 bne 0x08000031
--- 0x08000030: 0x4770 bx r14
--- 0x0800003A: 0xBD08 pop {r3,pc}
read16(0x20000FD8)=0x0000
read16(0x20000FDA)=0x0000
read16(0x20000FDC)=0x003B
read16(0x20000FDE)=0x0800
--- 0x0800003A: 0xBD08 pop {r3,pc}
read16(0x20000FE0)=0x0000
read16(0x20000FE2)=0x0000
read16(0x20000FE4)=0x003B
read16(0x20000FE6)=0x0800
--- 0x0800003A: 0xBD08 pop {r3,pc}
read16(0x20000FE8)=0x0000
read16(0x20000FEA)=0x0000
read16(0x20000FEC)=0x003B
read16(0x20000FEE)=0x0800
--- 0x0800003A: 0xBD08 pop {r3,pc}
read16(0x20000FF0)=0x0000
read16(0x20000FF2)=0x0000
read16(0x20000FF4)=0x003B
read16(0x20000FF6)=0x0800
--- 0x0800003A: 0xBD08 pop {r3,pc}
read16(0x20000FF8)=0x0000
read16(0x20000FFA)=0x0000
read16(0x20000FFC)=0x0027
read16(0x20000FFE)=0x0800
--- 0x08000026: 0xE7FF B 0x08000027
--- 0x08000028: 0xDFFF swi 0xFF
I can maybe see your confusion since the return address for all but the last one is the same address and we could perhaps craft an example. But recursion often has more than the return address but has some other local variables that are changing, in this case our local variable is in r0 if you will so doesn't need to be saved to the stack on each call.
First time our return back to the top bl after reset:
write16(0x20000FFC,0x0027)
write16(0x20000FFE,0x0800)
The remaining times it is the same return address but we need N number of them on the stack for the code to work as written.
write16(0x20000FF4,0x003B)
write16(0x20000FF6,0x0800)
write16(0x20000FEC,0x003B)
write16(0x20000FEE,0x0800)
write16(0x20000FE4,0x003B)
write16(0x20000FE6,0x0800)
write16(0x20000FDC,0x003B)
write16(0x20000FDE,0x0800)
So as we unwind this we have those five addresses on the stack now.
read16(0x20000FDC)=0x003B
read16(0x20000FDE)=0x0800
...
read16(0x20000FFC)=0x0027
read16(0x20000FFE)=0x0800
In general bl modifies lr and places the return address on the stack (the above is thumb code not arm code but covers your questions as they work the same in this respect). so if you are nesting calls one() calls two(), two() calls three() for two() to get back to one() lr needs to be saved in two() so that it can be used, if you don't save lr then the call to three() changes lr and we can't get back.
If your recursion wants to use bl (looks like compiled code) for purity, and you want a way for that function, factorial in your example test in mine, to be able to return to the original caller then those two facts combine to having to push lr on the stack. If you want to bl to the top of the recursive function, the same entry point that the outside caller used then every call will be adding lr to the stack, and every return needs to pull it back off.
If you want to do some hand assembly to modify it and it doesn't call the same entry point you can get rid of the bl and the stack stuff.
test:
push {r3,lr}
test1:
cmp r0,#0
beq test2
sub r0,#1
b test1
test2:
pop {r3,pc}
can even leave the bl in there
test:
push {r3,lr}
test1:
cmp r0,#0
beq test2
sub r0,#1
bl test1
test2:
pop {r3,pc}
but if you want to return each time then breaking the loop has to be done differently. I don't have a solution off hand that uses bl and returns but is able to break out of the loop at the right time.