2

As I understand the documentation, 2.8.2, the process of launching core 1 is to send a sequence of values, with the final 3 being the vector table, stack pointer, and entry point, over the FIFO, while core 1 will echo the values back to you.

From the c code provided by the documentation, I wrote out this assembly:

    .cpu cortex-m0
    .thumb
@ ent: first-attempt entry code. Gives the running core a stack at
@ 0x20001000 and reads the current VTOR value into r3; r3 is later sent
@ to core 1 as the vector table word of the launch sequence.
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xe000ed08     @ address of VTOR
    ldr r3, [r0]            @vector table offset register (read only, never written here)
@ core: first-attempt launch handshake for core 1. The words 0, 0, 1,
@ vector table, stack pointer and entry point are written to the FIFO one
@ at a time; after each write the echo is read back and any mismatch
@ restarts the whole sequence at `core`.
@ Calls are hand-linked: `mov r7, pc` yields the address two instructions
@ ahead, i.e. the instruction after the following `b`. The fifo_* helpers
@ are not part of this listing.
core:
    mov r7, pc
    b fifo_drain
    sev
    mov r1, #0              @ word 1 of the sequence: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0              @ echo must match, otherwise start over
    bne core

    mov r7, pc
    b fifo_drain
    sev
    mov r1, #0              @ word 2: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r1, #1              @ word 3: 1
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #1
    bne core

    mov r1, r3              @vector table
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    @ NOTE(review): this compare is only meaningful if the helpers leave r3
    @ untouched - confirm against fifo_writ / fifo_read.
    cmp r1, r3
    bne core

    mov r1, sp              @stack pointer
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, sp
    bne core

    @ NOTE(review): pc reads as this instruction + 4, so after the add r1
    @ looks like the address of the `b fifo_writ` two lines down, with
    @ bit 0 (the thumb bit) clear, rather than the `ldr` that starts the
    @ intended entry code below - confirm.
    mov r1, pc
    add r1, #2              @entry point
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read

    @ intended core 1 entry: read the CPUID register and go to `led` when it is 1
    ldr r0, =0xd0000000
    ldr r1, [r0]
    cmp r1, #1
    beq led

The sequence of values sent over the FIFO is {0, 0, 1, vt, sp, ent}, and when the value isn't echoed back, the sequence starts over. The entry point is simply the last 4 lines, where the core reads the CPUID register from the SIO, and turns on the LED (GPIO25) if the cpu id is 1.

The sequence seems to get stuck in a loop at the vector table, which makes sense since I barely understand it, the FIFO just doesn't echo it back. Also, the documentation has a note next to the entry point that says "don't forget the thumb bit!", whatever that means.

Edit:

Updated code, same problem:

    .cpu cortex-m0
    .thumb
@ ent: entry code. Sets the stack pointer, writes 0x20000000 to VTOR, then
@ reads the SIO CPUID register: a value of 1 goes to `led`, anything else
@ continues to the launch handshake in `core`.
@ NOTE(review): the entry point later handed to core 1 is 0x20000001, so this
@ presumably sits at the start of an image loaded at 0x20000000 - confirm.
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xe000ed08     @ VTOR
    ldr r1, =0x20000000
    str r1, [r0]            @init vtor

    ldr r0, =0xd0000000     @ SIO CPUID
    ldr r1, [r0]
    cmp r1, #1
    beq led

    b core
    
@ core: launch handshake for core 1. Sends 0, 0, 1, vector table
@ (0x20000000), stack pointer and entry point (0x20000001) through
@ fifo_writ, reads each echo with fifo_read and restarts at `core` on any
@ mismatch. Ends by branching to `loop`.
@ Calls are hand-linked: `mov r7, pc` yields the address of the instruction
@ after the following `b`; the helpers return with `mov pc, r7`.
.thumb_func
core:
    mov r7, pc
    b fifo_drain
    mov r1, #0              @ word 1: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0              @ echo must match, otherwise start over
    bne core

    mov r7, pc
    b fifo_drain
    mov r1, #0              @ word 2: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r1, #1              @ word 3: 1
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #1
    bne core

    @ NOTE(review): fifo_writ and fifo_read both load FIFO_ST into r3 and
    @ mask it, so r3 no longer holds 0x20000000 when the cmp below runs.
    ldr r3, =0x20000000
    mov r1, r3              @vector table
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r3
    bne core

    mov r1, sp              @stack pointer (this core's own current sp)
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, sp
    bne core

    @ NOTE(review): same r3 clobber as for the vector table word above.
    ldr r3, =0x20000001
    mov r1, r3              @entry point
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r3
    bne core

    b loop
    
@ fifo_stat: return the low four bits of FIFO_ST.
@ In:  r7 = return address
@ Out: r1 = FIFO_ST & 15
@ Clobbers r0, r2. Not called anywhere in this listing.
.thumb_func
fifo_stat:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r1, [r0]
    mov r2, #15
    and r1, r1, r2
    mov pc, r7              @ return through the hand-made link register

@ fifo_writ: write one word to the inter-core FIFO.
@ In:  r1 = word to send, r7 = return address
@ Spins until bit 1 of FIFO_ST is set (ready to accept a word), stores r1
@ to FIFO_WR and issues sev.
@ Clobbers r0, r2, r3.
.thumb_func
fifo_writ:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #2
    and r3, r3, r2          @ result of the mask drives the beq
    beq fifo_writ           @ not ready yet: poll again

    ldr r0, =0xd0000054     @ SIO FIFO_WR
    str r1, [r0]
    sev
    mov pc, r7

@ fifo_read: read one word from the inter-core FIFO.
@ In:  r7 = return address
@ Out: r1 = word read from FIFO_RD
@ While bit 0 of FIFO_ST is clear (nothing to read) control goes to _wfe,
@ which waits for an event and branches back here.
@ Clobbers r0, r2, r3.
.thumb_func
fifo_read:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #1
    and r3, r3, r2
    beq _wfe                @ empty: sleep, then retry

    ldr r0, =0xd0000058     @ SIO FIFO_RD
    ldr r1, [r0]
    mov pc, r7

@ fifo_drain: empty the read side of the inter-core FIFO.
@ In:  r7 = return address
@ Reads FIFO_RD once unconditionally and repeats while bit 0 of FIFO_ST
@ is still set, then issues sev.
@ Clobbers r0, r1, r2 (r1 is 0 on return).
.thumb_func
fifo_drain:
    ldr r0, =0xd0000058     @ SIO FIFO_RD
    ldr r1, [r0]            @ value discarded
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r1, [r0]
    mov r2, #1
    and r1, r1, r2
    bne fifo_drain          @ still data waiting: pop again
    sev
    mov pc, r7
    
@ _wfe: wait path of fifo_read. Sleeps until an event, then re-enters
@ fifo_read with a plain branch so r7 still holds the caller's return address.
.thumb_func
_wfe:
    wfe
    b fifo_read

@ led: turn on the LED on GPIO25. Reached from `ent` when CPUID reads 1.
@ Releases the IO_BANK0 reset, selects the SIO function for GPIO25, enables
@ the output and sets it high, then falls through into `loop`.
@ Clobbers r0, r1.
.thumb_func
led:
    movs r1, #32            @io_bank
    ldr r0, =0x4000f000     @ RESETS_RESET, clear alias
    str r1, [r0]            @release reset on io_bank

    movs r1, #5             @sio
    ldr r0, =0x400140cc     @ IO_BANK0 GPIO25_CTRL
    str r1, [r0]            @assign sio to gpio25_ctrl

    movs r1, #1
    lsl r1, r1, #25         @ r1 = 1 << 25, the GPIO25 bit

    ldr r0, =0xd0000024     @ SIO GPIO_OE set alias
    str r1, [r0]            @enable output

    ldr r0, =0xd0000014     @ SIO GPIO_OUT set alias
    str r1, [r0]            @turn on the led
    
@ loop: park the core forever. Reached by `b loop` at the end of `core`
@ and by falling through from `led`.
.thumb_func
loop:
    nop
    b loop
  • The 'thumb bit' means that the LSB of a call address should be 1 to enter thumb mode (I'm not saying that very well.) So, the vector addresses and any BL destinations should have the LSB set. Some assemblers or compilers do that for us, I think. How it affects your code, I'm really not sure. Get the official M0+ docs from ARM and it'll be explained in there. – aMike Sep 03 '22 at 23:11
  • @aMike I get what you're saying, thumb mode instructions are at an offset of 1. I think the only time you need to manually offset the pc is when you switch between thumb and arm mode. – Will Thomas Sep 04 '22 at 08:47
  • No any time you use a thumb interwork branch which originally was only bx but later pop and others (armv6-m supports many) you have to have the lsbit correct. ARM wanting it in the cortex-m vector table is a strange request but perhaps that was forward thinking to a vector table for some processor that might support both modes. We can probably find it in their bootstrap code for core one, but almost guaranteed they are basically doing a bx to the address we specify so we have to have the lsbit set. – old_timer Sep 04 '22 at 14:27
  • you have to have the bit set correctly for all branches that use addresses and that can change modes, not just when changing modes. which instructions you have to get this right for depends on the architecture (armv4t, armv5, armv6, armv6-m armv7, armv7-m ) – old_timer Sep 04 '22 at 14:29
  • out of curiosity why are you avoiding bl? and doing this pc r7 thing? – old_timer Sep 05 '22 at 15:21
  • @old_timer Cause I’m dumb as shit, only been doing arm for like two weeks. I’ll fix it once I’m back at my laptop – Will Thomas Sep 05 '22 at 15:48
  • just curious, that was all, no worries – old_timer Sep 05 '22 at 16:03
  • I updated my answer, starting with From your rewrite... – old_timer Sep 05 '22 at 17:03

1 Answers1

0

My core zero code is a mixture of C and assembly language. I think we can sort your questions out though.

My bootstrap looks like this

.cpu cortex-m0
.thumb

    ;@ Common entry: read the CPUID register; a non-zero value goes to
    ;@ core_one, zero continues below as core zero.
    ldr r1,=0xD0000000 ;@SIO_CPUID
    ldr r0,[r1]
    cmp r0,#0
    bne core_one

    ;@ core_zero
    ;@ set a stack at 0x20002000, run the C launch code, then spin forever
    ldr r0,=0x20002000
    mov sp,r0
    bl zero_entry
    b .

;@ core_one: path taken when CPUID is non-zero. No stack pointer is loaded
;@ here; the launch sequence sent by core zero includes a stack pointer word.
core_one:
    ;@ core_one
    bl notmain
    b .                ;@ spin if notmain ever returns

.align
.ltorg


;@ ----------------------------------
.balign 0x100

;@ void PUT32 ( unsigned int address, unsigned int value )
;@ Store the 32-bit value in r1 to the address in r0.
.thumb_func
.globl PUT32
PUT32:
    str r1,[r0]
    bx lr

;@ unsigned int GET32 ( unsigned int address )
;@ Load the 32-bit word at the address in r0 and return it in r0.
.thumb_func
.globl GET32
GET32:
    ldr r0,[r0]
    bx lr

;@ void SEV ( void )
;@ Execute a single sev instruction so C code can issue it.
.globl SEV
.thumb_func
SEV:
    sev
    bx lr

;@ void WFE ( void )
;@ Execute a single wfe instruction so C code can issue it.
.globl WFE
.thumb_func
WFE:
    wfe
    bx lr

;@ void DELAY ( unsigned int count )
;@ Busy-wait: decrement r0 until it reaches zero, then return.
;@ The bne relies on the sub updating the flags. A count of 0 wraps around
;@ and runs the full 32-bit range before returning.
.globl DELAY
.thumb_func
DELAY:
    sub r0,#1
    bne DELAY
    bx lr

And I link for 0x20000000 and build my uf2 file for sram/0x20000000 as the destination for the binary. It depends on circumstances, but you need to know where your code is running.

My core zero code looks like this

extern void PUT32 ( unsigned int, unsigned int );
extern unsigned int GET32 ( unsigned int );

extern void SEV ( void );
extern void WFE ( void );

#define SIO_BASE                    0xD0000000

#define SIO_FIFO_ST                 (SIO_BASE+0x50)
#define SIO_FIFO_WR                 (SIO_BASE+0x54)
#define SIO_FIFO_RD                 (SIO_BASE+0x58)

//Throw away every word waiting on the read side of the FIFO, then
//issue SEV.
static void fifo_flush ( void )
{
    //bit 0 of FIFO_ST stays set for as long as there is data to read
    while((GET32(SIO_FIFO_ST)&0x1) != 0)
    {
        GET32(SIO_FIFO_RD);
    }
    SEV();
}

//Write cmd to the FIFO and return the next word that arrives in reply.
//Polls until the write side is ready, writes, issues SEV, then sleeps
//with WFE until the read side has a word.
static unsigned int fifo_send ( unsigned int cmd )
{
    //bit 1 of FIFO_ST: set once a word can be written
    while((GET32(SIO_FIFO_ST)&0x2) == 0)
    {
    }
    PUT32(SIO_FIFO_WR,cmd);
    SEV();
    //bit 0 of FIFO_ST: clear while there is nothing to read
    while((GET32(SIO_FIFO_ST)&0x1) == 0)
    {
        WFE();
    }
    return(GET32(SIO_FIFO_RD));
}

//Core zero side of the core one launch handshake.
//Sends 0, 0, 1, vector table, stack pointer, entry point. Every word
//must come back unchanged; on any mismatch the whole sequence starts
//again. The FIFO is flushed before each of the two zero words.
//Always returns 0.
unsigned int zero_entry ( void )
{
    const unsigned int vector_table = 0x20000000;
    const unsigned int stack_top    = 0x20003000;
    const unsigned int entry        = 0x20000001; //lsbit set: thumb entry

    for(;;)
    {
        fifo_flush();
        if(fifo_send(0)!=0) continue;
        fifo_flush();
        if(fifo_send(0)!=0) continue;
        if(fifo_send(1)!=1) continue;
        if(fifo_send(vector_table)!=vector_table) continue;
        if(fifo_send(stack_top)!=stack_top) continue;
        if(fifo_send(entry)!=entry) continue;
        return(0);
    }
}

And if interested my core one code looks like this

void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
void DELAY ( unsigned int );

#define RESETS_BASE                 0x4000C000

#define RESETS_RESET_RW             (RESETS_BASE+0x0+0x0000)
#define RESETS_RESET_XOR            (RESETS_BASE+0x0+0x1000)
#define RESETS_RESET_SET            (RESETS_BASE+0x0+0x2000)
#define RESETS_RESET_CLR            (RESETS_BASE+0x0+0x3000)

#define RESETS_WDSEL_RW             (RESETS_BASE+0x4+0x0000)
#define RESETS_WDSEL_XOR            (RESETS_BASE+0x4+0x1000)
#define RESETS_WDSEL_SET            (RESETS_BASE+0x4+0x2000)
#define RESETS_WDSEL_CLR            (RESETS_BASE+0x4+0x3000)

#define RESETS_RESET_DONE_RW        (RESETS_BASE+0x8+0x0000)
#define RESETS_RESET_DONE_XOR       (RESETS_BASE+0x8+0x1000)
#define RESETS_RESET_DONE_SET       (RESETS_BASE+0x8+0x2000)
#define RESETS_RESET_DONE_CLR       (RESETS_BASE+0x8+0x3000)

#define SIO_BASE                    0xD0000000

#define SIO_GPIO_OUT_RW             (SIO_BASE+0x10)
#define SIO_GPIO_OUT_SET            (SIO_BASE+0x14)
#define SIO_GPIO_OUT_CLR            (SIO_BASE+0x18)
#define SIO_GPIO_OUT_XOR            (SIO_BASE+0x1C)

#define SIO_GPIO_OE_RW              (SIO_BASE+0x20)
#define SIO_GPIO_OE_SET             (SIO_BASE+0x24)
#define SIO_GPIO_OE_CLR             (SIO_BASE+0x28)
#define SIO_GPIO_OE_XOR             (SIO_BASE+0x2C)

#define IO_BANK0_BASE               0x40014000

#define IO_BANK0_GPIO25_STATUS_RW   (IO_BANK0_BASE+0x0C8+0x0000)
#define IO_BANK0_GPIO25_STATUS_XOR  (IO_BANK0_BASE+0x0C8+0x1000)
#define IO_BANK0_GPIO25_STATUS_SET  (IO_BANK0_BASE+0x0C8+0x2000)
#define IO_BANK0_GPIO25_STATUS_CLR  (IO_BANK0_BASE+0x0C8+0x3000)

#define IO_BANK0_GPIO25_CTRL_RW     (IO_BANK0_BASE+0x0CC+0x0000)
#define IO_BANK0_GPIO25_CTRL_XOR    (IO_BANK0_BASE+0x0CC+0x1000)
#define IO_BANK0_GPIO25_CTRL_SET    (IO_BANK0_BASE+0x0CC+0x2000)
#define IO_BANK0_GPIO25_CTRL_CLR    (IO_BANK0_BASE+0x0CC+0x3000)

//Core one payload: bring IO_BANK0 out of reset, route GPIO 25 to SIO
//and blink it forever. The trailing return is never reached.
int notmain ( void )
{
    const unsigned int io_bank0_bit = 1<<5;  //IO_BANK0 in the RESETS registers
    const unsigned int led_pin = 1<<25;      //GPIO 25

    //release the IO_BANK0 reset and wait until it reports done
    PUT32(RESETS_RESET_CLR,io_bank0_bit);
    while((GET32(RESETS_RESET_DONE_RW)&io_bank0_bit)==0)
    {
    }

    //known starting state: output disabled, pin low
    PUT32(SIO_GPIO_OE_CLR,led_pin);
    PUT32(SIO_GPIO_OUT_CLR,led_pin);

    //function select 5 = SIO (software controlled I/O)
    PUT32(IO_BANK0_GPIO25_CTRL_RW,5);

    //drive the pin and toggle it with a software delay in between
    PUT32(SIO_GPIO_OE_SET,led_pin);
    for(;;)
    {
        PUT32(SIO_GPIO_OUT_SET,led_pin);
        DELAY(0x100000);
        PUT32(SIO_GPIO_OUT_CLR,led_pin);
        DELAY(0x100000);
    }
    return(0);
}

What does the thumb bit mean? Look at the bx instruction, or other related information, in the ARM documentation (the ARMv6-M Architecture Reference Manual). This goes back to the full sized cores that can run both ARM and thumb code. Since instructions in both modes are aligned, they chose to use the lsbit of the address in branch-by-address instructions to determine the mode to use at the branch destination (originally only the bx instruction, but later pop and others). If the lsbit is set then it is branching to a thumb instruction; if it is clear then it is branching to an ARM instruction.

The cortex-ms they chose to go with a vector table (makes sense based on the target market for the product) instead of hardcoded addresses like the prior full sized cores (ARM7, ARM9, ARM10, ARM11). As documented in the architectural reference manual the first word is a value to put in the stack pointer to save that step in the boot process and the second is the reset vector.

Now ARM chose to make it such that you had to put a thumb function pointer address in there meaning the lsbit is ORRed with one. I emphasize ORRed with one and not ADD one, because if you use your tools properly (IMO) then the tool will set the lsbit and ADDing one you will then break it.

Letting the tools do the work

.cpu cortex-m0
.thumb

@ _start: a 16-word vector table used only to illustrate the thumb bit.
@ Word 0 is the initial stack pointer value, word 1 the reset vector;
@ every other entry points at `hang`. Because `reset` and `hang` are
@ declared with .thumb_func, the assembler emits their addresses with
@ the lsbit set.
.thumb_func
.global _start
_start:
.word 0x20001000
.word reset
.word hang
.word hang

.word hang
.word hang
.word hang
.word hang

.word hang
.word hang
.word hang
.word hang

.word hang
.word hang
.word hang
.word hang

@ reset: target of the reset vector. Calls notmain and drops into `hang`
@ if it ever returns.
.thumb_func
reset:
    bl notmain
    b hang
@ hang: branch-to-self, used as the handler for every vector except reset.
.thumb_func
hang:   b .

(This does not work on a pico; it is only an illustration of what the thumb bit means.)

.thumb_func causes the next label it finds in the code to be a thumb function address not just a plain old address.

So this gives

00200000 <_start>:
  200000:   20001000    andcs   r1, r0, r0
  200004:   00200041    eoreq   r0, r0, r1, asr #32
  200008:   00200047    eoreq   r0, r0, r7, asr #32
  20000c:   00200047    eoreq   r0, r0, r7, asr #32
  200010:   00200047    eoreq   r0, r0, r7, asr #32
  200014:   00200047    eoreq   r0, r0, r7, asr #32
  200018:   00200047    eoreq   r0, r0, r7, asr #32
  20001c:   00200047    eoreq   r0, r0, r7, asr #32
  200020:   00200047    eoreq   r0, r0, r7, asr #32
  200024:   00200047    eoreq   r0, r0, r7, asr #32
  200028:   00200047    eoreq   r0, r0, r7, asr #32
  20002c:   00200047    eoreq   r0, r0, r7, asr #32
  200030:   00200047    eoreq   r0, r0, r7, asr #32
  200034:   00200047    eoreq   r0, r0, r7, asr #32
  200038:   00200047    eoreq   r0, r0, r7, asr #32
  20003c:   00200047    eoreq   r0, r0, r7, asr #32

00200040 <reset>:
  200040:   f000 f81a   bl  200078 <notmain>
  200044:   e7ff        b.n 200046 <hang>

00200046 <hang>:
  200046:   e7fe        b.n 200046 <hang>

Built and linked for a different mcu, not the pico. reset is at 0x00200040 and hang at 0x00200046. The tools have done the work for us: because we used .thumb_func, they put each address in the table ORRed with one.

And everything is happy and this mcu will boot, or at least it won't hang right after reset.

The longer way to do this, there is no .arm_func so for ARM and thumb you can instead do

.type reset,%function
reset:

It does not have to be immediately before the label, but you have to do the extra work to type in the label name.

If I take your code and change it like this:

    ldr r1, =one_entry
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read

@ one_entry: entry point handed to core 1. The .thumb_func marks the label
@ as a thumb function, so `ldr r1, =one_entry` loads its address with the
@ lsbit already set. Reads the CPUID register and goes to `led` when it is 1.
.thumb_func
one_entry:
    ldr r0, =0xd0000000     @ SIO CPUID
    ldr r1, [r0]
    cmp r1, #1
    beq led

Then I get

2000005a:   4907        ldr r1, [pc, #28]   ; (20000078 <one_entry+0x14>)
2000005c:   467f        mov r7, pc
2000005e:   e011        b.n 20000084 <fifo_writ>
20000060:   467f        mov r7, pc
20000062:   e00e        b.n 20000082 <fifo_read>

20000064 <one_entry>:
20000064:   4805        ldr r0, [pc, #20]   ; (2000007c <one_entry+0x18>)
20000066:   6801        ldr r1, [r0, #0]
20000068:   2901        cmp r1, #1
2000006a:   d00c        beq.n   20000086 <led>
2000006c:   e7fe        b.n 2000006c <one_entry+0x8>
2000006e:   46c0        nop         ; (mov r8, r8)
20000070:   20001000    andcs   r1, r0, r0
20000074:   e000ed08    and lr, r0, r8, lsl #26
20000078:   20000065    andcs   r0, r0, r5, rrx
2000007c:   d0000000    andle   r0, r0, r0

The tool has created the address to the entry point for core one with the lsbit set. 20000065

Now the next problem you have is

mov r1, sp              @stack pointer

You are taking core zero's stack pointer address at this point in core zero's execution and setting that for core one. If you end core zero in an infinite loop after starting core one, then this can work. But if you want to keep doing things with core zero, you need to give core one its own stack pointer. In my example you can see that I give core zero 0x20002000 and core one 0x20003000. This would have been very painful to debug, as core one would start but you would have random chaos that changes every time you change the code.

And to your VTOR problem. I also tried just reading the VTOR and it did not work. Originally my code had a special vector table:

;@ vector_table: makeshift table that is also entered as code. The first
;@ slot holds a branch to reset instead of a stack pointer value, and the
;@ .balign pads it so the reset vector word lands at offset 4.
.globl vector_table
vector_table:
    b reset
    .balign 4
    .word reset ;@ has to be offset 4
    .word loop
    .word loop
    .word loop
    .word loop

And I set the vector table, instead of read it

;@ point VTOR at vector_table (written, not read back)
ldr r1,=0xE000ED08 ;@ VTOR
ldr r0,=vector_table
str r0,[r1]

For core zero which is probably borrowed from other pico code I wrote that might have actually used the table. The b reset because we don't actually get to use the reset vector for core zero so this was my kludge. Could have done alignment stuff and put the vector table somewhere else in memory (and yes for both cores I set the stack pointer myself, initially, but for the above example assumed that core one was doing it itself).

And used that same address vector_table for core one. In this case I could have then read it and it would have worked. You have only provided a fraction so we do not know what you did with the VTOR for core zero before this code, but I assume you did not set it, since your code is not working.

You/we are not using a vector table in these examples so just need to make it happy, so I just forced 0x20000000 and it then worked.

I believe you need to fix all three addresses, the vector table, the entry point, and the stack pointer in order to have success.


From your rewrite, I made these modifications.

    .cpu cortex-m0
    .thumb
@ ent: common entry. Sets the stack pointer, writes 0x20000000 to VTOR and
@ reads the CPUID register: 1 goes to `led`, anything else runs the launch
@ handshake in `core`.
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xe000ed08     @ VTOR
    ldr r1, =0x20000000
    str r1, [r0]            @init vtor

    ldr r0, =0xd0000000     @ SIO CPUID
    ldr r1, [r0]
    cmp r1, #1
    beq led

    b core

@ core: launch handshake for core 1, still using the r7 hand-made link
@ (`mov r7, pc` = address of the instruction after the following `b`).
@ Sends 0, 0, 1, vector table, stack pointer, entry point and restarts on
@ any echo mismatch. The expected value is kept in r4, which none of the
@ fifo_* helpers touch (they use r0-r3).
.thumb_func
core:
    mov r7, pc
    b fifo_drain
    mov r1, #0              @ word 1: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r7, pc
    b fifo_drain
    mov r1, #0              @ word 2: 0
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #0
    bne core

    mov r1, #1              @ word 3: 1
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, #1
    bne core

    ldr r4, =0x20000000
    mov r1, r4              @vector table
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r4              @ r4 still holds the value that was sent
    bne core

    mov r4, sp              @stack pointer
    mov r1, r4
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r4
    bne core

    ldr r4, =0x20000001     @ image start with the lsbit (thumb bit) set
    mov r1, r4              @entry point
    mov r7, pc
    b fifo_writ
    mov r7, pc
    b fifo_read
    cmp r1, r4
    bne core

    b loop

@ fifo_stat: r1 = FIFO_ST & 15. Returns through r7; clobbers r0, r2.
@ Nothing in this listing calls it.
.thumb_func
fifo_stat:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r1, [r0]
    mov r2, #15
    and r1, r1, r2
    mov pc, r7

@ fifo_writ: send the word in r1. Polls FIFO_ST until bit 1 is set, stores
@ r1 to FIFO_WR, issues sev and returns through r7.
@ Clobbers r0, r2, r3 - callers must not keep live values in them.
.thumb_func
fifo_writ:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #2
    and r3, r3, r2
    beq fifo_writ           @ not ready: poll again

    ldr r0, =0xd0000054     @ SIO FIFO_WR
    str r1, [r0]
    sev
    mov pc, r7

@ fifo_read: receive one word into r1. While FIFO_ST bit 0 is clear it
@ detours through _wfe and retries. Returns through r7.
@ Clobbers r0, r2, r3 - callers must not keep live values in them.
.thumb_func
fifo_read:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #1
    and r3, r3, r2
    beq _wfe                @ nothing to read yet

    ldr r0, =0xd0000058     @ SIO FIFO_RD
    ldr r1, [r0]
    mov pc, r7

@ fifo_drain: pop FIFO_RD once, then keep popping while FIFO_ST bit 0 is
@ set; finish with sev and return through r7.
@ Clobbers r0, r1, r2.
.thumb_func
fifo_drain:
    ldr r0, =0xd0000058     @ SIO FIFO_RD
    ldr r1, [r0]            @ value discarded
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r1, [r0]
    mov r2, #1
    and r1, r1, r2
    bne fifo_drain
    sev
    mov pc, r7

@ _wfe: sleep until an event, then retry fifo_read. Uses a plain branch,
@ so the return address in r7 is left alone.
.thumb_func
_wfe:
    wfe
    b fifo_read

;@ ----------------------------------
.balign 0x100

@ led: switch on the LED on GPIO25; entered from `ent` when CPUID is 1.
@ Releases the IO_BANK0 reset, selects SIO for GPIO25, enables the output
@ and drives it high, then falls through into `loop`. Clobbers r0, r1.
.thumb_func
led:
    movs r1, #32            @io_bank
    ldr r0, =0x4000f000     @ RESETS_RESET, clear alias
    str r1, [r0]            @release reset on io_bank

    movs r1, #5             @sio
    ldr r0, =0x400140cc     @ IO_BANK0 GPIO25_CTRL
    str r1, [r0]            @assign sio to gpio25_ctrl

    movs r1, #1
    lsl r1, r1, #25         @ r1 = 1 << 25

    ldr r0, =0xd0000024     @ SIO GPIO_OE set alias
    str r1, [r0]            @enable output

    ldr r0, =0xd0000014     @ SIO GPIO_OUT set alias
    str r1, [r0]            @turn on the led

@ loop: endless idle loop; both the end of `core` and the fall-through
@ from `led` end up here.
.thumb_func
loop:
    nop
    b loop

First in a couple of places you used r3 to save the value you wanted to compare against after writing and reading back. But r3 is used both in the write and read so its contents are lost.

Second, the program was larger than 0x100 bytes. There is something strange about that boundary (I would have to work out again how I originally figured it out), but once the code avoided crossing the boundary, it worked.

As used above sp did not need to go to r4, but I did it to shotgun the problem.

If I remove the items that are not needed (the write to VTOR and the b core up front) and use bl and bx lr to call and return, this saves enough instructions to make the binary less than 0x100 bytes, so it can be used without putting that boundary in.

    .cpu cortex-m0
    .thumb
@ ent: common entry. Sets the stack pointer and reads the CPUID register:
@ 1 goes to `led`, anything else falls through into `core` directly below.
ent:
    ldr r0, =0x20001000
    mov sp, r0              @init stack pointer

    ldr r0, =0xd0000000     @ SIO CPUID
    ldr r1, [r0]
    cmp r1, #1
    beq led

@ core: core 0 side of the core 1 launch handshake.
@ Sends 0, 0, 1, vector table, stack pointer, entry point through the
@ inter-core FIFO. Core 1 echoes each word back; any mismatch restarts the
@ sequence from the top. The FIFO is drained before each of the two zeros.
@ r4 carries the expected echo across calls because the fifo_* helpers use
@ r0-r3 as scratch. Ends by parking core 0 in `loop`.
core:
    bl fifo_drain
    mov r1, #0              @ word 1: 0
    bl fifo_writ
    bl fifo_read
    cmp r1, #0
    bne core

    @ This has to be bl. With a plain b, fifo_drain's `bx lr` returns to the
    @ instruction after the previous `bl fifo_read` and the handshake never
    @ gets past this point.
    bl fifo_drain
    mov r1, #0              @ word 2: 0
    bl fifo_writ
    bl fifo_read
    cmp r1, #0
    bne core

    mov r1, #1              @ word 3: 1
    bl fifo_writ
    bl fifo_read
    cmp r1, #1
    bne core

    ldr r4, =0x20000000
    mov r1, r4              @vector table
    bl fifo_writ
    bl fifo_read
    cmp r1, r4
    bne core

    @ Core 1 is handed core 0's current stack pointer. That is only safe
    @ because core 0 parks in `loop` afterwards and never uses its stack again.
    mov r1, sp              @stack pointer
    bl fifo_writ
    bl fifo_read
    cmp r1, sp
    bne core

    ldr r4, =0x20000001     @ image start with the lsbit (thumb bit) set
    mov r1, r4              @entry point
    bl fifo_writ
    bl fifo_read
    cmp r1, r4
    bne core

    b loop

@ fifo_stat: r1 = FIFO_ST & 15. Clobbers r0, r2; returns with bx lr.
@ Nothing in this listing calls it.
fifo_stat:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r1, [r0]
    mov r2, #15
    and r1, r1, r2
    bx lr

@ fifo_writ: send the word in r1. Polls FIFO_ST until bit 1 is set, stores
@ r1 to FIFO_WR, issues sev and returns with bx lr.
@ Clobbers r0, r2, r3; r1 is preserved.
fifo_writ:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #2
    and r3, r3, r2
    beq fifo_writ           @ not ready: poll again

    ldr r0, =0xd0000054     @ SIO FIFO_WR
    str r1, [r0]
    sev
    bx lr

@ fifo_read: receive one word into r1 and return with bx lr.
@ While FIFO_ST bit 0 is clear, control goes to _wfe to wait for an event.
@ Clobbers r0, r2, r3.
fifo_read:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
    ldr r3, [r0]
    mov r2, #1
    and r3, r3, r2
    beq _wfe                @ nothing to read yet

    ldr r0, =0xd0000058     @ SIO FIFO_RD
    ldr r1, [r0]
    bx lr

@ fifo_drain: discard every word waiting on the read side of the
@ inter-core FIFO, then issue sev.
@ The status is checked before each pop, so an already-empty FIFO is never
@ read (same order as the C fifo_flush).
@ Out: r0 = 0xd0000050, r1 = 0, r2 = 1 (all scratch). Returns with bx lr.
fifo_drain:
    ldr r0, =0xd0000050     @ SIO FIFO_ST
1:
    ldr r1, [r0]
    mov r2, #1              @ bit 0: set while there is data to read
    and r1, r1, r2
    beq 2f                  @ empty: done
    ldr r1, [r0, #8]        @ FIFO_RD at 0xd0000058: pop and discard
    b 1b
2:
    sev
    bx lr

@ _wfe: wait path of fifo_read. Sleeps until an event and then retries.
@ Must re-enter fifo_read with a plain branch: a bl here would overwrite
@ the caller's return address in lr with the address of the code that
@ follows this routine, and fifo_read would "return" there instead.
_wfe:
    wfe
    b fifo_read

@ led: switch on the LED on GPIO25; entered from `ent` when CPUID is 1.
@ Releases the IO_BANK0 reset and waits for the block to report reset done
@ before touching its registers (as the C notmain does), selects SIO for
@ GPIO25, enables the output and drives it high. Falls through into `loop`.
@ Clobbers r0, r1, r2.
led:
    movs r1, #32            @io_bank (bit 5)
    ldr r0, =0x4000f000     @ RESETS_RESET, clear alias
    str r1, [r0]            @release reset on io_bank

    ldr r0, =0x4000c008     @ RESETS_RESET_DONE
1:
    ldr r2, [r0]
    and r2, r2, r1          @ isolate the io_bank done bit
    beq 1b                  @ not out of reset yet

    movs r1, #5             @sio
    ldr r0, =0x400140cc     @ IO_BANK0 GPIO25_CTRL
    str r1, [r0]            @assign sio to gpio25_ctrl

    movs r1, #1
    lsl r1, r1, #25         @ r1 = 1 << 25

    ldr r0, =0xd0000024     @ SIO GPIO_OE set alias
    str r1, [r0]            @enable output

    ldr r0, =0xd0000014     @ SIO GPIO_OUT set alias
    str r1, [r0]            @turn on the led

@ loop: endless idle loop; the end of `core` branches here and `led`
@ falls through into it.
loop:
    nop
    b loop

Note the instruction set allows for things like this:

@ fifo_drain, shorter form: load the FIFO_ST address once and reach FIFO_RD
@ with an immediate offset of 8 instead of loading a second address.
@ Behaves like the longer version: one unconditional pop, repeat while
@ FIFO_ST bit 0 is set, then sev. Clobbers r0, r1, r2.
fifo_drain:
    ldr r0, =0xd0000050
    ldr r1, [r0,#8] @0xd0000058
    ldr r1, [r0] @0xd0000050
    mov r2, #1
    and r1, r1, r2
    bne fifo_drain
    sev
    bx lr

not as brute force and simple to read, but saves instructions.

For someone just learning ARM assembly language, and I presume the rp2040 at the same time, I am quite impressed; keep up the excellent work. This particular mcu is very very cool, but also poorly documented. The ARM instruction set is well documented, but there is ARM vs thumb, and then unified syntax vs not (fortunately you did not hit the difference). And then there is this 0x100 byte thing, which I cannot remember how I figured out. I think I looked at their code and figured it out from that, but I would have to re-research the whole thing. If you want to confirm this for yourself, take a version that is just under 0x100 bytes and then add some nops in the body somewhere to stretch it past 0x100 bytes. Note that with the simple changes described, and removing unused/unneeded code, I got yours down to

216 bytes read (0xD8)

216 bytes...

bottom line.

You had the right idea on the three parameters but they needed some work. And then a simple oops on using a register outside a function call that got used within a function call. Then the crazy 0x100 byte thing. This is the thing with bare-metal, hard to debug, gotta grind your way through, do not give up.

The mov r7,pc thing, I am actually impressed about, not critical of - a lot of folks would struggle with the two instructions ahead thing.

halfer
  • 19,824
  • 17
  • 99
  • 186
old_timer
  • 69,149
  • 8
  • 89
  • 168
  • BTW Thank you, when I wrote my core one example, they wanted a vector table, but also a thumb function address entry point. My brain was telling me they wanted the reset vector address there. But in the end, duh, no they wanted an entry point and the vector table was for everything else (except stack pointer and reset address). Your question caused me to work on my example to resolve all of this...So...Thanks for the question. – old_timer Sep 04 '22 at 14:25
  • No problem, but still no luck. I've added an update to my post, which includes the new code(and all of it). – Will Thomas Sep 05 '22 at 13:05
  • Very close, but I think I got you through this, look at it and see if you agree. Works on my pico pi. Although during the process my usb hub with push button power switches died, had to debug that for a while. sigh – old_timer Sep 05 '22 at 17:02
  • I cannot tell you how helpful this was. Not only am I now able to use the full power of my chip, your explanations have cleared up a lot of confusions I had with ARM, and I feel more confident writing assembly moving forward. I can see where the 256-byte boundary is coming from, as the UF2 format prefers programs to be loaded in blocks of 256 bytes. You're a great teacher and I had a lot of fun working on this. Thank you, and sorry to hear about your usb hub. – Will Thomas Sep 05 '22 at 23:17
  • 1
    Hah, thanks, can buy another hub, just cost me some time....You are welcome, keep up with baremetal (does not have to be asm only), we need more folks with an interest and skills to keep hardware and compilers running...pay it forward... – old_timer Sep 05 '22 at 23:56
  • Lol, the usb cable from the computer to my first hub came unplugged taking out both hubs as they were daisy chained. they work fine....(now that i bought two more). – old_timer Sep 08 '22 at 21:32