1

In Linux 5.10-rc5, arch/arm64/kernel/entry.S contains a macro called kernel_entry. This should be a very important macro for interrupt processing. I tried to analyze it briefly, but I find it very difficult to understand exactly, so I want to ask a couple of questions here in the hope that an expert can shed some light on it. Below is the code; I have added annotations with my understanding and my questions, marked with a <== sign. The original code is at https://elixir.bootlin.com/linux/v5.10-rc5/source/arch/arm64/kernel/entry.S#L176 in case you want to look around.

    .macro  kernel_entry, el, regsize = 64
    .if \regsize == 32
    mov w0, w0              // zero upper 32 bits of x0
    .endif
    stp x0, x1, [sp, #16 * 0]
    stp x2, x3, [sp, #16 * 1]
    stp x4, x5, [sp, #16 * 2]
    stp x6, x7, [sp, #16 * 3]
    stp x8, x9, [sp, #16 * 4]
    stp x10, x11, [sp, #16 * 5]
    stp x12, x13, [sp, #16 * 6]
    stp x14, x15, [sp, #16 * 7]
    stp x16, x17, [sp, #16 * 8]
    stp x18, x19, [sp, #16 * 9]
    stp x20, x21, [sp, #16 * 10]
    stp x22, x23, [sp, #16 * 11]
    stp x24, x25, [sp, #16 * 12]
    stp x26, x27, [sp, #16 * 13]
    stp x28, x29, [sp, #16 * 14]   
  
  <== In the preceding kernel_ventry, sp was decremented by #S_FRAME_SIZE, which is 
        the size of struct pt_regs. pt_regs is the structure used to store the registers 
        and some other values on the stack when an interrupt happens.
        At this point, only 30 registers (x0-x29) have been stored on the stack.
     (see https://elixir.bootlin.com/linux/v5.10-rc5/source/arch/arm64/include/asm/ptrace.h#L173 for pt_regs)

    .if \el == 0                      <== if this exception was taken from lower EL(=el0) 
    clear_gp_regs
    mrs x21, sp_el0
    ldr_this_cpu    tsk, __entry_task, x20
    msr sp_el0, tsk

   <== Clear all general-purpose registers (they have already been stored), save sp_el0 in x21, 
    load the address of "this CPU"'s kernel task_struct into tsk (= x28), and set sp_el0 to the 
    address of that task_struct. (In kernel code, the address of the current task_struct is kept in 
    sp_el0 for faster access, because EL0 code is not running anyway.)

    /*
     * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions
     * when scheduling.
     */
    ldr x19, [tsk, #TSK_TI_FLAGS]
    disable_step_tsk x19, x20

    /* Check for asynchronous tag check faults in user space */
    check_mte_async_tcf x19, x22
    apply_ssbd 1, x22, x23

    ptrauth_keys_install_kernel tsk, x20, x22, x23

    <== let's skip the debug, MTE and ptrauth parts, because they look like side issues.

    scs_load tsk, x20    <== this loads the shadow call stack pointer into x18 from the new task_struct

    .else                      <== now, if the exception was taken from the same level
    add x21, sp, #S_FRAME_SIZE             <== original sp in x21 (to store it in the pt_regs later) 
      
     get_current_task tsk
    /* Save the task's original addr_limit and set USER_DS */
    ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]     
    str x20, [sp, #S_ORIG_ADDR_LIMIT]               <== copy VA address limit field to pt_regs on stack
    mov x20, #USER_DS
    str x20, [tsk, #TSK_TI_ADDR_LIMIT]      <== Q1. why?
    /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
    .endif /* \el == 0 */
    mrs x22, elr_el1
    mrs x23, spsr_el1
    stp lr, x21, [sp, #S_LR]    <== now stores lr (x30) and x21 (the original sp).
         So all 31 general-purpose registers and sp of pt_regs have now been stored.

    /*
     * In order to be able to dump the contents of struct pt_regs at the
     * time the exception was taken (in case we attempt to walk the call
     * stack later), chain it together with the stack frames.
     */
    .if \el == 0                   <== Q2. these 5 lines down. See question body.
    stp xzr, xzr, [sp, #S_STACKFRAME]
    .else
    stp x29, x22, [sp, #S_STACKFRAME]
    .endif
    add x29, sp, #S_STACKFRAME

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
alternative_if_not ARM64_HAS_PAN
    bl  __swpan_entry_el\el
alternative_else_nop_endif
#endif

    stp x22, x23, [sp, #S_PC]

    /* Not in a syscall by default (el0_svc overwrites for real syscall) */
    .if \el == 0
    mov w21, #NO_SYSCALL
    str w21, [sp, #S_SYSCALLNO]
    .endif

    /* Save pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
    mrs_s   x20, SYS_ICC_PMR_EL1
    str x20, [sp, #S_PMR_SAVE]
alternative_else_nop_endif

    /* Re-enable tag checking (TCO set on exception entry) */
#ifdef CONFIG_ARM64_MTE
alternative_if ARM64_MTE
    SET_PSTATE_TCO(0)
alternative_else_nop_endif
#endif

    /*
     * Registers that may be useful after this macro is invoked:
     *
     * x20 - ICC_PMR_EL1
     * x21 - aborted SP
     * x22 - aborted PC
     * x23 - aborted PSTATE
    */
    .endm

Q1: It looks like, in the preceding lines, the code has just copied the current task's address limit (as it was just before the exception) into pt_regs on the new stack frame. Now it sets the current task's task_struct.thread_info.addr_limit to #USER_DS. But this is still kernel context (the exception occurred while running in EL1 or EL2). So why is this done? Is it switching to a user process?

Q2: First, for the case \el == 1, i.e. when the exception was taken from the same EL (the exception occurred during kernel execution and was taken to the same EL), x29 (= fp, the frame pointer before the exception occurred) and x22 (= elr_el1) are written to the stackframe slot of pt_regs (at S_STACKFRAME). I thought that, for a normal function call, fp (the frame pointer before this function) and lr (the return address after this function) are placed at the bottom (lowest address) of the stack frame. So why are fp and elr stored in the stackframe slot of pt_regs here? Is it because, when we walk the call stack and there is this 'exception from the same EL', the pt_regs values are used to follow the call stack? And for the case \el == 0, why does it write zero to fp and lr? Probably because those pointer values are user virtual addresses and cannot be traced anyway?

Wow, it took a very long time to write these questions, and I realized some things while writing them. I know there are people who answer or comment on this kind of topic, and I have received good help many times. I hope to get answers or comments. Good comments will be upvoted.
Thank you for reading!

Chan Kim
  • 5,177
  • 12
  • 57
  • 112
  • I have not looked at the ARM64 versions. However, for ARM32 there are two issues that can be confusing. Issue 1: [mode switching](https://stackoverflow.com/questions/22928904/linux-kernel-arm-exception-stack-init/22940593#22940593) will change the view of the CPU. So, your analysis may apply to some other stack/state. Issue 2: The CPU may automatically push things on the stack and set values during exceptions. You need to account for this. – artless noise Sep 01 '22 at 14:36

0 Answers0