I am currently optimizing a piece of Linux-based software running on an ARM processor. The optimizations mostly take the form of hand-written ARM and ARM NEON assembly functions.
To profile the software I use perf record and flame graphs. However, once I introduce the assembly functions, they no longer stack on top of the functions that call them but instead show up in seemingly random places.
My question therefore is: what should I include in my functions for them to appear properly in the call stacks?
There is a slightly related topic, How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?, but no good answer was given there. I use the same flags, plus -mapcs-frame.
Below, I give an example of a C function translated to ARM by GCC. This ARM code seems to produce decent stacks, but I would like to understand why (I have pulled out what I think is the relevant part after the listing).
int half(int in);
int sum(int in1, int in2);
int mean(int in1, int in2);
int half(int i)
{
return i / 2;
}
int sum(int i, int j)
{
return i + j;
}
int mean(int i, int j)
{
int s = sum(i, j);
int m = half(s);
return m;
}
int main()
{
int a = 1;
int b = 5;
int i;
int result;
for (i = 0; i<10000000; i++) {
result = mean(a, b);
}
return 0;
}
.cpu cortex-a9
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 6
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "a.c"
.text
.align 2
.global half
.type half, %function
half:
@ args = 0, pretend = 0, frame = 8
@ frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #8
str r0, [fp, #-16]
ldr r3, [fp, #-16]
mov r2, r3, lsr #31
add r3, r2, r3
mov r3, r3, asr #1
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size half, .-half
.align 2
.global sum
.type sum, %function
sum:
@ args = 0, pretend = 0, frame = 8
@ frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #8
str r0, [fp, #-16]
str r1, [fp, #-20]
ldr r2, [fp, #-16]
ldr r3, [fp, #-20]
add r3, r2, r3
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size sum, .-sum
.align 2
.global mean
.type mean, %function
mean:
@ args = 0, pretend = 0, frame = 16
@ frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #16
str r0, [fp, #-24]
str r1, [fp, #-28]
ldr r1, [fp, #-28]
ldr r0, [fp, #-24]
bl sum
str r0, [fp, #-16]
ldr r0, [fp, #-16]
bl half
str r0, [fp, #-20]
ldr r3, [fp, #-20]
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size mean, .-mean
.align 2
.global main
.type main, %function
main:
@ args = 0, pretend = 0, frame = 16
@ frame_needed = 1, uses_anonymous_args = 0
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #16
mov r3, #1
str r3, [fp, #-20]
mov r3, #5
str r3, [fp, #-24]
mov r3, #0
str r3, [fp, #-16]
b .L8
.L9:
ldr r1, [fp, #-24]
ldr r0, [fp, #-20]
bl mean
str r0, [fp, #-28]
ldr r3, [fp, #-16]
add r3, r3, #1
str r3, [fp, #-16]
.L8:
ldr r2, [fp, #-16]
movw r3, #38527
movt r3, 152
cmp r2, r3
ble .L9
mov r3, #0
mov r0, r3
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.size main, .-main
.ident "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)"
.section .note.GNU-stack,"",%progbits
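If I read the generated code correctly, GCC builds an APCS-style frame record in every prologue and chains the frame pointers together, which is presumably what lets perf walk the stack. Pulling just that pattern out of the listing above (the @ comments are my own interpretation, not GCC's):
@ prologue pattern shared by half, sum, mean and main
mov ip, sp @ ip = caller's sp on entry
stmfd sp!, {fp, ip, lr, pc} @ push caller's fp, caller's sp, return address, current pc
sub fp, ip, #4 @ fp now points at the saved pc; the caller's fp sits at [fp, #-12]
sub sp, sp, #8 @ room for locals
@ matching epilogue pattern
sub sp, fp, #12 @ rewind sp to the saved fp
ldmfd sp, {fp, sp, pc} @ fp = caller's fp, sp = caller's sp, pc = saved lr (return)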
-------------------EDIT-------------------
Here is an example of the kind of function I am trying to integrate. In terms of linkage, all it does is save the used registers and the link register on the stack at the beginning and restore them at the end. What should I add to it? (My guess at what that might look like is sketched after the listing.)
.section .text
.global ARM_smoothing
ARM_smoothing:
STMFD sp!, {r4-r12,lr} //save the used registers and lr on the stack (avoids segmentation faults)
MOV r5, r0
ADD r0, r0, r2
ADD r0, r0, r2
MOV r8, r0
ADD r8, r8, r2
ADD r8, r8, r2 //these 6 instructions create 3 pointers: the row above, the current row and the row below
ADD r1, r1, r2
ADD r1, r1, r2
ADD r1, r1, #2 //move destination pointer to the first element to smooth (one row down, one element in)
SUB r2, r2, #2
SUB r3, r3, #2 //counters decremented because smoothing function works with a margin of 1 on every side
LDR r9, =0x1C71C71D //(1/9)*2^32, to perform the division by 9
LDR r10, =0x2
LDR r11, =0xC //shifts for pointers to data
VLDR.U64 d20, =0x1C71C71D //(1/9)*2^32, to perform the division by 9
VLDR.U64 d22, =0x0 //initialization of zeros to be used (not necessarily needed)
VLDR.U64 d23, =0x0
VDUP.32 d20, d20[0] //initialize vector for multiplication
height_loop:
MOV r4, r2 //reset width counter
CMP r4, #8
BLGE width_loop_eight_smoothing //use NEON while at least 8 elements in the row still need smoothing
CMP r4, #1
BLGE width_loop_rest //use normal ARM for remaining elements, can't do in NEON because of margin
ADD r0, r0, #4 //skip margin
ADD r1, r1, #4
ADD r5, r5, #4
ADD r8, r8, #4
SUBS r3, r3, #1 //decrement row counter
BNE height_loop //loop while there still are rows
LDMFD sp!, {r4-r12,pc} //restore stack and return to calling function
width_loop_eight_smoothing:
SUB r4, r4, #8 //decrement width counter
VLD1.16 {d0, d1}, [r5], r10 //load upper left elements
VLD1.16 {d2, d3}, [r5], r10 //load upper middle elements
VADDL.S16 q2, d0, d2 //long addition of elements to be sure to not lose any data
VADDL.S16 q3, d1, d3
VLD1.16 {d0, d1}, [r5], r11 //load upper right elements
VLD1.16 {d2, d3}, [r0], r10 //load middle left elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4 //add to grand total
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r0], r10 //load current elements
VLD1.16 {d2, d3}, [r0], r11 //load middle right elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r8], r10 //load lower left elements
VLD1.16 {d2, d3}, [r8], r10 //load lower middle elements
VADDL.S16 q4, d0, d2
VADDL.S16 q5, d1, d3
VADD.S32 q2, q4
VADD.S32 q3, q5
VLD1.16 {d0, d1}, [r8], r11 //load lower right elements
VADDL.S16 q4, d0, d22
VADDL.S16 q5, d1, d23
VADD.S32 q2, q4
VADD.S32 q3, q5
VMULL.S32 q6, d4, d20 //divide by 9 (the upper 32 bits of each 64-bit product are the total divided by 9)
VMULL.S32 q7, d5, d20
VMULL.S32 q8, d6, d20
VMULL.S32 q9, d7, d20
VUZP.32 q6, q7 //pack the results into fewer registers and smaller elements
VUZP.32 q8, q9
VUZP.16 q7, q9
VSHR.U16 q8, q7, #15 //when the multiplied element is negative, the result is always one too low
VADD.S16 q7, q8 //correct it by adding the sign bit to the total
VST1.16 {d14, d15}, [r1]! //store results
CMP r4, #8 //check if there are enough elements to do 8 more in NEON
BCS width_loop_eight_smoothing //if yes, loop neon code
MOV PC, LR //return to ARM_smoothing if not
width_loop_rest: //works similarly to the NEON loop but one element at a time
LDRSH r6, [r0], #2 //LDRSH converts the loaded halfwords to signed full words
LDRSH r7, [r0] //the main difference is how the increments are done, since neighbouring elements overlap
ADD r6, r7, r6
LDRSH r7, [r0, #2]
ADD r6, r7, r6
LDRSH r7, [r5], #2
ADD r6, r7, r6
LDRSH r7, [r5]
ADD r6, r7, r6
LDRSH r7, [r5, #2]
ADD r6, r7, r6
LDRSH r7, [r8], #2
ADD r6, r7, r6
LDRSH r7, [r8]
ADD r6, r7, r6
LDRSH r7, [r8, #2]
ADD r6, r7, r6
SMULLS r6, r7, r6, r9
ADDMI r7, #1
STRH r7, [r1], #2
SUBS r4, #1 //decrement width counter and check if there's any left
BNE width_loop_rest
MOV PC, LR
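Based on the GCC output above, I suspect (untested, just to make the question concrete) that the entry and exit would have to build the same kind of frame record around the existing register save, roughly as sketched below; the .type/.size directives are copied from the GCC output as well, in case perf needs them to resolve the symbol. One complication I can already see is that the body uses r11 as a scratch register (LDR r11, =0xC), which would have to move to another register once r11/fp holds the frame pointer.
.section .text
.global ARM_smoothing
.type ARM_smoothing, %function
ARM_smoothing:
MOV ip, sp //keep the caller's sp, as in the GCC prologue
STMFD sp!, {fp, ip, lr, pc} //build the APCS frame record: caller's fp, caller's sp, lr, pc
SUB fp, ip, #4 //fp points at the saved pc, chaining this frame to the caller's
STMFD sp!, {r4-r10} //still preserve the callee-saved registers the body clobbers
//... body unchanged, apart from freeing up r11 ...
LDMFD sp!, {r4-r10} //restore the callee-saved registers
SUB sp, fp, #12 //rewind sp to the frame record
LDMFD sp, {fp, sp, pc} //restore the caller's fp and sp, return via the saved lr
.size ARM_smoothing, .-ARM_smoothing
Would something along these lines be enough, or is more needed for perf to stack the function correctly?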