I cannot replicate your results using GCC-4.9.3 (gcc-arm-none-eabi-4.9.3.2015q2-1trusty1
from Terry Guo's PPA for Ubuntu 14.04.2 LTS on x86_64). Starting with file.c,
/* Declared only; no definitions appear in this file. */
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
/*
 * Obtain an area via AllocateHardwareArea(size), read size / sizeof(unsigned int)
 * consecutive unsigned ints from its start (discarding each value), then hand
 * the same pointer to ReleaseHardwareArea().
 */
void test(const unsigned int size)
{
void *hardware = AllocateHardwareArea(size);
/* volatile-qualified so that every read below has to be performed even though its value is unused. */
volatile unsigned int *reader = hardware;
unsigned int x;
/* Integer division: a remainder of size % sizeof *reader trailing bytes is never read. */
for (x = 0; x < size / sizeof *reader; x++)
/* Read one unsigned int, discard it, and advance to the next one. */
(void)*reader++;
/* hardware itself was never advanced, so this passes the original pointer. */
ReleaseHardwareArea(hardware);
}
using arm-none-eabi-gcc-4.9.3 -march=armv6 -mtune=arm6 -O3 -S file.c
compiles to the following assembly:
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "file.c"
.text
.align 2
.global test
.type test, %function
test:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ Annotations below map the instructions back to file.c; r0 holds `size` on entry.
stmfd sp!, {r4, lr}    @ save r4 and the return address
mov r4, r0    @ keep size in r4 across the call
bl AllocateHardwareArea    @ r0 = hardware
movs r2, r4, lsr #2    @ r2 = size >> 2 (number of reads); Z is set when that is zero
beq .L2    @ nothing to read: skip the loop
mov r3, r0    @ r3 = reader
add r2, r0, r2, asl #2    @ r2 = hardware + (count << 2), the end address
.L3:
ldr r1, [r3]    @ the volatile read; r1 is never used afterwards
add r3, r3, #4    @ reader++
cmp r3, r2
bne .L3    @ loop until reader reaches the end address
.L2:
ldmfd sp!, {r4, lr}    @ restore r4 and lr
b ReleaseHardwareArea    @ tail call; r0 still holds hardware
.size test, .-test
or, compiled to object code using arm-none-eabi-gcc-4.9.3 -march=armv6 -mtune=arm6 -O3 -c file.c, the disassembly produced by arm-none-eabi-objdump -d file.o is
file.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test>:
0: e92d4010 push {r4, lr}
4: e1a04000 mov r4, r0
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1b02124 lsrs r2, r4, #2
10: 0a000005 beq 2c <test+0x2c>
14: e1a03000 mov r3, r0
18: e0802102 add r2, r0, r2, lsl #2
1c: e5931000 ldr r1, [r3]
20: e2833004 add r3, r3, #4
24: e1530002 cmp r3, r2
28: 1afffffb bne 1c <test+0x1c>
2c: e8bd4010 pop {r4, lr}
30: eafffffe b 0 <ReleaseHardwareArea>
The allocated area is read, as it should be, in unsigned int-sized units. In the assembly source, the read loop is between labels .L3 and .L2. In the object code, the read loop is at 1c..28.
Edited to add: Olaf pointed out in a comment that the OP might be using a constant size. Let's examine that case, too:
/* Declared only; no definitions appear in this file. */
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
/* Compile-time constant size; the loop bound below divides it by sizeof(unsigned int). */
#define SIZE 32
/*
 * Same as the variable-size version, except that the size is the constant SIZE,
 * so the number of reads is known at compile time.
 */
void test(void)
{
void *hardware = AllocateHardwareArea(SIZE);
/* volatile-qualified so that every read below has to be performed even though its value is unused. */
volatile unsigned int *reader = hardware;
unsigned int x;
/* SIZE / sizeof *reader is a constant expression, so the trip count is fixed. */
for (x = 0; x < SIZE / sizeof *reader; x++)
/* Read one unsigned int, discard it, and advance to the next one. */
(void)*reader++;
/* hardware itself was never advanced, so this passes the original pointer. */
ReleaseHardwareArea(hardware);
}
The assembly is
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "file2.c"
.text
.align 2
.global test
.type test, %function
test:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ Annotations below map the instructions back to file2.c (SIZE = 32, i.e. eight reads).
stmfd sp!, {r3, lr}    @ save lr; r3 is pushed with it (presumably to keep sp 8-byte aligned)
mov r0, #32    @ argument: SIZE
bl AllocateHardwareArea    @ r0 = hardware
mov r3, r0    @ second copy of the base address
ldr r2, [r0]    @ the loop, fully unrolled: eight word reads at offsets 0..28
ldr r2, [r0, #4]    @ each loaded value is simply overwritten by the next load
ldr r2, [r0, #8]
ldr r2, [r0, #12]
ldr r2, [r0, #16]
ldr r2, [r0, #20]
ldr r2, [r0, #24]
ldr r3, [r3, #28]    @ last read, through the copy in r3
ldmfd sp!, {r3, lr}    @ restore lr (and reload r3)
b ReleaseHardwareArea    @ tail call; r0 still holds hardware
.size test, .-test
.ident "GCC: (GNU Tools for ARM Embedded Processors) 4.9.3 20150529 (release) [ARM/embedded-4_9-branch revision 224288]"
and the disassembly of the object code
00000000 <test>:
0: e92d4008 push {r3, lr}
4: e3a00020 mov r0, #32
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1a03000 mov r3, r0
10: e5902000 ldr r2, [r0]
14: e5902004 ldr r2, [r0, #4]
18: e5902008 ldr r2, [r0, #8]
1c: e590200c ldr r2, [r0, #12]
20: e5902010 ldr r2, [r0, #16]
24: e5902014 ldr r2, [r0, #20]
28: e5902018 ldr r2, [r0, #24]
2c: e593301c ldr r3, [r3, #28]
30: e8bd4008 pop {r3, lr}
34: eafffffe b 0 <ReleaseHardwareArea>
i.e. the loop is simply unrolled. Of course, if SIZE is less than 4, then SIZE / sizeof *reader is zero and the loop is optimized away. Unrolling occurs for SIZE <= 71 (at most 17 reads). For SIZE = 72 (18 reads), the compiler keeps a real loop, and the object code is
00000000 <test>:
0: e92d4008 push {r3, lr}
4: e3a00048 mov r0, #72 ; 0x48
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1a03000 mov r3, r0
10: e2802048 add r2, r0, #72 ; 0x48
14: e5931000 ldr r1, [r3]
18: e2833004 add r3, r3, #4
1c: e1530002 cmp r3, r2
20: 1afffffb bne 14 <test+0x14>
24: e8bd4008 pop {r3, lr}
28: eafffffe b 0 <ReleaseHardwareArea>
Since you are compiling with extreme optimizations (-O3), I recommend rewriting your code snippet, sprinkling const liberally, instead of assuming the compiler detects const-ness automatically. For example, using the same commands as above, the following version
/* Declared only; no definitions appear in this file. */
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
/*
 * Const-qualified variant: performs size / sizeof(unsigned int) volatile reads
 * from the start of the area returned by AllocateHardwareArea(size), then
 * releases the area. Nothing is modified after initialization except the
 * loop index.
 */
void test(const unsigned int size)
{
void *const hardware = AllocateHardwareArea(size);
/* The pointer is const (never advanced); the pointed-to data is volatile. */
volatile unsigned int *const reader = hardware;
/* Number of reads, computed once instead of in the loop condition. */
const unsigned int n = size / sizeof *reader;
unsigned int i;
for (i = 0; i < n; i++)
/* Expression statement whose only effect is the volatile read of element i. */
reader[i];
ReleaseHardwareArea(hardware);
}
performs the exact same task, but with one fewer instruction within the inner loop: the separate add is folded into a post-indexed ldr. The assembly is
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "new.c"
.text
.align 2
.global test
.type test, %function
test:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ Annotations below map the instructions back to new.c; r0 holds `size` on entry.
stmfd sp!, {r4, lr}    @ save r4 and the return address
mov r4, r0    @ keep size in r4 across the call
bl AllocateHardwareArea    @ r0 = hardware
movs r2, r4, lsr #2    @ r2 = n = size >> 2; Z is set when n is zero
beq .L2    @ n == 0: skip the loop
mov r3, r0    @ r3 = address of the next word to read
add r2, r0, r2, asl #2    @ r2 = hardware + (n << 2), the end address
.L3:
ldr r1, [r3], #4    @ volatile read, post-indexed: r3 advances by 4 in the same instruction
cmp r3, r2
bne .L3    @ loop until r3 reaches the end address
.L2:
ldmfd sp!, {r4, lr}    @ restore r4 and lr
b ReleaseHardwareArea    @ tail call; r0 still holds hardware
.size test, .-test
.ident "GCC: (GNU Tools for ARM Embedded Processors) 4.9.3 20150529 (release) [ARM/embedded-4_9-branch revision 224288]"
and the object code
Disassembly of section .text:
00000000 <test>:
0: e92d4010 push {r4, lr}
4: e1a04000 mov r4, r0
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1b02124 lsrs r2, r4, #2
10: 0a000004 beq 28 <test+0x28>
14: e1a03000 mov r3, r0
18: e0802102 add r2, r0, r2, lsl #2
1c: e4931004 ldr r1, [r3], #4
20: e1530002 cmp r3, r2
24: 1afffffc bne 1c <test+0x1c>
28: e8bd4010 pop {r4, lr}
2c: eafffffe b 0 <ReleaseHardwareArea>
Perhaps you could test whether your GCC compiles this latter version correctly? If not, we have a compiler bug at hand (assuming size is at least 4), possibly/likely already fixed in later versions.