Adding to Maciej's answer and Andy's comment, let's check the code that is generated.
Using this Makefile:
# Builds test.cpp with the NDK's aarch64 clang++ and prints the disassembly of
# the functions of interest.
#
# Expected from the environment / command line (neither is set in this file):
#   NDKPATH   - root directory of the Android NDK
#   OPTLEVEL  - value appended to -O (e.g. 0 or 2)
# Example: export NDKPATH=~/.androidsdk/ndk-bundle; OPTLEVEL=2 make dump

CXX = $(NDKPATH)/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++
# CC is pointed at clang++ too: the "test: test.o" rule below has no recipe, so
# make's built-in link rule runs, and that rule invokes $(CC).
CC = $(NDKPATH)/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++
# NOTE(review): INC and LIB are defined but not referenced by any rule or flag
# variable in this Makefile.
INC = -I$(NDKPATH)/cxx-stl/llvm-libc++/include/
LIB = -L$(NDKPATH)/cxx-stl/llvm-libc++/lib/
CXXFLAGS = -ggdb -O$(OPTLEVEL)

.PHONY: all clean dump

all: dump

# Disassemble the binary and keep only the paragraphs whose text matches
# "<big", "<small" or "::resize": printing is switched on at a matching line
# and switched off again at the next empty line.
# Make expands "$$" to "$" and "$0" to nothing before the shell sees the
# command, so gawk actually receives '/^$/' and a bare 'print' (which prints
# the whole line anyway).
# Recipe lines must start with a TAB character.
dump: test
	$(NDKPATH)/toolchains/llvm/prebuilt/linux-x86_64/aarch64-linux-android/bin/objdump -d -C test | gawk '/<big|<small|::resize/ {p=1} /^$$/ {p=0} {if (p) print $0}'

clean:
	$(RM) test.o test

# No recipe here: make's built-in rules compile test.cpp into test.o with
# $(CXX) $(CXXFLAGS) and link test.o into test with $(CC).
test: test.o
...and a very simple test.cpp:
#include <vector>
using std::vector;
// Resizes the caller's vector to 10,000,000 elements.
// Kept as a standalone function so its generated code can be inspected
// in the disassembly.
void big(vector<int>& v) {
    v.resize(10000000);  // requested new size: ten million ints
}
// Resizes the caller's vector to zero elements.
// Counterpart to big(): same call, but with a requested size of 0.
void small(vector<int>& v) {
    v.resize(0);  // requested new size: zero
}
// Entry point; calls neither big() nor small(). It only has to exist so the
// program links and the resulting binary can be disassembled.
int main() {
    return 0;
}
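When compiling without optimization (`-O0`), note how both `big()` and `small()` call `resize()`, which itself makes further out-of-line calls (`size()`, then `__append()` or `__destruct_at_end()`) and does a whole bunch of stuff in a loop (as you've also found in the source code).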
ndk-vector-speed$ export NDKPATH=~/.androidsdk/ndk-bundle
ndk-vector-speed$ make clean && OPTLEVEL=0 make dump
rm -f test.o test
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++ -ggdb -O0 -c -o test.o test.cpp
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++ test.o -o test
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/aarch64-linux-android/bin/objdump -d -C test | gawk '/<big|<small|::resize/ {p=1} /^$/ {p=0} {if (p) print }'
0000000000000f04 <big(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)>:
f04: d10083ff sub sp, sp, #0x20
f08: a9017bfd stp x29, x30, [sp,#16]
f0c: 910043fd add x29, sp, #0x10
f10: d292d001 mov x1, #0x9680 // #38528
f14: f2a01301 movk x1, #0x98, lsl #16
f18: f90007e0 str x0, [sp,#8]
f1c: f94007e0 ldr x0, [sp,#8]
f20: 94000013 bl f6c <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)>
f24: a9417bfd ldp x29, x30, [sp,#16]
f28: 910083ff add sp, sp, #0x20
f2c: d65f03c0 ret
0000000000000f30 <small(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)>:
f30: d10083ff sub sp, sp, #0x20
f34: a9017bfd stp x29, x30, [sp,#16]
f38: 910043fd add x29, sp, #0x10
f3c: d2800001 mov x1, #0x0 // #0
f40: f90007e0 str x0, [sp,#8]
f44: f94007e0 ldr x0, [sp,#8]
f48: 94000009 bl f6c <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)>
f4c: a9417bfd ldp x29, x30, [sp,#16]
f50: 910083ff add sp, sp, #0x20
f54: d65f03c0 ret
0000000000000f6c <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)>:
f6c: d100c3ff sub sp, sp, #0x30
f70: a9027bfd stp x29, x30, [sp,#32]
f74: 910083fd add x29, sp, #0x20
f78: f81f83a0 stur x0, [x29,#-8]
f7c: f9000be1 str x1, [sp,#16]
f80: f85f83a0 ldur x0, [x29,#-8]
f84: f90003e0 str x0, [sp]
f88: 94000020 bl 1008 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::size() const>
f8c: f90007e0 str x0, [sp,#8]
f90: f94007e0 ldr x0, [sp,#8]
f94: f9400be1 ldr x1, [sp,#16]
f98: eb01001f cmp x0, x1
f9c: 1a9f27e8 cset w8, cc
fa0: 37000048 tbnz w8, #0, fa8 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)+0x3c>
fa4: 14000007 b fc0 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)+0x54>
fa8: f9400be8 ldr x8, [sp,#16]
fac: f94007e9 ldr x9, [sp,#8]
fb0: eb090101 subs x1, x8, x9
fb4: f94003e0 ldr x0, [sp]
fb8: 9400001e bl 1030 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::__append(unsigned long)>
fbc: 14000010 b ffc <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)+0x90>
fc0: f94007e8 ldr x8, [sp,#8]
fc4: f9400be9 ldr x9, [sp,#16]
fc8: eb09011f cmp x8, x9
fcc: 1a9f97ea cset w10, hi
fd0: 3700004a tbnz w10, #0, fd8 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)+0x6c>
fd4: 1400000a b ffc <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::resize(unsigned long)+0x90>
fd8: b27e03e8 orr x8, xzr, #0x4
fdc: f94003e9 ldr x9, [sp]
fe0: f9400129 ldr x9, [x9]
fe4: f9400bea ldr x10, [sp,#16]
fe8: 9b0a7d08 mul x8, x8, x10
fec: 8b080128 add x8, x9, x8
ff0: f94003e0 ldr x0, [sp]
ff4: aa0803e1 mov x1, x8
ff8: 94000054 bl 1148 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::__destruct_at_end(int*)>
ffc: a9427bfd ldp x29, x30, [sp,#32]
1000: 9100c3ff add sp, sp, #0x30
1004: d65f03c0 ret
With `-O2`, the compiler can do a lot of optimization for us.
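First of all, the standalone `resize()` function is completely gone: what each caller needs from it has been inlined into that caller, so nothing calls it anymore and no out-of-line copy is emitted.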
`big()` has inlined what it needs from `resize()`, jumping to `__append()` directly instead (as a tail call: `b` rather than `bl`), and looks generally simpler than the full `resize()` function we called before. Since I haven't run this code, I can't make any claims regarding how much this helps with speed.
`small()` now has no function calls, no loops, and only five instructions (which I've annotated manually below). It has essentially become `if (v.begin != v.end) v.end = v.begin`. This will of course be very fast.
ndk-vector-speed$ make clean && OPTLEVEL=2 make dump
rm -f test.o test
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++ -ggdb -O2 -c -o test.o test.cpp
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android29-clang++ test.o -o test
/home/snild/.androidsdk/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/aarch64-linux-android/bin/objdump -d -C test | gawk '/<big|<small|::resize/ {p=1} /^$/ {p=0} {if (p) print }'
0000000000000e64 <big(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)>:
e64: a9402408 ldp x8, x9, [x0]
e68: 5292d00a mov w10, #0x9680 // #38528
e6c: 72a0130a movk w10, #0x98, lsl #16
e70: cb080129 sub x9, x9, x8
e74: 9342fd2b asr x11, x9, #2
e78: eb0a017f cmp x11, x10
e7c: 54000062 b.cs e88 <big(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)+0x24>
e80: cb0b0141 sub x1, x10, x11
e84: 14000011 b ec8 <std::__ndk1::vector<int, std::__ndk1::allocator<int> >::__append(unsigned long)>
e88: 528b400a mov w10, #0x5a00 // #23040
e8c: 72a04c4a movk w10, #0x262, lsl #16
e90: eb0a013f cmp x9, x10
e94: 540000a0 b.eq ea8 <big(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)+0x44>
e98: 528b4009 mov w9, #0x5a00 // #23040
e9c: 72a04c49 movk w9, #0x262, lsl #16
ea0: 8b090108 add x8, x8, x9
ea4: f9000408 str x8, [x0,#8]
ea8: d65f03c0 ret
0000000000000eac <small(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)>:
eac: a9402408 ldp x8, x9, [x0] // load the first two values (begin and end) from v
eb0: eb08013f cmp x9, x8 // compare them
eb4: 54000040 b.eq ebc <small(std::__ndk1::vector<int, std::__ndk1::allocator<int> >&)+0x10>
// skip to 'ret' if they were equal
eb8: f9000408 str x8, [x0,#8] // write v.begin to v.end
ebc: d65f03c0 ret // return.
Conclusion: Maciej and Andy are correct; you're not building with optimizations enabled.