Here are some observations I made. I am going to show some Intel-syntax assembly code I got from compiling this program, using Compiler Explorer. To understand what follows you don't have to know a lot of assembly; the most important element here is size: the bigger it is, the slower it is. If I could I would make this post smaller, but the generated code is surprisingly huge, and I don't know enough about Go to tell what is useless. If you want to see what each statement converts to in assembly, Compiler Explorer will highlight everything for you.
TL;DR:
The Go compiler's output is a disastrous mess in my opinion; the C++ code gets very well optimized; and the Java bytecode is small compared to Go's output. JIT'ing may have a major effect on the Java code, though it may also be too complex for a loop-unrolling/inlining optimization (precomputing the value of count).
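For context, here is roughly the Go program under discussion, reconstructed from the format strings and calls visible in the assembly below; the exact variable names and the nanosecond-to-millisecond conversion are my guesses, not the original source:
package main

import (
    "fmt"
    "time"
)

func main() {
    count := 0
    // The assembly divides time.Now()'s nanoseconds by 1,000,000,
    // so the original presumably measured milliseconds roughly like this.
    start := time.Now().UnixNano() / 1000000
    for i := 0; i < 10; i++ {
        fmt.Printf("On step %d\n", i)
        for j := 0; j < 2000000000; j++ {
            count++
        }
    }
    total := time.Now().UnixNano()/1000000 - start
    fmt.Printf("Total time took: %d to get at count: %d\n", total, count)
}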
The Go code compiled down to this monstrosity:
text "".main(SB), $224-0
movq (TLS), CX
leaq -96(SP), AX
cmpq AX, 16(CX)
jls 835
subq $224, SP
movq BP, 216(SP)
leaq 216(SP), BP
funcdata $0, gclocals·f6bd6b3389b872033d462029172c8612(SB)
funcdata $1, gclocals·17283ea8379a997487dd6f8baf7ae6ea(SB)
pcdata $0, $0
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+160(SP)
movq CX, time.t·2+168(SP)
movq AX, time.t·2+176(SP)
movq time.t·2+160(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 806
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
andq $1073741823, DX
movlqsx DX, DX
imulq $1000000000, CX
addq DX, CX
movq $-6795364578871345152, DX
addq DX, CX
movq $4835703278458516699, AX
imulq CX
sarq $63, CX
sarq $18, DX
subq CX, DX
movq DX, "".start+72(SP)
xorl AX, AX
movq AX, CX
jmp 257
incq CX
incq AX
cmpq CX, $2000000000
jlt 213
movq "".i+80(SP), SI
incq SI
movq "".start+72(SP), DX
movq $59453308800, BX
movq AX, CX
movq SI, AX
movq CX, "".count+88(SP)
cmpq AX, $10
jge 404
movq AX, "".i+80(SP)
movq AX, ""..autotmp_24+112(SP)
xorps X0, X0
movups X0, ""..autotmp_23+120(SP)
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_24+112(SP), DX
movq DX, 8(SP)
pcdata $0, $1
call runtime.convT2E64(SB)
movq 24(SP), AX
movq 16(SP), CX
movq CX, ""..autotmp_23+120(SP)
movq AX, ""..autotmp_23+128(SP)
leaq go.string."On step %d\n"(SB), AX
movq AX, (SP)
movq $11, 8(SP)
leaq ""..autotmp_23+120(SP), CX
movq CX, 16(SP)
movq $1, 24(SP)
movq $1, 32(SP)
pcdata $0, $1
call fmt.Printf(SB)
movq "".count+88(SP), AX
xorl CX, CX
jmp 219
pcdata $0, $2
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+136(SP)
movq CX, time.t·2+144(SP)
movq AX, time.t·2+152(SP)
movq time.t·2+136(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 787
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
imulq $1000000000, CX
andq $1073741823, DX
movlqsx DX, DX
addq DX, CX
movq $-6795364578871345152, DX
leaq (DX)(CX*1), AX
movq AX, "".~R0+64(SP)
movq $4835703278458516699, CX
imulq CX
sarq $18, DX
movq "".~R0+64(SP), CX
sarq $63, CX
subq CX, DX
movq "".start+72(SP), CX
subq CX, DX
movq DX, ""..autotmp_29+104(SP)
movq "".count+88(SP), CX
movq CX, ""..autotmp_30+96(SP)
xorps X0, X0
movups X0, ""..autotmp_28+184(SP)
movups X0, ""..autotmp_28+200(SP)
leaq type.int64(SB), CX
movq CX, (SP)
leaq ""..autotmp_29+104(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 16(SP), CX
movq 24(SP), DX
movq CX, ""..autotmp_28+184(SP)
movq DX, ""..autotmp_28+192(SP)
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_30+96(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 24(SP), CX
movq 16(SP), DX
movq DX, ""..autotmp_28+200(SP)
movq CX, ""..autotmp_28+208(SP)
leaq go.string."Total time took: %d to get at count: %d\n"(SB), CX
movq CX, (SP)
movq $40, 8(SP)
leaq ""..autotmp_28+184(SP), CX
movq CX, 16(SP)
movq $2, 24(SP)
movq $2, 32(SP)
pcdata $0, $3
call fmt.Printf(SB)
movq 216(SP), BP
addq $224, SP
ret
movq time.t·2+144(SP), BX
movq CX, DX
movq BX, CX
jmp 501
movq time.t·2+168(SP), SI
movq CX, DX
movq $59453308800, BX
movq SI, CX
jmp 144
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text "".init(SB), $8-0
movq (TLS), CX
cmpq SP, 16(CX)
jls 89
subq $8, SP
movq BP, (SP)
leaq (SP), BP
funcdata $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
movblzx "".initdone·(SB), AX
cmpb AL, $1
jls 47
movq (SP), BP
addq $8, SP
ret
jne 56
pcdata $0, $0
call runtime.throwinit(SB)
undef
movb $1, "".initdone·(SB)
pcdata $0, $0
call fmt.init(SB)
pcdata $0, $0
call time.init(SB)
movb $2, "".initdone·(SB)
movq (SP), BP
addq $8, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text type..hash.[2]interface {}(SB), DUPOK, $40-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 103
subq $40, SP
movq BP, 32(SP)
leaq 32(SP), BP
funcdata $0, gclocals·d4dc2f11db048877dbc0f60a22b4adb3(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl AX, AX
movq "".h+56(SP), CX
jmp 82
movq AX, "".i+24(SP)
shlq $4, AX
movq "".p+48(SP), BX
addq BX, AX
movq AX, (SP)
movq CX, 8(SP)
pcdata $0, $0
call runtime.nilinterhash(SB)
movq 16(SP), CX
movq "".i+24(SP), AX
incq AX
cmpq AX, $2
jlt 38
movq CX, "".~r2+64(SP)
movq 32(SP), BP
addq $40, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text type..eq.[2]interface {}(SB), DUPOK, $48-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 155
subq $48, SP
movq BP, 40(SP)
leaq 40(SP), BP
funcdata $0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl AX, AX
jmp 46
movq ""..autotmp_8+32(SP), CX
leaq 1(CX), AX
cmpq AX, $2
jge 140
movq AX, CX
shlq $4, AX
movq "".p+56(SP), DX
movq 8(AX)(DX*1), BX
movq (AX)(DX*1), SI
movq "".q+64(SP), DI
movq 8(AX)(DI*1), R8
movq (AX)(DI*1), AX
cmpq SI, AX
jne 125
movq CX, ""..autotmp_8+32(SP)
movq SI, (SP)
movq BX, 8(SP)
movq R8, 16(SP)
pcdata $0, $0
call runtime.efaceeq(SB)
movblzx 24(SP), AX
testb AL, AL
jne 37
movb $0, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
movb $1, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
I have no clue what most of that is doing. I can only hope that most of that is some sort of GC code. I looked up how to enable optimizations for the Go compiler, and all I could find was how to disable optimizations.
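For what it's worth, at least part of it is not GC code but interface boxing: fmt.Printf takes a variadic ...interface{} parameter, so every int argument gets wrapped in an interface value before the call. That is what the runtime.convT2E64 calls are doing, and the type..hash.[2]interface {} and type..eq.[2]interface {} functions appear to be compiler-generated helpers for the two-element interface array that backs the final Printf call. A hand-written sketch of what that call amounts to (my own illustration, not the actual generated code):
package main

import "fmt"

func main() {
    total := int64(1234)        // stand-in for the elapsed time
    count := int64(20000000000) // stand-in for the final count
    // Printf(format string, a ...interface{}): both integers are boxed
    // into interface values first, which is what runtime.convT2E64 does,
    // and the two boxed values live in a [2]interface{} backing array.
    args := [2]interface{}{total, count}
    fmt.Printf("Total time took: %d to get at count: %d\n", args[:]...)
}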
For comparison, I looked at a similar program in C++:
#include <cstdio>
#include <chrono>
#include <cinttypes>
using namespace std::chrono;
milliseconds getMS()
{
return duration_cast< milliseconds >(
system_clock::now().time_since_epoch()
);
}
int main()
{
int count = 0;
milliseconds millis = getMS();
for(int i = 0; i < 10; ++i)
{
printf("On step %d\n", i);
for(int j = 0; j < 2000000000; ++j)
{
++count;
}
}
milliseconds time = getMS() - millis;
printf("Total time took: %" PRId64 " to get at count: %d\n", time.count(), count);
}
Which, without optimizations, compiled down to this (compiler: x86-64 clang trunk, probably 6.0.0; flags: -std=c++0x -O0):
main: # @main
push rbp
mov rbp, rsp
sub rsp, 48
mov dword ptr [rbp - 4], 0
mov dword ptr [rbp - 8], 0
call getMS()
mov qword ptr [rbp - 16], rax
mov dword ptr [rbp - 20], 0
.LBB3_1: # =>This Loop Header: Depth=1
cmp dword ptr [rbp - 20], 10
jge .LBB3_8
mov esi, dword ptr [rbp - 20]
movabs rdi, offset .L.str
mov al, 0
call printf
mov dword ptr [rbp - 24], 0
mov dword ptr [rbp - 44], eax # 4-byte Spill
.LBB3_3: # Parent Loop BB3_1 Depth=1
cmp dword ptr [rbp - 24], 2000000000
jge .LBB3_6
mov eax, dword ptr [rbp - 8]
add eax, 1
mov dword ptr [rbp - 8], eax
mov eax, dword ptr [rbp - 24]
add eax, 1
mov dword ptr [rbp - 24], eax
jmp .LBB3_3
.LBB3_6: # in Loop: Header=BB3_1 Depth=1
jmp .LBB3_7
.LBB3_7: # in Loop: Header=BB3_1 Depth=1
mov eax, dword ptr [rbp - 20]
add eax, 1
mov dword ptr [rbp - 20], eax
jmp .LBB3_1
.LBB3_8:
call getMS()
mov qword ptr [rbp - 40], rax
lea rdi, [rbp - 40]
lea rsi, [rbp - 16]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rbp - 32], rax
lea rdi, [rbp - 32]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
mov edx, dword ptr [rbp - 8]
movabs rdi, offset .L.str.1
mov rsi, rax
mov al, 0
call printf
mov edx, dword ptr [rbp - 4]
mov dword ptr [rbp - 48], eax # 4-byte Spill
mov eax, edx
add rsp, 48
pop rbp
ret
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
There is actually a lot more code than that, but the rest is just the chrono implementation; in the optimized code it becomes a plain library function call. I've also removed the implementation of getMS because it is mostly a wrapper.
With -O1 optimizations this turns into:
main: # @main
push rbx
sub rsp, 32
call getMS()
mov qword ptr [rsp + 24], rax
xor ebx, ebx
.LBB3_1: # =>This Inner Loop Header: Depth=1
mov edi, offset .L.str
xor eax, eax
mov esi, ebx
call printf
add ebx, 1
cmp ebx, 10
jne .LBB3_1
call getMS()
mov qword ptr [rsp + 8], rax
lea rdi, [rsp + 8]
lea rsi, [rsp + 24]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rsp + 16], rax
lea rdi, [rsp + 16]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
mov rcx, rax
mov edi, offset .L.str.1
mov edx, -1474836480
xor eax, eax
mov rsi, rcx
call printf
xor eax, eax
add rsp, 32
pop rbx
ret
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
The -O2 (speed) and -O3 (max) optimizations essentially boil down to an unrolled outer loop (which only exists for the print statements) and a precomputed count value.
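In fact, even the -O1 listing above has already precomputed the count: the inner counting loop is gone entirely (the only loop left just drives the printf calls), and the odd-looking mov edx, -1474836480 is the final count, 10 * 2,000,000,000 wrapped to a 32-bit value (count is an int in the C++ source, so the addition overflows, which is technically undefined behaviour, but clang folds it to the wrapped constant anyway). A quick standalone check of that arithmetic, written in Go like the other sketches in this post:
package main

import "fmt"

func main() {
    // 10 outer iterations, 2,000,000,000 increments each.
    total := int64(10) * 2000000000
    fmt.Println(total) // 20000000000
    // Truncated to 32 bits this wraps around, matching the
    // constant clang loads into edx in the -O1 output above.
    fmt.Println(int32(total)) // -1474836480
}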
This mostly shows the terrible code generated by Go and some of the optimizations that occur in C++. But none of this shows exactly what the Java bytecode contains or what that will get JIT'ed down to if it is run enough times. So here is the Java bytecode:
public static void countToTwentyBillion();
Code:
  0: lconst_0
  1: lstore_0
  2: invokestatic  #2    // Method java/lang/System.currentTimeMillis:()J
  5: lstore_2
  6: iconst_0
  7: istore        4
  9: iload         4
 11: bipush        10
 13: if_icmpge     68
 16: getstatic     #3    // Field java/lang/System.out:Ljava/io/PrintStream;
 19: new           #4    // class java/lang/StringBuilder
 22: dup
 23: invokespecial #5    // Method java/lang/StringBuilder.'<init>':()V
 26: ldc           #6    // String On step
 28: invokevirtual #7    // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
 31: iload         4
 33: invokevirtual #8    // Method java/lang/StringBuilder.append:(I)Ljava/lang/StringBuilder;
 36: invokevirtual #9    // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
 39: invokevirtual #10   // Method java/io/PrintStream.println:(Ljava/lang/String;)V
 42: iconst_0
 43: istore        5
 45: iload         5
 47: ldc           #11   // int 2000000000
 49: if_icmpge     62
 52: lload_0
 53: lconst_1
 54: ladd
 55: lstore_0
 56: iinc          5, 1
 59: goto          45
 62: iinc          4, 1
 65: goto          9
 68: invokestatic  #2    // Method java/lang/System.currentTimeMillis:()J
 71: lstore        4
 73: getstatic     #3    // Field java/lang/System.out:Ljava/io/PrintStream;
 76: new           #4    // class java/lang/StringBuilder
 79: dup
 80: invokespecial #5    // Method java/lang/StringBuilder.'<init>':()V
 83: ldc           #12   // String Total time took:
 85: invokevirtual #7    // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
 88: lload         4
 90: lload_2
 91: lsub
 92: invokevirtual #13   // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
 95: ldc           #14   // String ms to get at count:
 97: invokevirtual #7    // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
100: lload_0
101: invokevirtual #13   // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
104: invokevirtual #9    // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
107: invokevirtual #10   // Method java/io/PrintStream.println:(Ljava/lang/String;)V
110: return
Unfortunately, at the moment I don't feel like compiling hsdis and JIT'ing the code, but it will probably end up looking like some of the C++ examples. From what I know about the JIT, it might be able to precompute the count value, but this code is a bit complex (in terms of looping), which may make it harder for a quick JIT optimization.