I got a llvm IR from a C code and optimize by my own optimization order.
The runtime of the IR which attribute "frame-pointer"="none"
is slower than the "frame-pointer"="all"
one.
I think it is unreasonable. "frame-pointer = none"
means that the frame pointer is not stored or loaded according to assembly code.
And the registers are not occupied by frame pointer address.
Am I wrong?
Why is the performance getting worse by frame-pointer=none?
I used Linux tool - perf to profile two files.
frame-pinter=all
one
Performance counter stats for './190_all' (10 runs):
142666 cache-misses # 0.020 % of all cache refs ( +- 5.01% )
698701320 cache-references ( +- 0.71% )
234781 branch-misses ( +- 0.44% )
13059296783 cycles ( +- 0.16% )
59991967735 instructions # 4.59 insn per cycle ( +- 0.05% )
3.417880975 seconds time elapsed ( +- 0.26% )
frame-pointer=none
one
Performance counter stats for './190_none' (10 runs):
352932 cache-misses # 0.046 % of all cache refs ( +- 2.58% )
770977710 cache-references ( +- 0.81% )
260282 branch-misses ( +- 0.33% )
30052057516 cycles ( +- 0.05% )
60037013675 instructions # 2.00 insn per cycle ( +- 0.05% )
7.921856465 seconds time elapsed ( +- 0.05% )
We can see that the instruction num per cycle is different.
This is the IR I profiled.
; ModuleID = './IR_source/insertion_sort.ll'
source_filename = "insertion_sort.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@.str = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [22 x i8] c"insertion sort: %f s\0A\00", align 1
; Function Attrs: nofree noinline norecurse nosync nounwind uwtable
define dso_local void @insertionSort(i32* nocapture %arr, i32 %n) local_unnamed_addr #0 {
entry:
%cmp2 = icmp sgt i32 %n, 1
br i1 %cmp2, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.body: ; preds = %for.body.preheader, %for.inc
%indvars.iv = phi i64 [ 1, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%indvars.iv.next48 = add nsw i64 %indvars.iv, -1
br label %land.end
land.end: ; preds = %for.body, %while.body
%indvars.iv.next410 = phi i64 [ %indvars.iv.next48, %for.body ], [ %indvars.iv.next4, %while.body ]
%indvars.iv39 = phi i64 [ %indvars.iv, %for.body ], [ %indvars.iv.next410, %while.body ]
%arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv.next410
%1 = load i32, i32* %arrayidx3, align 4
%cmp4 = icmp sgt i32 %1, %0
br i1 %cmp4, label %while.body, label %for.inc
while.body: ; preds = %land.end
%arrayidx8 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv39
store i32 %1, i32* %arrayidx8, align 4
%indvars.iv.next4 = add nsw i64 %indvars.iv.next410, -1
%cmp1 = icmp sgt i64 %indvars.iv.next410, 0
br i1 %cmp1, label %land.end, label %for.inc, !llvm.loop !4
for.inc: ; preds = %while.body, %land.end
%.in.lcssa = phi i64 [ %indvars.iv39, %land.end ], [ 0, %while.body ]
%sext = shl i64 %.in.lcssa, 32
%idxprom11 = ashr exact i64 %sext, 32
%arrayidx12 = getelementptr inbounds i32, i32* %arr, i64 %idxprom11
store i32 %0, i32* %arrayidx12, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !6
for.end.loopexit: ; preds = %for.inc
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
; Function Attrs: noinline nounwind uwtable
define dso_local void @printArray(i32* nocapture readonly %arr, i32 %n) local_unnamed_addr #1 {
entry:
%cmp1 = icmp sgt i32 %n, 0
br i1 %cmp1, label %for.inc.preheader, label %for.end
for.inc.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.inc
for.inc: ; preds = %for.inc.preheader, %for.inc
%indvars.iv = phi i64 [ 0, %for.inc.preheader ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%call = tail call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %0) #6
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.end.loopexit, label %for.inc, !llvm.loop !7
for.end.loopexit: ; preds = %for.inc
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
%putchar = tail call i32 @putchar(i32 10)
ret void
}
; Function Attrs: nofree nounwind
declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #2
; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #1 {
for.body.lr.ph:
%call = tail call noalias align 16 dereferenceable_or_null(800000) i8* @malloc(i64 800000) #6
%0 = bitcast i8* %call to i32*
%call1 = tail call i64 @time(i64* null) #6
%conv2 = trunc i64 %call1 to i32
tail call void @srand(i32 %conv2) #6
br label %for.inc
for.inc: ; preds = %for.inc, %for.body.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.lr.ph ]
%call4 = tail call i32 @rand() #6
%rem = srem i32 %call4, 20000000
%add = add nsw i32 %rem, 1
%arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
store i32 %add, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 200000
br i1 %exitcond.not, label %for.end, label %for.inc, !llvm.loop !8
for.end: ; preds = %for.inc
%call6 = tail call i64 @clock() #6
tail call void @insertionSort(i32* nonnull %0, i32 200000)
%call7 = tail call i64 @clock() #6
%sub = sub nsw i64 %call7, %call6
%conv8 = sitofp i64 %sub to double
%div = fdiv double %conv8, 1.000000e+06
%call9 = tail call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([22 x i8], [22 x i8]* @.str.2, i64 0, i64 0), double %div) #6
ret i32 0
}
; Function Attrs: inaccessiblememonly mustprogress nofree nounwind willreturn
declare dso_local noalias noundef i8* @malloc(i64 noundef) local_unnamed_addr #3
; Function Attrs: nounwind
declare dso_local void @srand(i32) local_unnamed_addr #4
; Function Attrs: nounwind
declare dso_local i64 @time(i64*) local_unnamed_addr #4
; Function Attrs: nounwind
declare dso_local i32 @rand() local_unnamed_addr #4
; Function Attrs: nounwind
declare dso_local i64 @clock() local_unnamed_addr #4
; Function Attrs: nofree nounwind
declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5
attributes #0 = { nofree noinline norecurse nosync nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { nofree nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #3 = { inaccessiblememonly mustprogress nofree nounwind willreturn "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #4 = { nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #5 = { nofree nounwind }
attributes #6 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 1}
!2 = !{i32 7, !"frame-pointer", i32 2}
!3 = !{!"clang version 14.0.0 (https://github.com/llvm/llvm-project edff0070a126d9a27263958dbb22133f4ed6526e)"}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.mustprogress"}
!6 = distinct !{!6, !5}
!7 = distinct !{!7, !5}
!8 = distinct !{!8, !5}