What is the relation between performance and attribute "frame-pointer" in llvm IR?

Question

I got a llvm IR from a C code and optimize by my own optimization order.

The runtime of the IR which attribute "frame-pointer"="none" is slower than the "frame-pointer"="all" one.

I think it is unreasonable. "frame-pointer = none" means that the frame pointer is not stored or loaded according to assembly code. And the registers are not occupied by frame pointer address.

Am I wrong?

Why is the performance getting worse by frame-pointer=none?

I used Linux tool - perf to profile two files.

frame-pinter=all one

  Performance counter stats for './190_all' (10 runs):

            142666      cache-misses              #    0.020 % of all cache refs      ( +-  5.01% )
         698701320      cache-references                                              ( +-  0.71% )
            234781      branch-misses                                                 ( +-  0.44% )
       13059296783      cycles                                                        ( +-  0.16% )
       59991967735      instructions              #    4.59  insn per cycle           ( +-  0.05% )

       3.417880975 seconds time elapsed                                          ( +-  0.26% )

frame-pointer=none one

  Performance counter stats for './190_none' (10 runs):

            352932      cache-misses              #    0.046 % of all cache refs      ( +-  2.58% )
         770977710      cache-references                                              ( +-  0.81% )
            260282      branch-misses                                                 ( +-  0.33% )
       30052057516      cycles                                                        ( +-  0.05% )
       60037013675      instructions              #    2.00  insn per cycle           ( +-  0.05% )

       7.921856465 seconds time elapsed                                          ( +-  0.05% )

We can see that the instruction num per cycle is different.

This is the IR I profiled.

; ModuleID = './IR_source/insertion_sort.ll'
source_filename = "insertion_sort.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [4 x i8] c"%d \00", align 1
@.str.2 = private unnamed_addr constant [22 x i8] c"insertion sort: %f s\0A\00", align 1

; Function Attrs: nofree noinline norecurse nosync nounwind uwtable
define dso_local void @insertionSort(i32* nocapture %arr, i32 %n) local_unnamed_addr #0 {
entry:
  %cmp2 = icmp sgt i32 %n, 1
  br i1 %cmp2, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.inc
  %indvars.iv = phi i64 [ 1, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %indvars.iv.next48 = add nsw i64 %indvars.iv, -1
  br label %land.end

land.end:                                         ; preds = %for.body, %while.body
  %indvars.iv.next410 = phi i64 [ %indvars.iv.next48, %for.body ], [ %indvars.iv.next4, %while.body ]
  %indvars.iv39 = phi i64 [ %indvars.iv, %for.body ], [ %indvars.iv.next410, %while.body ]
  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv.next410
  %1 = load i32, i32* %arrayidx3, align 4
  %cmp4 = icmp sgt i32 %1, %0
  br i1 %cmp4, label %while.body, label %for.inc

while.body:                                       ; preds = %land.end
  %arrayidx8 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv39
  store i32 %1, i32* %arrayidx8, align 4
  %indvars.iv.next4 = add nsw i64 %indvars.iv.next410, -1
  %cmp1 = icmp sgt i64 %indvars.iv.next410, 0
  br i1 %cmp1, label %land.end, label %for.inc, !llvm.loop !4

for.inc:                                          ; preds = %while.body, %land.end
  %.in.lcssa = phi i64 [ %indvars.iv39, %land.end ], [ 0, %while.body ]
  %sext = shl i64 %.in.lcssa, 32
  %idxprom11 = ashr exact i64 %sext, 32
  %arrayidx12 = getelementptr inbounds i32, i32* %arr, i64 %idxprom11
  store i32 %0, i32* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !6

for.end.loopexit:                                 ; preds = %for.inc
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Function Attrs: noinline nounwind uwtable
define dso_local void @printArray(i32* nocapture readonly %arr, i32 %n) local_unnamed_addr #1 {
entry:
  %cmp1 = icmp sgt i32 %n, 0
  br i1 %cmp1, label %for.inc.preheader, label %for.end

for.inc.preheader:                                ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.inc

for.inc:                                          ; preds = %for.inc.preheader, %for.inc
  %indvars.iv = phi i64 [ 0, %for.inc.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %call = tail call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %0) #6
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.end.loopexit, label %for.inc, !llvm.loop !7

for.end.loopexit:                                 ; preds = %for.inc
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %putchar = tail call i32 @putchar(i32 10)
  ret void
}

; Function Attrs: nofree nounwind
declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #2

; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #1 {
for.body.lr.ph:
  %call = tail call noalias align 16 dereferenceable_or_null(800000) i8* @malloc(i64 800000) #6
  %0 = bitcast i8* %call to i32*
  %call1 = tail call i64 @time(i64* null) #6
  %conv2 = trunc i64 %call1 to i32
  tail call void @srand(i32 %conv2) #6
  br label %for.inc

for.inc:                                          ; preds = %for.inc, %for.body.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.lr.ph ]
  %call4 = tail call i32 @rand() #6
  %rem = srem i32 %call4, 20000000
  %add = add nsw i32 %rem, 1
  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 200000
  br i1 %exitcond.not, label %for.end, label %for.inc, !llvm.loop !8

for.end:                                          ; preds = %for.inc
  %call6 = tail call i64 @clock() #6
  tail call void @insertionSort(i32* nonnull %0, i32 200000)
  %call7 = tail call i64 @clock() #6
  %sub = sub nsw i64 %call7, %call6
  %conv8 = sitofp i64 %sub to double
  %div = fdiv double %conv8, 1.000000e+06
  %call9 = tail call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([22 x i8], [22 x i8]* @.str.2, i64 0, i64 0), double %div) #6
  ret i32 0
}

; Function Attrs: inaccessiblememonly mustprogress nofree nounwind willreturn
declare dso_local noalias noundef i8* @malloc(i64 noundef) local_unnamed_addr #3

; Function Attrs: nounwind
declare dso_local void @srand(i32) local_unnamed_addr #4

; Function Attrs: nounwind
declare dso_local i64 @time(i64*) local_unnamed_addr #4

; Function Attrs: nounwind
declare dso_local i32 @rand() local_unnamed_addr #4

; Function Attrs: nounwind
declare dso_local i64 @clock() local_unnamed_addr #4

; Function Attrs: nofree nounwind
declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5

attributes #0 = { nofree noinline norecurse nosync nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { nofree nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #3 = { inaccessiblememonly mustprogress nofree nounwind willreturn "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #4 = { nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #5 = { nofree nounwind }
attributes #6 = { nounwind }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 1}
!2 = !{i32 7, !"frame-pointer", i32 2}
!3 = !{!"clang version 14.0.0 (https://github.com/llvm/llvm-project edff0070a126d9a27263958dbb22133f4ed6526e)"}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.mustprogress"}
!6 = distinct !{!6, !5}
!7 = distinct !{!7, !5}
!8 = distinct !{!8, !5}

Your reason looks correct. Without looking into exact benchmark that you do it is not likely to be answered. — Alex Guteniev, Nov 08 '21 at 15:26
Can you provide the exact IR you're optimizing? Could be that removing the frame pointer is introducing a data dependency and lowering the IPC. — Nick ODell, Nov 08 '21 at 18:39
@NickODell I provide the IR above. Also, I update the results of perf that put "brach-misses" in them. — Shane, Nov 09 '21 at 05:17
What was the compilation command you used to generate the binary? — A. K., May 10 '23 at 23:28
Read this: https://stackoverflow.com/questions/13006371/does-omitting-the-frame-pointers-really-have-a-positive-effect-on-performance-an — A. K., May 10 '23 at 23:30

What is the relation between performance and attribute "frame-pointer" in llvm IR?

0 Answers0