Skip to content

Commit

Permalink
[SCEVExpander] Fix addrec cost model (llvm#106704)
Browse files Browse the repository at this point in the history
The current isHighCostExpansion cost model for addrecs computes the cost
for some kind of polynomial expansion that does not appear to have any
relation to addrec expansion whatsoever.

A literal expansion of an affine addrec is a phi and add (plus the
expansion of start and step). For a non-affine addrec, we get another
phi+add for each additional addrec nested in the step recurrence.

This partially fixes llvm#53205
(the runtime unroll test case in this PR).
  • Loading branch information
nikic authored Sep 19, 2024
1 parent c18be32 commit 4ec4ac1
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 56 deletions.
48 changes: 11 additions & 37 deletions llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1911,43 +1911,17 @@ template<typename T> static InstructionCost costAndCollectOperands(
break;
}
case scAddRecExpr: {
// In this polynominal, we may have some zero operands, and we shouldn't
// really charge for those. So how many non-zero coefficients are there?
int NumTerms = llvm::count_if(S->operands(), [](const SCEV *Op) {
return !Op->isZero();
});

assert(NumTerms >= 1 && "Polynominal should have at least one term.");
assert(!(*std::prev(S->operands().end()))->isZero() &&
"Last operand should not be zero");

// Ignoring constant term (operand 0), how many of the coefficients are u> 1?
int NumNonZeroDegreeNonOneTerms =
llvm::count_if(S->operands(), [](const SCEV *Op) {
auto *SConst = dyn_cast<SCEVConstant>(Op);
return !SConst || SConst->getAPInt().ugt(1);
});

// Much like with normal add expr, the polynominal will require
// one less addition than the number of it's terms.
InstructionCost AddCost = ArithCost(Instruction::Add, NumTerms - 1,
/*MinIdx*/ 1, /*MaxIdx*/ 1);
// Here, *each* one of those will require a multiplication.
InstructionCost MulCost =
ArithCost(Instruction::Mul, NumNonZeroDegreeNonOneTerms);
Cost = AddCost + MulCost;

// What is the degree of this polynominal?
int PolyDegree = S->getNumOperands() - 1;
assert(PolyDegree >= 1 && "Should be at least affine.");

// The final term will be:
// Op_{PolyDegree} * x ^ {PolyDegree}
// Where x ^ {PolyDegree} will again require PolyDegree-1 mul operations.
// Note that x ^ {PolyDegree} = x * x ^ {PolyDegree-1} so charging for
// x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free.
// FIXME: this is conservatively correct, but might be overly pessimistic.
Cost += MulCost * (PolyDegree - 1);
// Addrec expands to a phi and add per recurrence.
unsigned NumRecurrences = S->getNumOperands() - 1;
Cost += TTI.getCFInstrCost(Instruction::PHI, CostKind) * NumRecurrences;
Cost +=
TTI.getArithmeticInstrCost(Instruction::Add, S->getType(), CostKind) *
NumRecurrences;
// AR start is used in phi.
Worklist.emplace_back(Instruction::PHI, 0, S->getOperand(0));
// Other operands are used in add.
for (const SCEV *Op : S->operands().drop_front())
Worklist.emplace_back(Instruction::Add, 1, Op);
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,28 @@ define dso_local void @hoge() local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[N:%.*]] = sdiv exact i64 undef, 40
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 undef, [[N]]
; CHECK-NEXT: br label [[HEADER:%.*]]
; CHECK: header:
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH:%.*]] ], [ undef, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH]] ], [ undef, [[ENTRY]] ]
; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N]], [[IDX]]
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[INNER_PREHEADER:%.*]]
; CHECK: inner.preheader:
; CHECK-NEXT: br label [[INNER:%.*]]
; CHECK: inner:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[INNER]] ], [ 0, [[INNER_PREHEADER]] ]
; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNER]] ], [ [[N]], [[INNER_PREHEADER]] ]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[J_NEXT]] = add nsw i64 [[J]], 1
; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I]], 1
; CHECK-NEXT: store i64 undef, ptr @ptr, align 8
; CHECK-NEXT: [[COND1:%.*]] = icmp slt i64 [[J]], [[IDX]]
; CHECK-NEXT: br i1 [[COND1]], label [[INNER]], label [[INNER_EXIT:%.*]]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[I_NEXT]], [[INDVARS_IV]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER]], label [[INNER_EXIT:%.*]]
; CHECK: inner_exit:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[I_NEXT]], [[INNER]] ]
; CHECK-NEXT: [[INDVAR_USE:%.*]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: br label [[LATCH]]
; CHECK: latch:
; CHECK-NEXT: [[IDX_NEXT]] = add nsw i64 [[IDX]], -1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
; CHECK-NEXT: br label [[HEADER]]
; CHECK: end:
; CHECK-NEXT: ret void
Expand Down
91 changes: 78 additions & 13 deletions llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,27 @@ define void @selsort(ptr %array) #0 {
; CHECK-NEXT: br i1 [[CMP21_NOT]], label %[[FOR_END18:.*]], label %[[FOR_BODY_LR_PH:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAY]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP0]], -1
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], -2
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[BASE_022:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], %[[FOR_END:.*]] ]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[BASE_022]], -1
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], [[TMP10]]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[ADD]] = add nuw i64 [[BASE_022]], 1
; CHECK-NEXT: [[CMP318:%.*]] = icmp ult i64 [[ADD]], [[TMP0]]
; CHECK-NEXT: br i1 [[CMP318]], label %[[FOR_BODY4_PREHEADER:.*]], label %[[FOR_END]]
; CHECK: [[FOR_BODY4_PREHEADER]]:
; CHECK-NEXT: br label %[[FOR_BODY4:.*]]
; CHECK: [[FOR_BODY4]]:
; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4]] ], [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ]
; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4]] ], [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ]
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP11]], 3
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY4_PROL_PREHEADER:.*]], label %[[FOR_BODY4_PROL_LOOPEXIT:.*]]
; CHECK: [[FOR_BODY4_PROL_PREHEADER]]:
; CHECK-NEXT: br label %[[FOR_BODY4_PROL:.*]]
; CHECK: [[FOR_BODY4_PROL]]:
; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4_PROL]] ], [ [[BASE_022]], %[[FOR_BODY4_PROL_PREHEADER]] ]
; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4_PROL]] ], [ [[ADD]], %[[FOR_BODY4_PROL_PREHEADER]] ]
; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY4_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[FOR_BODY4_PROL]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[C_019]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_020]]
Expand All @@ -32,18 +42,69 @@ define void @selsort(ptr %array) #0 {
; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP7]], i64 [[C_019]], i64 [[MIN_020]]
; CHECK-NEXT: [[INC]] = add nuw i64 [[C_019]], 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INC]], [[TMP0]]
; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT:.*]]
; CHECK: [[FOR_END_LOOPEXIT]]:
; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4]] ]
; CHECK-NEXT: br label %[[FOR_END]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY4_PROL]], label %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[MIN_020_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ]
; CHECK-NEXT: [[C_019_UNR_PH:%.*]] = phi i64 [ [[INC]], %[[FOR_BODY4_PROL]] ]
; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ]
; CHECK-NEXT: br label %[[FOR_BODY4_PROL_LOOPEXIT]]
; CHECK: [[FOR_BODY4_PROL_LOOPEXIT]]:
; CHECK-NEXT: [[MIN_020_UNR:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ], [ [[MIN_020_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CHECK-NEXT: [[C_019_UNR:%.*]] = phi i64 [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ], [ [[C_019_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR:%.*]] = phi i64 [ poison, %[[FOR_BODY4_PREHEADER]] ], [ [[SPEC_SELECT_LCSSA_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], 3
; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY4_PREHEADER_NEW:.*]]
; CHECK: [[FOR_BODY4_PREHEADER_NEW]]:
; CHECK-NEXT: br label %[[FOR_BODY4:.*]]
; CHECK: [[FOR_BODY4]]:
; CHECK-NEXT: [[MIN_20:%.*]] = phi i64 [ [[MIN_020_UNR]], %[[FOR_BODY4_PREHEADER_NEW]] ], [ [[SPEC_SELECT_3:%.*]], %[[FOR_BODY4]] ]
; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[C_019_UNR]], %[[FOR_BODY4_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY4]] ]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]]
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_20]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[CMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[SPEC_SELECT1:%.*]] = select i1 [[CMP8]], i64 [[MIN_0_LCSSA]], i64 [[MIN_20]]
; CHECK-NEXT: [[INC1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 1
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC1]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT1]]
; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX6_1]], align 4
; CHECK-NEXT: [[CMP7_1:%.*]] = icmp ult i32 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[SPEC_SELECT_1:%.*]] = select i1 [[CMP7_1]], i64 [[INC1]], i64 [[SPEC_SELECT1]]
; CHECK-NEXT: [[INC_1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 2
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_1]]
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4
; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_1]]
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_2]], align 4
; CHECK-NEXT: [[CMP7_2:%.*]] = icmp ult i32 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[SPEC_SELECT_2:%.*]] = select i1 [[CMP7_2]], i64 [[INC_1]], i64 [[SPEC_SELECT_1]]
; CHECK-NEXT: [[INC_2:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 3
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_2]]
; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4
; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_2]]
; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX6_3]], align 4
; CHECK-NEXT: [[CMP7_3:%.*]] = icmp ult i32 [[TMP16]], [[TMP17]]
; CHECK-NEXT: [[SPEC_SELECT_3]] = select i1 [[CMP7_3]], i64 [[INC_2]], i64 [[SPEC_SELECT_2]]
; CHECK-NEXT: [[INC_3]] = add nuw i64 [[MIN_0_LCSSA]], 4
; CHECK-NEXT: [[CMP3_3:%.*]] = icmp ult i64 [[INC_3]], [[TMP0]]
; CHECK-NEXT: br i1 [[CMP3_3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]]
; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[SPEC_SELECT_LCSSA_PH:%.*]] = phi i64 [ [[SPEC_SELECT_3]], %[[FOR_BODY4]] ]
; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]]
; CHECK: [[FOR_END_LOOPEXIT]]:
; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT_LCSSA_UNR]], %[[FOR_BODY4_PROL_LOOPEXIT]] ], [ [[SPEC_SELECT_LCSSA_PH]], %[[FOR_END_LOOPEXIT_UNR_LCSSA]] ]
; CHECK-NEXT: br label %[[FOR_END]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: [[MIN_0_LCSSA1:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA1]]
; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]]
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END18_LOOPEXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[FOR_END18_LOOPEXIT]]:
Expand Down Expand Up @@ -96,3 +157,7 @@ for.end18: ; preds = %for.end, %entry
}

attributes #0 = { "tune-cpu"="generic" }
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
;.

0 comments on commit 4ec4ac1

Please sign in to comment.