Skip to content

Commit 809c6eb

Browse files
[LLVM][LV] Improve UF calculation for vscale based scalar loops.
Update getSmallConstantTripCount() to return scalable ElementCount values that are used to accurately determine the maximum value for UF, namely: TripCount / VF ==> (X * VScale) / (Y * VScale) ==> X / Y. This improves the chances of being able to remove the scalar loop and also fixes an issue where UF=2 is chosen for a scalar loop with exactly VF (= X * VScale) iterations.
1 parent 7057eee commit 809c6eb

File tree

2 files changed

+243
-94
lines changed

2 files changed

+243
-94
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
#include "llvm/Analysis/ProfileSummaryInfo.h"
9494
#include "llvm/Analysis/ScalarEvolution.h"
9595
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96+
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
9697
#include "llvm/Analysis/TargetLibraryInfo.h"
9798
#include "llvm/Analysis/TargetTransformInfo.h"
9899
#include "llvm/Analysis/ValueTracking.h"
@@ -155,6 +156,7 @@
155156
#include <utility>
156157

157158
using namespace llvm;
159+
using namespace SCEVPatternMatch;
158160

159161
#define LV_NAME "loop-vectorize"
160162
#define DEBUG_TYPE LV_NAME
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
418420
/// ElementCount to include loops whose trip count is a function of vscale.
419421
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
420422
const Loop *L) {
421-
return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
423+
if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
424+
return ElementCount::getFixed(ExpectedTC);
425+
426+
const SCEV *BTC = SE->getBackedgeTakenCount(L);
427+
if (isa<SCEVCouldNotCompute>(BTC))
428+
return ElementCount::getFixed(0);
429+
430+
const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
431+
if (isa<SCEVVScale>(ExitCount))
432+
return ElementCount::getScalable(1);
433+
434+
const APInt *Scale;
435+
if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
436+
if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
437+
if (Scale->getActiveBits() <= 32)
438+
return ElementCount::getScalable(Scale->getZExtValue());
439+
440+
return ElementCount::getFixed(0);
422441
}
423442

424443
/// Returns "best known" trip count, which is either a valid positive trip count
@@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) {
25932612
}
25942613
}
25952614

2596-
/// This function attempts to return a value that represents the vectorization
2597-
/// factor at runtime. For fixed-width VFs we know this precisely at compile
2615+
/// This function attempts to return a value that represents the ElementCount
2616+
/// at runtime. For fixed-width VFs we know this precisely at compile
25982617
/// time, but for scalable VFs we calculate it based on an estimate of the
25992618
/// vscale value.
2600-
static unsigned getEstimatedRuntimeVF(ElementCount VF,
2601-
std::optional<unsigned> VScale) {
2619+
static unsigned estimateElementCount(ElementCount VF,
2620+
std::optional<unsigned> VScale) {
26022621
unsigned EstimatedVF = VF.getKnownMinValue();
26032622
if (VF.isScalable())
26042623
if (VScale)
@@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
27082727
// use the value of vscale used for tuning.
27092728
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
27102729
unsigned EstimatedVFxUF =
2711-
getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
2730+
estimateElementCount(VF * UF, Cost->getVScaleForTuning());
27122731
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
27132732
}
27142733

@@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
43374356

43384357
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
43394358
unsigned Width =
4340-
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4359+
estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
43414360
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
43424361
<< " costs: " << (Candidate.Cost / Width));
43434362
if (VF.isScalable())
@@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
44454464
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
44464465
? EpilogueVectorizationMinVF
44474466
: TTI.getEpilogueVectorizationMinVF();
4448-
return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4467+
return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
44494468
MinVFThreshold;
44504469
}
44514470

@@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
44984517
// the main loop handles 8 lanes per iteration. We could still benefit from
44994518
// vectorizing the epilogue loop with VF=4.
45004519
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4501-
getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
4520+
estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
45024521

45034522
ScalarEvolution &SE = *PSE.getSE();
45044523
Type *TCType = Legal->getWidestInductionType();
@@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47454764
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
47464765
}
47474766

4748-
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
4749-
47504767
// Try to get the exact trip count, or an estimate based on profiling data or
47514768
// ConstantMax from PSE, failing that.
4752-
if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
4769+
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4770+
4771+
// For fixed length VFs treat a scalable trip count as unknown.
4772+
if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4773+
// Re-evaluate trip counts and VFs to be in the same numerical space.
4774+
unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
4775+
unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
4776+
47534777
// At least one iteration must be scalar when this constraint holds. So the
47544778
// maximum available iterations for interleaving is one less.
4755-
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4756-
? BestKnownTC->getFixedValue() - 1
4757-
: BestKnownTC->getFixedValue();
4779+
if (requiresScalarEpilogue(VF.isVector()))
4780+
--AvailableTC;
47584781

47594782
unsigned InterleaveCountLB = bit_floor(std::max(
47604783
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
@@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
69256948
// Now compute and add the VPlan-based cost.
69266949
Cost += Plan.cost(VF, CostCtx);
69276950
#ifndef NDEBUG
6928-
unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
6951+
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
69296952
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
69306953
<< " (Estimated cost per lane: ");
69316954
if (Cost.isValid()) {
@@ -9611,7 +9634,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
96119634
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
96129635
// the computations are performed on doubles, not integers and the result
96139636
// is rounded up, hence we get an upper estimate of the TC.
9614-
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
9637+
unsigned IntVF = estimateElementCount(VF.Width, VScale);
96159638
uint64_t RtC = TotalCost.getValue();
96169639
uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
96179640
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);

0 commit comments

Comments
 (0)