Skip to content

[LLVM][LV] Improve UF calculation for vscale based scalar loops. #146102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 40 additions & 17 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
Expand Down Expand Up @@ -155,6 +156,7 @@
#include <utility>

using namespace llvm;
using namespace SCEVPatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
Expand Down Expand Up @@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  // Prefer an exact constant trip count whenever SCEV reports a non-zero one.
  if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
    return ElementCount::getFixed(ExpectedTC);

  // Without a computable backedge-taken count there is no expression to
  // inspect, so fall back to a fixed count of 0.
  const SCEV *BTC = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return ElementCount::getFixed(0);

  const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);

  // The trip count is exactly vscale.
  if (isa<SCEVVScale>(ExitCount))
    return ElementCount::getScalable(1);

  // The trip count has the form (Scale * vscale). Only accept it when the
  // multiply is flagged no-unsigned-wrap and Scale has at most 32 active bits,
  // so the value passed to getScalable() is not truncated.
  // NOTE(review): assumes ElementCount's minimum value is a 32-bit unsigned;
  // confirm against TypeSize.h.
  const APInt *Scale;
  if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())) &&
      cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap() &&
      Scale->getActiveBits() <= 32)
    return ElementCount::getScalable(Scale->getZExtValue());

  // No recognised constant or vscale-based form.
  return ElementCount::getFixed(0);
}

/// Returns "best known" trip count, which is either a valid positive trip count
Expand Down Expand Up @@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) {
}
}

/// This function attempts to return a value that represents the vectorization
/// factor at runtime. For fixed-width VFs we know this precisely at compile
/// This function attempts to return a value that represents the ElementCount
/// at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
static unsigned getEstimatedRuntimeVF(ElementCount VF,
std::optional<unsigned> VScale) {
static unsigned estimateElementCount(ElementCount VF,
std::optional<unsigned> VScale) {
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable())
if (VScale)
Expand Down Expand Up @@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// use the value of vscale used for tuning.
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
unsigned EstimatedVFxUF =
getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
estimateElementCount(VF * UF, Cost->getVScaleForTuning());
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}

Expand Down Expand Up @@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
Expand Down Expand Up @@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
MinVFThreshold;
}

Expand Down Expand Up @@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));

ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
Expand Down Expand Up @@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);

// Try to get the exact trip count, or an estimate based on profiling data or
// ConstantMax from PSE, failing that.
if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);

// For fixed length VFs treat a scalable trip count as unknown.
if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
// Re-evaluate trip counts and VFs to be in the same numerical space.
unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);

// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
? BestKnownTC->getFixedValue() - 1
: BestKnownTC->getFixedValue();
if (requiresScalarEpilogue(VF.isVector()))
--AvailableTC;

unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
Expand Down Expand Up @@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
Expand Down Expand Up @@ -9611,7 +9634,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
unsigned IntVF = estimateElementCount(VF.Width, VScale);
uint64_t RtC = TotalCost.getValue();
uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
Expand Down
Loading