|
93 | 93 | #include "llvm/Analysis/ProfileSummaryInfo.h"
|
94 | 94 | #include "llvm/Analysis/ScalarEvolution.h"
|
95 | 95 | #include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
| 96 | +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
96 | 97 | #include "llvm/Analysis/TargetLibraryInfo.h"
|
97 | 98 | #include "llvm/Analysis/TargetTransformInfo.h"
|
98 | 99 | #include "llvm/Analysis/ValueTracking.h"
|
|
155 | 156 | #include <utility>
|
156 | 157 |
|
157 | 158 | using namespace llvm;
|
| 159 | +using namespace SCEVPatternMatch; |
158 | 160 |
|
159 | 161 | #define LV_NAME "loop-vectorize"
|
160 | 162 | #define DEBUG_TYPE LV_NAME
|
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
|
418 | 420 | /// ElementCount to include loops whose trip count is a function of vscale.
|
419 | 421 | static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
|
420 | 422 | const Loop *L) {
|
// Classifies the trip count of L as a fixed ElementCount, a scalable
// ElementCount (a multiple of vscale), or a fixed count of 0 when neither
// form can be established. The line marked '-' below is the pre-change body
// retained by the diff view; the '+' lines are the current implementation.
421 |
| - return ElementCount::getFixed(SE->getSmallConstantTripCount(L)); |
// Prefer the constant trip count whenever ScalarEvolution reports a non-zero
// one.
| 423 | + if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L)) |
| 424 | + return ElementCount::getFixed(ExpectedTC); |
| 425 | + |
// Otherwise fall back to the symbolic backedge-taken count; if it cannot be
// computed, return a fixed count of 0.
| 426 | + const SCEV *BTC = SE->getBackedgeTakenCount(L); |
| 427 | + if (isa<SCEVCouldNotCompute>(BTC)) |
| 428 | + return ElementCount::getFixed(0); |
| 429 | + |
// NOTE(review): despite its name, ExitCount holds the result of
// getTripCountFromExitCount(), evaluated in the type of BTC.
| 430 | + const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L); |
// The expression is exactly vscale: scalable count with a known minimum of 1.
| 431 | + if (isa<SCEVVScale>(ExitCount)) |
| 432 | + return ElementCount::getScalable(1); |
| 433 | + |
// Accept `Scale * vscale` only when the multiply is flagged no-unsigned-wrap
// and the constant has at most 32 active bits; the constant then becomes the
// known minimum of the scalable count.
| 434 | + const APInt *Scale; |
| 435 | + if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale()))) |
| 436 | + if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap()) |
| 437 | + if (Scale->getActiveBits() <= 32) |
| 438 | + return ElementCount::getScalable(Scale->getZExtValue()); |
| 439 | + |
// Any other expression shape: same fixed-0 result as the could-not-compute
// case above (presumably read as "unknown" by callers; confirm at call sites).
| 440 | + return ElementCount::getFixed(0); |
422 | 441 | }
|
423 | 442 |
|
424 | 443 | /// Returns "best known" trip count, which is either a valid positive trip count
|
@@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) {
|
2593 | 2612 | }
|
2594 | 2613 | }
|
2595 | 2614 |
|
2596 |
| -/// This function attempts to return a value that represents the vectorization |
2597 |
| -/// factor at runtime. For fixed-width VFs we know this precisely at compile |
| 2615 | +/// This function attempts to return a value that represents the ElementCount |
| 2616 | +/// at runtime. For fixed-width VFs we know this precisely at compile |
2598 | 2617 | /// time, but for scalable VFs we calculate it based on an estimate of the
|
2599 | 2618 | /// vscale value.
|
2600 |
| -static unsigned getEstimatedRuntimeVF(ElementCount VF, |
2601 |
| - std::optional<unsigned> VScale) { |
| 2619 | +static unsigned estimateElementCount(ElementCount VF, |
| 2620 | + std::optional<unsigned> VScale) { |
2602 | 2621 | unsigned EstimatedVF = VF.getKnownMinValue();
|
2603 | 2622 | if (VF.isScalable())
|
2604 | 2623 | if (VScale)
|
@@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
|
2708 | 2727 | // use the value of vscale used for tuning.
|
2709 | 2728 | Loop *VectorLoop = LI->getLoopFor(HeaderBB);
|
2710 | 2729 | unsigned EstimatedVFxUF =
|
2711 |
| - getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning()); |
| 2730 | + estimateElementCount(VF * UF, Cost->getVScaleForTuning()); |
2712 | 2731 | setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
|
2713 | 2732 | }
|
2714 | 2733 |
|
@@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
|
4337 | 4356 |
|
4338 | 4357 | VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
|
4339 | 4358 | unsigned Width =
|
4340 |
| - getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); |
| 4359 | + estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); |
4341 | 4360 | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
|
4342 | 4361 | << " costs: " << (Candidate.Cost / Width));
|
4343 | 4362 | if (VF.isScalable())
|
@@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
|
4445 | 4464 | unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
|
4446 | 4465 | ? EpilogueVectorizationMinVF
|
4447 | 4466 | : TTI.getEpilogueVectorizationMinVF();
|
4448 |
| - return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= |
| 4467 | + return estimateElementCount(VF * Multiplier, VScaleForTuning) >= |
4449 | 4468 | MinVFThreshold;
|
4450 | 4469 | }
|
4451 | 4470 |
|
@@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
|
4498 | 4517 | // the main loop handles 8 lanes per iteration. We could still benefit from
|
4499 | 4518 | // vectorizing the epilogue loop with VF=4.
|
4500 | 4519 | ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
|
4501 |
| - getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); |
| 4520 | + estimateElementCount(MainLoopVF, CM.getVScaleForTuning())); |
4502 | 4521 |
|
4503 | 4522 | ScalarEvolution &SE = *PSE.getSE();
|
4504 | 4523 | Type *TCType = Legal->getWidestInductionType();
|
@@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
|
4745 | 4764 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
|
4746 | 4765 | }
|
4747 | 4766 |
|
4748 |
| - unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); |
4749 |
| - |
4750 | 4767 | // Try to get the exact trip count, or an estimate based on profiling data or
|
4751 | 4768 | // ConstantMax from PSE, failing that.
|
4752 |
| - if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) { |
| 4769 | + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); |
| 4770 | + |
| 4771 | + // For fixed length VFs treat a scalable trip count as unknown. |
| 4772 | + if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { |
| 4773 | + // Re-evaluate trip counts and VFs to be in the same numerical space. |
| 4774 | + unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); |
| 4775 | + unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); |
| 4776 | + |
4753 | 4777 | // At least one iteration must be scalar when this constraint holds. So the
|
4754 | 4778 | // maximum available iterations for interleaving is one less.
|
4755 |
| - unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) |
4756 |
| - ? BestKnownTC->getFixedValue() - 1 |
4757 |
| - : BestKnownTC->getFixedValue(); |
| 4779 | + if (requiresScalarEpilogue(VF.isVector())) |
| 4780 | + --AvailableTC; |
4758 | 4781 |
|
4759 | 4782 | unsigned InterleaveCountLB = bit_floor(std::max(
|
4760 | 4783 | 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
|
@@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
|
6925 | 6948 | // Now compute and add the VPlan-based cost.
|
6926 | 6949 | Cost += Plan.cost(VF, CostCtx);
|
6927 | 6950 | #ifndef NDEBUG
|
6928 |
| - unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); |
| 6951 | + unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); |
6929 | 6952 | LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
|
6930 | 6953 | << " (Estimated cost per lane: ");
|
6931 | 6954 | if (Cost.isValid()) {
|
@@ -9611,7 +9634,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
|
9611 | 9634 | // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
|
9612 | 9635 | // the computations are performed on doubles, not integers and the result
|
9613 | 9636 | // is rounded up, hence we get an upper estimate of the TC.
|
9614 |
| - unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); |
| 9637 | + unsigned IntVF = estimateElementCount(VF.Width, VScale); |
9615 | 9638 | uint64_t RtC = TotalCost.getValue();
|
9616 | 9639 | uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
|
9617 | 9640 | uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
|
|
0 commit comments