Commit ac36655

[VPlan] Materialize vector trip count using VPInstructions.
Materialize the vector trip count computation using VPInstruction instead of directly creating IR. This is one of the last few steps needed to model the full vector skeleton in VPlan. It also simplifies vector trip count computations for scalable vectors, as we can re-use the VF x UF computation. The underlying value of the vector trip count VPValue needs to be reset to the generated IR value for the vector trip count, to keep legacy epilogue skeleton generation working; once the full skeleton is modeled in VPlan, we will be able to handle the epilogue skeleton in VPlan as well.
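In IR terms, the materialized computation keeps the shape the removed IRBuilder code produced in the vector preheader. A minimal sketch, with %n standing for the original trip count and %step for VF x UF (names and values illustrative, not from a specific test):

  vector.ph:
    %step.minus.1 = sub i64 %step, 1          ; hypothetical name
    %n.rnd.up = add i64 %n, %step.minus.1     ; only when folding the tail by masking
    %n.mod.vf = urem i64 %n.rnd.up, %step
    %n.vec    = sub i64 %n.rnd.up, %n.mod.vf  ; the vector trip count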
1 parent: 19803d8

135 files changed: +1,846 / −3,787 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 13 additions & 67 deletions
@@ -548,9 +548,6 @@ class InnerLoopVectorizer {
 protected:
   friend class LoopVectorizationPlanner;

-  /// Returns (and creates if needed) the trip count of the widened loop.
-  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
-
   // Create a check to see if the vector loop should be executed
   Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;

@@ -602,6 +599,7 @@ class InnerLoopVectorizer {
   /// vector elements.
   ElementCount VF;

+public:
   ElementCount MinProfitableTripCount;

   /// The vectorization unroll factor to use. Each scalar is vectorized to this
@@ -2279,56 +2277,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }

-Value *
-InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
-  if (VectorTripCount)
-    return VectorTripCount;
-
-  Value *TC = getTripCount();
-  IRBuilder<> Builder(InsertBlock->getTerminator());
-
-  Type *Ty = TC->getType();
-  // This is where we can make the step a runtime constant.
-  Value *Step = createStepForVF(Builder, Ty, VF, UF);
-
-  // If the tail is to be folded by masking, round the number of iterations N
-  // up to a multiple of Step instead of rounding down. This is done by first
-  // adding Step-1 and then rounding down. Note that it's ok if this addition
-  // overflows: the vector induction variable will eventually wrap to zero given
-  // that it starts at zero and its Step is a power of two; the loop will then
-  // exit, with the last early-exit vector comparison also producing all-true.
-  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
-  // is accounted for in emitIterationCountCheck that adds an overflow check.
-  if (Cost->foldTailByMasking()) {
-    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
-           "VF*UF must be a power of 2 when folding tail by masking");
-    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
-                           "n.rnd.up");
-  }
-
-  // Now we need to generate the expression for the part of the loop that the
-  // vectorized body will execute. This is equal to N - (N % Step) if scalar
-  // iterations are not required for correctness, or N - Step, otherwise. Step
-  // is equal to the vectorization factor (number of SIMD elements) times the
-  // unroll factor (number of SIMD instructions).
-  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
-  // There are cases where we *must* run at least one iteration in the remainder
-  // loop. See the cost model for when this can happen. If the step evenly
-  // divides the trip count, we set the remainder to be equal to the step. If
-  // the step does not evenly divide the trip count, no adjustment is necessary
-  // since there will already be scalar iterations. Note that the minimum
-  // iterations check ensures that N >= Step.
-  if (Cost->requiresScalarEpilogue(VF.isVector())) {
-    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
-    R = Builder.CreateSelect(IsZero, Step, R);
-  }
-
-  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
-  return VectorTripCount;
-}
-
 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
@@ -7319,6 +7267,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // Canonicalize EVL loops after regions are dissolved.
   VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
   VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+  VPlanTransforms::materializeVectorTripCount(
+      BestVPlan, VectorPH, CM.foldTailByMasking(),
+      CM.requiresScalarEpilogue(BestVF.isVector()));

   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7375,8 +7326,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   //===------------------------------------------------===//

   // 2. Copy and widen instructions from the old loop into the new loop.
-  BestVPlan.prepareToExecute(
-      ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+  BestVPlan.prepareToExecute(State);
   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);

   // Move check blocks to their final position.
@@ -7496,7 +7446,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
   emitIterationCountCheck(LoopScalarPreHeader, false);

   // Generate the induction variable.
-  EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
   return LoopVectorPreHeader;
@@ -9376,13 +9325,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
       cast_if_present<BinaryOperator>(FPBinOp));
   DerivedIV->setName(Name);
-  // If index is the vector trip count, the concrete value will only be set in
-  // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
-  // TODO: Remove the special case for the vector trip count once it is computed
-  // in VPlan and can be used during VPlan simplification.
-  assert((DerivedIV != Index ||
-          getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
-         "IV didn't need transforming?");
   State.set(this, DerivedIV, VPLane(0));
 }

@@ -10272,8 +10214,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {

   // TODO: Move to general VPlan pipeline once epilogue loops are also
   // supported.
-  VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
-                           BestPlan, VF.Width, IC, PSE);
+  VPlanTransforms::runPass(
+      VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
+      VF.Width, IC, PSE);

   LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

@@ -10312,6 +10255,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // edges from the first pass.
   EPI.MainLoopVF = EPI.EpilogueVF;
   EPI.MainLoopUF = EPI.EpilogueUF;
+  EPI.VectorTripCount =
+      BestMainPlan->getVectorTripCount().getLiveInIRValue();
   EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                            ORE, EPI, &CM, BFI, PSI,
                                            Checks, BestEpiPlan);
@@ -10344,8 +10289,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                            Checks, BestPlan);
   // TODO: Move to general VPlan pipeline once epilogue loops are also
   // supported.
-  VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
-                           BestPlan, VF.Width, IC, PSE);
+  VPlanTransforms::runPass(
+      VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
+      VF.Width, IC, PSE);

   LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
   ++LoopsVectorized;
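To make the materialized arithmetic concrete, here is a worked example under assumed values (VF = 4, UF = 2, so Step = 8; the numbers are illustrative, not taken from any test):

  N = 42, scalar epilogue not required:
    n.mod.vf = 42 urem 8 = 2
    n.vec    = 42 - 2    = 40   ; 2 iterations remain for the scalar loop

  N = 40, scalar epilogue required:
    n.mod.vf = 40 urem 8 = 0    ; the select bumps the remainder to Step
    n.vec    = 40 - 8    = 32   ; 8 iterations are forced into the remainder loop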

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 12 additions & 8 deletions
@@ -951,15 +951,9 @@ VPlan::~VPlan() {
   delete BackedgeTakenCount;
 }

-void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
-  if (!VectorTripCount.getUnderlyingValue())
-    VectorTripCount.setUnderlyingValue(VectorTripCountV);
-  else
-    assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
-           "VectorTripCount set earlier must much VectorTripCountV");
-
+void VPlan::prepareToExecute(VPTransformState &State) {
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
-  Type *TCTy = VectorTripCountV->getType();
+  Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount());
   // FIXME: Model VF * UF computation completely in VPlan.
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
@@ -1023,6 +1017,16 @@ void VPlan::execute(VPTransformState *State) {
     Block->execute(State);

   State->CFG.DTU.flush();
+  auto *ScalarPH = getScalarPreheader();
+
+  // Set the underlying value of the vector trip count VPValue to the generated
+  // vector trip count by accessing the incoming value from the resume phi.
+  // TODO: This is needed, as the epilogue skeleton generation code needs to
+  // access the IR value. Remove once skeleton is modeled completely in VPlan.
+  if (!getVectorTripCount().getLiveInIRValue() &&
+      ScalarPH->getNumPredecessors() > 0)
+    getVectorTripCount().setUnderlyingValue(
+        State->get(cast<VPPhi>(&*ScalarPH->begin())->getOperand(0), true));

   VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT);
   if (!Header)
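For context, the incoming value read from the resume phi above is the vector trip count flowing from the middle block into the scalar preheader. A hand-written sketch of the usual skeleton IR (value names hypothetical, following the vectorizer's customary naming):

  middle.block:
    %cmp.n = icmp eq i64 %n, %n.vec
    br i1 %cmp.n, label %exit, label %scalar.ph

  scalar.ph:
    %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]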

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 1 addition & 1 deletion
@@ -3958,7 +3958,7 @@ class VPlan {
   }

   /// Prepare the plan for execution, setting up the required live-in values.
-  void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
+  void prepareToExecute(VPTransformState &State);

   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 57 additions & 1 deletion
@@ -3153,7 +3153,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   }
 }

-void VPlanTransforms::materializeVectorTripCount(
+void VPlanTransforms::materializeConstantVectorTripCount(
     VPlan &Plan, ElementCount BestVF, unsigned BestUF,
     PredicatedScalarEvolution &PSE) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
@@ -3191,6 +3191,62 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
   BTC->replaceAllUsesWith(TCMO);
 }

+void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
+                                                 VPBasicBlock *VectorPHVPBB,
+                                                 bool TailByMasking,
+                                                 bool RequiresScalarEpilogue) {
+  VPValue &VectorTC = Plan.getVectorTripCount();
+  if (VectorTC.getNumUsers() == 0 ||
+      (VectorTC.isLiveIn() && VectorTC.getLiveInIRValue()))
+    return;
+  VPValue *TC = Plan.getTripCount();
+  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
+  VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
+
+  VPValue *Step = &Plan.getVFxUF();
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+  // is accounted for in emitIterationCountCheck that adds an overflow check.
+  if (TailByMasking) {
+    TC = Builder.createNaryOp(
+        Instruction::Add,
+        {TC, Builder.createNaryOp(
+                 Instruction::Sub,
+                 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+        DebugLoc::getUnknown(), "n.rnd.up");
+  }
+
+  // Now we need to generate the expression for the part of the loop that the
+  // vectorized body will execute. This is equal to N - (N % Step) if scalar
+  // iterations are not required for correctness, or N - Step, otherwise. Step
+  // is equal to the vectorization factor (number of SIMD elements) times the
+  // unroll factor (number of SIMD instructions).
+  VPValue *R = Builder.createNaryOp(Instruction::URem, {TC, Step},
+                                    DebugLoc::getUnknown(), "n.mod.vf");
+
+  // There are cases where we *must* run at least one iteration in the remainder
+  // loop. See the cost model for when this can happen. If the step evenly
+  // divides the trip count, we set the remainder to be equal to the step. If
+  // the step does not evenly divide the trip count, no adjustment is necessary
+  // since there will already be scalar iterations. Note that the minimum
+  // iterations check ensures that N >= Step.
+  if (RequiresScalarEpilogue) {
+    auto *IsZero = Builder.createICmp(
+        CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+    R = Builder.createSelect(IsZero, Step, R);
+  }
+
+  auto Res = Builder.createNaryOp(Instruction::Sub, {TC, R},
+                                  DebugLoc::getUnknown(), "n.vec");
+  Plan.getVectorTripCount().replaceAllUsesWith(Res);
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
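Printed as a VPlan, the recipes this transform emits into the vector preheader look roughly as follows for the tail-folded case (an approximate rendering of VPlan debug output; value ids made up):

  vector.ph:
    EMIT vp<%1> = sub vp<%VFxUF>, ir<1>
    EMIT vp<%n.rnd.up> = add ir<%n>, vp<%1>
    EMIT vp<%n.mod.vf> = urem vp<%n.rnd.up>, vp<%VFxUF>
    EMIT vp<%n.vec> = sub vp<%n.rnd.up>, vp<%n.mod.vf>

Because these are ordinary VPInstructions, later VPlan simplifications can fold them (e.g. when the trip count is a known constant), which is what removes the redundant computations in the tests below.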

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 10 additions & 3 deletions
@@ -252,9 +252,16 @@ struct VPlanTransforms {

   // Materialize vector trip counts for constants early if it can simply be
   // computed as (Original TC / VF * UF) * VF * UF.
-  static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE);
+  static void
+  materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF,
+                                     unsigned BestUF,
+                                     PredicatedScalarEvolution &PSE);
+
+  // Materialize vector trip count computations to a set of VPInstructions.
+  static void materializeVectorTripCount(VPlan &Plan,
+                                         VPBasicBlock *VectorPHVPBB,
+                                         bool TailByMasking,
+                                         bool RequiresScalarEpilogue);

   /// Materialize the backedge-taken count to be computed explicitly using
   /// VPInstructions.
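As a quick illustration of the constant case described in the comment above (numbers made up): with an original trip count of 100, VF = 4 and UF = 2, materializeConstantVectorTripCount produces (100 / (4 * 2)) * (4 * 2) = 96, i.e. 12 full vector iterations, leaving 4 scalar remainder iterations.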

llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll

Lines changed: 4 additions & 16 deletions
@@ -9,19 +9,13 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -34,7 +28,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
 ; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -92,19 +86,13 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -117,7 +105,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
 ; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
