From 0e19e9e8a3a06bfea0e83e6c33adba8dde187031 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Tue, 5 Aug 2025 17:31:58 +0800
Subject: [PATCH 1/3] [VPlan] Support VPWidenPointerInductionRecipes with EVL
 tail folding

Now that VPWidenPointerInductionRecipes are modelled in VPlan in #148274, we
can support them in EVL tail folding.

We need to replace their VFxUF operand with EVL, since the increment is not
guaranteed to be VF on the penultimate iteration, and UF is always 1 with EVL
tail folding. We also need to move the creation of the backedge value to the
latch so that EVL dominates it.

With this we will no longer fail to convert a VPlan to EVL tail folding, so
adjust tryAddExplicitVectorLength to account for this.

This brings us to 99.4% of all vector loops vectorized with tail folding on
SPEC CPU 2017, versus no tail folding.

The test in only-compute-cost-for-vplan-vfs.ll previously relied on widened
pointer inductions with EVL tail folding to end up in a scenario with no
vector VPlans, so this replaces it with an unvectorizable fixed-order
recurrence test from first-order-recurrence-multiply-recurrences.ll that also
gets discarded.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  8 ++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 25 +++++-----
 .../Transforms/Vectorize/VPlanTransforms.h    |  7 ++-
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  2 +-
 .../LoopVectorize/AArch64/sve-widen-gep.ll    |  4 +-
 .../LoopVectorize/AArch64/sve-widen-phi.ll    |  4 +-
 .../RISCV/evl-compatible-loops.ll             | 48 ++++++++++++++++---
 .../RISCV/only-compute-cost-for-vplan-vfs.ll  | 43 ++++++++++-------
 .../LoopVectorize/RISCV/strided-accesses.ll   |  6 +--
 .../LoopVectorize/pointer-induction.ll        |  2 +-
 .../LoopVectorize/scev-predicate-reasoning.ll | 16 +++----
 11 files changed, 103 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb0e0fd7b3d8e..57601b905c8d8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8450,11 +8450,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
       // TODO: try to put it close to addActiveLaneMask().
-      // Discard the plan if it is not EVL-compatible
-      if (CM.foldTailWithEVL() && !HasScalarVF &&
-          !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
-                                    *Plan, CM.getMaxSafeElements()))
-        break;
+      if (CM.foldTailWithEVL() && !HasScalarVF)
+        VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
+                                 *Plan, CM.getMaxSafeElements());
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a7965a053e6e3..b64b573540074 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2177,8 +2177,18 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   assert(all_of(Plan.getVF().users(),
                 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                         VPWidenIntOrFpInductionRecipe>) &&
+         all_of(Plan.getVFxUF().users(),
+                [&Plan](VPUser *U) {
+                  return match(U, m_c_Binary<Instruction::Add>(
+                                      m_Specific(Plan.getCanonicalIV()),
+                                      m_Specific(&Plan.getVFxUF()))) ||
+                         isa<VPWidenPointerInductionRecipe>(U);
+                }) &&
          "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceAllUsesWith(&EVL);
+  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    return isa<VPWidenPointerInductionRecipe>(U);
+  });
 
   // Defer erasing recipes till the end so that we don't invalidate the
   // VPTypeAnalysis cache.
@@ -2315,16 +2325,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 /// %NextAVL = sub IVSize nuw %AVL, %OpEVL
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(
+void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  // The transform updates all users of inductions to work based on EVL, instead
-  // of the VF directly. At the moment, widened pointer inductions cannot be
-  // updated, so bail out if the plan contains any.
-  bool ContainsWidenPointerInductions =
-      any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
-  if (ContainsWidenPointerInductions)
-    return false;
 
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   auto *CanIVTy = CanonicalIVPHI->getScalarType();
@@ -2379,7 +2382,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
   // TODO: support unroll factor > 1.
   Plan.setUF(1);
-  return true;
 }
 
 void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
@@ -2803,13 +2805,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
   R->replaceAllUsesWith(PtrAdd);
 
   // Create the backedge value for the scalar pointer phi.
-  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
                                        DL);
   VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
 
-  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
-  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VPValue *InductionGEP =
       Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
   ScalarPtrPhi->addOperand(InductionGEP);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5943684e17a76..7adfdf549a5ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -177,10 +177,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
-  /// \returns true if the transformation succeeds, or false if it doesn't.
-  static bool
-  tryAddExplicitVectorLength(VPlan &Plan,
-                             const std::optional<unsigned> &MaxEVLSafeElements);
+  static void
+  addExplicitVectorLength(VPlan &Plan,
+                          const std::optional<unsigned> &MaxEVLSafeElements);
 
   // For each Interleave Group in \p InterleaveGroups replace the Recipes
   // widening its memory instructions with a single VPInterleaveRecipe at its
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 14ae4f2204310..3417e1c8dc1ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
             return VerifyEVLUse(*S, S->getNumOperands() - 1);
           })
-          .Case<VPWidenIntOrFpInductionRecipe>(
+          .Case<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
               [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
           .Case([&](auto *R) {
             if (R->getNumOperands() != 3) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 9929f35d47dac..5c6328e51760d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -35,7 +35,6 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT: [[TMP15:%.*]] = mul <vscale x 2 x i64> [[TMP13]], splat (i64 1)
 ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP15]]
@@ -48,6 +47,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 ; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], ptr [[TMP18]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -119,7 +119,6 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 2 x i64> [[TMP12]], splat (i64 1)
 ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP14]]
@@ -128,6 +127,7 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 ; CHECK-NEXT: store <vscale x 2 x i8> [[TMP17]], ptr [[TMP15]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 6947884efb699..2c88e0e744fac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -239,7 +239,6 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 2 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP10]]
@@ -250,6 +249,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP12]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT: store <vscale x 2 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -313,7 +313,6 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[TMP4]], splat (i64 1)
 ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP5]]
@@ -321,6 +320,7 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index 5f13089ff17fd..369da05015af8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -74,16 +74,50 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; CHECK-LABEL: define void @test_wide_ptr_induction(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 8)
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[VECTOR_GEP]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 8, [[TMP10]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[B]], [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[VECTOR_BODY]] ], [ [[B]], [[VECTOR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
@@ -109,4 +143,6 @@ for.cond.cleanup:
 ; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
 ; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
index 0afe04e610bce..07a7b7b1fcc1c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
@@ -1,29 +1,36 @@
-; RUN: opt -passes=loop-vectorize \
-; RUN:   -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
-; RUN:   -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; Make sure we do not vectorize a loop with a widened pointer induction.
-define void @test_wide_pointer_induction(ptr noalias %a, i64 %N) {
+; For %for.1, we are fine initially, because the previous value %for.1.next dominates the
+; user of %for.1. But for %for.2, we have to sink the user (%for.1.next) past the previous
+; value %for.2.next. This however breaks the condition we have for %for.1. We cannot fix
+; both first order recurrences and cannot vectorize the loop.
+;
+; Make sure we don't compute costs if there are no vector VPlans.
+
 ; CHECK-NOT: LV: Vector loop of width {{.+}} costs:
 ;
-; CHECK: define void @test_wide_pointer_induction(
+; CHECK: define i32 @test(
 ; CHECK-NOT: vector.body
 ;
+define i32 @test(i32 %N) {
 entry:
-  br label %loop
+  br label %for.body
 
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %iv.ptr = phi ptr [ %a, %entry ], [ %iv.ptr.next, %loop ]
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
-  store ptr %iv.ptr, ptr %arrayidx, align 8
-  %iv.next = add nuw nsw i64 %iv, 1
-  %iv.ptr.next = getelementptr i64, ptr %iv.ptr, i32 1
-  %exitcond.not = icmp eq i64 %iv.next, %N
-  br i1 %exitcond.not, label %exit, label %loop
+for.body:                                  ; preds = %for.body.preheader, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 10, %entry ]
+  %for.1 = phi i32 [ %for.1.next, %for.body ], [ 20, %entry ]
+  %for.2 = phi i32 [ %for.2.next, %for.body ], [ 11, %entry ]
+  %for.1.next = add nsw i32 %for.2, 1
+  %for.2.next = shl i32 %for.1, 24
+  %inc = add nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond1.for.end_crit_edge, label %for.body
 
-exit:
-  ret void
+for.cond1.for.end_crit_edge:               ; preds = %for.body
+  %add.lcssa = phi i32 [ %for.1.next, %for.body ]
+  %sext.lcssa = phi i32 [ %for.2.next, %for.body ]
+  %res = add i32 %add.lcssa, %sext.lcssa
+  ret i32 %res
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 9e492c62a5577..df907dcbd4606 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -170,7 +170,6 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP8]]
 ; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP16:%.*]] = mul <vscale x 4 x i64> [[TMP14]], splat (i64 8)
 ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP16]]
@@ -181,6 +180,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[TMP19]], splat (i32 1)
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP8]]
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -753,13 +753,11 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT: [[POINTER_PHI11:%.*]] = phi ptr [ [[P2]], [[VECTOR_PH]] ], [ [[PTR_IND12:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; STRIDED-NEXT: [[TMP18:%.*]] = mul <vscale x 4 x i64> [[TMP19]], [[DOTSPLAT10]]
 ; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT: [[TMP27:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i64> [[TMP27]], [[DOTSPLAT10]]
 ; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP21]]
@@ -767,7 +765,9 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]]
 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
+; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP17]]
 ; STRIDED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; STRIDED-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 69931a0774883..d2c53f47a6670 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -231,7 +231,6 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
 ; STRIDED: vector.body:
 ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP1]], 4
 ; STRIDED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
 ; STRIDED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; STRIDED-NEXT: [[TMP4:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[DOTSPLAT]]
@@ -240,6 +239,7 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
 ; STRIDED-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[CALL:%.*]], i32 [[OFFSET_IDX]]
 ; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4
 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP1]], 4
 ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
 ; STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; STRIDED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index b2acc6470da75..77f2fc587cc9e 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -96,17 +96,17 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP3]]
 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:

From 4b319634b9d8a15aa4f83d8761beee588ac70065 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 6 Aug 2025 13:01:43 +0800
Subject: [PATCH 2/3] Add comments

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b64b573540074..a9cd61b99191c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2179,6 +2179,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                         VPWidenIntOrFpInductionRecipe>) &&
          all_of(Plan.getVFxUF().users(),
                 [&Plan](VPUser *U) {
+                  // VFxUF users must either be VPWidenPointerInductionRecipe or
+                  // canonical IV increment.
                   return match(U, m_c_Binary<Instruction::Add>(
                                       m_Specific(Plan.getCanonicalIV()),
                                       m_Specific(&Plan.getVFxUF()))) ||
@@ -2185,7 +2187,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                          isa<VPWidenPointerInductionRecipe>(U);
                 }) &&
          "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceAllUsesWith(&EVL);
   Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    // Don't replace the canonical IV increment.
     return isa<VPWidenPointerInductionRecipe>(U);
   });

From cf4197299ecfdb0e71dcd1e1923e83926471f0fb Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 6 Aug 2025 18:02:53 +0800
Subject: [PATCH 3/3] Break up into two separate asserts, update comment

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a9cd61b99191c..bb0de1f0157a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2177,19 +2177,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   assert(all_of(Plan.getVF().users(),
                 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                         VPWidenIntOrFpInductionRecipe>) &&
-         all_of(Plan.getVFxUF().users(),
+         "User of VF that we can't transform to EVL.");
+  Plan.getVF().replaceAllUsesWith(&EVL);
+
+  assert(all_of(Plan.getVFxUF().users(),
                 [&Plan](VPUser *U) {
-                  // VFxUF users must either be VPWidenPointerInductionRecipe or
-                  // canonical IV increment.
                   return match(U, m_c_Binary<Instruction::Add>(
                                       m_Specific(Plan.getCanonicalIV()),
                                       m_Specific(&Plan.getVFxUF()))) ||
                          isa<VPWidenPointerInductionRecipe>(U);
                 }) &&
-         "User of VF that we can't transform to EVL.");
-  Plan.getVF().replaceAllUsesWith(&EVL);
+         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
+         "increment of the canonical induction.");
   Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
-    // Don't replace the canonical IV increment.
+    // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
+    // canonical induction must not be updated.
     return isa<VPWidenPointerInductionRecipe>(U);
   });
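
--

Editor's note (illustration, not part of the patches): the shape of the
transformed vector body this series enables, distilled from the
test_wide_ptr_induction CHECK lines in evl-compatible-loops.ll above. This is
a hand-written sketch with illustrative value names, not tool output. The key
point is that the backedge value of the pointer induction is now created in
the latch, where the EVL dominates it, and the increment is stride * EVL
rather than stride * VFxUF:

  vector.body:
    %pointer.phi = phi ptr [ %b, %vector.ph ], [ %ptr.ind, %vector.body ]
    %avl = phi i64 [ %n, %vector.ph ], [ %avl.next, %vector.body ]
    ; per-lane addresses: base + step vector scaled by the stride (8 here)
    %steps = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
    %offsets = mul <vscale x 2 x i64> %steps, splat (i64 8)
    %vector.gep = getelementptr i8, ptr %pointer.phi, <vscale x 2 x i64> %offsets
    %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
    ; ... EVL-predicated vp.load/vp.store using %evl ...
    %evl.zext = zext i32 %evl to i64
    %avl.next = sub nuw i64 %avl, %evl.zext
    ; backedge value created in the latch: increment is 8 * EVL, which need
    ; not equal 8 * VF on the final iteration
    %inc = mul i64 8, %evl.zext
    %ptr.ind = getelementptr i8, ptr %pointer.phi, i64 %inc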