diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a52aa8420b301..ccd90550ecac2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8450,11 +8450,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                              *Plan, CM.getMinimalBitwidths());
     VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
     // TODO: try to put it close to addActiveLaneMask().
-    // Discard the plan if it is not EVL-compatible
-    if (CM.foldTailWithEVL() && !HasScalarVF &&
-        !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
-                                  *Plan, CM.getMaxSafeElements()))
-      break;
+    if (CM.foldTailWithEVL() && !HasScalarVF)
+      VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
+                               *Plan, CM.getMaxSafeElements());
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a7965a053e6e3..bb0de1f0157a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2180,6 +2180,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
          "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceAllUsesWith(&EVL);
 
+  assert(all_of(Plan.getVFxUF().users(),
+                [&Plan](VPUser *U) {
+                  return match(U, m_c_Binary<Instruction::Add>(
+                                      m_Specific(Plan.getCanonicalIV()),
+                                      m_Specific(&Plan.getVFxUF()))) ||
+                         isa<VPWidenPointerInductionRecipe>(U);
+                }) &&
+         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
+         "increment of the canonical induction.");
+  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
+    // canonical induction must not be updated.
+    return isa<VPWidenPointerInductionRecipe>(U);
+  });
+
   // Defer erasing recipes till the end so that we don't invalidate the
   // VPTypeAnalysis cache.
   SmallVector<VPRecipeBase *> ToErase;
@@ -2315,16 +2330,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 /// %NextAVL = sub IVSize nuw %AVL, %OpEVL
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(
+void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  // The transform updates all users of inductions to work based on EVL, instead
-  // of the VF directly. At the moment, widened pointer inductions cannot be
-  // updated, so bail out if the plan contains any.
-  bool ContainsWidenPointerInductions =
-      any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
-  if (ContainsWidenPointerInductions)
-    return false;
 
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   auto *CanIVTy = CanonicalIVPHI->getScalarType();
@@ -2379,7 +2387,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
   // TODO: support unroll factor > 1.
   Plan.setUF(1);
-  return true;
 }
 
 void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
@@ -2803,13 +2810,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
   R->replaceAllUsesWith(PtrAdd);
 
   // Create the backedge value for the scalar pointer phi.
-  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
                                        TypeInfo.inferScalarType(VF), DL);
   VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
 
-  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
-  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VPValue *InductionGEP =
       Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
   ScalarPtrPhi->addOperand(InductionGEP);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5943684e17a76..7adfdf549a5ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -177,10 +177,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
-  /// \returns true if the transformation succeeds, or false if it doesn't.
-  static bool
-  tryAddExplicitVectorLength(VPlan &Plan,
-                             const std::optional<unsigned> &MaxEVLSafeElements);
+  static void
+  addExplicitVectorLength(VPlan &Plan,
+                          const std::optional<unsigned> &MaxEVLSafeElements);
 
   // For each Interleave Group in \p InterleaveGroups replace the Recipes
   // widening its memory instructions with a single VPInterleaveRecipe at its
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 14ae4f2204310..3417e1c8dc1ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
             return VerifyEVLUse(*S, S->getNumOperands() - 1);
           })
           .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
-                VPWidenIntOrFpInductionRecipe>(
+                VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
               [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
           .Case<VPScalarIVStepsRecipe>([&](auto *R) {
             if (R->getNumOperands() != 3) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 9929f35d47dac..5c6328e51760d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -35,7 +35,6 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul <vscale x 2 x i64> [[TMP13]], splat (i64 1)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP15]]
@@ -48,6 +47,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 ; CHECK-NEXT:    store <vscale x 2 x i8> [[TMP20]], ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -119,7 +119,6 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 2 x i64> [[TMP12]], splat (i64 1)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP14]]
@@ -128,6 +127,7 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 ; CHECK-NEXT:    store <vscale x 2 x i8> [[TMP17]], ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 6947884efb699..2c88e0e744fac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -239,7 +239,6 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP10]]
@@ -250,6 +249,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP12]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    store <vscale x 2 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -313,7 +313,6 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <vscale x 2 x i64> [[TMP4]], splat (i64 1)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP5]]
@@ -321,6 +320,7 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index 5f13089ff17fd..369da05015af8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -74,16 +74,50 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; CHECK-LABEL: define void @test_wide_ptr_induction(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 8)
+; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[VECTOR_GEP]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 8, [[TMP10]]
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[B]], [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[VECTOR_BODY]] ], [ [[B]], [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;
@@ -109,4 +143,6 @@ for.cond.cleanup:
 ; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
 ; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
index 0afe04e610bce..07a7b7b1fcc1c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
@@ -1,29 +1,36 @@
-; RUN: opt -passes=loop-vectorize \
-; RUN:   -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
-; RUN:   -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; Make sure we do not vectorize a loop with a widened pointer induction.
-define void @test_wide_pointer_induction(ptr noalias %a, i64 %N) {
+; For %for.1, we are fine initially, because the previous value %for.1.next dominates the
+; user of %for.1. But for %for.2, we have to sink the user (%for.1.next) past the previous
+; value %for.2.next. This however breaks the condition we have for %for.1. We cannot fix
+; both first order recurrences and cannot vectorize the loop.
+;
+; Make sure we don't compute costs if there are no vector VPlans.
+
 ; CHECK-NOT: LV: Vector loop of width {{.+}} costs:
 ;
-; CHECK: define void @test_wide_pointer_induction(
+; CHECK: define i32 @test(
 ; CHECK-NOT: vector.body
 ;
+define i32 @test(i32 %N) {
 entry:
-  br label %loop
+  br label %for.body
 
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %iv.ptr = phi ptr [ %a, %entry ], [ %iv.ptr.next, %loop ]
-  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
-  store ptr %iv.ptr, ptr %arrayidx, align 8
-  %iv.next = add nuw nsw i64 %iv, 1
-  %iv.ptr.next = getelementptr i64, ptr %iv.ptr, i32 1
-  %exitcond.not = icmp eq i64 %iv.next, %N
-  br i1 %exitcond.not, label %exit, label %loop
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i32 [ %inc, %for.body ], [ 10, %entry ]
+  %for.1 = phi i32 [ %for.1.next, %for.body ], [ 20, %entry ]
+  %for.2 = phi i32 [ %for.2.next, %for.body ], [ 11, %entry ]
+  %for.1.next = add nsw i32 %for.2, 1
+  %for.2.next = shl i32 %for.1, 24
+  %inc = add nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond1.for.end_crit_edge, label %for.body
 
-exit:
-  ret void
+for.cond1.for.end_crit_edge:                      ; preds = %for.body
+  %add.lcssa = phi i32 [ %for.1.next, %for.body ]
+  %sext.lcssa = phi i32 [ %for.2.next, %for.body ]
+  %res = add i32 %add.lcssa, %sext.lcssa
+  ret i32 %res
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 9e492c62a5577..df907dcbd4606 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -170,7 +170,6 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP8]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul <vscale x 4 x i64> [[TMP14]], splat (i64 8)
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP16]]
@@ -181,6 +180,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[TMP19]], splat (i32 1)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP8]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -753,13 +753,11 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[POINTER_PHI11:%.*]] = phi ptr [ [[P2]], [[VECTOR_PH]] ], [ [[PTR_IND12:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[DOTSPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT10:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; STRIDED-NEXT:    [[TMP18:%.*]] = mul <vscale x 4 x i64> [[TMP19]], [[DOTSPLAT10]]
 ; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[TMP27:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i64> [[TMP27]], [[DOTSPLAT10]]
 ; STRIDED-NEXT:    [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP21]]
@@ -767,7 +765,9 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; STRIDED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]]
 ; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
+; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP17]]
 ; STRIDED-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; STRIDED-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 69931a0774883..d2c53f47a6670 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -231,7 +231,6 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], 4
 ; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; STRIDED-NEXT:    [[TMP4:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[DOTSPLAT]]
@@ -240,6 +239,7 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
 ; STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr ptr, ptr [[CALL:%.*]], i32 [[OFFSET_IDX]]
 ; STRIDED-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4
 ; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], 4
 ; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
 ; STRIDED-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; STRIDED-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index b2acc6470da75..77f2fc587cc9e 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -96,17 +96,17 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP3]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block: