diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 13058a159cf95..02a0d1b7a1cde 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4228,6 +4228,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
     case VPDef::VPWidenIntOrFpInductionSC:
     case VPDef::VPWidenPointerInductionSC:
     case VPDef::VPReductionPHISC:
+    case VPDef::VPInterleaveEVLSC:
     case VPDef::VPInterleaveSC:
     case VPDef::VPWidenLoadEVLSC:
     case VPDef::VPWidenLoadSC:
@@ -4256,8 +4257,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       // If no def nor is a store, e.g., branches, continue - no value to check.
       if (R.getNumDefinedValues() == 0 &&
-          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
-              &R))
+          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
         continue;
       // For multi-def recipes, currently only interleaved loads, suffice to
       // check first def only.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c13fe4548ff11..382f375009d36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
+    case VPRecipeBase::VPInterleaveEVLSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
@@ -2371,11 +2372,14 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
   }
 };
 
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
+/// A common base class for interleaved memory operations.
+/// An interleaved memory operation combines multiple strided loads/stores
+/// into a single wide load/store and shuffles.
+/// The first operand must be the address. The optional operands are, in order,
+/// the stored values and the mask.
+/// TODO: Inherit from VPIRMetadata.
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase {
+protected:
   const InterleaveGroup<Instruction> *IG;
 
   /// Indicates if the interleave group is in a conditional block and requires a
@@ -2386,90 +2390,186 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
   /// unusued gaps can be loaded speculatively.
   bool NeedsMaskForGaps = false;
 
-public:
-  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
-                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
-                     bool NeedsMaskForGaps, DebugLoc DL)
-      : VPRecipeBase(VPDef::VPInterleaveSC, {Addr},
-                     DL),
-
-        IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+  VPInterleaveBase(const unsigned char SC,
+                   const InterleaveGroup<Instruction> *IG,
+                   ArrayRef<VPValue *> Operands,
+                   ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                   bool NeedsMaskForGaps, DebugLoc DL)
+      : VPRecipeBase(SC, Operands, DL), IG(IG),
+        NeedsMaskForGaps(NeedsMaskForGaps) {
     // TODO: extend the masked interleaved-group support to reversed access.
     assert((!Mask || !IG->isReverse()) &&
            "Reversed masked interleave-group not supported.");
-    for (unsigned i = 0; i < IG->getFactor(); ++i)
-      if (Instruction *I = IG->getMember(i)) {
-        if (I->getType()->isVoidTy())
+    for (unsigned I = 0; I < IG->getFactor(); ++I)
+      if (Instruction *Inst = IG->getMember(I)) {
+        if (Inst->getType()->isVoidTy())
           continue;
-        new VPValue(I, this);
+        new VPValue(Inst, this);
       }
 
     for (auto *SV : StoredValues)
       addOperand(SV);
+
     if (Mask) {
       HasMask = true;
       addOperand(Mask);
     }
   }
 
-  ~VPInterleaveRecipe() override = default;
-  VPInterleaveRecipe *clone() override {
-    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
-                                  NeedsMaskForGaps, getDebugLoc());
+public:
+  VPInterleaveBase *clone() override {
+    llvm_unreachable("cloning not supported");
   }
 
-  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+           R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
+  }
+
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
 
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const {
     return getOperand(0); // Address is the 1st, mandatory operand.
   }
 
+  /// Return true if the access needs a mask because of the gaps.
+  bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
   VPValue *getMask() const {
-    // Mask is optional and therefore the last, currently 2nd operand.
+    // Mask is optional and the last operand.
     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
   }
 
+  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("VPInterleaveBase should not be instantiated.");
+  }
+
+  /// Return the cost of this recipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  virtual bool onlyFirstLaneUsed(const VPValue *Op) const = 0;
+
+  /// Returns the number of stored operands of this interleave group. Returns 0
+  /// for load interleave groups.
+  virtual unsigned getNumStoreOperands() const = 0;
+
   /// Return the VPValues stored by this interleave group. If it is a load
   /// interleave group, return an empty ArrayRef.
-  ArrayRef<VPValue *> getStoredValues() const {
-    // The first operand is the address, followed by the stored values, followed
-    // by an optional mask.
-    return ArrayRef<VPValue *>(op_begin(), getNumOperands())
-        .slice(1, getNumStoreOperands());
+  virtual ArrayRef<VPValue *> getStoredValues() const = 0;
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of
+/// loads or stores into one wide load/store and shuffles. The first operand
+/// of a VPInterleave recipe is the address, followed by the stored values,
+/// followed by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                     bool NeedsMaskForGaps, DebugLoc DL)
+      : VPInterleaveBase(VPDef::VPInterleaveSC, IG, ArrayRef<VPValue *>({Addr}),
+                         StoredValues, Mask, NeedsMaskForGaps, DL) {}
+
+  ~VPInterleaveRecipe() override = default;
+
+  VPInterleaveRecipe *clone() override {
+    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
+                                  NeedsMaskForGaps, getDebugLoc());
   }
 
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
-  /// Return the cost of this VPInterleaveRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
 
-  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+  }
 
-  /// Returns the number of stored operands of this interleave group. Returns 0
-  /// for load interleave groups.
-  unsigned getNumStoreOperands() const {
+  unsigned getNumStoreOperands() const override {
     return getNumOperands() - (HasMask ? 2 : 1);
   }
 
-  /// The recipe only uses the first lane of the address.
+  ArrayRef<VPValue *> getStoredValues() const override {
+    // The first operand is the address, followed by the stored values,
+    // followed by an optional mask.
+    return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+        .slice(1, getNumStoreOperands());
+  }
+};
+
+/// A recipe for interleaved access operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length. Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask,
+                        DebugLoc DL = {})
+      : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+                         ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+                         R.getStoredValues(), Mask, R.needsMaskForGaps(), DL) {
+    assert(!IG->isReverse() &&
+           "Reversed interleave-group with tail folding is not supported.");
+  }
+
+  ~VPInterleaveEVLRecipe() override = default;
+
+  VPInterleaveEVLRecipe *clone() override {
+    llvm_unreachable("cloning not implemented yet");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+  /// The VPValue of the explicit vector length.
+  VPValue *getEVL() const { return getOperand(1); }
+
+  /// Generate the wide load or store, and shuffles.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// The recipe only uses the first lane of the address and the EVL operand.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+    return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+           Op == getEVL();
   }
 
-  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+  unsigned getNumStoreOperands() const override {
+    return getNumOperands() - (HasMask ? 3 : 2);
+  }
+
+  ArrayRef<VPValue *> getStoredValues() const override {
+    // The first operand is the address and the second operand is the EVL,
+    // followed by the stored values, followed by an optional mask.
+    return ArrayRef<VPValue *>(op_begin(), getNumOperands())
+        .slice(2, getNumStoreOperands());
+  }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 16072f268a98c..db541bc6e53a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -295,7 +295,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           .Case(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
-          .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+          .Case<VPInterleaveRecipe, VPInterleaveEVLRecipe>([V](const auto *R) {
             // TODO: Use info from interleave group.
             return V->getUnderlyingValue()->getType();
           })
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 98d11f0bc7893..74e894759bab8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -53,8 +53,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
   case VPInstructionSC:
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+  case VPInterleaveEVLSC:
   case VPInterleaveSC:
-    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return true;
@@ -108,6 +109,9 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
+  case VPInterleaveEVLSC:
+  case VPInterleaveSC:
+    return cast<VPInterleaveBase>(this)->getNumStoreOperands() == 0;
   case VPReplicateSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayReadFromMemory();
@@ -184,6 +188,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
            "underlying instruction has side-effects");
     return false;
   }
+  case VPInterleaveEVLSC:
   case VPInterleaveSC:
     return mayWriteToMemory();
   case VPWidenLoadEVLSC:
@@ -256,7 +261,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
   Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+  else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
     UI = IG->getInsertPos();
   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
     UI = &WidenMem->getIngredient();
@@ -2091,7 +2096,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
     if (VF.isScalar())
       return TTI::CastContextHint::Normal;
-    if (isa<VPInterleaveRecipe>(R))
+    if (isa<VPInterleaveBase>(R))
       return TTI::CastContextHint::Interleave;
     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
       return ReplicateRecipe->isPredicated() ?
TTI::CastContextHint::Masked @@ -3627,8 +3632,155 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!NeedsMaskForGaps && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup *Group = IG; + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *BlockInMask = getMask(); + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + + auto CreateGroupMask = [&BlockInMask, &State, + &InterleaveFactor]() -> Value * { + auto *ResBlockInMask = State.get(BlockInMask); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); + }; + + Value *GroupMask = nullptr; + if (BlockInMask) + GroupMask = CreateGroupMask(); + else + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + + const DataLayout &DL = Instr->getDataLayout(); + // Vectorize the interleaved load group. + if (isa(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic(VecTy, Intrinsic::vp_load, + {ResAddr, GroupMask, EVL}, + nullptr, "wide.vp.load"); + NewLoad->addParamAttr(0, Attribute::getWithAlignment(NewLoad->getContext(), + Group->getAlign())); + + Group->addMetadata(NewLoad); + + ArrayRef VPDefs = definedValues(); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + // Skip the gaps in the group. + if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(VPDefs[J], StridedVec); + ++J; + } + return; + } + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + + // Vectorize the interleaved store group. + ArrayRef StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector StoredVecs; + unsigned StoredIdx = 0; + for (unsigned I = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + ++StoredIdx; + + // If this member has different type, cast it to a unified type. 
+ if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + } + + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = State.Builder.CreateIntrinsic( + Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, EVL}); + NewStore->addParamAttr(1, Attribute::getWithAlignment(NewStore->getContext(), + Group->getAlign())); + + Group->addMetadata(NewStore); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. unsigned InsertPosIdx = 0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 98c6b9a70405b..a011a72367dd2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2142,6 +2142,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); }) + .Case([&](VPInterleaveRecipe *IR) { + VPValue *NewMask = GetNewMask(IR->getMask()); + return new VPInterleaveEVLRecipe(*IR, EVL, NewMask, IR->getDebugLoc()); + }) .Case([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); @@ -2233,17 +2237,18 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); + EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); + if (isa( + EVLRecipe)) { + for (unsigned I = 0; I < NumDefVal; ++I) { + VPValue *CurVPV = CurRecipe->getVPValue(I); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); + } } ToErase.push_back(CurRecipe); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef6..85c6c2c8d7965 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,7 +38,7 @@ struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPInterleaveRecipe; +class VPInterleaveBase; class VPPhiAccessors; // This is the base class of the VPlan Def/Use graph, used for modeling the data @@ -48,7 +48,7 @@ class VPPhiAccessors; class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; - friend class VPInterleaveRecipe; + friend class VPInterleaveBase; friend class VPlan; friend class VPExpressionRecipe; @@ -335,6 +335,7 @@ class VPDef { VPExpressionSC, VPIRInstructionSC, VPInstructionSC, + VPInterleaveEVLSC, VPInterleaveSC, VPReductionEVLSC, VPReductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 14ae4f2204310..0ca8f7d2f2807 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case( + .Case( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index 976ce77d2ba29..144972bc692cd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -117,34 +117,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select [[TMP3]], [[TMP4]], zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, [[INTERLEAVED_MASK]], poison) -; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( 
[[WIDE_MASKED_VEC]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = call @llvm.smax.nxv16i8( [[TMP9]], [[TMP10]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP11]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP11]], [[TMP14]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, [[INTERLEAVED_MASK5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], [[INTERLEAVED_MASK]], i32 [[TMP1]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_VP_LOAD]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = call @llvm.smax.nxv16i8( [[TMP6]], [[TMP7]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP9]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub zeroinitializer, [[TMP8]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP8]], [[TMP11]]) +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], [[INTERLEAVED_MASK3]], i32 [[TMP1]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 -; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: scalar.ph: @@ -304,38 +299,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi [ 
[[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select [[TMP3]], [[TMP4]], zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 2 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, [[INTERLEAVED_MASK]], poison) -; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP9]], [[TMP10]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP15]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 2 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP2]], 
[[TMP2]], [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], [[INTERLEAVED_MASK]], i32 [[TMP1]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_VP_LOAD]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = call @llvm.smax.nxv16i8( [[TMP6]], [[TMP7]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub zeroinitializer, [[TMP10]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[TMP8]], [[TMP9]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP12]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP2]], [[TMP2]], [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP10]], [[TMP11]], [[TMP12]], [[TMP13]]) +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], [[INTERLEAVED_MASK3]], i32 [[TMP1]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 -; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 8d987a94d383d..9b88df004f45c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -7,7 +7,6 @@ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s -; FIXME: interleaved accesses are not supported yet with predicated vectorization. 
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-LABEL: @interleave( ; IF-EVL-NEXT: entry: @@ -25,25 +24,20 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0 -; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP17]], [[TMP17]]) -; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP6]], i32 4, [[INTERLEAVED_MASK]], poison) +; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP16]]) ; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw [[TMP15]], [[TMP14]] ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP9]], ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] -; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP9]], ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP16]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP16]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph:
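
Editor's note, not part of the patch: a minimal, hand-written sketch of the IR shape the new VPInterleaveEVLRecipe is intended to produce for a factor-2 interleave group, i.e. one EVL-predicated wide load plus a deinterleave, and the mirror interleave plus an EVL-predicated wide store. The function name, value names, the i32 element type, and the fixed nxv8/nxv4 vector widths are made up for illustration; the intrinsics are the ones the recipe's execute() emits (llvm.vp.load, llvm.vp.store, llvm.vector.deinterleave2, llvm.vector.interleave2). For simplicity a single mask and EVL are applied directly to the wide access here; in the actual codegen the mask is the interleaved group mask and the EVL comes from the recipe's EVL operand, as shown in the updated RISC-V CHECK lines above.

; Factor-2 interleaved load/store lowered with VP intrinsics (sketch).
define void @interleave_evl_sketch(ptr %src, ptr %dst,
                                   <vscale x 8 x i1> %mask, i32 %evl) {
entry:
  ; One wide EVL-predicated load covers both members of the group.
  %wide = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 %src, <vscale x 8 x i1> %mask, i32 %evl)
  ; Split the wide vector back into the two strided members.
  %strided = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
  %v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided, 0
  %v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided, 1
  ; Re-interleave the members and store them with the same mask and EVL.
  %ivec = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1)
  call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %ivec, ptr align 4 %dst, <vscale x 8 x i1> %mask, i32 %evl)
  ret void
}

declare <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr, <vscale x 8 x i1>, i32)
declare void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32>, ptr, <vscale x 8 x i1>, i32)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

This mirrors the test updates above, where the former llvm.masked.load/llvm.masked.store pairs are replaced by llvm.vp.load/llvm.vp.store guarded by the interleaved mask and the EVL returned by llvm.experimental.get.vector.length.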