diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b4ea70e0e5cc2..ad17d8b157ce6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7284,6 +7284,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); + VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); bool HasBranchWeights = diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 25b9616880bf4..96ccf5bf50a25 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -372,6 +372,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { set(Def, VectorValue); } else { assert(!VF.isScalable() && "VF is assumed to be non scalable."); + assert(isa(Def) && "Explicit BuildVector recipes must " + "handle packing for non-VPInstructions."); // Initialize packing with insertelements to start from poison. 
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF)); for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 98d11f0bc7893..f7657c54dde99 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -461,6 +461,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::Load: case VPInstruction::AnyOf: case VPInstruction::BranchOnCond: + case VPInstruction::BuildStructVector: + case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExplicitVectorLength: diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 373b2f76308a2..ad737eb358e77 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3176,6 +3176,53 @@ void VPlanTransforms::materializeVectorTripCount( Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue()); } +void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { + if (Plan.hasScalarVFOnly()) + return; + + VPTypeAnalysis TypeInfo(Plan); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getEntry())); + auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly( + vp_depth_first_shallow(LoopRegion->getEntry())); + // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes, + // excluding ones in replicate regions. Those are not unrolled explicitly yet. 
+ for (VPBasicBlock *VPBB : + concat(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *RepR = dyn_cast(&R); + if (!RepR || RepR->isSingleScalar()) + continue; + VPInstruction *BuildVector = nullptr; + for (VPUser *U : to_vector(RepR->users())) { + VPRegionBlock *ParentRegion = + cast(U)->getParent()->getParent(); + if (U->usesScalars(RepR) && ParentRegion == LoopRegion) + continue; + + if (!BuildVector) { + Type *ScalarTy = TypeInfo.inferScalarType(RepR); + unsigned Opc = ScalarTy->isStructTy() + ? VPInstruction::BuildStructVector + : VPInstruction::BuildVector; + BuildVector = new VPInstruction(Opc, {RepR}); + BuildVector->insertAfter(RepR); + } + + // Only update a single operand per user, as the same user is added + // multiple times, once per use. + // TODO: Introduce de-duplicating iterator over users. + for (unsigned Idx = 0; Idx != U->getNumOperands(); ++Idx) + if (U->getOperand(Idx) == RepR) { + U->setOperand(Idx, BuildVector); + break; + } + } + } + } +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 880159f760922..1a19e15bbaa25 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -256,6 +256,10 @@ struct VPlanTransforms { unsigned BestUF, PredicatedScalarEvolution &PSE); + /// Add explicit Build[Struct]Vector recipes that combine scalar values + /// produced by VPReplicateRecipes to a single vector. 
+ static void materializeBuildVectors(VPlan &Plan); + /// Try to convert a plan with interleave groups with VF elements to a plan /// with the interleave groups replaced by wide loads and stores processing VF /// elements, if all transformed interleave groups access the full vector diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 871e37ef3966a..2a3e30932f12e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -463,9 +463,10 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { } /// Create a single-scalar clone of \p RepR for lane \p Lane. -static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, - Type *IdxTy, VPReplicateRecipe *RepR, - VPLane Lane) { +static VPReplicateRecipe * +cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, + VPReplicateRecipe *RepR, VPLane Lane, + DenseMap> &Value2Lanes) { // Collect the operands at Lane, creating extracts as needed. SmallVector NewOps; for (VPValue *Op : RepR->operands()) { @@ -478,6 +479,11 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); continue; } + if (Value2Lanes.contains(Op)) { + NewOps.push_back(Value2Lanes[Op][Lane.getKnownLane()]); + continue; + } + // Look through buildvector to avoid unnecessary extracts. 
if (match(Op, m_BuildVector())) { NewOps.push_back( @@ -510,6 +516,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())); auto VPBBsToUnroll = concat(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion); + DenseMap> Value2Lanes; + SmallVector ToRemove; for (VPBasicBlock *VPBB : VPBBsToUnroll) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *RepR = dyn_cast(&R); @@ -521,12 +529,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { if (isa(RepR->getUnderlyingInstr()) && vputils::isSingleScalar(RepR->getOperand(1))) { // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, - VPLane::getLastLaneForVF(VF)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), + Value2Lanes); } else { // Create single-scalar version of RepR for all lanes. for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes); } RepR->eraseFromParent(); continue; @@ -534,23 +542,30 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Create single-scalar version of RepR for all lanes. SmallVector LaneDefs; for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); + LaneDefs.push_back( + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes)); + Value2Lanes[RepR] = LaneDefs; /// Users that only demand the first lane can use the definition for lane /// 0. RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); }); - // If needed, create a Build(Struct)Vector recipe to insert the scalar - // lane values into a vector. - Type *ResTy = RepR->getUnderlyingInstr()->getType(); - VPValue *VecRes = Builder.createNaryOp( - ResTy->isStructTy() ? 
VPInstruction::BuildStructVector - : VPInstruction::BuildVector, - LaneDefs); - RepR->replaceAllUsesWith(VecRes); - RepR->eraseFromParent(); + for (VPUser *U : to_vector(RepR->users())) { + auto *VPI = dyn_cast(U); + if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && + VPI->getOpcode() != VPInstruction::BuildStructVector)) + continue; + assert(VPI->getNumOperands() == 1 && + "Build(Struct)Vector must have a single operand"); + VPI->setOperand(0, LaneDefs[0]); + for (VPValue *Def : drop_begin(LaneDefs)) + VPI->addOperand(Def); + } + ToRemove.push_back(RepR); } } + for (auto *R : reverse(ToRemove)) + R->eraseFromParent(); }