-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[VPlan] Materialize Build(Struct)Vectors for VPReplicateRecipes. (NFCI) #151487
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[VPlan] Materialize Build(Struct)Vectors for VPReplicateRecipes. (NFCI) #151487
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesMaterialze Build(Struct)Vectors explicitly for VPRecplicateRecipes, to serve their users requiring a vector, instead of doing so when unrolling by VF. Now we only need to implicitly build vectors in VPTransformState::get for VPInstructions. Once they are also unrolled by VF we can remove the code-path alltogether. Full diff: https://github.com/llvm/llvm-project/pull/151487.diff 6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 850c4a11edc67..9c547f759617c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7284,6 +7284,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
OrigLoop->getHeader()->getContext());
+ VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
bool HasBranchWeights =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 25b9616880bf4..96ccf5bf50a25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -372,6 +372,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
set(Def, VectorValue);
} else {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ assert(isa<VPInstruction>(Def) && "Explicit BuildVector recipes must "
+ "handle packing for non-VPInstructions.");
// Initialize packing with insertelements to start from poison.
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68e7c20a070f4..7689ed6c2bd35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::Load:
case VPInstruction::AnyOf:
case VPInstruction::BranchOnCond:
+ case VPInstruction::BuildStructVector:
+ case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ad8235d891c5d..4c0f7f40408c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3192,6 +3192,51 @@ void VPlanTransforms::materializeVectorTripCount(
Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
}
+void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ VPTypeAnalysis TypeInfo(Plan);
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()));
+ auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(LoopRegion->getEntry()));
+ for (VPBasicBlock *VPBB :
+ concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ if (!RepR || RepR->isSingleScalar())
+ continue;
+ VPInstruction *BuildVector = nullptr;
+ for (VPUser *U : to_vector(RepR->users())) {
+ VPRegionBlock *ParentRegion =
+ cast<VPRecipeBase>(U)->getParent()->getParent();
+ if (U->usesScalars(RepR) && ParentRegion == LoopRegion)
+ continue;
+
+ if (!BuildVector) {
+ Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+ unsigned Opc = ScalarTy->isStructTy()
+ ? VPInstruction::BuildStructVector
+ : VPInstruction::BuildVector;
+ BuildVector = new VPInstruction(Opc, {RepR});
+ BuildVector->insertAfter(RepR);
+ }
+
+ // Only update a single operand per users, as the same user is added
+ // multiple times, once per use.
+ // TODO: Introduce de-duplicating iterator over users.
+ for (unsigned Idx = 0; Idx != U->getNumOperands(); ++Idx)
+ if (U->getOperand(Idx) == RepR) {
+ U->setOperand(Idx, BuildVector);
+ break;
+ }
+ }
+ }
+ }
+}
+
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 880159f760922..1a19e15bbaa25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,10 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Add explicit Build[Struct]Vector recipes that combine scalar values
+ /// produced by VPReplicateRecipes to a single vector.
+ static void materializeBuildVectors(VPlan &Plan);
+
/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 871e37ef3966a..7f23fb5b7d11a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -463,9 +463,10 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
}
/// Create a single-scalar clone of \p RepR for lane \p Lane.
-static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
- Type *IdxTy, VPReplicateRecipe *RepR,
- VPLane Lane) {
+static VPReplicateRecipe *
+cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
+ VPReplicateRecipe *RepR, VPLane Lane,
+ DenseMap<VPValue *, SmallVector<VPValue *>> &Value2Lanes) {
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : RepR->operands()) {
@@ -478,6 +479,11 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
}
+ if (Value2Lanes.contains(Op)) {
+ NewOps.push_back(Value2Lanes[Op][Lane.getKnownLane()]);
+ continue;
+ }
+
// Look through buildvector to avoid unnecessary extracts.
if (match(Op, m_BuildVector())) {
NewOps.push_back(
@@ -510,6 +516,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
auto VPBBsToUnroll =
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+ DenseMap<VPValue *, SmallVector<VPValue *>> Value2Lanes;
+ SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -521,12 +529,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
vputils::isSingleScalar(RepR->getOperand(1))) {
// Stores to invariant addresses need to store the last lane only.
- cloneForLane(Plan, Builder, IdxTy, RepR,
- VPLane::getLastLaneForVF(VF));
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
+ Value2Lanes);
} else {
// Create single-scalar version of RepR for all lanes.
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes);
}
RepR->eraseFromParent();
continue;
@@ -534,23 +542,28 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Create single-scalar version of RepR for all lanes.
SmallVector<VPValue *> LaneDefs;
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+ LaneDefs.push_back(
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes));
+ Value2Lanes[RepR] = LaneDefs;
/// Users that only demand the first lane can use the definition for lane
/// 0.
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
return U.onlyFirstLaneUsed(RepR);
});
- // If needed, create a Build(Struct)Vector recipe to insert the scalar
- // lane values into a vector.
- Type *ResTy = RepR->getUnderlyingInstr()->getType();
- VPValue *VecRes = Builder.createNaryOp(
- ResTy->isStructTy() ? VPInstruction::BuildStructVector
- : VPInstruction::BuildVector,
- LaneDefs);
- RepR->replaceAllUsesWith(VecRes);
- RepR->eraseFromParent();
+ for (VPUser *U : to_vector(RepR->users())) {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
+ VPI->getOpcode() != VPInstruction::BuildStructVector))
+ continue;
+ VPI->setOperand(0, LaneDefs[0]);
+ for (VPValue *Def : drop_begin(LaneDefs))
+ VPI->addOperand(Def);
+ }
+ ToRemove.push_back(RepR);
}
}
+ for (auto *R : reverse(ToRemove))
+ R->eraseFromParent();
}
|
Materialze Build(Struct)Vectors explicitly for VPRecplicateRecipes, to serve their users requiring a vector, instead of doing so when unrolling by VF. Now we only need to implicitly build vectors in VPTransformState::get for VPInstructions. Once they are also unrolled by VF we can remove the code-path alltogether.
…ldvector-for-vpreplicate-recipes
b4d634d
to
d293818
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the explanations. This patch LGTM, thanks!
|
||
// Only update a single operand per users, as the same user is added | ||
// multiple times, once per use. | ||
// TODO: Introduce de-duplicating iterator over users. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe not the right place for this TODO?
Requires llvm#151487 to completely subsume the non-VPlan based limited CSE. Inspired by llvm#146856, although the test from that PR remains unchanged: still investigating.
Requires llvm#151487 to completely subsume the non-VPlan based limited CSE. Inspired by llvm#146856, although the test from that PR remains unchanged: still investigating.
Materialze Build(Struct)Vectors explicitly for VPRecplicateRecipes, to serve their users requiring a vector, instead of doing so when unrolling by VF.
Now we only need to implicitly build vectors in VPTransformState::get for VPInstructions. Once they are also unrolled by VF we can remove the code-path alltogether.