Skip to content

Commit b4d634d

Browse files
committed
[VPlan] Materialize Build(Struct)Vectors for VPReplicateRecipes. (NFCI)
Materialize Build(Struct)Vectors explicitly for VPReplicateRecipes, to serve their users requiring a vector, instead of doing so when unrolling by VF. Now we only need to implicitly build vectors in VPTransformState::get for VPInstructions. Once they are also unrolled by VF, we can remove the code path altogether.
1 parent c10736a commit b4d634d

File tree

6 files changed

+83
-16
lines changed

6 files changed

+83
-16
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7284,6 +7284,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72847284
// cost model is complete for better cost estimates.
72857285
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
72867286
OrigLoop->getHeader()->getContext());
7287+
VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
72877288
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
72887289
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
72897290
bool HasBranchWeights =

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
372372
set(Def, VectorValue);
373373
} else {
374374
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
375+
assert(isa<VPInstruction>(Def) && "Explicit BuildVector recipes must "
376+
"handle packing for non-VPInstructions.");
375377
// Initialize packing with insertelements to start from poison.
376378
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
377379
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
461461
case Instruction::Load:
462462
case VPInstruction::AnyOf:
463463
case VPInstruction::BranchOnCond:
464+
case VPInstruction::BuildStructVector:
465+
case VPInstruction::BuildVector:
464466
case VPInstruction::CalculateTripCountMinusVF:
465467
case VPInstruction::CanonicalIVIncrementForPart:
466468
case VPInstruction::ExplicitVectorLength:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3192,6 +3192,51 @@ void VPlanTransforms::materializeVectorTripCount(
31923192
Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
31933193
}
31943194

3195+
void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
3196+
if (Plan.hasScalarVFOnly())
3197+
return;
3198+
3199+
VPTypeAnalysis TypeInfo(Plan);
3200+
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3201+
auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3202+
vp_depth_first_shallow(Plan.getEntry()));
3203+
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3204+
vp_depth_first_shallow(LoopRegion->getEntry()));
3205+
for (VPBasicBlock *VPBB :
3206+
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3207+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3208+
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3209+
if (!RepR || RepR->isSingleScalar())
3210+
continue;
3211+
VPInstruction *BuildVector = nullptr;
3212+
for (VPUser *U : to_vector(RepR->users())) {
3213+
VPRegionBlock *ParentRegion =
3214+
cast<VPRecipeBase>(U)->getParent()->getParent();
3215+
if (U->usesScalars(RepR) && ParentRegion == LoopRegion)
3216+
continue;
3217+
3218+
if (!BuildVector) {
3219+
Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3220+
unsigned Opc = ScalarTy->isStructTy()
3221+
? VPInstruction::BuildStructVector
3222+
: VPInstruction::BuildVector;
3223+
BuildVector = new VPInstruction(Opc, {RepR});
3224+
BuildVector->insertAfter(RepR);
3225+
}
3226+
3227+
// Only update a single operand per users, as the same user is added
3228+
// multiple times, once per use.
3229+
// TODO: Introduce de-duplicating iterator over users.
3230+
for (unsigned Idx = 0; Idx != U->getNumOperands(); ++Idx)
3231+
if (U->getOperand(Idx) == RepR) {
3232+
U->setOperand(Idx, BuildVector);
3233+
break;
3234+
}
3235+
}
3236+
}
3237+
}
3238+
}
3239+
31953240
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
31963241
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
31973242
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,10 @@ struct VPlanTransforms {
256256
unsigned BestUF,
257257
PredicatedScalarEvolution &PSE);
258258

259+
/// Add explicit Build[Struct]Vector recipes that combine scalar values
260+
/// produced by VPReplicateRecipes to a single vector.
261+
static void materializeBuildVectors(VPlan &Plan);
262+
259263
/// Try to convert a plan with interleave groups with VF elements to a plan
260264
/// with the interleave groups replaced by wide loads and stores processing VF
261265
/// elements, if all transformed interleave groups access the full vector

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -463,9 +463,10 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
463463
}
464464

465465
/// Create a single-scalar clone of \p RepR for lane \p Lane.
466-
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
467-
Type *IdxTy, VPReplicateRecipe *RepR,
468-
VPLane Lane) {
466+
static VPReplicateRecipe *
467+
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
468+
VPReplicateRecipe *RepR, VPLane Lane,
469+
DenseMap<VPValue *, SmallVector<VPValue *>> &Value2Lanes) {
469470
// Collect the operands at Lane, creating extracts as needed.
470471
SmallVector<VPValue *> NewOps;
471472
for (VPValue *Op : RepR->operands()) {
@@ -478,6 +479,11 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
478479
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
479480
continue;
480481
}
482+
if (Value2Lanes.contains(Op)) {
483+
NewOps.push_back(Value2Lanes[Op][Lane.getKnownLane()]);
484+
continue;
485+
}
486+
481487
// Look through buildvector to avoid unnecessary extracts.
482488
if (match(Op, m_BuildVector())) {
483489
NewOps.push_back(
@@ -510,6 +516,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
510516
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
511517
auto VPBBsToUnroll =
512518
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
519+
DenseMap<VPValue *, SmallVector<VPValue *>> Value2Lanes;
520+
SmallVector<VPRecipeBase *> ToRemove;
513521
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
514522
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
515523
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -521,36 +529,41 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
521529
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
522530
vputils::isSingleScalar(RepR->getOperand(1))) {
523531
// Stores to invariant addresses need to store the last lane only.
524-
cloneForLane(Plan, Builder, IdxTy, RepR,
525-
VPLane::getLastLaneForVF(VF));
532+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
533+
Value2Lanes);
526534
} else {
527535
// Create single-scalar version of RepR for all lanes.
528536
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
529-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
537+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes);
530538
}
531539
RepR->eraseFromParent();
532540
continue;
533541
}
534542
/// Create single-scalar version of RepR for all lanes.
535543
SmallVector<VPValue *> LaneDefs;
536544
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
537-
LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
545+
LaneDefs.push_back(
546+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes));
538547

548+
Value2Lanes[RepR] = LaneDefs;
539549
/// Users that only demand the first lane can use the definition for lane
540550
/// 0.
541551
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
542552
return U.onlyFirstLaneUsed(RepR);
543553
});
544554

545-
// If needed, create a Build(Struct)Vector recipe to insert the scalar
546-
// lane values into a vector.
547-
Type *ResTy = RepR->getUnderlyingInstr()->getType();
548-
VPValue *VecRes = Builder.createNaryOp(
549-
ResTy->isStructTy() ? VPInstruction::BuildStructVector
550-
: VPInstruction::BuildVector,
551-
LaneDefs);
552-
RepR->replaceAllUsesWith(VecRes);
553-
RepR->eraseFromParent();
555+
for (VPUser *U : to_vector(RepR->users())) {
556+
auto *VPI = dyn_cast<VPInstruction>(U);
557+
if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
558+
VPI->getOpcode() != VPInstruction::BuildStructVector))
559+
continue;
560+
VPI->setOperand(0, LaneDefs[0]);
561+
for (VPValue *Def : drop_begin(LaneDefs))
562+
VPI->addOperand(Def);
563+
}
564+
ToRemove.push_back(RepR);
554565
}
555566
}
567+
for (auto *R : reverse(ToRemove))
568+
R->eraseFromParent();
556569
}

0 commit comments

Comments
 (0)