[VPlan] Refine check for preserving uniformity #151720
Conversation
The logic for narrowing to single scalar recipes is in two different places: narrowToSingleScalarRecipes and legalizeAndOptimizeInductions. Consolidate them, with minor test changes.
The PreservesUniformity lambda in isSingleScalar can be improved by taking a cue from ValueTracking's isNotCrossLaneOperation and additionally rejecting operations that are not uniform across all lanes.
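To restate the idea behind the check (a simplified, hypothetical model, not the VPlan code itself): when all operands of an operation are uniform across lanes, the result is only uniform if the operation never mixes or indexes lanes. The enum and the preservesUniformity helper below are invented for illustration; the actual patch dispatches on recipe opcodes and intrinsic IDs, as shown in the diff further down.

```cpp
// Sketch of the default-allow classification: accept lane-wise operations,
// reject only those known to mix, select, or index lanes.
#include <cassert>

enum class Op {
  Add,            // lane-wise arithmetic: uniform operands -> uniform result
  GetElementPtr,  // lane-wise address computation
  ICmp,           // lane-wise comparison
  LaneWiseCall,   // e.g. a trivially vectorizable intrinsic such as umax
  ExtractElement, // result depends on a lane index
  ShuffleVector,  // mixes values across lanes
  CrossLaneCall   // e.g. a reduction-style callee
};

bool preservesUniformity(Op O) {
  switch (O) {
  case Op::ExtractElement:
  case Op::ShuffleVector:
  case Op::CrossLaneCall:
    return false; // cross-lane: result may differ per lane even with uniform inputs
  default:
    return true;  // everything else is lane-wise and keeps uniformity
  }
}

int main() {
  assert(preservesUniformity(Op::Add));
  assert(preservesUniformity(Op::LaneWiseCall));
  assert(!preservesUniformity(Op::ExtractElement));
  return 0;
}
```

The inverted, default-allow switch mirrors the shape of the patch: instead of enumerating every opcode that is safe, only the known cross-lane operations are listed, and trivially vectorizable intrinsic calls are accepted up front.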
@llvm/pr-subscribers-vectorizers

Author: Ramkumar Ramachandra (artagnon)

Changes

The PreservesUniformity lambda in isSingleScalar can be improved by taking inspiration from ValueTracking's isNotCrossLaneOperation, by also forbidding operations that are not uniform across all lanes.

-- 8< --

Patch is 52.51 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151720.diff

11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a1d12a3a01e5e..3c1cbb12eebfd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -631,30 +631,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
if (!PhiR)
continue;
- // Try to narrow wide and replicating recipes to uniform recipes, based on
- // VPlan analysis.
- // TODO: Apply to all recipes in the future, to replace legacy uniformity
- // analysis.
- auto Users = collectUsersRecursively(PhiR);
- for (VPUser *U : reverse(Users)) {
- auto *Def = dyn_cast<VPSingleDefRecipe>(U);
- auto *RepR = dyn_cast<VPReplicateRecipe>(U);
- // Skip recipes that shouldn't be narrowed.
- if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
- Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
- (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
- continue;
-
- // Skip recipes that may have other lanes than their first used.
- if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
- continue;
-
- auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
- Def->operands(), /*IsUniform*/ true);
- Clone->insertAfter(Def);
- Def->replaceAllUsesWith(Clone);
- }
-
// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
@@ -1239,10 +1215,12 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
continue;
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
- // Skip recipes that aren't single scalars or don't have only their
- // scalar results used. In the latter case, we would introduce extra
- // broadcasts.
- if (!vputils::isSingleScalar(RepOrWidenR) ||
+ // Skip recipes that aren't single scalars, that don't have users, and
+ // that don't have only their scalar results used (this would introduce
+ // extra broadcasts).
+ if ((!vputils::isSingleScalar(RepOrWidenR) &&
+ !vputils::onlyFirstLaneUsed(RepOrWidenR)) ||
+ RepOrWidenR->getNumUsers() == 0 ||
any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
return !U->usesScalars(RepOrWidenR);
}))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8dcd57f1b3598..bf4deddfbb925 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -40,18 +40,37 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.
inline bool isSingleScalar(const VPValue *VPV) {
- auto PreservesUniformity = [](unsigned Opcode) -> bool {
- if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
+ // A variant of ValueTracking's isNotCrossLaneOperation that checks that the
+ // operation is uniform across lanes.
+ auto PreservesUniformity = [](auto *V) {
+ Intrinsic::ID ID;
+ if (const auto *R = dyn_cast<VPWidenIntrinsicRecipe>(V))
+ ID = R->getVectorIntrinsicID();
+ if (const auto *R = dyn_cast<VPWidenCallRecipe>(V))
+ ID = R->getCalledScalarFunction()->getIntrinsicID();
+ if (const auto *R = dyn_cast<VPReplicateRecipe>(V))
+ if (const auto *CI = dyn_cast<CallInst>(R->getUnderlyingInstr()))
+ if (const auto *F = CI->getCalledFunction())
+ ID = F->getIntrinsicID();
+ if (isTriviallyVectorizable(ID))
return true;
- switch (Opcode) {
- case Instruction::GetElementPtr:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case VPInstruction::Broadcast:
- case VPInstruction::PtrAdd:
- return true;
- default:
+
+ switch (V->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::BitCast:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertElement:
+ case Instruction::ExtractElement:
+ case VPInstruction::BuildVector:
+ case VPInstruction::BuildStructVector:
+ case VPInstruction::ExtractLane:
+ case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractPenultimateElement:
return false;
+ default:
+ return true;
}
};
@@ -66,19 +85,19 @@ inline bool isSingleScalar(const VPValue *VPV) {
// lanes.
if (RegionOfR && RegionOfR->isReplicator())
return false;
- return Rep->isSingleScalar() || (PreservesUniformity(Rep->getOpcode()) &&
+ return Rep->isSingleScalar() || (PreservesUniformity(Rep) &&
all_of(Rep->operands(), isSingleScalar));
}
if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe,
VPWidenSelectRecipe>(VPV))
return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
- return PreservesUniformity(WidenR->getOpcode()) &&
+ return PreservesUniformity(WidenR) &&
all_of(WidenR->operands(), isSingleScalar);
}
if (auto *VPI = dyn_cast<VPInstruction>(VPV))
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
- (PreservesUniformity(VPI->getOpcode()) &&
+ (PreservesUniformity(VPI) &&
all_of(VPI->operands(), isSingleScalar));
// VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index 1c6a225f99f61..becf7177f6081 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -82,90 +82,56 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
; CHECK: [[PRED_UDIV_IF]]:
; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
; CHECK: [[PRED_UDIV_CONTINUE]]:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP3]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP2]], %[[PRED_UDIV_IF]] ]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2:.*]]
; CHECK: [[PRED_UDIV_IF1]]:
; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP6]], i32 1
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
; CHECK: [[PRED_UDIV_CONTINUE2]]:
-; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i64> [ [[TMP4]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP7]], %[[PRED_UDIV_IF1]] ]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_UDIV_IF3:.*]], label %[[PRED_UDIV_CONTINUE4:.*]]
; CHECK: [[PRED_UDIV_IF3]]:
; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP10]], i32 2
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE4]]
; CHECK: [[PRED_UDIV_CONTINUE4]]:
-; CHECK-NEXT: [[TMP49:%.*]] = phi <4 x i64> [ [[TMP8]], %[[PRED_UDIV_CONTINUE2]] ], [ [[TMP34]], %[[PRED_UDIV_IF3]] ]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_UDIV_IF5:.*]], label %[[PRED_UDIV_CONTINUE6:.*]]
; CHECK: [[PRED_UDIV_IF5]]:
; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP14]], i32 3
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE6]]
; CHECK: [[PRED_UDIV_CONTINUE6]]:
-; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i64> [ [[TMP49]], %[[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], %[[PRED_UDIV_IF5]] ]
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_UDIV_IF7:.*]], label %[[PRED_UDIV_CONTINUE8:.*]]
; CHECK: [[PRED_UDIV_IF7]]:
; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP18]], i32 0
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE8]]
; CHECK: [[PRED_UDIV_CONTINUE8]]:
-; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ poison, %[[PRED_UDIV_CONTINUE6]] ], [ [[TMP19]], %[[PRED_UDIV_IF7]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ poison, %[[PRED_UDIV_CONTINUE6]] ], [ [[TMP18]], %[[PRED_UDIV_IF7]] ]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_UDIV_IF9:.*]], label %[[PRED_UDIV_CONTINUE10:.*]]
; CHECK: [[PRED_UDIV_IF9]]:
; CHECK-NEXT: [[TMP22:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP22]], i32 1
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE10]]
; CHECK: [[PRED_UDIV_CONTINUE10]]:
-; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i64> [ [[TMP20]], %[[PRED_UDIV_CONTINUE8]] ], [ [[TMP23]], %[[PRED_UDIV_IF9]] ]
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_UDIV_IF11:.*]], label %[[PRED_UDIV_CONTINUE12:.*]]
; CHECK: [[PRED_UDIV_IF11]]:
; CHECK-NEXT: [[TMP26:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP24]], i64 [[TMP26]], i32 2
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE12]]
; CHECK: [[PRED_UDIV_CONTINUE12]]:
-; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i64> [ [[TMP24]], %[[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], %[[PRED_UDIV_IF11]] ]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_UDIV_IF13:.*]], label %[[PRED_UDIV_CONTINUE14]]
; CHECK: [[PRED_UDIV_IF13]]:
; CHECK-NEXT: [[TMP30:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> [[TMP28]], i64 [[TMP30]], i32 3
; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE14]]
; CHECK: [[PRED_UDIV_CONTINUE14]]:
-; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i64> [ [[TMP28]], %[[PRED_UDIV_CONTINUE12]] ], [ [[TMP31]], %[[PRED_UDIV_IF13]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP16]]
-; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP32]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 2
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 1
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 2
+; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[C]], i64 0, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[C]], i64 0, i64 [[TMP15]]
; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 3
; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
-; CHECK-NEXT: store i16 0, ptr [[TMP11]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP36]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP38]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP40]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP42]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP44]], align 2
; CHECK-NEXT: store i16 0, ptr [[TMP46]], align 2
; CHECK-NEXT: store i16 0, ptr [[TMP48]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index add58758788f9..a5edb8a2a232d 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -578,39 +578,39 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i64 0
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP25]], i64 3
; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
-; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
-; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
-; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
+; CHECK-NEXT: store i64 [[TMP31]], ptr [[TMP21]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -905,27 +905,27 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
-; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP8]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP6]], align 4
; CHECK-NEXT: store i32 [[Z]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
...
[truncated]
The PreservesUniformity lambda in isSingleScalar can be improved by taking a cue from ValueTracking's isNotCrossLaneOperation and additionally rejecting operations that are not uniform across all lanes.
-- 8< --
Based on #151506.