diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a1d12a3a01e5e..3c1cbb12eebfd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -631,30 +631,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
     if (!PhiR)
       continue;

-    // Try to narrow wide and replicating recipes to uniform recipes, based on
-    // VPlan analysis.
-    // TODO: Apply to all recipes in the future, to replace legacy uniformity
-    // analysis.
-    auto Users = collectUsersRecursively(PhiR);
-    for (VPUser *U : reverse(Users)) {
-      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
-      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
-      // Skip recipes that shouldn't be narrowed.
-      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
-          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
-          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
-        continue;
-
-      // Skip recipes that may have other lanes than their first used.
-      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
-        continue;
-
-      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
-                                          Def->operands(), /*IsUniform*/ true);
-      Clone->insertAfter(Def);
-      Def->replaceAllUsesWith(Clone);
-    }
-
     // Replace wide pointer inductions which have only their scalars used by
     // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
     if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
@@ -1239,10 +1215,12 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
        continue;

      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      // Skip recipes that aren't single scalars or don't have only their
-      // scalar results used. In the latter case, we would introduce extra
-      // broadcasts.
-      if (!vputils::isSingleScalar(RepOrWidenR) ||
+      // Skip recipes that aren't single scalars and don't have only their
+      // first lane used, that have no users, or that have users needing a
+      // vector result; the latter would introduce extra broadcasts.
+      if ((!vputils::isSingleScalar(RepOrWidenR) &&
+           !vputils::onlyFirstLaneUsed(RepOrWidenR)) ||
+          RepOrWidenR->getNumUsers() == 0 ||
          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
            return !U->usesScalars(RepOrWidenR);
          }))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8dcd57f1b3598..bf4deddfbb925 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -40,18 +40,37 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
 /// Returns true if \p VPV is a single scalar, either because it produces the
 /// same value for all lanes or only has its first lane used.
 inline bool isSingleScalar(const VPValue *VPV) {
-  auto PreservesUniformity = [](unsigned Opcode) -> bool {
-    if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
+  // A variant of ValueTracking's isNotCrossLaneOperation that checks that the
+  // operation is uniform across lanes.
+  auto PreservesUniformity = [](auto *V) {
+    Intrinsic::ID ID = Intrinsic::not_intrinsic;
+    if (const auto *R = dyn_cast<VPWidenIntrinsicRecipe>(V))
+      ID = R->getVectorIntrinsicID();
+    if (const auto *R = dyn_cast<VPWidenCallRecipe>(V))
+      ID = R->getCalledScalarFunction()->getIntrinsicID();
+    if (const auto *R = dyn_cast<VPReplicateRecipe>(V))
+      if (const auto *CI = dyn_cast<CallInst>(R->getUnderlyingInstr()))
+        if (const auto *F = CI->getCalledFunction())
+          ID = F->getIntrinsicID();
+    if (isTriviallyVectorizable(ID))
       return true;
-    switch (Opcode) {
-    case Instruction::GetElementPtr:
-    case Instruction::ICmp:
-    case Instruction::FCmp:
-    case VPInstruction::Broadcast:
-    case VPInstruction::PtrAdd:
-      return true;
-    default:
+
+    switch (V->getOpcode()) {
+    case Instruction::Call:
+    case Instruction::Invoke:
+    case Instruction::BitCast:
+    case Instruction::ShuffleVector:
+    case Instruction::InsertElement:
+    case Instruction::ExtractElement:
+    case VPInstruction::BuildVector:
+    case VPInstruction::BuildStructVector:
+    case VPInstruction::ExtractLane:
+    case VPInstruction::FirstActiveLane:
+    case VPInstruction::ExtractLastElement:
+    case VPInstruction::ExtractPenultimateElement:
       return false;
+    default:
+      return true;
     }
   };

@@ -66,19 +85,19 @@ inline bool isSingleScalar(const VPValue *VPV) {
     // lanes.
     if (RegionOfR && RegionOfR->isReplicator())
       return false;
-    return Rep->isSingleScalar() || (PreservesUniformity(Rep->getOpcode()) &&
+    return Rep->isSingleScalar() || (PreservesUniformity(Rep) &&
                                      all_of(Rep->operands(), isSingleScalar));
  }

  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe>(VPV))
    return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar);
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
-    return PreservesUniformity(WidenR->getOpcode()) &&
+    return PreservesUniformity(WidenR) &&
           all_of(WidenR->operands(), isSingleScalar);
  }
  if (auto *VPI = dyn_cast<VPInstruction>(VPV))
    return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
-           (PreservesUniformity(VPI->getOpcode()) &&
+           (PreservesUniformity(VPI) &&
            all_of(VPI->operands(), isSingleScalar));

  // VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
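Note on the test updates that follow: the pattern the generalized PreservesUniformity check affects is easiest to see on a small input. Below is a minimal standalone sketch in the spirit of ld_div2_step1_start0_ind1 from uniform_across_vf_induction1.ll (updated further down); the function name and RUN line are illustrative, not taken from the patch. Because %val depends only on %iv / 2, it is the same for both lanes of a VF=2 vector iteration, so the load can stay scalar and be broadcast once, with the add widened afterwards, which is exactly what the updated CHECK lines below expect.

; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s

define void @ld_div2_sketch(ptr noalias %A, ptr noalias %B) {
entry:
  br label %loop

loop:                                             ; preds = %loop, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The load address only depends on iv/2, i.e. it is uniform across each
  ; 2-lane vector iteration.
  %div = udiv i64 %iv, 2
  %gep.A = getelementptr inbounds i64, ptr %A, i64 %div
  %val = load i64, ptr %gep.A, align 8
  %add = add nsw i64 %val, 42
  %gep.B = getelementptr inbounds i64, ptr %B, i64 %iv
  store i64 %add, ptr %gep.B, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 1000
  br i1 %ec, label %exit, label %loop

exit:                                             ; preds = %loop
  ret void
}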
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index 1c6a225f99f61..becf7177f6081 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -82,90 +82,56 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
 ; CHECK: [[PRED_UDIV_IF]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
 ; CHECK: [[PRED_UDIV_CONTINUE]]:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP3]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP2]], %[[PRED_UDIV_IF]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2:.*]]
 ; CHECK: [[PRED_UDIV_IF1]]:
 ; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP6]], i32 1
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
 ; CHECK: [[PRED_UDIV_CONTINUE2]]:
-; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i64> [ [[TMP4]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP7]], %[[PRED_UDIV_IF1]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_UDIV_IF3:.*]], label %[[PRED_UDIV_CONTINUE4:.*]]
 ; CHECK: [[PRED_UDIV_IF3]]:
 ; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP10]], i32 2
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE4]]
 ; CHECK: [[PRED_UDIV_CONTINUE4]]:
-; CHECK-NEXT: [[TMP49:%.*]] = phi <4 x i64> [ [[TMP8]], %[[PRED_UDIV_CONTINUE2]] ], [ [[TMP34]], %[[PRED_UDIV_IF3]] ]
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_UDIV_IF5:.*]], label %[[PRED_UDIV_CONTINUE6:.*]]
 ; CHECK: [[PRED_UDIV_IF5]]:
 ; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP14]], i32 3
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE6]]
 ; CHECK: [[PRED_UDIV_CONTINUE6]]:
-; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i64> [ [[TMP49]], %[[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], %[[PRED_UDIV_IF5]] ]
 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_UDIV_IF7:.*]], label %[[PRED_UDIV_CONTINUE8:.*]]
 ; CHECK: [[PRED_UDIV_IF7]]:
 ; CHECK-NEXT: [[TMP18:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP18]], i32 0
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE8]]
 ; CHECK: [[PRED_UDIV_CONTINUE8]]:
-; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ poison, %[[PRED_UDIV_CONTINUE6]] ], [ [[TMP19]], %[[PRED_UDIV_IF7]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ poison, %[[PRED_UDIV_CONTINUE6]] ], [ [[TMP18]], %[[PRED_UDIV_IF7]] ]
 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_UDIV_IF9:.*]], label %[[PRED_UDIV_CONTINUE10:.*]]
 ; CHECK: [[PRED_UDIV_IF9]]:
 ; CHECK-NEXT: [[TMP22:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP22]], i32 1
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE10]]
 ; CHECK: [[PRED_UDIV_CONTINUE10]]:
-; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i64> [ [[TMP20]], %[[PRED_UDIV_CONTINUE8]] ], [ [[TMP23]], %[[PRED_UDIV_IF9]] ]
 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
 ; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_UDIV_IF11:.*]], label %[[PRED_UDIV_CONTINUE12:.*]]
 ; CHECK: [[PRED_UDIV_IF11]]:
 ; CHECK-NEXT: [[TMP26:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP24]], i64 [[TMP26]], i32 2
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE12]]
 ; CHECK: [[PRED_UDIV_CONTINUE12]]:
-; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i64> [ [[TMP24]], %[[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], %[[PRED_UDIV_IF11]] ]
 ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
 ; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_UDIV_IF13:.*]], label %[[PRED_UDIV_CONTINUE14]]
 ; CHECK: [[PRED_UDIV_IF13]]:
 ; CHECK-NEXT: [[TMP30:%.*]] = udiv i64 99, [[D]]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> [[TMP28]], i64 [[TMP30]], i32 3
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE14]]
 ; CHECK: [[PRED_UDIV_CONTINUE14]]:
-; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i64> [ [[TMP28]], %[[PRED_UDIV_CONTINUE12]] ], [ [[TMP31]], %[[PRED_UDIV_IF13]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP16]]
-; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP32]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 2
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 1
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 2
+; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[C]], i64 0, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[C]], i64 0, i64 [[TMP15]]
 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 3
 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
-; CHECK-NEXT: store i16 0, ptr [[TMP11]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP36]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP38]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP40]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP42]], align 2
-; CHECK-NEXT: store i16 0, ptr [[TMP44]], align 2
 ; CHECK-NEXT: store i16 0, ptr [[TMP46]], align 2
 ; CHECK-NEXT: store i16 0, ptr [[TMP48]], align 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index add58758788f9..a5edb8a2a232d 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -578,39 +578,39 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
 ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2]], i64 [[TMP2]], i32 1
 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i64 0
 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1
 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP25]], i64 3
 ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
 ; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
 ; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8
 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
 ; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
-; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
 ; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
-; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
-; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
+; CHECK-NEXT: store i64 [[TMP30]], ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
+; CHECK-NEXT: store i64 [[TMP31]], ptr [[TMP21]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
 ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -905,27 +905,27 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
 ; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
-; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP8]], align 4
 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP6]], align 4
 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 ; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -994,20 +994,20 @@ define i32 @PR27626_1(ptr %p, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP19]], align 4
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1082,28 +1082,28 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
 ; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
-; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP8]], align 4
 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP6]], align 4
 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP7]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP10]], align 4
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
-; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP20]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
@@ -1169,35 +1169,35 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[TMP1]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP5]], i64 0
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i64 2
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP23]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP5]], i64 3
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP24]], i32 1
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
 ; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4
 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP25]], align 4
 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9]] = add nuw i64 [[TMP2]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
@@ -1279,19 +1279,19 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP5]], i64 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP11]], i64 24
-; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP9]], i64 24
+; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[X]], ptr [[TMP6]], align 4
 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP8]], align 4
-; CHECK-NEXT: store i32 [[X]], ptr [[TMP9]], align 4
-; CHECK-NEXT: store i32 [[X]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[X]], ptr [[TMP11]], align 4
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
@@ -1363,44 +1363,44 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3)
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3)
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 12
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP33]], i64 20
 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP34]], i64 28
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP34]], i64 20
 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 36
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i64 36
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP6]], i64 1
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP6]], i64 2
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP6]], i64 3
 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
-; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP15]], align 4
-; CHECK-NEXT: store i32 [[X]], ptr [[TMP17]], align 4
-; CHECK-NEXT: store i32 [[X]], ptr [[TMP19]], align 4
-; CHECK-NEXT: store i32 [[X]], ptr [[TMP21]], align 4
-; CHECK-NEXT: store i32 [[Y:%.*]], ptr [[TMP23]], align 4
-; CHECK-NEXT: store i32 [[Y]], ptr [[TMP25]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]]
+; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP19]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP21]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[TMP23]], align 4
+; CHECK-NEXT: store i32 [[Y:%.*]], ptr [[TMP11]], align 4
 ; CHECK-NEXT: store i32 [[Y]], ptr [[TMP27]], align 4
 ; CHECK-NEXT: store i32 [[Y]], ptr [[TMP29]], align 4
-; CHECK-NEXT: store i32 [[Z:%.*]], ptr [[TMP9]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP10]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP11]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store i32 [[Y]], ptr [[TMP36]], align 4
+; CHECK-NEXT: store i32 [[Z1:%.*]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store i32 [[Z1]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store i32 [[Z1]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store i32 [[Z1]], ptr [[TMP15]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1418,9 +1418,9 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i8, ptr [[TMP31]], i64 -4
 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]]
 ; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i8, ptr [[TMP32]], i64 -12
-; CHECK-NEXT: store i32 [[X]], ptr [[A_I_MINUS_1]], align 4
+; CHECK-NEXT: store i32 [[Z]], ptr [[A_I_MINUS_1]], align 4
 ; CHECK-NEXT: store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4
-; CHECK-NEXT: store i32 [[Z]], ptr [[A_I]], align 4
+; CHECK-NEXT: store i32 [[Z1]], ptr [[A_I]], align 4
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
index 3cf8b3f4bf2b7..9457893b23e01 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
@@ -12,12 +12,12 @@ define i32 @test(ptr %vf1, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 -56)
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[N]], align 16
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK: [[PRED_STORE_IF]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[N]], align 16
 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP3]], align 8
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
 ; CHECK: [[PRED_STORE_CONTINUE]]:
@@ -26,8 +26,7 @@ define i32 @test(ptr %vf1, i64 %n) {
 ; CHECK: [[PRED_STORE_IF1]]:
 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = alloca i8, i64 [[N]], align 16
-; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP7]], align 8
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
@@ -35,8 +34,7 @@ define i32 @test(ptr %vf1, i64 %n) {
 ; CHECK: [[PRED_STORE_IF3]]:
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = alloca i8, i64 [[N]], align 16
-; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP11]], align 8
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK: [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
@@ -44,8 +42,7 @@ define i32 @test(ptr %vf1, i64 %n) {
 ; CHECK: [[PRED_STORE_IF5]]:
 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = alloca i8, i64 [[N]], align 16
-; CHECK-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8
+; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK: [[PRED_STORE_CONTINUE6]]:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 82f2fdd431238..cc6d70331e69d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -57,11 +57,11 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index efd9f8bea3a2c..0775ef9b8de75 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -57,11 +57,11 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: [[TMP0:%.*]] = and i64 [[INDEX]], -2
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 61f511c16e88b..ab31abd970814 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -240,11 +240,11 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP3]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index e412d130e115f..93e7581a8ba3d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -79,11 +79,11 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 1
 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
 ; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -159,11 +159,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
 ; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -182,11 +182,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
 ; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF4-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF4-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
+; VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0
 ; VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP4]], align 8
 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF4-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index ef6ce08da5230..1d140ffc14d14 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -115,11 +115,11 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]]
 ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
 ; VF2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
 ; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP6]], align 8
 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 4a4bda254bf88..57165964c7bbe 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -908,6 +908,27 @@ define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) {
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: Successor(s): pred.load
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load: {
+; CHECK-NEXT: pred.load.entry:
+; CHECK-NEXT: BRANCH-ON-MASK ir<%c>
+; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load.if:
+; CHECK-NEXT: REPLICATE ir<%l1> = load ir<%src>
+; CHECK-NEXT: Successor(s): pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.load.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.*]]> = ir<%l1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): loop.then.0
+; CHECK-EMPTY:
+; CHECK-NEXT: loop.then.0:
+; CHECK-NEXT: CLONE ir<%l2> = trunc vp<[[PRED]]>
+; CHECK-NEXT: CLONE ir<%cmp> = icmp eq vp<[[PRED]]>, ir<0>
+; CHECK-NEXT: CLONE ir<%sel> = select ir<%cmp>, ir<5>, ir<%l2>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store: {
@@ -916,10 +937,6 @@ define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) {
 ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: REPLICATE ir<%l1> = load ir<%src>
-; CHECK-NEXT: REPLICATE ir<%l2> = trunc ir<%l1>
-; CHECK-NEXT: REPLICATE ir<%cmp> = icmp eq ir<%l1>, ir<0>
-; CHECK-NEXT: REPLICATE ir<%sel> = select ir<%cmp>, ir<5>, ir<%l2>
 ; CHECK-NEXT: REPLICATE store ir<%sel>, ir<%dst>
 ; CHECK-NEXT: Successor(s): pred.store.continue
 ; CHECK-EMPTY:
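For reference, the scalar loop behind the VPlan printed above has roughly the following shape; this is a sketch reconstructed from the printed recipes (ir<%l1>, ir<%l2> = trunc, ir<%cmp>, ir<%sel>), and the integer types and trip count are assumptions, not copied from the test file. It makes the new output easier to read: only the load %l1 still needs a predicated replicate region, while %l2, %cmp and %sel are single scalars that can now be cloned unpredicated in loop.then.0, leaving just the store inside pred.store.

define void @update_multiple_users(ptr noalias %src, ptr noalias %dst, i1 %c) {
entry:
  br label %loop.header

loop.header:                                      ; preds = %loop.latch, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
  br i1 %c, label %loop.then, label %loop.latch

loop.then:                                        ; preds = %loop.header
  ; %l1 has multiple users (%l2 and %cmp); i16 -> i8 types are assumed.
  %l1 = load i16, ptr %src, align 2
  %l2 = trunc i16 %l1 to i8
  %cmp = icmp eq i16 %l1, 0
  %sel = select i1 %cmp, i8 5, i8 %l2
  store i8 %sel, ptr %dst, align 1
  br label %loop.latch

loop.latch:                                       ; preds = %loop.then, %loop.header
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 1000
  br i1 %ec, label %exit, label %loop.header

exit:                                             ; preds = %loop.latch
  ret void
}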