From 817f8e96ff02d29deb142a678eabca1f318aa9b8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 23 Apr 2025 17:07:04 +0200 Subject: [PATCH 1/2] [LICM] Do not reassociate constant offset GEP LICM tries to reassociate GEPs in order to hoist an invariant GEP. Currently, it also does this in the case where the GEP has a constant offset. This is usually undesirable. From a back-end perspective, constant GEPs are usually free because they can be folded into addressing modes, so this just increases register pressure. From a middle-end perspective, keeping constant offsets last in the chain makes it easier to analyze the relationship between multiple GEPs on the same base. The worst that can happen here is if we start with something like ``` loop { p + 4*x p + 4*x + 1 p + 4*x + 2 p + 4*x + 3 } ``` And LICM converts it into: ``` p.1 = p + 1 p.2 = p + 2 p.3 = p + 3 loop { p + 4*x p.1 + 4*x p.2 + 4*x p.3 + 4*x } ``` Which is much worse than leaving it for CSE to convert to: ``` loop { p2 = p + 4*x p2 + 1 p2 + 2 p2 + 3 } ``` --- llvm/lib/Transforms/Scalar/LICM.cpp | 6 + .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 16 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 12 +- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 36 +- .../CodeGen/PowerPC/more-dq-form-prepare.ll | 347 +++++++++--------- .../no-ctr-loop-if-exit-in-nested-loop.ll | 5 +- llvm/test/Transforms/LICM/gep-reassociate.ll | 53 ++- .../LoopVectorize/X86/float-induction-x86.ll | 60 +-- .../Transforms/PhaseOrdering/X86/pr88239.ll | 4 +- 9 files changed, 291 insertions(+), 248 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 68094c354cf46..5197e2a7b7d43 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2517,6 +2517,12 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!L.isLoopInvariant(SrcPtr) || !all_of(GEP->indices(), LoopInvariant)) return false; + // Do not try to hoist a 
constant GEP out of the loop via reassociation. + // Constant GEPs can often be folded into addressing modes, and reassociating + // them may inhibit CSE of a common base. + if (GEP->hasAllConstantIndices()) + return false; + // This can only happen if !AllowSpeculation, otherwise this would already be // handled. // FIXME: Should we respect AllowSpeculation in these reassociation folds? diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 22ebb55826043..702a69f776de3 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -400,9 +400,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-NEXT: s_wait_alu 0xf1ff @@ -438,9 +438,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 ; 
GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff @@ -531,9 +531,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-NEXT: s_wait_alu 0xf1ff @@ -569,9 +569,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0 +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1 +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index be020457ce87d..4c0ab91b7d622 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -6982,7 +6982,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 ; CHECK-NEXT: s_cmp_lg_u64 
s[4:5], 0x800 ; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 -; CHECK-NEXT: .LBB6_3: ; %Flow9 +; CHECK-NEXT: .LBB6_3: ; %Flow7 ; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader @@ -7048,7 +7048,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; CHECK-NEXT: s_cbranch_scc0 .LBB6_5 -; CHECK-NEXT: .LBB6_6: ; %Flow10 +; CHECK-NEXT: .LBB6_6: ; %Flow8 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -7689,7 +7689,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_2 -; ALIGNED-NEXT: .LBB6_3: ; %Flow9 +; ALIGNED-NEXT: .LBB6_3: ; %Flow7 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader @@ -8316,7 +8316,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 -; ALIGNED-NEXT: .LBB6_6: ; %Flow10 +; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 @@ -8369,7 +8369,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 ; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB6_4: ; %Flow7 +; UNROLL3-NEXT: .LBB6_4: ; %Flow5 ; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 ; UNROLL3-NEXT: s_cbranch_execz 
.LBB6_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual @@ -8403,7 +8403,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_6 -; UNROLL3-NEXT: .LBB6_7: ; %Flow8 +; UNROLL3-NEXT: .LBB6_7: ; %Flow6 ; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 272daa9dd0b59..dd5c247f6ef35 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -460,10 +460,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB3_3 -; CHECK-NEXT: ; %bb.1: ; %Flow34 +; CHECK-NEXT: ; %bb.1: ; %Flow36 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB3_10 -; CHECK-NEXT: .LBB3_2: ; %Flow35 +; CHECK-NEXT: .LBB3_2: ; %Flow37 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -494,7 +494,7 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB3_5 -; CHECK-NEXT: .LBB3_6: ; %Flow29 +; CHECK-NEXT: .LBB3_6: ; %Flow31 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB3_9 @@ -520,7 +520,7 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB3_8 -; CHECK-NEXT: .LBB3_9: ; %Flow27 +; 
CHECK-NEXT: .LBB3_9: ; %Flow29 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -556,7 +556,7 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB3_12 -; CHECK-NEXT: .LBB3_13: ; %Flow33 +; CHECK-NEXT: .LBB3_13: ; %Flow35 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB3_16 @@ -584,7 +584,7 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB3_15 -; CHECK-NEXT: .LBB3_16: ; %Flow31 +; CHECK-NEXT: .LBB3_16: ; %Flow33 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -907,10 +907,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB6_3 -; CHECK-NEXT: ; %bb.1: ; %Flow41 +; CHECK-NEXT: ; %bb.1: ; %Flow39 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_10 -; CHECK-NEXT: .LBB6_2: ; %Flow42 +; CHECK-NEXT: .LBB6_2: ; %Flow40 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB6_3: ; %memmove_copy_forward @@ -940,7 +940,7 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB6_5 -; CHECK-NEXT: .LBB6_6: ; %Flow36 +; CHECK-NEXT: .LBB6_6: ; %Flow34 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_and_saveexec_b32 s8, 
s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_9 @@ -966,11 +966,11 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB6_8 -; CHECK-NEXT: .LBB6_9: ; %Flow34 +; CHECK-NEXT: .LBB6_9: ; %Flow32 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 @@ -1002,15 +1002,15 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB6_12 -; CHECK-NEXT: .LBB6_13: ; %Flow40 +; CHECK-NEXT: .LBB6_13: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB6_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader -; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop @@ -1030,7 +1030,7 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_15 -; CHECK-NEXT: .LBB6_16: ; %Flow38 +; CHECK-NEXT: .LBB6_16: ; %Flow36 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; 
CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1181,8 +1181,8 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .LBB8_9: ; %Flow31 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 @@ -1219,10 +1219,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB8_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader -; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll index 9f62477ae01df..af0942e99182d 100644 --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -56,155 +56,153 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: .cfi_offset v29, -240 ; CHECK-NEXT: .cfi_offset v30, -224 ; CHECK-NEXT: .cfi_offset v31, -208 +; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 2, 728(1) +; CHECK-NEXT: ld 14, 688(1) +; CHECK-NEXT: ld 11, 704(1) +; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 21, 5 +; CHECK-NEXT: lwa 5, 0(7) +; CHECK-NEXT: ld 7, 720(1) ; CHECK-NEXT: std 22, 464(1) # 8-byte Folded Spill ; CHECK-NEXT: std 23, 472(1) # 8-byte Folded Spill -; CHECK-NEXT: mr 22, 5 -; CHECK-NEXT: ld 5, 848(1) +; CHECK-NEXT: mr 22, 6 +; CHECK-NEXT: ld 6, 848(1) ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: mr 11, 7 -; CHECK-NEXT: ld 23, 688(1) -; CHECK-NEXT: ld 7, 728(1) +; CHECK-NEXT: ld 15, 736(1) ; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill ; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill -; CHECK-NEXT: mr 18, 6 -; CHECK-NEXT: li 6, 9 ; CHECK-NEXT: ld 19, 768(1) -; CHECK-NEXT: ld 2, 760(1) -; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill -; CHECK-NEXT: cmpldi 3, 9 -; CHECK-NEXT: ld 27, 816(1) -; CHECK-NEXT: ld 26, 808(1) -; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill -; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 15, 736(1) -; CHECK-NEXT: lxv 39, 0(8) +; CHECK-NEXT: ld 18, 760(1) ; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill ; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 30, 704(1) -; CHECK-NEXT: lxv 38, 0(9) -; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 21, 784(1) +; CHECK-NEXT: ld 12, 696(1) +; CHECK-NEXT: lxv 0, 0(9) +; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 1, 0(8) +; CHECK-NEXT: cmpldi 3, 9 +; CHECK-NEXT: ld 30, 824(1) +; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill +; 
CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 29, 840(1) +; CHECK-NEXT: ld 28, 832(1) +; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 23, 784(1) ; CHECK-NEXT: ld 20, 776(1) ; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill ; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill -; CHECK-NEXT: iselgt 3, 3, 6 -; CHECK-NEXT: ld 6, 720(1) +; CHECK-NEXT: ld 25, 800(1) ; CHECK-NEXT: ld 24, 792(1) -; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill -; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 27, 816(1) +; CHECK-NEXT: ld 26, 808(1) +; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 17, 752(1) +; CHECK-NEXT: extswsli 9, 5, 3 +; CHECK-NEXT: lxv 4, 0(14) +; CHECK-NEXT: std 14, 32(1) # 8-byte Folded Spill +; CHECK-NEXT: std 12, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 0, 5, 40 +; CHECK-NEXT: sldi 14, 5, 5 +; CHECK-NEXT: mulli 31, 5, 24 +; CHECK-NEXT: lxv 38, 0(2) +; CHECK-NEXT: lxv 2, 0(11) +; CHECK-NEXT: std 2, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 2, 5, 48 +; CHECK-NEXT: sldi 5, 5, 4 +; CHECK-NEXT: ld 16, 744(1) +; CHECK-NEXT: lxv 5, 0(10) +; CHECK-NEXT: std 6, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 6, 712(1) +; CHECK-NEXT: mr 10, 7 +; CHECK-NEXT: add 7, 14, 21 +; CHECK-NEXT: lxv 13, 0(19) +; CHECK-NEXT: std 8, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: std 6, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 8, 11 +; CHECK-NEXT: li 11, 9 +; CHECK-NEXT: iselgt 3, 3, 11 ; CHECK-NEXT: addi 3, 3, -2 -; CHECK-NEXT: lxv 6, 0(19) -; CHECK-NEXT: lxv 11, 0(7) -; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill -; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: 
std 6, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 840(1) -; CHECK-NEXT: lxv 12, 0(6) -; CHECK-NEXT: rldicl 12, 3, 61, 3 +; CHECK-NEXT: rldicl 11, 3, 61, 3 +; CHECK-NEXT: lxv 3, 0(12) +; CHECK-NEXT: lxv 40, 0(6) +; CHECK-NEXT: std 18, 112(1) # 8-byte Folded Spill ; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: add 19, 21, 5 +; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: lxv 39, 0(10) +; CHECK-NEXT: addi 3, 7, 32 +; CHECK-NEXT: add 12, 31, 21 ; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 4, 0(21) -; CHECK-NEXT: ld 25, 800(1) -; CHECK-NEXT: lxv 33, 0(10) -; CHECK-NEXT: lxv 32, 0(23) -; CHECK-NEXT: lxv 36, 0(30) -; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill -; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 17, 752(1) -; CHECK-NEXT: ld 16, 744(1) -; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 29, 712(1) -; CHECK-NEXT: ld 28, 696(1) -; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill -; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 37, 0(28) -; CHECK-NEXT: lxv 13, 0(29) -; CHECK-NEXT: mr 8, 29 -; CHECK-NEXT: mr 9, 30 -; CHECK-NEXT: mr 10, 28 -; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 33, 0(15) +; CHECK-NEXT: lxv 32, 0(16) ; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 10, 0(15) -; CHECK-NEXT: lxv 9, 0(16) -; CHECK-NEXT: li 28, 1 -; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 8, 0(17) -; CHECK-NEXT: lxv 7, 0(2) +; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 37, 0(17) +; CHECK-NEXT: lxv 36, 0(18) +; CHECK-NEXT: std 30, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 184(1) # 8-byte 
Folded Spill +; CHECK-NEXT: lxv 12, 0(20) +; CHECK-NEXT: lxv 11, 0(23) +; CHECK-NEXT: add 20, 21, 9 ; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 5, 0(20) -; CHECK-NEXT: lxv 3, 0(24) +; CHECK-NEXT: lxv 10, 0(24) +; CHECK-NEXT: lxv 9, 0(25) ; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 2, 0(25) -; CHECK-NEXT: lxv 1, 0(26) +; CHECK-NEXT: lxv 8, 0(26) +; CHECK-NEXT: lxv 7, 0(27) +; CHECK-NEXT: addi 12, 12, 32 +; CHECK-NEXT: li 27, 0 +; CHECK-NEXT: mr 26, 21 ; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill -; CHECK-NEXT: lxv 0, 0(27) +; CHECK-NEXT: lxv 6, 0(30) +; CHECK-NEXT: lxv 41, 0(28) +; CHECK-NEXT: addi 7, 11, 1 +; CHECK-NEXT: add 11, 0, 21 +; CHECK-NEXT: li 28, 1 ; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill +; CHECK-NEXT: lxv 43, 0(29) +; CHECK-NEXT: lxv 42, 0(5) ; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill +; CHECK-NEXT: addi 11, 11, 32 ; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill -; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 832(1) ; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 62, 368(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill -; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill ; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill ; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill -; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 824(1) -; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill -; CHECK-NEXT: lwa 5, 0(11) -; 
CHECK-NEXT: li 27, 0 -; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: mulli 6, 5, 40 -; CHECK-NEXT: sldi 0, 5, 4 -; CHECK-NEXT: extswsli 14, 5, 3 -; CHECK-NEXT: lxv 40, 0(7) -; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: add 31, 14, 22 -; CHECK-NEXT: add 11, 0, 22 -; CHECK-NEXT: mr 26, 22 -; CHECK-NEXT: addi 3, 11, 32 -; CHECK-NEXT: addi 11, 12, 1 -; CHECK-NEXT: mulli 12, 5, 48 -; CHECK-NEXT: addi 31, 31, 32 -; CHECK-NEXT: add 19, 22, 6 -; CHECK-NEXT: sldi 6, 5, 5 -; CHECK-NEXT: mulli 5, 5, 24 -; CHECK-NEXT: lxv 41, 0(7) -; CHECK-NEXT: add 20, 22, 6 -; CHECK-NEXT: add 21, 22, 5 -; CHECK-NEXT: ld 5, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 43, 0(5) -; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 42, 0(5) +; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_4 Depth 2 -; CHECK-NEXT: maddld 5, 12, 27, 0 -; CHECK-NEXT: mr 6, 18 -; CHECK-NEXT: mr 29, 21 +; CHECK-NEXT: maddld 5, 2, 27, 0 +; CHECK-NEXT: mr 6, 22 ; CHECK-NEXT: mr 30, 20 -; CHECK-NEXT: mr 2, 19 -; CHECK-NEXT: mtctr 11 -; CHECK-NEXT: add 25, 22, 5 -; CHECK-NEXT: maddld 5, 12, 27, 14 -; CHECK-NEXT: add 24, 22, 5 +; CHECK-NEXT: mr 29, 19 +; CHECK-NEXT: mtctr 7 +; CHECK-NEXT: add 25, 21, 5 +; CHECK-NEXT: maddld 5, 2, 27, 14 +; CHECK-NEXT: add 24, 21, 5 +; CHECK-NEXT: maddld 5, 2, 27, 31 +; CHECK-NEXT: add 23, 21, 5 ; CHECK-NEXT: mr 5, 26 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ @@ -212,66 +210,66 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: lxvp 34, 0(6) ; CHECK-NEXT: lxvp 44, 0(5) -; CHECK-NEXT: xvmaddadp 39, 45, 35 -; CHECK-NEXT: lxvp 46, 0(24) -; CHECK-NEXT: xvmaddadp 38, 47, 35 -; CHECK-NEXT: lxvp 48, 0(25) -; CHECK-NEXT: lxvp 50, 0(29) -; CHECK-NEXT: lxvp 62, 0(30) -; CHECK-NEXT: lxvp 60, 0(2) +; CHECK-NEXT: xvmaddadp 1, 45, 35 +; CHECK-NEXT: lxvp 46, 0(30) +; CHECK-NEXT: xvmaddadp 0, 47, 35 +; CHECK-NEXT: lxvp 48, 0(29) +; CHECK-NEXT: lxvp 50, 0(23) +; CHECK-NEXT: lxvp 62, 0(24) +; CHECK-NEXT: lxvp 60, 0(25) ; CHECK-NEXT: lxvp 58, 32(6) ; CHECK-NEXT: lxvp 56, 32(5) -; CHECK-NEXT: lxvp 54, 32(24) -; CHECK-NEXT: lxvp 52, 32(25) -; CHECK-NEXT: lxvp 30, 32(29) -; CHECK-NEXT: lxvp 28, 32(30) -; CHECK-NEXT: lxvp 26, 32(2) -; CHECK-NEXT: xvmaddadp 33, 49, 35 -; CHECK-NEXT: xvmaddadp 32, 51, 35 -; CHECK-NEXT: xvmaddadp 37, 63, 35 -; CHECK-NEXT: xvmaddadp 36, 61, 35 -; CHECK-NEXT: xvmaddadp 13, 44, 34 -; CHECK-NEXT: xvmaddadp 12, 46, 34 -; CHECK-NEXT: xvmaddadp 11, 48, 34 -; CHECK-NEXT: xvmaddadp 10, 50, 34 -; CHECK-NEXT: xvmaddadp 9, 62, 34 -; CHECK-NEXT: xvmaddadp 8, 60, 34 -; CHECK-NEXT: xvmaddadp 7, 57, 59 -; CHECK-NEXT: xvmaddadp 6, 55, 59 -; CHECK-NEXT: xvmaddadp 5, 53, 59 -; CHECK-NEXT: xvmaddadp 4, 31, 59 -; CHECK-NEXT: xvmaddadp 3, 29, 59 -; CHECK-NEXT: xvmaddadp 2, 27, 59 -; CHECK-NEXT: xvmaddadp 1, 56, 58 -; CHECK-NEXT: xvmaddadp 0, 54, 58 -; CHECK-NEXT: xvmaddadp 40, 52, 58 +; CHECK-NEXT: lxvp 54, 32(30) +; CHECK-NEXT: lxvp 52, 32(29) +; CHECK-NEXT: lxvp 30, 32(23) +; CHECK-NEXT: lxvp 28, 32(24) +; CHECK-NEXT: lxvp 26, 32(25) +; CHECK-NEXT: xvmaddadp 5, 49, 35 +; CHECK-NEXT: xvmaddadp 4, 51, 35 +; CHECK-NEXT: xvmaddadp 3, 63, 35 +; CHECK-NEXT: xvmaddadp 2, 61, 35 +; CHECK-NEXT: xvmaddadp 40, 44, 34 +; CHECK-NEXT: xvmaddadp 39, 46, 34 +; CHECK-NEXT: xvmaddadp 38, 48, 34 +; CHECK-NEXT: xvmaddadp 33, 50, 34 +; CHECK-NEXT: xvmaddadp 32, 62, 34 +; CHECK-NEXT: xvmaddadp 37, 60, 34 +; CHECK-NEXT: xvmaddadp 36, 57, 59 +; CHECK-NEXT: 
xvmaddadp 13, 55, 59 +; CHECK-NEXT: xvmaddadp 12, 53, 59 +; CHECK-NEXT: xvmaddadp 11, 31, 59 +; CHECK-NEXT: xvmaddadp 10, 29, 59 +; CHECK-NEXT: xvmaddadp 9, 27, 59 +; CHECK-NEXT: xvmaddadp 8, 56, 58 +; CHECK-NEXT: xvmaddadp 7, 54, 58 +; CHECK-NEXT: xvmaddadp 6, 52, 58 ; CHECK-NEXT: xvmaddadp 41, 30, 58 ; CHECK-NEXT: xvmaddadp 43, 28, 58 ; CHECK-NEXT: xvmaddadp 42, 26, 58 ; CHECK-NEXT: addi 6, 6, 64 ; CHECK-NEXT: addi 5, 5, 64 +; CHECK-NEXT: addi 30, 30, 64 +; CHECK-NEXT: addi 29, 29, 64 +; CHECK-NEXT: addi 23, 23, 64 ; CHECK-NEXT: addi 24, 24, 64 ; CHECK-NEXT: addi 25, 25, 64 -; CHECK-NEXT: addi 29, 29, 64 -; CHECK-NEXT: addi 30, 30, 64 -; CHECK-NEXT: addi 2, 2, 64 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ ; CHECK-NEXT: # ; CHECK-NEXT: addi 28, 28, 6 -; CHECK-NEXT: add 26, 26, 12 -; CHECK-NEXT: add 31, 31, 12 -; CHECK-NEXT: add 19, 19, 12 -; CHECK-NEXT: add 3, 3, 12 -; CHECK-NEXT: add 20, 20, 12 -; CHECK-NEXT: add 21, 21, 12 +; CHECK-NEXT: add 26, 26, 2 +; CHECK-NEXT: add 20, 20, 2 +; CHECK-NEXT: add 11, 11, 2 +; CHECK-NEXT: add 19, 19, 2 +; CHECK-NEXT: add 3, 3, 2 +; CHECK-NEXT: add 12, 12, 2 ; CHECK-NEXT: addi 27, 27, 1 ; CHECK-NEXT: cmpld 28, 4 ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit -; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload ; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: stxv 1, 0(3) ; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload ; CHECK-NEXT: lxv 62, 368(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 61, 352(1) # 16-byte Folded Reload @@ -284,7 +282,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: lxv 54, 240(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 53, 224(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 52, 208(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 31, 584(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 30, 576(1) # 8-byte Folded Reload @@ -297,8 +295,8 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: ld 29, 520(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 28, 512(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 27, 504(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) -; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 5, 0(3) +; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 26, 496(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 25, 488(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 24, 480(1) # 8-byte Folded Reload @@ -310,40 +308,41 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. 
; CHECK-NEXT: ld 18, 432(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 17, 424(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(3) -; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(10) -; CHECK-NEXT: stxv 36, 0(9) -; CHECK-NEXT: stxv 13, 0(8) +; CHECK-NEXT: stxv 4, 0(3) +; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 12, 0(3) +; CHECK-NEXT: stxv 3, 0(3) +; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 2, 0(8) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 11, 0(3) +; CHECK-NEXT: stxv 39, 0(10) +; CHECK-NEXT: stxv 38, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: stxv 32, 0(3) ; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: stxv 36, 0(3) ; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: stxv 13, 0(3) ; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 5, 0(3) +; CHECK-NEXT: stxv 12, 0(3) ; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 4, 0(3) +; CHECK-NEXT: stxv 11, 0(3) ; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 3, 0(3) +; CHECK-NEXT: stxv 10, 0(3) ; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 2, 0(3) +; CHECK-NEXT: stxv 9, 0(3) ; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: stxv 8, 0(3) ; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 0, 0(3) +; CHECK-NEXT: stxv 7, 0(3) ; CHECK-NEXT: ld 3, 
176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: stxv 6, 0(3) ; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll index 799ba63a4df27..8fb4c21316b48 100644 --- a/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll +++ b/llvm/test/CodeGen/PowerPC/no-ctr-loop-if-exit-in-nested-loop.ll @@ -40,9 +40,10 @@ define signext i32 @test(ptr noalias %PtrA, ptr noalias %PtrB, i32 signext %LenA ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: # %if.end9 ; CHECK-NEXT: # -; CHECK-NEXT: lwzx 10, 6, 9 +; CHECK-NEXT: add 9, 3, 9 +; CHECK-NEXT: lwz 10, 4(9) ; CHECK-NEXT: addi 10, 10, 1 -; CHECK-NEXT: stwx 10, 6, 9 +; CHECK-NEXT: stw 10, 4(9) ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_5: # %if.then ; CHECK-NEXT: lwax 3, 9, 3 diff --git a/llvm/test/Transforms/LICM/gep-reassociate.ll b/llvm/test/Transforms/LICM/gep-reassociate.ll index 630a751999c49..0090c76b09dda 100644 --- a/llvm/test/Transforms/LICM/gep-reassociate.ll +++ b/llvm/test/Transforms/LICM/gep-reassociate.ll @@ -39,11 +39,13 @@ exit: ret void } -define void @both_inbounds_one_neg(ptr %ptr, i1 %c) { +define void @both_inbounds_one_neg(ptr %ptr, i1 %c, i64 %neg) { ; CHECK-LABEL: define void @both_inbounds_one_neg -; CHECK-SAME: (ptr [[PTR:%.*]], i1 [[C:%.*]]) { +; CHECK-SAME: (ptr [[PTR:%.*]], i1 [[C:%.*]], i64 [[NEG:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 -1 +; CHECK-NEXT: [[IS_NEG:%.*]] = icmp slt i64 [[NEG]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[IS_NEG]]) +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[NEG]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @get.i32() @@ -55,13 +57,15 @@ define void @both_inbounds_one_neg(ptr %ptr, i1 
%c) { ; CHECK-NEXT: ret void ; entry: + %is.neg = icmp slt i64 %neg, 0 + call void @llvm.assume(i1 %is.neg) br label %loop loop: %val = call i32 @get.i32() %val.ext = zext i32 %val to i64 %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 %val.ext - %ptr3 = getelementptr i8, ptr %ptr2, i64 -1 + %ptr3 = getelementptr i8, ptr %ptr2, i64 %neg call void @use(ptr %ptr3) br i1 %c, label %loop, label %exit @@ -69,11 +73,13 @@ exit: ret void } -define void @both_inbounds_pos(ptr %ptr, i1 %c) { +define void @both_inbounds_pos(ptr %ptr, i1 %c, i64 %nonneg) { ; CHECK-LABEL: define void @both_inbounds_pos -; CHECK-SAME: (ptr [[PTR:%.*]], i1 [[C:%.*]]) { +; CHECK-SAME: (ptr [[PTR:%.*]], i1 [[C:%.*]], i64 [[NONNEG:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 +; CHECK-NEXT: [[IS_NONNEG:%.*]] = icmp sge i64 [[NONNEG]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[IS_NONNEG]]) +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[NONNEG]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @get.i32() @@ -85,13 +91,15 @@ define void @both_inbounds_pos(ptr %ptr, i1 %c) { ; CHECK-NEXT: ret void ; entry: + %is.nonneg = icmp sge i64 %nonneg, 0 + call void @llvm.assume(i1 %is.nonneg) br label %loop loop: %val = call i32 @get.i32() %val.ext = zext i32 %val to i64 %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 %val.ext - %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 + %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 %nonneg call void @use(ptr %ptr3) br i1 %c, label %loop, label %exit @@ -440,3 +448,32 @@ latch: exit: ret void } + +; Do not reassociate constant offset GEP. 
+define void @constant_offset(ptr %ptr, i1 %c) { +; CHECK-LABEL: define void @constant_offset +; CHECK-SAME: (ptr [[PTR:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[VAL:%.*]] = call i64 @get.i64() +; CHECK-NEXT: [[GEP_BASE:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[VAL]] +; CHECK-NEXT: [[GEP_OFF:%.*]] = getelementptr i8, ptr [[GEP_BASE]], i64 1 +; CHECK-NEXT: call void @use(ptr [[GEP_OFF]]) +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %val = call i64 @get.i64() + %gep.base = getelementptr i8, ptr %ptr, i64 %val + %gep.off = getelementptr i8, ptr %gep.base, i64 1 + call void @use(ptr %gep.off) + br i1 %c, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index e89f41bb94665..97b5210b22f92 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -142,40 +142,40 @@ define void @fp_iv_loop2(ptr noalias nocapture %A, i32 %N) { ; AUTO_VEC-NEXT: br i1 [[TMP0]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]] ; AUTO_VEC: for.body.preheader.new: ; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[ZEXT]], 2147483640 -; AUTO_VEC-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 4 -; AUTO_VEC-NEXT: [[INVARIANT_GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 -; AUTO_VEC-NEXT: [[INVARIANT_GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 -; AUTO_VEC-NEXT: [[INVARIANT_GEP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 -; AUTO_VEC-NEXT: [[INVARIANT_GEP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 20 -; AUTO_VEC-NEXT: [[INVARIANT_GEP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 24 -; AUTO_VEC-NEXT: 
[[INVARIANT_GEP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 28 ; AUTO_VEC-NEXT: br label [[FOR_BODY:%.*]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER_NEW]] ], [ [[CONV1_7:%.*]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[FOR_BODY]] ] -; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; AUTO_VEC-NEXT: store float [[X_06]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1:%.*]] = fadd float [[X_06]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4 ; AUTO_VEC-NEXT: store float [[CONV1]], ptr [[ARRAYIDX_1]], align 4 ; AUTO_VEC-NEXT: [[CONV1_1:%.*]] = fadd float [[CONV1]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP1]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8 ; AUTO_VEC-NEXT: store float [[CONV1_1]], ptr [[ARRAYIDX_2]], align 4 ; AUTO_VEC-NEXT: [[CONV1_2:%.*]] = fadd float [[CONV1_1]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP3]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr 
[[TMP3]], i64 12 ; AUTO_VEC-NEXT: store float [[CONV1_2]], ptr [[ARRAYIDX_3]], align 4 ; AUTO_VEC-NEXT: [[CONV1_3:%.*]] = fadd float [[CONV1_2]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP5]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16 ; AUTO_VEC-NEXT: store float [[CONV1_3]], ptr [[ARRAYIDX_4]], align 4 ; AUTO_VEC-NEXT: [[CONV1_4:%.*]] = fadd float [[CONV1_3]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP7]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 20 ; AUTO_VEC-NEXT: store float [[CONV1_4]], ptr [[ARRAYIDX_5]], align 4 ; AUTO_VEC-NEXT: [[CONV1_5:%.*]] = fadd float [[CONV1_4]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP9]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 24 ; AUTO_VEC-NEXT: store float [[CONV1_5]], ptr [[ARRAYIDX_6]], align 4 ; AUTO_VEC-NEXT: [[CONV1_6:%.*]] = fadd float [[CONV1_5]], 5.000000e-01 -; AUTO_VEC-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, ptr [[INVARIANT_GEP11]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; AUTO_VEC-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 28 ; AUTO_VEC-NEXT: store float [[CONV1_6]], ptr [[ARRAYIDX_7]], align 4 ; AUTO_VEC-NEXT: [[CONV1_7]] = fadd float [[CONV1_6]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 
@@ -299,40 +299,40 @@ define double @external_use_without_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: br i1 [[TMP0]], label [[FOR_END_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] ; AUTO_VEC: entry.new: ; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[SMAX]], 9223372036854775800 -; AUTO_VEC-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8 -; AUTO_VEC-NEXT: [[INVARIANT_GEP2:%.*]] = getelementptr i8, ptr [[A]], i64 16 -; AUTO_VEC-NEXT: [[INVARIANT_GEP4:%.*]] = getelementptr i8, ptr [[A]], i64 24 -; AUTO_VEC-NEXT: [[INVARIANT_GEP6:%.*]] = getelementptr i8, ptr [[A]], i64 32 -; AUTO_VEC-NEXT: [[INVARIANT_GEP8:%.*]] = getelementptr i8, ptr [[A]], i64 40 -; AUTO_VEC-NEXT: [[INVARIANT_GEP10:%.*]] = getelementptr i8, ptr [[A]], i64 48 -; AUTO_VEC-NEXT: [[INVARIANT_GEP12:%.*]] = getelementptr i8, ptr [[A]], i64 56 ; AUTO_VEC-NEXT: br label [[FOR_BODY:%.*]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[I_NEXT_7:%.*]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: [[J:%.*]] = phi double [ 0.000000e+00, [[ENTRY_NEW]] ], [ [[J_NEXT_7:%.*]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[FOR_BODY]] ] -; AUTO_VEC-NEXT: [[T0:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[I]] ; AUTO_VEC-NEXT: store double [[J]], ptr [[T0]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT:%.*]] = fadd double [[J]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_1:%.*]] = getelementptr double, ptr [[INVARIANT_GEP]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8 ; AUTO_VEC-NEXT: store double [[J_NEXT]], ptr [[T0_1]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_1:%.*]] = fadd double [[J_NEXT]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_2:%.*]] = getelementptr double, ptr [[INVARIANT_GEP2]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr double, ptr 
[[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 16 ; AUTO_VEC-NEXT: store double [[J_NEXT_1]], ptr [[T0_2]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_2:%.*]] = fadd double [[J_NEXT_1]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_3:%.*]] = getelementptr double, ptr [[INVARIANT_GEP4]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_3:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; AUTO_VEC-NEXT: store double [[J_NEXT_2]], ptr [[T0_3]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_3:%.*]] = fadd double [[J_NEXT_2]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_4:%.*]] = getelementptr double, ptr [[INVARIANT_GEP6]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_4:%.*]] = getelementptr i8, ptr [[TMP4]], i64 32 ; AUTO_VEC-NEXT: store double [[J_NEXT_3]], ptr [[T0_4]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_4:%.*]] = fadd double [[J_NEXT_3]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_5:%.*]] = getelementptr double, ptr [[INVARIANT_GEP8]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_5:%.*]] = getelementptr i8, ptr [[TMP5]], i64 40 ; AUTO_VEC-NEXT: store double [[J_NEXT_4]], ptr [[T0_5]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_5:%.*]] = fadd double [[J_NEXT_4]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_6:%.*]] = getelementptr double, ptr [[INVARIANT_GEP10]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_6:%.*]] = getelementptr i8, ptr [[TMP6]], i64 48 ; AUTO_VEC-NEXT: store double [[J_NEXT_5]], ptr [[T0_6]], align 8 ; AUTO_VEC-NEXT: [[J_NEXT_6:%.*]] = fadd double [[J_NEXT_5]], 3.000000e+00 -; AUTO_VEC-NEXT: [[T0_7:%.*]] = getelementptr double, ptr [[INVARIANT_GEP12]], i64 [[I]] +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A]], i64 [[I]] +; AUTO_VEC-NEXT: [[T0_7:%.*]] = getelementptr i8, ptr [[TMP7]], i64 56 ; AUTO_VEC-NEXT: 
store double [[J_NEXT_6]], ptr [[T0_7]], align 8 ; AUTO_VEC-NEXT: [[I_NEXT_7]] = add nuw nsw i64 [[I]], 8 ; AUTO_VEC-NEXT: [[J_NEXT_7]] = fadd double [[J_NEXT_6]], 3.000000e+00 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll index c98e7d349e6c0..482907d295706 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll @@ -8,12 +8,12 @@ define void @foo(ptr noalias noundef %0, ptr noalias noundef %1) optsize { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: ptr noalias noundef readonly captures(none) [[TMP0:%.*]], ptr noalias noundef writeonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -28 ; CHECK-NEXT: br label [[TMP4:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP4]] ] ; CHECK-NEXT: [[TMP3:%.*]] = sub nuw nsw i64 255, [[INDVARS_IV]] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[INVARIANT_GEP]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[TMP3]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 -28 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = load <8 x i32>, ptr [[GEP]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER]], splat (i32 5) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> From 6d3e64e9a73f56d27a2e59d6cf1a79763ddc4c85 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 1 Aug 2025 09:35:06 +0200 Subject: [PATCH 2/2] move check --- llvm/lib/Transforms/Scalar/LICM.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 5197e2a7b7d43..c3f80f901a120 100644 --- 
a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2508,6 +2508,12 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!GEP) return false; + // Do not try to hoist a constant GEP out of the loop via reassociation. + // Constant GEPs can often be folded into addressing modes, and reassociating + // them may inhibit CSE of a common base. + if (GEP->hasAllConstantIndices()) + return false; + auto *Src = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand()); if (!Src || !Src->hasOneUse() || !L.contains(Src)) return false; @@ -2517,12 +2523,6 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!L.isLoopInvariant(SrcPtr) || !all_of(GEP->indices(), LoopInvariant)) return false; - // Do not try to hoist a constant GEP out of the loop via reassociation. - // Constant GEPs can often be folded into addressing modes, and reassociating - // them may inhibit CSE of a common base. - if (GEP->hasAllConstantIndices()) - return false; - // This can only happen if !AllowSpeculation, otherwise this would already be // handled. // FIXME: Should we respect AllowSpeculation in these reassociation folds?