diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..b0b9f59a1e841 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,19 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), + [ (pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), + (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< + (defs root:$dst), + (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), + (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), + (G_OR $or, $x_lo, $y), + (G_MERGE_VALUES $dst, $or, $x_hi))>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -180,7 +193,7 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>; def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, - foldable_fneg, combine_shuffle_vector_to_build_vector]> { + foldable_fneg, combine_shuffle_vector_to_build_vector, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } @@ -188,7 +201,8 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> { + rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, + combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 353c09b4b0bfb..ecd7cc24fd920 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) { ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) { ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) { ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; 
GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i65 %value, 33 ret i65 %result @@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_ashr_i65_33: ; GCN: ; %bb.0: ; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_lshr_b32 s4, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 +; GCN-NEXT: s_or_b32 s0, s0, s4 ; GCN-NEXT: s_ashr_i32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65_33: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 ; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir new file mode 100644 index 0000000000000..48e9818d08d0b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s + +--- +name: test_combine_or_s64_s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: test_combine_or_s64_s32 + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32) + ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1 + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s32) = COPY $sgpr2 + %2:_(s64) = G_ZEXT %1(s32) + %3:_(s64) = G_OR %0, %2 + $sgpr0_sgpr1 = COPY %3(s64) + SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1 +... +--- +name: test_combine_or_s64_s32_rhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: test_combine_or_s64_s32_rhs + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32) + ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1 + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s32) = COPY $sgpr2 + %2:_(s64) = G_ZEXT %1(s32) + %3:_(s64) = G_OR %2, %0 + $sgpr0_sgpr1 = COPY %3(s64) + SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1 +... 
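+# A minimal sketch of the transform exercised by these tests (illustrative only;
+# %x/%y/%z/%lo/%hi are placeholder names, not the autogenerated %0..%3 above):
+#   %z:_(s64) = G_ZEXT %y(s32)
+#   %d:_(s64) = G_OR %x(s64), %z
+# becomes
+#   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x(s64)
+#   %or:_(s32) = G_OR %lo, %y
+#   %d:_(s64) = G_MERGE_VALUES %or(s32), %hi(s32)
+# i.e. only the low 32 bits are OR'd and the high half of %x passes through
+# unchanged, which is what the CHECK lines above and below verify.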
+--- +name: test_combine_or_s64_s32_merge_unmerge +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]] + ; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %4:_(s64) = G_ZEXT %2(s32) + %5:_(s64) = G_OR %3, %4 + %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64) + $sgpr0 = COPY %6(s32) + $sgpr1 = COPY %7(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 +... +--- +name: negative_test_incorrect_types +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-LABEL: name: negative_test_incorrect_types + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]] + ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s64) = COPY $vgpr4_vgpr5 + %2:_(s128) = G_ZEXT %1 + %3:_(s128) = G_OR %0, %2 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index ff26ea21390e2..80bfe13fad1c3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -230,20 +230,20 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_mov_b32 s7, -1 -; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0 -; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 @@ -251,9 +251,9 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: .LBB4_2: ; %.preheader ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 -; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_add_i32 s2, s2, 4 ; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -262,19 +262,19 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_2 ; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 -; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX10-NEXT: s_branch .LBB4_6 ; GFX10-NEXT: .LBB4_4: -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB4_6 ; GFX10-NEXT: ; %bb.5: ; %.19 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_or_b32_e32 v1, 2, v1 ; GFX10-NEXT: .LBB4_6: ; %.22 -; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2 +; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2 ; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen ; GFX10-NEXT: s_endpgm .entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..72349f084c5b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -4959,17 +4959,15 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) { ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GCN-NEXT: s_lshr_b32 s2, s3, 27 -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64_5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX11-NEXT: s_lshr_b32 s2, s3, 27 -; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result @@ -4979,20 +4977,13 @@ define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) { ; GCN-LABEL: s_fshl_i64_32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s2, s3 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_mov_b32 s0, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64_32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s2, s3 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_mov_b32 s0, s3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result @@ -6877,56 +6868,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 ; GFX6-NEXT: s_lshr_b32 s4, s5, 31 -; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_lshr_b32 s4, s7, 31 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: 
s_or_b32 s2, s2, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128_65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 ; GFX8-NEXT: s_lshr_b32 s4, s5, 31 -; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s7, 31 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128_65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 ; GFX9-NEXT: s_lshr_b32 s4, s5, 31 -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s7, 31 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128_65: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s2, s5, 31 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX10-NEXT: s_lshr_b32 s2, s7, 31 -; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 +; GFX10-NEXT: s_lshr_b32 s4, s5, 31 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GFX10-NEXT: s_lshr_b32 s5, s7, 31 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128_65: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s5, 31 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX11-NEXT: s_lshr_b32 s2, s7, 31 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] +; GFX11-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 +; GFX11-NEXT: s_lshr_b32 s4, s5, 31 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GFX11-NEXT: s_lshr_b32 s5, s7, 31 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -6939,7 +6924,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6950,7 +6935,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6961,7 +6946,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX9-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6973,7 +6958,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { 
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 31, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6985,7 +6970,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..f71a35d735bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -5007,20 +5007,13 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) { ; GCN-LABEL: s_fshr_i64_32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s2, s3 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_mov_b32 s0, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64_32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s2, s3 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_mov_b32 s0, s3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result @@ -5031,17 +5024,15 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) { ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GCN-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NEXT: s_mov_b32 s3, 0 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_or_b32 s0, s0, s2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64_48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX11-NEXT: s_lshr_b32 s2, s3, 16 -; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result @@ -5606,34 +5597,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX6-NEXT: s_or_b32 s2, s2, s0 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s8 ; GFX6-NEXT: s_not_b32 s9, s8 -; GFX6-NEXT: s_sub_i32 s16, s2, 64 -; GFX6-NEXT: s_sub_i32 s12, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 +; GFX6-NEXT: s_sub_i32 s16, s0, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s9 +; GFX6-NEXT: s_lshl_b64 
s[0:1], s[10:11], s9 ; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] -; GFX6-NEXT: s_and_b32 s0, s8, 0x7f -; GFX6-NEXT: s_sub_i32 s14, s0, 64 -; GFX6-NEXT: s_sub_i32 s12, 64, s0 -; GFX6-NEXT: s_cmp_lt_u32 s0, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] +; GFX6-NEXT: s_and_b32 s9, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s14, s9, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s9 +; GFX6-NEXT: s_cmp_lt_u32 s9, 64 ; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: s_cmp_eq_u32 s9, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s8 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] @@ -5643,9 +5633,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i128: @@ -5653,34 +5643,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX8-NEXT: s_or_b32 s2, s2, s0 +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s8 ; GFX8-NEXT: s_not_b32 s9, s8 -; GFX8-NEXT: s_sub_i32 s16, s2, 64 -; GFX8-NEXT: s_sub_i32 s12, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 +; GFX8-NEXT: s_sub_i32 s16, s0, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s9 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], s9 ; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] -; GFX8-NEXT: s_and_b32 s0, s8, 0x7f -; GFX8-NEXT: s_sub_i32 s14, s0, 64 -; GFX8-NEXT: s_sub_i32 s12, 64, s0 -; GFX8-NEXT: s_cmp_lt_u32 s0, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] +; GFX8-NEXT: s_and_b32 s9, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s14, s9, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s9 +; GFX8-NEXT: s_cmp_lt_u32 s9, 64 ; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: s_cmp_eq_u32 s9, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: 
s_lshr_b64 s[0:1], s[6:7], s8 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s8 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] @@ -5690,9 +5679,9 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i128: @@ -5700,34 +5689,33 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX9-NEXT: s_or_b32 s2, s2, s0 +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s8 ; GFX9-NEXT: s_not_b32 s9, s8 -; GFX9-NEXT: s_sub_i32 s16, s2, 64 -; GFX9-NEXT: s_sub_i32 s12, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 +; GFX9-NEXT: s_sub_i32 s16, s0, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s9 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], s9 ; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] ; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] -; GFX9-NEXT: s_and_b32 s0, s8, 0x7f -; GFX9-NEXT: s_sub_i32 s14, s0, 64 -; GFX9-NEXT: s_sub_i32 s12, 64, s0 -; GFX9-NEXT: s_cmp_lt_u32 s0, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] +; GFX9-NEXT: s_and_b32 s9, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s14, s9, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s9 +; GFX9-NEXT: s_cmp_lt_u32 s9, 64 ; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: s_cmp_eq_u32 s9, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s8 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] @@ -5737,19 +5725,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s10, s1, 
31 -; GFX10-NEXT: s_mov_b32 s11, 0 -; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_or_b32 s2, s2, s9 +; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8 ; GFX10-NEXT: s_not_b32 s14, s8 ; GFX10-NEXT: s_sub_i32 s16, s9, 64 ; GFX10-NEXT: s_sub_i32 s10, 64, s9 @@ -5792,11 +5779,10 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg ; GFX11-LABEL: s_fshr_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s10, s1, 31 -; GFX11-NEXT: s_mov_b32 s11, 0 -; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8 +; GFX11-NEXT: s_lshr_b32 s9, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX11-NEXT: s_or_b32 s2, s2, s9 +; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8 ; GFX11-NEXT: s_not_b32 s14, s8 ; GFX11-NEXT: s_sub_i32 s16, s9, 64 ; GFX11-NEXT: s_sub_i32 s10, 64, s9 @@ -6103,13 +6089,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: s_or_b32 s2, s2, s0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 ; GFX6-NEXT: v_not_b32_e32 v8, 63 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 -; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8 ; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 @@ -6120,8 +6105,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc @@ -6156,13 +6141,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_or_b32 s2, s2, s0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 ; GFX8-NEXT: v_not_b32_e32 v8, 63 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 @@ -6173,8 +6157,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc @@ -6209,12 +6193,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; 
GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: s_or_b32 s2, s2, s0 ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 @@ -6225,10 +6208,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc @@ -6258,100 +6241,101 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] +; GFX10-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 +; GFX10-NEXT: s_lshr_b32 s2, s1, 31 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_and_b32_e32 v14, 0x7f, v1 +; GFX10-NEXT: s_or_b32 s8, s8, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v15 +; GFX10-NEXT: v_add_nc_u32_e32 v12, 0xffffffc0, v15 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v15, s[4:5] +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 64, v14 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v14 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v14, s[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX10-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v0, s4, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v14 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v4, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[12:13], v12, s[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, s[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v15 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX10-NEXT: v_or_b32_e32 v5, v9, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v2, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v12, v2, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v15, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: s_mov_b32 s9, 0 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 +; GFX11-NEXT: s_lshr_b32 s2, s1, 31 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_or_b32 s8, s8, s2 +; GFX11-NEXT: v_and_b32_e32 v14, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 64, v14 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v14, s[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v14 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v4, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0xffffffc0, v15 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v15, s[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc_lo ; GFX11-NEXT: v_lshlrev_b64 
v[10:11], v10, s[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX11-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 +; GFX11-NEXT: v_lshrrev_b64 v[12:13], v12, s[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v15 +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, s0 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v10 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v12, v2, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v15, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s5, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6364,26 +6348,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX6-NEXT: s_or_b32 s2, s2, s0 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX6-NEXT: s_not_b32 s5, s4 -; GFX6-NEXT: s_sub_i32 s12, s2, 64 -; GFX6-NEXT: s_sub_i32 s8, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 +; GFX6-NEXT: s_sub_i32 s12, s0, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s14, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: 
s_lshl_b64 s[10:11], s[0:1], s5 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s5 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], s5 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_and_b32 s0, s4, 0x7f ; GFX6-NEXT: s_sub_i32 s1, s0, 64 ; GFX6-NEXT: s_sub_i32 s4, 64, s0 @@ -6392,14 +6375,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s8 +; GFX6-NEXT: s_and_b32 s0, 1, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6407,10 +6390,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_or_b32_e32 v2, s6, v2 -; GFX6-NEXT: v_or_b32_e32 v3, s7, v3 +; GFX6-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s11, v1 +; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_svs: @@ -6418,26 +6401,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 -; GFX8-NEXT: s_mov_b32 s1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s0 +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX8-NEXT: s_not_b32 s5, s4 -; GFX8-NEXT: s_sub_i32 s12, s2, 64 -; GFX8-NEXT: s_sub_i32 s8, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 +; GFX8-NEXT: s_sub_i32 s12, s0, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s14, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s5 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s5 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s14, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 
s[0:1] ; GFX8-NEXT: s_and_b32 s0, s4, 0x7f ; GFX8-NEXT: s_sub_i32 s1, s0, 64 ; GFX8-NEXT: s_sub_i32 s4, 64, s0 @@ -6446,14 +6428,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s8 +; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6461,10 +6443,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, s6, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s11, v1 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_svs: @@ -6472,26 +6454,25 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX9-NEXT: s_or_b32 s2, s2, s0 +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX9-NEXT: s_not_b32 s5, s4 -; GFX9-NEXT: s_sub_i32 s12, s2, 64 -; GFX9-NEXT: s_sub_i32 s8, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 +; GFX9-NEXT: s_sub_i32 s12, s0, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s14, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s5 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s14, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_and_b32 s0, s4, 0x7f ; GFX9-NEXT: s_sub_i32 s1, s0, 64 ; GFX9-NEXT: s_sub_i32 s4, 64, s0 @@ -6500,14 +6481,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 
v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s8 +; GFX9-NEXT: s_and_b32 s0, 1, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6515,20 +6496,19 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_or_b32_e32 v2, s6, v2 -; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s11, v1 +; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s6, s1, 31 -; GFX10-NEXT: s_mov_b32 s7, 0 -; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 +; GFX10-NEXT: s_lshr_b32 s5, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX10-NEXT: s_not_b32 s10, s4 ; GFX10-NEXT: s_sub_i32 s12, s5, 64 ; GFX10-NEXT: s_sub_i32 s6, 64, s5 @@ -6578,11 +6558,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX11-LABEL: v_fshr_i128_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s6, s1, 31 -; GFX11-NEXT: s_mov_b32 s7, 0 -; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 +; GFX11-NEXT: s_lshr_b32 s5, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: s_or_b32 s2, s2, s5 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 ; GFX11-NEXT: s_not_b32 s10, s4 ; GFX11-NEXT: s_sub_i32 s12, s5, 64 ; GFX11-NEXT: s_sub_i32 s6, 64, s5 @@ -7033,81 +7012,80 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s22, s1, 31 -; GFX6-NEXT: s_mov_b32 s23, 0 ; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] -; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_or_b32 s2, s2, s0 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s16 ; GFX6-NEXT: s_not_b32 s17, s16 -; GFX6-NEXT: s_sub_i32 s21, s2, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s29, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 -; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 -; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] -; GFX6-NEXT: s_cmp_lg_u32 s29, 0 -; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] -; GFX6-NEXT: s_and_b32 s0, s16, 0x7f ; GFX6-NEXT: s_sub_i32 s21, s0, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s0 ; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: 
s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX6-NEXT: s_lshr_b64 s[22:23], s[18:19], s22 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[2:3], s17 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[18:19], s17 +; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 +; GFX6-NEXT: s_cmp_lg_u32 s26, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19] +; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19] +; GFX6-NEXT: s_and_b32 s17, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s21, s17, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s17 +; GFX6-NEXT: s_cmp_lt_u32 s17, 64 +; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s25, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[10:11], s16 ; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 -; GFX6-NEXT: s_cmp_lg_u32 s26, 0 +; GFX6-NEXT: s_cmp_lg_u32 s24, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cmp_lg_u32 s25, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s26, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s22, s5, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_cmp_lg_u32 s24, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] -; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20 -; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 +; GFX6-NEXT: s_lshr_b32 s4, s5, 31 +; GFX6-NEXT: s_or_b32 s6, s6, s4 +; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] ; GFX6-NEXT: s_not_b32 s16, s20 -; GFX6-NEXT: s_sub_i32 s18, s6, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_sub_i32 s18, s4, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[8:9], s16 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s16 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] -; GFX6-NEXT: s_and_b32 s4, s20, 0x7f -; GFX6-NEXT: s_sub_i32 s18, s4, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_and_b32 s8, s20, 0x7f +; GFX6-NEXT: s_sub_i32 s18, s8, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 
; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], s20 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 @@ -7115,88 +7093,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] -; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s22, s1, 31 -; GFX8-NEXT: s_mov_b32 s23, 0 ; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] -; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_or_b32 s2, s2, s0 +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s16 ; GFX8-NEXT: s_not_b32 s17, s16 -; GFX8-NEXT: s_sub_i32 s21, s2, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 -; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 -; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 -; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] -; GFX8-NEXT: s_and_b32 s0, s16, 0x7f ; GFX8-NEXT: s_sub_i32 s21, s0, 64 ; GFX8-NEXT: s_sub_i32 s22, 64, s0 ; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX8-NEXT: s_lshr_b64 s[22:23], s[18:19], s22 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[2:3], s17 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[18:19], s17 +; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19] +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19] +; GFX8-NEXT: s_and_b32 s17, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s21, s17, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s17 +; GFX8-NEXT: s_cmp_lt_u32 s17, 64 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s17, 0 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[10:11], s16 ; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 
s27, 0 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s22, s5, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] -; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20 -; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 +; GFX8-NEXT: s_lshr_b32 s4, s5, 31 +; GFX8-NEXT: s_or_b32 s6, s6, s4 +; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: s_not_b32 s16, s20 -; GFX8-NEXT: s_sub_i32 s18, s6, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_sub_i32 s18, s4, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[8:9], s16 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s16 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] -; GFX8-NEXT: s_and_b32 s4, s20, 0x7f -; GFX8-NEXT: s_sub_i32 s18, s4, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_and_b32 s8, s20, 0x7f +; GFX8-NEXT: s_sub_i32 s18, s8, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], s20 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 @@ -7204,88 +7181,87 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] -; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s22, s1, 31 -; GFX9-NEXT: s_mov_b32 s23, 0 ; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] -; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_or_b32 s2, s2, s0 +; 
GFX9-NEXT: s_andn2_b32 s0, 0x7f, s16 ; GFX9-NEXT: s_not_b32 s17, s16 -; GFX9-NEXT: s_sub_i32 s21, s2, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 -; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 -; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] -; GFX9-NEXT: s_cmp_lg_u32 s29, 0 -; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] -; GFX9-NEXT: s_and_b32 s0, s16, 0x7f ; GFX9-NEXT: s_sub_i32 s21, s0, 64 ; GFX9-NEXT: s_sub_i32 s22, 64, s0 ; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[18:19], s22 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[2:3], s17 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[18:19], s17 +; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[22:23], s[18:19] +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19] +; GFX9-NEXT: s_and_b32 s17, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s21, s17, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s17 +; GFX9-NEXT: s_cmp_lt_u32 s17, 64 +; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s25, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[10:11], s16 ; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 -; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s26, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: s_lshr_b32 s22, s5, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] -; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20 -; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 +; GFX9-NEXT: s_lshr_b32 s4, s5, 31 +; GFX9-NEXT: s_or_b32 s6, s6, s4 +; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] ; GFX9-NEXT: s_not_b32 s16, s20 -; GFX9-NEXT: s_sub_i32 s18, s6, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_sub_i32 s18, s4, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[8:9], 
s16 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s16 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] -; GFX9-NEXT: s_and_b32 s4, s20, 0x7f -; GFX9-NEXT: s_sub_i32 s18, s4, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_and_b32 s8, s20, 0x7f +; GFX9-NEXT: s_sub_i32 s18, s8, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], s20 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 @@ -7293,61 +7269,60 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] -; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s18, s1, 31 -; GFX10-NEXT: s_mov_b32 s19, 0 -; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 +; GFX10-NEXT: s_lshr_b32 s17, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] -; GFX10-NEXT: s_not_b32 s18, s16 -; GFX10-NEXT: s_sub_i32 s21, s17, 64 -; GFX10-NEXT: s_sub_i32 s22, 64, s17 +; GFX10-NEXT: s_or_b32 s2, s2, s17 +; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 +; GFX10-NEXT: s_not_b32 s21, s16 +; GFX10-NEXT: s_sub_i32 s26, s17, 64 +; GFX10-NEXT: s_sub_i32 s18, 64, s17 ; GFX10-NEXT: s_cmp_lt_u32 s17, 64 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s17, 0 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 -; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_lshr_b64 s[18:19], s[0:1], s18 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[2:3], s21 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s21 +; GFX10-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s26 +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[22:23], s[24:25], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_and_b32 s0, 
s16, 0x7f -; GFX10-NEXT: s_sub_i32 s18, s0, 64 +; GFX10-NEXT: s_sub_i32 s21, s0, 64 ; GFX10-NEXT: s_sub_i32 s17, 64, s0 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 -; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[10:11], s17 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 +; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_cmp_lg_u32 s25, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cmp_lg_u32 s24, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_lshr_b32 s8, s5, 31 +; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20 -; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] ; GFX10-NEXT: s_not_b32 s16, s20 ; GFX10-NEXT: s_sub_i32 s18, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 @@ -7390,54 +7365,53 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s18, s1, 31 -; GFX11-NEXT: s_mov_b32 s19, 0 -; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 +; GFX11-NEXT: s_lshr_b32 s17, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] -; GFX11-NEXT: s_not_b32 s18, s16 -; GFX11-NEXT: s_sub_i32 s21, s17, 64 -; GFX11-NEXT: s_sub_i32 s22, 64, s17 +; GFX11-NEXT: s_or_b32 s2, s2, s17 +; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 +; GFX11-NEXT: s_not_b32 s21, s16 +; GFX11-NEXT: s_sub_i32 s26, s17, 64 +; GFX11-NEXT: s_sub_i32 s18, 64, s17 ; GFX11-NEXT: s_cmp_lt_u32 s17, 64 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s17, 0 ; GFX11-NEXT: s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 -; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_lshr_b64 s[18:19], s[0:1], s18 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], s21 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[0:1], s21 +; GFX11-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s26 +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[22:23], s[24:25], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_and_b32 s0, s16, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s18, s0, 64 +; GFX11-NEXT: 
s_sub_i32 s21, s0, 64 ; GFX11-NEXT: s_sub_i32 s17, 64, s0 ; GFX11-NEXT: s_cmp_lt_u32 s0, 64 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cselect_b32 s24, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s26, 1, 0 +; GFX11-NEXT: s_cselect_b32 s25, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 -; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[10:11], s17 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_cmp_lg_u32 s25, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_lshr_b32 s8, s5, 31 +; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20 -; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] ; GFX11-NEXT: s_not_b32 s16, s20 ; GFX11-NEXT: s_sub_i32 s18, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..b44339b41808a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1750,7 +1750,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1763,7 +1763,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1776,7 +1776,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 -; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1789,7 +1789,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1800,7 +1800,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, 33 @@ -1859,21 
+1859,19 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { ; GCN-LABEL: s_lshr_i65_33: ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_lshr_b32 s4, s1, 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 +; GCN-NEXT: s_or_b32 s0, s0, s4 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65_33: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 ; GFX10PLUS-NEXT: s_mov_b32 s2, 0 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll index af377b1d76817..e0581f01dda6a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll @@ -597,13 +597,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], 0x50 +; GFX7-NEXT: s_or_b32 s4, s3, 0x50 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -616,7 +616,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: s_mov_b32 s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], 0x50 +; GFX8-NEXT: s_or_b32 s2, s2, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -630,7 +630,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], 0x50 +; GFX9-NEXT: s_or_b32 s2, s2, 0x50 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -644,7 +644,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], 0x50 +; GFX10-NEXT: s_or_b32 s2, s2, 0x50 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -658,7 +658,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], 0x50 +; GFX11-NEXT: s_or_b32 s2, s2, 0x50 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: 
s_endpgm @@ -671,7 +671,7 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_or_b64 s[2:3], s[2:3], 0x50 +; GFX12-NEXT: s_or_b32 s2, s2, 0x50 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index a9b3deb3e49f4..cfe655ff97975 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1381,7 +1381,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,7 +1393,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1405,7 +1405,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,7 +1418,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) { ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i65 %value, 33 %ashr = ashr i65 %value, 33 @@ -1429,29 +1429,27 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { ; GCN-LABEL: s_sext_inreg_i65_18: ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GCN-NEXT: s_lshr_b32 s4, s1, 14 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_lshr_b32 s3, s1, 14 +; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GCN-NEXT: s_lshl_b32 s7, s2, 14 -; GCN-NEXT: s_mov_b32 s6, s5 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: s_lshl_b32 s5, s2, 14 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_18: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 -; GFX10PLUS-NEXT: s_mov_b32 s5, 0 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 14 ; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_mov_b32 s6, s5 +; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3 +; GFX10PLUS-NEXT: s_mov_b32 s4, 0 ; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 14 ; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 -; 
GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 18 %ashr = ashr i65 %shl, 18 @@ -1464,13 +1462,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) { ; GCN-NEXT: s_lshl_b32 s3, s2, 1 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 31 -; GCN-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] -; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GCN-NEXT: s_bfe_u32 s0, s0, 0x1f0000 -; GCN-NEXT: s_mov_b32 s1, s2 -; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 31 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_ashr_i32 s2, s5, 1 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_bfe_u32 s4, s0, 0x1f0000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_ashr_i32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i65_33: @@ -1478,13 +1475,12 @@ define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) { ; GFX10PLUS-NEXT: s_lshl_b32 s3, s2, 1 ; GFX10PLUS-NEXT: s_mov_b32 s2, 0 ; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], 31 -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1f0000 -; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_mov_b32 s1, s2 -; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 31 -; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_bfe_u32 s4, s0, 0x1f0000 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i65 %value, 33 %ashr = ashr i65 %shl, 33 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 3c991cfb7a1aa..56891920ddfb9 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -415,28 +415,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-LABEL: memcpy_known: ; GISEL-GFX942: ; %bb.0: ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54 ; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 -; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 ; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 -; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 -; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 -; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54 -; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 -; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GISEL-GFX942-NEXT: s_mov_b32 s6, s13 -; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s10, s3 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s13 ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 -; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 -; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 -; GISEL-GFX942-NEXT: s_mov_b32 s2, s7 -; GISEL-GFX942-NEXT: s_waitcnt 
lgkmcnt(0) -; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -491,25 +481,16 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34 ; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54 -; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0 -; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0 -; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17 -; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17 -; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12 -; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 -; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17 -; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3 -; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10 -; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 -; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 +; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 +; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 ; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 @@ -936,28 +917,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-LABEL: memcpy_known_medium: ; GISEL-GFX942: ; %bb.0: ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54 ; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 -; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 ; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 -; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 -; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 -; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54 -; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 -; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GISEL-GFX942-NEXT: s_mov_b32 s6, s13 -; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s10, s3 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s13 ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 -; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 -; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 -; GISEL-GFX942-NEXT: s_mov_b32 s2, s7 -; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 @@ 
-1014,25 +985,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34 ; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54 -; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0 -; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0 -; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17 -; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17 -; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12 -; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, 0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 -; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17 -; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3 -; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10 -; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] -; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 -; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 +; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 +; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 ; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 @@ -1208,27 +1170,18 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX942: ; %bb.0: ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 -; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 -; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 -; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 -; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 -; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GISEL-GFX942-NEXT: s_mov_b32 s10, s3 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 -; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 -; GISEL-GFX942-NEXT: s_mov_b32 s12, s7 +; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x54 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX942-NEXT: s_mov_b32 s5, s2 -; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 -; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] ; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen @@ -1241,35 +1194,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34 -; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0 -; GISEL-GFX1100-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13 -; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13 +; GISEL-GFX1100-NEXT: s_load_b32 s11, s[4:5], 0x34 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1 -; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-GFX1100-NEXT: s_or_b64 s[0:1], s[12:13], s[8:9] -; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7] -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen +; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 +; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 -; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 -; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 -; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10 -; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8 -; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5] -; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16 ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16 ; GISEL-GFX1100-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index e6c38d29be949..b5802e8f5a726 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -4484,7 +4484,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4 -; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -4515,14 +4515,12 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-G-O0-NEXT: s_mov_b32 s5, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3 -; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4 @@ -4583,7 
+4581,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4 -; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-G-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -4596,15 +4594,13 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 20795431b4cd8..99f0ac0cdeee1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2080,21 +2080,13 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs } define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0 -; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 4160 %addr = inttoptr i64 %or to ptr diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 723e3ef15553a..7ea5b37482469 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4738,21 +4738,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 16, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; 
GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 16, v0 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_or_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 16 %addr = inttoptr i64 %or to ptr addrspace(1) @@ -4779,21 +4771,13 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0 -; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 4160 %addr = inttoptr i64 %or to ptr addrspace(1) diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c316ec71863d0..9fd4f3fa38b98 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -1067,13 +1067,13 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1 -; GISEL-NEXT: v_or_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_or_b32_e32 v9, v8, v5 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 -; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v9, v2, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, v6 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 9dac2393fd966..504d7f629970c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -45,27 +45,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-GISEL-NEXT: 
s_load_dword s7, s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 -; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 -; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 -; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc @@ -105,27 +96,18 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-GISEL: ; %bb.0: ; %entry ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 -; GFX942-GISEL-NEXT: s_mov_b32 s7, 0 -; GFX942-GISEL-NEXT: s_mov_b32 s8, s7 -; GFX942-GISEL-NEXT: s_mov_b32 s10, s7 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX942-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX942-GISEL-NEXT: s_mov_b32 s9, s2 -; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX942-GISEL-NEXT: s_mov_b32 s10, s3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 -; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 -; GFX942-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt @@ -168,29 +150,22 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 -; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 -; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 -; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX10-GISEL-NEXT: 
s_or_b64 s[2:3], s[6:7], s[4:5] -; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc +; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 -; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc ; GFX10-GISEL-NEXT: s_endpgm @@ -234,32 +209,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 -; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 -; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] -; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc ; GFX11-GISEL-NEXT: s_endpgm @@ -303,32 +267,21 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 -; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 -; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] -; 
GFX12-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 -; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT ; GFX12-GISEL-NEXT: s_endpgm @@ -373,27 +326,18 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 -; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 -; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 -; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen @@ -433,27 +377,18 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL: ; %bb.0: ; %entry ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX942-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 -; GFX942-GISEL-NEXT: s_mov_b32 s7, 0 -; GFX942-GISEL-NEXT: s_mov_b32 s8, s7 -; GFX942-GISEL-NEXT: s_mov_b32 s10, s7 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX942-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX942-GISEL-NEXT: s_mov_b32 s9, s2 -; GFX942-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX942-GISEL-NEXT: s_mov_b32 s10, s3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX942-GISEL-NEXT: 
s_load_dword s9, s[4:5], 0x30 -; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 -; GFX942-GISEL-NEXT: s_mov_b32 s8, s7 +; GFX942-GISEL-NEXT: s_load_dword s7, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: s_mov_b32 s6, s1 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX942-GISEL-NEXT: s_mov_b32 s5, s2 -; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 @@ -496,29 +431,22 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 -; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 -; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 -; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 +; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 -; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] +; GFX10-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] -; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc +; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 +; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 -; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX10-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 -; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX10-GISEL-NEXT: s_endpgm @@ -562,32 +490,21 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 -; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 -; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX11-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] -; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc ; GFX11-GISEL-NEXT: s_endpgm @@ -631,32 +548,21 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 -; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 -; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 +; GFX12-GISEL-NEXT: s_load_b32 s11, s[4:5], 0x10 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] -; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 -; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 -; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm