[AMDGPU][GlobalISel] Combine (or s64, zext(s32)) #151519

Open · wants to merge 2 commits into main
18 changes: 16 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,19 @@ def zext_of_shift_amount_combines : GICombineGroup<[
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
]>;

// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
[ (pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
(pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
Comment on lines +157 to +158

Contributor:
You shouldn't have to manually commute patterns. I thought the matchers already tried the commuted forms like selection patterns do?

Collaborator (Author):
Apparently they do not.
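
For context: a hand-written C++ combine using LLVM's MIPatternMatch helpers would get the commuted form for free, since m_GOr is declared commutable there; the imported MIR patterns evidently have no such handling, hence the two explicit patterns above. A minimal sketch (matchOrZext is a hypothetical helper, not code from this patch):

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
using namespace llvm;
using namespace MIPatternMatch;

// Binds X and Y for %dst = G_OR x, (G_ZEXT y), trying both operand orders.
// Sketch only: callers would still need to verify the s64/s32 types.
static bool matchOrZext(Register Dst, const MachineRegisterInfo &MRI,
                        Register &X, Register &Y) {
  return mi_match(Dst, MRI, m_GOr(m_Reg(X), m_GZExt(m_Reg(Y))));
}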


def combine_or_s64_s32 : GICombineRule<
(defs root:$dst),
(match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
(apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
(G_OR $or, $x_lo, $y),
(G_MERGE_VALUES $dst, $or, $x_hi))>;
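
Since the zero-extended operand has no bits set in its top half, the 64-bit OR can only change the low 32 bits of x; the apply step therefore splits x, ORs the low half with y, and reassembles the result with the high half passed through. In hand-written form this corresponds roughly to the following MachineIRBuilder sequence (a sketch for illustration; applyOrZext is a hypothetical helper, and the real code is generated from the rule above):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

// Rewrites %dst:s64 = G_OR %x:s64, (G_ZEXT %y:s32) per combine_or_s64_s32.
static void applyOrZext(MachineIRBuilder &B, Register Dst, Register X,
                        Register Y) {
  const LLT S32 = LLT::scalar(32);
  // %x_lo:s32, %x_hi:s32 = G_UNMERGE_VALUES %x
  auto Unmerge = B.buildUnmerge(S32, X);
  // %or:s32 = G_OR %x_lo, %y  (only the low half can change)
  auto Or = B.buildOr(S32, Unmerge.getReg(0), Y);
  // %dst:s64 = G_MERGE_VALUES %or, %x_hi
  B.buildMergeLikeInstr(Dst, {Or.getReg(0), Unmerge.getReg(1)});
}

On the SALU side this narrowing is what turns an s_or_b64 into a single s_or_b32 in the test diffs below.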

let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -180,15 +193,16 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector]> {
foldable_fneg, combine_shuffle_vector_to_build_vector, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

22 changes: 10 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1778,7 +1778,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1790,7 +1790,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1802,7 +1802,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1815,7 +1815,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = ashr i65 %value, 33
ret i65 %result
@@ -1875,21 +1875,19 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_lshr_b32 s0, s1, 1
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_lshr_b32 s4, s1, 1
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
; GFX10PLUS-NEXT: s_mov_b32 s1, 0
; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-s64-s32.mir
@@ -0,0 +1,97 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s

---
name: test_combine_or_s64_s32
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_combine_or_s64_s32
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s32) = COPY $sgpr2
%2:_(s64) = G_ZEXT %1(s32)
%3:_(s64) = G_OR %0, %2
$sgpr0_sgpr1 = COPY %3(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_combine_or_s64_s32_rhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_combine_or_s64_s32_rhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[COPY1]]
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s32) = COPY $sgpr2
%2:_(s64) = G_ZEXT %1(s32)
%3:_(s64) = G_OR %2, %0
$sgpr0_sgpr1 = COPY %3(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_combine_or_s64_s32_merge_unmerge
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-LABEL: name: test_combine_or_s64_s32_merge_unmerge
; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY2]]
; CHECK-NEXT: $sgpr0 = COPY [[OR]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[COPY1]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = COPY $sgpr2
%3:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%4:_(s64) = G_ZEXT %2(s32)
%5:_(s64) = G_OR %3, %4
%6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %5(s64)
$sgpr0 = COPY %6(s32)
$sgpr1 = COPY %7(s32)
SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
...
---
name: negative_test_incorrect_types
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
; CHECK-LABEL: name: negative_test_incorrect_types
; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s128) = G_ZEXT [[COPY1]](s64)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[ZEXT]]
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
%0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(s64) = COPY $vgpr4_vgpr5
%2:_(s128) = G_ZEXT %1
%3:_(s128) = G_OR %0, %2
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
...

@@ -230,30 +230,30 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_mov_b32 s7, -1
; GFX10-NEXT: s_mov_b32 s2, s1
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo
; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX10-NEXT: s_add_i32 s1, s1, 4
; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -262,19 +262,19 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
; GFX10-NEXT: .LBB4_4:
; GFX10-NEXT: s_mov_b32 s1, exec_lo
; GFX10-NEXT: s_mov_b32 s2, exec_lo
; GFX10-NEXT: ; implicit-def: $vgpr1
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
; GFX10-NEXT: ; %bb.5: ; %.19
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_or_b32_e32 v1, 2, v1
; GFX10-NEXT: .LBB4_6: ; %.22
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
; GFX10-NEXT: s_endpgm
.entry: