[AMDGPU][GlobalISel] Combine for breaking s64 and/or into two s32 insts #151731

Open · wants to merge 2 commits into main
25 changes: 23 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,25 @@ def zext_of_shift_amount_combines : GICombineGroup<[
canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
]>;

// (and/or i64:x, i64:y) -> i64:(merge (and/or lo_32(x), lo_32(y)), (and/or hi_32(x), hi_32(y)))
// when the low or high 32 bits of either x or y are known to be all ones
class combine_binop_s64_with_s32_mask<Instruction opcode> : GICombineRule<
(defs root:$dst),
(match (opcode $dst, i64:$x, i64:$y):$dst,
[{ return Helper.matchConstantIs32BitMask(${x}.getReg()) ||
Helper.matchConstantIs32BitMask(${y}.getReg()); }]),
(apply (G_UNMERGE_VALUES i32:$x_lo, i32:$x_hi, $x),
(G_UNMERGE_VALUES i32:$y_lo, i32:$y_hi, $y),
(opcode i32:$lo, $x_lo, $y_lo),
(opcode i32:$hi, $x_hi, $y_hi),
(G_MERGE_VALUES $dst, $lo, $hi))>;

def combine_or_s64_with_s32_mask : combine_binop_s64_with_s32_mask<G_OR>;
def combine_and_s64_with_s32_mask : combine_binop_s64_with_s32_mask<G_AND>;
def binop_s64_with_s32_mask_combines : GICombineGroup<[
combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
]>;

let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -180,15 +199,17 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector]> {
foldable_fneg, combine_shuffle_vector_to_build_vector,
binop_s64_with_s32_mask_combines]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
binop_s64_with_s32_mask_combines]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

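For readers less familiar with the MIR pattern syntax, the `apply` part of `combine_binop_s64_with_s32_mask` is roughly equivalent to the `MachineIRBuilder` sequence below. This is only an illustrative sketch — the real combiner code is generated from the TableGen rule, and `applyBinopS64AsS32` is a hypothetical name used for illustration:

```cpp
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

// Rewrite a 64-bit G_AND/G_OR as two independent 32-bit ops plus a merge.
static void applyBinopS64AsS32(MachineIRBuilder &B, MachineInstr &MI) {
  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();

  B.setInstrAndDebugLoc(MI);
  // Split both 64-bit operands into {lo, hi} 32-bit halves.
  auto XParts = B.buildUnmerge(S32, X);
  auto YParts = B.buildUnmerge(S32, Y);
  // Apply the same opcode (G_AND or G_OR) to each half independently.
  auto Lo = B.buildInstr(MI.getOpcode(), {S32},
                         {XParts.getReg(0), YParts.getReg(0)});
  auto Hi = B.buildInstr(MI.getOpcode(), {S32},
                         {XParts.getReg(1), YParts.getReg(1)});
  // Reassemble the 64-bit result and drop the original instruction.
  B.buildMergeLikeInstr(Dst, {Lo.getReg(0), Hi.getReg(0)});
  MI.eraseFromParent();
}
```

Once the operands are split, the generic combines fold the trivial half (for example `x & -1 -> x` and `x | 0 -> x`), which is the folded output the CHECK lines in the MIR tests below expect.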
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -9,6 +9,7 @@
#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -516,3 +517,9 @@ bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(

return true;
}

// Return true if either the low or the high 32 bits of Reg are known to be
// all ones, i.e. the value acts as a 32-bit mask in one half.
bool AMDGPUCombinerHelper::matchConstantIs32BitMask(Register Reg) const {
  const KnownBits &Known = VT->getKnownBits(Reg);
  return Known.One.extractBits(32, 0).isAllOnes() ||
         Known.One.extractBits(32, 32).isAllOnes();
}
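As a standalone sanity check of what `matchConstantIs32BitMask` tests for, the mask constants used in the MIR tests below produce exactly the `KnownBits` patterns this predicate looks for. A minimal sketch, assuming it is built against LLVM's `Support` library (not part of the patch):

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

int main() {
  // -4294967296 == 0xFFFFFFFF'00000000: the high 32 bits are all ones.
  KnownBits HiMask = KnownBits::makeConstant(APInt(64, 0xFFFFFFFF00000000ULL));
  assert(HiMask.One.extractBits(32, 32).isAllOnes()); // high half is a mask
  assert(!HiMask.One.extractBits(32, 0).isAllOnes()); // low half is not

  //  4294967295 == 0x00000000'FFFFFFFF: the low 32 bits are all ones.
  KnownBits LoMask = KnownBits::makeConstant(APInt(64, 0xFFFFFFFFULL));
  assert(LoMask.One.extractBits(32, 0).isAllOnes());
  return 0;
}
```

Note that the helper queries `GISelValueTracking` rather than requiring a `G_CONSTANT`, so non-constant operands whose low or high half is known to be all ones also match.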
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -43,6 +43,8 @@ class AMDGPUCombinerHelper : public CombinerHelper {
bool matchCombineFmulWithSelectToFldexp(
MachineInstr &MI, MachineInstr &Sel,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;

bool matchConstantIs32BitMask(Register Reg) const;
};

} // namespace llvm
@@ -0,0 +1,259 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s

---
name: test_and_mask_hi_rhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_and_mask_hi_rhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 -4294967296
%2:_(s64) = G_AND %0, %1
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_and_mask_hi_lhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_and_mask_hi_lhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 -4294967296
%2:_(s64) = G_AND %1, %0
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...

---
name: test_and_mask_lo_rhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_and_mask_lo_rhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 4294967295
%2:_(s64) = G_AND %0, %1
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_and_mask_lo_lhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_and_mask_lo_lhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 4294967295
%2:_(s64) = G_AND %1, %0
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_and_mask_hi_with_merge_unmerge
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-LABEL: name: test_and_mask_hi_with_merge_unmerge
; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr1
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%3:_(s64) = G_CONSTANT i64 -4294967296
%4:_(s64) = G_AND %2, %3
%5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4(s64)
$sgpr0 = COPY %5(s32)
$sgpr1 = COPY %6(s32)
SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
...
---
name: negative_and_test_incorrect_types
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5

; CHECK-LABEL: name: negative_and_test_incorrect_types
; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -4294967296
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s128) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[AND]](s128)
%0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(s64) = COPY $vgpr4_vgpr5
%2:_(s128) = G_CONSTANT i128 -4294967296
%3:_(s128) = G_AND %0, %2
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
...
---
name: test_or_mask_hi_rhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_or_mask_hi_rhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 -4294967296
%2:_(s64) = G_OR %0, %1
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_or_mask_hi_lhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_or_mask_hi_lhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 -4294967296
%2:_(s64) = G_OR %1, %0
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...

---
name: test_or_mask_lo_rhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_or_mask_lo_rhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 4294967295
%2:_(s64) = G_OR %0, %1
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_or_mask_lo_lhs
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-LABEL: name: test_or_mask_lo_lhs
; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
%0:_(s64) = COPY $sgpr0_sgpr1
%1:_(s64) = G_CONSTANT i64 4294967295
%2:_(s64) = G_OR %1, %0
$sgpr0_sgpr1 = COPY %2(s64)
SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
...
---
name: test_or_mask_hi_with_merge_unmerge
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-LABEL: name: test_or_mask_hi_with_merge_unmerge
; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: $sgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[C]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%3:_(s64) = G_CONSTANT i64 -4294967296
%4:_(s64) = G_OR %2, %3
%5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4(s64)
$sgpr0 = COPY %5(s32)
$sgpr1 = COPY %6(s32)
SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
...
---
name: negative_or_test_incorrect_types
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5

; CHECK-LABEL: name: negative_or_test_incorrect_types
; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -4294967296
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[C]]
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
%0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(s64) = COPY $vgpr4_vgpr5
%2:_(s128) = G_CONSTANT i128 -4294967296
%3:_(s128) = G_OR %0, %2
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
...