Skip to content

Commit 3b66d4a

Browse files
authored
[AMDGPU] Support builtin/intrinsics for async loads/stores on gfx1250 (#151058)
1 parent 1a97452 commit 3b66d4a

13 files changed

+659
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,16 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
651651
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
652652
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
653653
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
654+
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
655+
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
656+
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
657+
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
658+
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
659+
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
660+
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
661+
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
662+
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
663+
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
654664

655665
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
656666
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
@@ -670,9 +680,6 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1
670680
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
671681
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
672682

673-
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
674-
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
675-
676683
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
677684
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
678685

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,89 @@
22
// REQUIRES: amdgpu-registered-target
33
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
44

5+
typedef int v2i __attribute__((ext_vector_type(2)));
6+
typedef int v4i __attribute__((ext_vector_type(4)));
7+
8+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8(
9+
// CHECK-GFX1250-NEXT: entry:
10+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
11+
// CHECK-GFX1250-NEXT: ret void
12+
//
13+
void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* laddr)
14+
{
15+
__builtin_amdgcn_global_load_async_to_lds_b8(gaddr, laddr, 16, 0);
16+
}
17+
18+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b32(
19+
// CHECK-GFX1250-NEXT: entry:
20+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
21+
// CHECK-GFX1250-NEXT: ret void
22+
//
23+
void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* laddr)
24+
{
25+
__builtin_amdgcn_global_load_async_to_lds_b32(gaddr, laddr, 16, 0);
26+
}
27+
28+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b64(
29+
// CHECK-GFX1250-NEXT: entry:
30+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
31+
// CHECK-GFX1250-NEXT: ret void
32+
//
33+
void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr)
34+
{
35+
__builtin_amdgcn_global_load_async_to_lds_b64(gaddr, laddr, 16, 0);
36+
}
37+
38+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b128(
39+
// CHECK-GFX1250-NEXT: entry:
40+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
41+
// CHECK-GFX1250-NEXT: ret void
42+
//
43+
void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* laddr)
44+
{
45+
__builtin_amdgcn_global_load_async_to_lds_b128(gaddr, laddr, 16, 0);
46+
}
47+
48+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b8(
49+
// CHECK-GFX1250-NEXT: entry:
50+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
51+
// CHECK-GFX1250-NEXT: ret void
52+
//
53+
void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* laddr)
54+
{
55+
__builtin_amdgcn_global_store_async_from_lds_b8(gaddr, laddr, 16, 0);
56+
}
57+
58+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b32(
59+
// CHECK-GFX1250-NEXT: entry:
60+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
61+
// CHECK-GFX1250-NEXT: ret void
62+
//
63+
void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* laddr)
64+
{
65+
__builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 16, 0);
66+
}
67+
68+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b64(
69+
// CHECK-GFX1250-NEXT: entry:
70+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
71+
// CHECK-GFX1250-NEXT: ret void
72+
//
73+
void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* laddr)
74+
{
75+
__builtin_amdgcn_global_store_async_from_lds_b64(gaddr, laddr, 16, 0);
76+
}
77+
78+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b128(
79+
// CHECK-GFX1250-NEXT: entry:
80+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
81+
// CHECK-GFX1250-NEXT: ret void
82+
//
83+
void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* laddr)
84+
{
85+
__builtin_amdgcn_global_store_async_from_lds_b128(gaddr, laddr, 16, 0);
86+
}
87+
588
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64(
689
// CHECK-GFX1250-NEXT: entry:
790
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]])

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3643,6 +3643,50 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
36433643
[IntrNoMem, IntrSpeculatable]
36443644
>;
36453645

3646+
class AMDGPUAsyncGlobalLoadToLDS : Intrinsic <
3647+
[],
3648+
[global_ptr_ty, // Base global pointer to load from
3649+
local_ptr_ty, // LDS base pointer to store to.
3650+
llvm_i32_ty, // offset
3651+
llvm_i32_ty], // gfx12+ cachepolicy:
3652+
// bits [0-2] = th
3653+
// bits [3-4] = scope
3654+
[IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
3655+
NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
3656+
"", [SDNPMemOperand]
3657+
>;
3658+
3659+
class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic <
3660+
[],
3661+
[global_ptr_ty, // Base global pointer to store to
3662+
local_ptr_ty, // LDS base pointer to load from
3663+
llvm_i32_ty, // offset
3664+
llvm_i32_ty], // gfx12+ cachepolicy:
3665+
// bits [0-2] = th
3666+
// bits [3-4] = scope
3667+
[IntrInaccessibleMemOrArgMemOnly, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
3668+
NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
3669+
"", [SDNPMemOperand]
3670+
>;
3671+
3672+
def int_amdgcn_global_load_async_to_lds_b8 :
3673+
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS;
3674+
def int_amdgcn_global_load_async_to_lds_b32 :
3675+
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b32">, AMDGPUAsyncGlobalLoadToLDS;
3676+
def int_amdgcn_global_load_async_to_lds_b64 :
3677+
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b64">, AMDGPUAsyncGlobalLoadToLDS;
3678+
def int_amdgcn_global_load_async_to_lds_b128 :
3679+
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b128">, AMDGPUAsyncGlobalLoadToLDS;
3680+
3681+
def int_amdgcn_global_store_async_from_lds_b8 :
3682+
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b8">, AMDGPUAsyncGlobalStoreFromLDS;
3683+
def int_amdgcn_global_store_async_from_lds_b32 :
3684+
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b32">, AMDGPUAsyncGlobalStoreFromLDS;
3685+
def int_amdgcn_global_store_async_from_lds_b64 :
3686+
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b64">, AMDGPUAsyncGlobalStoreFromLDS;
3687+
def int_amdgcn_global_store_async_from_lds_b128 :
3688+
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, AMDGPUAsyncGlobalStoreFromLDS;
3689+
36463690
// WMMA intrinsics.
36473691
class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
36483692
Intrinsic<

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
143143
def gi_global_saddr_glc :
144144
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
145145
GIComplexPatternEquiv<GlobalSAddrGLC>;
146+
def gi_global_saddr_no_ioffset :
147+
GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
148+
GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
146149

147150
def gi_mubuf_scratch_offset :
148151
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
20492049
return true;
20502050
}
20512051

2052+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2053+
SDValue &SAddr,
2054+
SDValue &VOffset,
2055+
SDValue &CPol) const {
2056+
bool ScaleOffset;
2057+
SDValue DummyOffset;
2058+
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2059+
false))
2060+
return false;
2061+
2062+
// We are assuming CPol is always the last operand of the intrinsic. The incoming SCAL bit is masked off; SCAL is set below only when ScaleOffset is true.
2063+
auto PassedCPol =
2064+
N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2065+
CPol = CurDAG->getTargetConstant(
2066+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2067+
return true;
2068+
}
2069+
20522070
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
20532071
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
20542072
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
174174
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
175175
SDValue &VOffset, SDValue &Offset,
176176
SDValue &CPol) const;
177+
bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
178+
SDValue &VOffset, SDValue &CPol) const;
177179
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
178180
SDValue &Offset) const;
179181
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5788,6 +5788,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
57885788
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
57895789
}
57905790

5791+
InstructionSelector::ComplexRendererFns
5792+
AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5793+
MachineOperand &Root) const {
5794+
const MachineInstr &I = *Root.getParent();
5795+
5796+
// We are assuming CPol is always the last operand of the intrinsic. The incoming SCAL bit is masked off before the value is passed on.
5797+
auto PassedCPol =
5798+
I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5799+
return selectGlobalSAddr(Root, PassedCPol, false);
5800+
}
5801+
57915802
InstructionSelector::ComplexRendererFns
57925803
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
57935804
Register Addr = Root.getReg();

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
264264
selectGlobalSAddrCPol(MachineOperand &Root) const;
265265
InstructionSelector::ComplexRendererFns
266266
selectGlobalSAddrGLC(MachineOperand &Root) const;
267+
InstructionSelector::ComplexRendererFns
268+
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
267269

268270
InstructionSelector::ComplexRendererFns
269271
selectScratchSAddr(MachineOperand &Root) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5364,6 +5364,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
53645364
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
53655365
break;
53665366
}
5367+
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
5368+
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
5369+
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
5370+
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
5371+
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
5372+
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
5373+
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
5374+
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
53675375
case Intrinsic::amdgcn_load_to_lds:
53685376
case Intrinsic::amdgcn_global_load_lds: {
53695377
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ let WantsRoot = true in {
1111
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
1212
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
1313

14+
def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
1415
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
1516
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
1617
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
13611362
(inst $saddr, $voffset, $offset, $cpol)
13621363
>;
13631364

1365+
class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
1366+
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
1367+
(inst $dsaddr, $vaddr, $offset, $cpol)
1368+
>;
1369+
1370+
class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
1371+
(node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
1372+
(inst $dsaddr, $saddr, $voffset, $offset, $cpol)
1373+
>;
1374+
1375+
class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
1376+
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
1377+
(inst $vaddr, $dsaddr, $offset, $cpol)
1378+
>;
1379+
1380+
class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
1381+
(node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
1382+
(inst $saddr, $voffset, $dsaddr, $offset, $cpol)
1383+
>;
1384+
13641385
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
13651386
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
13661387
(inst $saddr, $voffset, $offset, $cpol)
@@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
15711592
(inst $vaddr, $saddr, $offset, $cpol)
15721593
>;
15731594

1595+
multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
1596+
def : FlatLoadLDSSignedPat <inst, node> {
1597+
let AddedComplexity = 10;
1598+
}
1599+
1600+
def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
1601+
let AddedComplexity = 11;
1602+
}
1603+
}
1604+
1605+
multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
1606+
def : FlatStoreLDSSignedPat <inst, node> {
1607+
let AddedComplexity = 10;
1608+
}
1609+
1610+
def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
1611+
let AddedComplexity = 11;
1612+
}
1613+
}
1614+
15741615
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
15751616
def : FlatLoadSignedPat <inst, node, vt> {
15761617
let AddedComplexity = 10;
@@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
21372178
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
21382179
} // End OtherPredicates = isGFX125xOnly
21392180

2181+
let OtherPredicates = [isGFX1250Plus] in {
2182+
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
2183+
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
2184+
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
2185+
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
2186+
2187+
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
2188+
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
2189+
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
2190+
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
2191+
}
2192+
21402193
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
21412194
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
21422195
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;

0 commit comments

Comments
 (0)