Skip to content

Commit 8f187c7

Browse files
authored
[AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer (#150887)
The new instruction represents the unknown number of waitcnts needed at a release operation to ensure that prior direct loads to LDS (formerly called LDS DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a suitable value for vmcnt(). Co-authored-by: Austin Kerbow <[email protected]>.
1 parent eddd342 commit 8f187c7

File tree

6 files changed

+735
-0
lines changed

6 files changed

+735
-0
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13801380
Modified = true;
13811381
} else
13821382
WaitcntInstr = &II;
1383+
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1384+
assert(ST->hasVMemToLDSLoad());
1385+
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1386+
<< "Before: " << Wait.LoadCnt << '\n';);
1387+
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1388+
LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
1389+
1390+
// It is possible (but unlikely) that this is the only wait instruction,
1391+
// in which case, we exit this loop without a WaitcntInstr to consume
1392+
// `Wait`. But that works because `Wait` was passed in by reference, and
1393+
// the caller eventually calls createNewWaitcnt on it. We test this
1394+
// possibility in an artificial MIR test since such a situation cannot be
1395+
// recreated by running the memory legalizer.
1396+
II.eraseFromParent();
13831397
} else {
13841398
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
13851399
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1551,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15511565
ScoreBrackets.simplifyWaitcnt(OldWait);
15521566
Wait = Wait.combined(OldWait);
15531567
UpdatableInstr = &CombinedStoreDsCntInstr;
1568+
} else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1569+
// Architectures higher than GFX10 do not have direct loads to
1570+
// LDS, so no work required here yet.
1571+
II.eraseFromParent();
1572+
continue;
15541573
} else {
15551574
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
15561575
assert(CT.has_value());
@@ -2415,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
24152434
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
24162435
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
24172436
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2437+
Opcode == AMDGPU::S_WAITCNT_lds_direct ||
24182438
counterTypeForInstr(Opcode).has_value();
24192439
}
24202440

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11701170
Changed = true;
11711171
}
11721172

1173+
// On architectures that support direct loads to LDS, emit an unknown waitcnt
1174+
// at workgroup-scoped release operations that specify the LDS address space.
1175+
// SIInsertWaitcnts will later replace this with a vmcnt().
1176+
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1177+
Scope == SIAtomicScope::WORKGROUP &&
1178+
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1179+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1180+
Changed = true;
1181+
}
1182+
11731183
if (Pos == Position::AFTER)
11741184
--MI;
11751185

@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20782088
Changed = true;
20792089
}
20802090

2091+
// On architectures that support direct loads to LDS, emit an unknown waitcnt
2092+
// at workgroup-scoped release operations that specify the LDS address space.
2093+
// SIInsertWaitcnts will later replace this with a vmcnt().
2094+
if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2095+
Scope == SIAtomicScope::WORKGROUP &&
2096+
(AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2097+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2098+
Changed = true;
2099+
}
2100+
20812101
if (VSCnt) {
20822102
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
20832103
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
16211621
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16221622
}
16231623

1624+
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
1625+
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
1626+
1627+
def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
1628+
let hasSideEffects = 0;
1629+
}
1630+
16241631
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
16251632
[(int_amdgcn_s_sethalt timm:$simm16)]>;
16261633
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;

llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
545545
; GFX10WGP-LABEL: name: workgroup_one_as_release
546546
; GFX10WGP: bb.0.entry:
547547
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
548+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
548549
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
549550
; GFX10WGP-NEXT: S_ENDPGM 0
550551
;
551552
; GFX10CU-LABEL: name: workgroup_one_as_release
552553
; GFX10CU: bb.0.entry:
554+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
553555
; GFX10CU-NEXT: S_ENDPGM 0
554556
;
555557
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
578580
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
579581
; GFX10WGP: bb.0.entry:
580582
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
583+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
581584
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
582585
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
583586
; GFX10WGP-NEXT: S_ENDPGM 0
584587
;
585588
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
586589
; GFX10CU: bb.0.entry:
590+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
587591
; GFX10CU-NEXT: S_ENDPGM 0
588592
;
589593
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
613617
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
614618
; GFX10WGP: bb.0.entry:
615619
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
620+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
616621
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
617622
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
618623
; GFX10WGP-NEXT: S_ENDPGM 0
619624
;
620625
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
621626
; GFX10CU: bb.0.entry:
627+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
622628
; GFX10CU-NEXT: S_ENDPGM 0
623629
;
624630
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
12931299
; GFX10WGP-LABEL: name: workgroup_release
12941300
; GFX10WGP: bb.0.entry:
12951301
; GFX10WGP-NEXT: S_WAITCNT_soft 112
1302+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
12961303
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
12971304
; GFX10WGP-NEXT: S_ENDPGM 0
12981305
;
12991306
; GFX10CU-LABEL: name: workgroup_release
13001307
; GFX10CU: bb.0.entry:
13011308
; GFX10CU-NEXT: S_WAITCNT_soft 49279
1309+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
13021310
; GFX10CU-NEXT: S_ENDPGM 0
13031311
;
13041312
; GFX11WGP-LABEL: name: workgroup_release
@@ -1330,13 +1338,15 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
13301338
; GFX10WGP-LABEL: name: workgroup_acq_rel
13311339
; GFX10WGP: bb.0.entry:
13321340
; GFX10WGP-NEXT: S_WAITCNT_soft 112
1341+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
13331342
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
13341343
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
13351344
; GFX10WGP-NEXT: S_ENDPGM 0
13361345
;
13371346
; GFX10CU-LABEL: name: workgroup_acq_rel
13381347
; GFX10CU: bb.0.entry:
13391348
; GFX10CU-NEXT: S_WAITCNT_soft 49279
1349+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
13401350
; GFX10CU-NEXT: S_ENDPGM 0
13411351
;
13421352
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1369,13 +1379,15 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
13691379
; GFX10WGP-LABEL: name: workgroup_seq_cst
13701380
; GFX10WGP: bb.0.entry:
13711381
; GFX10WGP-NEXT: S_WAITCNT_soft 112
1382+
; GFX10WGP-NEXT: S_WAITCNT_lds_direct
13721383
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
13731384
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
13741385
; GFX10WGP-NEXT: S_ENDPGM 0
13751386
;
13761387
; GFX10CU-LABEL: name: workgroup_seq_cst
13771388
; GFX10CU: bb.0.entry:
13781389
; GFX10CU-NEXT: S_WAITCNT_soft 49279
1390+
; GFX10CU-NEXT: S_WAITCNT_lds_direct
13791391
; GFX10CU-NEXT: S_ENDPGM 0
13801392
;
13811393
; GFX11WGP-LABEL: name: workgroup_seq_cst
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
3+
4+
5+
# Expected vmcnt(0) since the direct load is the only load.
6+
---
7+
name: dma_then_fence
8+
body: |
9+
bb.0:
10+
; GCN-LABEL: name: dma_then_fence
11+
; GCN: S_WAITCNT 0
12+
; GCN-NEXT: $m0 = S_MOV_B32 0
13+
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
14+
; GCN-NEXT: S_WAITCNT 3952
15+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
16+
; GCN-NEXT: S_ENDPGM 0
17+
$m0 = S_MOV_B32 0
18+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
19+
S_WAITCNT_lds_direct
20+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
21+
S_ENDPGM 0
22+
23+
...
24+
25+
# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
26+
27+
---
28+
name: dma_then_global_load
29+
body: |
30+
bb.0:
31+
; GCN-LABEL: name: dma_then_global_load
32+
; GCN: S_WAITCNT 0
33+
; GCN-NEXT: $m0 = S_MOV_B32 0
34+
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
35+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
36+
; GCN-NEXT: S_WAITCNT 3953
37+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
38+
; GCN-NEXT: S_ENDPGM 0
39+
$m0 = S_MOV_B32 0
40+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
41+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
42+
S_WAITCNT_lds_direct
43+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
44+
S_ENDPGM 0
45+
46+
...
47+
48+
# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts.
49+
50+
---
51+
name: no_dma_just_fence
52+
body: |
53+
bb.0:
54+
; GCN-LABEL: name: no_dma_just_fence
55+
; GCN: S_WAITCNT 0
56+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
57+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
58+
; GCN-NEXT: S_ENDPGM 0
59+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
60+
S_WAITCNT_lds_direct
61+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
62+
S_ENDPGM 0
63+
64+
...
65+
66+
# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
67+
68+
---
69+
name: dma_then_system_fence
70+
body: |
71+
bb.0:
72+
; GCN-LABEL: name: dma_then_system_fence
73+
; GCN: S_WAITCNT 0
74+
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
75+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
76+
; GCN-NEXT: S_WAITCNT 3953
77+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
78+
; GCN-NEXT: S_ENDPGM 0
79+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
80+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
81+
S_WAITCNT_lds_direct
82+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
83+
S_ENDPGM 0
84+
85+
...
86+
87+
# The computed vmcnt(1) gets merged with the existing vmcnt(0).
88+
89+
---
90+
name: merge_with_prev_wait
91+
body: |
92+
bb.0:
93+
; GCN-LABEL: name: merge_with_prev_wait
94+
; GCN: S_WAITCNT 0
95+
; GCN-NEXT: $m0 = S_MOV_B32 0
96+
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
97+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
98+
; GCN-NEXT: S_WAITCNT 3952
99+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
100+
; GCN-NEXT: S_ENDPGM 0
101+
$m0 = S_MOV_B32 0
102+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
103+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
104+
S_WAITCNT 3952
105+
S_WAITCNT_lds_direct
106+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
107+
S_ENDPGM 0
108+
109+
...
110+
111+
# The computed vmcnt(1) gets merged with the existing vmcnt(0).
112+
113+
---
114+
name: merge_with_next_wait
115+
body: |
116+
bb.0:
117+
; GCN-LABEL: name: merge_with_next_wait
118+
; GCN: S_WAITCNT 0
119+
; GCN-NEXT: $m0 = S_MOV_B32 0
120+
; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
121+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
122+
; GCN-NEXT: S_WAITCNT 3952
123+
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
124+
; GCN-NEXT: S_ENDPGM 0
125+
$m0 = S_MOV_B32 0
126+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
127+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
128+
S_WAITCNT_lds_direct
129+
S_WAITCNT 3952
130+
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
131+
S_ENDPGM 0
132+
133+
...

0 commit comments

Comments
 (0)