Skip to content

Commit 2b20cf7

Browse files
authored
[AMDGPU] Fold into uses of splat REG_SEQUENCEs through COPYs. (#145691)
1 parent b18377c commit 2b20cf7

File tree

3 files changed

+38
-11
lines changed

3 files changed

+38
-11
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1169,11 +1169,18 @@ void SIFoldOperandsImpl::foldOperand(
11691169
// Grab the use operands first
11701170
SmallVector<MachineOperand *, 4> UsesToProcess(
11711171
llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1172-
for (auto *RSUse : UsesToProcess) {
1172+
for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1173+
MachineOperand *RSUse = UsesToProcess[I];
11731174
MachineInstr *RSUseMI = RSUse->getParent();
11741175
unsigned OpNo = RSUseMI->getOperandNo(RSUse);
11751176

11761177
if (SplatRC) {
1178+
if (RSUseMI->isCopy()) {
1179+
Register DstReg = RSUseMI->getOperand(0).getReg();
1180+
append_range(UsesToProcess,
1181+
make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1182+
continue;
1183+
}
11771184
if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
11781185
FoldableDef SplatDef(SplatVal, SplatRC);
11791186
appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3238,11 +3238,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
32383238
; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
32393239
; GFX90A-GISEL: ; %bb.0: ; %bb
32403240
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3241-
; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0
3242-
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
3243-
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
32443241
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
3245-
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
3242+
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
32463243
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1
32473244
; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
32483245
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3253,11 +3250,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
32533250
; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
32543251
; GFX942-GISEL: ; %bb.0: ; %bb
32553252
; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3256-
; GFX942-GISEL-NEXT: s_mov_b32 s2, 0
3257-
; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
3258-
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
32593253
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
3260-
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
3254+
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
32613255
; GFX942-GISEL-NEXT: s_nop 0
32623256
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1
32633257
; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,44 @@
1-
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
23

4+
# Check that we don't hang on this.
35
---
46
name: fold_reg_sequence
57
body: |
68
bb.0:
79
liveins: $vgpr0_vgpr1, $vgpr2
810
11+
; CHECK-LABEL: name: fold_reg_sequence
12+
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
15+
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 429
16+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
17+
; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, [[REG_SEQUENCE]].sub0, implicit $exec
18+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1)
19+
; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[GLOBAL_LOAD_DWORD]], [[REG_SEQUENCE]].sub0, implicit $exec
20+
; CHECK-NEXT: S_ENDPGM 0
921
%0:sreg_32 = S_MOV_B32 0
1022
%1:sreg_32 = S_MOV_B32 429
1123
%2:sreg_64 = REG_SEQUENCE killed %1, %subreg.sub0, %0, %subreg.sub1
1224
%3:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, %2.sub0, implicit $exec
1325
%4:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1)
1426
%5:vgpr_32 = V_MUL_HI_U32_e64 %4, %2.sub0, implicit $exec
1527
S_ENDPGM 0
16-
1728
...
1829

30+
# Fold through a COPY of REG_SEQUENCE.
31+
---
32+
name: fold_through_copy
33+
body: |
34+
bb.0:
35+
; CHECK-LABEL: name: fold_through_copy
36+
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
37+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
38+
; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[DEF]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
39+
%0:sreg_32 = S_MOV_B32 0
40+
%1:sreg_64 = REG_SEQUENCE %0:sreg_32, %subreg.sub0, %0:sreg_32, %subreg.sub1
41+
%2:sreg_64_xexec = IMPLICIT_DEF
42+
%3:vreg_64_align2 = COPY %1:sreg_64
43+
%4:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %2:sreg_64_xexec, 8, %3:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
44+
...

0 commit comments

Comments
 (0)