Skip to content

Commit ea4597e

Browse files
committed
Reapply "AMDGPU/GlobalISel: Fully handle 0 dmask case during legalize"
This reverts commit 9bca8fc. Rearrange handling to avoid changing the instruction in the case where it's going to be erased and replaced with undef.
1 parent d1a7bfc commit ea4597e

File tree

4 files changed

+104
-148
lines changed

4 files changed

+104
-148
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3470,6 +3470,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
34703470
MachineInstr &MI, MachineIRBuilder &B,
34713471
GISelChangeObserver &Observer,
34723472
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3473+
B.setInstr(MI);
3474+
34733475
const int NumDefs = MI.getNumExplicitDefs();
34743476
bool IsTFE = NumDefs == 2;
34753477
// We are only processing the operands of d16 image operations on subtargets
@@ -3479,18 +3481,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
34793481
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
34803482
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
34813483

3482-
Observer.changingInstr(MI);
3483-
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3484-
3485-
3486-
unsigned NewOpcode = NumDefs == 0 ?
3487-
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3488-
3489-
// Track that we legalized this
3490-
MI.setDesc(B.getTII().get(NewOpcode));
3491-
3492-
B.setInstr(MI);
3493-
34943484
MachineRegisterInfo *MRI = B.getMRI();
34953485
const LLT S32 = LLT::scalar(32);
34963486
const LLT S16 = LLT::scalar(16);
@@ -3506,6 +3496,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
35063496

35073497
int NumVAddrs, NumGradients;
35083498
std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3499+
const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3500+
getDMaskIdx(BaseOpcode, NumDefs);
3501+
unsigned DMask = 0;
3502+
3503+
int DMaskLanes = 0;
3504+
if (!BaseOpcode->Atomic) {
3505+
DMask = MI.getOperand(DMaskIdx).getImm();
3506+
if (BaseOpcode->Gather4) {
3507+
DMaskLanes = 4;
3508+
} else if (DMask != 0) {
3509+
DMaskLanes = countPopulation(DMask);
3510+
} else if (!IsTFE && !BaseOpcode->Store) {
3511+
// If dmask is 0, this is a no-op load. This can be eliminated.
3512+
B.buildUndef(MI.getOperand(0));
3513+
MI.eraseFromParent();
3514+
return true;
3515+
}
3516+
}
3517+
3518+
Observer.changingInstr(MI);
3519+
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3520+
3521+
unsigned NewOpcode = NumDefs == 0 ?
3522+
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3523+
3524+
// Track that we legalized this
3525+
MI.setDesc(B.getTII().get(NewOpcode));
3526+
3527+
// Expecting to get an error flag since TFC is on and dmask is 0. Force
3528+
// dmask to be at least 1, otherwise the instruction will fail.
3529+
if (IsTFE && DMask == 0) {
3530+
DMask = 0x1;
3531+
DMaskLanes = 1;
3532+
MI.getOperand(DMaskIdx).setImm(DMask);
3533+
}
35093534

35103535
// If the register allocator cannot place the address registers contiguously
35113536
// without introducing moves, then using the non-sequential address encoding
@@ -3556,13 +3581,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
35563581
convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
35573582
}
35583583

3559-
int DMaskLanes = 0;
3560-
if (!BaseOpcode->Atomic) {
3561-
const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs);
3562-
unsigned DMask = MI.getOperand(DMaskIdx).getImm();
3563-
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
3564-
}
3565-
35663584
if (BaseOpcode->Store) { // No TFE for stores?
35673585
// TODO: Handle dmask trim
35683586
Register VData = MI.getOperand(1).getReg();

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2903,15 +2903,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
29032903
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
29042904
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
29052905
; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
2906-
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
2907-
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
2908-
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
2909-
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
2910-
; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
2911-
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
2912-
; GFX9: $vgpr1 = COPY [[DEF]](s32)
2913-
; GFX9: $vgpr2 = COPY [[DEF]](s32)
2914-
; GFX9: $vgpr3 = COPY [[DEF]](s32)
2906+
; GFX9: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
2907+
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
2908+
; GFX9: $vgpr0 = COPY [[UV]](s32)
2909+
; GFX9: $vgpr1 = COPY [[UV1]](s32)
2910+
; GFX9: $vgpr2 = COPY [[UV2]](s32)
2911+
; GFX9: $vgpr3 = COPY [[UV3]](s32)
29152912
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
29162913
; GFX10NSA-LABEL: name: getresinfo_dmask0
29172914
; GFX10NSA: bb.1.main_body:
@@ -2925,15 +2922,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
29252922
; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
29262923
; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
29272924
; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
2928-
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
2929-
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
2930-
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
2931-
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
2932-
; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
2933-
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
2934-
; GFX10NSA: $vgpr1 = COPY [[DEF]](s32)
2935-
; GFX10NSA: $vgpr2 = COPY [[DEF]](s32)
2936-
; GFX10NSA: $vgpr3 = COPY [[DEF]](s32)
2925+
; GFX10NSA: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
2926+
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
2927+
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
2928+
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
2929+
; GFX10NSA: $vgpr2 = COPY [[UV2]](s32)
2930+
; GFX10NSA: $vgpr3 = COPY [[UV3]](s32)
29372931
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
29382932
main_body:
29392933
%mip = extractelement <2 x i16> %coords, i32 0

0 commit comments

Comments
 (0)