Skip to content

Commit 0ba40d4

Browse files
committed
AMDGPU/GlobalISel: Combines for V_CVT_F32_UBYTE[0-3]
Ports the existing DAG combines, minus the simplify-demanded-bits combine, which seems to have no equivalent now. Without these, this isn't particularly helpful in most of the IR sample cases.
1 parent 10df156 commit 0ba40d4

File tree

5 files changed

+1819
-1
lines changed

5 files changed

+1819
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,16 @@ def uchar_to_float : GICombineRule<
2626
[{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]),
2727
(apply [{ applyUCharToFloat(*${itofp}); }])>;
2828

29+
// Match data for the cvt_f32_ubyteN rule: a CvtF32UByteMatchInfo that is passed
// as $matchinfo to both the match and the apply code of that rule.
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
30+
31+
// Combine rooted at any of G_AMDGPU_CVT_F32_UBYTE0..3. matchCvtF32UByteN
// decides whether the rule fires and fills $matchinfo; applyCvtF32UByteN then
// consumes $matchinfo to rewrite the root instruction.
def cvt_f32_ubyteN : GICombineRule<
  (defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
  (match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
                           G_AMDGPU_CVT_F32_UBYTE2,
                           G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
    [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
  (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
2939

3040
// Combines which should only apply on SI/VI
3141
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -49,6 +59,6 @@ def all_combines_minus_extload : GICombineGroup<[trivial_combines,
4959
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
5060
"AMDGPUGenPostLegalizerCombinerHelper",
5161
[all_combines_minus_extload, gfx6gfx7_combines,
52-
uchar_to_float]> {
62+
uchar_to_float, cvt_f32_ubyteN]> {
5363
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
5464
}

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,60 @@ static void applyUCharToFloat(MachineInstr &MI) {
165165
MI.eraseFromParent();
166166
}
167167

168+
// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
169+
// boilerplate.
170+
struct CvtF32UByteMatchInfo {
171+
Register CvtVal;
172+
unsigned ShiftOffset;
173+
};
174+
175+
/// Match `G_AMDGPU_CVT_F32_UBYTEn (shift X, C)`, optionally with a G_ZEXT
/// between the shift and the conversion, where `shift` is G_LSHR or G_SHL and
/// C is a constant, so that the shift can be folded into the byte index of the
/// conversion.
///
/// \param MI        The G_AMDGPU_CVT_F32_UBYTE[0-3] instruction to inspect.
/// \param MRI       Register info used for pattern matching and type queries.
/// \param MF        Unused.
/// \param MatchInfo Written only on success: X and the bit offset of the byte
///                  of X that the replacement conversion must read.
/// \returns true if the fold is possible.
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT. The extension is optional, so the match result is
  // intentionally ignored.
  (void)mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register ShiftSrc;
  int64_t ShiftAmt;
  const bool IsShr =
      mi_match(SrcReg, MRI, m_GLShr(m_Reg(ShiftSrc), m_ICst(ShiftAmt)));
  if (!IsShr &&
      !mi_match(SrcReg, MRI, m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftAmt)))) {
    // TODO: Simplify demanded bits.
    return false;
  }

  // Width of the type the shift was performed in. After looking through a
  // G_ZEXT this can be narrower than the 32 bits the conversion reads from.
  const int64_t SrcBits = MRI.getType(ShiftSrc).getSizeInBits();

  // Reject out-of-range shift amounts, and a zero shift: the latter would
  // select the opcode MI already has, which applyCvtF32UByteN asserts against.
  if (ShiftAmt <= 0 || ShiftAmt >= SrcBits)
    return false;

  const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
  const int64_t ByteOffset = 8 * static_cast<int64_t>(Offset);

  // The byte MI reads must lie inside the shifted value. If it lies in the
  // bits introduced by the zero extension it is known zero, and no byte of the
  // shift source reproduces that.
  if (ByteOffset + 8 > SrcBits)
    return false;

  // Signed arithmetic so that a left shift larger than the byte offset yields
  // a negative value instead of wrapping around.
  const int64_t ShiftOffset =
      IsShr ? ByteOffset + ShiftAmt : ByteOffset - ShiftAmt;

  // Must be a whole byte the UBYTE[1-3] forms can address.
  // NOTE(review): offset 0 (folding to UBYTE0) is rejected here exactly as in
  // the original condition; confirm whether that restriction is intentional.
  if (ShiftOffset < 8 || ShiftOffset >= 32 || (ShiftOffset % 8) != 0)
    return false;

  // The selected byte must exist in the shift source. applyCvtF32UByteN widens
  // narrower sources with an any-extend, whose high bits are unspecified.
  if (ShiftOffset + 8 > SrcBits)
    return false;

  MatchInfo.CvtVal = ShiftSrc;
  MatchInfo.ShiftOffset = static_cast<unsigned>(ShiftOffset);
  return true;
}
203+
204+
/// Rewrite \p MI as the G_AMDGPU_CVT_F32_UBYTEn variant described by
/// \p MatchInfo, reading from MatchInfo.CvtVal, and erase the original.
///
/// \param MI        The conversion instruction being replaced.
/// \param MatchInfo Source value and bit offset of the byte to convert.
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder Builder(MI);

  // The byte index is added to the UBYTE0 opcode to pick the new opcode.
  const unsigned ByteIdx = MatchInfo.ShiftOffset / 8;
  const unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + ByteIdx;

  const LLT S32 = LLT::scalar(32);
  Register Src = MatchInfo.CvtVal;
  const LLT SrcTy = Builder.getMRI()->getType(Src);

  // The new instruction is given a 32-bit source; any other source type is
  // any-extended to 32 bits first.
  const bool NeedsExtend = SrcTy != S32;
  if (NeedsExtend) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    Src = Builder.buildAnyExt(S32, Src).getReg(0);
  }

  // The rewrite is expected to change the opcode.
  assert(MI.getOpcode() != NewOpc);
  Builder.buildInstr(NewOpc, {MI.getOperand(0)}, {Src}, MI.getFlags());
  MI.eraseFromParent();
}
221+
168222
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
169223
#include "AMDGPUGenPostLegalizeGICombiner.inc"
170224
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9919,6 +9919,8 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
99199919

99209920
SDValue Src = N->getOperand(0);
99219921
SDValue Shift = N->getOperand(0);
9922+
9923+
// TODO: Extend type shouldn't matter (assuming legal types).
99229924
if (Shift.getOpcode() == ISD::ZERO_EXTEND)
99239925
Shift = Shift.getOperand(0);
99249926

0 commit comments

Comments
 (0)