Skip to content

Commit cdbe34c

Browse files
committed
[AMDGPU] Extend wave reduction intrinsics to i64 types — part 1
Adds support for the min/max operations: `min`, `max`, `umin`, `umax`
1 parent 2edc730 commit cdbe34c

File tree

6 files changed

+3510
-43
lines changed

6 files changed

+3510
-43
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 144 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5095,12 +5095,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
50955095
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
50965096
switch (Opc) {
50975097
case AMDGPU::S_MIN_U32:
5098+
case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
50985099
return std::numeric_limits<uint32_t>::max();
50995100
case AMDGPU::S_MIN_I32:
5101+
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
51005102
return std::numeric_limits<int32_t>::max();
51015103
case AMDGPU::S_MAX_U32:
5104+
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
51025105
return std::numeric_limits<uint32_t>::min();
51035106
case AMDGPU::S_MAX_I32:
5107+
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
51045108
return std::numeric_limits<int32_t>::min();
51055109
case AMDGPU::S_ADD_I32:
51065110
case AMDGPU::S_SUB_I32:
@@ -5128,16 +5132,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51285132
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
51295133
Register DstReg = MI.getOperand(0).getReg();
51305134
MachineBasicBlock *RetBB = nullptr;
5135+
bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
51315136
if (isSGPR) {
51325137
switch (Opc) {
51335138
case AMDGPU::S_MIN_U32:
5139+
case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
51345140
case AMDGPU::S_MIN_I32:
5141+
case AMDGPU::V_CMP_LT_I64_e64: /*min*/
51355142
case AMDGPU::S_MAX_U32:
5143+
case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
51365144
case AMDGPU::S_MAX_I32:
5145+
case AMDGPU::V_CMP_GT_I64_e64: /*max*/
51375146
case AMDGPU::S_AND_B32:
51385147
case AMDGPU::S_OR_B32: {
51395148
// Idempotent operations.
5140-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5149+
unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5150+
BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
51415151
RetBB = &BB;
51425152
break;
51435153
}
@@ -5222,73 +5232,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52225232
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
52235233
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
52245234
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5225-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5226-
5235+
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
52275236
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
52285237
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
52295238
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5230-
5231-
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5232-
Register LaneValueReg =
5233-
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5239+
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240+
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
52345241

52355242
bool IsWave32 = ST.isWave32();
5236-
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5243+
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
52375244
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
52385245

52395246
// Create initial values of induction variable from Exec, Accumulator and
52405247
// insert branch instr to newly created ComputeBlock
5241-
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5242-
auto TmpSReg =
5243-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5244-
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5245-
.addImm(InitalValue);
5248+
uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
5249+
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5250+
if (is32BitOpc) {
5251+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5252+
.addImm(IdentityValue);
5253+
} else {
5254+
Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5255+
Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5256+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
5257+
.addImm(IdentityValue);
5258+
switch (Opc) {
5259+
case AMDGPU::V_CMP_LT_U64_e64:
5260+
case AMDGPU::V_CMP_LT_I64_e64:
5261+
IdentityValue = int32_t(-1); // u|min
5262+
break;
5263+
case AMDGPU::V_CMP_GT_U64_e64:
5264+
case AMDGPU::V_CMP_GT_I64_e64:
5265+
IdentityValue = int32_t(0); // u|max
5266+
break;
5267+
}
5268+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
5269+
.addImm(IdentityValue);
5270+
BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
5271+
.addReg(Identitylo)
5272+
.addImm(AMDGPU::sub0)
5273+
.addReg(Identityhi)
5274+
.addImm(AMDGPU::sub1);
5275+
}
52465276
// clang-format off
52475277
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
52485278
.addMBB(ComputeLoop);
52495279
// clang-format on
52505280

52515281
// Start constructing ComputeLoop
5252-
I = ComputeLoop->end();
5282+
I = ComputeLoop->begin();
52535283
auto Accumulator =
52545284
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5255-
.addReg(InitalValReg)
5285+
.addReg(IdentityValReg)
52565286
.addMBB(&BB);
52575287
auto ActiveBits =
52585288
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5259-
.addReg(TmpSReg->getOperand(0).getReg())
5289+
.addReg(LoopIterator)
52605290
.addMBB(&BB);
52615291

5292+
I = ComputeLoop->end();
5293+
MachineInstr *NewAccumulator;
52625294
// Perform the computations
52635295
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5264-
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5265-
.addReg(ActiveBits->getOperand(0).getReg());
5266-
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5267-
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5268-
.addReg(SrcReg)
5269-
.addReg(FF1->getOperand(0).getReg());
5270-
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5271-
.addReg(Accumulator->getOperand(0).getReg())
5272-
.addReg(LaneValue->getOperand(0).getReg());
5273-
5296+
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5297+
.addReg(ActiveBitsReg);
5298+
if (is32BitOpc) {
5299+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5300+
LaneValueReg)
5301+
.addReg(SrcReg)
5302+
.addReg(FF1Reg);
5303+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5304+
.addReg(Accumulator->getOperand(0).getReg())
5305+
.addReg(LaneValueReg);
5306+
} else {
5307+
Register LaneValueLoReg =
5308+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5309+
Register LaneValueHiReg =
5310+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5311+
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5312+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5313+
const TargetRegisterClass *SrcSubRC =
5314+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5315+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5316+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5317+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5318+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5319+
// lane value input should be in an sgpr
5320+
MachineInstr *LaneValueLo =
5321+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5322+
LaneValueLoReg)
5323+
.add(Op1L)
5324+
.addReg(FF1Reg);
5325+
MachineInstr *LaneValueHi =
5326+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5327+
LaneValueHiReg)
5328+
.add(Op1H)
5329+
.addReg(FF1Reg);
5330+
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5331+
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5332+
.addReg(LaneValueLoReg)
5333+
.addImm(AMDGPU::sub0)
5334+
.addReg(LaneValueHiReg)
5335+
.addImm(AMDGPU::sub1);
5336+
switch (Opc) {
5337+
case AMDGPU::V_CMP_GT_I64_e64:
5338+
case AMDGPU::V_CMP_GT_U64_e64:
5339+
case AMDGPU::V_CMP_LT_I64_e64:
5340+
case AMDGPU::V_CMP_LT_U64_e64: {
5341+
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5342+
Register ComparisonResultReg =
5343+
MRI.createVirtualRegister(WaveMaskRegClass);
5344+
const TargetRegisterClass *VregClass =
5345+
ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
5346+
: &AMDGPU::VReg_64RegClass;
5347+
const TargetRegisterClass *VSubRegClass =
5348+
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5349+
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5350+
MachineOperand SrcReg0Sub0 =
5351+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5352+
VregClass, AMDGPU::sub0, VSubRegClass);
5353+
MachineOperand SrcReg0Sub1 =
5354+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5355+
VregClass, AMDGPU::sub1, VSubRegClass);
5356+
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5357+
AccumulatorVReg)
5358+
.add(SrcReg0Sub0)
5359+
.addImm(AMDGPU::sub0)
5360+
.add(SrcReg0Sub1)
5361+
.addImm(AMDGPU::sub1);
5362+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5363+
.addReg(LaneValue->getOperand(0).getReg())
5364+
.addReg(AccumulatorVReg);
5365+
5366+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5367+
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5368+
.addReg(LaneMaskReg)
5369+
.addReg(ActiveBitsReg);
5370+
5371+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5372+
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5373+
.addReg(LaneValue->getOperand(0).getReg())
5374+
.addReg(Accumulator->getOperand(0).getReg());
5375+
break;
5376+
}
5377+
}
5378+
}
52745379
// Manipulate the iterator to get the next active lane
52755380
unsigned BITSETOpc =
52765381
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5277-
auto NewActiveBits =
5278-
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5279-
.addReg(FF1->getOperand(0).getReg())
5280-
.addReg(ActiveBits->getOperand(0).getReg());
5382+
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5383+
.addReg(FF1Reg)
5384+
.addReg(ActiveBitsReg);
52815385

52825386
// Add phi nodes
52835387
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
52845388
.addMBB(ComputeLoop);
5285-
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5286-
.addMBB(ComputeLoop);
5389+
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
52875390

52885391
// Creating branching
52895392
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
52905393
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5291-
.addReg(NewActiveBits->getOperand(0).getReg())
5394+
.addReg(NewActiveBitsReg)
52925395
.addImm(0);
52935396
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
52945397
.addMBB(ComputeLoop);
@@ -5310,12 +5413,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
53105413
switch (MI.getOpcode()) {
53115414
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
53125415
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5416+
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5417+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
53135418
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
53145419
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5420+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5421+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
53155422
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
53165423
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5424+
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5425+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
53175426
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
53185427
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5428+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5429+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
53195430
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
53205431
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
53215432
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -304,28 +304,52 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
304304
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
305305

306306
// clang-format off
307-
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
307+
308308
multiclass
309-
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
309+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
310310
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
311311
def !toupper(Op) #"_PSEUDO_" #DataType
312-
: VPseudoInstSI<(outs SGPR_32 : $sdst),
313-
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
314-
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
312+
: VPseudoInstSI<(outs RetReg : $sdst),
313+
(ins Reg : $src, VSrc_b32 : $strategy),
314+
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
315315
}
316316
}
317317
// clang-format on
318318

319+
class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
320+
RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
321+
string Name = OpName;
322+
string TypeString = TypeStr;
323+
ValueType VT = Ty;
324+
RegisterClass RetReg = ReturnRegisterClass;
325+
SrcRegOrImm9 Reg = RC;
326+
}
327+
319328
// Input list : [Operation_name,
320-
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
329+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
330+
// bit-width
331+
// output register class,
332+
// input register class]
321333
defvar Operations = [
322-
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
323-
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
324-
["xor", "B32"]
334+
WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
335+
WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
336+
WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
337+
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
338+
WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
339+
WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
340+
WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
341+
WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
342+
WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
343+
344+
WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
345+
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346+
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347+
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
325348
];
326349

327350
foreach Op = Operations in {
328-
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
351+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
352+
Op.VT, Op.RetReg, Op.Reg>;
329353
}
330354

331355
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments (0)