Skip to content

Commit f34b6fd

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent cdbe34c commit f34b6fd

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51075107
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
51085108
return std::numeric_limits<int32_t>::min();
51095109
case AMDGPU::S_ADD_I32:
5110+
case AMDGPU::S_ADD_U64_PSEUDO:
51105111
case AMDGPU::S_SUB_I32:
5112+
case AMDGPU::S_SUB_U64_PSEUDO:
51115113
case AMDGPU::S_OR_B32:
51125114
case AMDGPU::S_XOR_B32:
51135115
return std::numeric_limits<uint32_t>::min();
@@ -5153,51 +5155,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51535155
}
51545156
case AMDGPU::S_XOR_B32:
51555157
case AMDGPU::S_ADD_I32:
5156-
case AMDGPU::S_SUB_I32: {
5158+
case AMDGPU::S_ADD_U64_PSEUDO:
5159+
case AMDGPU::S_SUB_I32:
5160+
case AMDGPU::S_SUB_U64_PSEUDO: {
51575161
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
51585162
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
51595163
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5160-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5164+
Register ActiveLanes =
5165+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51615166

51625167
bool IsWave32 = ST.isWave32();
51635168
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51645169
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
51655170
unsigned CountReg =
51665171
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
51675172

5168-
auto Exec =
51695173
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
51705174

5171-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5172-
.addReg(Exec->getOperand(0).getReg());
5175+
auto NewAccumulator =
5176+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5177+
.addReg(ExecMask);
5178+
5179+
switch (Opc) {
5180+
case AMDGPU::S_XOR_B32: {
5181+
// Performing an XOR operation on a uniform value
5182+
// depends on the parity of the number of active lanes.
5183+
// For even parity, the result will be 0, for odd
5184+
// parity the result will be the same as the input value.
5185+
Register ParityRegister =
5186+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51735187

5174-
switch (Opc) {
5175-
case AMDGPU::S_XOR_B32: {
5176-
// Performing an XOR operation on a uniform value
5177-
// depends on the parity of the number of active lanes.
5178-
// For even parity, the result will be 0, for odd
5179-
// parity the result will be the same as the input value.
5180-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5181-
5182-
auto ParityReg =
51835188
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
51845189
.addReg(NewAccumulator->getOperand(0).getReg())
5185-
.addImm(1);
5186-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5187-
.addReg(SrcReg)
5188-
.addReg(ParityReg->getOperand(0).getReg());
5189-
break;
5190-
}
5190+
.addImm(1)
5191+
.setOperandDead(3); // Dead scc
5192+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193+
.addReg(SrcReg)
5194+
.addReg(ParityRegister);
5195+
break;
5196+
}
51915197
case AMDGPU::S_SUB_I32: {
51925198
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
51935199

51945200
// Take the negation of the source operand.
5195-
auto InvertedValReg =
5196-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5197-
.addImm(-1)
5198-
.addReg(SrcReg);
5201+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5202+
.addImm(0)
5203+
.addReg(SrcReg);
51995204
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5200-
.addReg(InvertedValReg->getOperand(0).getReg())
5205+
.addReg(NegatedVal)
52015206
.addReg(NewAccumulator->getOperand(0).getReg());
52025207
break;
52035208
}
@@ -5207,6 +5212,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52075212
.addReg(NewAccumulator->getOperand(0).getReg());
52085213
break;
52095214
}
5215+
case AMDGPU::S_ADD_U64_PSEUDO:
5216+
case AMDGPU::S_SUB_U64_PSEUDO: {
5217+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5218+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5219+
Register Op1H_Op0L_Reg =
5220+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221+
Register Op1L_Op0H_Reg =
5222+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5223+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5224+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5225+
Register NegatedValLo =
5226+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5227+
Register NegatedValHi =
5228+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5229+
5230+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5231+
const TargetRegisterClass *Src1SubRC =
5232+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5233+
5234+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5235+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5236+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5237+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5238+
5239+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5240+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5241+
.addImm(0)
5242+
.addReg(NewAccumulator->getOperand(0).getReg());
5243+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5244+
.addReg(NegatedValLo)
5245+
.addImm(31)
5246+
.setOperandDead(3); // Dead scc
5247+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5248+
.add(Op1L)
5249+
.addReg(NegatedValHi);
5250+
}
5251+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5252+
? NegatedValLo
5253+
: NewAccumulator->getOperand(0).getReg();
5254+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5255+
.add(Op1L)
5256+
.addReg(LowOpcode);
5257+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5258+
.add(Op1L)
5259+
.addReg(LowOpcode);
5260+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5261+
.add(Op1H)
5262+
.addReg(LowOpcode);
5263+
5264+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5265+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5266+
.addReg(CarryReg)
5267+
.addReg(Op1H_Op0L_Reg)
5268+
.setOperandDead(3); // Dead scc
5269+
5270+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5271+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5272+
.addReg(HiVal)
5273+
.addReg(Op1L_Op0H_Reg)
5274+
.setOperandDead(3); // Dead scc
5275+
}
5276+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5277+
.addReg(DestSub0)
5278+
.addImm(AMDGPU::sub0)
5279+
.addReg(DestSub1)
5280+
.addImm(AMDGPU::sub1);
5281+
break;
5282+
}
52105283
}
52115284
RetBB = &BB;
52125285
}
@@ -5374,6 +5447,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53745447
.addReg(Accumulator->getOperand(0).getReg());
53755448
break;
53765449
}
5450+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5451+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5452+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5453+
: AMDGPU::S_SUB_U32;
5454+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5455+
: AMDGPU::S_SUBB_U32;
5456+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5457+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5458+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5459+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5460+
&AMDGPU::SReg_32RegClass);
5461+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5462+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5463+
&AMDGPU::SReg_32RegClass);
5464+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5465+
.add(Accumlo)
5466+
.addReg(LaneValueLo->getOperand(0).getReg());
5467+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5468+
.add(Accumhi)
5469+
.addReg(LaneValueHi->getOperand(0).getReg());
5470+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5471+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5472+
.addReg(DestLo)
5473+
.addImm(AMDGPU::sub0)
5474+
.addReg(DestHi)
5475+
.addImm(AMDGPU::sub1);
5476+
break;
5477+
}
53775478
}
53785479
}
53795480
// Manipulate the iterator to get the next active lane
@@ -5429,8 +5530,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54295530
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54305531
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54315532
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5533+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5534+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
54325535
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
54335536
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5537+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5538+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
54345539
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
54355540
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
54365541
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,8 @@ defvar Operations = [
345345
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346346
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
348350
];
349351

350352
foreach Op = Operations in {

0 commit comments

Comments
 (0)