Skip to content

Commit 6bb5257

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 3
Supporting Arithmetic Operations: `and`, `or`, `xor`
1 parent d2b8989 commit 6bb5257

File tree

5 files changed

+3197
-6
lines changed

5 files changed

+3197
-6
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5111,9 +5111,12 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51115111
case AMDGPU::S_SUB_I32:
51125112
case AMDGPU::S_SUB_U64_PSEUDO:
51135113
case AMDGPU::S_OR_B32:
5114+
case AMDGPU::S_OR_B64:
51145115
case AMDGPU::S_XOR_B32:
5116+
case AMDGPU::S_XOR_B64:
51155117
return std::numeric_limits<uint32_t>::min();
51165118
case AMDGPU::S_AND_B32:
5119+
case AMDGPU::S_AND_B64:
51175120
return std::numeric_limits<uint32_t>::max();
51185121
default:
51195122
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
@@ -5146,14 +5149,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51465149
case AMDGPU::S_MAX_I32:
51475150
case AMDGPU::V_CMP_GT_I64_e64: /*max*/
51485151
case AMDGPU::S_AND_B32:
5149-
case AMDGPU::S_OR_B32: {
5152+
case AMDGPU::S_AND_B64:
5153+
case AMDGPU::S_OR_B32:
5154+
case AMDGPU::S_OR_B64: {
51505155
// Idempotent operations.
51515156
unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51525157
BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
51535158
RetBB = &BB;
51545159
break;
51555160
}
51565161
case AMDGPU::S_XOR_B32:
5162+
case AMDGPU::S_XOR_B64:
51575163
case AMDGPU::S_ADD_I32:
51585164
case AMDGPU::S_ADD_U64_PSEUDO:
51595165
case AMDGPU::S_SUB_I32:
@@ -5177,7 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51775183
.addReg(ExecMask);
51785184

51795185
switch (Opc) {
5180-
case AMDGPU::S_XOR_B32: {
5186+
case AMDGPU::S_XOR_B32:
5187+
case AMDGPU::S_XOR_B64: {
51815188
// Performing an XOR operation on a uniform value
51825189
// depends on the parity of the number of active lanes.
51835190
// For even parity, the result will be 0, for odd
@@ -5189,10 +5196,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51895196
.addReg(NewAccumulator->getOperand(0).getReg())
51905197
.addImm(1)
51915198
.setOperandDead(3); // Dead scc
5192-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193-
.addReg(SrcReg)
5194-
.addReg(ParityRegister);
5195-
break;
5199+
if (is32BitOpc) {
5200+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5201+
.addReg(SrcReg)
5202+
.addReg(ParityRegister);
5203+
break;
5204+
} else {
5205+
Register DestSub0 =
5206+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5207+
Register DestSub1 =
5208+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5209+
Register Op1H_Op0L_Reg =
5210+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5211+
Register CarryReg =
5212+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5213+
5214+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5215+
const TargetRegisterClass *SrcSubRC =
5216+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5217+
5218+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5219+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5220+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5221+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5222+
5223+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5224+
.add(Op1L)
5225+
.addReg(ParityRegister);
5226+
5227+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5228+
.add(Op1H)
5229+
.addReg(ParityRegister);
5230+
5231+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5232+
.add(Op1L)
5233+
.addReg(ParityRegister);
5234+
5235+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5236+
.addReg(CarryReg)
5237+
.addReg(Op1H_Op0L_Reg)
5238+
.setOperandDead(3); // Dead scc
5239+
5240+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5241+
.addReg(DestSub0)
5242+
.addImm(AMDGPU::sub0)
5243+
.addReg(DestSub1)
5244+
.addImm(AMDGPU::sub1);
5245+
break;
5246+
}
51965247
}
51975248
case AMDGPU::S_SUB_I32: {
51985249
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
@@ -5407,6 +5458,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54075458
.addReg(LaneValueHiReg)
54085459
.addImm(AMDGPU::sub1);
54095460
switch (Opc) {
5461+
case ::AMDGPU::S_OR_B64:
5462+
case ::AMDGPU::S_AND_B64:
5463+
case ::AMDGPU::S_XOR_B64: {
5464+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5465+
.addReg(Accumulator->getOperand(0).getReg())
5466+
.addReg(LaneValue->getOperand(0).getReg())
5467+
.setOperandDead(3); // Dead scc
5468+
break;
5469+
}
54105470
case AMDGPU::V_CMP_GT_I64_e64:
54115471
case AMDGPU::V_CMP_GT_U64_e64:
54125472
case AMDGPU::V_CMP_LT_I64_e64:
@@ -5538,10 +5598,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
55385598
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
55395599
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
55405600
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5601+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5602+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
55415603
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
55425604
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5605+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5606+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
55435607
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
55445608
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5609+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5610+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
55455611
case AMDGPU::S_UADDO_PSEUDO:
55465612
case AMDGPU::S_USUBO_PSEUDO: {
55475613
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,9 @@ defvar Operations = [
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348348
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349349
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
350+
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
351+
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
352+
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
350353
];
351354

352355
foreach Op = Operations in {

0 commit comments

Comments
 (0)