@@ -5095,12 +5095,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_MIN_U32:
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
@@ -5128,16 +5132,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
  Register DstReg = MI.getOperand(0).getReg();
  MachineBasicBlock *RetBB = nullptr;
+  bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
  if (isSGPR) {
    switch (Opc) {
    case AMDGPU::S_MIN_U32:
+    case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
    case AMDGPU::S_MIN_I32:
+    case AMDGPU::V_CMP_LT_I64_e64: /*min*/
    case AMDGPU::S_MAX_U32:
+    case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
    case AMDGPU::S_MAX_I32:
+    case AMDGPU::V_CMP_GT_I64_e64: /*max*/
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {
      // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
      RetBB = &BB;
      break;
    }
@@ -5222,73 +5232,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
    const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
    const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    // Create initial values of induction variable from Exec, Accumulator and
    // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+    if (is32BitOpc) {
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
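+      // The 32-bit identity computed above becomes the high word of the
+      // 64-bit identity; the low word is patched below to -1 for min and 0
+      // for max, yielding UINT64_MAX/INT64_MAX for the min cases and
+      // 0/INT64_MIN for the max cases.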
+      Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+          .addImm(IdentityValue);
+      switch (Opc) {
+      case AMDGPU::V_CMP_LT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+        IdentityValue = int32_t(-1); // u|min
+        break;
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_GT_I64_e64:
+        IdentityValue = int32_t(0); // u|max
+        break;
+      }
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+          .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+          .addReg(Identitylo)
+          .addImm(AMDGPU::sub0)
+          .addReg(Identityhi)
+          .addImm(AMDGPU::sub1);
+    }
    // clang-format off
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
        .addMBB(ComputeLoop);
    // clang-format on

    // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
    auto Accumulator =
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
            .addMBB(&BB);
    auto ActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
            .addMBB(&BB);

+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
    // Perform the computations
    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
-
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
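+    // S_FF1 yields the index of the lowest remaining active lane; the
+    // V_READLANE reads below use it to fetch that lane's value.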
+    if (is32BitOpc) {
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValueReg);
+    } else {
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // lane value input should be in an sgpr
+      MachineInstr *LaneValueLo =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueLoReg)
+              .add(Op1L)
+              .addReg(FF1Reg);
+      MachineInstr *LaneValueHi =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueHiReg)
+              .add(Op1H)
+              .addReg(FF1Reg);
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLoReg)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHiReg)
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
+        const TargetRegisterClass *VregClass =
+            ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+                                   : &AMDGPU::VReg_64RegClass;
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
+
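+        // Both compare operands are wave-uniform, so the V_CMP lane mask is
+        // either all-ones or all-zeros over the active lanes. AND-ing it with
+        // the remaining-lane mask folds the result into SCC, and S_CSELECT
+        // then takes the new lane value when the comparison holds, otherwise
+        // keeps the old accumulator.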
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
+
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+    }
    // Manipulate the iterator to get the next active lane
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);

    // Add phi nodes
    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

    // Creating branching
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
        .addImm(0);
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
        .addMBB(ComputeLoop);
@@ -5310,12 +5413,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: