@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
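
Note (not part of the patch): the new 64-bit add/sub pseudos are grouped with the existing 32-bit cases because 0 (std::numeric_limits<uint32_t>::min()) is the identity value for add, sub, or, and xor reductions. A minimal standalone C++ check of that property, for illustration only:

// Standalone illustration; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  assert(X + 0 == X);   // add identity
  assert(X - 0 == X);   // sub identity
  assert((X | 0) == X); // or identity
  assert((X ^ 0) == X); // xor identity
  return 0;
}
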
@@ -5153,51 +5155,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
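
Note (not part of the patch): this hunk handles a uniform (SGPR) input, where the whole wave reduction collapses to scalar arithmetic on the active-lane count produced by S_BCNT1: add scales the value by the count, sub scales the negated value, and xor keeps the value only when the count is odd. A standalone C++ sketch of those equivalences, checked against a lane-by-lane reference (the helper name is illustrative only):

// Standalone illustration; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>

// Scalar shortcut used when every active lane holds the same 32-bit value.
static uint32_t reduceUniform(uint32_t Val, uint32_t Count, char Op) {
  switch (Op) {
  case '+':
    return Val * Count;        // S_MUL_I32 with the lane count
  case '-':
    return (0u - Val) * Count; // negate once (S_SUB_I32 0, x), then scale
  case '^':
    return Val * (Count & 1);  // only the parity of the count matters
  default:
    return 0;
  }
}

int main() {
  const uint32_t Val = 0xdeadbeefu;
  for (uint32_t Count = 0; Count <= 64; ++Count) {
    uint32_t Add = 0, Sub = 0, Xor = 0;
    for (uint32_t Lane = 0; Lane < Count; ++Lane) {
      Add += Val; // lane-by-lane reference reduction
      Sub -= Val;
      Xor ^= Val;
    }
    assert(Add == reduceUniform(Val, Count, '+'));
    assert(Sub == reduceUniform(Val, Count, '-'));
    assert(Xor == reduceUniform(Val, Count, '^'));
  }
  return 0;
}
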
@@ -5207,6 +5212,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
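
Note (not part of the patch): the new S_ADD_U64_PSEUDO/S_SUB_U64_PSEUDO uniform path builds Src * Count (or Src * -Count for sub, with the count negated and sign-extended) out of 32-bit pieces: S_MUL_I32 for the low product, S_MUL_HI_U32 for its carry into the high half, and S_ADD_U32 for the cross terms; the add path can drop the Op1L * CountHi term because the zero-extended lane count has no high half. A standalone C++ sketch of that decomposition (names are illustrative only):

// Standalone illustration; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>

// Low 64 bits of A * B assembled from 32-bit multiplies, following the same
// split as the DestSub0 / CarryReg / Op1H_Op0L_Reg / Op1L_Op0H_Reg sequence.
static uint64_t mul64From32(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo * BLo;                                // S_MUL_I32
  uint32_t Carry = uint32_t((uint64_t(ALo) * BLo) >> 32); // S_MUL_HI_U32
  uint32_t Hi = Carry + AHi * BLo + ALo * BHi;            // S_ADD_U32 chain
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t Src = 0x0123456789abcdefULL;
  for (uint32_t Count = 0; Count <= 64; ++Count) {
    // Add reduction of a uniform value: Src * Count, count zero-extended.
    assert(mul64From32(Src, Count) == Src * Count);
    // Sub reduction: Src * -Count, count negated then sign-extended to 64 bits.
    uint64_t NegCount = uint64_t(int64_t(-int32_t(Count)));
    assert(mul64From32(Src, NegCount) == Src * NegCount);
  }
  return 0;
}
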
@@ -5374,6 +5447,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                              .addReg(Accumulator->getOperand(0).getReg());
         break;
       }
+      case ::AMDGPU::S_ADD_U64_PSEUDO:
+      case ::AMDGPU::S_SUB_U64_PSEUDO: {
+        unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                           : AMDGPU::S_SUB_U32;
+        unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                           : AMDGPU::S_SUBB_U32;
+        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+            &AMDGPU::SReg_32RegClass);
+        MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+            &AMDGPU::SReg_32RegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+            .add(Accumlo)
+            .addReg(LaneValueLo->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+            .add(Accumhi)
+            .addReg(LaneValueHi->getOperand(0).getReg());
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                             .addReg(DestLo)
+                             .addImm(AMDGPU::sub0)
+                             .addReg(DestHi)
+                             .addImm(AMDGPU::sub1);
+        break;
+      }
       }
     }
     // Manipulate the iterator to get the next active lane
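
Note (not part of the patch): in the divergent path, each iteration of the lane loop folds one lane's 64-bit value into the accumulator with a 32-bit add or sub on the low half (newOpc1) and an add or sub with carry on the high half (newOpc2), the carry or borrow travelling through SCC. A standalone C++ sketch of that split (names are illustrative only):

// Standalone illustration; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>

// 64-bit add from 32-bit halves: S_ADD_U32 sets the carry, S_ADDC_U32 adds it.
static uint64_t addU64ViaU32(uint64_t Accum, uint64_t Lane) {
  uint32_t Lo = uint32_t(Accum) + uint32_t(Lane);
  uint32_t Carry = Lo < uint32_t(Accum); // models SCC after S_ADD_U32
  uint32_t Hi = uint32_t(Accum >> 32) + uint32_t(Lane >> 32) + Carry;
  return (uint64_t(Hi) << 32) | Lo;
}

// 64-bit sub from 32-bit halves: S_SUB_U32 sets the borrow, S_SUBB_U32 uses it.
static uint64_t subU64ViaU32(uint64_t Accum, uint64_t Lane) {
  uint32_t Lo = uint32_t(Accum) - uint32_t(Lane);
  uint32_t Borrow = uint32_t(Accum) < uint32_t(Lane); // models SCC after S_SUB_U32
  uint32_t Hi = uint32_t(Accum >> 32) - uint32_t(Lane >> 32) - Borrow;
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t A = 0xffffffff00000001ULL, B = 0x00000001ffffffffULL;
  assert(addU64ViaU32(A, B) == A + B);
  assert(subU64ViaU32(A, B) == A - B);
  assert(subU64ViaU32(B, A) == B - A);
  return 0;
}
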
@@ -5429,8 +5530,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: