@@ -5111,9 +5111,12 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::S_XOR_B64:
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64:
     return std::numeric_limits<uint32_t>::max();
   default:
     llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
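A note on the identity values above (not part of the diff): OR, XOR, ADD, and SUB reductions are seeded with 0, while AND is seeded with all bits set, since those are the values that leave the respective operation unchanged. A minimal standalone sketch of the bitwise identities, with hypothetical names:

```cpp
#include <cstdint>

// Bitwise reduction identities: 0 for OR/XOR, all-ones for AND. These are the
// conceptual counterparts of the min()/max() seeds returned above; `x` stands
// for an arbitrary lane value.
constexpr bool bitwiseIdentitiesHold(uint64_t x) {
  return (x | 0u) == x &&          // OR  identity: 0
         (x ^ 0u) == x &&          // XOR identity: 0
         (x & ~uint64_t(0)) == x;  // AND identity: ~0
}
static_assert(bitwiseIdentitiesHold(0xDEADBEEFCAFEBABEull),
              "identity values leave a lane value unchanged");
```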
@@ -5146,14 +5149,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     case AMDGPU::S_MAX_I32:
     case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32: {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations.
       unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_I32:
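The idempotent branch in the hunk above handles a uniform (SGPR) input: combining the same value across every active lane with AND, OR, MIN, or MAX returns that value, so the lowering is a single S_MOV_B32/S_MOV_B64. A conceptual model of why the copy suffices (plain C++ with hypothetical names, not the patch's code):

```cpp
#include <cstdint>

// Model of the idempotent case: OR-reducing N copies of one uniform value
// yields the value itself for any N >= 1, which is why a plain move suffices.
constexpr uint64_t reduceUniformOr(uint64_t uniform, unsigned activeLanes) {
  uint64_t acc = 0;                  // OR identity
  for (unsigned i = 0; i < activeLanes; ++i)
    acc |= uniform;                  // every lane contributes the same value
  return acc;
}
static_assert(reduceUniformOr(0x123456789ABCDEF0ull, 7) ==
                  0x123456789ABCDEF0ull,
              "OR over a uniform value is the value itself");
```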
@@ -5177,7 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                 .addReg(ExecMask);
 
       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
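The comment in the hunk above is the key observation for the XOR case: XOR-reducing N copies of a uniform value gives the value for odd N and 0 for even N, so only the low bit (the parity) of the active-lane count matters, which is why the lowering masks that count with 1 and multiplies. A small illustration (hypothetical names, not from the patch):

```cpp
#include <cstdint>

// XOR over a uniform value depends only on the parity of the lane count:
// odd -> the value, even -> 0.
constexpr uint64_t xorUniform(uint64_t value, unsigned activeLanes) {
  uint64_t acc = 0;
  for (unsigned i = 0; i < activeLanes; ++i)
    acc ^= value;
  return acc;                        // == (activeLanes & 1) ? value : 0
}
static_assert(xorUniform(0xABCDull, 5) == 0xABCDull, "odd lane count");
static_assert(xorUniform(0xABCDull, 4) == 0, "even lane count");
```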
@@ -5189,10 +5196,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1)
                 .setOperandDead(3); // Dead scc
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityRegister);
-        break;
+        if (is32BitOpc) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+          break;
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register Op1H_Op0L_Reg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register CarryReg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(CarryReg)
+              .addReg(Op1H_Op0L_Reg)
+              .setOperandDead(3); // Dead scc
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+          break;
+        }
       }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
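For the 64-bit XOR path added in the hunk above, the result is the 64-bit source multiplied by the 0/1 parity, built from 32-bit scalar ops: the low half is lo(Src)*p, and the high half is hi(Src)*p plus the high 32 bits of lo(Src)*p. A standalone sketch of that arithmetic (illustration only; mulU64ByParity is a hypothetical helper, while the commented opcode and register names refer back to the diff):

```cpp
#include <cstdint>

// Mirrors the scalarized multiply emitted above:
//   DestSub0  = S_MUL_I32   (lo(Src), parity)
//   Op1H_Op0L = S_MUL_I32   (hi(Src), parity)
//   Carry     = S_MUL_HI_U32(lo(Src), parity)
//   DestSub1  = S_ADD_U32   (Carry, Op1H_Op0L)
//   Dst       = REG_SEQUENCE(DestSub0, DestSub1)
constexpr uint64_t mulU64ByParity(uint64_t src, uint32_t parity) {
  uint32_t lo = static_cast<uint32_t>(src);
  uint32_t hi = static_cast<uint32_t>(src >> 32);
  uint32_t destSub0 = lo * parity;                                       // S_MUL_I32
  uint32_t carry = static_cast<uint32_t>((uint64_t(lo) * parity) >> 32); // S_MUL_HI_U32
  uint32_t destSub1 = hi * parity + carry;                               // S_ADD_U32
  return (uint64_t(destSub1) << 32) | destSub0;                          // REG_SEQUENCE
}
static_assert(mulU64ByParity(0xFFFFFFFF00000001ull, 1) ==
                  0xFFFFFFFF00000001ull,
              "odd parity keeps the value");
static_assert(mulU64ByParity(0xFFFFFFFF00000001ull, 0) == 0,
              "even parity yields zero");
```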
@@ -5407,6 +5458,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                            .addReg(LaneValueHiReg)
                            .addImm(AMDGPU::sub1);
       switch (Opc) {
+      case ::AMDGPU::S_OR_B64:
+      case ::AMDGPU::S_AND_B64:
+      case ::AMDGPU::S_XOR_B64: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
       case AMDGPU::V_CMP_GT_I64_e64:
       case AMDGPU::V_CMP_GT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
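The new B64 cases in the hunk above extend the iterative (VGPR) path: each trip through ComputeLoop reads one active lane's 64-bit value and folds it into the accumulator with the scalar opcode. A conceptual model of that accumulation (plain C++ with hypothetical names, not the emitted MIR; lane extraction and EXEC-mask bookkeeping are elided):

```cpp
#include <cstdint>
#include <vector>

// Model of the iterative reduction for the bitwise ops: start from the
// identity and fold in one lane value per loop iteration, matching the
// S_OR_B64 / S_AND_B64 / S_XOR_B64 accumulation added above (shown for XOR).
uint64_t iterativeReduceXor(const std::vector<uint64_t> &laneValues) {
  uint64_t accumulator = 0;               // identity for XOR
  for (uint64_t laneValue : laneValues)   // one ComputeLoop iteration per lane
    accumulator ^= laneValue;             // S_XOR_B64 Accumulator, LaneValue
  return accumulator;
}
```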
@@ -5538,10 +5598,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();