Commit d4303b3

[AMDGPU] Fold AGPR reg_sequence initializers
Differential Revision: https://reviews.llvm.org/D69413
Parent: c9c18e5

File tree

2 files changed: +461, -60 lines

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 131 additions & 22 deletions
@@ -14,6 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -441,6 +442,42 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking them to a foldable inline immediate if possible.
+// Returns true on success.
+static bool getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
+    Register UseReg, uint8_t OpTy,
+    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
+  MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    MachineOperand *Sub = &Def->getOperand(I);
+    assert(Sub->isReg());
+
+    for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
+         SubDef && Sub->isReg() && !Sub->getSubReg() &&
+         TII->isFoldableCopy(*SubDef);
+         SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+      MachineOperand *Op = &SubDef->getOperand(1);
+      if (Op->isImm()) {
+        if (TII->isInlineConstant(*Op, OpTy))
+          Sub = Op;
+        break;
+      }
+      if (!Op->isReg())
+        break;
+      Sub = Op;
+    }
+
+    Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+  }
+
+  return true;
+}
+
 static bool tryToFoldACImm(const SIInstrInfo *TII,
                            const MachineOperand &OpToFold,
                            MachineInstr *UseMI,
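For intuition, here is a small standalone sketch of the copy-chasing idea that getRegSeqInit applies to each reg_sequence input: follow foldable copies back through their defs and stop when the chain ends in an inline-constant immediate (otherwise the caller keeps the register operand). This is not the LLVM code; the Instr/DefMap types and the isInlineConstant range below are invented stand-ins for the real MachineRegisterInfo, MachineOperand, and TII queries.

// Standalone sketch (illustration only): chase a chain of copies back to an
// immediate, the way getRegSeqInit walks the defs of each reg_sequence input.
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>

struct Instr {
  bool IsCopy = false;        // models TII->isFoldableCopy(SubDef)
  std::optional<int64_t> Imm; // set if the source operand is an immediate
  int SrcReg = -1;            // set if the source operand is a register
};

// One unique def per virtual register, like MRI.getUniqueVRegDef().
using DefMap = std::map<int, Instr>;

// Stand-in for TII->isInlineConstant(): here, "inline" just means small.
static bool isInlineConstant(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

// Follow foldable copies starting at Reg. Return the immediate if the chain
// ends in an inline-constant move; otherwise the caller keeps the register.
static std::optional<int64_t> chaseToImm(int Reg, const DefMap &Defs) {
  auto It = Defs.find(Reg);
  while (It != Defs.end() && It->second.IsCopy) {
    const Instr &Def = It->second;
    if (Def.Imm)
      return isInlineConstant(*Def.Imm) ? Def.Imm : std::nullopt;
    if (Def.SrcReg < 0)
      break;
    It = Defs.find(Def.SrcReg);
  }
  return std::nullopt;
}

int main() {
  // %1 = mov 42 ; %2 = copy %1 ; %3 = copy %2  => %3 folds to the immediate 42.
  DefMap Defs;
  Defs[1] = Instr{true, 42, -1};
  Defs[2] = Instr{true, std::nullopt, 1};
  Defs[3] = Instr{true, std::nullopt, 2};
  if (auto Imm = chaseToImm(3, Defs))
    std::cout << "folds to immediate " << *Imm << "\n";
  else
    std::cout << "keeps the register operand\n";
}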
@@ -474,39 +511,30 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
     return false;
 
   MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
-  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
+  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
     return false;
 
-  int64_t Imm;
-  MachineOperand *Op;
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
-    const MachineOperand &Sub = Def->getOperand(I);
-    if (!Sub.isReg() || Sub.getSubReg())
+  int32_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
      return false;
-    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
-    while (SubDef && !SubDef->isMoveImmediate() &&
-           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
-      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
-    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
-      return false;
-    Op = &SubDef->getOperand(1);
+
     auto SubImm = Op->getImm();
-    if (I == 1) {
-      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+    if (!I) {
+      Imm = SubImm;
+      if (!TII->isInlineConstant(*Op, OpTy) ||
+          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;
 
-      Imm = SubImm;
      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }
 
-  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-    return false;
-
-  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
   return true;
 }
 
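The loop above only accepts a reg_sequence whose initializers all resolved to the same inline immediate. A minimal standalone sketch of that "splat constants only" rule follows; it is an illustration, not the LLVM code, it omits the operand-legality check, and its isInlineConstant() is an invented stand-in for the TII query.

// Standalone sketch (illustration only) of the splat rule in tryToFoldACImm:
// the reg_sequence folds to a single immediate only if every lane resolved to
// the same inline constant.
#include <cstdint>
#include <optional>
#include <vector>

static bool isInlineConstant(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

// Each lane is either a resolved immediate or still a register (nullopt).
static std::optional<int64_t>
foldToSplatImm(const std::vector<std::optional<int64_t>> &Lanes) {
  std::optional<int64_t> Splat;
  for (const auto &Lane : Lanes) {
    if (!Lane)
      return std::nullopt;           // lane is not an immediate
    if (!Splat) {
      if (!isInlineConstant(*Lane))
        return std::nullopt;         // first lane must be an inline constant
      Splat = Lane;
      continue;
    }
    if (*Lane != *Splat)
      return std::nullopt;           // can only fold splat constants
  }
  return Splat;
}

int main() {
  std::vector<std::optional<int64_t>> Splat = {1, 1, 1, 1};
  std::vector<std::optional<int64_t>> Mixed = {1, 2, 1, 1};
  return (foldToSplatImm(Splat) && !foldToSplatImm(Mixed)) ? 0 : 1;
}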
@@ -645,11 +673,92 @@ void SIFoldOperands::foldOperand(
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold
                       << "\n into " << *UseMI << '\n');
     unsigned Size = TII->getOpSize(*UseMI, 1);
-    UseMI->getOperand(1).setReg(OpToFold.getReg());
+    Register UseReg = OpToFold.getReg();
+    UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
     UseMI->getOperand(1).setIsKill(false);
     CopiesToReplace.push_back(UseMI);
     OpToFold.setIsKill(false);
+
+    // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
+    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
+    // its initializers right here, so we will rematerialize immediates and
+    // avoid copies via different reg classes.
+    SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
+                      *MRI)) {
+      const DebugLoc &DL = UseMI->getDebugLoc();
+      MachineBasicBlock &MBB = *UseMI->getParent();
+
+      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
+        UseMI->RemoveOperand(I);
+
+      MachineInstrBuilder B(*MBB.getParent(), UseMI);
+      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+      for (unsigned I = 0; I < Size / 4; ++I) {
+        MachineOperand *Def = Defs[I].first;
+        TargetInstrInfo::RegSubRegPair CopyToVGPR;
+        if (Def->isImm() &&
+            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+          int64_t Imm = Def->getImm();
+
+          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+          BuildMI(MBB, UseMI, DL,
+                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+          B.addReg(Tmp);
+        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+          auto Src = getRegSubRegPair(*Def);
+          Def->setIsKill(false);
+          if (!SeenAGPRs.insert(Src)) {
+            // We cannot build a reg_sequence out of the same registers, they
+            // must be copied. Better do it here before copyPhysReg() creates
+            // several reads to do the AGPR->VGPR->AGPR copy.
+            CopyToVGPR = Src;
+          } else {
+            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
+                     Src.SubReg);
+          }
+        } else {
+          assert(Def->isReg());
+          Def->setIsKill(false);
+          auto Src = getRegSubRegPair(*Def);
+
+          // Direct copy from SGPR to AGPR is not possible. To avoid creation
+          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+          // create a copy here and track if we already have such a copy.
+          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+            CopyToVGPR = Src;
+          } else {
+            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+            B.addReg(Tmp);
+          }
+        }
+
+        if (CopyToVGPR.Reg) {
+          Register Vgpr;
+          if (VGPRCopies.count(CopyToVGPR)) {
+            Vgpr = VGPRCopies[CopyToVGPR];
+          } else {
+            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+            VGPRCopies[CopyToVGPR] = Vgpr;
+          }
+          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+          BuildMI(MBB, UseMI, DL,
+                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
+          B.addReg(Tmp);
+        }
+
+        B.addImm(Defs[I].second);
+      }
+      LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+      return;
+    }
+
     if (Size != 4)
       return;
     if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
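The rebuild loop above makes one decision per 32-bit lane: an inline immediate is written directly with v_accvgpr_write_b32, an AGPR source can feed the new reg_sequence directly the first time it is seen, and repeated AGPR or SGPR sources are routed through a single cached VGPR copy. The following standalone sketch models only that decision and the VGPRCopies cache; every type and name in it is an invented stand-in, it assumes immediates handed back by getRegSeqInit are already inline constants, and it is not the LLVM implementation.

// Standalone sketch (illustration only) of the per-lane decision when the
// reg_sequence feeding an AGPR copy is rebuilt.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <variant>
#include <vector>

enum class RegClass { SGPR, VGPR, AGPR };
struct Reg { int Id; RegClass RC; };
static bool operator<(const Reg &A, const Reg &B) { return A.Id < B.Id; }

using Init = std::variant<int64_t, Reg>; // immediate or register initializer

static void rebuildRegSequence(const std::vector<Init> &Inits) {
  std::map<Reg, int> VGPRCopies; // models the VGPRCopies DenseMap
  std::set<Reg> SeenAGPRs;       // models the SeenAGPRs SmallSetVector
  int NextReg = 100;             // fake virtual register numbering

  for (const Init &I : Inits) {
    if (const int64_t *Imm = std::get_if<int64_t>(&I)) {
      // Immediates coming from getRegSeqInit are already inline constants.
      std::cout << "v_accvgpr_write a" << NextReg++ << ", " << *Imm << "\n";
      continue;
    }
    const Reg Src = std::get<Reg>(I);
    if (Src.RC == RegClass::AGPR && SeenAGPRs.insert(Src).second) {
      std::cout << "use a" << Src.Id << " directly\n";
      continue;
    }
    // Repeated AGPR or SGPR source: copy it to a VGPR once, then write AGPRs.
    auto [It, Inserted] = VGPRCopies.try_emplace(Src, 0);
    if (Inserted) {
      It->second = NextReg++;
      std::cout << "copy v" << It->second << ", reg" << Src.Id << "\n";
    }
    std::cout << "v_accvgpr_write a" << NextReg++ << ", v" << It->second << "\n";
  }
}

int main() {
  Reg A0{0, RegClass::AGPR}, S4{4, RegClass::SGPR};
  // One inline immediate, the same AGPR twice, and an SGPR initializer.
  rebuildRegSequence({Init(int64_t(1)), Init(A0), Init(A0), Init(S4)});
}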
