Skip to content

Commit 94caceb

Browse files
committed
[ARM][LowOverheadLoops] Add checks for narrowing
Modify ValidateLiveOuts to track 'FalseLaneZeros' more precisely, including checks on specific operations that can generate non-zeros from zero values, e.g. VMVN. We can then check that any instructions that retain some information in their output register (all narrowing instructions) only use and def registers that always have zeros in their falsely predicated bytes, whether or not tail predication happens. Most of the logic remains the same; only the data structures and helpers have been renamed to reflect the change in logic. The key change, apart from the opcode checkers, is that the FalseLaneZeros set now strictly contains only instructions which will always generate zeros, and not instructions that could also have their false bytes masked away later. Differential Revision: https://reviews.llvm.org/D76235
1 parent 6f86e6b commit 94caceb

File tree

1 file changed

+77
-34
lines changed

1 file changed

+77
-34
lines changed

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 77 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ using namespace llvm;
6161

6262
namespace {
6363

64+
using InstSet = SmallPtrSetImpl<MachineInstr *>;
65+
6466
class PostOrderLoopTraversal {
6567
MachineLoop &ML;
6668
MachineLoopInfo &MLI;
@@ -518,6 +520,59 @@ static bool isRegInClass(const MachineOperand &MO,
518520
return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
519521
}
520522

523+
// Can this instruction generate a non-zero result when given only zeroed
524+
// operands? This allows us to know that, given operands with false bytes
525+
// zeroed by masked loads, the result will also contain zeros in those
526+
// bytes.
527+
static bool canGenerateNonZeros(const MachineInstr &MI) {
528+
switch (MI.getOpcode()) {
529+
default:
530+
break;
531+
// FIXME: FP minus 0?
532+
//case ARM::MVE_VNEGf16:
533+
//case ARM::MVE_VNEGf32:
534+
case ARM::MVE_VMVN:
535+
case ARM::MVE_VORN:
536+
case ARM::MVE_VCLZs8:
537+
case ARM::MVE_VCLZs16:
538+
case ARM::MVE_VCLZs32:
539+
return true;
540+
}
541+
return false;
542+
}
543+
544+
// MVE 'narrowing' instructions operate on half a lane, reading from half and writing
545+
// to half, which are referred to as the top and bottom half. The other
546+
// half retains its previous value.
547+
static bool retainsPreviousHalfElement(const MachineInstr &MI) {
548+
const MCInstrDesc &MCID = MI.getDesc();
549+
uint64_t Flags = MCID.TSFlags;
550+
return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
551+
}
552+
553+
// Look at its register uses to see if it can only receive zeros
554+
// into its false lanes which would then produce zeros. Also check that
555+
// the output register is also defined by a FalseLaneZeros instruction
556+
// so that if tail-predication happens, the lanes that aren't updated will
557+
// still be zeros.
558+
static bool producesFalseLaneZeros(MachineInstr &MI,
559+
const TargetRegisterClass *QPRs,
560+
const ReachingDefAnalysis &RDA,
561+
InstSet &FalseLaneZeros) {
562+
if (canGenerateNonZeros(MI))
563+
return false;
564+
for (auto &MO : MI.operands()) {
565+
if (!MO.isReg() || !MO.getReg())
566+
continue;
567+
if (auto *OpDef = RDA.getMIOperand(&MI, MO))
568+
if (FalseLaneZeros.count(OpDef))
569+
continue;
570+
return false;
571+
}
572+
LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
573+
return true;
574+
}
575+
521576
bool LowOverheadLoop::ValidateLiveOuts() const {
522577
// We want to find out if the tail-predicated version of this loop will
523578
// produce the same values as the loop in its original form. For this to
@@ -538,76 +593,64 @@ bool LowOverheadLoop::ValidateLiveOuts() const {
538593
// operands, or stored results are equivalent already. Other explicitly
539594
// predicated instructions will perform the same operation in the original
540595
// loop and the tail-predicated form too. Because of this, we can insert
541-
// loads, stores and other predicated instructions into our KnownFalseZeros
596+
// loads, stores and other predicated instructions into our Predicated
542597
// set and build from there.
543598
const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
544-
SetVector<MachineInstr *> UnknownFalseLanes;
545-
SmallPtrSet<MachineInstr *, 4> KnownFalseZeros;
599+
SetVector<MachineInstr *> Unknown;
600+
SmallPtrSet<MachineInstr *, 4> FalseLaneZeros;
601+
SmallPtrSet<MachineInstr *, 4> Predicated;
546602
MachineBasicBlock *MBB = ML.getHeader();
603+
547604
for (auto &MI : *MBB) {
548605
const MCInstrDesc &MCID = MI.getDesc();
549606
uint64_t Flags = MCID.TSFlags;
550607
if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
551608
continue;
552609

553610
if (isVectorPredicated(&MI)) {
554-
KnownFalseZeros.insert(&MI);
611+
if (MI.mayLoad())
612+
FalseLaneZeros.insert(&MI);
613+
Predicated.insert(&MI);
555614
continue;
556615
}
557616

558617
if (MI.getNumDefs() == 0)
559618
continue;
560619

561-
// Only evaluate instructions which produce a single value.
562-
assert((MI.getNumDefs() == 1 && MI.defs().begin()->isReg()) &&
563-
"Expected no more than one register def");
564-
565-
Register DefReg = MI.defs().begin()->getReg();
566-
for (auto &MO : MI.operands()) {
567-
if (!isRegInClass(MO, QPRs) || !MO.isUse() || MO.getReg() != DefReg)
568-
continue;
569-
570-
// If this instruction overwrites one of its operands, and that register
571-
// has known lanes, then this instruction also has known predicated false
572-
// lanes.
573-
if (auto *OpDef = RDA.getMIOperand(&MI, MO)) {
574-
if (KnownFalseZeros.count(OpDef)) {
575-
KnownFalseZeros.insert(&MI);
576-
break;
577-
}
578-
}
579-
}
580-
if (!KnownFalseZeros.count(&MI))
581-
UnknownFalseLanes.insert(&MI);
620+
if (producesFalseLaneZeros(MI, QPRs, RDA, FalseLaneZeros))
621+
FalseLaneZeros.insert(&MI);
622+
else if (retainsPreviousHalfElement(MI))
623+
return false;
624+
else
625+
Unknown.insert(&MI);
582626
}
583627

584-
auto HasKnownUsers = [this](MachineInstr *MI, const MachineOperand &MO,
585-
SmallPtrSetImpl<MachineInstr *> &Knowns) {
628+
auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
629+
SmallPtrSetImpl<MachineInstr *> &Predicated) {
586630
SmallPtrSet<MachineInstr *, 2> Uses;
587631
RDA.getGlobalUses(MI, MO.getReg(), Uses);
588632
for (auto *Use : Uses) {
589-
if (Use != MI && !Knowns.count(Use))
633+
if (Use != MI && !Predicated.count(Use))
590634
return false;
591635
}
592636
return true;
593637
};
594638

595-
// Now for all the unknown values, see if they're only consumed by known
596-
// instructions. Visit in reverse so that we can start at the values being
639+
// Visit the unknowns in reverse so that we can start at the values being
597640
// stored and then we can work towards the leaves, hopefully adding more
598-
// instructions to KnownFalseZeros.
599-
for (auto *MI : reverse(UnknownFalseLanes)) {
641+
// instructions to Predicated.
642+
for (auto *MI : reverse(Unknown)) {
600643
for (auto &MO : MI->operands()) {
601644
if (!isRegInClass(MO, QPRs) || !MO.isDef())
602645
continue;
603-
if (!HasKnownUsers(MI, MO, KnownFalseZeros)) {
646+
if (!HasPredicatedUsers(MI, MO, Predicated)) {
604647
LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
605648
<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
606649
return false;
607650
}
608651
}
609652
// Any unknown false lanes have been masked away by the user(s).
610-
KnownFalseZeros.insert(MI);
653+
Predicated.insert(MI);
611654
}
612655

613656
// Collect Q-regs that are live in the exit blocks. We don't collect scalars

0 commit comments

Comments
 (0)