Skip to content

Commit 36c76de

Browse files
[AArch64][SVE] Add a pass for SVE intrinsic optimisations
Summary: Creates the SVEIntrinsicOpts pass. In this patch, the pass tries to remove unnecessary reinterpret intrinsics which convert to and from svbool_t (llvm.aarch64.sve.convert.[to|from].svbool). For example, the reinterprets below are redundant: %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a) %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) The pass also looks for ptest intrinsics and phi instructions where the operands are being needlessly converted to and from svbool_t. Reviewers: sdesmalen, andwar, efriedma, cameron.mcinally, c-rhodes, rengolin Reviewed By: efriedma Subscribers: mgorny, tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76078
1 parent 31c8e11 commit 36c76de

File tree

7 files changed

+564
-0
lines changed

7 files changed

+564
-0
lines changed

llvm/lib/Target/AArch64/AArch64.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ FunctionPass *createAArch64BranchTargetsPass();
5252
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
5353

5454
FunctionPass *createAArch64CollectLOHPass();
55+
ModulePass *createSVEIntrinsicOptsPass();
5556
InstructionSelector *
5657
createAArch64InstructionSelector(const AArch64TargetMachine &,
5758
AArch64Subtarget &, AArch64RegisterBankInfo &);
@@ -80,6 +81,7 @@ void initializeAArch64StorePairSuppressPass(PassRegistry&);
8081
void initializeFalkorHWPFFixPass(PassRegistry&);
8182
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
8283
void initializeLDTLSCleanupPass(PassRegistry&);
84+
void initializeSVEIntrinsicOptsPass(PassRegistry&);
8385
void initializeAArch64StackTaggingPass(PassRegistry&);
8486
void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
8587
} // end namespace llvm

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ static cl::opt<int> EnableGlobalISelAtO(
146146
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
147147
cl::init(0));
148148

149+
// Hidden command-line switch, on by default, that AArch64PassConfig::addIRPasses()
// checks before scheduling the SVE intrinsic optimisation pass.
static cl::opt<bool> EnableSVEIntrinsicOpts(
150+
"aarch64-sve-intrinsic-opts", cl::Hidden,
151+
cl::desc("Enable SVE intrinsic opts"),
152+
cl::init(true));
153+
149154
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
150155
cl::init(true), cl::Hidden);
151156

@@ -182,6 +187,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
182187
initializeFalkorHWPFFixPass(*PR);
183188
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
184189
initializeLDTLSCleanupPass(*PR);
190+
initializeSVEIntrinsicOptsPass(*PR);
185191
initializeAArch64SpeculationHardeningPass(*PR);
186192
initializeAArch64StackTaggingPass(*PR);
187193
initializeAArch64StackTaggingPreRAPass(*PR);
@@ -434,6 +440,10 @@ void AArch64PassConfig::addIRPasses() {
434440
// ourselves.
435441
addPass(createAtomicExpandPass());
436442

443+
// At -O3, remove redundant SVE reinterpret intrinsics (conversions to and from svbool_t).
444+
if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
445+
addPass(createSVEIntrinsicOptsPass());
446+
437447
// Cmpxchg instructions are often used with a subsequent comparison to
438448
// determine whether it succeeded. We can exploit existing control-flow in
439449
// ldrex/strex loops to simplify this, but it needs tidying up.

llvm/lib/Target/AArch64/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ add_llvm_target(AArch64CodeGen
6464
AArch64TargetMachine.cpp
6565
AArch64TargetObjectFile.cpp
6666
AArch64TargetTransformInfo.cpp
67+
SVEIntrinsicOpts.cpp
6768
AArch64SIMDInstrOpt.cpp
6869

6970
DEPENDS
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// Performs general IR level optimizations on SVE intrinsics.
11+
//
12+
// The main goal of this pass is to remove unnecessary reinterpret
13+
// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
14+
//
15+
// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
16+
// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
17+
//
18+
// This pass also looks for ptest intrinsics & phi instructions where the
19+
// operands are being needlessly converted to and from svbool_t.
20+
//
21+
//===----------------------------------------------------------------------===//
22+
23+
#include "Utils/AArch64BaseInfo.h"
24+
#include "llvm/ADT/PostOrderIterator.h"
25+
#include "llvm/ADT/SetVector.h"
26+
#include "llvm/IR/Constants.h"
27+
#include "llvm/IR/Dominators.h"
28+
#include "llvm/IR/IRBuilder.h"
29+
#include "llvm/IR/Instructions.h"
30+
#include "llvm/IR/IntrinsicInst.h"
31+
#include "llvm/IR/IntrinsicsAArch64.h"
32+
#include "llvm/IR/LLVMContext.h"
33+
#include "llvm/IR/PatternMatch.h"
34+
#include "llvm/InitializePasses.h"
35+
#include "llvm/Support/Debug.h"
36+
37+
using namespace llvm;
38+
using namespace llvm::PatternMatch;
39+
40+
#define DEBUG_TYPE "sve-intrinsic-opts"
41+
42+
// Forward declaration of the pass-registration hook; the SVEIntrinsicOpts
// constructor further down calls it with the global PassRegistry.
namespace llvm {
43+
void initializeSVEIntrinsicOptsPass(PassRegistry &);
44+
}
45+
46+
namespace {
47+
struct SVEIntrinsicOpts : public ModulePass {
48+
static char ID; // Pass identification, replacement for typeid
49+
SVEIntrinsicOpts() : ModulePass(ID) {
50+
initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
51+
}
52+
53+
bool runOnModule(Module &M) override;
54+
void getAnalysisUsage(AnalysisUsage &AU) const override;
55+
56+
private:
57+
static IntrinsicInst *isReinterpretFromSVBool(Value *V);
58+
static IntrinsicInst *isReinterpretToSVBool(Value *V);
59+
60+
static bool optimizeIntrinsic(Instruction *I);
61+
62+
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
63+
64+
static bool optimizeConvertFromSVBool(IntrinsicInst *I);
65+
static bool optimizePTest(IntrinsicInst *I);
66+
67+
static bool processPhiNode(IntrinsicInst *I);
68+
};
69+
} // end anonymous namespace
70+
71+
/// Declares the pass's analysis contract: it preserves the CFG and requires
/// DominatorTreeWrapperPass.
void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addRequired<DominatorTreeWrapperPass>();
}
75+
76+
// Storage for the pass identification member declared in SVEIntrinsicOpts.
char SVEIntrinsicOpts::ID = 0;
77+
// Human-readable pass name handed to the registration macros below.
static const char *name = "SVE intrinsics optimizations";
78+
// Register the pass under DEBUG_TYPE and list DominatorTreeWrapperPass as a
// dependency. NOTE(review): the two trailing `false` arguments are presumably
// the "CFG only" and "is analysis" flags -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
79+
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
80+
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
81+
82+
/// Factory for the pass: returns a newly allocated SVEIntrinsicOpts.
/// NOTE(review): ownership of the returned object is not visible here --
/// presumably the pass manager that receives it takes ownership; confirm.
namespace llvm {
83+
ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
84+
} // namespace llvm
85+
86+
/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr
87+
/// otherwise.
88+
IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
89+
IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
90+
if (!I)
91+
return nullptr;
92+
93+
if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
94+
return nullptr;
95+
96+
return I;
97+
}
98+
99+
/// Returns V if it's a cast to <n x 16 x i1> (aka svbool_t), nullptr otherwise.
100+
IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
101+
IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
102+
if (!I)
103+
return nullptr;
104+
105+
if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
106+
return nullptr;
107+
108+
return I;
109+
}
110+
111+
/// The function will remove redundant reinterprets casting in the presence
112+
/// of the control flow
113+
bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
114+
115+
SmallVector<Instruction *, 32> Worklist;
116+
auto RequiredType = X->getType();
117+
118+
auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
119+
assert(PN && "Expected Phi Node!");
120+
121+
// Don't create a new Phi unless we can remove the old one.
122+
if (!PN->hasOneUse())
123+
return false;
124+
125+
for (Value *IncValPhi : PN->incoming_values()) {
126+
auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
127+
if (!Reinterpret ||
128+
RequiredType != Reinterpret->getArgOperand(0)->getType())
129+
return false;
130+
}
131+
132+
// Create the new Phi
133+
LLVMContext &Ctx = PN->getContext();
134+
IRBuilder<> Builder(Ctx);
135+
Builder.SetInsertPoint(PN);
136+
PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
137+
Worklist.push_back(PN);
138+
139+
for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
140+
auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
141+
NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
142+
Worklist.push_back(Reinterpret);
143+
}
144+
145+
// Cleanup Phi Node and reinterprets
146+
X->replaceAllUsesWith(NPN);
147+
X->eraseFromParent();
148+
149+
for (auto &I : Worklist)
150+
if (I->use_empty())
151+
I->eraseFromParent();
152+
153+
return true;
154+
}
155+
156+
/// Rewrites a ptest intrinsic whose two operands are both
/// convert.to.svbool calls from the same source type into the same ptest
/// applied directly to the unconverted operands.
///
/// The original call is replaced and erased; each conversion is erased too if
/// it is left without users.
///
/// \returns true if the IR was changed, false otherwise.
bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));

  if (!Op1 || !Op2 ||
      Op1->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool ||
      Op2->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool ||
      Op1->getArgOperand(0)->getType() != Op2->getArgOperand(0)->getType())
    return false;

  Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
  Type *Tys[] = {Op1->getArgOperand(0)->getType()};
  Module *M = I->getParent()->getParent()->getParent();

  // Same ptest flavour, re-declared for the narrower predicate type.
  auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
  auto CI = CallInst::Create(Fn, Ops, I->getName(), I);

  I->replaceAllUsesWith(CI);
  I->eraseFromParent();
  if (Op1->use_empty())
    Op1->eraseFromParent();
  // Both operands may be the very same conversion (e.g. ptest(pg, pg)); in
  // that case it has just been handled above and must not be touched again.
  if (Op1 != Op2 && Op2->use_empty())
    Op2->eraseFromParent();

  return true;
}
184+
185+
/// Tries to remove a convert.from.svbool call \p I.
///
/// A PHI operand is delegated to processPhiNode. Otherwise, if the operand is
/// a convert.to.svbool whose source has the same type as \p I, all uses of
/// \p I are redirected to that source and \p I is erased; the inner conversion
/// is erased as well if that leaves it without users.
///
/// \returns true if the IR was changed, false otherwise.
bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
  assert(isReinterpretFromSVBool(I));

  Value *Operand = I->getArgOperand(0);

  // Reinterprets fed through control flow are handled separately.
  if (isa<PHINode>(Operand))
    return processPhiNode(I);

  IntrinsicInst *ToSVBool = isReinterpretToSVBool(Operand);
  if (!ToSVBool)
    return false;

  // The round trip only cancels out when it ends at the type it started from.
  Value *Original = ToSVBool->getArgOperand(0);
  if (Original->getType() != I->getType())
    return false;

  I->replaceAllUsesWith(Original);
  I->eraseFromParent();
  if (ToSVBool->use_empty())
    ToSVBool->eraseFromParent();

  return true;
}
210+
211+
/// Dispatches \p I to the optimisation that handles its intrinsic, if any.
///
/// Only convert.from.svbool and the ptest.any/first/last intrinsics are
/// handled; every other instruction is left alone.
///
/// \returns true if the IR was changed, false otherwise.
bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
  auto *IntrI = dyn_cast<IntrinsicInst>(I);
  if (!IntrI)
    return false;

  // Every case returns, so nothing follows the switch.
  switch (IntrI->getIntrinsicID()) {
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return optimizeConvertFromSVBool(IntrI);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return optimizePTest(IntrI);
  default:
    return false;
  }
}
229+
230+
/// Applies optimizeIntrinsic to every instruction reachable from the dominator
/// tree root of each function in \p Functions.
///
/// \returns true if any instruction was changed.
bool SVEIntrinsicOpts::optimizeFunctions(
    SmallSetVector<Function *, 4> &Functions) {
  bool MadeChange = false;

  for (Function *Fn : Functions) {
    DominatorTree &DomTree =
        getAnalysis<DominatorTreeWrapperPass>(*Fn).getDomTree();

    // Walk blocks in reverse post-order from the root so defs are seen before
    // uses, allowing simplification to be done incrementally.
    ReversePostOrderTraversal<BasicBlock *> Order(DomTree.getRoot());
    for (BasicBlock *Block : Order) {
      // Early-increment iteration: the current instruction may be erased.
      for (Instruction &Inst : make_early_inc_range(*Block)) {
        if (optimizeIntrinsic(&Inst))
          MadeChange = true;
      }
    }
  }

  return MadeChange;
}
246+
247+
/// Pass entry point: collects the functions that call one of the handled SVE
/// intrinsics and runs the optimisations on those functions only.
///
/// \returns true if the module was changed.
bool SVEIntrinsicOpts::runOnModule(Module &M) {
  bool Changed = false;
  SmallSetVector<Function *, 4> Functions;

  // Check for SVE intrinsic declarations first so that we only iterate over
  // relevant functions. Where an appropriate declaration is found, store the
  // function(s) where it is used so we can target these only.
  for (auto &F : M.getFunctionList()) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::aarch64_sve_convert_from_svbool:
    case Intrinsic::aarch64_sve_ptest_any:
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      // Only instruction users live inside a function; skip anything else
      // instead of dereferencing a failed dyn_cast.
      for (User *U : F.users())
        if (auto *Inst = dyn_cast<Instruction>(U))
          Functions.insert(Inst->getFunction());
      break;
    default:
      break;
    }
  }

  if (!Functions.empty())
    Changed |= optimizeFunctions(Functions);

  return Changed;
}

llvm/test/CodeGen/AArch64/O3-pipeline.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
1919
; CHECK-NEXT: FunctionPass Manager
2020
; CHECK-NEXT: Expand Atomic instructions
21+
; CHECK-NEXT: SVE intrinsics optimizations
22+
; CHECK-NEXT: FunctionPass Manager
23+
; CHECK-NEXT: Dominator Tree Construction
24+
; CHECK-NEXT: FunctionPass Manager
2125
; CHECK-NEXT: Simplify the CFG
2226
; CHECK-NEXT: Dominator Tree Construction
2327
; CHECK-NEXT: Natural Loop Information

0 commit comments

Comments
 (0)