Skip to content

Commit f8d344f

Browse files
committed
[ARM][ParallelDSP] Convert to function pass
Run across a whole function, visiting each basic block one at a time. Differential Revision: https://reviews.llvm.org/D65324 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367389 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent f8a2bef commit f8d344f

File tree

4 files changed

+126
-76
lines changed

4 files changed

+126
-76
lines changed

lib/Target/ARM/ARMParallelDSP.cpp

Lines changed: 45 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
1+
//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -18,13 +18,10 @@
1818
#include "llvm/ADT/SmallPtrSet.h"
1919
#include "llvm/Analysis/AliasAnalysis.h"
2020
#include "llvm/Analysis/LoopAccessAnalysis.h"
21-
#include "llvm/Analysis/LoopPass.h"
22-
#include "llvm/Analysis/LoopInfo.h"
2321
#include "llvm/IR/Instructions.h"
2422
#include "llvm/IR/NoFolder.h"
2523
#include "llvm/Transforms/Scalar.h"
2624
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
27-
#include "llvm/Transforms/Utils/LoopUtils.h"
2825
#include "llvm/Pass.h"
2926
#include "llvm/PassRegistry.h"
3027
#include "llvm/PassSupport.h"
@@ -156,13 +153,11 @@ namespace {
156153
}
157154
};
158155

159-
class ARMParallelDSP : public LoopPass {
156+
class ARMParallelDSP : public FunctionPass {
160157
ScalarEvolution *SE;
161158
AliasAnalysis *AA;
162159
TargetLibraryInfo *TLI;
163160
DominatorTree *DT;
164-
LoopInfo *LI;
165-
Loop *L;
166161
const DataLayout *DL;
167162
Module *M;
168163
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -184,63 +179,38 @@ namespace {
184179
/// products to a 32-bit accumulate operand. Optionally, the instruction can
185180
/// exchange the halfwords of the second operand before performing the
186181
/// arithmetic.
187-
bool MatchSMLAD(Loop *L);
182+
bool MatchSMLAD(Function &F);
188183

189184
public:
190185
static char ID;
191186

192-
ARMParallelDSP() : LoopPass(ID) { }
193-
194-
bool doInitialization(Loop *L, LPPassManager &LPM) override {
195-
LoadPairs.clear();
196-
WideLoads.clear();
197-
return true;
198-
}
187+
ARMParallelDSP() : FunctionPass(ID) { }
199188

200189
void getAnalysisUsage(AnalysisUsage &AU) const override {
201-
LoopPass::getAnalysisUsage(AU);
190+
FunctionPass::getAnalysisUsage(AU);
202191
AU.addRequired<AssumptionCacheTracker>();
203192
AU.addRequired<ScalarEvolutionWrapperPass>();
204193
AU.addRequired<AAResultsWrapperPass>();
205194
AU.addRequired<TargetLibraryInfoWrapperPass>();
206-
AU.addRequired<LoopInfoWrapperPass>();
207195
AU.addRequired<DominatorTreeWrapperPass>();
208196
AU.addRequired<TargetPassConfig>();
209-
AU.addPreserved<LoopInfoWrapperPass>();
197+
AU.addPreserved<ScalarEvolutionWrapperPass>();
198+
AU.addPreserved<GlobalsAAWrapperPass>();
210199
AU.setPreservesCFG();
211200
}
212201

213-
bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
202+
bool runOnFunction(Function &F) override {
214203
if (DisableParallelDSP)
215204
return false;
216-
if (skipLoop(TheLoop))
205+
if (skipFunction(F))
217206
return false;
218207

219-
L = TheLoop;
220208
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
221209
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
222210
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
223211
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
224-
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
225212
auto &TPC = getAnalysis<TargetPassConfig>();
226213

227-
BasicBlock *Header = TheLoop->getHeader();
228-
if (!Header)
229-
return false;
230-
231-
// TODO: We assume the loop header and latch to be the same block.
232-
// This is not a fundamental restriction, but lifting this would just
233-
// require more work to do the transformation and then patch up the CFG.
234-
if (Header != TheLoop->getLoopLatch()) {
235-
LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
236-
"running pass ARMParallelDSP\n");
237-
return false;
238-
}
239-
240-
if (!TheLoop->getLoopPreheader())
241-
InsertPreheaderForLoop(L, DT, LI, nullptr, true);
242-
243-
Function &F = *Header->getParent();
244214
M = F.getParent();
245215
DL = &M->getDataLayout();
246216

@@ -265,17 +235,10 @@ namespace {
265235
return false;
266236
}
267237

268-
LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
269-
270238
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
271239
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
272240

273-
if (!RecordMemoryOps(Header)) {
274-
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
275-
return false;
276-
}
277-
278-
bool Changes = MatchSMLAD(L);
241+
bool Changes = MatchSMLAD(F);
279242
return Changes;
280243
}
281244
};
@@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
337300
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
338301
SmallVector<LoadInst*, 8> Loads;
339302
SmallVector<Instruction*, 8> Writes;
303+
LoadPairs.clear();
304+
WideLoads.clear();
340305

341306
// Collect loads and instruction that may write to memory. For now we only
342307
// record loads which are simple, sign-extended and have a single user.
@@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
415380
return LoadPairs.size() > 1;
416381
}
417382

418-
// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
383+
// The pass needs to identify integer add/sub reductions of 16-bit vector
419384
// multiplications.
420385
// To use SMLAD:
421386
// 1) we first need to find integer add then look for this pattern:
@@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
446411
// If loop invariants are used instead of loads, these need to be packed
447412
// before the loop begins.
448413
//
449-
bool ARMParallelDSP::MatchSMLAD(Loop *L) {
414+
bool ARMParallelDSP::MatchSMLAD(Function &F) {
450415
// Search recursively back through the operands to find a tree of values that
451416
// form a multiply-accumulate chain. The search records the Add and Mul
452417
// instructions that form the reduction and allows us to find a single value
453418
// to be used as the initial input to the accumulator.
454-
std::function<bool(Value*, Reduction&)> Search = [&]
455-
(Value *V, Reduction &R) -> bool {
419+
std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
420+
(Value *V, BasicBlock *BB, Reduction &R) -> bool {
456421

457422
// If we find a non-instruction, try to use it as the initial accumulator
458423
// value. This may have already been found during the search in which case
@@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
461426
if (!I)
462427
return R.InsertAcc(V);
463428

429+
if (I->getParent() != BB)
430+
return false;
431+
464432
switch (I->getOpcode()) {
465433
default:
466434
break;
@@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
471439
// Adds should be adding together two muls, or another add and a mul to
472440
// be within the mac chain. One of the operands may also be the
473441
// accumulator value at which point we should stop searching.
474-
bool ValidLHS = Search(I->getOperand(0), R);
475-
bool ValidRHS = Search(I->getOperand(1), R);
442+
bool ValidLHS = Search(I->getOperand(0), BB, R);
443+
bool ValidRHS = Search(I->getOperand(1), BB, R);
476444
if (!ValidLHS && !ValidLHS)
477445
return false;
478446
else if (ValidLHS && ValidRHS) {
@@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
498466
return false;
499467
}
500468
case Instruction::SExt:
501-
return Search(I->getOperand(0), R);
469+
return Search(I->getOperand(0), BB, R);
502470
}
503471
return false;
504472
};
505473

506474
bool Changed = false;
507-
SmallPtrSet<Instruction*, 4> AllAdds;
508-
BasicBlock *Latch = L->getLoopLatch();
509475

510-
for (Instruction &I : reverse(*Latch)) {
511-
if (I.getOpcode() != Instruction::Add)
476+
for (auto &BB : F) {
477+
SmallPtrSet<Instruction*, 4> AllAdds;
478+
if (!RecordMemoryOps(&BB))
512479
continue;
513480

514-
if (AllAdds.count(&I))
515-
continue;
481+
for (Instruction &I : reverse(BB)) {
482+
if (I.getOpcode() != Instruction::Add)
483+
continue;
516484

517-
const auto *Ty = I.getType();
518-
if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
519-
continue;
485+
if (AllAdds.count(&I))
486+
continue;
520487

521-
Reduction R(&I);
522-
if (!Search(&I, R))
523-
continue;
488+
const auto *Ty = I.getType();
489+
if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
490+
continue;
524491

525-
if (!CreateParallelPairs(R))
526-
continue;
492+
Reduction R(&I);
493+
if (!Search(&I, &BB, R))
494+
continue;
527495

528-
InsertParallelMACs(R);
529-
Changed = true;
530-
AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
496+
if (!CreateParallelPairs(R))
497+
continue;
498+
499+
InsertParallelMACs(R);
500+
Changed = true;
501+
AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
502+
}
531503
}
532504

533505
return Changed;
@@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
745717
char ARMParallelDSP::ID = 0;
746718

747719
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
748-
"Transform loops to use DSP intrinsics", false, false)
720+
"Transform functions to use DSP intrinsics", false, false)
749721
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
750-
"Transform loops to use DSP intrinsics", false, false)
722+
"Transform functions to use DSP intrinsics", false, false)

test/CodeGen/ARM/O3-pipeline.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@
3737
; CHECK-NEXT: Scalar Evolution Analysis
3838
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
3939
; CHECK-NEXT: Function Alias Analysis Results
40-
; CHECK-NEXT: Loop Pass Manager
41-
; CHECK-NEXT: Transform loops to use DSP intrinsics
40+
; CHECK-NEXT: Transform functions to use DSP intrinsics
4241
; CHECK-NEXT: Interleaved Access Pass
4342
; CHECK-NEXT: ARM IR optimizations
4443
; CHECK-NEXT: Dominator Tree Construction
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
2+
3+
; CHECK-LABEL: single_block
4+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
5+
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
6+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
7+
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
8+
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
9+
define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
10+
entry:
11+
%ld.a.0 = load i16, i16* %a
12+
%sext.a.0 = sext i16 %ld.a.0 to i32
13+
%ld.b.0 = load i16, i16* %b
14+
%sext.b.0 = sext i16 %ld.b.0 to i32
15+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
16+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
17+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
18+
%ld.a.1 = load i16, i16* %addr.a.1
19+
%sext.a.1 = sext i16 %ld.a.1 to i32
20+
%ld.b.1 = load i16, i16* %addr.b.1
21+
%sext.b.1 = sext i16 %ld.b.1 to i32
22+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
23+
%add = add i32 %mul.0, %mul.1
24+
%res = add i32 %add, %acc
25+
ret i32 %res
26+
}
27+
28+
; CHECK-LABEL: multi_block
29+
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
30+
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
31+
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
32+
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
33+
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
34+
define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
35+
entry:
36+
%ld.a.0 = load i16, i16* %a
37+
%sext.a.0 = sext i16 %ld.a.0 to i32
38+
%ld.b.0 = load i16, i16* %b
39+
%sext.b.0 = sext i16 %ld.b.0 to i32
40+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
41+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
42+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
43+
%ld.a.1 = load i16, i16* %addr.a.1
44+
%sext.a.1 = sext i16 %ld.a.1 to i32
45+
%ld.b.1 = load i16, i16* %addr.b.1
46+
%sext.b.1 = sext i16 %ld.b.1 to i32
47+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
48+
%add = add i32 %mul.0, %mul.1
49+
br label %bb.1
50+
51+
bb.1:
52+
%res = add i32 %add, %acc
53+
ret i32 %res
54+
}
55+
56+
; CHECK-LABEL: multi_block_1
57+
; CHECK-NOT: call i32 @llvm.arm.smlad
58+
define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
59+
entry:
60+
%ld.a.0 = load i16, i16* %a
61+
%sext.a.0 = sext i16 %ld.a.0 to i32
62+
%ld.b.0 = load i16, i16* %b
63+
%sext.b.0 = sext i16 %ld.b.0 to i32
64+
%mul.0 = mul i32 %sext.a.0, %sext.b.0
65+
br label %bb.1
66+
67+
bb.1:
68+
%addr.a.1 = getelementptr i16, i16* %a, i32 1
69+
%addr.b.1 = getelementptr i16, i16* %b, i32 1
70+
%ld.a.1 = load i16, i16* %addr.a.1
71+
%sext.a.1 = sext i16 %ld.a.1 to i32
72+
%ld.b.1 = load i16, i16* %addr.b.1
73+
%sext.b.1 = sext i16 %ld.b.1 to i32
74+
%mul.1 = mul i32 %sext.a.1, %sext.b.1
75+
%add = add i32 %mul.0, %mul.1
76+
%res = add i32 %add, %acc
77+
ret i32 %res
78+
}
79+

test/CodeGen/ARM/ParallelDSP/smlad12.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
;
33
; The loop header is not the loop latch.
44
;
5-
; CHECK-NOT: call i32 @llvm.arm.smlad
5+
; CHECK: call i32 @llvm.arm.smlad
66
;
77
define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
88
entry:

0 commit comments

Comments
 (0)