1
- // ===- ParallelDSP .cpp - Parallel DSP Pass --- -----------------------------===//
1
+ // ===- ARMParallelDSP .cpp - Parallel DSP Pass -----------------------------===//
2
2
//
3
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
4
// See https://llvm.org/LICENSE.txt for license information.
18
18
#include " llvm/ADT/SmallPtrSet.h"
19
19
#include " llvm/Analysis/AliasAnalysis.h"
20
20
#include " llvm/Analysis/LoopAccessAnalysis.h"
21
- #include " llvm/Analysis/LoopPass.h"
22
- #include " llvm/Analysis/LoopInfo.h"
23
21
#include " llvm/IR/Instructions.h"
24
22
#include " llvm/IR/NoFolder.h"
25
23
#include " llvm/Transforms/Scalar.h"
26
24
#include " llvm/Transforms/Utils/BasicBlockUtils.h"
27
- #include " llvm/Transforms/Utils/LoopUtils.h"
28
25
#include " llvm/Pass.h"
29
26
#include " llvm/PassRegistry.h"
30
27
#include " llvm/PassSupport.h"
@@ -156,13 +153,11 @@ namespace {
156
153
}
157
154
};
158
155
159
- class ARMParallelDSP : public LoopPass {
156
+ class ARMParallelDSP : public FunctionPass {
160
157
ScalarEvolution *SE;
161
158
AliasAnalysis *AA;
162
159
TargetLibraryInfo *TLI;
163
160
DominatorTree *DT;
164
- LoopInfo *LI;
165
- Loop *L;
166
161
const DataLayout *DL;
167
162
Module *M;
168
163
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -184,63 +179,38 @@ namespace {
184
179
// / products to a 32-bit accumulate operand. Optionally, the instruction can
185
180
// / exchange the halfwords of the second operand before performing the
186
181
// / arithmetic.
187
- bool MatchSMLAD (Loop *L );
182
+ bool MatchSMLAD (Function &F );
188
183
189
184
public:
190
185
static char ID;
191
186
192
- ARMParallelDSP () : LoopPass(ID) { }
193
-
194
- bool doInitialization (Loop *L, LPPassManager &LPM) override {
195
- LoadPairs.clear ();
196
- WideLoads.clear ();
197
- return true ;
198
- }
187
+ ARMParallelDSP () : FunctionPass(ID) { }
199
188
200
189
void getAnalysisUsage (AnalysisUsage &AU) const override {
201
- LoopPass ::getAnalysisUsage (AU);
190
+ FunctionPass ::getAnalysisUsage (AU);
202
191
AU.addRequired <AssumptionCacheTracker>();
203
192
AU.addRequired <ScalarEvolutionWrapperPass>();
204
193
AU.addRequired <AAResultsWrapperPass>();
205
194
AU.addRequired <TargetLibraryInfoWrapperPass>();
206
- AU.addRequired <LoopInfoWrapperPass>();
207
195
AU.addRequired <DominatorTreeWrapperPass>();
208
196
AU.addRequired <TargetPassConfig>();
209
- AU.addPreserved <LoopInfoWrapperPass>();
197
+ AU.addPreserved <ScalarEvolutionWrapperPass>();
198
+ AU.addPreserved <GlobalsAAWrapperPass>();
210
199
AU.setPreservesCFG ();
211
200
}
212
201
213
- bool runOnLoop (Loop *TheLoop, LPPassManager & ) override {
202
+ bool runOnFunction (Function &F ) override {
214
203
if (DisableParallelDSP)
215
204
return false ;
216
- if (skipLoop (TheLoop ))
205
+ if (skipFunction (F ))
217
206
return false ;
218
207
219
- L = TheLoop;
220
208
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE ();
221
209
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults ();
222
210
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI ();
223
211
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree ();
224
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo ();
225
212
auto &TPC = getAnalysis<TargetPassConfig>();
226
213
227
- BasicBlock *Header = TheLoop->getHeader ();
228
- if (!Header)
229
- return false ;
230
-
231
- // TODO: We assume the loop header and latch to be the same block.
232
- // This is not a fundamental restriction, but lifting this would just
233
- // require more work to do the transformation and then patch up the CFG.
234
- if (Header != TheLoop->getLoopLatch ()) {
235
- LLVM_DEBUG (dbgs () << " The loop header is not the loop latch: not "
236
- " running pass ARMParallelDSP\n " );
237
- return false ;
238
- }
239
-
240
- if (!TheLoop->getLoopPreheader ())
241
- InsertPreheaderForLoop (L, DT, LI, nullptr , true );
242
-
243
- Function &F = *Header->getParent ();
244
214
M = F.getParent ();
245
215
DL = &M->getDataLayout ();
246
216
@@ -265,17 +235,10 @@ namespace {
265
235
return false ;
266
236
}
267
237
268
- LoopAccessInfo LAI (L, SE, TLI, AA, DT, LI);
269
-
270
238
LLVM_DEBUG (dbgs () << " \n == Parallel DSP pass ==\n " );
271
239
LLVM_DEBUG (dbgs () << " - " << F.getName () << " \n\n " );
272
240
273
- if (!RecordMemoryOps (Header)) {
274
- LLVM_DEBUG (dbgs () << " - No sequential loads found.\n " );
275
- return false ;
276
- }
277
-
278
- bool Changes = MatchSMLAD (L);
241
+ bool Changes = MatchSMLAD (F);
279
242
return Changes;
280
243
}
281
244
};
@@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
337
300
bool ARMParallelDSP::RecordMemoryOps (BasicBlock *BB) {
338
301
SmallVector<LoadInst*, 8 > Loads;
339
302
SmallVector<Instruction*, 8 > Writes;
303
+ LoadPairs.clear ();
304
+ WideLoads.clear ();
340
305
341
306
// Collect loads and instructions that may write to memory. For now we only
342
307
// record loads which are simple, sign-extended and have a single user.
@@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
415
380
return LoadPairs.size () > 1 ;
416
381
}
417
382
418
- // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
383
+ // The pass needs to identify integer add/sub reductions of 16-bit vector
419
384
// multiplications.
420
385
// To use SMLAD:
421
386
// 1) we first need to find integer add then look for this pattern:
@@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
446
411
// If loop invariants are used instead of loads, these need to be packed
447
412
// before the loop begins.
448
413
//
449
- bool ARMParallelDSP::MatchSMLAD (Loop *L ) {
414
+ bool ARMParallelDSP::MatchSMLAD (Function &F ) {
450
415
// Search recursively back through the operands to find a tree of values that
451
416
// form a multiply-accumulate chain. The search records the Add and Mul
452
417
// instructions that form the reduction and allows us to find a single value
453
418
// to be used as the initial input to the accumulator.
454
- std::function<bool (Value*, Reduction&)> Search = [&]
455
- (Value *V, Reduction &R) -> bool {
419
+ std::function<bool (Value*, BasicBlock*, Reduction&)> Search = [&]
420
+ (Value *V, BasicBlock *BB, Reduction &R) -> bool {
456
421
457
422
// If we find a non-instruction, try to use it as the initial accumulator
458
423
// value. This may have already been found during the search in which case
@@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
461
426
if (!I)
462
427
return R.InsertAcc (V);
463
428
429
+ if (I->getParent () != BB)
430
+ return false ;
431
+
464
432
switch (I->getOpcode ()) {
465
433
default :
466
434
break ;
@@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
471
439
// Adds should be adding together two muls, or another add and a mul to
472
440
// be within the mac chain. One of the operands may also be the
473
441
// accumulator value at which point we should stop searching.
474
- bool ValidLHS = Search (I->getOperand (0 ), R);
475
- bool ValidRHS = Search (I->getOperand (1 ), R);
442
+ bool ValidLHS = Search (I->getOperand (0 ), BB, R);
443
+ bool ValidRHS = Search (I->getOperand (1 ), BB, R);
476
444
// Give up only when neither operand leads into the mac chain; the original
// condition tested ValidLHS twice and never consulted ValidRHS.
if (!ValidLHS && !ValidRHS)
477
445
return false ;
478
446
else if (ValidLHS && ValidRHS) {
@@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L) {
498
466
return false ;
499
467
}
500
468
case Instruction::SExt:
501
- return Search (I->getOperand (0 ), R);
469
+ return Search (I->getOperand (0 ), BB, R);
502
470
}
503
471
return false ;
504
472
};
505
473
506
474
bool Changed = false ;
507
- SmallPtrSet<Instruction*, 4 > AllAdds;
508
- BasicBlock *Latch = L->getLoopLatch ();
509
475
510
- for (Instruction &I : reverse (*Latch)) {
511
- if (I.getOpcode () != Instruction::Add)
476
+ for (auto &BB : F) {
477
+ SmallPtrSet<Instruction*, 4 > AllAdds;
478
+ if (!RecordMemoryOps (&BB))
512
479
continue ;
513
480
514
- if (AllAdds.count (&I))
515
- continue ;
481
+ for (Instruction &I : reverse (BB)) {
482
+ if (I.getOpcode () != Instruction::Add)
483
+ continue ;
516
484
517
- const auto *Ty = I.getType ();
518
- if (!Ty->isIntegerTy (32 ) && !Ty->isIntegerTy (64 ))
519
- continue ;
485
+ if (AllAdds.count (&I))
486
+ continue ;
520
487
521
- Reduction R (&I );
522
- if (!Search (&I, R ))
523
- continue ;
488
+ const auto *Ty = I. getType ( );
489
+ if (!Ty-> isIntegerTy ( 32 ) && !Ty-> isIntegerTy ( 64 ))
490
+ continue ;
524
491
525
- if (!CreateParallelPairs (R))
526
- continue ;
492
+ Reduction R (&I);
493
+ if (!Search (&I, &BB, R))
494
+ continue ;
527
495
528
- InsertParallelMACs (R);
529
- Changed = true ;
530
- AllAdds.insert (R.getAdds ().begin (), R.getAdds ().end ());
496
+ if (!CreateParallelPairs (R))
497
+ continue ;
498
+
499
+ InsertParallelMACs (R);
500
+ Changed = true ;
501
+ AllAdds.insert (R.getAdds ().begin (), R.getAdds ().end ());
502
+ }
531
503
}
532
504
533
505
return Changed;
@@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
745
717
char ARMParallelDSP::ID = 0 ;
746
718
747
719
INITIALIZE_PASS_BEGIN (ARMParallelDSP, " arm-parallel-dsp" ,
748
- " Transform loops to use DSP intrinsics" , false , false )
720
+ " Transform functions to use DSP intrinsics" , false , false )
749
721
INITIALIZE_PASS_END(ARMParallelDSP, " arm-parallel-dsp" ,
750
- " Transform loops to use DSP intrinsics" , false , false )
722
+ " Transform functions to use DSP intrinsics" , false , false )
0 commit comments