@@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
572
572
gpu::LaunchOp launchOp,
573
573
BlockAndValueMapping &cloningMap,
574
574
SmallVectorImpl<Operation *> &worklist,
575
+ DenseMap<int , Value> &bounds,
575
576
PatternRewriter &rewriter) {
576
577
// TODO(herhut): Verify that this is a valid GPU mapping.
577
578
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
@@ -631,31 +632,36 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
631
632
// conditional. If the lower-bound is constant or defined before the
632
633
// launch, we can use it in the launch bounds. Otherwise fail.
633
634
if (!launchIndependent (lowerBound) &&
634
- !isa <ConstantOp>(lowerBound.getDefiningOp ()))
635
+ !isa_and_nonnull <ConstantOp>(lowerBound.getDefiningOp ()))
635
636
return failure ();
636
637
// The step must also be constant or defined outside of the loop nest.
637
- if (!launchIndependent (step) && !isa<ConstantOp>(step.getDefiningOp ()))
638
+ if (!launchIndependent (step) &&
639
+ !isa_and_nonnull<ConstantOp>(step.getDefiningOp ()))
638
640
return failure ();
639
641
// If the upper-bound is constant or defined before the launch, we can
640
642
// use it in the launch bounds directly. Otherwise try derive a bound.
641
- bool boundIsPrecise = launchIndependent (upperBound) ||
642
- isa<ConstantOp>(upperBound.getDefiningOp ());
643
+ bool boundIsPrecise =
644
+ launchIndependent (upperBound) ||
645
+ isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp ());
643
646
{
644
647
PatternRewriter::InsertionGuard guard (rewriter);
645
648
rewriter.setInsertionPoint (launchOp);
646
649
if (!boundIsPrecise) {
647
650
upperBound = deriveStaticUpperBound (upperBound, rewriter);
648
- if (!upperBound)
649
- return failure ();
651
+ if (!upperBound) {
652
+ return parallelOp.emitOpError ()
653
+ << " cannot derive loop-invariant upper bound for number "
654
+ " of iterations" ;
655
+ }
650
656
}
651
657
// Compute the number of iterations needed. We compute this as an
652
658
// affine expression ceilDiv (upperBound - lowerBound) step. We use
653
659
// affine.apply here so that it composes nicely with the provided map.
654
660
AffineMap stepMap =
655
661
AffineMap::get (0 , 3 ,
656
662
((rewriter.getAffineSymbolExpr (0 ) -
657
- rewriter.getAffineSymbolExpr (1 )). ceilDiv (
658
- rewriter.getAffineSymbolExpr (2 ))));
663
+ rewriter.getAffineSymbolExpr (1 ))
664
+ . ceilDiv ( rewriter.getAffineSymbolExpr (2 ))));
659
665
Value launchBound = rewriter.create <AffineApplyOp>(
660
666
loc, annotation.boundMap .compose (stepMap),
661
667
ValueRange{
@@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
664
670
ensureLaunchIndependent (
665
671
cloningMap.lookupOrDefault (lowerBound)),
666
672
ensureLaunchIndependent (cloningMap.lookupOrDefault (step))});
667
- launchOp.setOperand (annotation.processor , launchBound);
673
+ if (bounds.find (annotation.processor ) != bounds.end ()) {
674
+ return parallelOp.emitOpError ()
675
+ << " cannot redefine the bound for processor "
676
+ << annotation.processor ;
677
+ }
678
+ bounds[annotation.processor ] = launchBound;
668
679
}
669
680
if (!boundIsPrecise) {
670
681
// We are using an approximation, create a surrounding conditional.
@@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
746
757
rewriter.setInsertionPointToStart (&launchOp.body ().front ());
747
758
748
759
BlockAndValueMapping cloningMap;
760
+ llvm::DenseMap<int , Value> launchBounds;
749
761
SmallVector<Operation *, 16 > worklist;
750
762
if (failed (processParallelLoop (parallelOp, launchOp, cloningMap, worklist,
751
- rewriter)))
763
+ launchBounds, rewriter)))
752
764
return matchFailure ();
753
765
754
766
// Whether we have seen any side-effects. Reset when leaving an inner scope.
@@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
770
782
// A nested loop.parallel needs insertion of code to compute indices.
771
783
// Insert that now. This will also update the worklist with the loops
772
784
// body.
773
- processParallelLoop (nestedParallel, launchOp, cloningMap, worklist,
774
- rewriter);
785
+ if (failed (processParallelLoop (nestedParallel, launchOp, cloningMap,
786
+ worklist, launchBounds, rewriter)))
787
+ return matchFailure ();
775
788
} else if (op == launchOp.getOperation ()) {
776
789
// Found our sentinel value. We have finished the operations from one
777
790
// nesting level, pop one level back up.
@@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
791
804
}
792
805
}
793
806
807
+ // Now that we succeeded creating the launch operation, also update the
808
+ // bounds.
809
+ for (auto bound : launchBounds)
810
+ launchOp.setOperand (std::get<0 >(bound), std::get<1 >(bound));
811
+
794
812
rewriter.eraseOp (parallelOp);
795
813
return matchSuccess ();
796
814
}
0 commit comments