Skip to content

Commit 10ec186

Browse files
author
Stephan Herhut
committed
[MLIR][GPU] Add error checking to loop.parallel to gpu transform.
Summary: Instead of crashing on malformed input, the pass now produces error messages. Differential Revision: https://reviews.llvm.org/D75468
1 parent 292ab49 commit 10ec186

File tree

2 files changed

+83
-13
lines changed

2 files changed

+83
-13
lines changed

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
572572
gpu::LaunchOp launchOp,
573573
BlockAndValueMapping &cloningMap,
574574
SmallVectorImpl<Operation *> &worklist,
575+
DenseMap<int, Value> &bounds,
575576
PatternRewriter &rewriter) {
576577
// TODO(herhut): Verify that this is a valid GPU mapping.
577578
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
@@ -631,31 +632,36 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
631632
// conditional. If the lower-bound is constant or defined before the
632633
// launch, we can use it in the launch bounds. Otherwise fail.
633634
if (!launchIndependent(lowerBound) &&
634-
!isa<ConstantOp>(lowerBound.getDefiningOp()))
635+
!isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
635636
return failure();
636637
// The step must also be constant or defined outside of the loop nest.
637-
if (!launchIndependent(step) && !isa<ConstantOp>(step.getDefiningOp()))
638+
if (!launchIndependent(step) &&
639+
!isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
638640
return failure();
639641
// If the upper-bound is constant or defined before the launch, we can
640642
// use it in the launch bounds directly. Otherwise try derive a bound.
641-
bool boundIsPrecise = launchIndependent(upperBound) ||
642-
isa<ConstantOp>(upperBound.getDefiningOp());
643+
bool boundIsPrecise =
644+
launchIndependent(upperBound) ||
645+
isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
643646
{
644647
PatternRewriter::InsertionGuard guard(rewriter);
645648
rewriter.setInsertionPoint(launchOp);
646649
if (!boundIsPrecise) {
647650
upperBound = deriveStaticUpperBound(upperBound, rewriter);
648-
if (!upperBound)
649-
return failure();
651+
if (!upperBound) {
652+
return parallelOp.emitOpError()
653+
<< "cannot derive loop-invariant upper bound for number "
654+
"of iterations";
655+
}
650656
}
651657
// Compute the number of iterations needed. We compute this as an
652658
// affine expression ceilDiv (upperBound - lowerBound) step. We use
653659
// affine.apply here so that it composes nicely with the provided map.
654660
AffineMap stepMap =
655661
AffineMap::get(0, 3,
656662
((rewriter.getAffineSymbolExpr(0) -
657-
rewriter.getAffineSymbolExpr(1)).ceilDiv(
658-
rewriter.getAffineSymbolExpr(2))));
663+
rewriter.getAffineSymbolExpr(1))
664+
.ceilDiv(rewriter.getAffineSymbolExpr(2))));
659665
Value launchBound = rewriter.create<AffineApplyOp>(
660666
loc, annotation.boundMap.compose(stepMap),
661667
ValueRange{
@@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
664670
ensureLaunchIndependent(
665671
cloningMap.lookupOrDefault(lowerBound)),
666672
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
667-
launchOp.setOperand(annotation.processor, launchBound);
673+
if (bounds.find(annotation.processor) != bounds.end()) {
674+
return parallelOp.emitOpError()
675+
<< "cannot redefine the bound for processor "
676+
<< annotation.processor;
677+
}
678+
bounds[annotation.processor] = launchBound;
668679
}
669680
if (!boundIsPrecise) {
670681
// We are using an approximation, create a surrounding conditional.
@@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
746757
rewriter.setInsertionPointToStart(&launchOp.body().front());
747758

748759
BlockAndValueMapping cloningMap;
760+
llvm::DenseMap<int, Value> launchBounds;
749761
SmallVector<Operation *, 16> worklist;
750762
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
751-
rewriter)))
763+
launchBounds, rewriter)))
752764
return matchFailure();
753765

754766
// Whether we have seen any side-effects. Reset when leaving an inner scope.
@@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
770782
// A nested loop.parallel needs insertion of code to compute indices.
771783
// Insert that now. This will also update the worklist with the loops
772784
// body.
773-
processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
774-
rewriter);
785+
if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
786+
worklist, launchBounds, rewriter)))
787+
return matchFailure();
775788
} else if (op == launchOp.getOperation()) {
776789
// Found our sentinel value. We have finished the operations from one
777790
// nesting level, pop one level back up.
@@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
791804
}
792805
}
793806

807+
// Now that we succeeded creating the launch operation, also update the
808+
// bounds.
809+
for (auto bound : launchBounds)
810+
launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
811+
794812
rewriter.eraseOp(parallelOp);
795813
return matchSuccess();
796814
}

mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
1+
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
22

33
// 2-d parallel loop mapped to block.y and block.x
44

@@ -299,3 +299,55 @@ module {
299299
// CHECK: return
300300
// CHECK: }
301301
// CHECK: }
302+
303+
// -----
304+
305+
// Mapping to the same processor twice.
306+
307+
func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
308+
%arg3 : index,
309+
%buf : memref<?x?xf32>,
310+
%res : memref<?x?xf32>) {
311+
%four = constant 4 : index
312+
// expected-error@+2 {{cannot redefine the bound for processor 1}}
313+
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
314+
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
315+
step (%four, %four) {
316+
} { mapping = [
317+
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
318+
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
319+
] }
320+
return
321+
}
322+
323+
// -----
324+
325+
// Loop with loop-variant upper bound.
326+
327+
func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
328+
%arg3 : index,
329+
%buf : memref<?x?xf32>,
330+
%res : memref<?x?xf32>) {
331+
%zero = constant 0 : index
332+
%one = constant 1 : index
333+
%four = constant 4 : index
334+
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
335+
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
336+
step (%four, %four) {
337+
// expected-error@+1 {{cannot derive loop-invariant upper bound}}
338+
loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
339+
step (%one, %one) {
340+
%idx0 = addi %i0, %si0 : index
341+
%idx1 = addi %i1, %si1 : index
342+
%val = load %buf[%idx0, %idx1] : memref<?x?xf32>
343+
store %val, %res[%idx1, %idx0] : memref<?x?xf32>
344+
} { mapping = [
345+
{processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
346+
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
347+
] }
348+
} { mapping = [
349+
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
350+
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
351+
] }
352+
return
353+
}

0 commit comments

Comments
 (0)