[MLIR] Introduce applyOpPatternsAndFold for op local rewrites

bondhugula · bondhugula · commit 04b5274ede3e · 2020-04-15T14:10:01.000+05:30
Introduce mlir::applyOpPatternsAndFold which applies patterns as well as any folding only on a specified op (in contrast to applyPatternsAndFoldGreedily which applies patterns only on the regions of an op isolated from above). The caller is made aware of the op being folded away or erased. Depends on D77485. Differential Revision: https://reviews.llvm.org/D77487
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
@@ -455,6 +455,15 @@ bool applyPatternsAndFoldGreedily(Operation *op,
 /// Rewrite the given regions, which must be isolated from above.
 bool applyPatternsAndFoldGreedily(MutableArrayRef<Region> regions,
                                   const OwningRewritePatternList &patterns);
+
+/// Applies the specified patterns on `op` alone while also trying to fold it,
+/// by selecting the highest-benefit patterns in a greedy manner. Returns true
+/// if the rewrite converges. If `erased` is non-null, `*erased` reports whether
+/// `op` was folded away, replaced, or erased as dead. Note: This does not
+/// apply any patterns recursively to the regions of `op`.
+bool applyOpPatternsAndFold(Operation *op,
+                            const OwningRewritePatternList &patterns,
+                            bool *erased = nullptr);
 } // end namespace mlir
 
 #endif // MLIR_PATTERN_MATCH_H
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -211,20 +211,25 @@ void AffineDataCopyGeneration::runOnFunction() {
   for (auto &block : f)
     runOnBlock(&block, copyNests);
 
-  // Promote any single iteration loops in the copy nests.
+  // Promote any single iteration loops in the copy nests and collect
+  // load/stores to simplify.
+  SmallVector<Operation *, 4> copyOps;
   for (auto nest : copyNests)
-    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
+    // With a post order walk, the erasure of loops does not affect
+    // continuation of the walk or the collection of load/store ops.
+    nest->walk([&](Operation *op) {
+      if (auto forOp = dyn_cast<AffineForOp>(op))
+        promoteIfSingleIteration(forOp);
+      else if (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op))
+        copyOps.push_back(op);
+    });
 
   // Promoting single iteration loops could lead to simplification of
-  // load's/store's. We will run canonicalization patterns on load/stores.
-  // TODO: this whole function load/store canonicalization should be replaced by
-  // canonicalization that is limited to only the load/store ops
-  // introduced/touched by this pass (those inside 'copyNests'). This would be
-  // possible once the necessary support is available in the pattern rewriter.
-  if (!copyNests.empty()) {
-    OwningRewritePatternList patterns;
-    AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
-    AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
-    applyPatternsAndFoldGreedily(f, std::move(patterns));
-  }
+  // contained loads/stores; canonicalize just those ops. `patterns` is reused
+  // for every op, so it is passed by reference and never moved from.
+  OwningRewritePatternList patterns;
+  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
+  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
+  for (auto op : copyOps)
+    applyOpPatternsAndFold(op, patterns);
 }
diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
@@ -6,14 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements a pass to simplify affine structures.
+// This file implements a pass to simplify affine structures in operations.
 //
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
 #include "mlir/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/IR/IntegerSet.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/Utils.h"
 
 #define DEBUG_TYPE "simplify-affine-structure"
@@ -77,13 +79,22 @@ mlir::createSimplifyAffineStructuresPass() {
 void SimplifyAffineStructures::runOnFunction() {
   auto func = getFunction();
   simplifiedAttributes.clear();
-  func.walk([&](Operation *opInst) {
-    for (auto attr : opInst->getAttrs()) {
+  OwningRewritePatternList patterns;
+  AffineForOp::getCanonicalizationPatterns(patterns, func.getContext());
+  AffineIfOp::getCanonicalizationPatterns(patterns, func.getContext());
+  AffineApplyOp::getCanonicalizationPatterns(patterns, func.getContext());
+  func.walk([&](Operation *op) {
+    for (auto attr : op->getAttrs()) {
       if (auto mapAttr = attr.second.dyn_cast<AffineMapAttr>())
-        simplifyAndUpdateAttribute(opInst, attr.first, mapAttr);
+        simplifyAndUpdateAttribute(op, attr.first, mapAttr);
       else if (auto setAttr = attr.second.dyn_cast<IntegerSetAttr>())
-        simplifyAndUpdateAttribute(opInst, attr.first, setAttr);
+        simplifyAndUpdateAttribute(op, attr.first, setAttr);
     }
+
+    // The simplification of the attribute will likely simplify the op. Try to
+    // fold / apply canonicalization patterns when we have affine dialect ops.
+    if (isa<AffineForOp>(op) || isa<AffineIfOp>(op) || isa<AffineApplyOp>(op))
+      applyOpPatternsAndFold(op, patterns);
   });
 
   // Turn memrefs' non-identity layouts maps into ones with identity. Collect
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -26,6 +26,10 @@ using namespace mlir;
 /// The max number of iterations scanning for pattern match.
 static unsigned maxPatternMatchIterations = 10;
 
+//===----------------------------------------------------------------------===//
+// GreedyPatternRewriteDriver
+//===----------------------------------------------------------------------===//
+
 namespace {
 /// This is a worklist-driven driver for the PatternMatcher, which repeatedly
 /// applies the locally optimal patterns in a roughly "bottom up" way.
@@ -37,8 +41,6 @@ class GreedyPatternRewriteDriver : public PatternRewriter {
     worklist.reserve(64);
   }
 
-  /// Perform the rewrites while folding and erasing any dead ops. Return true
-  /// if the rewrite converges in `maxIterations`.
   bool simplify(MutableArrayRef<Region> regions, int maxIterations);
 
   void addToWorklist(Operation *op) {
@@ -248,3 +250,112 @@ bool mlir::applyPatternsAndFoldGreedily(
   });
   return converged;
 }
+
+//===----------------------------------------------------------------------===//
+// OpPatternRewriteDriver
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A minimal PatternRewriter-based driver that applies patterns and folding to
+/// one op only. The work is done by `simplifyLocally`, defined out of line.
+class OpPatternRewriteDriver : public PatternRewriter {
+public:
+  explicit OpPatternRewriteDriver(MLIRContext *ctx,
+                                  const OwningRewritePatternList &patterns)
+      : PatternRewriter(ctx), matcher(patterns), folder(ctx) {}
+  /// Folds/rewrites `op` only; `erased` is an out-parameter set by the driver.
+  bool simplifyLocally(Operation *op, int maxIterations, bool &erased);
+
+  /// Plain insertion through OpBuilder; this driver does no extra bookkeeping.
+  Operation *insert(Operation *op) override { return OpBuilder::insert(op); }
+
+  // PatternRewriter notification hooks overridden by this driver.
+protected:
+  /// Records that a pattern removed an op. NOTE(review): this fires for any
+  /// removed op, not just the op being simplified - confirm that is intended.
+  void notifyOperationRemoved(Operation *op) override {
+    opErasedViaPatternRewrites = true;
+  }
+
+  // Intentionally empty: per the original note, replacing a root also triggers
+  // the removal notification above, which is where the erasure is recorded.
+  void notifyRootReplaced(Operation *op) override {}
+
+private:
+  /// The low-level pattern matcher, built from the caller's pattern list.
+  RewritePatternMatcher matcher;
+
+  /// Non-pattern based folder for operations.
+  OperationFolder folder;
+
+  /// Set by `notifyOperationRemoved`; read and reset by `simplifyLocally`.
+  bool opErasedViaPatternRewrites = false;
+};
+
+} // anonymous namespace
+
+/// Performs the rewrites and folding only on `op`. The simplification converges
+/// if the op is erased as a result of being folded, replaced, or dead, or no
+/// more changes happen in an iteration. Returns true if the rewrite converges
+/// in `maxIterations`. `erased` is set to true if `op` gets erased.
+bool OpPatternRewriteDriver::simplifyLocally(Operation *op, int maxIterations,
+                                             bool &erased) {
+  bool changed = false;
+  erased = opErasedViaPatternRewrites = false;
+  int i = 0;
+  // Iterate until convergence or until maxIterations. Deletion of the op as
+  // a result of being dead or folded is convergence.
+  do {
+    // Reset per iteration; a stale `true` would always exhaust maxIterations.
+    changed = false;
+    // If the operation is trivially dead - remove it.
+    if (isOpTriviallyDead(op)) {
+      op->erase();
+      erased = true;
+      return true;
+    }
+
+    // Try to fold this op. A fold that is not in place means `op` is gone.
+    bool inPlaceUpdate = false;
+    if (succeeded(folder.tryToFold(op, /*processGeneratedConstants=*/nullptr,
+                                   /*preReplaceAction=*/nullptr,
+                                   &inPlaceUpdate))) {
+      changed = true;
+      if (!inPlaceUpdate) {
+        erased = true;
+        return true;
+      }
+    }
+
+    // Make sure that any new operations are inserted at this point.
+    setInsertionPoint(op);
+
+    // Try to match one of the patterns; removals are reported via the hooks.
+    changed |= matcher.matchAndRewrite(op, *this);
+    if ((erased = opErasedViaPatternRewrites))
+      return true;
+  } while (changed && ++i < maxIterations);
+
+  // Whether the rewrite converges, i.e. wasn't changed in the last iteration.
+  return !changed;
+}
+
+/// Runs a single-op rewrite driver over `op` with `patterns`, folding where
+/// possible. Reports through `erased` (when non-null) whether `op` went away
+/// because it was folded, replaced, or found dead; returns convergence.
+bool mlir::applyOpPatternsAndFold(Operation *op,
+                                  const OwningRewritePatternList &patterns,
+                                  bool *erased) {
+  OpPatternRewriteDriver rewriter(op->getContext(), patterns);
+  bool wasErased;
+  const bool didConverge =
+      rewriter.simplifyLocally(op, maxPatternMatchIterations, wasErased);
+  // The out-parameter is optional; only write through it when supplied.
+  if (erased != nullptr)
+    *erased = wasErased;
+  LLVM_DEBUG(if (!didConverge) {
+    llvm::dbgs() << "The pattern rewrite doesn't converge after scanning "
+                 << maxPatternMatchIterations << " times";
+  });
+  return didConverge;
+}
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -23,6 +23,7 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/IntegerSet.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "mlir/Transforms/Utils.h"
 #include "llvm/ADT/DenseMap.h"
@@ -312,9 +313,19 @@ LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
                                   opGroupQueue, /*offset=*/0, forOp, b);
         lbShift = d * step;
       }
-      if (!prologue && res)
-        prologue = res;
-      epilogue = res;
+
+      if (res) {
+        // Simplify/canonicalize the affine.for.
+        OwningRewritePatternList patterns;
+        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
+        bool erased;
+        applyOpPatternsAndFold(res, std::move(patterns), &erased);
+
+        if (!erased && !prologue)
+          prologue = res;
+        if (!erased)
+          epilogue = res;
+      }
     } else {
       // Start of first interval.
       lbShift = d * step;
@@ -694,7 +705,8 @@ bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
 }
 
 /// Return true if `loops` is a perfect nest.
-static bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops) {
+static bool LLVM_ATTRIBUTE_UNUSED
+isPerfectlyNested(ArrayRef<AffineForOp> loops) {
   auto outerLoop = loops.front();
   for (auto loop : loops.drop_front()) {
     auto parentForOp = dyn_cast<AffineForOp>(loop.getParentOp());
diff --git a/mlir/test/Dialect/Affine/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir
@@ -216,7 +216,7 @@ func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
   return %A : memref<4096xf32>
 }
 // CHECK:      affine.for %[[IV1:.*]] = 0 to 4096 step 100
-// CHECK-NEXT:   %[[BUF:.*]] = alloc() : memref<100xf32>
+// CHECK:        %[[BUF:.*]] = alloc() : memref<100xf32>
 // CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
 // CHECK-NEXT:     affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
 // CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
@@ -226,7 +226,7 @@ func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
 // CHECK-NEXT:     mulf
 // CHECK-NEXT:     affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
 // CHECK-NEXT:   }
-// CHECK-NEXT:   affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
+// CHECK:        affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
 // CHECK-NEXT:     affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
 // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32>
 // CHECK-NEXT:   }
@@ -239,8 +239,8 @@ func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
 // with multi-level tiling when the tile sizes used don't divide loop trip
 // counts.
 
-#lb = affine_map<(d0, d1) -> (d0 * 512, d1 * 6)>
-#ub = affine_map<(d0, d1) -> (d0 * 512 + 512, d1 * 6 + 6)>
+#lb = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
+#ub = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>
 
 // CHECK-DAG: #[[LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
 // CHECK-DAG: #[[UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>
@@ -250,7 +250,7 @@ func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
 // CHECK-SAME: [[j:arg[0-9]+]]
 func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
   affine.for %ii = 0 to 2048 {
-    affine.for %jj = max #lb(%i, %j) to min #ub(%i, %j) {
+    affine.for %jj = max #lb()[%i, %j] to min #ub()[%i, %j] {
       affine.load %M[%ii, %jj] : memref<2048x516xf64>
     }
   }
diff --git a/mlir/test/Dialect/Affine/simplify-affine-structures.mlir b/mlir/test/Dialect/Affine/simplify-affine-structures.mlir
diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp