MLIR  22.0.0git
LoopSpecialization.cpp
Go to the documentation of this file.
1 //===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Specializes parallel loops and for loops for easier unrolling and
10 // vectorization.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/Support/MathExtras.h"
27 
28 namespace mlir {
29 #define GEN_PASS_DEF_SCFFORLOOPPEELING
30 #define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
31 #define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
32 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
33 } // namespace mlir
34 
35 using namespace mlir;
36 using namespace mlir::affine;
37 using scf::ForOp;
38 using scf::ParallelOp;
39 
40 /// Rewrite a parallel loop with bounds defined by an affine.min with a constant
41 /// into 2 loops after checking if the bounds are equal to that constant. This
42 /// is beneficial if the loop will almost always have the constant bound and
43 /// that version can be fully unrolled and vectorized.
44 static void specializeParallelLoopForUnrolling(ParallelOp op) {
45  SmallVector<int64_t, 2> constantIndices;
46  constantIndices.reserve(op.getUpperBound().size());
47  for (auto bound : op.getUpperBound()) {
48  auto minOp = bound.getDefiningOp<AffineMinOp>();
49  if (!minOp)
50  return;
51  int64_t minConstant = std::numeric_limits<int64_t>::max();
52  for (AffineExpr expr : minOp.getMap().getResults()) {
53  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
54  minConstant = std::min(minConstant, constantIndex.getValue());
55  }
56  if (minConstant == std::numeric_limits<int64_t>::max())
57  return;
58  constantIndices.push_back(minConstant);
59  }
60 
61  OpBuilder b(op);
62  IRMapping map;
63  Value cond;
64  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
65  Value constant =
66  arith::ConstantIndexOp::create(b, op.getLoc(), std::get<1>(bound));
67  Value cmp = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
68  std::get<0>(bound), constant);
69  cond = cond ? arith::AndIOp::create(b, op.getLoc(), cond, cmp) : cmp;
70  map.map(std::get<0>(bound), constant);
71  }
72  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
73  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
74  ifOp.getElseBodyBuilder().clone(*op.getOperation());
75  op.erase();
76 }
77 
78 /// Rewrite a for loop with bounds defined by an affine.min with a constant into
79 /// 2 loops after checking if the bounds are equal to that constant. This is
80 /// beneficial if the loop will almost always have the constant bound and that
81 /// version can be fully unrolled and vectorized.
82 static void specializeForLoopForUnrolling(ForOp op) {
83  auto bound = op.getUpperBound();
84  auto minOp = bound.getDefiningOp<AffineMinOp>();
85  if (!minOp)
86  return;
87  int64_t minConstant = std::numeric_limits<int64_t>::max();
88  for (AffineExpr expr : minOp.getMap().getResults()) {
89  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
90  minConstant = std::min(minConstant, constantIndex.getValue());
91  }
92  if (minConstant == std::numeric_limits<int64_t>::max())
93  return;
94 
95  OpBuilder b(op);
96  IRMapping map;
97  Value constant = arith::ConstantIndexOp::create(b, op.getLoc(), minConstant);
98  Value cond = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
99  bound, constant);
100  map.map(bound, constant);
101  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
102  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
103  ifOp.getElseBodyBuilder().clone(*op.getOperation());
104  op.erase();
105 }
106 
107 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
108 /// into a for loop where the step divides the iteration space evenly, followed
109 /// by an scf.if for the last (partial) iteration (if any).
110 ///
111 /// This function rewrites the given scf.for loop in-place and creates a new
112 /// scf.if operation for the last iteration. It replaces all uses of the
113 /// unpeeled loop with the results of the newly generated scf.if.
114 ///
115 /// The newly generated scf.if operation is returned via `ifOp`. The boundary
116 /// at which the loop is split (new upper bound) is returned via `splitBound`.
117 /// The return value indicates whether the loop was rewritten or not.
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  // Constant-fold lb/ub/step where possible so trivial cases can be rejected
  // without building affine expressions.
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step size is 1. Also bail out in case of an
  // invalid zero or negative step which might have happened during folding.
  if (stepInt && *stepInt <= 1)
    return failure();

  // No specialization necessary if step already divides upper bound evenly.
  // Fast path: lb, ub and step are constants.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // Slow path: Examine the ops that define lb, ub and step.
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
                              forOp.getStep()};
  // Build (%ub - %lb) mod %step; if this simplifies to the constant 0, the
  // step divides the iteration space evenly and no peeling is needed.
  // NOTE(review): upstream composes `map` with `operands`
  // (fullyComposeAffineMapAndOperands) before the constant check below --
  // verify that a line was not lost from this copy.
  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
    if (constExpr.getValue() == 0)
      return failure();

  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});

  // Create ForOp for partial iteration.
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  // Route all external uses of the original loop's results through the
  // partial iteration, then feed the main loop's results into it as init
  // values (the partial loop continues where the main loop left off).
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.modifyOpInPlace(forOp,
                    [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}
167 
168 static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
169  ForOp partialIteration,
170  Value previousUb) {
171  Value mainIv = forOp.getInductionVar();
172  Value partialIv = partialIteration.getInductionVar();
173  assert(forOp.getStep() == partialIteration.getStep() &&
174  "expected same step in main and partial loop");
175  Value step = forOp.getStep();
176 
177  forOp.walk([&](Operation *affineOp) {
178  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
179  return WalkResult::advance();
180  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
181  step,
182  /*insideLoop=*/true);
183  return WalkResult::advance();
184  });
185  partialIteration.walk([&](Operation *affineOp) {
186  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
187  return WalkResult::advance();
188  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
189  step, /*insideLoop=*/false);
190  return WalkResult::advance();
191  });
192 }
193 
                                                 ForOp forOp,
                                                 ForOp &partialIteration) {
  // Remember the original upper bound before peeling rewrites it; it is
  // needed to simplify affine.min/max ops in both resulting loops.
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  // Peel off the last (partial) iteration; bail out if the loop does not
  // need, or cannot be, peeled.
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}
207 
208 /// Rewrites the original scf::ForOp as two scf::ForOp Ops, the first
209 /// scf::ForOp corresponds to the first iteration of the loop which can be
210 /// canonicalized away in the following optimizations. The second loop Op
211 /// contains the remaining iterations, with a lower bound updated as the
212 /// original lower bound plus the step (i.e. skips the first iteration).
213 LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
214  ForOp &firstIteration) {
215  RewriterBase::InsertionGuard guard(b);
216  auto lbInt = getConstantIntValue(forOp.getLowerBound());
217  auto ubInt = getConstantIntValue(forOp.getUpperBound());
218  auto stepInt = getConstantIntValue(forOp.getStep());
219 
220  // Peeling is not needed if there is one or less iteration.
221  if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
222  return failure();
223 
224  AffineExpr lbSymbol, stepSymbol;
225  bindSymbols(b.getContext(), lbSymbol, stepSymbol);
226 
227  // New lower bound for main loop: %lb + %step
228  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
229  b.setInsertionPoint(forOp);
230  auto loc = forOp.getLoc();
231  Value splitBound = b.createOrFold<AffineApplyOp>(
232  loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});
233 
234  // Peel the first iteration.
235  IRMapping map;
236  map.map(forOp.getUpperBound(), splitBound);
237  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation(), map));
238 
239  // Update main loop with new lower bound.
240  b.modifyOpInPlace(forOp, [&]() {
241  forOp.getInitArgsMutable().assign(firstIteration->getResults());
242  forOp.getLowerBoundMutable().assign(splitBound);
243  });
244 
245  return success();
246 }
247 
248 static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
249 static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
250 
251 namespace {
/// Pattern that peels an scf.for loop: either the first iteration (peelFront)
/// or the trailing partial iteration when the step does not evenly divide the
/// iteration space. Marker attributes prevent re-peeling the same loop.
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
        skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // The peeling arithmetic assumes signed bound comparisons.
    if (forOp.getUnsignedCmp())
      return rewriter.notifyMatchFailure(forOp,
                                         "unsigned loops are not supported");

    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::ForOp partialIteration;
    // The case for peeling the first iteration of the loop.
    if (peelFront) {
      if (failed(
              peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
        return failure();
      }
    } else {
      if (skipPartial) {
        // No peeling of loops inside the partial iteration of another peeled
        // loop.
        // NOTE(review): upstream guards the `return failure()` below with
        // `if (op->hasAttr(kPartialIterationLabel))`; as written here, ANY
        // loop nested in another scf.for is rejected. A line appears to be
        // missing from this copy -- verify against upstream.
        Operation *op = forOp.getOperation();
        while ((op = op->getParentOfType<scf::ForOp>())) {
          return failure();
        }
      }
      // Apply loop peeling.
      if (failed(
              peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
        return failure();
    }

    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.modifyOpInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.modifyOpInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  // If set to true, the first iteration of the loop will be peeled. Otherwise,
  // the unevenly divisible loop will be peeled at the end.
  bool peelFront;

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
312 } // namespace
313 
314 namespace {
315 struct ParallelLoopSpecialization
316  : public impl::SCFParallelLoopSpecializationBase<
317  ParallelLoopSpecialization> {
318  void runOnOperation() override {
319  getOperation()->walk(
320  [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
321  }
322 };
323 
324 struct ForLoopSpecialization
325  : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
326  void runOnOperation() override {
327  getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
328  }
329 };
330 
/// Pass wrapper: greedily applies ForLoopPeelingPattern, then drops the
/// temporary marker attributes left on peeled loops.
struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    // NOTE(review): `patterns` is used below without a visible declaration
    // (expected: RewritePatternSet patterns(ctx);) -- a line appears to be
    // missing from this copy; verify against upstream.
    patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
    (void)applyPatternsGreedily(parentOp, std::move(patterns));

    // Drop the markers.
    // NOTE(review): the walk body is empty here; upstream removes
    // kPeeledLoopLabel and kPartialIterationLabel from each visited op --
    // lines appear to be missing from this copy; verify against upstream.
    parentOp->walk([](Operation *op) {
    });
  }
};
346 } // namespace
347 
349  return std::make_unique<ParallelLoopSpecialization>();
350 }
351 
/// Creates a pass that specializes for loops for unrolling and vectorization.
std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}
355 
/// Creates a pass that peels for loops at their upper bounds for better
/// vectorization.
std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}
static void specializeForLoopForUnrolling(ForOp op)
Rewrite a for loop with bounds defined by an affine.min with a constant into 2 loops after checking i...
static void specializeParallelLoopForUnrolling(ParallelOp op)
Rewrite a parallel loop with bounds defined by an affine.min with a constant into 2 loops after check...
static constexpr char kPeeledLoopLabel[]
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, ForOp partialIteration, Value previousUb)
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, ForOp &partialIteration, Value &splitBound)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
static constexpr char kPartialIterationLabel[]
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:407
UnitAttr getUnitAttr()
Definition: Builders.cpp:93
MLIRContext * getContext() const
Definition: Builders.h:55
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:63
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:548
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:396
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:517
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition: Operation.h:560
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition: Operation.h:600
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:783
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:358
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Definition: PatternMatch.h:716
void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
Definition: PatternMatch.h:636
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:628
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
static WalkResult advance()
Definition: WalkResult.h:47
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1260
DynamicAPInt ceil(const Fraction &f)
Definition: Fraction.h:79
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:491
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Peel the first iteration out of the scf.for loop.
LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, Value iv, Value ub, Value step, bool insideLoop)
Try to simplify the given affine.min/max operation op after loop peeling.
Value constantIndex(OpBuilder &builder, Location loc, int64_t i)
Generates a constant of index type.
Definition: CodegenUtils.h:331
Include the generated interface declarations.
std::unique_ptr< Pass > createParallelLoopSpecializationPass()
Creates a pass that specializes parallel loop for unrolling and vectorization.
std::unique_ptr< Pass > createForLoopSpecializationPass()
Creates a pass that specializes for loop for unrolling and vectorization.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::unique_ptr< Pass > createForLoopPeelingPass()
Creates a pass that peels for loops at their upper bounds for better vectorization.
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:314