MLIR  22.0.0git
LoopSpecialization.cpp
Go to the documentation of this file.
1 //===- LoopSpecialization.cpp - scf.parallel/SCR.for specialization -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Specializes parallel loops and for loops for easier unrolling and
10 // vectorization.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 
#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
27 
28 namespace mlir {
29 #define GEN_PASS_DEF_SCFFORLOOPPEELING
30 #define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
31 #define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
32 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
33 } // namespace mlir
34 
35 using namespace mlir;
36 using namespace mlir::affine;
37 using scf::ForOp;
38 using scf::ParallelOp;
39 
40 /// Rewrite a parallel loop with bounds defined by an affine.min with a constant
41 /// into 2 loops after checking if the bounds are equal to that constant. This
42 /// is beneficial if the loop will almost always have the constant bound and
43 /// that version can be fully unrolled and vectorized.
44 static void specializeParallelLoopForUnrolling(ParallelOp op) {
45  SmallVector<int64_t, 2> constantIndices;
46  constantIndices.reserve(op.getUpperBound().size());
47  for (auto bound : op.getUpperBound()) {
48  auto minOp = bound.getDefiningOp<AffineMinOp>();
49  if (!minOp)
50  return;
51  int64_t minConstant = std::numeric_limits<int64_t>::max();
52  for (AffineExpr expr : minOp.getMap().getResults()) {
53  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
54  minConstant = std::min(minConstant, constantIndex.getValue());
55  }
56  if (minConstant == std::numeric_limits<int64_t>::max())
57  return;
58  constantIndices.push_back(minConstant);
59  }
60 
61  OpBuilder b(op);
62  IRMapping map;
63  Value cond;
64  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
65  Value constant =
66  arith::ConstantIndexOp::create(b, op.getLoc(), std::get<1>(bound));
67  Value cmp = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
68  std::get<0>(bound), constant);
69  cond = cond ? arith::AndIOp::create(b, op.getLoc(), cond, cmp) : cmp;
70  map.map(std::get<0>(bound), constant);
71  }
72  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
73  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
74  ifOp.getElseBodyBuilder().clone(*op.getOperation());
75  op.erase();
76 }
77 
78 /// Rewrite a for loop with bounds defined by an affine.min with a constant into
79 /// 2 loops after checking if the bounds are equal to that constant. This is
80 /// beneficial if the loop will almost always have the constant bound and that
81 /// version can be fully unrolled and vectorized.
82 static void specializeForLoopForUnrolling(ForOp op) {
83  auto bound = op.getUpperBound();
84  auto minOp = bound.getDefiningOp<AffineMinOp>();
85  if (!minOp)
86  return;
87  int64_t minConstant = std::numeric_limits<int64_t>::max();
88  for (AffineExpr expr : minOp.getMap().getResults()) {
89  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
90  minConstant = std::min(minConstant, constantIndex.getValue());
91  }
92  if (minConstant == std::numeric_limits<int64_t>::max())
93  return;
94 
95  OpBuilder b(op);
96  IRMapping map;
97  Value constant = arith::ConstantOp::create(
98  b, op.getLoc(),
99  IntegerAttr::get(op.getUpperBound().getType(), minConstant));
100  Value cond = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
101  bound, constant);
102  map.map(bound, constant);
103  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
104  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
105  ifOp.getElseBodyBuilder().clone(*op.getOperation());
106  op.erase();
107 }
108 
109 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
110 /// into a for loop where the step divides the iteration space evenly, followed
111 /// by an scf.if for the last (partial) iteration (if any).
112 ///
113 /// This function rewrites the given scf.for loop in-place and creates a new
114 /// scf.if operation for the last iteration. It replaces all uses of the
115 /// unpeeled loop with the results of the newly generated scf.if.
116 ///
117 /// The newly generated scf.if operation is returned via `ifOp`. The boundary
118 /// at which the loop is split (new upper bound) is returned via `splitBound`.
119 /// The return value indicates whether the loop was rewritten or not.
120 static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
121  ForOp &partialIteration, Value &splitBound) {
122  RewriterBase::InsertionGuard guard(b);
123  auto lbInt = getConstantIntValue(forOp.getLowerBound());
124  auto ubInt = getConstantIntValue(forOp.getUpperBound());
125  auto stepInt = getConstantIntValue(forOp.getStep());
126 
127  // No specialization necessary if step size is 1. Also bail out in case of an
128  // invalid zero or negative step which might have happened during folding.
129  if (stepInt && *stepInt <= 1)
130  return failure();
131 
132  // No specialization necessary if step already divides upper bound evenly.
133  // Fast path: lb, ub and step are constants.
134  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
135  return failure();
136  // Slow path: Examine the ops that define lb, ub and step.
137  AffineExpr sym0, sym1, sym2;
138  bindSymbols(b.getContext(), sym0, sym1, sym2);
139  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
140  forOp.getStep()};
141  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
143  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
144  if (constExpr.getValue() == 0)
145  return failure();
146 
147  // New upper bound: %ub - (%ub - %lb) mod %step
148  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
149  b.setInsertionPoint(forOp);
150  auto loc = forOp.getLoc();
151  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
152  ValueRange{forOp.getLowerBound(),
153  forOp.getUpperBound(),
154  forOp.getStep()});
155  if (splitBound.getType() != forOp.getLowerBound().getType())
156  splitBound = b.createOrFold<arith::IndexCastOp>(
157  loc, forOp.getLowerBound().getType(), splitBound);
158 
159  // Create ForOp for partial iteration.
160  b.setInsertionPointAfter(forOp);
161  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
162  partialIteration.getLowerBoundMutable().assign(splitBound);
163  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
164  partialIteration.getInitArgsMutable().assign(forOp->getResults());
165 
166  // Set new upper loop bound.
167  b.modifyOpInPlace(forOp,
168  [&]() { forOp.getUpperBoundMutable().assign(splitBound); });
169 
170  return success();
171 }
172 
173 static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
174  ForOp partialIteration,
175  Value previousUb) {
176  Value mainIv = forOp.getInductionVar();
177  Value partialIv = partialIteration.getInductionVar();
178  assert(forOp.getStep() == partialIteration.getStep() &&
179  "expected same step in main and partial loop");
180  Value step = forOp.getStep();
181 
182  forOp.walk([&](Operation *affineOp) {
183  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
184  return WalkResult::advance();
185  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
186  step,
187  /*insideLoop=*/true);
188  return WalkResult::advance();
189  });
190  partialIteration.walk([&](Operation *affineOp) {
191  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
192  return WalkResult::advance();
193  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
194  step, /*insideLoop=*/false);
195  return WalkResult::advance();
196  });
197 }
198 
200  ForOp forOp,
201  ForOp &partialIteration) {
202  Value previousUb = forOp.getUpperBound();
203  Value splitBound;
204  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
205  return failure();
206 
207  // Rewrite affine.min and affine.max ops.
208  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);
209 
210  return success();
211 }
212 
213 /// Rewrites the original scf::ForOp as two scf::ForOp Ops, the first
214 /// scf::ForOp corresponds to the first iteration of the loop which can be
215 /// canonicalized away in the following optimizations. The second loop Op
216 /// contains the remaining iterations, with a lower bound updated as the
217 /// original lower bound plus the step (i.e. skips the first iteration).
218 LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
219  ForOp &firstIteration) {
220  RewriterBase::InsertionGuard guard(b);
221  auto lbInt = getConstantIntValue(forOp.getLowerBound());
222  auto ubInt = getConstantIntValue(forOp.getUpperBound());
223  auto stepInt = getConstantIntValue(forOp.getStep());
224 
225  // Peeling is not needed if there is one or less iteration.
226  if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
227  return failure();
228 
229  AffineExpr lbSymbol, stepSymbol;
230  bindSymbols(b.getContext(), lbSymbol, stepSymbol);
231 
232  // New lower bound for main loop: %lb + %step
233  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
234  b.setInsertionPoint(forOp);
235  auto loc = forOp.getLoc();
236  Value splitBound = b.createOrFold<AffineApplyOp>(
237  loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});
238  if (splitBound.getType() != forOp.getUpperBound().getType())
239  splitBound = b.createOrFold<arith::IndexCastOp>(
240  loc, forOp.getUpperBound().getType(), splitBound);
241 
242  // Peel the first iteration.
243  IRMapping map;
244  map.map(forOp.getUpperBound(), splitBound);
245  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation(), map));
246 
247  // Update main loop with new lower bound.
248  b.modifyOpInPlace(forOp, [&]() {
249  forOp.getInitArgsMutable().assign(firstIteration->getResults());
250  forOp.getLowerBoundMutable().assign(splitBound);
251  });
252 
253  return success();
254 }
255 
// Marker attribute placed on loops that have already been peeled, so the
// rewrite pattern does not process the same loop a second time.
static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
// Marker attribute placed on the loop holding the partial (last) iteration
// produced by peeling; consulted by the `skipPartial` option.
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
258 
259 namespace {
260 struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
261  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
262  : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
263  skipPartial(skipPartial) {}
264 
265  LogicalResult matchAndRewrite(ForOp forOp,
266  PatternRewriter &rewriter) const override {
267  if (forOp.getUnsignedCmp())
268  return rewriter.notifyMatchFailure(forOp,
269  "unsigned loops are not supported");
270 
271  // Do not peel already peeled loops.
272  if (forOp->hasAttr(kPeeledLoopLabel))
273  return failure();
274 
275  scf::ForOp partialIteration;
276  // The case for peeling the first iteration of the loop.
277  if (peelFront) {
278  if (failed(
279  peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
280  return failure();
281  }
282  } else {
283  if (skipPartial) {
284  // No peeling of loops inside the partial iteration of another peeled
285  // loop.
286  Operation *op = forOp.getOperation();
287  while ((op = op->getParentOfType<scf::ForOp>())) {
289  return failure();
290  }
291  }
292  // Apply loop peeling.
293  if (failed(
294  peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
295  return failure();
296  }
297 
298  // Apply label, so that the same loop is not rewritten a second time.
299  rewriter.modifyOpInPlace(partialIteration, [&]() {
300  partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
301  partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
302  });
303  rewriter.modifyOpInPlace(forOp, [&]() {
304  forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
305  });
306  return success();
307  }
308 
309  // If set to true, the first iteration of the loop will be peeled. Otherwise,
310  // the unevenly divisible loop will be peeled at the end.
311  bool peelFront;
312 
313  /// If set to true, loops inside partial iterations of another peeled loop
314  /// are not peeled. This reduces the size of the generated code. Partial
315  /// iterations are not usually performance critical.
316  /// Note: Takes into account the entire chain of parent operations, not just
317  /// the direct parent.
318  bool skipPartial;
319 };
320 } // namespace
321 
322 namespace {
323 struct ParallelLoopSpecialization
324  : public impl::SCFParallelLoopSpecializationBase<
325  ParallelLoopSpecialization> {
326  void runOnOperation() override {
327  getOperation()->walk(
328  [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
329  }
330 };
331 
332 struct ForLoopSpecialization
333  : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
334  void runOnOperation() override {
335  getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
336  }
337 };
338 
339 struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
340  void runOnOperation() override {
341  auto *parentOp = getOperation();
342  MLIRContext *ctx = parentOp->getContext();
344  patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
345  (void)applyPatternsGreedily(parentOp, std::move(patterns));
346 
347  // Drop the markers.
348  parentOp->walk([](Operation *op) {
351  });
352  }
353 };
354 } // namespace
355 
357  return std::make_unique<ParallelLoopSpecialization>();
358 }
359 
360 std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
361  return std::make_unique<ForLoopSpecialization>();
362 }
363 
364 std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
365  return std::make_unique<ForLoopPeeling>();
366 }
static void specializeForLoopForUnrolling(ForOp op)
Rewrite a for loop with bounds defined by an affine.min with a constant into 2 loops after checking i...
static void specializeParallelLoopForUnrolling(ParallelOp op)
Rewrite a parallel loop with bounds defined by an affine.min with a constant into 2 loops after check...
static constexpr char kPeeledLoopLabel[]
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, ForOp partialIteration, Value previousUb)
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, ForOp &partialIteration, Value &splitBound)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
static constexpr char kPartialIterationLabel[]
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:407
UnitAttr getUnitAttr()
Definition: Builders.cpp:97
MLIRContext * getContext() const
Definition: Builders.h:56
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:63
This class helps build Operations.
Definition: Builders.h:207
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:552
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:398
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:519
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:412
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition: Operation.h:560
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition: Operation.h:600
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:793
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:368
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Definition: PatternMatch.h:726
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:638
virtual void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
Definition: PatternMatch.h:646
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:105
static WalkResult advance()
Definition: WalkResult.h:47
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1260
DynamicAPInt ceil(const Fraction &f)
Definition: Fraction.h:79
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:491
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Peel the first iteration out of the scf.for loop.
LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, Value iv, Value ub, Value step, bool insideLoop)
Try to simplify the given affine.min/max operation op after loop peeling.
Value constantIndex(OpBuilder &builder, Location loc, int64_t i)
Generates a constant of index type.
Definition: CodegenUtils.h:331
Include the generated interface declarations.
std::unique_ptr< Pass > createParallelLoopSpecializationPass()
Creates a pass that specializes parallel loop for unrolling and vectorization.
std::unique_ptr< Pass > createForLoopSpecializationPass()
Creates a pass that specializes for loop for unrolling and vectorization.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::unique_ptr< Pass > createForLoopPeelingPass()
Creates a pass that peels for loops at their upper bounds for better vectorization.
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:314