MLIR  19.0.0git
LoopSpecialization.cpp
Go to the documentation of this file.
1 //===- LoopSpecialization.cpp - scf.parallel/SCR.for specialization -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Specializes parallel loops and for loops for easier unrolling and
10 // vectorization.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 
23 #include "mlir/IR/AffineExpr.h"
24 #include "mlir/IR/IRMapping.h"
25 #include "mlir/IR/PatternMatch.h"
27 #include "llvm/ADT/DenseMap.h"
28 
29 namespace mlir {
30 #define GEN_PASS_DEF_SCFFORLOOPPEELING
31 #define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
32 #define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
33 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
34 } // namespace mlir
35 
36 using namespace mlir;
37 using namespace mlir::affine;
38 using scf::ForOp;
39 using scf::ParallelOp;
40 
41 /// Rewrite a parallel loop with bounds defined by an affine.min with a constant
42 /// into 2 loops after checking if the bounds are equal to that constant. This
43 /// is beneficial if the loop will almost always have the constant bound and
44 /// that version can be fully unrolled and vectorized.
45 static void specializeParallelLoopForUnrolling(ParallelOp op) {
46  SmallVector<int64_t, 2> constantIndices;
47  constantIndices.reserve(op.getUpperBound().size());
48  for (auto bound : op.getUpperBound()) {
49  auto minOp = bound.getDefiningOp<AffineMinOp>();
50  if (!minOp)
51  return;
52  int64_t minConstant = std::numeric_limits<int64_t>::max();
53  for (AffineExpr expr : minOp.getMap().getResults()) {
54  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
55  minConstant = std::min(minConstant, constantIndex.getValue());
56  }
57  if (minConstant == std::numeric_limits<int64_t>::max())
58  return;
59  constantIndices.push_back(minConstant);
60  }
61 
62  OpBuilder b(op);
63  IRMapping map;
64  Value cond;
65  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
66  Value constant =
67  b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
68  Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
69  std::get<0>(bound), constant);
70  cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
71  map.map(std::get<0>(bound), constant);
72  }
73  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
74  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
75  ifOp.getElseBodyBuilder().clone(*op.getOperation());
76  op.erase();
77 }
78 
79 /// Rewrite a for loop with bounds defined by an affine.min with a constant into
80 /// 2 loops after checking if the bounds are equal to that constant. This is
81 /// beneficial if the loop will almost always have the constant bound and that
82 /// version can be fully unrolled and vectorized.
83 static void specializeForLoopForUnrolling(ForOp op) {
84  auto bound = op.getUpperBound();
85  auto minOp = bound.getDefiningOp<AffineMinOp>();
86  if (!minOp)
87  return;
88  int64_t minConstant = std::numeric_limits<int64_t>::max();
89  for (AffineExpr expr : minOp.getMap().getResults()) {
90  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
91  minConstant = std::min(minConstant, constantIndex.getValue());
92  }
93  if (minConstant == std::numeric_limits<int64_t>::max())
94  return;
95 
96  OpBuilder b(op);
97  IRMapping map;
98  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
99  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
100  bound, constant);
101  map.map(bound, constant);
102  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
103  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
104  ifOp.getElseBodyBuilder().clone(*op.getOperation());
105  op.erase();
106 }
107 
108 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
109 /// into a for loop where the step divides the iteration space evenly, followed
110 /// by an scf.if for the last (partial) iteration (if any).
111 ///
112 /// This function rewrites the given scf.for loop in-place and creates a new
113 /// scf.if operation for the last iteration. It replaces all uses of the
114 /// unpeeled loop with the results of the newly generated scf.if.
115 ///
116 /// The newly generated scf.if operation is returned via `ifOp`. The boundary
117 /// at which the loop is split (new upper bound) is returned via `splitBound`.
118 /// The return value indicates whether the loop was rewritten or not.
119 static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
120  ForOp &partialIteration, Value &splitBound) {
121  RewriterBase::InsertionGuard guard(b);
122  auto lbInt = getConstantIntValue(forOp.getLowerBound());
123  auto ubInt = getConstantIntValue(forOp.getUpperBound());
124  auto stepInt = getConstantIntValue(forOp.getStep());
125 
126  // No specialization necessary if step size is 1. Also bail out in case of an
127  // invalid zero or negative step which might have happened during folding.
128  if (stepInt && *stepInt <= 1)
129  return failure();
130 
131  // No specialization necessary if step already divides upper bound evenly.
132  // Fast path: lb, ub and step are constants.
133  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
134  return failure();
135  // Slow path: Examine the ops that define lb, ub and step.
136  AffineExpr sym0, sym1, sym2;
137  bindSymbols(b.getContext(), sym0, sym1, sym2);
138  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
139  forOp.getStep()};
140  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
142  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
143  if (constExpr.getValue() == 0)
144  return failure();
145 
146  // New upper bound: %ub - (%ub - %lb) mod %step
147  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
148  b.setInsertionPoint(forOp);
149  auto loc = forOp.getLoc();
150  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
151  ValueRange{forOp.getLowerBound(),
152  forOp.getUpperBound(),
153  forOp.getStep()});
154 
155  // Create ForOp for partial iteration.
156  b.setInsertionPointAfter(forOp);
157  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
158  partialIteration.getLowerBoundMutable().assign(splitBound);
159  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
160  partialIteration.getInitArgsMutable().assign(forOp->getResults());
161 
162  // Set new upper loop bound.
163  b.modifyOpInPlace(forOp,
164  [&]() { forOp.getUpperBoundMutable().assign(splitBound); });
165 
166  return success();
167 }
168 
169 static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
170  ForOp partialIteration,
171  Value previousUb) {
172  Value mainIv = forOp.getInductionVar();
173  Value partialIv = partialIteration.getInductionVar();
174  assert(forOp.getStep() == partialIteration.getStep() &&
175  "expected same step in main and partial loop");
176  Value step = forOp.getStep();
177 
178  forOp.walk([&](Operation *affineOp) {
179  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
180  return WalkResult::advance();
181  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
182  step,
183  /*insideLoop=*/true);
184  return WalkResult::advance();
185  });
186  partialIteration.walk([&](Operation *affineOp) {
187  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
188  return WalkResult::advance();
189  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
190  step, /*insideLoop=*/false);
191  return WalkResult::advance();
192  });
193 }
194 
196  ForOp forOp,
197  ForOp &partialIteration) {
198  Value previousUb = forOp.getUpperBound();
199  Value splitBound;
200  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
201  return failure();
202 
203  // Rewrite affine.min and affine.max ops.
204  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);
205 
206  return success();
207 }
208 
209 /// When the `peelFront` option is set as true, the first iteration of the loop
210 /// is peeled off. This function rewrites the original scf::ForOp as two
211 /// scf::ForOp Ops, the first scf::ForOp corresponds to the first iteration of
212 /// the loop which can be canonicalized away in the following optimization. The
213 /// second loop Op contains the remaining iteration, and the new lower bound is
214 /// the original lower bound plus the number of steps.
216  ForOp &firstIteration) {
217  RewriterBase::InsertionGuard guard(b);
218  auto lbInt = getConstantIntValue(forOp.getLowerBound());
219  auto ubInt = getConstantIntValue(forOp.getUpperBound());
220  auto stepInt = getConstantIntValue(forOp.getStep());
221 
222  // Peeling is not needed if there is one or less iteration.
223  if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
224  return failure();
225 
226  AffineExpr lbSymbol, stepSymbol;
227  bindSymbols(b.getContext(), lbSymbol, stepSymbol);
228 
229  // New lower bound for main loop: %lb + %step
230  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
231  b.setInsertionPoint(forOp);
232  auto loc = forOp.getLoc();
233  Value splitBound = b.createOrFold<AffineApplyOp>(
234  loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});
235 
236  // Peel the first iteration.
237  IRMapping map;
238  map.map(forOp.getUpperBound(), splitBound);
239  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation(), map));
240 
241  // Update main loop with new lower bound.
242  b.modifyOpInPlace(forOp, [&]() {
243  forOp.getInitArgsMutable().assign(firstIteration->getResults());
244  forOp.getLowerBoundMutable().assign(splitBound);
245  });
246 
247  return success();
248 }
249 
250 static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
251 static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
252 
253 namespace {
254 struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
255  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
256  : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
257  skipPartial(skipPartial) {}
258 
259  LogicalResult matchAndRewrite(ForOp forOp,
260  PatternRewriter &rewriter) const override {
261  // Do not peel already peeled loops.
262  if (forOp->hasAttr(kPeeledLoopLabel))
263  return failure();
264 
265  scf::ForOp partialIteration;
266  // The case for peeling the first iteration of the loop.
267  if (peelFront) {
268  if (failed(
269  peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
270  return failure();
271  }
272  } else {
273  if (skipPartial) {
274  // No peeling of loops inside the partial iteration of another peeled
275  // loop.
276  Operation *op = forOp.getOperation();
277  while ((op = op->getParentOfType<scf::ForOp>())) {
279  return failure();
280  }
281  }
282  // Apply loop peeling.
283  if (failed(
284  peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
285  return failure();
286  }
287 
288  // Apply label, so that the same loop is not rewritten a second time.
289  rewriter.modifyOpInPlace(partialIteration, [&]() {
290  partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
291  partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
292  });
293  rewriter.modifyOpInPlace(forOp, [&]() {
294  forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
295  });
296  return success();
297  }
298 
299  // If set to true, the first iteration of the loop will be peeled. Otherwise,
300  // the unevenly divisible loop will be peeled at the end.
301  bool peelFront;
302 
303  /// If set to true, loops inside partial iterations of another peeled loop
304  /// are not peeled. This reduces the size of the generated code. Partial
305  /// iterations are not usually performance critical.
306  /// Note: Takes into account the entire chain of parent operations, not just
307  /// the direct parent.
308  bool skipPartial;
309 };
310 } // namespace
311 
312 namespace {
313 struct ParallelLoopSpecialization
314  : public impl::SCFParallelLoopSpecializationBase<
315  ParallelLoopSpecialization> {
316  void runOnOperation() override {
317  getOperation()->walk(
318  [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
319  }
320 };
321 
322 struct ForLoopSpecialization
323  : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
324  void runOnOperation() override {
325  getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
326  }
327 };
328 
329 struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
330  void runOnOperation() override {
331  auto *parentOp = getOperation();
332  MLIRContext *ctx = parentOp->getContext();
333  RewritePatternSet patterns(ctx);
334  patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
335  (void)applyPatternsAndFoldGreedily(parentOp, std::move(patterns));
336 
337  // Drop the markers.
338  parentOp->walk([](Operation *op) {
341  });
342  }
343 };
344 } // namespace
345 
347  return std::make_unique<ParallelLoopSpecialization>();
348 }
349 
350 std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
351  return std::make_unique<ForLoopSpecialization>();
352 }
353 
354 std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
355  return std::make_unique<ForLoopPeeling>();
356 }
static void specializeForLoopForUnrolling(ForOp op)
Rewrite a for loop with bounds defined by an affine.min with a constant into 2 loops after checking i...
static void specializeParallelLoopForUnrolling(ParallelOp op)
Rewrite a parallel loop with bounds defined by an affine.min with a constant into 2 loops after check...
static constexpr char kPeeledLoopLabel[]
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, ForOp partialIteration, Value previousUb)
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, ForOp &partialIteration, Value &splitBound)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
static constexpr char kPartialIterationLabel[]
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:69
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:47
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:397
UnitAttr getUnitAttr()
Definition: Builders.cpp:114
MLIRContext * getContext() const
Definition: Builders.h:55
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
This class helps build Operations.
Definition: Builders.h:209
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:553
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:400
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:522
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:464
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:414
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition: Operation.h:555
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:717
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition: Operation.h:595
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:539
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:775
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:399
void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
Definition: PatternMatch.h:636
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:628
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
static WalkResult advance()
Definition: Visitors.h:52
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1128
MPInt ceil(const Fraction &f)
Definition: Fraction.h:76
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Peel the first iteration out of the scf.for loop.
LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, Value iv, Value ub, Value step, bool insideLoop)
Try to simplify the given affine.min/max operation op after loop peeling.
Value constantIndex(OpBuilder &builder, Location loc, int64_t i)
Generates a constant of index type.
Definition: CodegenUtils.h:334
Include the generated interface declarations.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
std::unique_ptr< Pass > createParallelLoopSpecializationPass()
Creates a pass that specializes parallel loop for unrolling and vectorization.
std::unique_ptr< Pass > createForLoopSpecializationPass()
Creates a pass that specializes for loop for unrolling and vectorization.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
std::unique_ptr< Pass > createForLoopPeelingPass()
Creates a pass that peels for loops at their upper bounds for better vectorization.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:363
LogicalResult applyPatternsAndFoldGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:357