MLIR  18.0.0git
LoopSpecialization.cpp
Go to the documentation of this file.
1 //===- LoopSpecialization.cpp - scf.parallel/SCR.for specialization -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Specializes parallel loops and for loops for easier unrolling and
10 // vectorization.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 
23 #include "mlir/IR/AffineExpr.h"
24 #include "mlir/IR/IRMapping.h"
25 #include "mlir/IR/PatternMatch.h"
27 #include "llvm/ADT/DenseMap.h"
28 
29 namespace mlir {
30 #define GEN_PASS_DEF_SCFFORLOOPPEELING
31 #define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
32 #define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
33 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
34 } // namespace mlir
35 
36 using namespace mlir;
37 using namespace mlir::affine;
38 using scf::ForOp;
39 using scf::ParallelOp;
40 
41 /// Rewrite a parallel loop with bounds defined by an affine.min with a constant
42 /// into 2 loops after checking if the bounds are equal to that constant. This
43 /// is beneficial if the loop will almost always have the constant bound and
44 /// that version can be fully unrolled and vectorized.
45 static void specializeParallelLoopForUnrolling(ParallelOp op) {
46  SmallVector<int64_t, 2> constantIndices;
47  constantIndices.reserve(op.getUpperBound().size());
48  for (auto bound : op.getUpperBound()) {
49  auto minOp = bound.getDefiningOp<AffineMinOp>();
50  if (!minOp)
51  return;
52  int64_t minConstant = std::numeric_limits<int64_t>::max();
53  for (AffineExpr expr : minOp.getMap().getResults()) {
54  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
55  minConstant = std::min(minConstant, constantIndex.getValue());
56  }
57  if (minConstant == std::numeric_limits<int64_t>::max())
58  return;
59  constantIndices.push_back(minConstant);
60  }
61 
62  OpBuilder b(op);
63  IRMapping map;
64  Value cond;
65  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
66  Value constant =
67  b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
68  Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
69  std::get<0>(bound), constant);
70  cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
71  map.map(std::get<0>(bound), constant);
72  }
73  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
74  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
75  ifOp.getElseBodyBuilder().clone(*op.getOperation());
76  op.erase();
77 }
78 
79 /// Rewrite a for loop with bounds defined by an affine.min with a constant into
80 /// 2 loops after checking if the bounds are equal to that constant. This is
81 /// beneficial if the loop will almost always have the constant bound and that
82 /// version can be fully unrolled and vectorized.
83 static void specializeForLoopForUnrolling(ForOp op) {
84  auto bound = op.getUpperBound();
85  auto minOp = bound.getDefiningOp<AffineMinOp>();
86  if (!minOp)
87  return;
88  int64_t minConstant = std::numeric_limits<int64_t>::max();
89  for (AffineExpr expr : minOp.getMap().getResults()) {
90  if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
91  minConstant = std::min(minConstant, constantIndex.getValue());
92  }
93  if (minConstant == std::numeric_limits<int64_t>::max())
94  return;
95 
96  OpBuilder b(op);
97  IRMapping map;
98  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
99  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
100  bound, constant);
101  map.map(bound, constant);
102  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
103  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
104  ifOp.getElseBodyBuilder().clone(*op.getOperation());
105  op.erase();
106 }
107 
108 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
109 /// into a for loop where the step divides the iteration space evenly, followed
110 /// by an scf.if for the last (partial) iteration (if any).
111 ///
112 /// This function rewrites the given scf.for loop in-place and creates a new
113 /// scf.if operation for the last iteration. It replaces all uses of the
114 /// unpeeled loop with the results of the newly generated scf.if.
115 ///
116 /// The newly generated scf.if operation is returned via `ifOp`. The boundary
117 /// at which the loop is split (new upper bound) is returned via `splitBound`.
118 /// The return value indicates whether the loop was rewritten or not.
119 static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
120  ForOp &partialIteration, Value &splitBound) {
121  RewriterBase::InsertionGuard guard(b);
122  auto lbInt = getConstantIntValue(forOp.getLowerBound());
123  auto ubInt = getConstantIntValue(forOp.getUpperBound());
124  auto stepInt = getConstantIntValue(forOp.getStep());
125 
126  // No specialization necessary if step size is 1.
127  if (getConstantIntValue(forOp.getStep()) == static_cast<int64_t>(1))
128  return failure();
129 
130  // No specialization necessary if step already divides upper bound evenly.
131  // Fast path: lb, ub and step are constants.
132  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
133  return failure();
134  // Slow path: Examine the ops that define lb, ub and step.
135  AffineExpr sym0, sym1, sym2;
136  bindSymbols(b.getContext(), sym0, sym1, sym2);
137  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
138  forOp.getStep()};
139  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
141  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
142  if (constExpr.getValue() == 0)
143  return failure();
144 
145  // New upper bound: %ub - (%ub - %lb) mod %step
146  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
147  b.setInsertionPoint(forOp);
148  auto loc = forOp.getLoc();
149  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
150  ValueRange{forOp.getLowerBound(),
151  forOp.getUpperBound(),
152  forOp.getStep()});
153 
154  // Create ForOp for partial iteration.
155  b.setInsertionPointAfter(forOp);
156  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
157  partialIteration.getLowerBoundMutable().assign(splitBound);
158  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
159  partialIteration.getInitArgsMutable().assign(forOp->getResults());
160 
161  // Set new upper loop bound.
163  forOp, [&]() { forOp.getUpperBoundMutable().assign(splitBound); });
164 
165  return success();
166 }
167 
168 static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
169  ForOp partialIteration,
170  Value previousUb) {
171  Value mainIv = forOp.getInductionVar();
172  Value partialIv = partialIteration.getInductionVar();
173  assert(forOp.getStep() == partialIteration.getStep() &&
174  "expected same step in main and partial loop");
175  Value step = forOp.getStep();
176 
177  forOp.walk([&](Operation *affineOp) {
178  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
179  return WalkResult::advance();
180  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
181  step,
182  /*insideLoop=*/true);
183  return WalkResult::advance();
184  });
185  partialIteration.walk([&](Operation *affineOp) {
186  if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
187  return WalkResult::advance();
188  (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
189  step, /*insideLoop=*/false);
190  return WalkResult::advance();
191  });
192 }
193 
195  ForOp forOp,
196  ForOp &partialIteration) {
197  Value previousUb = forOp.getUpperBound();
198  Value splitBound;
199  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
200  return failure();
201 
202  // Rewrite affine.min and affine.max ops.
203  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);
204 
205  return success();
206 }
207 
208 static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
209 static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
210 
211 namespace {
212 struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
213  ForLoopPeelingPattern(MLIRContext *ctx, bool skipPartial)
214  : OpRewritePattern<ForOp>(ctx), skipPartial(skipPartial) {}
215 
216  LogicalResult matchAndRewrite(ForOp forOp,
217  PatternRewriter &rewriter) const override {
218  // Do not peel already peeled loops.
219  if (forOp->hasAttr(kPeeledLoopLabel))
220  return failure();
221  if (skipPartial) {
222  // No peeling of loops inside the partial iteration of another peeled
223  // loop.
224  Operation *op = forOp.getOperation();
225  while ((op = op->getParentOfType<scf::ForOp>())) {
227  return failure();
228  }
229  }
230  // Apply loop peeling.
231  scf::ForOp partialIteration;
232  if (failed(peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
233  return failure();
234  // Apply label, so that the same loop is not rewritten a second time.
235  rewriter.updateRootInPlace(partialIteration, [&]() {
236  partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
237  partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
238  });
239  rewriter.updateRootInPlace(forOp, [&]() {
240  forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
241  });
242  return success();
243  }
244 
245  /// If set to true, loops inside partial iterations of another peeled loop
246  /// are not peeled. This reduces the size of the generated code. Partial
247  /// iterations are not usually performance critical.
248  /// Note: Takes into account the entire chain of parent operations, not just
249  /// the direct parent.
250  bool skipPartial;
251 };
252 } // namespace
253 
254 namespace {
255 struct ParallelLoopSpecialization
256  : public impl::SCFParallelLoopSpecializationBase<
257  ParallelLoopSpecialization> {
258  void runOnOperation() override {
259  getOperation()->walk(
260  [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
261  }
262 };
263 
264 struct ForLoopSpecialization
265  : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
266  void runOnOperation() override {
267  getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
268  }
269 };
270 
271 struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
272  void runOnOperation() override {
273  auto *parentOp = getOperation();
274  MLIRContext *ctx = parentOp->getContext();
275  RewritePatternSet patterns(ctx);
276  patterns.add<ForLoopPeelingPattern>(ctx, skipPartial);
277  (void)applyPatternsAndFoldGreedily(parentOp, std::move(patterns));
278 
279  // Drop the markers.
280  parentOp->walk([](Operation *op) {
283  });
284  }
285 };
286 } // namespace
287 
289  return std::make_unique<ParallelLoopSpecialization>();
290 }
291 
292 std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
293  return std::make_unique<ForLoopSpecialization>();
294 }
295 
296 std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
297  return std::make_unique<ForLoopPeeling>();
298 }
static void specializeForLoopForUnrolling(ForOp op)
Rewrite a for loop with bounds defined by an affine.min with a constant into 2 loops after checking i...
static void specializeParallelLoopForUnrolling(ParallelOp op)
Rewrite a parallel loop with bounds defined by an affine.min with a constant into 2 loops after check...
static constexpr char kPeeledLoopLabel[]
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, ForOp partialIteration, Value previousUb)
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, ForOp &partialIteration, Value &splitBound)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
static constexpr char kPartialIterationLabel[]
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:47
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:391
UnitAttr getUnitAttr()
Definition: Builders.cpp:114
MLIRContext * getContext() const
Definition: Builders.h:55
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
This class helps build Operations.
Definition: Builders.h:206
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:528
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:383
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:505
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:446
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:397
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition: Operation.h:538
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:686
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition: Operation.h:578
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:538
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:727
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:399
void updateRootInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around a root update of an operation.
Definition: PatternMatch.h:606
void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
Definition: PatternMatch.h:615
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:378
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
static WalkResult advance()
Definition: Visitors.h:52
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1114
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, Value iv, Value ub, Value step, bool insideLoop)
Try to simplify the given affine.min/max operation op after loop peeling.
Value constantIndex(OpBuilder &builder, Location loc, int64_t i)
Generates a constant of index type.
Definition: CodegenUtils.h:361
Include the generated interface declarations.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
std::unique_ptr< Pass > createParallelLoopSpecializationPass()
Creates a pass that specializes parallel loop for unrolling and vectorization.
std::unique_ptr< Pass > createForLoopSpecializationPass()
Creates a pass that specializes for loop for unrolling and vectorization.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
std::unique_ptr< Pass > createForLoopPeelingPass()
Creates a pass that peels for loops at their upper bounds for better vectorization.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:348
LogicalResult applyPatternsAndFoldGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:357