MLIR 23.0.0git
LoopSpecialization.cpp
Go to the documentation of this file.
1//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Specializes parallel loops and for loops for easier unrolling and
10// vectorization.
11//
12//===----------------------------------------------------------------------===//
13
15
23#include "mlir/IR/AffineExpr.h"
24#include "mlir/IR/IRMapping.h"
27
28namespace mlir {
29#define GEN_PASS_DEF_SCFFORLOOPPEELING
30#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
31#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
32#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
33} // namespace mlir
34
35using namespace mlir;
36using namespace mlir::affine;
37using scf::ForOp;
38using scf::ParallelOp;
39
40/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
41/// into 2 loops after checking if the bounds are equal to that constant. This
42/// is beneficial if the loop will almost always have the constant bound and
43/// that version can be fully unrolled and vectorized.
44static void specializeParallelLoopForUnrolling(ParallelOp op) {
45 SmallVector<int64_t, 2> constantIndices;
46 constantIndices.reserve(op.getUpperBound().size());
47 for (auto bound : op.getUpperBound()) {
48 auto minOp = bound.getDefiningOp<AffineMinOp>();
49 if (!minOp)
50 return;
51 int64_t minConstant = std::numeric_limits<int64_t>::max();
52 for (AffineExpr expr : minOp.getMap().getResults()) {
53 if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
54 minConstant = std::min(minConstant, constantIndex.getValue());
55 }
56 if (minConstant == std::numeric_limits<int64_t>::max())
57 return;
58 constantIndices.push_back(minConstant);
59 }
60
61 OpBuilder b(op);
62 IRMapping map;
63 Value cond;
64 for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
65 Value constant =
66 arith::ConstantIndexOp::create(b, op.getLoc(), std::get<1>(bound));
67 Value cmp = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
68 std::get<0>(bound), constant);
69 cond = cond ? arith::AndIOp::create(b, op.getLoc(), cond, cmp) : cmp;
70 map.map(std::get<0>(bound), constant);
71 }
72 auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
73 ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
74 ifOp.getElseBodyBuilder().clone(*op.getOperation());
75 op.erase();
76}
77
78/// Rewrite a for loop with bounds defined by an affine.min with a constant into
79/// 2 loops after checking if the bounds are equal to that constant. This is
80/// beneficial if the loop will almost always have the constant bound and that
81/// version can be fully unrolled and vectorized.
82static void specializeForLoopForUnrolling(ForOp op) {
83 auto bound = op.getUpperBound();
84 auto minOp = bound.getDefiningOp<AffineMinOp>();
85 if (!minOp)
86 return;
87 int64_t minConstant = std::numeric_limits<int64_t>::max();
88 for (AffineExpr expr : minOp.getMap().getResults()) {
89 if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
90 minConstant = std::min(minConstant, constantIndex.getValue());
91 }
92 if (minConstant == std::numeric_limits<int64_t>::max())
93 return;
94
95 OpBuilder b(op);
96 IRMapping map;
97 Value constant = arith::ConstantOp::create(
98 b, op.getLoc(),
99 IntegerAttr::get(op.getUpperBound().getType(), minConstant));
100 Value cond = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
101 bound, constant);
102 map.map(bound, constant);
103 auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
104 ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
105 ifOp.getElseBodyBuilder().clone(*op.getOperation());
106 op.erase();
107}
108
109/// Rewrite a for loop with bounds/step that potentially do not divide evenly
110/// into a for loop where the step divides the iteration space evenly, followed
111/// by an scf.if for the last (partial) iteration (if any).
112///
113/// This function rewrites the given scf.for loop in-place and creates a new
114/// scf.if operation for the last iteration. It replaces all uses of the
115/// unpeeled loop with the results of the newly generated scf.if.
116///
117/// The newly generated scf.if operation is returned via `ifOp`. The boundary
118/// at which the loop is split (new upper bound) is returned via `splitBound`.
119/// The return value indicates whether the loop was rewritten or not.
120///
121/// Note: Loops with a step size of 0 cannot be peeled. Applying this function
122/// to such a loop may result in IR with undefined behavior.
123static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
124 ForOp &partialIteration, Value &splitBound) {
126 auto lbInt = getConstantIntValue(forOp.getLowerBound());
127 auto ubInt = getConstantIntValue(forOp.getUpperBound());
128 auto stepInt = getConstantIntValue(forOp.getStep());
130 // No specialization necessary if step size is 1. Also bail out in case of an
131 // invalid zero or negative step which might have happened during folding.
132 if (stepInt && *stepInt <= 1)
133 return failure();
134
135 // No specialization necessary if step already divides upper bound evenly.
136 // Fast path: lb, ub and step are constants.
137 if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
138 return failure();
139 // Slow path: Examine the ops that define lb, ub and step.
140 AffineExpr sym0, sym1, sym2;
141 bindSymbols(b.getContext(), sym0, sym1, sym2);
142 SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
143 forOp.getStep()};
144 AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
146 if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
147 if (constExpr.getValue() == 0)
148 return failure();
149
150 // New upper bound: %ub - (%ub - %lb) mod %step
151 auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
152 b.setInsertionPoint(forOp);
153 auto loc = forOp.getLoc();
154 splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
155 ValueRange{forOp.getLowerBound(),
156 forOp.getUpperBound(),
157 forOp.getStep()});
158 if (splitBound.getType() != forOp.getLowerBound().getType())
159 splitBound = b.createOrFold<arith::IndexCastOp>(
160 loc, forOp.getLowerBound().getType(), splitBound);
161
162 // Create ForOp for partial iteration.
163 b.setInsertionPointAfter(forOp);
164 partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
165 partialIteration.getLowerBoundMutable().assign(splitBound);
166 b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
167 partialIteration.getInitArgsMutable().assign(forOp->getResults());
168
169 // Set new upper loop bound.
170 b.modifyOpInPlace(forOp,
171 [&]() { forOp.getUpperBoundMutable().assign(splitBound); });
172
173 return success();
174}
175
176static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
177 ForOp partialIteration,
178 Value previousUb) {
179 Value mainIv = forOp.getInductionVar();
180 Value partialIv = partialIteration.getInductionVar();
181 assert(forOp.getStep() == partialIteration.getStep() &&
182 "expected same step in main and partial loop");
183 Value step = forOp.getStep();
184
185 forOp.walk([&](Operation *affineOp) {
186 if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
187 return WalkResult::advance();
188 (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
189 step,
190 /*insideLoop=*/true);
191 return WalkResult::advance();
192 });
193 partialIteration.walk([&](Operation *affineOp) {
194 if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
195 return WalkResult::advance();
196 (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
197 step, /*insideLoop=*/false);
198 return WalkResult::advance();
199 });
200}
201
203 ForOp forOp,
204 ForOp &partialIteration) {
205 Value previousUb = forOp.getUpperBound();
206 Value splitBound;
207 if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
208 return failure();
209
210 // Rewrite affine.min and affine.max ops.
211 rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);
212
213 return success();
214}
215
216/// Rewrites the original scf::ForOp as two scf::ForOp Ops, the first
217/// scf::ForOp corresponds to the first iteration of the loop which can be
218/// canonicalized away in the following optimizations. The second loop Op
219/// contains the remaining iterations, with a lower bound updated as the
220/// original lower bound plus the step (i.e. skips the first iteration).
221LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
222 ForOp &firstIteration) {
224 auto lbInt = getConstantIntValue(forOp.getLowerBound());
225 auto ubInt = getConstantIntValue(forOp.getUpperBound());
226 auto stepInt = getConstantIntValue(forOp.getStep());
227
228 // Peeling is not needed if there is one or less iteration.
229 if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
230 return failure();
231
232 AffineExpr lbSymbol, stepSymbol;
233 bindSymbols(b.getContext(), lbSymbol, stepSymbol);
234
235 // New lower bound for main loop: %lb + %step
236 auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
237 b.setInsertionPoint(forOp);
238 auto loc = forOp.getLoc();
239 Value splitBound = b.createOrFold<AffineApplyOp>(
240 loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});
241 if (splitBound.getType() != forOp.getUpperBound().getType())
242 splitBound = b.createOrFold<arith::IndexCastOp>(
243 loc, forOp.getUpperBound().getType(), splitBound);
244
245 // Peel the first iteration.
246 firstIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
247 b.modifyOpInPlace(firstIteration, [&]() {
248 firstIteration.getUpperBoundMutable().assign(splitBound);
249 });
250 // Update main loop with new lower bound.
251 b.modifyOpInPlace(forOp, [&]() {
252 forOp.getInitArgsMutable().assign(firstIteration->getResults());
253 forOp.getLowerBoundMutable().assign(splitBound);
254 });
256 return success();
258
/// Marker attribute set on loops that have already been peeled; prevents the
/// peeling pattern from rewriting the same loop twice.
static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";

/// Marker attribute set on the loop that executes the partial (last)
/// iteration of a peeled loop; consulted by the `skipPartial` option.
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";
262namespace {
263struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
264 ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
265 : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
266 skipPartial(skipPartial) {}
268 LogicalResult matchAndRewrite(ForOp forOp,
269 PatternRewriter &rewriter) const override {
270 if (forOp.getUnsignedCmp())
271 return rewriter.notifyMatchFailure(forOp,
272 "unsigned loops are not supported");
273
274 // Do not peel already peeled loops.
275 if (forOp->hasAttr(kPeeledLoopLabel))
276 return failure();
277
278 scf::ForOp partialIteration;
279 // The case for peeling the first iteration of the loop.
280 if (peelFront) {
281 if (failed(
282 peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
283 return failure();
284 }
285 } else {
286 if (skipPartial) {
287 // No peeling of loops inside the partial iteration of another peeled
288 // loop.
289 Operation *op = forOp.getOperation();
290 while ((op = op->getParentOfType<scf::ForOp>())) {
292 return failure();
293 }
294 }
295 // Apply loop peeling.
296 if (failed(
297 peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
298 return failure();
299 }
300
301 // Apply label, so that the same loop is not rewritten a second time.
302 rewriter.modifyOpInPlace(partialIteration, [&]() {
303 partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
304 partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
305 });
306 rewriter.modifyOpInPlace(forOp, [&]() {
307 forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
308 });
309 return success();
310 }
311
312 // If set to true, the first iteration of the loop will be peeled. Otherwise,
313 // the unevenly divisible loop will be peeled at the end.
314 bool peelFront;
315
316 /// If set to true, loops inside partial iterations of another peeled loop
317 /// are not peeled. This reduces the size of the generated code. Partial
318 /// iterations are not usually performance critical.
319 /// Note: Takes into account the entire chain of parent operations, not just
320 /// the direct parent.
321 bool skipPartial;
322};
323} // namespace
324
325namespace {
326struct ParallelLoopSpecialization
328 ParallelLoopSpecialization> {
329 void runOnOperation() override {
330 getOperation()->walk(
331 [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
332 }
333};
334
335struct ForLoopSpecialization
336 : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
337 void runOnOperation() override {
338 getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
339 }
340};
341
342struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
343 using impl::SCFForLoopPeelingBase<ForLoopPeeling>::SCFForLoopPeelingBase;
344
345 void runOnOperation() override {
346 auto *parentOp = getOperation();
347 MLIRContext *ctx = parentOp->getContext();
348 RewritePatternSet patterns(ctx);
349 patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
350 (void)applyPatternsGreedily(parentOp, std::move(patterns));
351
352 // Drop the markers.
353 parentOp->walk([](Operation *op) {
356 });
357 }
358};
359} // namespace
360
362 return std::make_unique<ParallelLoopSpecialization>();
363}
364
366 return std::make_unique<ForLoopSpecialization>();
367}
return success()
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
static void specializeForLoopForUnrolling(ForOp op)
Rewrite a for loop with bounds defined by an affine.min with a constant into 2 loops after checking i...
static void specializeParallelLoopForUnrolling(ParallelOp op)
Rewrite a parallel loop with bounds defined by an affine.min with a constant into 2 loops after check...
static constexpr char kPeeledLoopLabel[]
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp, ForOp partialIteration, Value previousUb)
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, ForOp &partialIteration, Value &splitBound)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
static constexpr char kPartialIterationLabel[]
Base type for affine expression.
Definition AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
AffineExpr getResult(unsigned idx) const
UnitAttr getUnitAttr()
Definition Builders.cpp:102
This is a utility class for mapping one set of IR entities to another.
Definition IRMapping.h:26
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition IRMapping.h:30
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
RAII guard to reset the insertion point of the builder when destroyed.
Definition Builders.h:350
This class helps build Operations.
Definition Builders.h:209
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition Operation.h:560
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:238
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition Operation.h:600
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static WalkResult advance()
Definition WalkResult.h:47
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:363
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
DynamicAPInt ceil(const Fraction &f)
Definition Fraction.h:79
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Rewrite a for loop with bounds/step that potentially do not divide evenly into a for loop where the s...
LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp, scf::ForOp &partialIteration)
Peel the first iteration out of the scf.for loop.
LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, Value iv, Value ub, Value step, bool insideLoop)
Try to simplify the given affine.min/max operation op after loop peeling.
Include the generated interface declarations.
std::unique_ptr< Pass > createParallelLoopSpecializationPass()
Creates a pass that specializes parallel loop for unrolling and vectorization.
std::unique_ptr< Pass > createForLoopSpecializationPass()
Creates a pass that specializes for loop for unrolling and vectorization.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition Utils.cpp:305
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition AffineExpr.h:325
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...