//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Transforms/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir {
#define GEN_PASS_DEF_SCFFORLOOPPEELING
#define GEN_PASS_DEF_SCFFORLOOPSPECIALIZATION
#define GEN_PASS_DEF_SCFPARALLELLOOPSPECIALIZATION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
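///
/// For example (an illustrative sketch, not taken from an actual test case):
///   %ub = affine.min affine_map<()[s0] -> (1024, s0)>()[%n]
///   scf.parallel (%i) = (%c0) to (%ub) step (%c1) { ... }
/// is rewritten into:
///   %c1024 = arith.constant 1024 : index
///   %eq = arith.cmpi eq, %ub, %c1024 : index
///   scf.if %eq {
///     scf.parallel (%i) = (%c0) to (%c1024) step (%c1) { ... }
///   } else {
///     scf.parallel (%i) = (%c0) to (%ub) step (%c1) { ... }
///   }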
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.getUpperBound().size());
  for (auto bound : op.getUpperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.getMap().getResults()) {
      if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  IRMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.getUpperBound(), constantIndices)) {
    Value constant =
        arith::ConstantIndexOp::create(b, op.getLoc(), std::get<1>(bound));
    Value cmp = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
                                      std::get<0>(bound), constant);
    cond = cond ? arith::AndIOp::create(b, op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
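///
/// For example (an illustrative sketch): given
///   %ub = affine.min affine_map<()[s0] -> (64, s0)>()[%n]
///   scf.for %i = %c0 to %ub step %c1 { ... }
/// the loop is wrapped in an scf.if that compares %ub against the constant 64;
/// the then-branch clones the loop with the constant bound (so it can be fully
/// unrolled), while the else-branch keeps the dynamic bound.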
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.getUpperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.getMap().getResults()) {
    if (auto constantIndex = dyn_cast<AffineConstantExpr>(expr))
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  IRMapping map;
  Value constant = arith::ConstantOp::create(
      b, op.getLoc(),
      IntegerAttr::get(op.getUpperBound().getType(), minConstant));
  Value cond = arith::CmpIOp::create(b, op.getLoc(), arith::CmpIPredicate::eq,
                                     bound, constant);
  map.map(bound, constant);
  auto ifOp = scf::IfOp::create(b, op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by a second scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the last (partial) iteration. It replaces all uses of
/// the unpeeled loop with the results of the newly generated loop.
///
/// The newly generated scf.for operation is returned via `partialIteration`.
/// The boundary at which the loop is split (new upper bound) is returned via
/// `splitBound`. The return value indicates whether the loop was rewritten.
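///
/// For example (an illustrative sketch; in the actual IR the bound is computed
/// from all three of lb, ub and step): peeling
///   scf.for %iv = %c0 to %ub step %c4 { ... }
/// produces
///   %split = affine.apply affine_map<()[s0] -> (s0 - s0 mod 4)>()[%ub]
///   scf.for %iv = %c0 to %split step %c4 { ... }  // all full iterations
///   scf.for %iv = %split to %ub step %c4 { ... }  // at most one partial one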
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // No specialization necessary if step size is 1. Also bail out in case of an
  // invalid zero or negative step which might have happened during folding.
  if (stepInt && *stepInt <= 1)
    return failure();

  // No specialization necessary if step already divides upper bound evenly.
  // Fast path: lb, ub and step are constants.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // Slow path: Examine the ops that define lb, ub and step.
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  SmallVector<Value> operands{forOp.getLowerBound(), forOp.getUpperBound(),
                              forOp.getStep()};
  AffineMap map = AffineMap::get(0, 3, {(sym1 - sym0) % sym2});
  fullyComposeAffineMapAndOperands(&map, &operands);
  if (auto constExpr = dyn_cast<AffineConstantExpr>(map.getResult(0)))
    if (constExpr.getValue() == 0)
      return failure();

  // New upper bound: %ub - (%ub - %lb) mod %step
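  // E.g. with lb = 1, ub = 10, step = 4: (10 - 1) mod 4 = 1, so the new upper
  // bound is 10 - 1 = 9. The main loop then covers the iterations at 1 and 5;
  // the partial iteration covers the one at 9.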
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  splitBound = b.createOrFold<AffineApplyOp>(loc, modMap,
                                             ValueRange{forOp.getLowerBound(),
                                                        forOp.getUpperBound(),
                                                        forOp.getStep()});
  if (splitBound.getType() != forOp.getLowerBound().getType())
    splitBound = b.createOrFold<arith::IndexCastOp>(
        loc, forOp.getLowerBound().getType(), splitBound);

  // Create ForOp for partial iteration.
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.getLowerBoundMutable().assign(splitBound);
  b.replaceAllUsesWith(forOp.getResults(), partialIteration->getResults());
  partialIteration.getInitArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.modifyOpInPlace(forOp,
                    [&]() { forOp.getUpperBoundMutable().assign(splitBound); });

  return success();
}

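/// Simplify affine.min/max ops in both the main and the partial loop after
/// peeling. For example (an illustrative sketch): with step %c4, an op such as
///   %r = affine.min affine_map<(d0)[s0] -> (4, s0 - d0)>(%iv)[%ub]
/// is known to be 4 inside the main loop (whose trip count now divides evenly)
/// and can be simplified to %ub - %iv inside the partial iteration.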
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.getStep() == partialIteration.getStep() &&
         "expected same step in main and partial loop");
  Value step = forOp.getStep();

  forOp.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, mainIv, previousUb,
                                     step,
                                     /*insideLoop=*/true);
    return WalkResult::advance();
  });
  partialIteration.walk([&](Operation *affineOp) {
    if (!isa<AffineMinOp, AffineMaxOp>(affineOp))
      return WalkResult::advance();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, partialIv, previousUb,
                                     step, /*insideLoop=*/false);
    return WalkResult::advance();
  });
}

LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
                                                      ForOp forOp,
                                                      ForOp &partialIteration) {
  Value previousUb = forOp.getUpperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);

  return success();
}

/// Rewrites the original scf::ForOp as two scf::ForOp ops: the first
/// corresponds to the first iteration of the loop and can be canonicalized
/// away in subsequent optimizations; the second contains the remaining
/// iterations, with its lower bound updated to the original lower bound plus
/// the step (i.e., it skips the first iteration).
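///
/// For example (an illustrative sketch): peeling the first iteration of
///   scf.for %iv = %lb to %ub step %s iter_args(%a = %init) { ... }
/// produces
///   %split = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%lb, %s]
///   %first = scf.for %iv = %lb to %split step %s iter_args(%a = %init) {...}
///   scf.for %iv = %split to %ub step %s iter_args(%a = %first) { ... }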
LogicalResult mlir::scf::peelForLoopFirstIteration(RewriterBase &b, ForOp forOp,
                                                   ForOp &firstIteration) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.getLowerBound());
  auto ubInt = getConstantIntValue(forOp.getUpperBound());
  auto stepInt = getConstantIntValue(forOp.getStep());

  // Peeling is not needed if there is one or fewer iterations.
  if (lbInt && ubInt && stepInt && ceil(float(*ubInt - *lbInt) / *stepInt) <= 1)
    return failure();

  AffineExpr lbSymbol, stepSymbol;
  bindSymbols(b.getContext(), lbSymbol, stepSymbol);

  // New lower bound for main loop: %lb + %step
  auto ubMap = AffineMap::get(0, 2, {lbSymbol + stepSymbol});
  b.setInsertionPoint(forOp);
  auto loc = forOp.getLoc();
  Value splitBound = b.createOrFold<AffineApplyOp>(
      loc, ubMap, ValueRange{forOp.getLowerBound(), forOp.getStep()});
  if (splitBound.getType() != forOp.getUpperBound().getType())
    splitBound = b.createOrFold<arith::IndexCastOp>(
        loc, forOp.getUpperBound().getType(), splitBound);

  // Peel the first iteration.
  firstIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  b.modifyOpInPlace(firstIteration, [&]() {
    firstIteration.getUpperBoundMutable().assign(splitBound);
  });
  // Update main loop with new lower bound.
  b.modifyOpInPlace(forOp, [&]() {
    forOp.getInitArgsMutable().assign(firstIteration->getResults());
    forOp.getLowerBoundMutable().assign(splitBound);
  });

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool peelFront, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), peelFront(peelFront),
        skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    if (forOp.getUnsignedCmp())
      return rewriter.notifyMatchFailure(forOp,
                                         "unsigned loops are not supported");

    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::ForOp partialIteration;
    // The case for peeling the first iteration of the loop.
    if (peelFront) {
      if (failed(
              peelForLoopFirstIteration(rewriter, forOp, partialIteration))) {
        return failure();
      }
    } else {
      if (skipPartial) {
        // No peeling of loops inside the partial iteration of another peeled
        // loop.
        Operation *op = forOp.getOperation();
        while ((op = op->getParentOfType<scf::ForOp>())) {
          if (op->hasAttr(kPartialIterationLabel))
            return failure();
        }
      }
      // Apply loop peeling.
      if (failed(
              peelForLoopAndSimplifyBounds(rewriter, forOp, partialIteration)))
        return failure();
    }

    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.modifyOpInPlace(partialIteration, [&]() {
      partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
      partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    });
    rewriter.modifyOpInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    return success();
  }

  // If set to true, the first iteration of the loop will be peeled. Otherwise,
  // the unevenly divisible loop will be peeled at the end.
  bool peelFront;

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public impl::SCFParallelLoopSpecializationBase<
          ParallelLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public impl::SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnOperation() override {
    getOperation()->walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public impl::SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnOperation() override {
    auto *parentOp = getOperation();
    MLIRContext *ctx = parentOp->getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, peelFront, skipPartial);
    (void)applyPatternsGreedily(parentOp, std::move(patterns));

    // Drop the markers.
    parentOp->walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}