MLIR 23.0.0git
LoopUtils.cpp
Go to the documentation of this file.
1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/DebugLog.h"
29#include "llvm/Support/raw_ostream.h"
30#include <optional>
31
32#define DEBUG_TYPE "loop-utils"
33
34using namespace mlir;
35using namespace affine;
36using namespace presburger;
37using llvm::SmallMapVector;
38
39/// Computes the cleanup loop lower bound of the loop being unrolled with
40/// the specified unroll factor; this bound will also be upper bound of the main
41/// part of the unrolled loop. Computes the bound as an AffineMap with its
42/// operands or a null map when the trip count can't be expressed as an affine
43/// expression.
44static void
45getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
46 AffineMap &cleanupLbMap,
47 SmallVectorImpl<Value> &cleanupLbOperands) {
48 AffineMap tripCountMap;
49 SmallVector<Value, 4> tripCountOperands;
50 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
51 // Trip count can't be computed.
52 if (!tripCountMap) {
53 cleanupLbMap = AffineMap();
54 return;
55 }
56
57 OpBuilder b(forOp);
58 auto lbMap = forOp.getLowerBoundMap();
59 auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
60 forOp.getLowerBoundOperands());
61
62 // For each upper bound expr, get the range.
63 // Eg: affine.for %i = lb to min (ub1, ub2),
64 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
65 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
66 // these affine.apply's make up the cleanup loop lower bound.
67 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
68 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
69 int64_t step = forOp.getStepAsInt();
70 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
71 auto tripCountExpr = tripCountMap.getResult(i);
72 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
73 auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
74 tripCountMap.getNumSymbols(), bumpExprs[i]);
75 bumpValues[i] =
76 AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
77 }
78
79 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
80 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
81 newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
82
83 cleanupLbOperands.clear();
84 cleanupLbOperands.push_back(lb);
85 cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
86 cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
87 b.getContext());
88 // Simplify the cleanupLbMap + cleanupLbOperands.
89 fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
90 cleanupLbMap = simplifyAffineMap(cleanupLbMap);
91 canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
92 // Remove any affine.apply's that became dead from the simplification above.
93 for (auto v : bumpValues)
94 if (v.use_empty())
95 v.getDefiningOp()->erase();
96
97 if (lb.use_empty())
98 lb.erase();
99}
100
101/// Helper to replace uses of loop carried values (iter_args) and loop
102/// yield values while promoting single iteration affine.for ops.
103static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
104 // Replace uses of iter arguments with iter operands (initial values).
105 auto iterOperands = forOp.getInits();
106 auto iterArgs = forOp.getRegionIterArgs();
107 for (auto e : llvm::zip(iterOperands, iterArgs))
108 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
109
110 // Replace uses of loop results with the values yielded by the loop.
111 auto outerResults = forOp.getResults();
112 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
113 for (auto e : llvm::zip(outerResults, innerResults))
114 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
115}
116
117/// Promotes the loop body of a forOp to its containing block if the forOp
118/// was known to have a single iteration.
119LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
120 std::optional<APInt> tripCount = forOp.getStaticTripCount();
121 if (!tripCount || *tripCount != 1)
122 return failure();
123
124 // TODO: extend this for arbitrary affine bounds.
125 if (forOp.getLowerBoundMap().getNumResults() != 1)
126 return failure();
127
128 // Replaces all IV uses to its single iteration value.
129 auto iv = forOp.getInductionVar();
130 auto *parentBlock = forOp->getBlock();
131 if (!iv.use_empty()) {
132 if (forOp.hasConstantLowerBound()) {
133 auto func = forOp->getParentOfType<FunctionOpInterface>();
134 OpBuilder builder(forOp->getContext());
135 if (func)
136 builder.setInsertionPointToStart(&func.getFunctionBody().front());
137 else
138 builder.setInsertionPoint(forOp);
139 auto constOp = arith::ConstantIndexOp::create(
140 builder, forOp.getLoc(), forOp.getConstantLowerBound());
141 iv.replaceAllUsesWith(constOp);
142 } else {
143 auto lbOperands = forOp.getLowerBoundOperands();
144 auto lbMap = forOp.getLowerBoundMap();
145 OpBuilder builder(forOp);
146 if (lbMap == builder.getDimIdentityMap()) {
147 // No need of generating an affine.apply.
148 iv.replaceAllUsesWith(lbOperands[0]);
149 } else {
150 auto affineApplyOp =
151 AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
152 iv.replaceAllUsesWith(affineApplyOp);
153 }
154 }
155 }
156
158
159 // Move the loop body operations, except for its terminator, to the loop's
160 // containing block.
161 forOp.getBody()->back().erase();
162 parentBlock->getOperations().splice(Block::iterator(forOp),
163 forOp.getBody()->getOperations());
164 forOp.erase();
165 return success();
166}
167
168/// Generates an affine.for op with the specified lower and upper bounds
169/// while generating the right IV remappings to realize shifts for operations in
170/// its body. The operations that go into the loop body are specified in
171/// opGroupQueue starting from the specified offset, and in that order. The
172/// first element of the pair specifies the shift applied to that group of
173/// operations; the shift is multiplied by the loop step before being applied.
174/// Returns nullptr if the generated loop simplifies to a single iteration one.
175static AffineForOp generateShiftedLoop(
176 AffineMap lbMap, AffineMap ubMap,
177 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
178 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
179 auto lbOperands = srcForOp.getLowerBoundOperands();
180 auto ubOperands = srcForOp.getUpperBoundOperands();
181
182 assert(lbMap.getNumInputs() == lbOperands.size());
183 assert(ubMap.getNumInputs() == ubOperands.size());
184
185 auto loopChunk =
186 AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
187 ubMap, srcForOp.getStepAsInt());
188 auto loopChunkIV = loopChunk.getInductionVar();
189 auto srcIV = srcForOp.getInductionVar();
190
191 IRMapping operandMap;
192
193 auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
194 for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
195 uint64_t shift = it.first;
196 auto ops = it.second;
197 // All 'same shift' operations get added with their operands being
198 // remapped to results of cloned operations, and their IV used remapped.
199 // Generate the remapping if the shift is not zero: remappedIV = newIV -
200 // shift.
201 if (!srcIV.use_empty() && shift != 0) {
202 auto ivRemap = AffineApplyOp::create(
203 bodyBuilder, srcForOp.getLoc(),
204 bodyBuilder.getSingleDimShiftAffineMap(
205 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
206 loopChunkIV);
207 operandMap.map(srcIV, ivRemap);
208 } else {
209 operandMap.map(srcIV, loopChunkIV);
210 }
211 for (auto *op : ops)
212 bodyBuilder.clone(*op, operandMap);
213 };
214 if (succeeded(promoteIfSingleIteration(loopChunk)))
215 return AffineForOp();
216 return loopChunk;
217}
218
219// The skewing of operations with respect to one another can be used for
220// example to allow overlap of asynchronous operations (such as DMA
221// communication) with computation, or just relative shifting of operations
222// for better register reuse, locality or parallelism. As such, the shifts are
223// typically expected to be at most of the order of the number of operations.
224// This method should not be used as a substitute for loop distribution/fission.
225// This method uses an algorithm// in time linear in the number of operations
226// in the body of the for loop - (using the 'sweep line' paradigm). This method
227// asserts preservation of SSA dominance. A check for that as well as that for
228// memory-based dependence preservation check rests with the users of this
229// method.
230LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
231 ArrayRef<uint64_t> shifts,
232 bool unrollPrologueEpilogue) {
233 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234 "too few/many shifts");
235 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
236 return success();
237
238 // If the trip counts aren't constant, we would need versioning and
239 // conditional guards (or context information to prevent such versioning). The
240 // better way to pipeline for such loops is to first tile them and extract
241 // constant trip count "full tiles" before applying this.
242 auto mayBeConstTripCount = forOp.getStaticTripCount();
243 if (!mayBeConstTripCount) {
244 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
245 return success();
246 }
247 uint64_t tripCount = mayBeConstTripCount->getZExtValue();
248
249 assert(isOpwiseShiftValid(forOp, shifts) &&
250 "shifts will lead to an invalid transformation\n");
251
252 int64_t step = forOp.getStepAsInt();
253
254 unsigned numChildOps = shifts.size();
255
256 // Do a linear time (counting) sort for the shifts.
257 uint64_t maxShift = *llvm::max_element(shifts);
258 if (maxShift >= numChildOps) {
259 // Large shifts are not the typical use case.
260 forOp.emitWarning("not shifting because shifts are unrealistically large");
261 return success();
262 }
263
264 // An array of operation groups sorted by shift amount; each group has all
265 // operations with the same shift in the order in which they appear in the
266 // body of the 'affine.for' op.
267 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
268 unsigned pos = 0;
269 for (auto &op : forOp.getBody()->without_terminator()) {
270 auto shift = shifts[pos++];
271 sortedOpGroups[shift].push_back(&op);
272 }
273
274 // Unless the shifts have a specific pattern (which actually would be the
275 // common use case), prologue and epilogue are not meaningfully defined.
276 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
277 // loop generated as the prologue and the last as epilogue and unroll these
278 // fully.
279 AffineForOp prologue, epilogue;
280
281 // Do a sweep over the sorted shifts while storing open groups in a
282 // vector, and generating loop portions as necessary during the sweep. A block
283 // of operations is paired with its shift.
284 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
285
286 auto origLbMap = forOp.getLowerBoundMap();
287 uint64_t lbShift = 0;
288 OpBuilder b(forOp);
289 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
290 // If nothing is shifted by d, continue.
291 if (sortedOpGroups[d].empty())
292 continue;
293 if (!opGroupQueue.empty()) {
294 assert(d > 0 &&
295 "Queue expected to be empty when the first block is found");
296 // The interval for which the loop needs to be generated here is:
297 // [lbShift, min(lbShift + tripCount, d)) and the body of the
298 // loop needs to have all operations in opQueue in that order.
299 AffineForOp res;
300 if (lbShift + tripCount * step < d * step) {
302 b.getShiftedAffineMap(origLbMap, lbShift),
303 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
304 opGroupQueue, /*offset=*/0, forOp, b);
305 // Entire loop for the queued op groups generated, empty it.
306 opGroupQueue.clear();
307 lbShift += tripCount * step;
308 } else {
309 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
310 b.getShiftedAffineMap(origLbMap, d),
311 opGroupQueue, /*offset=*/0, forOp, b);
312 lbShift = d * step;
313 }
314
315 if (res) {
316 // Simplify/canonicalize the affine.for.
317 RewritePatternSet patterns(res.getContext());
318 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
319 bool erased;
321 res.getOperation(), std::move(patterns),
322 GreedyRewriteConfig().setStrictness(
324 /*changed=*/nullptr, &erased);
325 if (!erased && !prologue)
326 prologue = res;
327 if (!erased)
328 epilogue = res;
329 }
330 } else {
331 // Start of first interval.
332 lbShift = d * step;
333 }
334 // Augment the list of operations that get into the current open interval.
335 opGroupQueue.emplace_back(d, sortedOpGroups[d]);
336 }
337
338 // Those operations groups left in the queue now need to be processed (FIFO)
339 // and their loops completed.
340 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
341 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
342 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
343 b.getShiftedAffineMap(origLbMap, ubShift),
344 opGroupQueue, /*offset=*/i, forOp, b);
345 lbShift = ubShift;
346 if (!prologue)
347 prologue = epilogue;
348 }
349
350 // Erase the original for op.
351 forOp.erase();
352
353 if (unrollPrologueEpilogue && prologue)
354 (void)loopUnrollFull(prologue);
355 if (unrollPrologueEpilogue && !epilogue && epilogue != prologue)
356 (void)loopUnrollFull(epilogue);
357
358 return success();
359}
360
361/// Checks whether a loop nest is hyper-rectangular or not.
362static LogicalResult
365 SmallVector<Operation *, 8> ops(input.begin(), input.end());
366 // 0-d or 1-d is trivially hyper-rectangular.
367 if (input.size() <= 1)
368 return success();
369 if (failed(getIndexSet(ops, &cst))) {
370 LDBG() << "Index set computation failed!";
371 return failure();
372 }
373 if (!cst.isHyperRectangular(0, input.size())) {
374 LDBG() << "Non-hyperrectangular nests not supported for tiling!";
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
384 ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LDBG() << "Cannot tile nest where a loop has yield values";
390 return failure();
391 }
392
393 // Check if the supplied `for` ops are all successively nested.
394 if (!isPerfectlyNested(input)) {
395 LDBG() << "input loops not perfectly nested";
396 return failure();
397 }
398
399 // TODO: handle non hyper-rectangular spaces.
400 if (failed(checkIfHyperRectangular(input)))
401 return failure();
402
403 return success();
404}
405
406/// Move the loop body of AffineForOp 'src' from 'src' into the specified
407/// location in destination's body, ignoring the terminator.
408static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409 Block::iterator loc) {
410 auto &ops = src.getBody()->getOperations();
411 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412 std::prev(ops.end()));
413}
414
415/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416/// body.
417static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419}
420
421/// Constructs tiled loop nest, without setting the loop bounds and move the
422/// body of the original loop nest to the tiled loop nest.
424 AffineForOp rootAffineForOp, unsigned width,
425 MutableArrayRef<AffineForOp> tiledLoops) {
426 Location loc = rootAffineForOp.getLoc();
427
428 // The outermost among the loops as we add more..
429 Operation *topLoop = rootAffineForOp.getOperation();
430 AffineForOp innermostPointLoop;
431
432 // Add intra-tile (or point) loops.
433 for (unsigned i = 0; i < width; i++) {
434 OpBuilder b(topLoop);
435 // Loop bounds will be set later.
436 AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
437 pointLoop.getBody()->getOperations().splice(
438 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439 topLoop);
440 tiledLoops[2 * width - 1 - i] = pointLoop;
441 topLoop = pointLoop.getOperation();
442 if (i == 0)
443 innermostPointLoop = pointLoop;
444 }
445
446 // Add tile space loops;
447 for (unsigned i = width; i < 2 * width; i++) {
448 OpBuilder b(topLoop);
449 // Loop bounds will be set later.
450 AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
451 tileSpaceLoop.getBody()->getOperations().splice(
452 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453 topLoop);
454 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455 topLoop = tileSpaceLoop.getOperation();
456 }
457
458 // Move the loop body of the original nest to the new one.
459 moveLoopBody(origLoops.back(), innermostPointLoop);
460}
461
462/// Set lower and upper bounds of intra-tile loops for parametric tiling.
463// TODO: Handle non-constant lower bounds.
464static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465 AffineForOp newInterTileLoop,
466 AffineForOp newIntraTileLoop,
467 Value tileSize) {
468 // The lower bound for the intra-tile loop is represented by an affine map
469 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
473 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
474 // corresponding inter-tile loop.
475
476 assert(origLoop.hasConstantLowerBound() &&
477 "expected input loops to have constant lower bound.");
478
479 // Get lower bound of original loop as an affine expression.
480 AffineExpr origLowerBoundExpr;
481 origLowerBoundExpr =
482 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483
484 // Add dim operands from original lower/upper bound.
485 SmallVector<Value, 4> lbOperands, ubOperands;
486 AffineBound lb = origLoop.getLowerBound();
487 AffineBound ub = origLoop.getUpperBound();
488 lbOperands.reserve(lb.getNumOperands() + 2);
489 ubOperands.reserve(ub.getNumOperands() + 2);
490 AffineMap origLbMap = lb.getMap();
491 AffineMap origUbMap = ub.getMap();
492 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493 lbOperands.push_back(lb.getOperand(j));
494 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495 ubOperands.push_back(ub.getOperand(j));
496
497 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498 // IV.
499 lbOperands.push_back(newInterTileLoop.getInductionVar());
500 ubOperands.push_back(newInterTileLoop.getInductionVar());
501
502 // Get loop IV as an affine expression for lower/upper bound. Size of
503 // lb/ubOperands is guaranteed to be atleast one.
504 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506
507 // Add symbol operands from original lower/upper bound.
508 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509 lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512
513 // Add a new symbol operand which is the tile size for this loop.
514 lbOperands.push_back(tileSize);
515 ubOperands.push_back(tileSize);
516
517 SmallVector<AffineExpr, 4> lbBoundExprs;
518 SmallVector<AffineExpr, 4> ubBoundExprs;
519 lbBoundExprs.reserve(origLbMap.getNumResults());
520 ubBoundExprs.reserve(origUbMap.getNumResults());
521
522 // Get tiling parameter as an affine expression for lb/ub.
523 AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524 AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525
526 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527 lbBoundExprs.push_back(
528 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529 origLowerBoundExpr);
530
531 // Get the origLoopStep as an affine expression.
532 AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533
534 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535 // (tilingParameter * origLoopStep) + origlb.
536 ubBoundExprs.push_back(
537 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539
540 ubBoundExprs.append(origUbMap.getResults().begin(),
541 origUbMap.getResults().end());
542
543 AffineMap lbMap =
544 AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545 lbBoundExprs, b.getContext());
546 newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547
548 AffineMap ubMap =
549 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550 ubBoundExprs, b.getContext());
551 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552
553 // Original loop step must be preserved.
554 newIntraTileLoop.setStep(origLoop.getStepAsInt());
555}
556
557/// Set lower and upper bounds of inter-tile loops for parametric tiling.
558// TODO: Handle non-constant lower bounds.
559static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560 AffineForOp newLoop, Value tileSize) {
561 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562
563 // The lower bounds for inter-tile loops are same as the corresponding lower
564 // bounds of original loops.
565 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566
567 // The new upper bound map for inter-tile loops, assuming constant lower
568 // bounds, are now originalLowerBound + ceildiv((originalUpperBound -
569 // originalLowerBound), tiling parameter); where tiling parameter is the
570 // respective tile size for that loop. For e.g. if the original ubmap was
571 // ()->(1024), the new map will be
572 // ()[s0]->(ceildiv((1024 -lb) % s0)), where s0 is the tiling parameter.
573 // Therefore a new symbol operand is inserted in the map and the result
574 // expression is overwritten.
575
576 assert(origLoop.hasConstantLowerBound() &&
577 "expected input loops to have constant lower bound.");
578
579 // Get lower bound of original loop as an affine expression.
580 AffineExpr origLowerBoundExpr;
581 origLowerBoundExpr =
582 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583
584 // Add dim operands from original upper bound.
585 SmallVector<Value, 4> ubOperands;
586 AffineBound ub = origLoop.getUpperBound();
587 ubOperands.reserve(ub.getNumOperands() + 1);
588 AffineMap origUbMap = ub.getMap();
589 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590 ubOperands.push_back(ub.getOperand(j));
591
592 // Add symbol operands from original upper bound.
593 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595
596 // Add a new symbol operand which is the tile size for this loop.
597 ubOperands.push_back(tileSize);
598
599 // Get tiling parameter as an affine expression.
600 AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601
603 boundExprs.reserve(origUbMap.getNumResults());
604 int64_t origUpperBound;
605 AffineExpr origUpperBoundExpr;
606
607 // If upper bound for the original loop is constant, then the constant can
608 // be obtained as an affine expression straight away.
609 if (origLoop.hasConstantUpperBound()) {
610 origUpperBound = origLoop.getConstantUpperBound();
611
612 // Get original constant upper bound as an affine expression.
613 origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614
615 // Insert the bound as originalLowerBoundceildiv((originalUpperBound -
616 // originalLowerBound), tilingParameter).
617 boundExprs.push_back(
618 origLowerBoundExpr +
619 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620 } else {
621 // If upper bound for the original loop is not constant then two cases
622 // are possible, although there handeling is the same, 1.) The result of
623 // ubmap has only one result expression. For e.g.
624 // affine.for %i = 5 to %ub
625 //
626 // A symbol operand is added which represents the tiling parameter. The
627 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628 // where 's0' is the original upper bound and 's1' is the tiling
629 // parameter. 2.) When ubMap has more than one result expression. For e.g.
630 // #map0 = affine_map<()[s0, s1] -> (s0, s1)
631 // affine.for %i = 5 to min #map0()[%s0, %s1]
632 //
633 // A symbol operand is added which represents the tiling parameter. The
634 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636
637 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638 // originalLowerBound), tilingParameter).
639 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640 boundExprs.push_back(
641 origLowerBoundExpr +
642 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643 }
644
645 AffineMap ubMap =
646 AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647 boundExprs, b.getContext());
648 newLoop.setUpperBound(ubOperands, ubMap);
649
650 // Original loop step must be preserved.
651 newLoop.setStep(origLoop.getStepAsInt());
652}
653
654/// Constructs and sets new loop bounds after tiling for the case of
655/// hyper-rectangular index sets, where the bounds of one dimension do not
656/// depend on other dimensions and tiling parameters are captured from SSA
657/// values. Bounds of each dimension can thus be treated independently,
658/// and deriving the new bounds is much simpler and faster than for the case of
659/// tiling arbitrary polyhedral shapes.
662 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663 assert(!origLoops.empty() && "expected atleast one loop in band");
664 assert(origLoops.size() == tileSizes.size() &&
665 "expected tiling parameter for each loop in band.");
666
667 OpBuilder b(origLoops[0].getOperation());
668 unsigned width = origLoops.size();
669
670 // Set bounds for tile space loops.
671 for (unsigned i = 0; i < width; ++i) {
672 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673 }
674
675 // Set bounds for intra-tile loops.
676 for (unsigned i = 0; i < width; ++i) {
677 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678 newLoops[i + width], tileSizes[i]);
679 }
680}
681
682/// Constructs and sets new loop bounds after tiling for the case of
683/// hyper-rectangular index sets, where the bounds of one dimension do not
684/// depend on other dimensions. Bounds of each dimension can thus be treated
685/// independently, and deriving the new bounds is much simpler and faster
686/// than for the case of tiling arbitrary polyhedral shapes.
687static void
690 ArrayRef<unsigned> tileSizes) {
691 assert(!origLoops.empty());
692 assert(origLoops.size() == tileSizes.size());
693
694 OpBuilder b(origLoops[0].getOperation());
695 unsigned width = origLoops.size();
696
697 // Bounds for tile space loops.
698 for (unsigned i = 0; i < width; i++) {
699 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703 // If the step size of original loop is x and tileSize is y then after
704 // tiling the tile space loops' step size becomes x*y.
705 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706 }
707 // Bounds for intra-tile loops.
708 for (unsigned i = 0; i < width; i++) {
709 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710 AffineForOp forOp = origLoops[i];
711 std::optional<uint64_t> mayBeConstantCount = std::nullopt;
712 if (auto staticTripCount = forOp.getStaticTripCount())
713 mayBeConstantCount = staticTripCount->getZExtValue();
714 // The lower bound is just the tile-space loop.
715 AffineMap lbMap = b.getDimIdentityMap();
716 newLoops[width + i].setLowerBound(
717 /*operands=*/newLoops[i].getInductionVar(), lbMap);
718 // The step sizes of intra-tile loops is just the original loops' step size.
719 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
720
721 // Set the upper bound.
722 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
723 // Trip count is less than the tile size: upper bound is lower bound +
724 // trip count * stepSize.
725 AffineMap ubMap = b.getSingleDimShiftAffineMap(
726 *mayBeConstantCount * origLoops[i].getStepAsInt());
727 newLoops[width + i].setUpperBound(
728 /*operands=*/newLoops[i].getInductionVar(), ubMap);
729 } else if (largestDiv % tileSizes[i] != 0) {
730 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
731 // Construct the upper bound map; the operands are the original operands
732 // with 'i' (tile-space loop) appended to it. The new upper bound map is
733 // the original one with an additional expression i + tileSize * stepSize
734 // appended.
735
736 // Add dim operands from original upper bound.
737 SmallVector<Value, 4> ubOperands;
738 AffineBound ub = origLoops[i].getUpperBound();
739 ubOperands.reserve(ub.getNumOperands() + 1);
740 AffineMap origUbMap = ub.getMap();
741 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
742 ubOperands.push_back(ub.getOperand(j));
743
744 // Add dim operand for new loop upper bound.
745 ubOperands.push_back(newLoops[i].getInductionVar());
746
747 // Add symbol operands from original upper bound.
748 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
749 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
750
752 boundExprs.reserve(1 + origUbMap.getNumResults());
753 AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
754 // The new upper bound map is the original one with an additional
755 // expression i + tileSize * stepSize (of original loop) appended.
756 boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
757 boundExprs.append(origUbMap.getResults().begin(),
758 origUbMap.getResults().end());
759 AffineMap ubMap =
760 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
761 boundExprs, b.getContext());
762 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
763 } else {
764 // No need of the min expression.
765 AffineExpr dim = b.getAffineDimExpr(0);
767 1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
768 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
769 }
770 }
771}
772
773LogicalResult
775 ArrayRef<unsigned> tileSizes,
776 SmallVectorImpl<AffineForOp> *tiledNest) {
777 if (input.empty())
778 return success();
779
780 if (failed(performPreTilingChecks(input, tileSizes)))
781 return failure();
782
783 MutableArrayRef<AffineForOp> origLoops = input;
784 AffineForOp rootAffineForOp = origLoops[0];
785
786 // Note that width is at least one since the band isn't empty.
787 unsigned width = input.size();
788 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
789
790 // Construct a tiled loop nest without setting their bounds. Bounds are
791 // set later.
792 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
793
794 SmallVector<Value, 8> origLoopIVs;
795 extractForInductionVars(input, &origLoopIVs);
796
797 // Set loop bounds for the tiled loop nest.
798 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
799
800 // Replace original IVs with intra-tile loop IVs.
801 for (unsigned i = 0; i < width; i++)
802 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
803
804 // Erase the old loop nest.
805 rootAffineForOp.erase();
806
807 if (tiledNest)
808 *tiledNest = std::move(tiledLoops);
809
810 return success();
811}
812
813/// Tiles the specified band of perfectly nested loops creating tile-space
814/// loops and intra-tile loops, using SSA values as tiling parameters. A band
815/// is a contiguous set of loops.
818 SmallVectorImpl<AffineForOp> *tiledNest) {
819 if (input.empty())
820 return success();
821
822 if (failed(performPreTilingChecks(input, tileSizes)))
823 return failure();
824
825 MutableArrayRef<AffineForOp> origLoops = input;
826 AffineForOp rootAffineForOp = origLoops[0];
827 unsigned width = input.size();
828 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
829
830 // Construct a tiled loop nest without setting their bounds. Bounds are
831 // set later.
832 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
833
834 SmallVector<Value, 8> origLoopIVs;
835 extractForInductionVars(input, &origLoopIVs);
836
837 // Set loop bounds for the tiled loop nest.
839 tileSizes);
840
841 // Replace original IVs with intra-tile loop IVs.
842 for (unsigned i = 0; i < width; i++)
843 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
844
845 // Erase the old loop nest.
846 rootAffineForOp.erase();
847
848 if (tiledNest)
849 *tiledNest = std::move(tiledLoops);
850
851 return success();
852}
853
854/// Get perfectly nested sequence of loops starting at root of loop nest
855/// (the first op being another AffineFor, and the second op - a terminator).
856/// A loop is perfectly nested iff: the first op in the loop's body is another
857/// AffineForOp, and the second op is a terminator).
859 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
860 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
861 nestedLoops.push_back(root);
862 Block &body = root.getRegion().front();
863 if (body.begin() != std::prev(body.end(), 2))
864 return;
865
866 root = dyn_cast<AffineForOp>(&body.front());
867 if (!root)
868 return;
869 }
870}
871
872/// Unrolls this loop completely.
873LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
874 std::optional<APInt> mayBeConstantTripCount = forOp.getStaticTripCount();
875 if (mayBeConstantTripCount.has_value()) {
876 uint64_t tripCount = mayBeConstantTripCount->getZExtValue();
877 if (tripCount == 0)
878 return success();
879 if (tripCount == 1)
880 return promoteIfSingleIteration(forOp);
881 return loopUnrollByFactor(forOp, tripCount);
882 }
883 return failure();
884}
885
886/// Unrolls this loop by the specified factor or by the trip count (if constant)
887/// whichever is lower.
888LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
889 uint64_t unrollFactor) {
890 std::optional<APInt> mayBeConstantTripCount = forOp.getStaticTripCount();
891 if (mayBeConstantTripCount.has_value() &&
892 mayBeConstantTripCount->ult(unrollFactor))
893 return loopUnrollByFactor(forOp, mayBeConstantTripCount->getZExtValue());
894 return loopUnrollByFactor(forOp, unrollFactor);
895}
896
897/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
898/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
899/// unrolled body. If specified, annotates the Ops in each unrolled iteration
900/// using annotateFn.
902 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
903 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
904 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
905 ValueRange iterArgs, ValueRange yieldedValues) {
906 // Builder to insert unrolled bodies just before the terminator of the body of
907 // 'forOp'.
908 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
909
910 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
911 if (!annotateFn)
912 annotateFn = defaultAnnotateFn;
913
914 // Keep a pointer to the last non-terminator operation in the original block
915 // so that we know what to clone (since we are doing this in-place).
916 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
917
918 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
919 SmallVector<Value, 4> lastYielded(yieldedValues);
920
921 for (unsigned i = 1; i < unrollFactor; i++) {
922 IRMapping operandMap;
923
924 // Prepare operand map.
925 operandMap.map(iterArgs, lastYielded);
926
927 // If the induction variable is used, create a remapping to the value for
928 // this unrolled instance.
929 if (!forOpIV.use_empty()) {
930 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
931 operandMap.map(forOpIV, ivUnroll);
932 }
933
934 // Clone the original body of 'forOp'.
935 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
936 Operation *clonedOp = builder.clone(*it, operandMap);
937 annotateFn(i, clonedOp, builder);
938 }
939
940 // Update yielded values. If the yielded value is defined outside the
941 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
942 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
943 // value with the clone corresponding to the yielded value.
944 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
945 Operation *defOp = yieldedValues[i].getDefiningOp();
946 if (defOp && defOp->getBlock() == loopBodyBlock)
947 lastYielded[i] = operandMap.lookup(yieldedValues[i]);
948 }
949 }
950
951 // Make sure we annotate the Ops in the original body. We do this last so that
952 // any annotations are not copied into the cloned Ops above.
953 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
954 annotateFn(0, &*it, builder);
955
956 // Update operands of the yield statement.
957 loopBodyBlock->getTerminator()->setOperands(lastYielded);
958}
959
960/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
961/// count is not a multiple of `unrollFactor`.
962static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
963 uint64_t unrollFactor) {
964 // Insert the cleanup loop right after 'forOp'.
965 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
966 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
967
968 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
969 // and produce results for the original users of `forOp` results.
970 auto results = forOp.getResults();
971 auto cleanupResults = cleanupForOp.getResults();
972 auto cleanupIterOperands = cleanupForOp.getInits();
973
974 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
975 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
976 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
977 }
978
979 AffineMap cleanupMap;
980 SmallVector<Value, 4> cleanupOperands;
981 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
982 if (!cleanupMap)
983 return failure();
984
985 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
986 // Promote the loop body up if this has turned into a single iteration loop.
987 (void)promoteIfSingleIteration(cleanupForOp);
988
989 // Adjust upper bound of the original loop; this is the same as the lower
990 // bound of the cleanup loop.
991 forOp.setUpperBound(cleanupOperands, cleanupMap);
992 return success();
993}
994
995/// Unrolls this loop by the specified factor. Returns success if the loop
996/// is successfully unrolled.
998 AffineForOp forOp, uint64_t unrollFactor,
999 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1000 bool cleanUpUnroll) {
1001 assert(unrollFactor > 0 && "unroll factor should be positive");
1002
1003 std::optional<uint64_t> mayBeConstantTripCount = std::nullopt;
1004 if (auto staticTripCount = forOp.getStaticTripCount())
1005 mayBeConstantTripCount = staticTripCount->getZExtValue();
1006 if (unrollFactor == 1) {
1007 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1008 return failure();
1009 return success();
1010 }
1011
1012 // Nothing in the loop body other than the terminator.
1013 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1014 return success();
1015
1016 // If the trip count is lower than the unroll factor, no unrolled body.
1017 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1018 if (cleanUpUnroll) {
1019 // Unroll the cleanup loop if cleanUpUnroll is specified.
1020 return loopUnrollFull(forOp);
1021 }
1022
1023 return failure();
1024 }
1025
1026 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1027 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1028 // Loops where the lower bound is a max expression or the upper bound is
1029 // a min expression and the trip count doesn't divide the unroll factor
1030 // can't be unrolled since the lower bound of the cleanup loop in such cases
1031 // cannot be expressed as an affine function or a max over affine functions.
1032 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1033 forOp.getUpperBoundMap().getNumResults() != 1)
1034 return failure();
1035 if (cleanUpUnroll)
1036 // Force unroll including cleanup loop
1037 return loopUnrollFull(forOp);
1038 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1039 assert(false && "cleanup loop lower bound map for single result lower "
1040 "and upper bound maps can always be determined");
1041 }
1042
1043 ValueRange iterArgs(forOp.getRegionIterArgs());
1044 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1045
1046 // Scale the step of loop being unrolled by unroll factor.
1047 int64_t step = forOp.getStepAsInt();
1048 forOp.setStep(step * unrollFactor);
1050 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1051 [&](unsigned i, Value iv, OpBuilder b) {
1052 // iv' = iv + i * step
1053 auto d0 = b.getAffineDimExpr(0);
1054 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1055 return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
1056 },
1057 /*annotateFn=*/annotateFn,
1058 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1059
1060 // Promote the loop body up if this has turned into a single iteration loop.
1062 return success();
1063}
1064
1065LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1066 uint64_t unrollJamFactor) {
1067 std::optional<APInt> mayBeConstantTripCount = forOp.getStaticTripCount();
1068 if (mayBeConstantTripCount.has_value() &&
1069 mayBeConstantTripCount->getZExtValue() < unrollJamFactor)
1070 return loopUnrollJamByFactor(forOp, mayBeConstantTripCount->getZExtValue());
1071 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1072}
1073
1074/// Check if all control operands of all loops are defined outside of `forOp`
1075/// and return false if not.
1076static bool areInnerBoundsInvariant(AffineForOp forOp) {
1077 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1078 for (auto controlOperand : aForOp.getControlOperands()) {
1079 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1080 return WalkResult::interrupt();
1081 }
1082 return WalkResult::advance();
1083 });
1084 return !walkResult.wasInterrupted();
1085}
1086
1087/// Unrolls and jams this loop by the specified factor.
1088LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1089 uint64_t unrollJamFactor) {
1090 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1091
1092 std::optional<uint64_t> mayBeConstantTripCount = std::nullopt;
1093 if (auto staticTripCount = forOp.getStaticTripCount())
1094 mayBeConstantTripCount = staticTripCount->getZExtValue();
1095 if (unrollJamFactor == 1) {
1096 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1097 return failure();
1098 return success();
1099 }
1100
1101 // Nothing in the loop body other than the terminator.
1102 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1103 return success();
1104
1105 // If the trip count is lower than the unroll jam factor, no unroll jam.
1106 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1107 LDBG() << "[failed] trip count < unroll-jam factor";
1108 return failure();
1109 }
1110
1111 // If any control operand of any inner loop of `forOp` is defined within
1112 // `forOp`, no unroll jam.
1113 if (!areInnerBoundsInvariant(forOp))
1114 return failure();
1115
1116 // Gather all sub-blocks to jam upon the loop being unrolled.
1118 jbg.walk(forOp);
1119 auto &subBlocks = jbg.subBlocks;
1120
1121 // Collect loops with iter_args.
1122 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1123 forOp.walk([&](AffineForOp aForOp) {
1124 if (aForOp.getNumIterOperands() > 0)
1125 loopsWithIterArgs.push_back(aForOp);
1126 });
1127
1128 // Get supported reductions to be used for creating reduction ops at the end.
1129 SmallVector<LoopReduction> reductions;
1130 if (forOp.getNumIterOperands() > 0)
1131 getSupportedReductions(forOp, reductions);
1132
1133 // Generate the cleanup loop if trip count isn't a multiple of
1134 // unrollJamFactor.
1135 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1136 // Loops where the lower bound is a max expression or the upper bound is
1137 // a min expression and the trip count doesn't divide the unroll factor
1138 // can't be unrolled since the lower bound of the cleanup loop in such cases
1139 // cannot be expressed as an affine function or a max over affine functions.
1140 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1141 forOp.getUpperBoundMap().getNumResults() != 1)
1142 return failure();
1143 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1144 assert(false && "cleanup loop lower bound map for single result lower "
1145 "and upper bound maps can always be determined");
1146 }
1147
1148 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1149 // iteration. There are (`unrollJamFactor` - 1) iterations.
1150 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1151
1152 // For any loop with iter_args, replace it with a new loop that has
1153 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1154 // operands.
1155 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1156 IRRewriter rewriter(forOp.getContext());
1157 for (AffineForOp oldForOp : loopsWithIterArgs) {
1158 SmallVector<Value> dupIterOperands, dupYieldOperands;
1159 ValueRange oldIterOperands = oldForOp.getInits();
1160 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1161 ValueRange oldYieldOperands =
1162 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1163 // Get additional iterOperands, iterArgs, and yield operands. We will
1164 // fix iterOperands and yield operands after cloning of sub-blocks.
1165 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1166 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1167 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1168 }
1169 // Create a new loop with additional iterOperands, iter_args and yield
1170 // operands. This new loop will take the loop body of the original loop.
1171 bool forOpReplaced = oldForOp == forOp;
1172 AffineForOp newForOp =
1173 cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1174 rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1175 [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1176 return dupYieldOperands;
1177 }));
1178 newLoopsWithIterArgs.push_back(newForOp);
1179 // `forOp` has been replaced with a new loop.
1180 if (forOpReplaced)
1181 forOp = newForOp;
1182 // Update `operandMaps` for `newForOp` iterArgs and results.
1183 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1184 unsigned oldNumIterArgs = oldIterArgs.size();
1185 ValueRange newResults = newForOp.getResults();
1186 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1187 assert(oldNumIterArgs == oldNumResults &&
1188 "oldNumIterArgs must be the same as oldNumResults");
1189 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1190 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1191 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1192 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1193 // to those in the `i`th new set.
1194 operandMaps[i - 1].map(newIterArgs[j],
1195 newIterArgs[i * oldNumIterArgs + j]);
1196 operandMaps[i - 1].map(newResults[j],
1197 newResults[i * oldNumResults + j]);
1198 }
1199 }
1200 }
1201
1202 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1203 int64_t step = forOp.getStepAsInt();
1204 forOp.setStep(step * unrollJamFactor);
1205
1206 auto forOpIV = forOp.getInductionVar();
1207 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1208 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1209 for (auto &subBlock : subBlocks) {
1210 // Builder to insert unroll-jammed bodies. Insert right at the end of
1211 // sub-block.
1212 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1213
1214 // If the induction variable is used, create a remapping to the value for
1215 // this unrolled instance.
1216 if (!forOpIV.use_empty()) {
1217 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1218 auto d0 = builder.getAffineDimExpr(0);
1219 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1220 auto ivUnroll =
1221 AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
1222 operandMaps[i - 1].map(forOpIV, ivUnroll);
1223 }
1224 // Clone the sub-block being unroll-jammed.
1225 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1226 builder.clone(*it, operandMaps[i - 1]);
1227 }
1228 // Fix iterOperands and yield op operands of newly created loops.
1229 for (auto newForOp : newLoopsWithIterArgs) {
1230 unsigned oldNumIterOperands =
1231 newForOp.getNumIterOperands() / unrollJamFactor;
1232 unsigned numControlOperands = newForOp.getNumControlOperands();
1233 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1234 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1235 assert(oldNumIterOperands == oldNumYieldOperands &&
1236 "oldNumIterOperands must be the same as oldNumYieldOperands");
1237 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1238 // The `i`th duplication of an old iterOperand or yield op operand
1239 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1240 // if such mapped value exists.
1241 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1242 operandMaps[i - 1].lookupOrDefault(
1243 newForOp.getOperand(numControlOperands + j)));
1244 yieldOp.setOperand(
1245 i * oldNumYieldOperands + j,
1246 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1247 }
1248 }
1249 }
1250 if (forOp.getNumResults() > 0) {
1251 // Create reduction ops to combine every `unrollJamFactor` related results
1252 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1253 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1254 // %1.
1255 rewriter.setInsertionPointAfter(forOp);
1256 auto loc = forOp.getLoc();
1257 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1258 for (LoopReduction &reduction : reductions) {
1259 unsigned pos = reduction.iterArgPosition;
1260 Value lhs = forOp.getResult(pos);
1261 Value rhs;
1263 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1264 rhs = forOp.getResult(i * oldNumResults + pos);
1265 // Create ops based on reduction type.
1266 lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1267 if (!lhs)
1268 return failure();
1269 Operation *op = lhs.getDefiningOp();
1270 assert(op && "Reduction op should have been created");
1271 newOps.insert(op);
1272 }
1273 // Replace all uses except those in newly created reduction ops.
1274 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1275 }
1276 }
1277
1278 // Promote the loop body up if this has turned into a single iteration loop.
1280 return success();
1281}
1282
1283/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1284/// nested within 'forOpA' as the only non-terminator operation in its block.
1285void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1286 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1287 auto &forOpABody = forOpA.getBody()->getOperations();
1288 auto &forOpBBody = forOpB.getBody()->getOperations();
1289
1290 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1291 // before forOpA (in ForOpA's parent's block) this should leave 'forOpA's
1292 // body containing only the terminator.
1293 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1294 forOpABody, forOpABody.begin(),
1295 std::prev(forOpABody.end()));
1296 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1297 // body (this leaves forOpB's body containing only the terminator).
1298 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1299 std::prev(forOpBBody.end()));
1300 // 3) Splice forOpA into the beginning of forOpB's body.
1301 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1302 Block::iterator(forOpA));
1303}
1304
1305// Checks each dependence component against the permutation to see if the
1306// desired loop interchange would violate dependences by making the
1307// dependence component lexicographically negative.
1309 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1310 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1311 // Invert permutation map.
1312 unsigned maxLoopDepth = loops.size();
1313 SmallVector<unsigned, 4> loopPermMapInv;
1314 loopPermMapInv.resize(maxLoopDepth);
1315 for (unsigned i = 0; i < maxLoopDepth; ++i)
1316 loopPermMapInv[loopPermMap[i]] = i;
1317
1318 // Check each dependence component against the permutation to see if the
1319 // desired loop interchange permutation would make the dependence vectors
1320 // lexicographically negative.
1321 // Example 1: [-1, 1][0, 0]
1322 // Example 2: [0, 0][-1, 1]
1323 for (const auto &depComps : depCompsVec) {
1324 assert(depComps.size() >= maxLoopDepth);
1325 // Check if the first non-zero dependence component is positive.
1326 // This iterates through loops in the desired order.
1327 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1328 unsigned permIndex = loopPermMapInv[j];
1329 assert(depComps[permIndex].lb);
1330 int64_t depCompLb = *depComps[permIndex].lb;
1331 if (depCompLb > 0)
1332 break;
1333 if (depCompLb < 0)
1334 return false;
1335 }
1336 }
1337 return true;
1338}
1339
1340/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1341/// nested sequence of loops in 'loops' would violate dependences.
1343 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1344 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1345 unsigned maxLoopDepth = loops.size();
1346 if (maxLoopDepth == 1)
1347 return true;
1348
1349 // We cannot guarantee the validity of the interchange if the loops have
1350 // iter_args, since the dependence analysis does not take them into account.
1351 // Conservatively return false in such cases.
1352 if (llvm::any_of(loops, [](AffineForOp loop) {
1353 return loop.getNumIterOperands() > 0;
1354 }))
1355 return false;
1356
1357 // Gather dependence components for dependences between all ops in loop nest
1358 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1359 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1360 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1361 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1362}
1363
1364/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1365/// in it from outermost to innermost.
1366[[maybe_unused]] bool
1368 assert(!loops.empty() && "no loops provided");
1369
1370 // We already know that the block can't be empty.
1371 auto hasTwoElements = [](Block *block) {
1372 auto secondOpIt = std::next(block->begin());
1373 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1374 };
1375
1376 auto enclosingLoop = loops.front();
1377 for (auto loop : loops.drop_front()) {
1378 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1379 // parentForOp's body should be just this loop and the terminator.
1380 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1381 return false;
1382 enclosingLoop = loop;
1383 }
1384 return true;
1385}
1386
1387// input[i] should move from position i -> permMap[i]. Returns the position in
1388// `input` that becomes the new outermost loop.
1390 ArrayRef<unsigned> permMap) {
1391 assert(input.size() == permMap.size() && "invalid permutation map size");
1392 // Check whether the permutation spec is valid. This is a small vector - we'll
1393 // just sort and check if it's iota.
1394 SmallVector<unsigned, 4> checkPermMap(permMap);
1395 llvm::sort(checkPermMap);
1396 if (llvm::any_of(llvm::enumerate(checkPermMap),
1397 [](const auto &en) { return en.value() != en.index(); }))
1398 assert(false && "invalid permutation map");
1399
1400 // Nothing to do.
1401 if (input.size() < 2)
1402 return 0;
1403
1404 assert(isPerfectlyNested(input) && "input not perfectly nested");
1405
1406 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1407 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1409 for (unsigned i = 0, e = input.size(); i < e; ++i)
1410 invPermMap.push_back({permMap[i], i});
1411 llvm::sort(invPermMap);
1412
1413 // Move the innermost loop body to the loop that would be the innermost in the
1414 // permuted nest (only if the innermost loop is going to change).
1415 if (permMap.back() != input.size() - 1) {
1416 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1417 Block *srcBody = ((AffineForOp)input.back()).getBody();
1418 destBody->getOperations().splice(destBody->begin(),
1419 srcBody->getOperations(), srcBody->begin(),
1420 std::prev(srcBody->end()));
1421 }
1422
1423 // We'll move each loop in `input` in the reverse order so that its body is
1424 // empty when we are moving it; this incurs zero copies and no erasing.
1425 for (int i = input.size() - 1; i >= 0; --i) {
1426 // If this has to become the outermost loop after permutation, add it to the
1427 // parent block of the original root.
1428 if (permMap[i] == 0) {
1429 // If the root remains the same, nothing to do.
1430 if (i == 0)
1431 continue;
1432 // Make input[i] the new outermost loop moving it into parentBlock.
1433 auto *parentBlock = input[0]->getBlock();
1434 parentBlock->getOperations().splice(Block::iterator(input[0]),
1435 input[i]->getBlock()->getOperations(),
1436 Block::iterator(input[i]));
1437 continue;
1438 }
1439
1440 // If the parent in the permuted order is the same as in the original,
1441 // nothing to do.
1442 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1443 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1444 continue;
1445
1446 // Move input[i] to its surrounding loop in the transformed nest.
1447 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1448 destBody->getOperations().splice(destBody->begin(),
1449 input[i]->getBlock()->getOperations(),
1450 Block::iterator(input[i]));
1451 }
1452
1453 return invPermMap[0].second;
1454}
1455
1456// Sinks all sequential loops to the innermost levels (while preserving
1457// relative order among them) and moves all parallel loops to the
1458// outermost (while again preserving relative order among them).
1459AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1461 getPerfectlyNestedLoops(loops, forOp);
1462 if (loops.size() < 2)
1463 return forOp;
1464
1465 // Gather dependence components for dependences between all ops in loop nest
1466 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1467 unsigned maxLoopDepth = loops.size();
1468 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1469 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1470
1471 // Mark loops as either parallel or sequential.
1472 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1473 for (auto &depComps : depCompsVec) {
1474 assert(depComps.size() >= maxLoopDepth);
1475 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1476 DependenceComponent &depComp = depComps[j];
1477 assert(depComp.lb.has_value() && depComp.ub.has_value());
1478 if (*depComp.lb != 0 || *depComp.ub != 0)
1479 isParallelLoop[j] = false;
1480 }
1481 }
1482
1483 unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1484
1485 // Compute permutation of loops that sinks sequential loops (and thus raises
1486 // parallel loops) while preserving relative order.
1487 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1488 unsigned nextSequentialLoop = numParallelLoops;
1489 unsigned nextParallelLoop = 0;
1490 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1491 if (isParallelLoop[i]) {
1492 loopPermMap[i] = nextParallelLoop++;
1493 } else {
1494 loopPermMap[i] = nextSequentialLoop++;
1495 }
1496 }
1497
1498 // Check if permutation 'loopPermMap' would violate dependences.
1499 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1500 return forOp;
1501 // Perform loop interchange according to permutation 'loopPermMap'.
1502 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1503 return loops[loopNestRootIndex];
1504}
1505
1506// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1507// lower (resp. upper) loop bound. When called for both the lower and upper
1508// bounds, the resulting IR resembles:
1509//
1510// ```mlir
1511// affine.for %i = max (`iv, ...) to min (`iv` + `offset`) {
1512// ...
1513// }
1514// ```
1516 SmallVector<Value, 4> *operands,
1517 int64_t offset = 0) {
1518 auto bounds = llvm::to_vector<4>(map->getResults());
1519 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1520 operands->insert(operands->begin() + map->getNumDims(), iv);
1521 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1522 b.getContext());
1523 canonicalizeMapAndOperands(map, operands);
1524}
1525
1526// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1527// Stripmine-sink is a primitive building block for generalized tiling of
1528// imperfectly nested loops.
1529// This transformation is purely mechanical and does not check legality,
1530// profitability or even structural correctness. It is the user's
1531// responsibility to specify `targets` that are dominated by `forOp`.
1532// Returns the new AffineForOps, one per `targets`, nested immediately under
1533// each of the `targets`.
1535stripmineSink(AffineForOp forOp, uint64_t factor,
1536 ArrayRef<AffineForOp> targets) {
1537 auto originalStep = forOp.getStepAsInt();
1538 auto scaledStep = originalStep * factor;
1539 forOp.setStep(scaledStep);
1540
1541 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1542
1543 // Lower-bound map creation.
1544 auto lbMap = forOp.getLowerBoundMap();
1545 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1546 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1547
1548 // Upper-bound map creation.
1549 auto ubMap = forOp.getUpperBoundMap();
1550 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1551 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1552 /*offset=*/scaledStep);
1553
1554 auto iv = forOp.getInductionVar();
1555 SmallVector<AffineForOp, 8> innerLoops;
1556 for (auto t : targets) {
1557 // Insert newForOp before the terminator of `t`.
1558 auto b = OpBuilder::atBlockTerminator(t.getBody());
1559 auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
1560 ubOperands, ubMap, originalStep);
1561 auto begin = t.getBody()->begin();
1562 // Skip terminator and `newForOp` which is just before the terminator.
1563 auto nOps = t.getBody()->getOperations().size() - 2;
1564 newForOp.getBody()->getOperations().splice(
1565 newForOp.getBody()->getOperations().begin(),
1566 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1567 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1568 newForOp.getRegion());
1569 innerLoops.push_back(newForOp);
1570 }
1571
1572 return innerLoops;
1573}
1574
1575// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1576// Returns the new AffineForOps, nested immediately under `target`.
1577template <typename SizeType>
1578static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1579 AffineForOp target) {
1580 // TODO: Use cheap structural assertions that targets are nested under
1581 // forOp and that targets are not nested under each other when DominanceInfo
1582 // exposes the capability. It seems overkill to construct a whole function
1583 // dominance tree at this point.
1584 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1585 assert(res.size() == 1 && "Expected 1 inner forOp");
1586 return res[0];
1587}
1588
1591 ArrayRef<AffineForOp> targets) {
1593 SmallVector<AffineForOp, 8> currentTargets(targets);
1594 for (auto it : llvm::zip(forOps, sizes)) {
1595 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1596 res.push_back(step);
1597 currentTargets = step;
1598 }
1599 return res;
1600}
1601
1603 ArrayRef<uint64_t> sizes,
1604 AffineForOp target) {
1606 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1607 res.push_back(llvm::getSingleElement(loops));
1608 return res;
1609}
1610
1611LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1612 if (loops.size() < 2)
1613 return success();
1614
1615 AffineForOp innermost = loops.back();
1616 AffineForOp outermost = loops.front();
1617 AffineBound ub = outermost.getUpperBound();
1618 AffineMap origUbMap = ub.getMap();
1619 Location loc = outermost.getLoc();
1620 OpBuilder builder(outermost);
1621 for (AffineForOp loop : loops) {
1622 // We only work on normalized loops.
1623 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1624 loop.getConstantLowerBound() != 0)
1625 return failure();
1626 }
1627 SmallVector<Value, 4> upperBoundSymbols;
1628 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1629 ub.getOperands().end());
1630
1631 // 1. Store the upper bound of the outermost loop in a variable.
1632 Value prev;
1633 if (!llvm::hasSingleElement(origUbMap.getResults()))
1634 prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1635 else
1636 prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1637 upperBoundSymbols.push_back(prev);
1638
1639 // 2. Emit code computing the upper bound of the coalesced loop as product of
1640 // the number of iterations of all loops.
1641 for (AffineForOp loop : loops.drop_front()) {
1642 ub = loop.getUpperBound();
1643 origUbMap = ub.getMap();
1644 ubOperands = ub.getOperands();
1645 Value upperBound;
1646 // If upper bound map has more than one result, take their minimum.
1647 if (!llvm::hasSingleElement(origUbMap.getResults()))
1648 upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1649 else
1650 upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1651 upperBoundSymbols.push_back(upperBound);
1652 SmallVector<Value, 4> operands;
1653 operands.push_back(prev);
1654 operands.push_back(upperBound);
1655 // Maintain running product of loop upper bounds.
1656 prev = AffineApplyOp::create(
1657 builder, loc,
1658 AffineMap::get(/*dimCount=*/1,
1659 /*symbolCount=*/1,
1660 builder.getAffineDimExpr(0) *
1661 builder.getAffineSymbolExpr(0)),
1662 operands);
1663 }
1664 // Set upper bound of the coalesced loop.
1665 AffineMap newUbMap = AffineMap::get(
1666 /*dimCount=*/0,
1667 /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1668 outermost.setUpperBound(prev, newUbMap);
1669
1670 builder.setInsertionPointToStart(outermost.getBody());
1671
1672 // 3. Remap induction variables. For each original loop, the value of the
1673 // induction variable can be obtained by dividing the induction variable of
1674 // the linearized loop by the total number of iterations of the loops nested
1675 // in it modulo the number of iterations in this loop (remove the values
1676 // related to the outer loops):
1677 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1678 // Compute these iteratively from the innermost loop by creating a "running
1679 // quotient" of division by the range.
1680 Value previous = outermost.getInductionVar();
1681 for (unsigned idx = loops.size(); idx > 0; --idx) {
1682 if (idx != loops.size()) {
1683 SmallVector<Value, 4> operands;
1684 operands.push_back(previous);
1685 operands.push_back(upperBoundSymbols[idx]);
1686 previous = AffineApplyOp::create(builder, loc,
1688 /*dimCount=*/1, /*symbolCount=*/1,
1689 builder.getAffineDimExpr(0).floorDiv(
1690 builder.getAffineSymbolExpr(0))),
1691 operands);
1692 }
1693 // Modified value of the induction variables of the nested loops after
1694 // coalescing.
1695 Value inductionVariable;
1696 if (idx == 1) {
1697 inductionVariable = previous;
1698 } else {
1699 SmallVector<Value, 4> applyOperands;
1700 applyOperands.push_back(previous);
1701 applyOperands.push_back(upperBoundSymbols[idx - 1]);
1702 inductionVariable = AffineApplyOp::create(
1703 builder, loc,
1705 /*dimCount=*/1, /*symbolCount=*/1,
1706 builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1707 applyOperands);
1708 }
1709 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1710 inductionVariable, loops.back().getRegion());
1711 }
1712
1713 // 4. Move the operations from the innermost just above the second-outermost
1714 // loop, delete the extra terminator and the second-outermost loop.
1715 AffineForOp secondOutermostLoop = loops[1];
1716 innermost.getBody()->back().erase();
1717 outermost.getBody()->getOperations().splice(
1718 Block::iterator(secondOutermostLoop.getOperation()),
1719 innermost.getBody()->getOperations());
1720 for (auto [iter, init] :
1721 llvm::zip_equal(secondOutermostLoop.getRegionIterArgs(),
1722 secondOutermostLoop.getInits())) {
1723 iter.replaceAllUsesWith(init);
1724 iter.dropAllUses();
1725 }
1726 secondOutermostLoop.erase();
1727 return success();
1728}
1729
1730void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1731 ArrayRef<Value> processorId,
1732 ArrayRef<Value> numProcessors) {
1733 assert(processorId.size() == numProcessors.size());
1734 if (processorId.empty())
1735 return;
1736
1737 OpBuilder b(forOp);
1738 Location loc(forOp.getLoc());
1740 bindSymbols(forOp.getContext(), lhs, rhs);
1741 auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1742 auto addMap = AffineMap::get(0, 2, lhs + rhs);
1743
1744 Value linearIndex = processorId.front();
1745 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1746 auto mulApplyOp = AffineApplyOp::create(
1747 b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1748 linearIndex = AffineApplyOp::create(b, loc, addMap,
1749 ValueRange{mulApplyOp, processorId[i]});
1750 }
1751
1752 auto mulApplyOp = AffineApplyOp::create(
1753 b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1754 Value lb = AffineApplyOp::create(
1755 b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1756 forOp.setLowerBound(lb);
1757
1758 Value step = forOp.getStep();
1759 for (auto numProcs : numProcessors)
1760 step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
1761 forOp.setStep(step);
1762}
1763
1764/// Given a memref region, determine the lowest depth at which transfers can be
1765/// placed for it, and return the corresponding block, start and end positions
1766/// in the block for placing incoming (read) and outgoing (write) copies
1767/// respectively. The lowest depth depends on whether the region being accessed
1768/// is hoistable with respect to one or more immediately surrounding loops.
1769static void
1771 Block::iterator &begin, Block::iterator &end,
1772 Block **copyPlacementBlock,
1773 Block::iterator *copyInPlacementStart,
1774 Block::iterator *copyOutPlacementStart) {
1775 const auto *cst = region.getConstraints();
1776 SmallVector<Value, 4> symbols;
1777 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1778
1779 SmallVector<Operation *, 4> enclosingAffineOps;
1780 getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1781 // Walk up loop parents till we find an IV on which this region is
1782 // symbolic/variant or we hit `hoistGuard`.
1783 auto it = enclosingAffineOps.rbegin();
1784 AffineForOp lastInvariantFor;
1785 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1786 Operation *enclosingOp = *it;
1787 // We can't hoist past the definition of the memref being copied.
1788 Value memref = region.memref;
1789 if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1790 LDBG() << "memref definition will end up not dominating hoist location";
1791 break;
1792 }
1793
1794 auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1795 if (!affineFor)
1796 break;
1797 // TODO: also need to be checking this for regions symbols that
1798 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1799 if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1800 break;
1801 lastInvariantFor = affineFor;
1802 }
1803
1804 if (it != enclosingAffineOps.rbegin()) {
1805 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1806 *copyOutPlacementStart = std::next(*copyInPlacementStart);
1807 *copyPlacementBlock = lastInvariantFor->getBlock();
1808 } else {
1809 *copyInPlacementStart = begin;
1810 *copyOutPlacementStart = end;
1811 *copyPlacementBlock = &block;
1812 }
1813}
1814
1815// Info comprising stride and number of elements transferred every stride.
1820
1821/// Returns striding information for a copy/transfer of this region with
1822/// potentially multiple striding levels from outermost to innermost. For an
1823/// n-dimensional region, there can be at most n-1 levels of striding
1824/// successively nested.
1825// TODO: make this work with non-identity layout maps.
1826static void getMultiLevelStrides(const MemRefRegion &region,
1827 ArrayRef<int64_t> bufferShape,
1828 SmallVectorImpl<StrideInfo> *strideInfos) {
1829 if (bufferShape.size() <= 1)
1830 return;
1831
1832 int64_t numEltPerStride = 1;
1833 int64_t stride = 1;
1834 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1835 int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1836 stride *= dimSize;
1837 numEltPerStride *= bufferShape[d];
1838 // A stride is needed only if the region has a shorter extent than the
1839 // memref along the dimension *and* has an extent greater than one along the
1840 // next major dimension.
1841 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1842 strideInfos->push_back({stride, numEltPerStride});
1843 }
1844 }
1845}
1846
1847/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1848/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1849/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1850/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1851/// contain the expressions to be subtracted out from the respective copy loop
1852/// iterators in order to index the fast buffer. If `copyOut' is true, generates
1853/// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1854/// copy nest is inserted.
1855//
1856/// The copy-in nest is generated as follows as an example for a 2-d region:
1857/// for x = ...
1858/// for y = ...
1859/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1860///
1861static AffineForOp
1862generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1863 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1864 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1865 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1866 OpBuilder b) {
1867 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1868 return lbMap.getNumInputs() == lbOperands.size();
1869 }));
1870 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1871 return ubMap.getNumInputs() == ubOperands.size();
1872 }));
1873
1874 unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1875 // A copy nest can't be generated for 0-ranked memrefs.
1876 assert(rank != 0 && "non-zero rank memref expected");
1877 assert(lbMaps.size() == rank && "wrong number of lb maps");
1878 assert(ubMaps.size() == rank && "wrong number of ub maps");
1879
1880 SmallVector<Value, 4> memIndices;
1882 SmallVector<Value, 4> fastBufMapOperands;
1883 AffineForOp copyNestRoot;
1884 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1885 for (unsigned d = 0; d < rank; ++d) {
1886 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1887 ubOperands, ubMaps[d]);
1888 if (d == 0)
1889 copyNestRoot = forOp;
1890
1891 b = OpBuilder::atBlockTerminator(forOp.getBody());
1892
1893 auto fastBufOffsetMap =
1894 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1895 auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);
1896
1897 // Construct the subscript for the fast memref being copied into/from:
1898 // x - offset_x.
1899 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1900 b.getAffineDimExpr(2 * d));
1901 fastBufMapOperands.push_back(offset);
1902 fastBufMapOperands.push_back(forOp.getInductionVar());
1903 mayBeDeadApplys.push_back(offset);
1904
1905 // Subscript for the slow memref being copied.
1906 memIndices.push_back(forOp.getInductionVar());
1907 }
1908
1909 auto fastBufMap =
1910 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1913 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1914
1915 // Drop any dead affine.applys.
1916 for (auto applyOp : mayBeDeadApplys)
1917 if (applyOp.use_empty())
1918 applyOp.erase();
1919
1920 if (!isCopyOut) {
1921 // Copy in.
1922 auto load = AffineLoadOp::create(b, loc, memref, memIndices);
1923 AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
1924 fastBufMapOperands);
1925 return copyNestRoot;
1926 }
1927
1928 // Copy out.
1929 auto load =
1930 AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
1931 AffineStoreOp::create(b, loc, load, memref, memIndices);
1933}
1934
1935[[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
1936 return block.getParentOp()->emitRemark();
1937}
1938
1939/// Creates a buffer in the faster memory space for the specified memref region
1940/// (memref has to be non-zero ranked); generates a copy from the lower memory
1941/// space to this one, and replaces all loads/stores in the block range
1942/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1943/// if copies could not be generated due to yet unimplemented cases.
1944/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1945/// specify the insertion points where the incoming copies and outgoing copies,
1946/// respectively, should be inserted (the insertion happens right before the
1947/// insertion point). Since `begin` can itself be invalidated due to the memref
1948/// rewriting done from this method, the output argument `nBegin` is set to its
1949/// replacement (set to `begin` if no invalidation happens). Since outgoing
1950/// copies could have been inserted at `end`, the output argument `nEnd` is set
1951/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1952/// allocated.
1953static LogicalResult generateCopy(
1954 const MemRefRegion &region, Block *block, Block::iterator begin,
1955 Block::iterator end, Block *copyPlacementBlock,
1956 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1957 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1958 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1959 Block::iterator *nBegin, Block::iterator *nEnd) {
1960 *nBegin = begin;
1961 *nEnd = end;
1962
1963 auto f = begin->getParentOfType<FunctionOpInterface>();
1964 OpBuilder topBuilder(f.getFunctionBody());
1965 Value zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);
1966
1967 *sizeInBytes = 0;
1968
1969 if (begin == end)
1970 return success();
1971
1972 // Record the last op in the block for which we are performing copy
1973 // generation. We later do the memref replacement only in [begin, lastCopyOp]
1974 // so that the original memref's used in the data movement code themselves
1975 // don't get replaced.
1976 Operation *lastCopyOp = end->getPrevNode();
1977
1978 // Is the copy out point at the end of the block where we are doing
1979 // explicit copying.
1980 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1981
1982 // Copies for read regions are going to be inserted at 'begin'.
1983 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1984 // Copies for write regions are going to be inserted at 'end'.
1985 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1986 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1987
1988 // Builder to create constants at the top level.
1989 auto func =
1990 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1991 OpBuilder top(func.getFunctionBody());
1992
1993 auto loc = region.loc;
1994 auto memref = region.memref;
1995 auto memRefType = cast<MemRefType>(memref.getType());
1996
1997 if (!memRefType.getLayout().isIdentity()) {
1998 LDBG() << "Non-identity layout map not yet supported";
1999 return failure();
2000 }
2001
2002 // Indices to use for the copying.
2003 // Indices for the original memref being copied from/to.
2004 SmallVector<Value, 4> memIndices;
2005 // Indices for the faster buffer being copied into/from.
2006 SmallVector<Value, 4> bufIndices;
2007
2008 unsigned rank = memRefType.getRank();
2009 if (rank == 0) {
2010 LDBG() << "Non-zero ranked memrefs supported";
2011 return failure();
2012 }
2013
2014 SmallVector<int64_t, 4> fastBufferShape;
2015
2016 // Compute the extents of the buffer.
2018 lbs.reserve(rank);
2019 std::optional<int64_t> numElements =
2020 region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2021 if (!numElements) {
2022 LDBG() << "Non-constant region size not supported";
2023 return failure();
2024 }
2025
2026 if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2027 // This can be supported in the future if needed.
2028 LDBG() << "Max lower bound for memref region start not supported";
2029 return failure();
2030 }
2031
2032 if (*numElements == 0) {
2033 LDBG() << "Nothing to copy";
2034 return success();
2035 }
2036
2037 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2038 for (unsigned i = 0; i < rank; ++i) {
2039 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2040 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2041 LDBG() << "Missing lower or upper bound for region along dimension: "
2042 << i;
2043 return failure();
2044 }
2045 }
2046
2047 const FlatAffineValueConstraints *cst = region.getConstraints();
2048 // 'regionSymbols' hold values that this memory region is symbolic/parametric
2049 // on; these typically include loop IVs surrounding the level at which the
2050 // copy generation is being done or other valid symbols in MLIR.
2051 SmallVector<Value, 8> regionSymbols;
2052 cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2053
2054 // Construct the access expression for the fast memory buffer. The access
2055 // expression for a particular dimension of the fast buffer is obtained by
2056 // subtracting out the lower bound on the original memref's data region
2057 // along the corresponding dimension.
2058
2059 // Index start offsets for faster memory buffer relative to the original.
2060 SmallVector<AffineExpr, 4> fastBufOffsets;
2061 fastBufOffsets.reserve(rank);
2062 for (unsigned d = 0; d < rank; d++) {
2063 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2064 "incorrect bound size");
2065
2066 // Set copy start location for this dimension in the lower memory space
2067 // memref.
2068 if (lbs[d].isSingleConstant()) {
2069 auto indexVal = lbs[d].getSingleConstantResult();
2070 if (indexVal == 0) {
2071 memIndices.push_back(zeroIndex);
2072 } else {
2073 memIndices.push_back(
2074 arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
2075 }
2076 } else {
2077 // The coordinate for the start location is just the lower bound along the
2078 // corresponding dimension on the memory region (stored in 'offset').
2079 // Remap all inputs of the map to dimensions uniformly since in the
2080 // generate IR we need valid affine symbols as opposed to "symbols" for
2081 // the purpose of the memref region.
2082 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2083 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2084 symReplacements[i] = top.getAffineDimExpr(i);
2085 lbs[d] = lbs[d].replaceDimsAndSymbols(
2086 /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2087 /*numResultSyms=*/0);
2088 memIndices.push_back(
2089 AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
2090 }
2091 // The fast buffer is copied into at location zero; addressing is relative.
2092 bufIndices.push_back(zeroIndex);
2093
2094 // Record the offsets since they are needed to remap the memory accesses of
2095 // the original memref further below.
2096 fastBufOffsets.push_back(lbs[d].getResult(0));
2097 }
2098
2099 // The faster memory space buffer.
2100 Value fastMemRef;
2101
2102 // Check if a buffer was already created.
2103 bool existingBuf = fastBufferMap.count(memref) > 0;
2104 if (!existingBuf) {
2105 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2106 auto fastMemRefType =
2107 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2108 fastBufferLayout, copyOptions.fastMemorySpace);
2109
2110 // Create the fast memory space buffer just before the 'affine.for'
2111 // operation.
2112 fastMemRef =
2113 memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
2114 // Record it.
2115 fastBufferMap[memref] = fastMemRef;
2116 // fastMemRefType is a constant shaped memref.
2117 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2118 // We don't account for things of unknown size.
2119 *sizeInBytes = maySizeInBytes.value_or(0);
2120
2121 LLVM_DEBUG(emitRemarkForBlock(*block)
2122 << "Creating fast buffer of type " << fastMemRefType
2123 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2124 << " KiB\n");
2125 } else {
2126 // Reuse the one already created.
2127 fastMemRef = fastBufferMap[memref];
2128 }
2129
2130 auto numElementsSSA = arith::ConstantIndexOp::create(top, loc, *numElements);
2131
2132 Value dmaStride;
2133 Value numEltPerDmaStride;
2134 if (copyOptions.generateDma) {
2135 SmallVector<StrideInfo, 4> dmaStrideInfos;
2136 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2137
2138 // TODO: use all stride levels once DmaStartOp is extended for
2139 // multi-level strides.
2140 if (dmaStrideInfos.size() > 1) {
2141 LDBG() << "Only up to one level of stride supported";
2142 return failure();
2143 }
2144
2145 if (!dmaStrideInfos.empty()) {
2146 dmaStride =
2147 arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
2148 numEltPerDmaStride = arith::ConstantIndexOp::create(
2149 top, loc, dmaStrideInfos[0].numEltPerStride);
2150 }
2151 }
2152
2153 // Create fully composed affine maps for each memref.
2154 auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2155 fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2156 auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2157 fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2158
2159 if (!copyOptions.generateDma) {
2160 // Point-wise copy generation.
2161 auto copyNest =
2162 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2163 /*lbOperands=*/regionSymbols, ubMaps,
2164 /*ubOperands=*/regionSymbols, fastBufOffsets,
2165 /*isCopyOut=*/region.isWrite(), b);
2166
2167 // Record this so that we can skip it from yet another copy.
2168 copyNests.insert(copyNest);
2169
2170 // Since new ops are being appended (for copy out's), adjust the end to
2171 // mark end of block range being processed if necessary.
2172 if (region.isWrite() && isCopyOutAtEndOfBlock)
2173 *nEnd = Block::iterator(copyNest.getOperation());
2174 } else {
2175 // DMA generation.
2176 // Create a tag (single element 1-d memref) for the DMA.
2177 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2178 copyOptions.tagMemorySpace);
2179 auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);
2180
2181 SmallVector<Value, 4> tagIndices({zeroIndex});
2182 auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2183 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2184 if (!region.isWrite()) {
2185 // DMA non-blocking read from original buffer to fast buffer.
2186 AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
2187 fastMemRef, bufAffineMap, bufIndices, tagMemRef,
2188 tagAffineMap, tagIndices, numElementsSSA,
2189 dmaStride, numEltPerDmaStride);
2190 } else {
2191 // DMA non-blocking write from fast buffer to the original memref.
2192 auto op = AffineDmaStartOp::create(
2193 b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2194 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2195 dmaStride, numEltPerDmaStride);
2196 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2197 // end to mark end of block range being processed.
2198 if (isCopyOutAtEndOfBlock)
2199 *nEnd = Block::iterator(op.getOperation());
2200 }
2201
2202 // Matching DMA wait to block on completion; tag always has a 0 index.
2203 AffineDmaWaitOp::create(b, loc, tagMemRef, tagAffineMap, zeroIndex,
2204 numElementsSSA);
2205
2206 // Generate dealloc for the tag.
2207 auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
2208 if (*nEnd == end && isCopyOutAtEndOfBlock)
2209 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2210 // mark end of range of the original.
2211 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2212 }
2213
2214 // Generate dealloc for the buffer.
2215 if (!existingBuf) {
2216 auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
2217 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2218 // the fast buffer (since it marks the new end insertion point).
2219 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2220 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2221 }
2222
2223 // Replace all uses of the old memref with the faster one while remapping
2224 // access indices (subtracting out lower bound offsets for each dimension).
2225 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2226 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2227 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2228 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2229 // d2, d3 correspond to the original indices (%i, %j).
2230 SmallVector<AffineExpr, 4> remapExprs;
2231 remapExprs.reserve(rank);
2232 for (unsigned i = 0; i < rank; i++) {
2233 // The starting operands of indexRemap will be regionSymbols (the symbols on
2234 // which the memref region is parametric); then those corresponding to
2235 // the memref's original indices follow.
2236 auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2237 remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2238 }
2239 auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2240 b.getContext());
2241
2242 // Record the begin since it may be invalidated by memref replacement.
2243 Block::iterator prevOfBegin;
2244 bool isBeginAtStartOfBlock = (begin == block->begin());
2245 if (!isBeginAtStartOfBlock)
2246 prevOfBegin = std::prev(begin);
2247
2248 auto userFilterFn = [&](Operation *user) {
2249 auto *ancestorUser = block->findAncestorOpInBlock(*user);
2250 return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2251 !lastCopyOp->isBeforeInBlock(ancestorUser);
2252 };
2253
2254 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2255 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2256 /*extraIndices=*/{}, indexRemap,
2257 /*extraOperands=*/regionSymbols,
2258 /*symbolOperands=*/{}, userFilterFn);
2259
2260 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2261
2262 return success();
2263}
2264
2265/// Construct the memref region to just include the entire memref. Returns false
2266/// dynamic shaped memref's for now. `numParamLoopIVs` is the number of
2267/// enclosing loop IVs of `op` (starting from the outermost) that the region
2268/// is parametric on.
2269static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2270 MemRefRegion *region) {
2271 unsigned rank;
2272 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2273 rank = loadOp.getMemRefType().getRank();
2274 region->memref = loadOp.getMemRef();
2275 region->setWrite(false);
2276 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2277 rank = storeOp.getMemRefType().getRank();
2278 region->memref = storeOp.getMemRef();
2279 region->setWrite(true);
2280 } else {
2281 assert(false && "expected load or store op");
2282 return false;
2283 }
2284 auto memRefType = cast<MemRefType>(region->memref.getType());
2285 if (!memRefType.hasStaticShape())
2286 return false;
2287
2288 auto *regionCst = region->getConstraints();
2289
2290 // Just get the first numSymbols IVs, which the memref region is parametric
2291 // on.
2293 getAffineForIVs(*op, &ivs);
2294 ivs.resize(numParamLoopIVs);
2295 SmallVector<Value, 4> symbols;
2296 extractForInductionVars(ivs, &symbols);
2297 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2298 regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2299
2300 // Memref dim sizes provide the bounds.
2301 for (unsigned d = 0; d < rank; d++) {
2302 auto dimSize = memRefType.getDimSize(d);
2303 assert(dimSize > 0 && "filtered dynamic shapes above");
2304 regionCst->addBound(BoundType::LB, d, 0);
2305 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2306 }
2307 return true;
2308}
2309
2310LogicalResult
2311mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2312 const AffineCopyOptions &copyOptions,
2313 std::optional<Value> filterMemRef,
2314 DenseSet<Operation *> &copyNests) {
2315 if (begin == end)
2316 return success();
2317
2318 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2319 "Inconsistent block begin/end args");
2320 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2321
2322 Block *block = begin->getBlock();
2323
2324 // Copies will be generated for this depth, i.e., symbolic in all loops
2325 // surrounding the this block range.
2326 unsigned copyDepth = getNestingDepth(&*begin);
2327
2328 LDBG() << "Generating copies at depth " << copyDepth;
2329 LDBG() << "from begin: "
2330 << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
2331 LDBG() << "to inclusive end: "
2332 << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());
2333
2334 // List of memory regions to copy for. We need a map vector to have a
2335 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2336 // since the alloc's for example are identical except for the SSA id.
2337 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2338 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2339
2340 // Map from original memref's to the fast buffers that their accesses are
2341 // replaced with.
2342 DenseMap<Value, Value> fastBufferMap;
2343
2344 // To check for errors when walking the block.
2345 bool error = false;
2346
2347 // Walk this range of operations to gather all memory regions.
2348 block->walk(begin, end, [&](Operation *opInst) {
2349 Value memref;
2350 MemRefType memrefType;
2351 // Gather regions to allocate to buffers in faster memory space.
2352 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2353 memref = loadOp.getMemRef();
2354 memrefType = loadOp.getMemRefType();
2355 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2356 memref = storeOp.getMemRef();
2357 memrefType = storeOp.getMemRefType();
2358 }
2359 // Not an affine.load/store op.
2360 if (!memref)
2361 return;
2362
2363 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2364 (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2365 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2366 return;
2367
2368 if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2369 LDBG() << "memref definition is inside of the depth at "
2370 << "which copy-in/copy-out would happen";
2371 return;
2372 }
2373
2374 // Compute the MemRefRegion accessed.
2375 auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2376 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2377 /*addMemRefDimBounds=*/false))) {
2378 LDBG() << "Error obtaining memory region: semi-affine maps?";
2379 LDBG() << "over-approximating to the entire memref";
2380 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2381 LDBG() << "non-constant memref sizes not yet supported";
2382 error = true;
2383 return;
2384 }
2385 }
2386
2387 // Each memref has a single buffer associated with it irrespective of how
2388 // many load's and store's happen on it.
2389 // TODO: in the future, when regions don't intersect and satisfy
2390 // other properties (based on load/store regions), we could consider
2391 // multiple buffers per memref.
2392
2393 // Add to the appropriate region if it's not already in it, or take a
2394 // bounding box union with the existing one if it's already in there.
2395 // Note that a memref may have both read and write regions - so update the
2396 // region in the other list if one exists (write in case of read and vice
2397 // versa) since there is a single bounding box for a memref across all reads
2398 // and writes that happen on it.
2399
2400 // Attempts to update; returns true if 'region' exists in targetRegions.
2401 auto updateRegion =
2402 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2403 &targetRegions) {
2404 const auto *const it = targetRegions.find(region->memref);
2405 if (it == targetRegions.end())
2406 return false;
2407
2408 // Perform a union with the existing region.
2409 if (failed(it->second->unionBoundingBox(*region))) {
2410 LDBG() << "Memory region bounding box failed; "
2411 << "over-approximating to the entire memref";
2412 // If the union fails, we will overapproximate.
2413 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2414 LDBG() << "non-constant memref sizes not yet supported";
2415 error = true;
2416 return true;
2417 }
2418 it->second->getConstraints()->clearAndCopyFrom(
2419 *region->getConstraints());
2420 } else {
2421 // Union was computed and stored in 'it->second': copy to 'region'.
2422 region->getConstraints()->clearAndCopyFrom(
2423 *it->second->getConstraints());
2424 }
2425 return true;
2426 };
2427
2428 bool existsInRead = updateRegion(readRegions);
2429 if (error)
2430 return;
2431 bool existsInWrite = updateRegion(writeRegions);
2432 if (error)
2433 return;
2434
2435 // Finally add it to the region list.
2436 if (region->isWrite() && !existsInWrite) {
2437 writeRegions[region->memref] = std::move(region);
2438 } else if (!region->isWrite() && !existsInRead) {
2439 readRegions[region->memref] = std::move(region);
2440 }
2441 });
2442
2443 if (error) {
2444 LDBG() << "copy generation failed for one or more memref's in this block";
2445 return failure();
2446 }
2447
2448 uint64_t totalCopyBuffersSizeInBytes = 0;
2449 bool ret = true;
2450 auto processRegions =
2451 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2452 &regions) {
2453 for (const auto &regionEntry : regions) {
2454 // For each region, hoist copy in/out past all hoistable
2455 // 'affine.for's.
2456 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2457 Block *copyPlacementBlock;
2459 *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2460 &copyInPlacementStart, &copyOutPlacementStart);
2461
2462 uint64_t sizeInBytes;
2463 Block::iterator nBegin, nEnd;
2464 LogicalResult iRet = generateCopy(
2465 *regionEntry.second, block, begin, end, copyPlacementBlock,
2466 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2467 fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2468 if (succeeded(iRet)) {
2469 // begin/end could have been invalidated, and need update.
2470 begin = nBegin;
2471 end = nEnd;
2472 totalCopyBuffersSizeInBytes += sizeInBytes;
2473 }
2474 ret = ret & succeeded(iRet);
2475 }
2476 };
2477 processRegions(readRegions);
2478 processRegions(writeRegions);
2479
2480 if (!ret) {
2481 LDBG() << "copy generation failed for one or more memref's in this block";
2482 return failure();
2483 }
2484
2485 // For a range of operations, a note will be emitted at the caller.
2486 AffineForOp forOp;
2487 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2488 LLVM_DEBUG(forOp.emitRemark()
2489 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2490 << " KiB of copy buffers in fast memory space for this block");
2491 }
2492
2493 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2494 block->getParentOp()->emitWarning(
2495 "total size of all copy buffers' for this block exceeds fast memory "
2496 "capacity");
2497 }
2498
2499 return success();
2500}
2501
2502// A convenience version of affineDataCopyGenerate for all ops in the body of
2503// an AffineForOp.
2504LogicalResult mlir::affine::affineDataCopyGenerate(
2505 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2506 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2507 return affineDataCopyGenerate(forOp.getBody()->begin(),
2508 std::prev(forOp.getBody()->end()), copyOptions,
2509 filterMemRef, copyNests);
2510}
2511
2512LogicalResult mlir::affine::generateCopyForMemRegion(
2513 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2514 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2515 Block *block = analyzedOp->getBlock();
2516 auto begin = analyzedOp->getIterator();
2517 auto end = std::next(begin);
2518 DenseMap<Value, Value> fastBufferMap;
2519 DenseSet<Operation *> copyNests;
2520
2521 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2522 copyOptions, fastBufferMap, copyNests,
2523 &result.sizeInBytes, &begin, &end);
2524 if (failed(err))
2525 return err;
2526
2527 const auto &en = fastBufferMap.find(memrefRegion.memref);
2528 // In some cases (empty loops), no copy generation would have happened.
2529 if (en == fastBufferMap.end())
2530 return failure();
2531 result.alloc = en->second.getDefiningOp();
2532 assert(result.alloc && "fast buffer expected to be locally allocated");
2533 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2534 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2535 return success();
2536}
2537
2538/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2539static void
2540gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2541 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2542 // Add a new empty level to output if it doesn't exist level already.
2543 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2544 if (currLoopDepth == depthToLoops.size())
2545 depthToLoops.emplace_back();
2546
2547 for (auto &op : *block) {
2548 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2549 depthToLoops[currLoopDepth].push_back(forOp);
2550 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2551 }
2552 }
2553}
2554
2555/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2556void mlir::affine::gatherLoops(
2557 func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2558 for (auto &block : func)
2559 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2560
2561 // Remove last loop level from output since it's empty.
2562 if (!depthToLoops.empty()) {
2563 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2564 depthToLoops.pop_back();
2565 }
2566}
2567
2568AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2569 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2570 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2571 SmallVector<Value, 4> lowerOperands(lbOperands);
2572 SmallVector<Value, 4> upperOperands(ubOperands);
2573
2574 fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2575 canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2576 lbMap = removeDuplicateExprs(lbMap);
2577 fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2578 canonicalizeMapAndOperands(&ubMap, &upperOperands);
2579 ubMap = removeDuplicateExprs(ubMap);
2580
2581 return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
2582 step);
2583}
2584
2585/// Creates an AffineIfOp that encodes the conditional to choose between
2586/// the constant trip count version and an unknown trip count version of this
2587/// nest of loops. This is used to separate partial and full tiles if `loops`
2588/// has the intra-tile loops. The affine.if op is inserted at the builder
2589/// insertion point of `b`.
2591 OpBuilder b) {
2592 if (loops.empty())
2593 return nullptr;
2594
2595 auto *context = loops[0].getContext();
2596
2599 llvm::append_range(ops, loops);
2600 (void)getIndexSet(ops, &cst);
2601
2602 // Remove constraints that are independent of these loop IVs.
2603 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2604
2605 // Construct the constraint set representing the guard for full tiles. The
2606 // lower bound (and upper bound) corresponding to the full tile should be
2607 // larger (and resp. smaller) than any other lower (or upper bound).
2608 SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2609 for (auto loop : loops) {
2610 (void)loop;
2611 // TODO: Non-unit stride is not an issue to generalize to.
2612 assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2613 // Mark everything symbols for the purpose of finding a constant diff pair.
2614 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2615 1);
2616 unsigned fullTileLbPos, fullTileUbPos;
2617 if (!((IntegerRelation)cst)
2618 .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2619 /*boundFloorDivisor=*/nullptr,
2620 /*ub=*/nullptr, &fullTileLbPos,
2621 &fullTileUbPos)) {
2622 LDBG() << "Can't get constant diff pair for a loop";
2623 return nullptr;
2624 }
2625
2626 SmallVector<unsigned, 4> lbIndices, ubIndices;
2627 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2628
2629 auto fLb = cst.getInequality(fullTileLbPos);
2630 auto fUb = cst.getInequality(fullTileUbPos);
2631 fullTileLb.assign(fLb.begin(), fLb.end());
2632 fullTileUb.assign(fUb.begin(), fUb.end());
2633
2634 // Full tile lower bound should be >= than any other lower bound.
2635 for (auto lbIndex : lbIndices)
2636 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2637 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2638
2639 // Full tile upper bound should be <= any other upper bound.
2640 for (auto ubIndex : ubIndices)
2641 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2642 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2643
2644 cst.removeVar(0);
2645 }
2646
2647 // The previous step leads to all zeros for the full tile lb and ub position
2648 // itself; remove those and any other duplicates / trivial redundancies.
2650
2651 // Turn everything into dims conservatively since we earlier turned all
2652 // trailing ids past point loop IV into symbols. Some of these could be outer
2653 // loop IVs; we'll canonicalize anyway.
2655
2656 IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2657 // ifCondSet can be null if cst was empty -- this can happen if all loops
2658 // in the nest have constant trip counts.
2659 if (!ifCondSet)
2660 return nullptr;
2661
2662 SmallVector<Value, 4> setOperands;
2663 cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2664 canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2665 return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
2666 /*withElseRegion=*/true);
2667}
2668
2669/// Create the full tile loop nest (along with its body).
2670static LogicalResult
2672 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2673 fullTileLoops.reserve(inputNest.size());
2674
2675 // For each loop in the original nest identify a lower/upper bound pair such
2676 // that their difference is a constant.
2678 for (auto loop : inputNest) {
2679 // TODO: straightforward to generalize to a non-unit stride.
2680 if (loop.getStepAsInt() != 1) {
2681 LDBG() << "[tile separation] non-unit stride not implemented";
2682 return failure();
2683 }
2684 SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2685 (void)getIndexSet(loopOp, &cst);
2686 // We will mark everything other than this loop IV as symbol for getting a
2687 // pair of <lb, ub> with a constant difference.
2689 unsigned lbPos, ubPos;
2690 if (!((IntegerRelation)cst)
2691 .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2692 /*boundFloorDivisor=*/nullptr,
2693 /*ub=*/nullptr, &lbPos, &ubPos) ||
2694 lbPos == ubPos) {
2695 LDBG() << "[tile separation] Can't get constant diff / "
2696 << "equalities not yet handled";
2697 return failure();
2698 }
2699
2700 // Set all variables as dimensions uniformly since some of those marked as
2701 // symbols above could be outer loop IVs (corresponding tile space IVs).
2702 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2703
2704 AffineValueMap lbVmap, ubVmap;
2705 cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2706 cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2707 AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2708 b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2709 ubVmap.getOperands(), ubVmap.getAffineMap());
2710 b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2711 fullTileLoops.push_back(fullTileLoop);
2712 }
2713
2714 // Add the body for the full tile loop nest.
2715 IRMapping operandMap;
2716 for (const auto &loopEn : llvm::enumerate(inputNest))
2717 operandMap.map(loopEn.value().getInductionVar(),
2718 fullTileLoops[loopEn.index()].getInductionVar());
2719 b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2720 for (auto &op : inputNest.back().getBody()->without_terminator())
2721 b.clone(op, operandMap);
2722 return success();
2723}
2724
2725LogicalResult
2726mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2727 SmallVectorImpl<AffineForOp> *fullTileNest) {
2728 if (inputNest.empty())
2729 return success();
2730
2731 auto firstLoop = inputNest[0];
2732
2733 // Each successive for op has to be nested in the other.
2734 auto prevLoop = firstLoop;
2735 for (auto loop : inputNest.drop_front(1)) {
2736 assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2737 prevLoop = loop;
2738 }
2739
2740 // Create the full tile loop nest.
2741 SmallVector<AffineForOp, 4> fullTileLoops;
2742 OpBuilder b(firstLoop);
2743 if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2744 if (!fullTileLoops.empty())
2745 fullTileLoops.front().erase();
2746 return failure();
2747 }
2748
2749 // Create and insert the version select right before the root of the nest.
2750 b = OpBuilder(firstLoop);
2751 AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2752 if (!ifOp) {
2753 fullTileLoops.front().erase();
2754 LDBG() << "All tiles are full tiles, or failure creating "
2755 << "separation condition";
2756 return failure();
2757 }
2758
2759 // Move the full tile into the then block.
2760 Block *thenBlock = ifOp.getThenBlock();
2761 AffineForOp outermostFullTileLoop = fullTileLoops[0];
2762 thenBlock->getOperations().splice(
2763 std::prev(thenBlock->end()),
2764 outermostFullTileLoop->getBlock()->getOperations(),
2765 Block::iterator(outermostFullTileLoop));
2766
2767 // Move the partial tile into the else block. The partial tile is the same as
2768 // the original loop nest.
2769 Block *elseBlock = ifOp.getElseBlock();
2770 elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2771 firstLoop->getBlock()->getOperations(),
2772 Block::iterator(firstLoop));
2773
2774 if (fullTileNest)
2775 *fullTileNest = std::move(fullTileLoops);
2776
2777 return success();
2778}
2779
2780LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2781 LogicalResult result(failure());
2783 getPerfectlyNestedLoops(loops, op);
2784 if (loops.size() <= 1)
2785 return success();
2786
2787 // Look for a band of loops that can be coalesced, i.e. perfectly nested
2788 // loops with bounds defined above some loop.
2789 // 1. For each loop, find above which parent loop its operands are
2790 // defined.
2791 SmallVector<unsigned> operandsDefinedAbove(loops.size());
2792 for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2793 operandsDefinedAbove[i] = i;
2794 for (unsigned j = 0; j < i; ++j) {
2795 if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2796 operandsDefinedAbove[i] = j;
2797 break;
2798 }
2799 }
2800 }
2801
2802 // 2. Identify bands of loops such that the operands of all of them are
2803 // defined above the first loop in the band. Traverse the nest bottom-up
2804 // so that modifications don't invalidate the inner loops.
2805 for (unsigned end = loops.size(); end > 0; --end) {
2806 unsigned start = 0;
2807 for (; start < end - 1; ++start) {
2808 auto maxPos =
2809 *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2810 std::next(operandsDefinedAbove.begin(), end));
2811 if (maxPos > start)
2812 continue;
2813 assert(maxPos == start &&
2814 "expected loop bounds to be known at the start of the band");
2815 auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2816 if (succeeded(coalesceLoops(band)))
2817 result = success();
2818 break;
2819 }
2820 // If a band was found and transformed, keep looking at the loops above
2821 // the outermost transformed loop.
2822 if (start != end - 1)
2823 end = start + 1;
2824 }
2825 return result;
2826}
2827
2829 int64_t count = 0;
2830 Operation *currentOp = operand.getOwner();
2831 while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2832 if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2833 break;
2834 currentOp = loopOp;
2835 count++;
2836 }
2837 return count;
2838}
return success()
lhs
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
*if copies could not be generated due to yet unimplemented cases *copyInPlacementStart and copyOutPlacementStart in copyPlacementBlock *specify the insertion points where the incoming copies and outgoing should be the output argument nBegin is set to its the output argument nEnd is set *to the new end sizeInBytes is set to the size of the fast buffer *allocated *static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
fullyComposeAffineMapAndOperands & fastBufMap
static InFlightDiagnostic emitRemarkForBlock(Block &block)
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 > > &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * > > > &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
auto load
return copyNestRoot
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 > > &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition LoopUtils.cpp:45
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
fastBufExprs
Base type for affine expression.
Definition AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
unsigned getNumInputs() const
AffineExpr getResult(unsigned idx) const
Block represents an ordered list of Operations.
Definition Block.h:33
OpListType::iterator iterator
Definition Block.h:150
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Definition Block.cpp:74
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition Block.cpp:27
OpListType & getOperations()
Definition Block.h:147
Operation & front()
Definition Block.h:163
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition Block.h:318
Operation * getTerminator()
Get the terminator operation of this block.
Definition Block.cpp:249
iterator end()
Definition Block.h:154
iterator begin()
Definition Block.h:153
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition Block.cpp:31
AffineMap getDimIdentityMap()
Definition Builders.cpp:388
AffineExpr getAffineDimExpr(unsigned position)
Definition Builders.cpp:369
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
This is a utility class for mapping one set of IR entities to another.
Definition IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
This class represents a diagnostic that is inflight and set to be reported.
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
This class helps build Operations.
Definition Builders.h:209
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition Builders.cpp:567
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition Builders.h:254
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition Builders.h:414
This class represents an operand of an operation.
Definition Value.h:254
Set of flags used to control the behavior of the various IR print methods (e.g.
A wrapper class that allows for printing an operation with a set of flags, useful to act as a "stream...
Definition Operation.h:1142
This class implements the operand iterators for the Operation class.
Definition ValueRange.h:44
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Block * getBlock()
Returns the operation block that contains this operation.
Definition Operation.h:230
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:240
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:255
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Operation * clone(IRMapping &mapper, const CloneOptions &options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition Operation.h:247
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition Value.h:208
Type getType() const
Return the type of this value.
Definition Value.h:105
static WalkResult advance()
Definition WalkResult.h:47
static WalkResult interrupt()
Definition WalkResult.h:46
AffineBound represents a lower or upper bound in the for operation.
Definition AffineOps.h:217
Value getOperand(unsigned idx)
Definition AffineOps.h:223
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:384
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition Utils.cpp:866
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 > > *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
bool isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition Utils.cpp:852
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition Utils.cpp:1460
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition Utils.cpp:2060
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks where SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
AffineForOp sinkSequentialLoops(AffineForOp forOp)
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
void generateUnrolledLoop(Block *loopBodyBlock, Value iv, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues, IRMapping *clonedToSrcOpsMap=nullptr)
Generate unrolled copies of an scf loop's 'loopBodyBlock', with 'iterArgs' and 'yieldedValues' as the...
Definition Utils.cpp:295
llvm::DenseSet< ValueT, ValueInfoT > DenseSet
Definition LLVM.h:122
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition AffineExpr.h:325
llvm::DenseMap< KeyT, ValueT, KeyInfoT, BucketT > DenseMap
Definition LLVM.h:120
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition RegionUtils.h:26
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition Utils.cpp:1023
llvm::function_ref< Fn > function_ref
Definition LLVM.h:147
int64_t stride
int64_t numEltPerStride
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition LoopUtils.h:159
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition Utils.h:489
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition Utils.cpp:1157
bool isWrite() const
Definition Utils.h:537
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition Utils.cpp:1213
void setWrite(bool flag)
Definition Utils.h:538
FlatAffineValueConstraints * getConstraints()
Definition Utils.h:535
Value memref
Memref that this region corresponds to.
Definition Utils.h:570
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition Utils.h:577
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.