LoopUtils.cpp
1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/DebugLog.h"
29#include "llvm/Support/raw_ostream.h"
30#include <optional>
31
32#define DEBUG_TYPE "loop-utils"
33
34using namespace mlir;
35using namespace affine;
36using namespace presburger;
37using llvm::SmallMapVector;
38
39/// Computes the cleanup loop lower bound of the loop being unrolled with
40/// the specified unroll factor; this bound will also be the upper bound of the main
41/// part of the unrolled loop. Computes the bound as an AffineMap with its
42/// operands or a null map when the trip count can't be expressed as an affine
43/// expression.
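/// For example (an illustrative sketch, not taken from a test case): for
/// `affine.for %i = 0 to 13` unrolled by a factor of 4, the trip count is 13,
/// so the cleanup lower bound evaluates to 0 + (13 - 13 mod 4) = 12; the main
/// unrolled loop then covers [0, 12) and the cleanup loop covers [12, 13).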
44static void
45getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
46 AffineMap &cleanupLbMap,
47 SmallVectorImpl<Value> &cleanupLbOperands) {
48 AffineMap tripCountMap;
49 SmallVector<Value, 4> tripCountOperands;
50 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
51 // Trip count can't be computed.
52 if (!tripCountMap) {
53 cleanupLbMap = AffineMap();
54 return;
55 }
56
57 OpBuilder b(forOp);
58 auto lbMap = forOp.getLowerBoundMap();
59 auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
60 forOp.getLowerBoundOperands());
61
62 // For each upper bound expr, get the range.
63 // Eg: affine.for %i = lb to min (ub1, ub2),
64 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
65 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
66 // these affine.apply's make up the cleanup loop lower bound.
67 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
68 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
69 int64_t step = forOp.getStepAsInt();
70 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
71 auto tripCountExpr = tripCountMap.getResult(i);
72 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
73 auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
74 tripCountMap.getNumSymbols(), bumpExprs[i]);
75 bumpValues[i] =
76 AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
77 }
78
79 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
80 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
81 newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
82
83 cleanupLbOperands.clear();
84 cleanupLbOperands.push_back(lb);
85 cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
86 cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
87 b.getContext());
88 // Simplify the cleanupLbMap + cleanupLbOperands.
89 fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
90 cleanupLbMap = simplifyAffineMap(cleanupLbMap);
91 canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
92 // Remove any affine.apply's that became dead from the simplification above.
93 for (auto v : bumpValues)
94 if (v.use_empty())
95 v.getDefiningOp()->erase();
96
97 if (lb.use_empty())
98 lb.erase();
99}
100
101/// Helper to replace uses of loop carried values (iter_args) and loop
102/// yield values while promoting single iteration affine.for ops.
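/// For instance (illustrative only), when promoting
///   %r = affine.for %i = 0 to 1 iter_args(%acc = %init) -> (f32) {
///     %s = arith.addf %acc, %v : f32
///     affine.yield %s : f32
///   }
/// uses of %acc are rewritten to %init and uses of %r to %s.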
103static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
104 // Replace uses of iter arguments with iter operands (initial values).
105 auto iterOperands = forOp.getInits();
106 auto iterArgs = forOp.getRegionIterArgs();
107 for (auto e : llvm::zip(iterOperands, iterArgs))
108 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
109
110 // Replace uses of loop results with the values yielded by the loop.
111 auto outerResults = forOp.getResults();
112 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
113 for (auto e : llvm::zip(outerResults, innerResults))
114 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
115}
116
117/// Promotes the loop body of a forOp to its containing block if the forOp
118/// is known to have a single iteration.
119LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
120 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
121 if (!tripCount || *tripCount != 1)
122 return failure();
123
124 // TODO: extend this for arbitrary affine bounds.
125 if (forOp.getLowerBoundMap().getNumResults() != 1)
126 return failure();
127
128  // Replace all uses of the IV with its single-iteration value.
129 auto iv = forOp.getInductionVar();
130 auto *parentBlock = forOp->getBlock();
131 if (!iv.use_empty()) {
132 if (forOp.hasConstantLowerBound()) {
133 auto func = forOp->getParentOfType<FunctionOpInterface>();
134 OpBuilder builder(forOp->getContext());
135 if (func)
136 builder.setInsertionPointToStart(&func.getFunctionBody().front());
137 else
138 builder.setInsertionPoint(forOp);
139 auto constOp = arith::ConstantIndexOp::create(
140 builder, forOp.getLoc(), forOp.getConstantLowerBound());
141 iv.replaceAllUsesWith(constOp);
142 } else {
143 auto lbOperands = forOp.getLowerBoundOperands();
144 auto lbMap = forOp.getLowerBoundMap();
145 OpBuilder builder(forOp);
146 if (lbMap == builder.getDimIdentityMap()) {
147 // No need of generating an affine.apply.
148 iv.replaceAllUsesWith(lbOperands[0]);
149 } else {
150 auto affineApplyOp =
151 AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
152 iv.replaceAllUsesWith(affineApplyOp);
153 }
154 }
155 }
156
157  replaceIterArgsAndYieldResults(forOp);
158
159 // Move the loop body operations, except for its terminator, to the loop's
160 // containing block.
161 forOp.getBody()->back().erase();
162 parentBlock->getOperations().splice(Block::iterator(forOp),
163 forOp.getBody()->getOperations());
164 forOp.erase();
165 return success();
166}
167
168/// Generates an affine.for op with the specified lower and upper bounds
169/// while generating the right IV remappings to realize shifts for operations in
170/// its body. The operations that go into the loop body are specified in
171/// opGroupQueue starting from the specified offset, and in that order. The
172/// first element of the pair specifies the shift applied to that group of
173/// operations; the shift is multiplied by the loop step before being applied.
174/// Returns nullptr if the generated loop simplifies to a single iteration one.
175static AffineForOp generateShiftedLoop(
176 AffineMap lbMap, AffineMap ubMap,
177 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
178 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
179 auto lbOperands = srcForOp.getLowerBoundOperands();
180 auto ubOperands = srcForOp.getUpperBoundOperands();
181
182 assert(lbMap.getNumInputs() == lbOperands.size());
183 assert(ubMap.getNumInputs() == ubOperands.size());
184
185 auto loopChunk =
186 AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
187 ubMap, srcForOp.getStepAsInt());
188 auto loopChunkIV = loopChunk.getInductionVar();
189 auto srcIV = srcForOp.getInductionVar();
190
191 IRMapping operandMap;
192
193 auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
194 for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
195 uint64_t shift = it.first;
196 auto ops = it.second;
197 // All 'same shift' operations get added with their operands being
198    // remapped to results of cloned operations, and their IV uses remapped.
199 // Generate the remapping if the shift is not zero: remappedIV = newIV -
200 // shift.
201 if (!srcIV.use_empty() && shift != 0) {
202 auto ivRemap = AffineApplyOp::create(
203 bodyBuilder, srcForOp.getLoc(),
204 bodyBuilder.getSingleDimShiftAffineMap(
205 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
206 loopChunkIV);
207 operandMap.map(srcIV, ivRemap);
208 } else {
209 operandMap.map(srcIV, loopChunkIV);
210 }
211 for (auto *op : ops)
212 bodyBuilder.clone(*op, operandMap);
213  }
214 if (succeeded(promoteIfSingleIteration(loopChunk)))
215 return AffineForOp();
216 return loopChunk;
217}
218
219// The skewing of operations with respect to one another can be used for
220// example to allow overlap of asynchronous operations (such as DMA
221// communication) with computation, or just relative shifting of operations
222// for better register reuse, locality or parallelism. As such, the shifts are
223// typically expected to be at most of the order of the number of operations.
224// This method should not be used as a substitute for loop distribution/fission.
225// This method uses an algorithm that runs in time linear in the number of
226// operations in the body of the for loop (the 'sweep line' paradigm). This
227// method asserts preservation of SSA dominance. Checking for that, as well as
228// for memory-based dependence preservation, rests with the users of this
229// method.
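// As a rough illustration (not from an actual test case): for a loop with a
// constant trip count N and two operations with shifts [0, 1], the result is a
// one-iteration prologue running only the first operation, a steady-state loop
// of N - 1 iterations running both (with the IV of the shifted operation
// remapped by -step), and a one-iteration epilogue running only the second
// operation.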
230LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
231 ArrayRef<uint64_t> shifts,
232 bool unrollPrologueEpilogue) {
233 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234 "too few/many shifts");
235 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
236 return success();
237
238 // If the trip counts aren't constant, we would need versioning and
239 // conditional guards (or context information to prevent such versioning). The
240 // better way to pipeline for such loops is to first tile them and extract
241 // constant trip count "full tiles" before applying this.
242 auto mayBeConstTripCount = getConstantTripCount(forOp);
243 if (!mayBeConstTripCount) {
244 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
245 return success();
246 }
247 uint64_t tripCount = *mayBeConstTripCount;
248
249 assert(isOpwiseShiftValid(forOp, shifts) &&
250 "shifts will lead to an invalid transformation\n");
251
252 int64_t step = forOp.getStepAsInt();
253
254 unsigned numChildOps = shifts.size();
255
256 // Do a linear time (counting) sort for the shifts.
257 uint64_t maxShift = *llvm::max_element(shifts);
258 if (maxShift >= numChildOps) {
259 // Large shifts are not the typical use case.
260 forOp.emitWarning("not shifting because shifts are unrealistically large");
261 return success();
262 }
263
264 // An array of operation groups sorted by shift amount; each group has all
265 // operations with the same shift in the order in which they appear in the
266 // body of the 'affine.for' op.
267 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
268 unsigned pos = 0;
269 for (auto &op : forOp.getBody()->without_terminator()) {
270 auto shift = shifts[pos++];
271 sortedOpGroups[shift].push_back(&op);
272 }
273
274 // Unless the shifts have a specific pattern (which actually would be the
275 // common use case), prologue and epilogue are not meaningfully defined.
276 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
277 // loop generated as the prologue and the last as epilogue and unroll these
278 // fully.
279 AffineForOp prologue, epilogue;
280
281 // Do a sweep over the sorted shifts while storing open groups in a
282 // vector, and generating loop portions as necessary during the sweep. A block
283 // of operations is paired with its shift.
284 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
285
286 auto origLbMap = forOp.getLowerBoundMap();
287 uint64_t lbShift = 0;
288 OpBuilder b(forOp);
289 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
290 // If nothing is shifted by d, continue.
291 if (sortedOpGroups[d].empty())
292 continue;
293 if (!opGroupQueue.empty()) {
294 assert(d > 0 &&
295 "Queue expected to be empty when the first block is found");
296 // The interval for which the loop needs to be generated here is:
297 // [lbShift, min(lbShift + tripCount, d)) and the body of the
298 // loop needs to have all operations in opQueue in that order.
299 AffineForOp res;
300 if (lbShift + tripCount * step < d * step) {
301        res = generateShiftedLoop(
302            b.getShiftedAffineMap(origLbMap, lbShift),
303 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
304 opGroupQueue, /*offset=*/0, forOp, b);
305 // Entire loop for the queued op groups generated, empty it.
306 opGroupQueue.clear();
307 lbShift += tripCount * step;
308 } else {
309 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
310 b.getShiftedAffineMap(origLbMap, d),
311 opGroupQueue, /*offset=*/0, forOp, b);
312 lbShift = d * step;
313 }
314
315 if (res) {
316 // Simplify/canonicalize the affine.for.
317 RewritePatternSet patterns(res.getContext());
318 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
319 bool erased;
320        (void)applyOpPatternsGreedily(
321            res.getOperation(), std::move(patterns),
322 GreedyRewriteConfig().setStrictness(
323                GreedyRewriteStrictness::ExistingAndNewOps),
324            /*changed=*/nullptr, &erased);
325 if (!erased && !prologue)
326 prologue = res;
327 if (!erased)
328 epilogue = res;
329 }
330 } else {
331 // Start of first interval.
332 lbShift = d * step;
333 }
334 // Augment the list of operations that get into the current open interval.
335 opGroupQueue.emplace_back(d, sortedOpGroups[d]);
336 }
337
338 // Those operations groups left in the queue now need to be processed (FIFO)
339 // and their loops completed.
340 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
341 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
342 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
343 b.getShiftedAffineMap(origLbMap, ubShift),
344 opGroupQueue, /*offset=*/i, forOp, b);
345 lbShift = ubShift;
346 if (!prologue)
347 prologue = epilogue;
348 }
349
350 // Erase the original for op.
351 forOp.erase();
352
353 if (unrollPrologueEpilogue && prologue)
354 (void)loopUnrollFull(prologue);
355  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
356 (void)loopUnrollFull(epilogue);
357
358 return success();
359}
360
361/// Checks whether a loop nest is hyper-rectangular or not.
362static LogicalResult
363checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
364  FlatAffineValueConstraints cst;
365  SmallVector<Operation *, 8> ops(input.begin(), input.end());
366 // 0-d or 1-d is trivially hyper-rectangular.
367 if (input.size() <= 1)
368 return success();
369 if (failed(getIndexSet(ops, &cst))) {
370 LDBG() << "Index set computation failed!";
371 return failure();
372 }
373 if (!cst.isHyperRectangular(0, input.size())) {
374 LDBG() << "Non-hyperrectangular nests not supported for tiling!";
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
383static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384                                            ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LDBG() << "Cannot tile nest where a loop has yield values";
390 return failure();
391 }
392
393 // Check if the supplied `for` ops are all successively nested.
394 if (!isPerfectlyNested(input)) {
395 LDBG() << "input loops not perfectly nested";
396 return failure();
397 }
398
399 // TODO: handle non hyper-rectangular spaces.
400 if (failed(checkIfHyperRectangular(input)))
401 return failure();
402
403 return success();
404}
405
406/// Move the loop body of AffineForOp 'src' from 'src' into the specified
407/// location in destination's body, ignoring the terminator.
408static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409 Block::iterator loc) {
410 auto &ops = src.getBody()->getOperations();
411 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412 std::prev(ops.end()));
413}
414
415/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416/// body.
417static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419}
420
421/// Constructs a tiled loop nest, without setting the loop bounds, and moves
422/// the body of the original loop nest to the tiled loop nest.
423static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
424                                   AffineForOp rootAffineForOp, unsigned width,
425 MutableArrayRef<AffineForOp> tiledLoops) {
426 Location loc = rootAffineForOp.getLoc();
427
428 // The outermost among the loops as we add more..
429 Operation *topLoop = rootAffineForOp.getOperation();
430 AffineForOp innermostPointLoop;
431
432 // Add intra-tile (or point) loops.
433 for (unsigned i = 0; i < width; i++) {
434 OpBuilder b(topLoop);
435 // Loop bounds will be set later.
436 AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
437 pointLoop.getBody()->getOperations().splice(
438 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439 topLoop);
440 tiledLoops[2 * width - 1 - i] = pointLoop;
441 topLoop = pointLoop.getOperation();
442 if (i == 0)
443 innermostPointLoop = pointLoop;
444 }
445
446 // Add tile space loops;
447 for (unsigned i = width; i < 2 * width; i++) {
448 OpBuilder b(topLoop);
449 // Loop bounds will be set later.
450 AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
451 tileSpaceLoop.getBody()->getOperations().splice(
452 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453 topLoop);
454 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455 topLoop = tileSpaceLoop.getOperation();
456 }
457
458 // Move the loop body of the original nest to the new one.
459 moveLoopBody(origLoops.back(), innermostPointLoop);
460}
461
462/// Set lower and upper bounds of intra-tile loops for parametric tiling.
463// TODO: Handle non-constant lower bounds.
464static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465 AffineForOp newInterTileLoop,
466 AffineForOp newIntraTileLoop,
467 Value tileSize) {
468 // The lower bound for the intra-tile loop is represented by an affine map
469 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
473 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
474 // corresponding inter-tile loop.
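  // For example (an illustrative instantiation): with %origlb = 0 and an
  // original step of 1, these maps reduce to lb: (%i, %t0) -> (%i * %t0) and
  // ub: (%i, %t0) -> (%i * %t0 + %t0).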
475
476 assert(origLoop.hasConstantLowerBound() &&
477 "expected input loops to have constant lower bound.");
478
479 // Get lower bound of original loop as an affine expression.
480 AffineExpr origLowerBoundExpr;
481 origLowerBoundExpr =
482 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483
484 // Add dim operands from original lower/upper bound.
485 SmallVector<Value, 4> lbOperands, ubOperands;
486 AffineBound lb = origLoop.getLowerBound();
487 AffineBound ub = origLoop.getUpperBound();
488 lbOperands.reserve(lb.getNumOperands() + 2);
489 ubOperands.reserve(ub.getNumOperands() + 2);
490 AffineMap origLbMap = lb.getMap();
491 AffineMap origUbMap = ub.getMap();
492 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493 lbOperands.push_back(lb.getOperand(j));
494 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495 ubOperands.push_back(ub.getOperand(j));
496
497 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498 // IV.
499 lbOperands.push_back(newInterTileLoop.getInductionVar());
500 ubOperands.push_back(newInterTileLoop.getInductionVar());
501
502 // Get loop IV as an affine expression for lower/upper bound. Size of
503  // lb/ubOperands is guaranteed to be at least one.
504 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506
507 // Add symbol operands from original lower/upper bound.
508 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509 lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512
513 // Add a new symbol operand which is the tile size for this loop.
514 lbOperands.push_back(tileSize);
515 ubOperands.push_back(tileSize);
516
517 SmallVector<AffineExpr, 4> lbBoundExprs;
518 SmallVector<AffineExpr, 4> ubBoundExprs;
519 lbBoundExprs.reserve(origLbMap.getNumResults());
520 ubBoundExprs.reserve(origUbMap.getNumResults());
521
522 // Get tiling parameter as an affine expression for lb/ub.
523 AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524 AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525
526 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527 lbBoundExprs.push_back(
528 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529 origLowerBoundExpr);
530
531 // Get the origLoopStep as an affine expression.
532 AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533
534 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535 // (tilingParameter * origLoopStep) + origlb.
536 ubBoundExprs.push_back(
537 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539
540 ubBoundExprs.append(origUbMap.getResults().begin(),
541 origUbMap.getResults().end());
542
543 AffineMap lbMap =
544 AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545 lbBoundExprs, b.getContext());
546 newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547
548 AffineMap ubMap =
549 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550 ubBoundExprs, b.getContext());
551 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552
553 // Original loop step must be preserved.
554 newIntraTileLoop.setStep(origLoop.getStepAsInt());
555}
556
557/// Set lower and upper bounds of inter-tile loops for parametric tiling.
558// TODO: Handle non-constant lower bounds.
559static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560 AffineForOp newLoop, Value tileSize) {
561 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562
563 // The lower bounds for inter-tile loops are same as the corresponding lower
564 // bounds of original loops.
565 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566
567  // The new upper bound map for inter-tile loops, assuming constant lower
568  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
569  // originalLowerBound), tiling parameter), where the tiling parameter is the
570  // respective tile size for that loop. For e.g. if the original ubmap was
571  // ()->(1024), the new map will be
572  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
573  // Therefore a new symbol operand is inserted in the map and the result
574  // expression is overwritten.
575
576 assert(origLoop.hasConstantLowerBound() &&
577 "expected input loops to have constant lower bound.");
578
579 // Get lower bound of original loop as an affine expression.
580 AffineExpr origLowerBoundExpr;
581 origLowerBoundExpr =
582 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583
584 // Add dim operands from original upper bound.
585 SmallVector<Value, 4> ubOperands;
586 AffineBound ub = origLoop.getUpperBound();
587 ubOperands.reserve(ub.getNumOperands() + 1);
588 AffineMap origUbMap = ub.getMap();
589 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590 ubOperands.push_back(ub.getOperand(j));
591
592 // Add symbol operands from original upper bound.
593 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595
596 // Add a new symbol operand which is the tile size for this loop.
597 ubOperands.push_back(tileSize);
598
599 // Get tiling parameter as an affine expression.
600 AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601
602  SmallVector<AffineExpr, 4> boundExprs;
603 boundExprs.reserve(origUbMap.getNumResults());
604 int64_t origUpperBound;
605 AffineExpr origUpperBoundExpr;
606
607 // If upper bound for the original loop is constant, then the constant can
608 // be obtained as an affine expression straight away.
609 if (origLoop.hasConstantUpperBound()) {
610 origUpperBound = origLoop.getConstantUpperBound();
611
612 // Get original constant upper bound as an affine expression.
613 origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614
615    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
616    // originalLowerBound), tilingParameter).
617 boundExprs.push_back(
618 origLowerBoundExpr +
619 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620 } else {
621    // If the upper bound of the original loop is not constant, then two cases
622    // are possible, although their handling is the same: 1.) The result of
623 // ubmap has only one result expression. For e.g.
624 // affine.for %i = 5 to %ub
625 //
626 // A symbol operand is added which represents the tiling parameter. The
627 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628 // where 's0' is the original upper bound and 's1' is the tiling
629 // parameter. 2.) When ubMap has more than one result expression. For e.g.
630    // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
631 // affine.for %i = 5 to min #map0()[%s0, %s1]
632 //
633 // A symbol operand is added which represents the tiling parameter. The
634 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636
637 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638 // originalLowerBound), tilingParameter).
639 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640 boundExprs.push_back(
641 origLowerBoundExpr +
642 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643 }
644
645 AffineMap ubMap =
646 AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647 boundExprs, b.getContext());
648 newLoop.setUpperBound(ubOperands, ubMap);
649
650 // Original loop step must be preserved.
651 newLoop.setStep(origLoop.getStepAsInt());
652}
653
654/// Constructs and sets new loop bounds after tiling for the case of
655/// hyper-rectangular index sets, where the bounds of one dimension do not
656/// depend on other dimensions and tiling parameters are captured from SSA
657/// values. Bounds of each dimension can thus be treated independently,
658/// and deriving the new bounds is much simpler and faster than for the case of
659/// tiling arbitrary polyhedral shapes.
660static void constructParametricallyTiledIndexSetHyperRect(
661    MutableArrayRef<AffineForOp> origLoops,
662    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663  assert(!origLoops.empty() && "expected at least one loop in band");
664 assert(origLoops.size() == tileSizes.size() &&
665 "expected tiling parameter for each loop in band.");
666
667 OpBuilder b(origLoops[0].getOperation());
668 unsigned width = origLoops.size();
669
670 // Set bounds for tile space loops.
671 for (unsigned i = 0; i < width; ++i) {
672 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673 }
674
675 // Set bounds for intra-tile loops.
676 for (unsigned i = 0; i < width; ++i) {
677 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678 newLoops[i + width], tileSizes[i]);
679 }
680}
681
682/// Constructs and sets new loop bounds after tiling for the case of
683/// hyper-rectangular index sets, where the bounds of one dimension do not
684/// depend on other dimensions. Bounds of each dimension can thus be treated
685/// independently, and deriving the new bounds is much simpler and faster
686/// than for the case of tiling arbitrary polyhedral shapes.
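/// As a sketch of the common case: tiling `affine.for %i = 0 to 256 step 1`
/// with a tile size of 32 yields a tile-space loop from 0 to 256 with step 32
/// and an intra-tile loop from %ii to %ii + 32 with step 1 (where %ii is the
/// tile-space IV); the min expression on the intra-tile upper bound is only
/// needed when the tile size does not divide the trip count.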
687static void
688constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
689                                MutableArrayRef<AffineForOp> newLoops,
690                                ArrayRef<unsigned> tileSizes) {
691 assert(!origLoops.empty());
692 assert(origLoops.size() == tileSizes.size());
693
694 OpBuilder b(origLoops[0].getOperation());
695 unsigned width = origLoops.size();
696
697 // Bounds for tile space loops.
698 for (unsigned i = 0; i < width; i++) {
699 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703 // If the step size of original loop is x and tileSize is y then after
704 // tiling the tile space loops' step size becomes x*y.
705 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706 }
707 // Bounds for intra-tile loops.
708 for (unsigned i = 0; i < width; i++) {
709 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710 std::optional<uint64_t> mayBeConstantCount =
711 getConstantTripCount(origLoops[i]);
712 // The lower bound is just the tile-space loop.
713 AffineMap lbMap = b.getDimIdentityMap();
714 newLoops[width + i].setLowerBound(
715 /*operands=*/newLoops[i].getInductionVar(), lbMap);
716 // The step sizes of intra-tile loops is just the original loops' step size.
717 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
718
719 // Set the upper bound.
720 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
721 // Trip count is less than the tile size: upper bound is lower bound +
722 // trip count * stepSize.
723 AffineMap ubMap = b.getSingleDimShiftAffineMap(
724 *mayBeConstantCount * origLoops[i].getStepAsInt());
725 newLoops[width + i].setUpperBound(
726 /*operands=*/newLoops[i].getInductionVar(), ubMap);
727 } else if (largestDiv % tileSizes[i] != 0) {
728 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
729 // Construct the upper bound map; the operands are the original operands
730 // with 'i' (tile-space loop) appended to it. The new upper bound map is
731 // the original one with an additional expression i + tileSize * stepSize
732 // appended.
733
734 // Add dim operands from original upper bound.
735 SmallVector<Value, 4> ubOperands;
736 AffineBound ub = origLoops[i].getUpperBound();
737 ubOperands.reserve(ub.getNumOperands() + 1);
738 AffineMap origUbMap = ub.getMap();
739 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
740 ubOperands.push_back(ub.getOperand(j));
741
742 // Add dim operand for new loop upper bound.
743 ubOperands.push_back(newLoops[i].getInductionVar());
744
745 // Add symbol operands from original upper bound.
746 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
747 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
748
749      SmallVector<AffineExpr, 4> boundExprs;
750 boundExprs.reserve(1 + origUbMap.getNumResults());
751 AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
752 // The new upper bound map is the original one with an additional
753 // expression i + tileSize * stepSize (of original loop) appended.
754 boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
755 boundExprs.append(origUbMap.getResults().begin(),
756 origUbMap.getResults().end());
757 AffineMap ubMap =
758 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
759 boundExprs, b.getContext());
760 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
761 } else {
762 // No need of the min expression.
763 AffineExpr dim = b.getAffineDimExpr(0);
764      AffineMap ubMap = AffineMap::get(
765          1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
766 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
767 }
768 }
769}
770
771LogicalResult
772mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
773                                  ArrayRef<unsigned> tileSizes,
774 SmallVectorImpl<AffineForOp> *tiledNest) {
775 if (input.empty())
776 return success();
777
778 if (failed(performPreTilingChecks(input, tileSizes)))
779 return failure();
780
781 MutableArrayRef<AffineForOp> origLoops = input;
782 AffineForOp rootAffineForOp = origLoops[0];
783
784 // Note that width is at least one since the band isn't empty.
785 unsigned width = input.size();
786 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
787
788 // Construct a tiled loop nest without setting their bounds. Bounds are
789 // set later.
790 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
791
792 SmallVector<Value, 8> origLoopIVs;
793 extractForInductionVars(input, &origLoopIVs);
794
795 // Set loop bounds for the tiled loop nest.
796 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
797
798 // Replace original IVs with intra-tile loop IVs.
799 for (unsigned i = 0; i < width; i++)
800 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
801
802 // Erase the old loop nest.
803 rootAffineForOp.erase();
804
805 if (tiledNest)
806 *tiledNest = std::move(tiledLoops);
807
808 return success();
809}
810
811/// Tiles the specified band of perfectly nested loops, creating tile-space
812/// loops and intra-tile loops, using SSA values as tiling parameters. A band
813/// is a contiguous set of loops.
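/// A typical call might look like the following sketch (variable names are
/// hypothetical):
///   SmallVector<AffineForOp> band = ...;      // perfectly nested loops
///   SmallVector<Value> tileSizeValues = ...;  // one SSA tile size per loop
///   SmallVector<AffineForOp> tiled;
///   if (failed(tilePerfectlyNestedParametric(band, tileSizeValues, &tiled)))
///     return failure();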
814LogicalResult mlir::affine::tilePerfectlyNestedParametric(
815    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
816    SmallVectorImpl<AffineForOp> *tiledNest) {
817 if (input.empty())
818 return success();
819
820 if (failed(performPreTilingChecks(input, tileSizes)))
821 return failure();
822
823 MutableArrayRef<AffineForOp> origLoops = input;
824 AffineForOp rootAffineForOp = origLoops[0];
825 unsigned width = input.size();
826 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
827
828 // Construct a tiled loop nest without setting their bounds. Bounds are
829 // set later.
830 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
831
832 SmallVector<Value, 8> origLoopIVs;
833 extractForInductionVars(input, &origLoopIVs);
834
835 // Set loop bounds for the tiled loop nest.
836  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
837                                                tileSizes);
838
839 // Replace original IVs with intra-tile loop IVs.
840 for (unsigned i = 0; i < width; i++)
841 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
842
843 // Erase the old loop nest.
844 rootAffineForOp.erase();
845
846 if (tiledNest)
847 *tiledNest = std::move(tiledLoops);
848
849 return success();
850}
851
852/// Get the perfectly nested sequence of loops starting at the root of the
853/// loop nest.
854/// A loop is perfectly nested iff the first op in the loop's body is another
855/// AffineForOp and the second op is the terminator.
856void mlir::affine::getPerfectlyNestedLoops(
857    SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
858 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
859 nestedLoops.push_back(root);
860 Block &body = root.getRegion().front();
861 if (body.begin() != std::prev(body.end(), 2))
862 return;
863
864 root = dyn_cast<AffineForOp>(&body.front());
865 if (!root)
866 return;
867 }
868}
869
870/// Unrolls this loop completely.
871LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
872 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
873 if (mayBeConstantTripCount.has_value()) {
874 uint64_t tripCount = *mayBeConstantTripCount;
875 if (tripCount == 0)
876 return success();
877 if (tripCount == 1)
878 return promoteIfSingleIteration(forOp);
879 return loopUnrollByFactor(forOp, tripCount);
880 }
881 return failure();
882}
883
884/// Unrolls this loop by the specified factor or by the trip count (if
885/// constant), whichever is lower.
886LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
887 uint64_t unrollFactor) {
888 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
889 if (mayBeConstantTripCount.has_value() &&
890 *mayBeConstantTripCount < unrollFactor)
891 return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
892 return loopUnrollByFactor(forOp, unrollFactor);
893}
894
895/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
896/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
897/// unrolled body. If specified, annotates the Ops in each unrolled iteration
898/// using annotateFn.
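/// For instance, for an unroll factor of 4 and an original step `s`, the
/// `ivRemapFn` used by loopUnrollByFactor below maps the i-th unrolled copy
/// (1 <= i <= 3) of the body to the value iv + i * s.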
899static void generateUnrolledLoop(
900    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
901 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
902 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
903 ValueRange iterArgs, ValueRange yieldedValues) {
904 // Builder to insert unrolled bodies just before the terminator of the body of
905 // 'forOp'.
906 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
907
908 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
909 if (!annotateFn)
910 annotateFn = defaultAnnotateFn;
911
912 // Keep a pointer to the last non-terminator operation in the original block
913 // so that we know what to clone (since we are doing this in-place).
914 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
915
916 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
917 SmallVector<Value, 4> lastYielded(yieldedValues);
918
919 for (unsigned i = 1; i < unrollFactor; i++) {
920 IRMapping operandMap;
921
922 // Prepare operand map.
923 operandMap.map(iterArgs, lastYielded);
924
925 // If the induction variable is used, create a remapping to the value for
926 // this unrolled instance.
927 if (!forOpIV.use_empty()) {
928 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
929 operandMap.map(forOpIV, ivUnroll);
930 }
931
932 // Clone the original body of 'forOp'.
933 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
934 Operation *clonedOp = builder.clone(*it, operandMap);
935 annotateFn(i, clonedOp, builder);
936 }
937
938 // Update yielded values. If the yielded value is defined outside the
939 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
940 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
941 // value with the clone corresponding to the yielded value.
942 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
943 Operation *defOp = yieldedValues[i].getDefiningOp();
944 if (defOp && defOp->getBlock() == loopBodyBlock)
945 lastYielded[i] = operandMap.lookup(yieldedValues[i]);
946 }
947 }
948
949 // Make sure we annotate the Ops in the original body. We do this last so that
950 // any annotations are not copied into the cloned Ops above.
951 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
952 annotateFn(0, &*it, builder);
953
954 // Update operands of the yield statement.
955 loopBodyBlock->getTerminator()->setOperands(lastYielded);
956}
957
958/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
959/// count is not a multiple of `unrollFactor`.
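/// For instance (a sketch): unrolling `affine.for %i = 0 to %N step 1` by 4
/// adjusts the original loop to iterate over [0, %N - %N mod 4) and creates a
/// cleanup copy of it over [%N - %N mod 4, %N); the caller then scales the
/// step of the main loop.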
960static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
961 uint64_t unrollFactor) {
962 // Insert the cleanup loop right after 'forOp'.
963 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
964 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
965
966  // Update uses of `forOp` results: `cleanupForOp` should use `forOp`'s results
967  // as its init operands and produce results for the original users of `forOp`.
968 auto results = forOp.getResults();
969 auto cleanupResults = cleanupForOp.getResults();
970 auto cleanupIterOperands = cleanupForOp.getInits();
971
972 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
973 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
974 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
975 }
976
977 AffineMap cleanupMap;
978 SmallVector<Value, 4> cleanupOperands;
979 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
980 if (!cleanupMap)
981 return failure();
982
983 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
984 // Promote the loop body up if this has turned into a single iteration loop.
985 (void)promoteIfSingleIteration(cleanupForOp);
986
987 // Adjust upper bound of the original loop; this is the same as the lower
988 // bound of the cleanup loop.
989 forOp.setUpperBound(cleanupOperands, cleanupMap);
990 return success();
991}
992
993/// Unrolls this loop by the specified factor. Returns success if the loop
994/// is successfully unrolled.
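/// Illustrative effect (modulo exact map/attribute syntax) of unrolling by 2:
///   affine.for %i = 0 to 16 step 1 { "op"(%i) }
/// becomes
///   affine.for %i = 0 to 16 step 2 {
///     "op"(%i)
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "op"(%i1)
///   }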
995LogicalResult mlir::affine::loopUnrollByFactor(
996    AffineForOp forOp, uint64_t unrollFactor,
997 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
998 bool cleanUpUnroll) {
999 assert(unrollFactor > 0 && "unroll factor should be positive");
1000
1001 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1002 if (unrollFactor == 1) {
1003 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1004 return failure();
1005 return success();
1006 }
1007
1008 // Nothing in the loop body other than the terminator.
1009 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1010 return success();
1011
1012 // If the trip count is lower than the unroll factor, no unrolled body.
1013 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1014 if (cleanUpUnroll) {
1015 // Unroll the cleanup loop if cleanUpUnroll is specified.
1016 return loopUnrollFull(forOp);
1017 }
1018
1019 return failure();
1020 }
1021
1022 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1023 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1024 // Loops where the lower bound is a max expression or the upper bound is
1025 // a min expression and the trip count doesn't divide the unroll factor
1026 // can't be unrolled since the lower bound of the cleanup loop in such cases
1027 // cannot be expressed as an affine function or a max over affine functions.
1028 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1029 forOp.getUpperBoundMap().getNumResults() != 1)
1030 return failure();
1031 if (cleanUpUnroll)
1032 // Force unroll including cleanup loop
1033 return loopUnrollFull(forOp);
1034 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1035 assert(false && "cleanup loop lower bound map for single result lower "
1036 "and upper bound maps can always be determined");
1037 }
1038
1039 ValueRange iterArgs(forOp.getRegionIterArgs());
1040 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1041
1042 // Scale the step of loop being unrolled by unroll factor.
1043 int64_t step = forOp.getStepAsInt();
1044 forOp.setStep(step * unrollFactor);
1045  generateUnrolledLoop(
1046      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1047 [&](unsigned i, Value iv, OpBuilder b) {
1048 // iv' = iv + i * step
1049 auto d0 = b.getAffineDimExpr(0);
1050 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1051 return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
1052 },
1053 /*annotateFn=*/annotateFn,
1054 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1055
1056 // Promote the loop body up if this has turned into a single iteration loop.
1057  (void)promoteIfSingleIteration(forOp);
1058  return success();
1059}
1060
1061LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1062 uint64_t unrollJamFactor) {
1063 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1064 if (mayBeConstantTripCount.has_value() &&
1065 *mayBeConstantTripCount < unrollJamFactor)
1066 return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1067 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1068}
1069
1070/// Check if all control operands of all loops are defined outside of `forOp`
1071/// and return false if not.
1072static bool areInnerBoundsInvariant(AffineForOp forOp) {
1073 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1074 for (auto controlOperand : aForOp.getControlOperands()) {
1075 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1076 return WalkResult::interrupt();
1077 }
1078 return WalkResult::advance();
1079 });
1080 return !walkResult.wasInterrupted();
1081}
1082
1083/// Unrolls and jams this loop by the specified factor.
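/// Illustrative effect (modulo exact map syntax) of unroll-and-jam by 2 on a
/// loop containing an inner loop:
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 { "op"(%i, %j) }
///   }
/// becomes
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to 8 {
///       "op"(%i, %j)
///       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///       "op"(%i1, %j)
///     }
///   }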
1084LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1085 uint64_t unrollJamFactor) {
1086 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1087
1088 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1089 if (unrollJamFactor == 1) {
1090 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1091 return failure();
1092 return success();
1093 }
1094
1095 // Nothing in the loop body other than the terminator.
1096 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1097 return success();
1098
1099 // If the trip count is lower than the unroll jam factor, no unroll jam.
1100 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1101 LDBG() << "[failed] trip count < unroll-jam factor";
1102 return failure();
1103 }
1104
1105 // If any control operand of any inner loop of `forOp` is defined within
1106 // `forOp`, no unroll jam.
1107 if (!areInnerBoundsInvariant(forOp))
1108 return failure();
1109
1110 // Gather all sub-blocks to jam upon the loop being unrolled.
1111  JamBlockGatherer<AffineForOp> jbg;
1112  jbg.walk(forOp);
1113 auto &subBlocks = jbg.subBlocks;
1114
1115 // Collect loops with iter_args.
1116 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1117 forOp.walk([&](AffineForOp aForOp) {
1118 if (aForOp.getNumIterOperands() > 0)
1119 loopsWithIterArgs.push_back(aForOp);
1120 });
1121
1122 // Get supported reductions to be used for creating reduction ops at the end.
1123 SmallVector<LoopReduction> reductions;
1124 if (forOp.getNumIterOperands() > 0)
1125 getSupportedReductions(forOp, reductions);
1126
1127 // Generate the cleanup loop if trip count isn't a multiple of
1128 // unrollJamFactor.
1129 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1130 // Loops where the lower bound is a max expression or the upper bound is
1131 // a min expression and the trip count doesn't divide the unroll factor
1132 // can't be unrolled since the lower bound of the cleanup loop in such cases
1133 // cannot be expressed as an affine function or a max over affine functions.
1134 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1135 forOp.getUpperBoundMap().getNumResults() != 1)
1136 return failure();
1137 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1138 assert(false && "cleanup loop lower bound map for single result lower "
1139 "and upper bound maps can always be determined");
1140 }
1141
1142 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1143 // iteration. There are (`unrollJamFactor` - 1) iterations.
1144 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1145
1146 // For any loop with iter_args, replace it with a new loop that has
1147 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1148 // operands.
1149 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1150 IRRewriter rewriter(forOp.getContext());
1151 for (AffineForOp oldForOp : loopsWithIterArgs) {
1152 SmallVector<Value> dupIterOperands, dupYieldOperands;
1153 ValueRange oldIterOperands = oldForOp.getInits();
1154 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1155 ValueRange oldYieldOperands =
1156 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1157 // Get additional iterOperands, iterArgs, and yield operands. We will
1158 // fix iterOperands and yield operands after cloning of sub-blocks.
1159 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1160 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1161 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1162 }
1163 // Create a new loop with additional iterOperands, iter_args and yield
1164 // operands. This new loop will take the loop body of the original loop.
1165 bool forOpReplaced = oldForOp == forOp;
1166 AffineForOp newForOp =
1167 cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1168 rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1169 [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1170 return dupYieldOperands;
1171 }));
1172 newLoopsWithIterArgs.push_back(newForOp);
1173 // `forOp` has been replaced with a new loop.
1174 if (forOpReplaced)
1175 forOp = newForOp;
1176 // Update `operandMaps` for `newForOp` iterArgs and results.
1177 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1178 unsigned oldNumIterArgs = oldIterArgs.size();
1179 ValueRange newResults = newForOp.getResults();
1180 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1181 assert(oldNumIterArgs == oldNumResults &&
1182 "oldNumIterArgs must be the same as oldNumResults");
1183 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1184 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1185 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1186 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1187 // to those in the `i`th new set.
1188 operandMaps[i - 1].map(newIterArgs[j],
1189 newIterArgs[i * oldNumIterArgs + j]);
1190 operandMaps[i - 1].map(newResults[j],
1191 newResults[i * oldNumResults + j]);
1192 }
1193 }
1194 }
1195
1196 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1197 int64_t step = forOp.getStepAsInt();
1198 forOp.setStep(step * unrollJamFactor);
1199
1200 auto forOpIV = forOp.getInductionVar();
1201 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1202 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1203 for (auto &subBlock : subBlocks) {
1204 // Builder to insert unroll-jammed bodies. Insert right at the end of
1205 // sub-block.
1206 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1207
1208 // If the induction variable is used, create a remapping to the value for
1209 // this unrolled instance.
1210 if (!forOpIV.use_empty()) {
1211 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1212 auto d0 = builder.getAffineDimExpr(0);
1213 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1214 auto ivUnroll =
1215 AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
1216 operandMaps[i - 1].map(forOpIV, ivUnroll);
1217 }
1218 // Clone the sub-block being unroll-jammed.
1219 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1220 builder.clone(*it, operandMaps[i - 1]);
1221 }
1222 // Fix iterOperands and yield op operands of newly created loops.
1223 for (auto newForOp : newLoopsWithIterArgs) {
1224 unsigned oldNumIterOperands =
1225 newForOp.getNumIterOperands() / unrollJamFactor;
1226 unsigned numControlOperands = newForOp.getNumControlOperands();
1227 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1228 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1229 assert(oldNumIterOperands == oldNumYieldOperands &&
1230 "oldNumIterOperands must be the same as oldNumYieldOperands");
1231 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1232 // The `i`th duplication of an old iterOperand or yield op operand
1233 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1234 // if such mapped value exists.
1235 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1236 operandMaps[i - 1].lookupOrDefault(
1237 newForOp.getOperand(numControlOperands + j)));
1238 yieldOp.setOperand(
1239 i * oldNumYieldOperands + j,
1240 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1241 }
1242 }
1243 }
1244 if (forOp.getNumResults() > 0) {
1245 // Create reduction ops to combine every `unrollJamFactor` related results
1246 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1247 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1248 // %1.
1249 rewriter.setInsertionPointAfter(forOp);
1250 auto loc = forOp.getLoc();
1251 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1252 for (LoopReduction &reduction : reductions) {
1253 unsigned pos = reduction.iterArgPosition;
1254 Value lhs = forOp.getResult(pos);
1255 Value rhs;
1256      SmallPtrSet<Operation *, 4> newOps;
1257      for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1258 rhs = forOp.getResult(i * oldNumResults + pos);
1259 // Create ops based on reduction type.
1260 lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1261 if (!lhs)
1262 return failure();
1263 Operation *op = lhs.getDefiningOp();
1264 assert(op && "Reduction op should have been created");
1265 newOps.insert(op);
1266 }
1267 // Replace all uses except those in newly created reduction ops.
1268 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1269 }
1270 }
1271
1272 // Promote the loop body up if this has turned into a single iteration loop.
1273  (void)promoteIfSingleIteration(forOp);
1274  return success();
1275}
1276
1277/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1278/// nested within 'forOpA' as the only non-terminator operation in its block.
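/// For example (a sketch), interchanging the two loops of
///   affine.for %i ... { affine.for %j ... { "op"(%i, %j) } }
/// produces
///   affine.for %j ... { affine.for %i ... { "op"(%i, %j) } }
/// No IV uses need remapping, since only the loop bodies are spliced.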
1279void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1280 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1281 auto &forOpABody = forOpA.getBody()->getOperations();
1282 auto &forOpBBody = forOpB.getBody()->getOperations();
1283
1284 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1285 // before forOpA (in ForOpA's parent's block) this should leave 'forOpA's
1286 // body containing only the terminator.
1287 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1288 forOpABody, forOpABody.begin(),
1289 std::prev(forOpABody.end()));
1290 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1291 // body (this leaves forOpB's body containing only the terminator).
1292 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1293 std::prev(forOpBBody.end()));
1294 // 3) Splice forOpA into the beginning of forOpB's body.
1295 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1296 Block::iterator(forOpA));
1297}
1298
1299// Checks each dependence component against the permutation to see if the
1300// desired loop interchange would violate dependences by making the
1301// dependence component lexicographically negative.
1302static bool checkLoopInterchangeDependences(
1303    const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1304 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1305 // Invert permutation map.
1306 unsigned maxLoopDepth = loops.size();
1307 SmallVector<unsigned, 4> loopPermMapInv;
1308 loopPermMapInv.resize(maxLoopDepth);
1309 for (unsigned i = 0; i < maxLoopDepth; ++i)
1310 loopPermMapInv[loopPermMap[i]] = i;
1311
1312 // Check each dependence component against the permutation to see if the
1313 // desired loop interchange permutation would make the dependence vectors
1314 // lexicographically negative.
1315 // Example 1: [-1, 1][0, 0]
1316 // Example 2: [0, 0][-1, 1]
1317 for (const auto &depComps : depCompsVec) {
1318 assert(depComps.size() >= maxLoopDepth);
1319 // Check if the first non-zero dependence component is positive.
1320 // This iterates through loops in the desired order.
1321 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1322 unsigned permIndex = loopPermMapInv[j];
1323 assert(depComps[permIndex].lb);
1324 int64_t depCompLb = *depComps[permIndex].lb;
1325 if (depCompLb > 0)
1326 break;
1327 if (depCompLb < 0)
1328 return false;
1329 }
1330 }
1331 return true;
1332}
1333
1334/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1335/// nested sequence of loops in 'loops' would violate dependences.
1336bool mlir::affine::isValidLoopInterchangePermutation(
1337    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1338 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1339 unsigned maxLoopDepth = loops.size();
1340 if (maxLoopDepth == 1)
1341 return true;
1342
1343 // We cannot guarantee the validity of the interchange if the loops have
1344 // iter_args, since the dependence analysis does not take them into account.
1345 // Conservatively return false in such cases.
1346 if (llvm::any_of(loops, [](AffineForOp loop) {
1347 return loop.getNumIterOperands() > 0;
1348 }))
1349 return false;
1350
1351 // Gather dependence components for dependences between all ops in loop nest
1352 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1353 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1354 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1355 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1356}
1357
1358/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1359/// in it from outermost to innermost.
1360[[maybe_unused]] bool
1362 assert(!loops.empty() && "no loops provided");
1363
1364 // We already know that the block can't be empty.
1365 auto hasTwoElements = [](Block *block) {
1366 auto secondOpIt = std::next(block->begin());
1367 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1368 };
1369
1370 auto enclosingLoop = loops.front();
1371 for (auto loop : loops.drop_front()) {
1372 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1373 // parentForOp's body should be just this loop and the terminator.
1374 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1375 return false;
1376 enclosingLoop = loop;
1377 }
1378 return true;
1379}
1380
1381// input[i] should move from position i -> permMap[i]. Returns the position in
1382// `input` that becomes the new outermost loop.
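// Illustrative note (added; not part of the original source): with input =
// (%i, %j, %k) listed outermost to innermost and permMap = {2, 0, 1}, %i
// moves to the innermost position, %j becomes outermost, and %k ends up in
// the middle; the return value is 1 since input[1] (%j) is the new outermost
// loop.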
1383unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1384 ArrayRef<unsigned> permMap) {
1385 assert(input.size() == permMap.size() && "invalid permutation map size");
1386 // Check whether the permutation spec is valid. This is a small vector - we'll
1387 // just sort and check if it's iota.
1388 SmallVector<unsigned, 4> checkPermMap(permMap);
1389 llvm::sort(checkPermMap);
1390 if (llvm::any_of(llvm::enumerate(checkPermMap),
1391 [](const auto &en) { return en.value() != en.index(); }))
1392 assert(false && "invalid permutation map");
1393
1394 // Nothing to do.
1395 if (input.size() < 2)
1396 return 0;
1397
1398 assert(isPerfectlyNested(input) && "input not perfectly nested");
1399
1400 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1401 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1402 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1403 for (unsigned i = 0, e = input.size(); i < e; ++i)
1404 invPermMap.push_back({permMap[i], i});
1405 llvm::sort(invPermMap);
1406
1407 // Move the innermost loop body to the loop that would be the innermost in the
1408 // permuted nest (only if the innermost loop is going to change).
1409 if (permMap.back() != input.size() - 1) {
1410 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1411 Block *srcBody = ((AffineForOp)input.back()).getBody();
1412 destBody->getOperations().splice(destBody->begin(),
1413 srcBody->getOperations(), srcBody->begin(),
1414 std::prev(srcBody->end()));
1415 }
1416
1417 // We'll move each loop in `input` in the reverse order so that its body is
1418 // empty when we are moving it; this incurs zero copies and no erasing.
1419 for (int i = input.size() - 1; i >= 0; --i) {
1420 // If this has to become the outermost loop after permutation, add it to the
1421 // parent block of the original root.
1422 if (permMap[i] == 0) {
1423 // If the root remains the same, nothing to do.
1424 if (i == 0)
1425 continue;
1426 // Make input[i] the new outermost loop moving it into parentBlock.
1427 auto *parentBlock = input[0]->getBlock();
1428 parentBlock->getOperations().splice(Block::iterator(input[0]),
1429 input[i]->getBlock()->getOperations(),
1430 Block::iterator(input[i]));
1431 continue;
1432 }
1433
1434 // If the parent in the permuted order is the same as in the original,
1435 // nothing to do.
1436 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1437 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1438 continue;
1439
1440 // Move input[i] to its surrounding loop in the transformed nest.
1441 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1442 destBody->getOperations().splice(destBody->begin(),
1443 input[i]->getBlock()->getOperations(),
1444 Block::iterator(input[i]));
1445 }
1446
1447 return invPermMap[0].second;
1448}
1449
1450// Sinks all sequential loops to the innermost levels (while preserving
1451// relative order among them) and moves all parallel loops to the
1452// outermost (while again preserving relative order among them).
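// Illustrative note (added; not part of the original source): if dependence
// analysis classifies (%i, %j, %k) as (parallel, sequential, parallel), the
// permutation computed below is loopPermMap = {0, 2, 1}: %j is sunk to the
// innermost position while %i and %k keep their relative order. The
// interchange is only applied if it keeps all dependence vectors
// lexicographically non-negative.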
1453AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1454 SmallVector<AffineForOp, 4> loops;
1455 getPerfectlyNestedLoops(loops, forOp);
1456 if (loops.size() < 2)
1457 return forOp;
1458
1459 // Gather dependence components for dependences between all ops in loop nest
1460 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1461 unsigned maxLoopDepth = loops.size();
1462 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1463 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1464
1465 // Mark loops as either parallel or sequential.
1466 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1467 for (auto &depComps : depCompsVec) {
1468 assert(depComps.size() >= maxLoopDepth);
1469 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1470 DependenceComponent &depComp = depComps[j];
1471 assert(depComp.lb.has_value() && depComp.ub.has_value());
1472 if (*depComp.lb != 0 || *depComp.ub != 0)
1473 isParallelLoop[j] = false;
1474 }
1475 }
1476
1477 unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1478
1479 // Compute permutation of loops that sinks sequential loops (and thus raises
1480 // parallel loops) while preserving relative order.
1481 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1482 unsigned nextSequentialLoop = numParallelLoops;
1483 unsigned nextParallelLoop = 0;
1484 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1485 if (isParallelLoop[i]) {
1486 loopPermMap[i] = nextParallelLoop++;
1487 } else {
1488 loopPermMap[i] = nextSequentialLoop++;
1489 }
1490 }
1491
1492 // Check if permutation 'loopPermMap' would violate dependences.
1493 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1494 return forOp;
1495 // Perform loop interchange according to permutation 'loopPermMap'.
1496 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1497 return loops[loopNestRootIndex];
1498}
1499
1500// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1501// lower (resp. upper) loop bound. When called for both the lower and upper
1502// bounds, the resulting IR resembles:
1503//
1504// ```mlir
1505// affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1506// ...
1507// }
1508// ```
1509static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1510 SmallVector<Value, 4> *operands,
1511 int64_t offset = 0) {
1512 auto bounds = llvm::to_vector<4>(map->getResults());
1513 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1514 operands->insert(operands->begin() + map->getNumDims(), iv);
1515 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1516 b.getContext());
1517 canonicalizeMapAndOperands(map, operands);
1518}
1519
1520// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1521// Stripmine-sink is a primitive building block for generalized tiling of
1522// imperfectly nested loops.
1523// This transformation is purely mechanical and does not check legality,
1524// profitability or even structural correctness. It is the user's
1525// responsibility to specify `targets` that are dominated by `forOp`.
1526// Returns the new AffineForOps, one per `targets`, nested immediately under
1527// each of the `targets`.
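// Illustrative sketch (added; not part of the original source, bounds shown
// in simplified form): strip-mining a loop %i with bounds [0, 1024) and unit
// step by a factor of 32 and sinking it under a target loop %j roughly yields:
//
//   affine.for %i = 0 to 1024 step 32 {
//     affine.for %j = ... {
//       affine.for %ii = max(0, %i) to min(1024, %i + 32) { ... }
//     }
//   }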
1528static SmallVector<AffineForOp, 8>
1529stripmineSink(AffineForOp forOp, uint64_t factor,
1530 ArrayRef<AffineForOp> targets) {
1531 auto originalStep = forOp.getStepAsInt();
1532 auto scaledStep = originalStep * factor;
1533 forOp.setStep(scaledStep);
1534
1535 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1536
1537 // Lower-bound map creation.
1538 auto lbMap = forOp.getLowerBoundMap();
1539 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1540 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1541
1542 // Upper-bound map creation.
1543 auto ubMap = forOp.getUpperBoundMap();
1544 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1545 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1546 /*offset=*/scaledStep);
1547
1548 auto iv = forOp.getInductionVar();
1549 SmallVector<AffineForOp, 8> innerLoops;
1550 for (auto t : targets) {
1551 // Insert newForOp before the terminator of `t`.
1552 auto b = OpBuilder::atBlockTerminator(t.getBody());
1553 auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
1554 ubOperands, ubMap, originalStep);
1555 auto begin = t.getBody()->begin();
1556 // Skip terminator and `newForOp` which is just before the terminator.
1557 auto nOps = t.getBody()->getOperations().size() - 2;
1558 newForOp.getBody()->getOperations().splice(
1559 newForOp.getBody()->getOperations().begin(),
1560 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1561 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1562 newForOp.getRegion());
1563 innerLoops.push_back(newForOp);
1564 }
1565
1566 return innerLoops;
1567}
1568
1569// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1570// Returns the new AffineForOp, nested immediately under `target`.
1571template <typename SizeType>
1572static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1573 AffineForOp target) {
1574 // TODO: Use cheap structural assertions that targets are nested under
1575 // forOp and that targets are not nested under each other when DominanceInfo
1576 // exposes the capability. It seems overkill to construct a whole function
1577 // dominance tree at this point.
1578 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1579 assert(res.size() == 1 && "Expected 1 inner forOp");
1580 return res[0];
1581}
1582
1583std::vector<SmallVector<AffineForOp, 8>>
1584mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1585 ArrayRef<AffineForOp> targets) {
1586 std::vector<SmallVector<AffineForOp, 8>> res;
1587 SmallVector<AffineForOp, 8> currentTargets(targets);
1588 for (auto it : llvm::zip(forOps, sizes)) {
1589 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1590 res.push_back(step);
1591 currentTargets = step;
1592 }
1593 return res;
1594}
1595
1596SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1597 ArrayRef<uint64_t> sizes,
1598 AffineForOp target) {
1599 SmallVector<AffineForOp, 8> res;
1600 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1601 res.push_back(llvm::getSingleElement(loops));
1602 return res;
1603}
1604
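// Illustrative sketch (added; not part of the original source; constant
// bounds assumed for readability): coalescing the normalized 2-d nest
//
//   affine.for %i = 0 to 32 {
//     affine.for %j = 0 to 8 { ... use %i, %j ... }
//   }
//
// yields a single loop over the product of the trip counts, with the original
// induction variables recovered via floordiv/mod:
//
//   affine.for %ii = 0 to 256 {
//     %i = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%ii)
//     %j = affine.apply affine_map<(d0) -> (d0 mod 8)>(%ii)
//     ... use %i, %j ...
//   }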
1605LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1606 if (loops.size() < 2)
1607 return success();
1608
1609 AffineForOp innermost = loops.back();
1610 AffineForOp outermost = loops.front();
1611 AffineBound ub = outermost.getUpperBound();
1612 AffineMap origUbMap = ub.getMap();
1613 Location loc = outermost.getLoc();
1614 OpBuilder builder(outermost);
1615 for (AffineForOp loop : loops) {
1616 // We only work on normalized loops.
1617 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1618 loop.getConstantLowerBound() != 0)
1619 return failure();
1620 }
1621 SmallVector<Value, 4> upperBoundSymbols;
1622 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1623 ub.getOperands().end());
1624
1625 // 1. Store the upper bound of the outermost loop in a variable.
1626 Value prev;
1627 if (!llvm::hasSingleElement(origUbMap.getResults()))
1628 prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1629 else
1630 prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1631 upperBoundSymbols.push_back(prev);
1632
1633 // 2. Emit code computing the upper bound of the coalesced loop as product of
1634 // the number of iterations of all loops.
1635 for (AffineForOp loop : loops.drop_front()) {
1636 ub = loop.getUpperBound();
1637 origUbMap = ub.getMap();
1638 ubOperands = ub.getOperands();
1639 Value upperBound;
1640 // If upper bound map has more than one result, take their minimum.
1641 if (!llvm::hasSingleElement(origUbMap.getResults()))
1642 upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1643 else
1644 upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1645 upperBoundSymbols.push_back(upperBound);
1646 SmallVector<Value, 4> operands;
1647 operands.push_back(prev);
1648 operands.push_back(upperBound);
1649 // Maintain running product of loop upper bounds.
1650 prev = AffineApplyOp::create(
1651 builder, loc,
1652 AffineMap::get(/*dimCount=*/1,
1653 /*symbolCount=*/1,
1654 builder.getAffineDimExpr(0) *
1655 builder.getAffineSymbolExpr(0)),
1656 operands);
1657 }
1658 // Set upper bound of the coalesced loop.
1659 AffineMap newUbMap = AffineMap::get(
1660 /*dimCount=*/0,
1661 /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1662 outermost.setUpperBound(prev, newUbMap);
1663
1664 builder.setInsertionPointToStart(outermost.getBody());
1665
1666 // 3. Remap induction variables. For each original loop, the value of the
1667 // induction variable can be obtained by dividing the induction variable of
1668 // the linearized loop by the total number of iterations of the loops nested
1669 // in it modulo the number of iterations in this loop (remove the values
1670 // related to the outer loops):
1671 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1672 // Compute these iteratively from the innermost loop by creating a "running
1673 // quotient" of division by the range.
1674 Value previous = outermost.getInductionVar();
1675 for (unsigned idx = loops.size(); idx > 0; --idx) {
1676 if (idx != loops.size()) {
1677 SmallVector<Value, 4> operands;
1678 operands.push_back(previous);
1679 operands.push_back(upperBoundSymbols[idx]);
1680 previous = AffineApplyOp::create(builder, loc,
1681 AffineMap::get(
1682 /*dimCount=*/1, /*symbolCount=*/1,
1683 builder.getAffineDimExpr(0).floorDiv(
1684 builder.getAffineSymbolExpr(0))),
1685 operands);
1686 }
1687 // Modified value of the induction variables of the nested loops after
1688 // coalescing.
1689 Value inductionVariable;
1690 if (idx == 1) {
1691 inductionVariable = previous;
1692 } else {
1693 SmallVector<Value, 4> applyOperands;
1694 applyOperands.push_back(previous);
1695 applyOperands.push_back(upperBoundSymbols[idx - 1]);
1696 inductionVariable = AffineApplyOp::create(
1697 builder, loc,
1698 AffineMap::get(
1699 /*dimCount=*/1, /*symbolCount=*/1,
1700 builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1701 applyOperands);
1702 }
1703 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1704 inductionVariable, loops.back().getRegion());
1705 }
1706
1707 // 4. Move the operations from the innermost loop body to just above the
1708 // second-outermost loop; delete the extra terminator and that loop.
1709 AffineForOp secondOutermostLoop = loops[1];
1710 innermost.getBody()->back().erase();
1711 outermost.getBody()->getOperations().splice(
1712 Block::iterator(secondOutermostLoop.getOperation()),
1713 innermost.getBody()->getOperations());
1714 secondOutermostLoop.erase();
1715 return success();
1716}
1717
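// Illustrative note (added; not part of the original source): with
// processorId = (%bx, %tx) and numProcessors = (%nbx, %ntx), a loop
// `scf.for %i = %lb to %ub step %s` is rewritten so that its lower bound
// becomes %lb + (%bx * %ntx + %tx) * %s and its step becomes
// %s * %nbx * %ntx, i.e., each processor starts at its linearized id and
// strides by the total number of processors.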
1718void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1719 ArrayRef<Value> processorId,
1720 ArrayRef<Value> numProcessors) {
1721 assert(processorId.size() == numProcessors.size());
1722 if (processorId.empty())
1723 return;
1724
1725 OpBuilder b(forOp);
1726 Location loc(forOp.getLoc());
1727 AffineExpr lhs, rhs;
1728 bindSymbols(forOp.getContext(), lhs, rhs);
1729 auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1730 auto addMap = AffineMap::get(0, 2, lhs + rhs);
1731
1732 Value linearIndex = processorId.front();
1733 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1734 auto mulApplyOp = AffineApplyOp::create(
1735 b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1736 linearIndex = AffineApplyOp::create(b, loc, addMap,
1737 ValueRange{mulApplyOp, processorId[i]});
1738 }
1739
1740 auto mulApplyOp = AffineApplyOp::create(
1741 b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1742 Value lb = AffineApplyOp::create(
1743 b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1744 forOp.setLowerBound(lb);
1745
1746 Value step = forOp.getStep();
1747 for (auto numProcs : numProcessors)
1748 step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
1749 forOp.setStep(step);
1750}
1751
1752/// Given a memref region, determine the lowest depth at which transfers can be
1753/// placed for it, and return the corresponding block, start and end positions
1754/// in the block for placing incoming (read) and outgoing (write) copies
1755/// respectively. The lowest depth depends on whether the region being accessed
1756/// is hoistable with respect to one or more immediately surrounding loops.
1757static void
1758findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1759 Block::iterator &begin, Block::iterator &end,
1760 Block **copyPlacementBlock,
1761 Block::iterator *copyInPlacementStart,
1762 Block::iterator *copyOutPlacementStart) {
1763 const auto *cst = region.getConstraints();
1764 SmallVector<Value, 4> symbols;
1765 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1766
1767 SmallVector<Operation *, 4> enclosingAffineOps;
1768 getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1769 // Walk up loop parents till we find an IV on which this region is
1770 // symbolic/variant or we hit `hoistGuard`.
1771 auto it = enclosingAffineOps.rbegin();
1772 AffineForOp lastInvariantFor;
1773 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1774 Operation *enclosingOp = *it;
1775 // We can't hoist past the definition of the memref being copied.
1776 Value memref = region.memref;
1777 if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1778 LDBG() << "memref definition will end up not dominating hoist location";
1779 break;
1780 }
1781
1782 auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1783 if (!affineFor)
1784 break;
1785 // TODO: also need to be checking this for region symbols that
1786 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1787 if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1788 break;
1789 lastInvariantFor = affineFor;
1790 }
1791
1792 if (it != enclosingAffineOps.rbegin()) {
1793 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1794 *copyOutPlacementStart = std::next(*copyInPlacementStart);
1795 *copyPlacementBlock = lastInvariantFor->getBlock();
1796 } else {
1797 *copyInPlacementStart = begin;
1798 *copyOutPlacementStart = end;
1799 *copyPlacementBlock = &block;
1800 }
1801}
1802
1803// Info comprising stride and number of elements transferred every stride.
1804struct StrideInfo {
1805 int64_t stride;
1806 int64_t numEltPerStride;
1807};
1808
1809/// Returns striding information for a copy/transfer of this region with
1810/// potentially multiple striding levels from outermost to innermost. For an
1811/// n-dimensional region, there can be at most n-1 levels of striding
1812/// successively nested.
1813// TODO: make this work with non-identity layout maps.
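// Illustrative note (added; not part of the original source): for a
// memref<128x128xf32> whose accessed region (and hence buffer shape) is
// 128x32, only the innermost dimension has a partial extent, so a single
// stride level is recorded with stride = 128 (elements between the starts of
// consecutive rows of the original memref) and numEltPerStride = 32
// (contiguous elements transferred per stride).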
1814static void getMultiLevelStrides(const MemRefRegion &region,
1815 ArrayRef<int64_t> bufferShape,
1816 SmallVectorImpl<StrideInfo> *strideInfos) {
1817 if (bufferShape.size() <= 1)
1818 return;
1819
1820 int64_t numEltPerStride = 1;
1821 int64_t stride = 1;
1822 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1823 int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1824 stride *= dimSize;
1825 numEltPerStride *= bufferShape[d];
1826 // A stride is needed only if the region has a shorter extent than the
1827 // memref along the dimension *and* has an extent greater than one along the
1828 // next major dimension.
1829 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1830 strideInfos->push_back({stride, numEltPerStride});
1831 }
1832 }
1833}
1834
1835/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1836/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1837/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1838/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1839/// contain the expressions to be subtracted out from the respective copy loop
1840/// iterators in order to index the fast buffer. If `copyOut' is true, generates
1841/// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1842/// copy nest is inserted.
1843//
1844/// The copy-in nest is generated as follows as an example for a 2-d region:
1845/// for x = ...
1846/// for y = ...
1847/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1848///
1849static AffineForOp
1850generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1851 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1852 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1853 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1854 OpBuilder b) {
1855 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1856 return lbMap.getNumInputs() == lbOperands.size();
1857 }));
1858 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1859 return ubMap.getNumInputs() == ubOperands.size();
1860 }));
1861
1862 unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1863 // A copy nest can't be generated for 0-ranked memrefs.
1864 assert(rank != 0 && "non-zero rank memref expected");
1865 assert(lbMaps.size() == rank && "wrong number of lb maps");
1866 assert(ubMaps.size() == rank && "wrong number of ub maps");
1867
1868 SmallVector<Value, 4> memIndices;
1869 SmallVector<AffineExpr, 4> fastBufExprs;
1870 SmallVector<Value, 4> fastBufMapOperands;
1871 AffineForOp copyNestRoot;
1872 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1873 for (unsigned d = 0; d < rank; ++d) {
1874 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1875 ubOperands, ubMaps[d]);
1876 if (d == 0)
1877 copyNestRoot = forOp;
1878
1879 b = OpBuilder::atBlockTerminator(forOp.getBody());
1880
1881 auto fastBufOffsetMap =
1882 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1883 auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);
1884
1885 // Construct the subscript for the fast memref being copied into/from:
1886 // x - offset_x.
1887 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1888 b.getAffineDimExpr(2 * d));
1889 fastBufMapOperands.push_back(offset);
1890 fastBufMapOperands.push_back(forOp.getInductionVar());
1891 mayBeDeadApplys.push_back(offset);
1892
1893 // Subscript for the slow memref being copied.
1894 memIndices.push_back(forOp.getInductionVar());
1895 }
1896
1897 auto fastBufMap =
1898 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1899 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1900 fastBufMap = simplifyAffineMap(fastBufMap);
1901 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1902
1903 // Drop any dead affine.applys.
1904 for (auto applyOp : mayBeDeadApplys)
1905 if (applyOp.use_empty())
1906 applyOp.erase();
1907
1908 if (!isCopyOut) {
1909 // Copy in.
1910 auto load = AffineLoadOp::create(b, loc, memref, memIndices);
1911 AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
1912 fastBufMapOperands);
1913 return copyNestRoot;
1914 }
1915
1916 // Copy out.
1917 auto load =
1918 AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
1919 AffineStoreOp::create(b, loc, load, memref, memIndices);
1920 return copyNestRoot;
1921}
1922
1923[[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
1924 return block.getParentOp()->emitRemark();
1925}
1926
1927/// Creates a buffer in the faster memory space for the specified memref region
1928/// (memref has to be non-zero ranked); generates a copy from the lower memory
1929/// space to this one, and replaces all loads/stores in the block range
1930/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1931/// if copies could not be generated due to yet unimplemented cases.
1932/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1933/// specify the insertion points where the incoming copies and outgoing copies,
1934/// respectively, should be inserted (the insertion happens right before the
1935/// insertion point). Since `begin` can itself be invalidated due to the memref
1936/// rewriting done from this method, the output argument `nBegin` is set to its
1937/// replacement (set to `begin` if no invalidation happens). Since outgoing
1938/// copies could have been inserted at `end`, the output argument `nEnd` is set
1939/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1940/// allocated.
1941static LogicalResult generateCopy(
1942 const MemRefRegion &region, Block *block, Block::iterator begin,
1943 Block::iterator end, Block *copyPlacementBlock,
1944 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1945 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1946 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1947 Block::iterator *nBegin, Block::iterator *nEnd) {
1948 *nBegin = begin;
1949 *nEnd = end;
1950
1951 auto f = begin->getParentOfType<FunctionOpInterface>();
1952 OpBuilder topBuilder(f.getFunctionBody());
1953 Value zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);
1954
1955 *sizeInBytes = 0;
1956
1957 if (begin == end)
1958 return success();
1959
1960 // Record the last op in the block for which we are performing copy
1961 // generation. We later do the memref replacement only in [begin, lastCopyOp]
1962 // so that the original memref's used in the data movement code themselves
1963 // don't get replaced.
1964 Operation *lastCopyOp = end->getPrevNode();
1965
1966 // Is the copy out point at the end of the block where we are doing
1967 // explicit copying.
1968 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1969
1970 // Copies for read regions are going to be inserted at 'begin'.
1971 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1972 // Copies for write regions are going to be inserted at 'end'.
1973 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1974 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1975
1976 // Builder to create constants at the top level.
1977 auto func =
1978 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1979 OpBuilder top(func.getFunctionBody());
1980
1981 auto loc = region.loc;
1982 auto memref = region.memref;
1983 auto memRefType = cast<MemRefType>(memref.getType());
1984
1985 if (!memRefType.getLayout().isIdentity()) {
1986 LDBG() << "Non-identity layout map not yet supported";
1987 return failure();
1988 }
1989
1990 // Indices to use for the copying.
1991 // Indices for the original memref being copied from/to.
1992 SmallVector<Value, 4> memIndices;
1993 // Indices for the faster buffer being copied into/from.
1994 SmallVector<Value, 4> bufIndices;
1995
1996 unsigned rank = memRefType.getRank();
1997 if (rank == 0) {
1998 LDBG() << "Non-zero ranked memrefs supported";
1999 return failure();
2000 }
2001
2002 SmallVector<int64_t, 4> fastBufferShape;
2003
2004 // Compute the extents of the buffer.
2005 SmallVector<AffineMap, 4> lbs;
2006 lbs.reserve(rank);
2007 std::optional<int64_t> numElements =
2008 region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2009 if (!numElements) {
2010 LDBG() << "Non-constant region size not supported";
2011 return failure();
2012 }
2013
2014 if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2015 // This can be supported in the future if needed.
2016 LDBG() << "Max lower bound for memref region start not supported";
2017 return failure();
2018 }
2019
2020 if (*numElements == 0) {
2021 LDBG() << "Nothing to copy";
2022 return success();
2023 }
2024
2025 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2026 for (unsigned i = 0; i < rank; ++i) {
2027 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2028 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2029 LDBG() << "Missing lower or upper bound for region along dimension: "
2030 << i;
2031 return failure();
2032 }
2033 }
2034
2035 const FlatAffineValueConstraints *cst = region.getConstraints();
2036 // 'regionSymbols' hold values that this memory region is symbolic/parametric
2037 // on; these typically include loop IVs surrounding the level at which the
2038 // copy generation is being done or other valid symbols in MLIR.
2039 SmallVector<Value, 8> regionSymbols;
2040 cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2041
2042 // Construct the access expression for the fast memory buffer. The access
2043 // expression for a particular dimension of the fast buffer is obtained by
2044 // subtracting out the lower bound on the original memref's data region
2045 // along the corresponding dimension.
2046
2047 // Index start offsets for faster memory buffer relative to the original.
2048 SmallVector<AffineExpr, 4> fastBufOffsets;
2049 fastBufOffsets.reserve(rank);
2050 for (unsigned d = 0; d < rank; d++) {
2051 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2052 "incorrect bound size");
2053
2054 // Set copy start location for this dimension in the lower memory space
2055 // memref.
2056 if (lbs[d].isSingleConstant()) {
2057 auto indexVal = lbs[d].getSingleConstantResult();
2058 if (indexVal == 0) {
2059 memIndices.push_back(zeroIndex);
2060 } else {
2061 memIndices.push_back(
2062 arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
2063 }
2064 } else {
2065 // The coordinate for the start location is just the lower bound along the
2066 // corresponding dimension on the memory region (stored in 'offset').
2067 // Remap all inputs of the map to dimensions uniformly since in the
2068 // generated IR we need valid affine symbols as opposed to "symbols" for
2069 // the purpose of the memref region.
2070 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2071 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2072 symReplacements[i] = top.getAffineDimExpr(i);
2073 lbs[d] = lbs[d].replaceDimsAndSymbols(
2074 /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2075 /*numResultSyms=*/0);
2076 memIndices.push_back(
2077 AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
2078 }
2079 // The fast buffer is copied into at location zero; addressing is relative.
2080 bufIndices.push_back(zeroIndex);
2081
2082 // Record the offsets since they are needed to remap the memory accesses of
2083 // the original memref further below.
2084 fastBufOffsets.push_back(lbs[d].getResult(0));
2085 }
2086
2087 // The faster memory space buffer.
2088 Value fastMemRef;
2089
2090 // Check if a buffer was already created.
2091 bool existingBuf = fastBufferMap.count(memref) > 0;
2092 if (!existingBuf) {
2093 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2094 auto fastMemRefType =
2095 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2096 fastBufferLayout, copyOptions.fastMemorySpace);
2097
2098 // Create the fast memory space buffer just before the 'affine.for'
2099 // operation.
2100 fastMemRef =
2101 memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
2102 // Record it.
2103 fastBufferMap[memref] = fastMemRef;
2104 // fastMemRefType is a constant shaped memref.
2105 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2106 // We don't account for things of unknown size.
2107 *sizeInBytes = maySizeInBytes.value_or(0);
2108
2109 LLVM_DEBUG(emitRemarkForBlock(*block)
2110 << "Creating fast buffer of type " << fastMemRefType
2111 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2112 << " KiB\n");
2113 } else {
2114 // Reuse the one already created.
2115 fastMemRef = fastBufferMap[memref];
2116 }
2117
2118 auto numElementsSSA = arith::ConstantIndexOp::create(top, loc, *numElements);
2119
2120 Value dmaStride;
2121 Value numEltPerDmaStride;
2122 if (copyOptions.generateDma) {
2123 SmallVector<StrideInfo, 4> dmaStrideInfos;
2124 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2125
2126 // TODO: use all stride levels once DmaStartOp is extended for
2127 // multi-level strides.
2128 if (dmaStrideInfos.size() > 1) {
2129 LDBG() << "Only up to one level of stride supported";
2130 return failure();
2131 }
2132
2133 if (!dmaStrideInfos.empty()) {
2134 dmaStride =
2135 arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
2136 numEltPerDmaStride = arith::ConstantIndexOp::create(
2137 top, loc, dmaStrideInfos[0].numEltPerStride);
2138 }
2139 }
2140
2141 // Create fully composed affine maps for each memref.
2142 auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2143 fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2144 auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2145 fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2146
2147 if (!copyOptions.generateDma) {
2148 // Point-wise copy generation.
2149 auto copyNest =
2150 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2151 /*lbOperands=*/regionSymbols, ubMaps,
2152 /*ubOperands=*/regionSymbols, fastBufOffsets,
2153 /*isCopyOut=*/region.isWrite(), b);
2154
2155 // Record this so that we can skip it from yet another copy.
2156 copyNests.insert(copyNest);
2157
2158 // Since new ops are being appended (for copy out's), adjust the end to
2159 // mark end of block range being processed if necessary.
2160 if (region.isWrite() && isCopyOutAtEndOfBlock)
2161 *nEnd = Block::iterator(copyNest.getOperation());
2162 } else {
2163 // DMA generation.
2164 // Create a tag (single element 1-d memref) for the DMA.
2165 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2166 copyOptions.tagMemorySpace);
2167 auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);
2168
2169 SmallVector<Value, 4> tagIndices({zeroIndex});
2170 auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2171 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2172 if (!region.isWrite()) {
2173 // DMA non-blocking read from original buffer to fast buffer.
2174 AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
2175 fastMemRef, bufAffineMap, bufIndices, tagMemRef,
2176 tagAffineMap, tagIndices, numElementsSSA,
2177 dmaStride, numEltPerDmaStride);
2178 } else {
2179 // DMA non-blocking write from fast buffer to the original memref.
2180 auto op = AffineDmaStartOp::create(
2181 b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2182 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2183 dmaStride, numEltPerDmaStride);
2184 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2185 // end to mark end of block range being processed.
2186 if (isCopyOutAtEndOfBlock)
2187 *nEnd = Block::iterator(op.getOperation());
2188 }
2189
2190 // Matching DMA wait to block on completion; tag always has a 0 index.
2191 AffineDmaWaitOp::create(b, loc, tagMemRef, tagAffineMap, zeroIndex,
2192 numElementsSSA);
2193
2194 // Generate dealloc for the tag.
2195 auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
2196 if (*nEnd == end && isCopyOutAtEndOfBlock)
2197 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2198 // mark end of range of the original.
2199 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2200 }
2201
2202 // Generate dealloc for the buffer.
2203 if (!existingBuf) {
2204 auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
2205 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2206 // the fast buffer (since it marks the new end insertion point).
2207 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2208 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2209 }
2210
2211 // Replace all uses of the old memref with the faster one while remapping
2212 // access indices (subtracting out lower bound offsets for each dimension).
2213 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2214 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2215 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2216 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2217 // d2, d3 correspond to the original indices (%i, %j).
2218 SmallVector<AffineExpr, 4> remapExprs;
2219 remapExprs.reserve(rank);
2220 for (unsigned i = 0; i < rank; i++) {
2221 // The starting operands of indexRemap will be regionSymbols (the symbols on
2222 // which the memref region is parametric); then those corresponding to
2223 // the memref's original indices follow.
2224 auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2225 remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2226 }
2227 auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2228 b.getContext());
2229
2230 // Record the begin since it may be invalidated by memref replacement.
2231 Block::iterator prevOfBegin;
2232 bool isBeginAtStartOfBlock = (begin == block->begin());
2233 if (!isBeginAtStartOfBlock)
2234 prevOfBegin = std::prev(begin);
2235
2236 auto userFilterFn = [&](Operation *user) {
2237 auto *ancestorUser = block->findAncestorOpInBlock(*user);
2238 return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2239 !lastCopyOp->isBeforeInBlock(ancestorUser);
2240 };
2241
2242 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2243 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2244 /*extraIndices=*/{}, indexRemap,
2245 /*extraOperands=*/regionSymbols,
2246 /*symbolOperands=*/{}, userFilterFn);
2247
2248 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2249
2250 return success();
2251}
2252
2253/// Construct the memref region to just include the entire memref. Returns
2254/// false for dynamic shaped memrefs for now. `numParamLoopIVs` is the number of
2255/// enclosing loop IVs of `op` (starting from the outermost) that the region
2256/// is parametric on.
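/// Illustrative note (added; not part of the original source): for an
/// affine.load from a memref<64x128xf32>, the constructed region is simply
/// 0 <= d0 <= 63, 0 <= d1 <= 127, symbolic in the first `numParamLoopIVs`
/// enclosing loop IVs (which remain unconstrained here).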
2257static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2258 MemRefRegion *region) {
2259 unsigned rank;
2260 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2261 rank = loadOp.getMemRefType().getRank();
2262 region->memref = loadOp.getMemRef();
2263 region->setWrite(false);
2264 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2265 rank = storeOp.getMemRefType().getRank();
2266 region->memref = storeOp.getMemRef();
2267 region->setWrite(true);
2268 } else {
2269 assert(false && "expected load or store op");
2270 return false;
2271 }
2272 auto memRefType = cast<MemRefType>(region->memref.getType());
2273 if (!memRefType.hasStaticShape())
2274 return false;
2275
2276 auto *regionCst = region->getConstraints();
2277
2278 // Just get the first numSymbols IVs, which the memref region is parametric
2279 // on.
2280 SmallVector<AffineForOp, 4> ivs;
2281 getAffineForIVs(*op, &ivs);
2282 ivs.resize(numParamLoopIVs);
2283 SmallVector<Value, 4> symbols;
2284 extractForInductionVars(ivs, &symbols);
2285 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2286 regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2287
2288 // Memref dim sizes provide the bounds.
2289 for (unsigned d = 0; d < rank; d++) {
2290 auto dimSize = memRefType.getDimSize(d);
2291 assert(dimSize > 0 && "filtered dynamic shapes above");
2292 regionCst->addBound(BoundType::LB, d, 0);
2293 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2294 }
2295 return true;
2296}
2297
2298LogicalResult
2299mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2300 const AffineCopyOptions &copyOptions,
2301 std::optional<Value> filterMemRef,
2302 DenseSet<Operation *> &copyNests) {
2303 if (begin == end)
2304 return success();
2305
2306 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2307 "Inconsistent block begin/end args");
2308 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2309
2310 Block *block = begin->getBlock();
2311
2312 // Copies will be generated for this depth, i.e., symbolic in all loops
2313 // surrounding this block range.
2314 unsigned copyDepth = getNestingDepth(&*begin);
2315
2316 LDBG() << "Generating copies at depth " << copyDepth;
2317 LDBG() << "from begin: "
2318 << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
2319 LDBG() << "to inclusive end: "
2320 << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());
2321
2322 // List of memory regions to copy for. We need a map vector to have a
2323 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2324 // since the alloc's for example are identical except for the SSA id.
2325 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2326 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2327
2328 // Map from original memref's to the fast buffers that their accesses are
2329 // replaced with.
2330 DenseMap<Value, Value> fastBufferMap;
2331
2332 // To check for errors when walking the block.
2333 bool error = false;
2334
2335 // Walk this range of operations to gather all memory regions.
2336 block->walk(begin, end, [&](Operation *opInst) {
2337 Value memref;
2338 MemRefType memrefType;
2339 // Gather regions to allocate to buffers in faster memory space.
2340 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2341 memref = loadOp.getMemRef();
2342 memrefType = loadOp.getMemRefType();
2343 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2344 memref = storeOp.getMemRef();
2345 memrefType = storeOp.getMemRefType();
2346 }
2347 // Not an affine.load/store op.
2348 if (!memref)
2349 return;
2350
2351 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2352 (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2353 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2354 return;
2355
2356 if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2357 LDBG() << "memref definition is inside of the depth at "
2358 << "which copy-in/copy-out would happen";
2359 return;
2360 }
2361
2362 // Compute the MemRefRegion accessed.
2363 auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2364 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2365 /*addMemRefDimBounds=*/false))) {
2366 LDBG() << "Error obtaining memory region: semi-affine maps?";
2367 LDBG() << "over-approximating to the entire memref";
2368 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2369 LDBG() << "non-constant memref sizes not yet supported";
2370 error = true;
2371 return;
2372 }
2373 }
2374
2375 // Each memref has a single buffer associated with it irrespective of how
2376 // many load's and store's happen on it.
2377 // TODO: in the future, when regions don't intersect and satisfy
2378 // other properties (based on load/store regions), we could consider
2379 // multiple buffers per memref.
2380
2381 // Add to the appropriate region if it's not already in it, or take a
2382 // bounding box union with the existing one if it's already in there.
2383 // Note that a memref may have both read and write regions - so update the
2384 // region in the other list if one exists (write in case of read and vice
2385 // versa) since there is a single bounding box for a memref across all reads
2386 // and writes that happen on it.
2387
2388 // Attempts to update; returns true if 'region' exists in targetRegions.
2389 auto updateRegion =
2390 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2391 &targetRegions) {
2392 const auto *const it = targetRegions.find(region->memref);
2393 if (it == targetRegions.end())
2394 return false;
2395
2396 // Perform a union with the existing region.
2397 if (failed(it->second->unionBoundingBox(*region))) {
2398 LDBG() << "Memory region bounding box failed; "
2399 << "over-approximating to the entire memref";
2400 // If the union fails, we will overapproximate.
2401 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2402 LDBG() << "non-constant memref sizes not yet supported";
2403 error = true;
2404 return true;
2405 }
2406 it->second->getConstraints()->clearAndCopyFrom(
2407 *region->getConstraints());
2408 } else {
2409 // Union was computed and stored in 'it->second': copy to 'region'.
2410 region->getConstraints()->clearAndCopyFrom(
2411 *it->second->getConstraints());
2412 }
2413 return true;
2414 };
2415
2416 bool existsInRead = updateRegion(readRegions);
2417 if (error)
2418 return;
2419 bool existsInWrite = updateRegion(writeRegions);
2420 if (error)
2421 return;
2422
2423 // Finally add it to the region list.
2424 if (region->isWrite() && !existsInWrite) {
2425 writeRegions[region->memref] = std::move(region);
2426 } else if (!region->isWrite() && !existsInRead) {
2427 readRegions[region->memref] = std::move(region);
2428 }
2429 });
2430
2431 if (error) {
2432 LDBG() << "copy generation failed for one or more memref's in this block";
2433 return failure();
2434 }
2435
2436 uint64_t totalCopyBuffersSizeInBytes = 0;
2437 bool ret = true;
2438 auto processRegions =
2439 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2440 &regions) {
2441 for (const auto &regionEntry : regions) {
2442 // For each region, hoist copy in/out past all hoistable
2443 // 'affine.for's.
2444 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2445 Block *copyPlacementBlock;
2446 findHighestBlockForPlacement(
2447 *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2448 &copyInPlacementStart, &copyOutPlacementStart);
2449
2450 uint64_t sizeInBytes;
2451 Block::iterator nBegin, nEnd;
2452 LogicalResult iRet = generateCopy(
2453 *regionEntry.second, block, begin, end, copyPlacementBlock,
2454 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2455 fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2456 if (succeeded(iRet)) {
2457 // begin/end could have been invalidated, and need update.
2458 begin = nBegin;
2459 end = nEnd;
2460 totalCopyBuffersSizeInBytes += sizeInBytes;
2461 }
2462 ret = ret & succeeded(iRet);
2463 }
2464 };
2465 processRegions(readRegions);
2466 processRegions(writeRegions);
2467
2468 if (!ret) {
2469 LDBG() << "copy generation failed for one or more memref's in this block";
2470 return failure();
2471 }
2472
2473 // For a range of operations, a note will be emitted at the caller.
2474 AffineForOp forOp;
2475 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2476 LLVM_DEBUG(forOp.emitRemark()
2477 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2478 << " KiB of copy buffers in fast memory space for this block");
2479 }
2480
2481 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2482 block->getParentOp()->emitWarning(
2483 "total size of all copy buffers' for this block exceeds fast memory "
2484 "capacity");
2485 }
2486
2487 return success();
2488}
2489
2490// A convenience version of affineDataCopyGenerate for all ops in the body of
2491// an AffineForOp.
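// Illustrative usage sketch (added; not part of the original source; the
// option values are arbitrary and the field order assumes the
// AffineCopyOptions definition in LoopUtils.h):
//
//   AffineCopyOptions options{/*generateDma=*/false, /*slowMemorySpace=*/0,
//                             /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
//                             /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);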
2492LogicalResult mlir::affine::affineDataCopyGenerate(
2493 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2494 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2495 return affineDataCopyGenerate(forOp.getBody()->begin(),
2496 std::prev(forOp.getBody()->end()), copyOptions,
2497 filterMemRef, copyNests);
2498}
2499
2500LogicalResult mlir::affine::generateCopyForMemRegion(
2501 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2502 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2503 Block *block = analyzedOp->getBlock();
2504 auto begin = analyzedOp->getIterator();
2505 auto end = std::next(begin);
2506 DenseMap<Value, Value> fastBufferMap;
2507 DenseSet<Operation *> copyNests;
2508
2509 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2510 copyOptions, fastBufferMap, copyNests,
2511 &result.sizeInBytes, &begin, &end);
2512 if (failed(err))
2513 return err;
2514
2515 const auto &en = fastBufferMap.find(memrefRegion.memref);
2516 // In some cases (empty loops), no copy generation would have happened.
2517 if (en == fastBufferMap.end())
2518 return failure();
2519 result.alloc = en->second.getDefiningOp();
2520 assert(result.alloc && "fast buffer expected to be locally allocated");
2521 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2522 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2523 return success();
2524}
2525
2526/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2527static void
2528gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2529 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2530 // Add a new empty level to the output if it doesn't already exist.
2531 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2532 if (currLoopDepth == depthToLoops.size())
2533 depthToLoops.emplace_back();
2534
2535 for (auto &op : *block) {
2536 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2537 depthToLoops[currLoopDepth].push_back(forOp);
2538 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2539 }
2540 }
2541}
2542
2543/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2544void mlir::affine::gatherLoops(
2545 func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2546 for (auto &block : func)
2547 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2548
2549 // Remove last loop level from output since it's empty.
2550 if (!depthToLoops.empty()) {
2551 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2552 depthToLoops.pop_back();
2553 }
2554}
2555
2556AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2557 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2558 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2559 SmallVector<Value, 4> lowerOperands(lbOperands);
2560 SmallVector<Value, 4> upperOperands(ubOperands);
2561
2562 fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2563 canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2564 lbMap = removeDuplicateExprs(lbMap);
2565 fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2566 canonicalizeMapAndOperands(&ubMap, &upperOperands);
2567 ubMap = removeDuplicateExprs(ubMap);
2568
2569 return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
2570 step);
2571}
2572
2573/// Creates an AffineIfOp that encodes the conditional to choose between
2574/// the constant trip count version and an unknown trip count version of this
2575/// nest of loops. This is used to separate partial and full tiles if `loops`
2576/// has the intra-tile loops. The affine.if op is inserted at the builder
2577/// insertion point of `b`.
2578static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2579 OpBuilder b) {
2580 if (loops.empty())
2581 return nullptr;
2582
2583 auto *context = loops[0].getContext();
2584
2585 FlatAffineValueConstraints cst;
2586 SmallVector<Operation *, 8> ops;
2587 llvm::append_range(ops, loops);
2588 (void)getIndexSet(ops, &cst);
2589
2590 // Remove constraints that are independent of these loop IVs.
2591 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2592
2593 // Construct the constraint set representing the guard for full tiles. The
2594 // lower bound (and upper bound) corresponding to the full tile should be
2595 // larger (and resp. smaller) than any other lower (or upper bound).
2596 SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2597 for (auto loop : loops) {
2598 (void)loop;
2599 // TODO: Non-unit stride is not an issue to generalize to.
2600 assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2601 // Mark everything as symbols for the purpose of finding a constant diff pair.
2602 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2603 1);
2604 unsigned fullTileLbPos, fullTileUbPos;
2605 if (!((IntegerRelation)cst)
2606 .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2607 /*boundFloorDivisor=*/nullptr,
2608 /*ub=*/nullptr, &fullTileLbPos,
2609 &fullTileUbPos)) {
2610 LDBG() << "Can't get constant diff pair for a loop";
2611 return nullptr;
2612 }
2613
2614 SmallVector<unsigned, 4> lbIndices, ubIndices;
2615 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2616
2617 auto fLb = cst.getInequality(fullTileLbPos);
2618 auto fUb = cst.getInequality(fullTileUbPos);
2619 fullTileLb.assign(fLb.begin(), fLb.end());
2620 fullTileUb.assign(fUb.begin(), fUb.end());
2621
2622 // Full tile lower bound should be >= any other lower bound.
2623 for (auto lbIndex : lbIndices)
2624 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2625 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2626
2627 // Full tile upper bound should be <= any other upper bound.
2628 for (auto ubIndex : ubIndices)
2629 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2630 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2631
2632 cst.removeVar(0);
2633 }
2634
2635 // The previous step leads to all zeros for the full tile lb and ub position
2636 // itself; remove those and any other duplicates / trivial redundancies.
2637 cst.removeTrivialRedundancy();
2638
2639 // Turn everything into dims conservatively since we earlier turned all
2640 // trailing ids past point loop IV into symbols. Some of these could be outer
2641 // loop IVs; we'll canonicalize anyway.
2642 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2643
2644 IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2645 // ifCondSet can be null if cst was empty -- this can happen if all loops
2646 // in the nest have constant trip counts.
2647 if (!ifCondSet)
2648 return nullptr;
2649
2650 SmallVector<Value, 4> setOperands;
2651 cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2652 canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2653 return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
2654 /*withElseRegion=*/true);
2655}
2656
2657/// Create the full tile loop nest (along with its body).
2658static LogicalResult
2659createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2660 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2661 fullTileLoops.reserve(inputNest.size());
2662
2663 // For each loop in the original nest identify a lower/upper bound pair such
2664 // that their difference is a constant.
2665 FlatAffineValueConstraints cst;
2666 for (auto loop : inputNest) {
2667 // TODO: straightforward to generalize to a non-unit stride.
2668 if (loop.getStepAsInt() != 1) {
2669 LDBG() << "[tile separation] non-unit stride not implemented";
2670 return failure();
2671 }
2672 SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2673 (void)getIndexSet(loopOp, &cst);
2674 // We will mark everything other than this loop IV as symbol for getting a
2675 // pair of <lb, ub> with a constant difference.
2676 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - 1);
2677 unsigned lbPos, ubPos;
2678 if (!((IntegerRelation)cst)
2679 .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2680 /*boundFloorDivisor=*/nullptr,
2681 /*ub=*/nullptr, &lbPos, &ubPos) ||
2682 lbPos == ubPos) {
2683 LDBG() << "[tile separation] Can't get constant diff / "
2684 << "equalities not yet handled";
2685 return failure();
2686 }
2687
2688 // Set all variables as dimensions uniformly since some of those marked as
2689 // symbols above could be outer loop IVs (corresponding tile space IVs).
2690 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2691
2692 AffineValueMap lbVmap, ubVmap;
2693 cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2694 cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2695 AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2696 b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2697 ubVmap.getOperands(), ubVmap.getAffineMap());
2698 b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2699 fullTileLoops.push_back(fullTileLoop);
2700 }
2701
2702 // Add the body for the full tile loop nest.
2703 IRMapping operandMap;
2704 for (const auto &loopEn : llvm::enumerate(inputNest))
2705 operandMap.map(loopEn.value().getInductionVar(),
2706 fullTileLoops[loopEn.index()].getInductionVar());
2707 b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2708 for (auto &op : inputNest.back().getBody()->without_terminator())
2709 b.clone(op, operandMap);
2710 return success();
2711}
2712
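// Illustrative sketch (added; not part of the original source): for a tiled
// nest of point loops, separation wraps the nest in an affine.if so that a
// clone with constant trip counts runs when the full-tile guard holds and the
// original nest handles the partial tiles otherwise:
//
//   affine.if #full_tile_guard(...) {
//     // full-tile clone of the nest (constant trip counts)
//   } else {
//     // original loop nest (partial tiles)
//   }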
2713LogicalResult
2714mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2715 SmallVectorImpl<AffineForOp> *fullTileNest) {
2716 if (inputNest.empty())
2717 return success();
2718
2719 auto firstLoop = inputNest[0];
2720
2721 // Each successive for op has to be nested in the other.
2722 auto prevLoop = firstLoop;
2723 for (auto loop : inputNest.drop_front(1)) {
2724 assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2725 prevLoop = loop;
2726 }
2727
2728 // Create the full tile loop nest.
2729 SmallVector<AffineForOp, 4> fullTileLoops;
2730 OpBuilder b(firstLoop);
2731 if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2732 if (!fullTileLoops.empty())
2733 fullTileLoops.front().erase();
2734 return failure();
2735 }
2736
2737 // Create and insert the version select right before the root of the nest.
2738 b = OpBuilder(firstLoop);
2739 AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2740 if (!ifOp) {
2741 fullTileLoops.front().erase();
2742 LDBG() << "All tiles are full tiles, or failure creating "
2743 << "separation condition";
2744 return failure();
2745 }
2746
2747 // Move the full tile into the then block.
2748 Block *thenBlock = ifOp.getThenBlock();
2749 AffineForOp outermostFullTileLoop = fullTileLoops[0];
2750 thenBlock->getOperations().splice(
2751 std::prev(thenBlock->end()),
2752 outermostFullTileLoop->getBlock()->getOperations(),
2753 Block::iterator(outermostFullTileLoop));
2754
2755 // Move the partial tile into the else block. The partial tile is the same as
2756 // the original loop nest.
2757 Block *elseBlock = ifOp.getElseBlock();
2758 elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2759 firstLoop->getBlock()->getOperations(),
2760 Block::iterator(firstLoop));
2761
2762 if (fullTileNest)
2763 *fullTileNest = std::move(fullTileLoops);
2764
2765 return success();
2766}
2767
2768LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2769 LogicalResult result(failure());
2770 SmallVector<AffineForOp> loops;
2771 getPerfectlyNestedLoops(loops, op);
2772 if (loops.size() <= 1)
2773 return success();
2774
2775 // Look for a band of loops that can be coalesced, i.e. perfectly nested
2776 // loops with bounds defined above some loop.
2777 // 1. For each loop, find above which parent loop its operands are
2778 // defined.
2779 SmallVector<unsigned> operandsDefinedAbove(loops.size());
2780 for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2781 operandsDefinedAbove[i] = i;
2782 for (unsigned j = 0; j < i; ++j) {
2783 if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2784 operandsDefinedAbove[i] = j;
2785 break;
2786 }
2787 }
2788 }
2789
2790 // 2. Identify bands of loops such that the operands of all of them are
2791 // defined above the first loop in the band. Traverse the nest bottom-up
2792 // so that modifications don't invalidate the inner loops.
2793 for (unsigned end = loops.size(); end > 0; --end) {
2794 unsigned start = 0;
2795 for (; start < end - 1; ++start) {
2796 auto maxPos =
2797 *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2798 std::next(operandsDefinedAbove.begin(), end));
2799 if (maxPos > start)
2800 continue;
2801 assert(maxPos == start &&
2802 "expected loop bounds to be known at the start of the band");
2803 auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2804 if (succeeded(coalesceLoops(band)))
2805 result = success();
2806 break;
2807 }
2808 // If a band was found and transformed, keep looking at the loops above
2809 // the outermost transformed loop.
2810 if (start != end - 1)
2811 end = start + 1;
2812 }
2813 return result;
2814}
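// A minimal usage sketch, assuming `funcOp` is an enclosing function-like op
// (hypothetical name): collect the roots of affine loop nests first, then
// coalesce each perfect band, so the walk itself never observes erased loops:
//
//   SmallVector<AffineForOp> roots;
//   funcOp.walk([&](AffineForOp forOp) {
//     if (!forOp->getParentOfType<AffineForOp>())
//       roots.push_back(forOp);
//   });
//   for (AffineForOp root : roots)
//     (void)coalescePerfectlyNestedAffineLoops(root);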
2815
2816int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
2817 int64_t count = 0;
2818 Operation *currentOp = operand.getOwner();
2819 while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2820 if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2821 break;
2822 currentOp = loopOp;
2823 count++;
2824 }
2825 return count;
2826}
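// A minimal usage sketch (names are illustrative): for a value defined above
// a 2-deep affine.for nest and used by `userOp` inside both loops, the count
// below is 2; counting stops at the first enclosing loop for which the value
// is not defined outside the loop:
//
//   OpOperand &use = userOp->getOpOperand(0);
//   int64_t invariantDepth = numEnclosingInvariantLoops(use);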