LoopUtils.cpp
1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/DebugLog.h"
29#include "llvm/Support/raw_ostream.h"
30#include <optional>
31
32#define DEBUG_TYPE "loop-utils"
33
34using namespace mlir;
35using namespace affine;
36using namespace presburger;
37using llvm::SmallMapVector;
38
39/// Computes the cleanup loop lower bound of the loop being unrolled with
40 /// the specified unroll factor; this bound will also be the upper bound of the main
41/// part of the unrolled loop. Computes the bound as an AffineMap with its
42/// operands or a null map when the trip count can't be expressed as an affine
43/// expression.
44static void
45getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
46 AffineMap &cleanupLbMap,
47 SmallVectorImpl<Value> &cleanupLbOperands) {
48 AffineMap tripCountMap;
49 SmallVector<Value, 4> tripCountOperands;
50 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
51 // Trip count can't be computed.
52 if (!tripCountMap) {
53 cleanupLbMap = AffineMap();
54 return;
55 }
56
57 OpBuilder b(forOp);
58 auto lbMap = forOp.getLowerBoundMap();
59 auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
60 forOp.getLowerBoundOperands());
61
62 // For each upper bound expr, get the range.
63 // Eg: affine.for %i = lb to min (ub1, ub2),
64 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
65 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
66 // these affine.apply's make up the cleanup loop lower bound.
67 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
68 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
69 int64_t step = forOp.getStepAsInt();
70 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
71 auto tripCountExpr = tripCountMap.getResult(i);
72 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
73 auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
74 tripCountMap.getNumSymbols(), bumpExprs[i]);
75 bumpValues[i] =
76 AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
77 }
78
79 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
80 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
81 newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
82
83 cleanupLbOperands.clear();
84 cleanupLbOperands.push_back(lb);
85 cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
86 cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
87 b.getContext());
88 // Simplify the cleanupLbMap + cleanupLbOperands.
89 fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
90 cleanupLbMap = simplifyAffineMap(cleanupLbMap);
91 canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
92 // Remove any affine.apply's that became dead from the simplification above.
93 for (auto v : bumpValues)
94 if (v.use_empty())
95 v.getDefiningOp()->erase();
96
97 if (lb.use_empty())
98 lb.erase();
99}
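// Editorial note (illustrative sketch, not part of the upstream source): for a
// hypothetical loop `affine.for %i = 0 to 20 step 2` (trip count 10) unrolled
// by a factor of 4, the bound built above evaluates to
// lb + (tripCount - tripCount mod 4) * step = 0 + (10 - 2) * 2 = 16, so the
// unrolled main loop covers [0, 16) and the cleanup loop covers [16, 20).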
100
101/// Helper to replace uses of loop carried values (iter_args) and loop
102/// yield values while promoting single iteration affine.for ops.
103static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
104 // Replace uses of iter arguments with iter operands (initial values).
105 auto iterOperands = forOp.getInits();
106 auto iterArgs = forOp.getRegionIterArgs();
107 for (auto e : llvm::zip(iterOperands, iterArgs))
108 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
109
110 // Replace uses of loop results with the values yielded by the loop.
111 auto outerResults = forOp.getResults();
112 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
113 for (auto e : llvm::zip(outerResults, innerResults))
114 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
115}
116
117/// Promotes the loop body of a forOp to its containing block if the forOp
118/// was known to have a single iteration.
119LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
120 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
121 if (!tripCount || *tripCount != 1)
122 return failure();
123
124 // TODO: extend this for arbitrary affine bounds.
125 if (forOp.getLowerBoundMap().getNumResults() != 1)
126 return failure();
127
128 // Replace all uses of the IV with its single iteration value.
129 auto iv = forOp.getInductionVar();
130 auto *parentBlock = forOp->getBlock();
131 if (!iv.use_empty()) {
132 if (forOp.hasConstantLowerBound()) {
133 auto func = forOp->getParentOfType<FunctionOpInterface>();
134 OpBuilder builder(forOp->getContext());
135 if (func)
136 builder.setInsertionPointToStart(&func.getFunctionBody().front());
137 else
138 builder.setInsertionPoint(forOp);
139 auto constOp = arith::ConstantIndexOp::create(
140 builder, forOp.getLoc(), forOp.getConstantLowerBound());
141 iv.replaceAllUsesWith(constOp);
142 } else {
143 auto lbOperands = forOp.getLowerBoundOperands();
144 auto lbMap = forOp.getLowerBoundMap();
145 OpBuilder builder(forOp);
146 if (lbMap == builder.getDimIdentityMap()) {
147 // No need of generating an affine.apply.
148 iv.replaceAllUsesWith(lbOperands[0]);
149 } else {
150 auto affineApplyOp =
151 AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
152 iv.replaceAllUsesWith(affineApplyOp);
153 }
154 }
155 }
156
157 replaceIterArgsAndYieldResults(forOp);
158
159 // Move the loop body operations, except for its terminator, to the loop's
160 // containing block.
161 forOp.getBody()->back().erase();
162 parentBlock->getOperations().splice(Block::iterator(forOp),
163 forOp.getBody()->getOperations());
164 forOp.erase();
165 return success();
166}
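// Editorial illustration (hypothetical example, not part of the upstream
// source): a single-iteration loop such as
//   affine.for %i = 5 to 6 { "test.use"(%i) : (index) -> () }
// is promoted by replacing all uses of %i with the constant index 5 (or with
// an affine.apply of the lower bound map when the bound is non-constant),
// forwarding iter_args/yielded values, and splicing the body into the
// enclosing block.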
167
168/// Generates an affine.for op with the specified lower and upper bounds
169/// while generating the right IV remappings to realize shifts for operations in
170/// its body. The operations that go into the loop body are specified in
171/// opGroupQueue starting from the specified offset, and in that order. The
172/// first element of the pair specifies the shift applied to that group of
173/// operations; the shift is multiplied by the loop step before being applied.
174/// Returns nullptr if the generated loop simplifies to a single iteration one.
175static AffineForOp generateShiftedLoop(
176 AffineMap lbMap, AffineMap ubMap,
177 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
178 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
179 auto lbOperands = srcForOp.getLowerBoundOperands();
180 auto ubOperands = srcForOp.getUpperBoundOperands();
181
182 assert(lbMap.getNumInputs() == lbOperands.size());
183 assert(ubMap.getNumInputs() == ubOperands.size());
184
185 auto loopChunk =
186 AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
187 ubMap, srcForOp.getStepAsInt());
188 auto loopChunkIV = loopChunk.getInductionVar();
189 auto srcIV = srcForOp.getInductionVar();
190
191 IRMapping operandMap;
192
193 auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
194 for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
195 uint64_t shift = it.first;
196 auto ops = it.second;
197 // All 'same shift' operations get added with their operands being
198 // remapped to results of cloned operations, and their IV used remapped.
199 // Generate the remapping if the shift is not zero: remappedIV = newIV -
200 // shift.
201 if (!srcIV.use_empty() && shift != 0) {
202 auto ivRemap = AffineApplyOp::create(
203 bodyBuilder, srcForOp.getLoc(),
204 bodyBuilder.getSingleDimShiftAffineMap(
205 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
206 loopChunkIV);
207 operandMap.map(srcIV, ivRemap);
208 } else {
209 operandMap.map(srcIV, loopChunkIV);
210 }
211 for (auto *op : ops)
212 bodyBuilder.clone(*op, operandMap);
213 };
214 if (succeeded(promoteIfSingleIteration(loopChunk)))
215 return AffineForOp();
216 return loopChunk;
217}
218
219// The skewing of operations with respect to one another can be used for
220// example to allow overlap of asynchronous operations (such as DMA
221// communication) with computation, or just relative shifting of operations
222// for better register reuse, locality or parallelism. As such, the shifts are
223// typically expected to be at most of the order of the number of operations.
224// This method should not be used as a substitute for loop distribution/fission.
225// This method uses an algorithm that runs in time linear in the number of
226// operations in the body of the for loop (using the 'sweep line' paradigm).
227// This method asserts preservation of SSA dominance. A check for that, as
228// well as for memory-based dependence preservation, rests with the users of
229// this method.
230LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
231 ArrayRef<uint64_t> shifts,
232 bool unrollPrologueEpilogue) {
233 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234 "too few/many shifts");
235 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
236 return success();
237
238 // If the trip counts aren't constant, we would need versioning and
239 // conditional guards (or context information to prevent such versioning). The
240 // better way to pipeline for such loops is to first tile them and extract
241 // constant trip count "full tiles" before applying this.
242 auto mayBeConstTripCount = getConstantTripCount(forOp);
243 if (!mayBeConstTripCount) {
244 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
245 return success();
246 }
247 uint64_t tripCount = *mayBeConstTripCount;
248
249 assert(isOpwiseShiftValid(forOp, shifts) &&
250 "shifts will lead to an invalid transformation\n");
251
252 int64_t step = forOp.getStepAsInt();
253
254 unsigned numChildOps = shifts.size();
255
256 // Do a linear time (counting) sort for the shifts.
257 uint64_t maxShift = *llvm::max_element(shifts);
258 if (maxShift >= numChildOps) {
259 // Large shifts are not the typical use case.
260 forOp.emitWarning("not shifting because shifts are unrealistically large");
261 return success();
262 }
263
264 // An array of operation groups sorted by shift amount; each group has all
265 // operations with the same shift in the order in which they appear in the
266 // body of the 'affine.for' op.
267 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
268 unsigned pos = 0;
269 for (auto &op : forOp.getBody()->without_terminator()) {
270 auto shift = shifts[pos++];
271 sortedOpGroups[shift].push_back(&op);
272 }
273
274 // Unless the shifts have a specific pattern (which actually would be the
275 // common use case), prologue and epilogue are not meaningfully defined.
276 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
277 // loop generated as the prologue and the last as epilogue and unroll these
278 // fully.
279 AffineForOp prologue, epilogue;
280
281 // Do a sweep over the sorted shifts while storing open groups in a
282 // vector, and generating loop portions as necessary during the sweep. A block
283 // of operations is paired with its shift.
284 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
285
286 auto origLbMap = forOp.getLowerBoundMap();
287 uint64_t lbShift = 0;
288 OpBuilder b(forOp);
289 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
290 // If nothing is shifted by d, continue.
291 if (sortedOpGroups[d].empty())
292 continue;
293 if (!opGroupQueue.empty()) {
294 assert(d > 0 &&
295 "Queue expected to be empty when the first block is found");
296 // The interval for which the loop needs to be generated here is:
297 // [lbShift, min(lbShift + tripCount, d)) and the body of the
298 // loop needs to have all operations in opQueue in that order.
299 AffineForOp res;
300 if (lbShift + tripCount * step < d * step) {
301 res = generateShiftedLoop(
302 b.getShiftedAffineMap(origLbMap, lbShift),
303 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
304 opGroupQueue, /*offset=*/0, forOp, b);
305 // Entire loop for the queued op groups generated, empty it.
306 opGroupQueue.clear();
307 lbShift += tripCount * step;
308 } else {
309 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
310 b.getShiftedAffineMap(origLbMap, d),
311 opGroupQueue, /*offset=*/0, forOp, b);
312 lbShift = d * step;
313 }
314
315 if (res) {
316 // Simplify/canonicalize the affine.for.
317 RewritePatternSet patterns(res.getContext());
318 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
319 bool erased;
320 (void)applyOpPatternsGreedily(
321 res.getOperation(), std::move(patterns),
322 GreedyRewriteConfig().setStrictness(
323 GreedyRewriteStrategy::ExistingAndNewOps),
324 /*changed=*/nullptr, &erased);
325 if (!erased && !prologue)
326 prologue = res;
327 if (!erased)
328 epilogue = res;
329 }
330 } else {
331 // Start of first interval.
332 lbShift = d * step;
333 }
334 // Augment the list of operations that get into the current open interval.
335 opGroupQueue.emplace_back(d, sortedOpGroups[d]);
336 }
337
338 // Those operations groups left in the queue now need to be processed (FIFO)
339 // and their loops completed.
340 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
341 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
342 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
343 b.getShiftedAffineMap(origLbMap, ubShift),
344 opGroupQueue, /*offset=*/i, forOp, b);
345 lbShift = ubShift;
346 if (!prologue)
347 prologue = epilogue;
348 }
349
350 // Erase the original for op.
351 forOp.erase();
352
353 if (unrollPrologueEpilogue && prologue)
354 (void)loopUnrollFull(prologue);
355 if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
356 (void)loopUnrollFull(epilogue);
357
358 return success();
359}
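// Editorial illustration (hypothetical example, not part of the upstream
// source): for a unit-step loop with trip count N and body {A; B} skewed with
// shifts [0, 1], the sweep above produces roughly:
//   - a prologue iteration executing only A(lb),
//   - a steady-state loop over [lb + 1, lb + N) executing A(%i) and B(%i - 1),
//   - an epilogue iteration executing only B(lb + N - 1),
// overlapping B of iteration i with A of iteration i + 1.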
360
361/// Checks whether a loop nest is hyper-rectangular or not.
362static LogicalResult
363 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
364 FlatAffineValueConstraints cst;
365 SmallVector<Operation *, 8> ops(input.begin(), input.end());
366 // 0-d or 1-d is trivially hyper-rectangular.
367 if (input.size() <= 1)
368 return success();
369 if (failed(getIndexSet(ops, &cst))) {
370 LDBG() << "Index set computation failed!";
371 return failure();
372 }
373 if (!cst.isHyperRectangular(0, input.size())) {
374 LDBG() << "Non-hyperrectangular nests not supported for tiling!";
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384 ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LDBG() << "Cannot tile nest where a loop has yield values";
390 return failure();
391 }
392
393 // Check if the supplied `for` ops are all successively nested.
394 if (!isPerfectlyNested(input)) {
395 LDBG() << "input loops not perfectly nested";
396 return failure();
397 }
398
399 // TODO: handle non hyper-rectangular spaces.
400 if (failed(checkIfHyperRectangular(input)))
401 return failure();
402
403 return success();
404}
405
406/// Move the loop body of AffineForOp 'src' from 'src' into the specified
407/// location in destination's body, ignoring the terminator.
408static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409 Block::iterator loc) {
410 auto &ops = src.getBody()->getOperations();
411 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412 std::prev(ops.end()));
413}
414
415/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416/// body.
417static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419}
420
421 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
422 /// the body of the original loop nest to the tiled loop nest.
423 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
424 AffineForOp rootAffineForOp, unsigned width,
425 MutableArrayRef<AffineForOp> tiledLoops) {
426 Location loc = rootAffineForOp.getLoc();
427
428 // The outermost among the loops as we add more.
429 Operation *topLoop = rootAffineForOp.getOperation();
430 AffineForOp innermostPointLoop;
431
432 // Add intra-tile (or point) loops.
433 for (unsigned i = 0; i < width; i++) {
434 OpBuilder b(topLoop);
435 // Loop bounds will be set later.
436 AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
437 pointLoop.getBody()->getOperations().splice(
438 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439 topLoop);
440 tiledLoops[2 * width - 1 - i] = pointLoop;
441 topLoop = pointLoop.getOperation();
442 if (i == 0)
443 innermostPointLoop = pointLoop;
444 }
445
446 // Add tile space loops.
447 for (unsigned i = width; i < 2 * width; i++) {
448 OpBuilder b(topLoop);
449 // Loop bounds will be set later.
450 AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
451 tileSpaceLoop.getBody()->getOperations().splice(
452 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453 topLoop);
454 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455 topLoop = tileSpaceLoop.getOperation();
456 }
457
458 // Move the loop body of the original nest to the new one.
459 moveLoopBody(origLoops.back(), innermostPointLoop);
460}
461
462/// Set lower and upper bounds of intra-tile loops for parametric tiling.
463// TODO: Handle non-constant lower bounds.
464static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465 AffineForOp newInterTileLoop,
466 AffineForOp newIntraTileLoop,
467 Value tileSize) {
468 // The lower bound for the intra-tile loop is represented by an affine map
469 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
473 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
474 // corresponding inter-tile loop.
475
476 assert(origLoop.hasConstantLowerBound() &&
477 "expected input loops to have constant lower bound.");
478
479 // Get lower bound of original loop as an affine expression.
480 AffineExpr origLowerBoundExpr;
481 origLowerBoundExpr =
482 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483
484 // Add dim operands from original lower/upper bound.
485 SmallVector<Value, 4> lbOperands, ubOperands;
486 AffineBound lb = origLoop.getLowerBound();
487 AffineBound ub = origLoop.getUpperBound();
488 lbOperands.reserve(lb.getNumOperands() + 2);
489 ubOperands.reserve(ub.getNumOperands() + 2);
490 AffineMap origLbMap = lb.getMap();
491 AffineMap origUbMap = ub.getMap();
492 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493 lbOperands.push_back(lb.getOperand(j));
494 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495 ubOperands.push_back(ub.getOperand(j));
496
497 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498 // IV.
499 lbOperands.push_back(newInterTileLoop.getInductionVar());
500 ubOperands.push_back(newInterTileLoop.getInductionVar());
501
502 // Get loop IV as an affine expression for lower/upper bound. Size of
503 // lb/ubOperands is guaranteed to be at least one.
504 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506
507 // Add symbol operands from original lower/upper bound.
508 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509 lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512
513 // Add a new symbol operand which is the tile size for this loop.
514 lbOperands.push_back(tileSize);
515 ubOperands.push_back(tileSize);
516
517 SmallVector<AffineExpr, 4> lbBoundExprs;
518 SmallVector<AffineExpr, 4> ubBoundExprs;
519 lbBoundExprs.reserve(origLbMap.getNumResults());
520 ubBoundExprs.reserve(origUbMap.getNumResults());
521
522 // Get tiling parameter as an affine expression for lb/ub.
523 AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524 AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525
526 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527 lbBoundExprs.push_back(
528 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529 origLowerBoundExpr);
530
531 // Get the origLoopStep as an affine expression.
532 AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533
534 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535 // (tilingParameter * origLoopStep) + origlb.
536 ubBoundExprs.push_back(
537 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539
540 ubBoundExprs.append(origUbMap.getResults().begin(),
541 origUbMap.getResults().end());
542
543 AffineMap lbMap =
544 AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545 lbBoundExprs, b.getContext());
546 newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547
548 AffineMap ubMap =
549 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550 ubBoundExprs, b.getContext());
551 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552
553 // Original loop step must be preserved.
554 newIntraTileLoop.setStep(origLoop.getStepAsInt());
555}
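// Editorial note (worked example, not part of the upstream source): for an
// original loop `affine.for %i = 0 to 1024`, inter-tile IV %ii, and tiling
// parameter %t0, the maps built above give, informally,
//   intra-tile lb: (%ii)[%t0] -> (%ii * %t0)
//   intra-tile ub: (%ii)[%t0] -> min(%ii * %t0 + %t0, 1024)
// i.e. each intra-tile loop covers one tile of size %t0, clamped to the
// original upper bound.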
556
557/// Set lower and upper bounds of inter-tile loops for parametric tiling.
558// TODO: Handle non-constant lower bounds.
559static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560 AffineForOp newLoop, Value tileSize) {
561 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562
563 // The lower bounds for inter-tile loops are same as the corresponding lower
564 // bounds of original loops.
565 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566
567 // The new upper bound map for inter-tile loops, assuming constant lower
568 // bounds, are now originalLowerBound + ceildiv((originalUpperBound -
569 // originalLowerBound), tiling parameter); where tiling parameter is the
570 // respective tile size for that loop. For e.g. if the original ubmap was
571 // ()->(1024), the new map will be
572 // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
573 // Therefore a new symbol operand is inserted in the map and the result
574 // expression is overwritten.
575
576 assert(origLoop.hasConstantLowerBound() &&
577 "expected input loops to have constant lower bound.");
578
579 // Get lower bound of original loop as an affine expression.
580 AffineExpr origLowerBoundExpr;
581 origLowerBoundExpr =
582 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583
584 // Add dim operands from original upper bound.
585 SmallVector<Value, 4> ubOperands;
586 AffineBound ub = origLoop.getUpperBound();
587 ubOperands.reserve(ub.getNumOperands() + 1);
588 AffineMap origUbMap = ub.getMap();
589 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590 ubOperands.push_back(ub.getOperand(j));
591
592 // Add symbol operands from original upper bound.
593 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595
596 // Add a new symbol operand which is the tile size for this loop.
597 ubOperands.push_back(tileSize);
598
599 // Get tiling parameter as an affine expression.
600 AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601
602 SmallVector<AffineExpr, 4> boundExprs;
603 boundExprs.reserve(origUbMap.getNumResults());
604 int64_t origUpperBound;
605 AffineExpr origUpperBoundExpr;
606
607 // If upper bound for the original loop is constant, then the constant can
608 // be obtained as an affine expression straight away.
609 if (origLoop.hasConstantUpperBound()) {
610 origUpperBound = origLoop.getConstantUpperBound();
611
612 // Get original constant upper bound as an affine expression.
613 origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614
615 // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
616 // originalLowerBound), tilingParameter).
617 boundExprs.push_back(
618 origLowerBoundExpr +
619 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620 } else {
621 // If the upper bound for the original loop is not constant then two cases
622 // are possible, although their handling is the same: 1.) The result of
623 // ubmap has only one result expression. For e.g.
624 // affine.for %i = 5 to %ub
625 //
626 // A symbol operand is added which represents the tiling parameter. The
627 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628 // where 's0' is the original upper bound and 's1' is the tiling
629 // parameter. 2.) When ubMap has more than one result expression. For e.g.
630 // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
631 // affine.for %i = 5 to min #map0()[%s0, %s1]
632 //
633 // A symbol operand is added which represents the tiling parameter. The
634 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636
637 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638 // originalLowerBound), tilingParameter).
639 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640 boundExprs.push_back(
641 origLowerBoundExpr +
642 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643 }
644
645 AffineMap ubMap =
646 AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647 boundExprs, b.getContext());
648 newLoop.setUpperBound(ubOperands, ubMap);
649
650 // Original loop step must be preserved.
651 newLoop.setStep(origLoop.getStepAsInt());
652}
653
654/// Constructs and sets new loop bounds after tiling for the case of
655/// hyper-rectangular index sets, where the bounds of one dimension do not
656/// depend on other dimensions and tiling parameters are captured from SSA
657/// values. Bounds of each dimension can thus be treated independently,
658/// and deriving the new bounds is much simpler and faster than for the case of
659/// tiling arbitrary polyhedral shapes.
660 static void constructParametricallyTiledIndexSetHyperRect(
661 MutableArrayRef<AffineForOp> origLoops,
662 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663 assert(!origLoops.empty() && "expected atleast one loop in band");
664 assert(origLoops.size() == tileSizes.size() &&
665 "expected tiling parameter for each loop in band.");
666
667 OpBuilder b(origLoops[0].getOperation());
668 unsigned width = origLoops.size();
669
670 // Set bounds for tile space loops.
671 for (unsigned i = 0; i < width; ++i) {
672 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673 }
674
675 // Set bounds for intra-tile loops.
676 for (unsigned i = 0; i < width; ++i) {
677 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678 newLoops[i + width], tileSizes[i]);
679 }
680}
681
682/// Constructs and sets new loop bounds after tiling for the case of
683/// hyper-rectangular index sets, where the bounds of one dimension do not
684/// depend on other dimensions. Bounds of each dimension can thus be treated
685/// independently, and deriving the new bounds is much simpler and faster
686/// than for the case of tiling arbitrary polyhedral shapes.
687static void
688 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
689 MutableArrayRef<AffineForOp> newLoops,
690 ArrayRef<unsigned> tileSizes) {
691 assert(!origLoops.empty());
692 assert(origLoops.size() == tileSizes.size());
693
694 OpBuilder b(origLoops[0].getOperation());
695 unsigned width = origLoops.size();
696
697 // Bounds for tile space loops.
698 for (unsigned i = 0; i < width; i++) {
699 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703 // If the step size of original loop is x and tileSize is y then after
704 // tiling the tile space loops' step size becomes x*y.
705 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706 }
707 // Bounds for intra-tile loops.
708 for (unsigned i = 0; i < width; i++) {
709 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710 std::optional<uint64_t> mayBeConstantCount =
711 getConstantTripCount(origLoops[i]);
712 // The lower bound is just the tile-space loop.
713 AffineMap lbMap = b.getDimIdentityMap();
714 newLoops[width + i].setLowerBound(
715 /*operands=*/newLoops[i].getInductionVar(), lbMap);
716 // The step size of intra-tile loops is just the original loops' step size.
717 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
718
719 // Set the upper bound.
720 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
721 // Trip count is less than the tile size: upper bound is lower bound +
722 // trip count * stepSize.
723 AffineMap ubMap = b.getSingleDimShiftAffineMap(
724 *mayBeConstantCount * origLoops[i].getStepAsInt());
725 newLoops[width + i].setUpperBound(
726 /*operands=*/newLoops[i].getInductionVar(), ubMap);
727 } else if (largestDiv % tileSizes[i] != 0) {
728 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
729 // Construct the upper bound map; the operands are the original operands
730 // with 'i' (tile-space loop) appended to it. The new upper bound map is
731 // the original one with an additional expression i + tileSize * stepSize
732 // appended.
733
734 // Add dim operands from original upper bound.
735 SmallVector<Value, 4> ubOperands;
736 AffineBound ub = origLoops[i].getUpperBound();
737 ubOperands.reserve(ub.getNumOperands() + 1);
738 AffineMap origUbMap = ub.getMap();
739 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
740 ubOperands.push_back(ub.getOperand(j));
741
742 // Add dim operand for new loop upper bound.
743 ubOperands.push_back(newLoops[i].getInductionVar());
744
745 // Add symbol operands from original upper bound.
746 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
747 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
748
749 SmallVector<AffineExpr, 4> boundExprs;
750 boundExprs.reserve(1 + origUbMap.getNumResults());
751 AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
752 // The new upper bound map is the original one with an additional
753 // expression i + tileSize * stepSize (of original loop) appended.
754 boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
755 boundExprs.append(origUbMap.getResults().begin(),
756 origUbMap.getResults().end());
757 AffineMap ubMap =
758 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
759 boundExprs, b.getContext());
760 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
761 } else {
762 // No need of the min expression.
763 AffineExpr dim = b.getAffineDimExpr(0);
764 AffineMap ubMap = AffineMap::get(
765 1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
766 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
767 }
768 }
769}
770
771LogicalResult
772 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
773 ArrayRef<unsigned> tileSizes,
774 SmallVectorImpl<AffineForOp> *tiledNest) {
775 if (input.empty())
776 return success();
777
778 if (failed(performPreTilingChecks(input, tileSizes)))
779 return failure();
780
781 MutableArrayRef<AffineForOp> origLoops = input;
782 AffineForOp rootAffineForOp = origLoops[0];
783
784 // Note that width is at least one since the band isn't empty.
785 unsigned width = input.size();
786 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
787
788 // Construct a tiled loop nest without setting their bounds. Bounds are
789 // set later.
790 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
791
792 SmallVector<Value, 8> origLoopIVs;
793 extractForInductionVars(input, &origLoopIVs);
794
795 // Set loop bounds for the tiled loop nest.
796 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
797
798 // Replace original IVs with intra-tile loop IVs.
799 for (unsigned i = 0; i < width; i++)
800 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
801
802 // Erase the old loop nest.
803 rootAffineForOp.erase();
804
805 if (tiledNest)
806 *tiledNest = std::move(tiledLoops);
807
808 return success();
809}
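// Editorial illustration (hypothetical input/output, not part of the upstream
// source): tiling the nest
//   affine.for %i = 0 to 1024 {
//     affine.for %j = 0 to 1024 { "test.use"(%i, %j) : (index, index) -> () }
//   }
// with tile sizes [32, 32] yields, written informally,
//   affine.for %ii = 0 to 1024 step 32 {
//     affine.for %jj = 0 to 1024 step 32 {
//       affine.for %i = %ii to %ii + 32 {
//         affine.for %j = %jj to %jj + 32 { "test.use"(%i, %j) ... }
//       }
//     }
//   }
// (the intra-tile bounds are emitted as affine maps applied to the tile-space
// IVs; since 1024 divides evenly by 32, no min expression is needed).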
810
811/// Tiles the specified band of perfectly nested loops creating tile-space
812/// loops and intra-tile loops, using SSA values as tiling parameters. A band
813/// is a contiguous set of loops.
814 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
815 MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
816 SmallVectorImpl<AffineForOp> *tiledNest) {
817 if (input.empty())
818 return success();
819
820 if (failed(performPreTilingChecks(input, tileSizes)))
821 return failure();
822
823 MutableArrayRef<AffineForOp> origLoops = input;
824 AffineForOp rootAffineForOp = origLoops[0];
825 unsigned width = input.size();
826 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
827
828 // Construct a tiled loop nest without setting their bounds. Bounds are
829 // set later.
830 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
831
832 SmallVector<Value, 8> origLoopIVs;
833 extractForInductionVars(input, &origLoopIVs);
834
835 // Set loop bounds for the tiled loop nest.
836 constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
837 tileSizes);
838
839 // Replace original IVs with intra-tile loop IVs.
840 for (unsigned i = 0; i < width; i++)
841 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
842
843 // Erase the old loop nest.
844 rootAffineForOp.erase();
845
846 if (tiledNest)
847 *tiledNest = std::move(tiledLoops);
848
849 return success();
850}
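// Editorial illustration (hypothetical example, not part of the upstream
// source): with an SSA tile size %ts, a loop `affine.for %i = 0 to 1024`
// becomes, informally,
//   affine.for %ii = 0 to ceildiv(1024, %ts) {   // iterates over tile indices
//     affine.for %i = %ii * %ts to min(%ii * %ts + %ts, 1024) { ... }
//   }
// i.e. the inter-tile loop keeps the original step and counts tiles, while the
// intra-tile loop expands each tile index back to the original iteration
// space.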
851
852/// Get perfectly nested sequence of loops starting at root of loop nest
853/// (the first op being another AffineFor, and the second op - a terminator).
854/// A loop is perfectly nested iff: the first op in the loop's body is another
855 /// AffineForOp, and the second op is a terminator.
856 void mlir::affine::getPerfectlyNestedLoops(
857 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
858 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
859 nestedLoops.push_back(root);
860 Block &body = root.getRegion().front();
861 if (body.begin() != std::prev(body.end(), 2))
862 return;
863
864 root = dyn_cast<AffineForOp>(&body.front());
865 if (!root)
866 return;
867 }
868}
869
870/// Unrolls this loop completely.
871LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
872 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
873 if (mayBeConstantTripCount.has_value()) {
874 uint64_t tripCount = *mayBeConstantTripCount;
875 if (tripCount == 0)
876 return success();
877 if (tripCount == 1)
878 return promoteIfSingleIteration(forOp);
879 return loopUnrollByFactor(forOp, tripCount);
880 }
881 return failure();
882}
883
884 /// Unrolls this loop by the specified factor or by the trip count (if constant),
885/// whichever is lower.
886LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
887 uint64_t unrollFactor) {
888 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
889 if (mayBeConstantTripCount.has_value() &&
890 *mayBeConstantTripCount < unrollFactor)
891 return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
892 return loopUnrollByFactor(forOp, unrollFactor);
893}
894
895/// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
896/// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
897/// unrolled body. If specified, annotates the Ops in each unrolled iteration
898/// using annotateFn.
899 static void generateUnrolledLoop(
900 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
901 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
902 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
903 ValueRange iterArgs, ValueRange yieldedValues) {
904 // Builder to insert unrolled bodies just before the terminator of the body of
905 // 'forOp'.
906 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
907
908 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
909 if (!annotateFn)
910 annotateFn = defaultAnnotateFn;
911
912 // Keep a pointer to the last non-terminator operation in the original block
913 // so that we know what to clone (since we are doing this in-place).
914 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
915
916 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
917 SmallVector<Value, 4> lastYielded(yieldedValues);
918
919 for (unsigned i = 1; i < unrollFactor; i++) {
920 IRMapping operandMap;
921
922 // Prepare operand map.
923 operandMap.map(iterArgs, lastYielded);
924
925 // If the induction variable is used, create a remapping to the value for
926 // this unrolled instance.
927 if (!forOpIV.use_empty()) {
928 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
929 operandMap.map(forOpIV, ivUnroll);
930 }
931
932 // Clone the original body of 'forOp'.
933 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
934 Operation *clonedOp = builder.clone(*it, operandMap);
935 annotateFn(i, clonedOp, builder);
936 }
937
938 // Update yielded values. If the yielded value is defined outside the
939 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
940 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
941 // value with the clone corresponding to the yielded value.
942 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
943 Operation *defOp = yieldedValues[i].getDefiningOp();
944 if (defOp && defOp->getBlock() == loopBodyBlock)
945 lastYielded[i] = operandMap.lookup(yieldedValues[i]);
946 }
947 }
948
949 // Make sure we annotate the Ops in the original body. We do this last so that
950 // any annotations are not copied into the cloned Ops above.
951 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
952 annotateFn(0, &*it, builder);
953
954 // Update operands of the yield statement.
955 loopBodyBlock->getTerminator()->setOperands(lastYielded);
956}
957
958/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
959/// count is not a multiple of `unrollFactor`.
960static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
961 uint64_t unrollFactor) {
962 // Insert the cleanup loop right after 'forOp'.
963 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
964 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
965
966 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
967 // and produce results for the original users of `forOp` results.
968 auto results = forOp.getResults();
969 auto cleanupResults = cleanupForOp.getResults();
970 auto cleanupIterOperands = cleanupForOp.getInits();
971
972 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
973 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
974 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
975 }
976
977 AffineMap cleanupMap;
978 SmallVector<Value, 4> cleanupOperands;
979 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
980 if (!cleanupMap)
981 return failure();
982
983 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
984 // Promote the loop body up if this has turned into a single iteration loop.
985 (void)promoteIfSingleIteration(cleanupForOp);
986
987 // Adjust upper bound of the original loop; this is the same as the lower
988 // bound of the cleanup loop.
989 forOp.setUpperBound(cleanupOperands, cleanupMap);
990 return success();
991}
992
993/// Unrolls this loop by the specified factor. Returns success if the loop
994/// is successfully unrolled.
995 LogicalResult mlir::affine::loopUnrollByFactor(
996 AffineForOp forOp, uint64_t unrollFactor,
997 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
998 bool cleanUpUnroll) {
999 assert(unrollFactor > 0 && "unroll factor should be positive");
1000
1001 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1002 if (unrollFactor == 1) {
1003 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1004 return failure();
1005 return success();
1006 }
1007
1008 // Nothing in the loop body other than the terminator.
1009 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1010 return success();
1011
1012 // If the trip count is lower than the unroll factor, no unrolled body.
1013 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1014 if (cleanUpUnroll) {
1015 // Unroll the cleanup loop if cleanUpUnroll is specified.
1016 return loopUnrollFull(forOp);
1017 }
1018
1019 return failure();
1020 }
1021
1022 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1023 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1024 // Loops where the lower bound is a max expression or the upper bound is
1025 // a min expression and the trip count doesn't divide the unroll factor
1026 // can't be unrolled since the lower bound of the cleanup loop in such cases
1027 // cannot be expressed as an affine function or a max over affine functions.
1028 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1029 forOp.getUpperBoundMap().getNumResults() != 1)
1030 return failure();
1031 if (cleanUpUnroll)
1032 // Force unroll including cleanup loop
1033 return loopUnrollFull(forOp);
1034 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1035 assert(false && "cleanup loop lower bound map for single result lower "
1036 "and upper bound maps can always be determined");
1037 }
1038
1039 ValueRange iterArgs(forOp.getRegionIterArgs());
1040 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1041
1042 // Scale the step of loop being unrolled by unroll factor.
1043 int64_t step = forOp.getStepAsInt();
1044 forOp.setStep(step * unrollFactor);
1045 generateUnrolledLoop(
1046 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1047 [&](unsigned i, Value iv, OpBuilder b) {
1048 // iv' = iv + i * step
1049 auto d0 = b.getAffineDimExpr(0);
1050 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1051 return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
1052 },
1053 /*annotateFn=*/annotateFn,
1054 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1055
1056 // Promote the loop body up if this has turned into a single iteration loop.
1057 (void)promoteIfSingleIteration(forOp);
1058 return success();
1059}
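// Editorial illustration (hypothetical input/output, not part of the upstream
// source): unrolling `affine.for %i = 0 to 15 { "test.op"(%i) ... }` by a
// factor of 4 scales the step to 4, truncates the main loop at the cleanup
// bound 12, and appends remapped copies of the body:
//   affine.for %i = 0 to 12 step 4 {
//     "test.op"(%i) "test.op"(%i + 1) "test.op"(%i + 2) "test.op"(%i + 3)
//   }
//   affine.for %i = 12 to 15 { "test.op"(%i) }   // cleanup loop
// (the %i + k operands are materialized as affine.apply ops).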
1060
1061LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1062 uint64_t unrollJamFactor) {
1063 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1064 if (mayBeConstantTripCount.has_value() &&
1065 *mayBeConstantTripCount < unrollJamFactor)
1066 return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1067 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1068}
1069
1070/// Check if all control operands of all loops are defined outside of `forOp`
1071/// and return false if not.
1072static bool areInnerBoundsInvariant(AffineForOp forOp) {
1073 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1074 for (auto controlOperand : aForOp.getControlOperands()) {
1075 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1076 return WalkResult::interrupt();
1077 }
1078 return WalkResult::advance();
1079 });
1080 return !walkResult.wasInterrupted();
1081}
1082
1083/// Unrolls and jams this loop by the specified factor.
1084LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1085 uint64_t unrollJamFactor) {
1086 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1087
1088 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1089 if (unrollJamFactor == 1) {
1090 if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1091 return failure();
1092 return success();
1093 }
1094
1095 // Nothing in the loop body other than the terminator.
1096 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1097 return success();
1098
1099 // If the trip count is lower than the unroll jam factor, no unroll jam.
1100 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1101 LDBG() << "[failed] trip count < unroll-jam factor";
1102 return failure();
1103 }
1104
1105 // If any control operand of any inner loop of `forOp` is defined within
1106 // `forOp`, no unroll jam.
1107 if (!areInnerBoundsInvariant(forOp))
1108 return failure();
1109
1110 // Gather all sub-blocks to jam upon the loop being unrolled.
1111 JamBlockGatherer<AffineForOp> jbg;
1112 jbg.walk(forOp);
1113 auto &subBlocks = jbg.subBlocks;
1114
1115 // Collect loops with iter_args.
1116 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1117 forOp.walk([&](AffineForOp aForOp) {
1118 if (aForOp.getNumIterOperands() > 0)
1119 loopsWithIterArgs.push_back(aForOp);
1120 });
1121
1122 // Get supported reductions to be used for creating reduction ops at the end.
1123 SmallVector<LoopReduction> reductions;
1124 if (forOp.getNumIterOperands() > 0)
1125 getSupportedReductions(forOp, reductions);
1126
1127 // Generate the cleanup loop if trip count isn't a multiple of
1128 // unrollJamFactor.
1129 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1130 // Loops where the lower bound is a max expression or the upper bound is
1131 // a min expression and the trip count doesn't divide the unroll factor
1132 // can't be unrolled since the lower bound of the cleanup loop in such cases
1133 // cannot be expressed as an affine function or a max over affine functions.
1134 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1135 forOp.getUpperBoundMap().getNumResults() != 1)
1136 return failure();
1137 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1138 assert(false && "cleanup loop lower bound map for single result lower "
1139 "and upper bound maps can always be determined");
1140 }
1141
1142 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1143 // iteration. There are (`unrollJamFactor` - 1) iterations.
1144 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1145
1146 // For any loop with iter_args, replace it with a new loop that has
1147 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1148 // operands.
1149 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1150 IRRewriter rewriter(forOp.getContext());
1151 for (AffineForOp oldForOp : loopsWithIterArgs) {
1152 SmallVector<Value> dupIterOperands, dupYieldOperands;
1153 ValueRange oldIterOperands = oldForOp.getInits();
1154 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1155 ValueRange oldYieldOperands =
1156 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1157 // Get additional iterOperands, iterArgs, and yield operands. We will
1158 // fix iterOperands and yield operands after cloning of sub-blocks.
1159 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1160 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1161 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1162 }
1163 // Create a new loop with additional iterOperands, iter_args and yield
1164 // operands. This new loop will take the loop body of the original loop.
1165 bool forOpReplaced = oldForOp == forOp;
1166 AffineForOp newForOp =
1167 cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1168 rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1169 [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1170 return dupYieldOperands;
1171 }));
1172 newLoopsWithIterArgs.push_back(newForOp);
1173 // `forOp` has been replaced with a new loop.
1174 if (forOpReplaced)
1175 forOp = newForOp;
1176 // Update `operandMaps` for `newForOp` iterArgs and results.
1177 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1178 unsigned oldNumIterArgs = oldIterArgs.size();
1179 ValueRange newResults = newForOp.getResults();
1180 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1181 assert(oldNumIterArgs == oldNumResults &&
1182 "oldNumIterArgs must be the same as oldNumResults");
1183 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1184 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1185 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1186 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1187 // to those in the `i`th new set.
1188 operandMaps[i - 1].map(newIterArgs[j],
1189 newIterArgs[i * oldNumIterArgs + j]);
1190 operandMaps[i - 1].map(newResults[j],
1191 newResults[i * oldNumResults + j]);
1192 }
1193 }
1194 }
1195
1196 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1197 int64_t step = forOp.getStepAsInt();
1198 forOp.setStep(step * unrollJamFactor);
1199
1200 auto forOpIV = forOp.getInductionVar();
1201 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1202 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1203 for (auto &subBlock : subBlocks) {
1204 // Builder to insert unroll-jammed bodies. Insert right at the end of
1205 // sub-block.
1206 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1207
1208 // If the induction variable is used, create a remapping to the value for
1209 // this unrolled instance.
1210 if (!forOpIV.use_empty()) {
1211 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1212 auto d0 = builder.getAffineDimExpr(0);
1213 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1214 auto ivUnroll =
1215 AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
1216 operandMaps[i - 1].map(forOpIV, ivUnroll);
1217 }
1218 // Clone the sub-block being unroll-jammed.
1219 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1220 builder.clone(*it, operandMaps[i - 1]);
1221 }
1222 // Fix iterOperands and yield op operands of newly created loops.
1223 for (auto newForOp : newLoopsWithIterArgs) {
1224 unsigned oldNumIterOperands =
1225 newForOp.getNumIterOperands() / unrollJamFactor;
1226 unsigned numControlOperands = newForOp.getNumControlOperands();
1227 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1228 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1229 assert(oldNumIterOperands == oldNumYieldOperands &&
1230 "oldNumIterOperands must be the same as oldNumYieldOperands");
1231 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1232 // The `i`th duplication of an old iterOperand or yield op operand
1233 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1234 // if such mapped value exists.
1235 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1236 operandMaps[i - 1].lookupOrDefault(
1237 newForOp.getOperand(numControlOperands + j)));
1238 yieldOp.setOperand(
1239 i * oldNumYieldOperands + j,
1240 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1241 }
1242 }
1243 }
1244 if (forOp.getNumResults() > 0) {
1245 // Create reduction ops to combine every `unrollJamFactor` related results
1246 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1247 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1248 // %1.
1249 rewriter.setInsertionPointAfter(forOp);
1250 auto loc = forOp.getLoc();
1251 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1252 for (LoopReduction &reduction : reductions) {
1253 unsigned pos = reduction.iterArgPosition;
1254 Value lhs = forOp.getResult(pos);
1255 Value rhs;
1256 SmallPtrSet<Operation *, 4> newOps;
1257 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1258 rhs = forOp.getResult(i * oldNumResults + pos);
1259 // Create ops based on reduction type.
1260 lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1261 if (!lhs)
1262 return failure();
1263 Operation *op = lhs.getDefiningOp();
1264 assert(op && "Reduction op should have been created");
1265 newOps.insert(op);
1266 }
1267 // Replace all uses except those in newly created reduction ops.
1268 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1269 }
1270 }
1271
1272 // Promote the loop body up if this has turned into a single iteration loop.
1273 (void)promoteIfSingleIteration(forOp);
1274 return success();
1275}
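// Editorial illustration (hypothetical input/output, not part of the upstream
// source): unroll-and-jamming
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 16 { "test.op"(%i, %j) ... }
//   }
// by a factor of 2 scales the outer step and jams the unrolled copies into the
// inner loop body:
//   affine.for %i = 0 to 8 step 2 {
//     affine.for %j = 0 to 16 { "test.op"(%i, %j) "test.op"(%i + 1, %j) }
//   }
// With iter_args, each copy additionally gets its own set of iter operands and
// yields, and the partial results are combined after the loop by the reduction
// ops created above.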
1276
1277/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1278/// nested within 'forOpA' as the only non-terminator operation in its block.
1279void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1280 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1281 auto &forOpABody = forOpA.getBody()->getOperations();
1282 auto &forOpBBody = forOpB.getBody()->getOperations();
1283
1284 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1285 // before forOpA (in forOpA's parent's block); this should leave forOpA's
1286 // body containing only the terminator.
1287 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1288 forOpABody, forOpABody.begin(),
1289 std::prev(forOpABody.end()));
1290 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1291 // body (this leaves forOpB's body containing only the terminator).
1292 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1293 std::prev(forOpBBody.end()));
1294 // 3) Splice forOpA into the beginning of forOpB's body.
1295 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1296 Block::iterator(forOpA));
1297}
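// Editorial illustration (not part of the upstream source): for the perfect
// nest
//   affine.for %i ... { affine.for %j ... { "test.op"(%i, %j) } }
// the three splices above produce
//   affine.for %j ... { affine.for %i ... { "test.op"(%i, %j) } }
// Each loop keeps its own bounds, so validity (e.g. the %j bounds not
// depending on %i) is the caller's responsibility.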
1298
1299// Checks each dependence component against the permutation to see if the
1300// desired loop interchange would violate dependences by making the
1301// dependence component lexicographically negative.
1302 static bool checkLoopInterchangeDependences(
1303 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1304 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1305 // Invert permutation map.
1306 unsigned maxLoopDepth = loops.size();
1307 SmallVector<unsigned, 4> loopPermMapInv;
1308 loopPermMapInv.resize(maxLoopDepth);
1309 for (unsigned i = 0; i < maxLoopDepth; ++i)
1310 loopPermMapInv[loopPermMap[i]] = i;
1311
1312 // Check each dependence component against the permutation to see if the
1313 // desired loop interchange permutation would make the dependence vectors
1314 // lexicographically negative.
1315 // Example 1: [-1, 1][0, 0]
1316 // Example 2: [0, 0][-1, 1]
1317 for (const auto &depComps : depCompsVec) {
1318 assert(depComps.size() >= maxLoopDepth);
1319 // Check if the first non-zero dependence component is positive.
1320 // This iterates through loops in the desired order.
1321 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1322 unsigned permIndex = loopPermMapInv[j];
1323 assert(depComps[permIndex].lb);
1324 int64_t depCompLb = *depComps[permIndex].lb;
1325 if (depCompLb > 0)
1326 break;
1327 if (depCompLb < 0)
1328 return false;
1329 }
1330 }
1331 return true;
1332}
1333
1334/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1335/// nested sequence of loops in 'loops' would violate dependences.
1336 bool mlir::affine::isValidLoopInterchangePermutation(
1337 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1338 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1339 unsigned maxLoopDepth = loops.size();
1340 if (maxLoopDepth == 1)
1341 return true;
1342
1343 // We cannot guarantee the validity of the interchange if the loops have
1344 // iter_args, since the dependence analysis does not take them into account.
1345 // Conservatively return false in such cases.
1346 if (llvm::any_of(loops, [](AffineForOp loop) {
1347 return loop.getNumIterOperands() > 0;
1348 }))
1349 return false;
1350
1351 // Gather dependence components for dependences between all ops in loop nest
1352 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1353 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1354 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1355 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1356}
1357
1358/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1359/// in it from outermost to innermost.
1360[[maybe_unused]] bool
1361 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1362 assert(!loops.empty() && "no loops provided");
1363
1364 // We already know that the block can't be empty.
1365 auto hasTwoElements = [](Block *block) {
1366 auto secondOpIt = std::next(block->begin());
1367 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1368 };
1369
1370 auto enclosingLoop = loops.front();
1371 for (auto loop : loops.drop_front()) {
1372 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1373 // parentForOp's body should be just this loop and the terminator.
1374 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1375 return false;
1376 enclosingLoop = loop;
1377 }
1378 return true;
1379}
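
// For illustration (hypothetical IR), this is a perfectly nested band: each
// body holds exactly the next loop plus the implicit terminator.
//
//   affine.for %i = 0 to 64 {
//     affine.for %j = 0 to 64 {
//       affine.for %k = 0 to 64 {
//         "test.use"(%i, %j, %k) : (index, index, index) -> ()
//       }
//     }
//   }
//
// If any other operation appeared alongside the %j or %k loop, `hasTwoElements`
// would fail for that body and the function would return false.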
1380
1381// input[i] should move from position i -> permMap[i]. Returns the position in
1382// `input` that becomes the new outermost loop.
1383unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1384 ArrayRef<unsigned> permMap) {
1385 assert(input.size() == permMap.size() && "invalid permutation map size");
1386 // Check whether the permutation spec is valid. This is a small vector - we'll
1387 // just sort and check if it's iota.
1388 SmallVector<unsigned, 4> checkPermMap(permMap);
1389 llvm::sort(checkPermMap);
1390 if (llvm::any_of(llvm::enumerate(checkPermMap),
1391 [](const auto &en) { return en.value() != en.index(); }))
1392 assert(false && "invalid permutation map");
1393
1394 // Nothing to do.
1395 if (input.size() < 2)
1396 return 0;
1397
1398 assert(isPerfectlyNested(input) && "input not perfectly nested");
1399
1400 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1401 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1402 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1403 for (unsigned i = 0, e = input.size(); i < e; ++i)
1404 invPermMap.push_back({permMap[i], i});
1405 llvm::sort(invPermMap);
1406
1407 // Move the innermost loop body to the loop that would be the innermost in the
1408 // permuted nest (only if the innermost loop is going to change).
1409 if (permMap.back() != input.size() - 1) {
1410 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1411 Block *srcBody = ((AffineForOp)input.back()).getBody();
1412 destBody->getOperations().splice(destBody->begin(),
1413 srcBody->getOperations(), srcBody->begin(),
1414 std::prev(srcBody->end()));
1415 }
1416
1417 // We'll move each loop in `input` in the reverse order so that its body is
1418 // empty when we are moving it; this incurs zero copies and no erasing.
1419 for (int i = input.size() - 1; i >= 0; --i) {
1420 // If this has to become the outermost loop after permutation, add it to the
1421 // parent block of the original root.
1422 if (permMap[i] == 0) {
1423 // If the root remains the same, nothing to do.
1424 if (i == 0)
1425 continue;
1426 // Make input[i] the new outermost loop moving it into parentBlock.
1427 auto *parentBlock = input[0]->getBlock();
1428 parentBlock->getOperations().splice(Block::iterator(input[0]),
1429 input[i]->getBlock()->getOperations(),
1430 Block::iterator(input[i]));
1431 continue;
1432 }
1433
1434 // If the parent in the permuted order is the same as in the original,
1435 // nothing to do.
1436 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1437 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1438 continue;
1439
1440 // Move input[i] to its surrounding loop in the transformed nest.
1441 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1442 destBody->getOperations().splice(destBody->begin(),
1443 input[i]->getBlock()->getOperations(),
1444 Block::iterator(input[i]));
1445 }
1446
1447 return invPermMap[0].second;
1448}
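
// Minimal usage sketch (rootForOp and the band are placeholders) permuting an
// (i, j, k) nest to (k, i, j): permMap maps original position to new position,
// so i (pos 0) -> 1, j -> 2, k -> 0; the returned index (here 2) identifies
// which input loop became the new outermost loop.
//
//   SmallVector<AffineForOp, 4> band;
//   getPerfectlyNestedLoops(band, rootForOp);
//   SmallVector<unsigned, 4> permMap = {1, 2, 0};
//   unsigned newRootIdx = permuteLoops(band, permMap);
//   AffineForOp newRoot = band[newRootIdx];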
1449
1450// Sinks all sequential loops to the innermost levels (while preserving
1451// relative order among them) and moves all parallel loops to the
1452// outermost (while again preserving relative order among them).
1453AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1454 SmallVector<AffineForOp, 4> loops;
1455 getPerfectlyNestedLoops(loops, forOp);
1456 if (loops.size() < 2)
1457 return forOp;
1458
1459 // Gather dependence components for dependences between all ops in loop nest
1460 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1461 unsigned maxLoopDepth = loops.size();
1462 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1463 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1464
1465 // Mark loops as either parallel or sequential.
1466 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1467 for (auto &depComps : depCompsVec) {
1468 assert(depComps.size() >= maxLoopDepth);
1469 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1470 DependenceComponent &depComp = depComps[j];
1471 assert(depComp.lb.has_value() && depComp.ub.has_value());
1472 if (*depComp.lb != 0 || *depComp.ub != 0)
1473 isParallelLoop[j] = false;
1474 }
1475 }
1476
1477 unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1478
1479 // Compute permutation of loops that sinks sequential loops (and thus raises
1480 // parallel loops) while preserving relative order.
1481 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1482 unsigned nextSequentialLoop = numParallelLoops;
1483 unsigned nextParallelLoop = 0;
1484 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1485 if (isParallelLoop[i]) {
1486 loopPermMap[i] = nextParallelLoop++;
1487 } else {
1488 loopPermMap[i] = nextSequentialLoop++;
1489 }
1490 }
1491
1492 // Check if permutation 'loopPermMap' would violate dependences.
1493 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1494 return forOp;
1495 // Perform loop interchange according to permutation 'loopPermMap'.
1496 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1497 return loops[loopNestRootIndex];
1498}
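
// Rough illustration (dependences invented): for a nest (%i, %j) where only
// %i carries a dependence (non-zero component along %i) and %j is parallel,
// the permutation computed above is loopPermMap = {1, 0}; if the dependence
// check accepts it, %j becomes the outermost loop and the sequential %i is
// sunk innermost.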
1499
1500// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1501// lower (resp. upper) loop bound. When called for both the lower and upper
1502// bounds, the resulting IR resembles:
1503//
1504// ```mlir
1505 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1506// ...
1507// }
1508// ```
1510 SmallVector<Value, 4> *operands,
1511 int64_t offset = 0) {
1512 auto bounds = llvm::to_vector<4>(map->getResults());
1513 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1514 operands->insert(operands->begin() + map->getNumDims(), iv);
1515 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1516 b.getContext());
1517 canonicalizeMapAndOperands(map, operands);
1518}
1519
1520// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1521// Stripmine-sink is a primitive building block for generalized tiling of
1522// imperfectly nested loops.
1523// This transformation is purely mechanical and does not check legality,
1524// profitability or even structural correctness. It is the user's
1525// responsibility to specify `targets` that are dominated by `forOp`.
1526// Returns the new AffineForOps, one per `targets`, nested immediately under
1527// each of the `targets`.
1528static SmallVector<AffineForOp, 8>
1529stripmineSink(AffineForOp forOp, uint64_t factor,
1530 ArrayRef<AffineForOp> targets) {
1531 auto originalStep = forOp.getStepAsInt();
1532 auto scaledStep = originalStep * factor;
1533 forOp.setStep(scaledStep);
1534
1535 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1536
1537 // Lower-bound map creation.
1538 auto lbMap = forOp.getLowerBoundMap();
1539 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1540 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1541
1542 // Upper-bound map creation.
1543 auto ubMap = forOp.getUpperBoundMap();
1544 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1545 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1546 /*offset=*/scaledStep);
1547
1548 auto iv = forOp.getInductionVar();
1549 SmallVector<AffineForOp, 8> innerLoops;
1550 for (auto t : targets) {
1551 // Insert newForOp before the terminator of `t`.
1552 auto b = OpBuilder::atBlockTerminator(t.getBody());
1553 auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
1554 ubOperands, ubMap, originalStep);
1555 auto begin = t.getBody()->begin();
1556 // Skip terminator and `newForOp` which is just before the terminator.
1557 auto nOps = t.getBody()->getOperations().size() - 2;
1558 newForOp.getBody()->getOperations().splice(
1559 newForOp.getBody()->getOperations().begin(),
1560 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1561 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1562 newForOp.getRegion());
1563 innerLoops.push_back(newForOp);
1564 }
1565
1566 return innerLoops;
1567}
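
// Hedged before/after sketch (bounds invented, written schematically) of
// stripmineSink(%i-loop, /*factor=*/4, {%j-loop}) on a unit-step nest:
//
//   affine.for %i = 0 to 128 {
//     affine.for %j = 0 to 64 {
//       "test.use"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// becomes (with the intra-tile loop sunk under the target %j and uses of the
// old IV rewritten to the new one):
//
//   affine.for %i = 0 to 128 step 4 {
//     affine.for %j = 0 to 64 {
//       affine.for %ii = max(0, %i) to min(128, %i + 4) {   // schematic bounds
//         "test.use"(%ii, %j) : (index, index) -> ()
//       }
//     }
//   }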
1568
1569// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1570// Returns the new AffineForOps, nested immediately under `target`.
1571template <typename SizeType>
1572static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1573 AffineForOp target) {
1574 // TODO: Use cheap structural assertions that targets are nested under
1575 // forOp and that targets are not nested under each other when DominanceInfo
1576 // exposes the capability. It seems overkill to construct a whole function
1577 // dominance tree at this point.
1578 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1579 assert(res.size() == 1 && "Expected 1 inner forOp");
1580 return res[0];
1581}
1582
1583SmallVector<SmallVector<AffineForOp, 8>, 8>
1584mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1585 ArrayRef<AffineForOp> targets) {
1586 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1587 SmallVector<AffineForOp, 8> currentTargets(targets);
1588 for (auto it : llvm::zip(forOps, sizes)) {
1589 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1590 res.push_back(step);
1591 currentTargets = step;
1592 }
1593 return res;
1594}
1595
1596SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1597 ArrayRef<uint64_t> sizes,
1598 AffineForOp target) {
1599 SmallVector<AffineForOp, 8> res;
1600 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1601 res.push_back(llvm::getSingleElement(loops));
1602 return res;
1603}
1604
1605LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1606 if (loops.size() < 2)
1607 return success();
1608
1609 AffineForOp innermost = loops.back();
1610 AffineForOp outermost = loops.front();
1611 AffineBound ub = outermost.getUpperBound();
1612 AffineMap origUbMap = ub.getMap();
1613 Location loc = outermost.getLoc();
1614 OpBuilder builder(outermost);
1615 for (AffineForOp loop : loops) {
1616 // We only work on normalized loops.
1617 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1618 loop.getConstantLowerBound() != 0)
1619 return failure();
1620 }
1621 SmallVector<Value, 4> upperBoundSymbols;
1622 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1623 ub.getOperands().end());
1624
1625 // 1. Store the upper bound of the outermost loop in a variable.
1626 Value prev;
1627 if (!llvm::hasSingleElement(origUbMap.getResults()))
1628 prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1629 else
1630 prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1631 upperBoundSymbols.push_back(prev);
1632
1633 // 2. Emit code computing the upper bound of the coalesced loop as product of
1634 // the number of iterations of all loops.
1635 for (AffineForOp loop : loops.drop_front()) {
1636 ub = loop.getUpperBound();
1637 origUbMap = ub.getMap();
1638 ubOperands = ub.getOperands();
1639 Value upperBound;
1640 // If upper bound map has more than one result, take their minimum.
1641 if (!llvm::hasSingleElement(origUbMap.getResults()))
1642 upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1643 else
1644 upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1645 upperBoundSymbols.push_back(upperBound);
1646 SmallVector<Value, 4> operands;
1647 operands.push_back(prev);
1648 operands.push_back(upperBound);
1649 // Maintain running product of loop upper bounds.
1650 prev = AffineApplyOp::create(
1651 builder, loc,
1652 AffineMap::get(/*dimCount=*/1,
1653 /*symbolCount=*/1,
1654 builder.getAffineDimExpr(0) *
1655 builder.getAffineSymbolExpr(0)),
1656 operands);
1657 }
1658 // Set upper bound of the coalesced loop.
1659 AffineMap newUbMap = AffineMap::get(
1660 /*dimCount=*/0,
1661 /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1662 outermost.setUpperBound(prev, newUbMap);
1663
1664 builder.setInsertionPointToStart(outermost.getBody());
1665
1666 // 3. Remap induction variables. For each original loop, the value of the
1667 // induction variable can be obtained by dividing the induction variable of
1668 // the linearized loop by the total number of iterations of the loops nested
1669 // in it modulo the number of iterations in this loop (remove the values
1670 // related to the outer loops):
1671 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1672 // Compute these iteratively from the innermost loop by creating a "running
1673 // quotient" of division by the range.
1674 Value previous = outermost.getInductionVar();
1675 for (unsigned idx = loops.size(); idx > 0; --idx) {
1676 if (idx != loops.size()) {
1677 SmallVector<Value, 4> operands;
1678 operands.push_back(previous);
1679 operands.push_back(upperBoundSymbols[idx]);
1680 previous = AffineApplyOp::create(builder, loc,
1681 AffineMap::get(
1682 /*dimCount=*/1, /*symbolCount=*/1,
1683 builder.getAffineDimExpr(0).floorDiv(
1684 builder.getAffineSymbolExpr(0))),
1685 operands);
1686 }
1687 // Modified value of the induction variables of the nested loops after
1688 // coalescing.
1689 Value inductionVariable;
1690 if (idx == 1) {
1691 inductionVariable = previous;
1692 } else {
1693 SmallVector<Value, 4> applyOperands;
1694 applyOperands.push_back(previous);
1695 applyOperands.push_back(upperBoundSymbols[idx - 1]);
1696 inductionVariable = AffineApplyOp::create(
1697 builder, loc,
1698 AffineMap::get(
1699 /*dimCount=*/1, /*symbolCount=*/1,
1700 builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1701 applyOperands);
1702 }
1703 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1704 inductionVariable, loops.back().getRegion());
1705 }
1706
1707 // 4. Move the operations from the innermost just above the second-outermost
1708 // loop, delete the extra terminator and the second-outermost loop.
1709 AffineForOp secondOutermostLoop = loops[1];
1710 innermost.getBody()->back().erase();
1711 outermost.getBody()->getOperations().splice(
1712 Block::iterator(secondOutermostLoop.getOperation()),
1713 innermost.getBody()->getOperations());
1714 for (auto [iter, init] :
1715 llvm::zip_equal(secondOutermostLoop.getRegionIterArgs(),
1716 secondOutermostLoop.getInits())) {
1717 iter.replaceAllUsesWith(init);
1718 iter.dropAllUses();
1719 }
1720 secondOutermostLoop.erase();
1721 return success();
1722}
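
// Rough sketch of the result (IR names invented) for coalescing a normalized
// two-loop nest with upper bounds %N and %M:
//
//   affine.for %i = 0 to %N {
//     affine.for %j = 0 to %M {
//       "test.use"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// becomes a single loop over the product of the trip counts, with the
// original IVs recovered via mod/floordiv of the linear IV:
//
//   %ub = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%N)[%M]
//   affine.for %iv = 0 to %ub {
//     %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%M]
//     %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%M]
//     "test.use"(%i, %j) : (index, index) -> ()
//   }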
1723
1724void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1725 ArrayRef<Value> processorId,
1726 ArrayRef<Value> numProcessors) {
1727 assert(processorId.size() == numProcessors.size());
1728 if (processorId.empty())
1729 return;
1730
1731 OpBuilder b(forOp);
1732 Location loc(forOp.getLoc());
1733 AffineExpr lhs, rhs;
1734 bindSymbols(forOp.getContext(), lhs, rhs);
1735 auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1736 auto addMap = AffineMap::get(0, 2, lhs + rhs);
1737
1738 Value linearIndex = processorId.front();
1739 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1740 auto mulApplyOp = AffineApplyOp::create(
1741 b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1742 linearIndex = AffineApplyOp::create(b, loc, addMap,
1743 ValueRange{mulApplyOp, processorId[i]});
1744 }
1745
1746 auto mulApplyOp = AffineApplyOp::create(
1747 b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1748 Value lb = AffineApplyOp::create(
1749 b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1750 forOp.setLowerBound(lb);
1751
1752 Value step = forOp.getStep();
1753 for (auto numProcs : numProcessors)
1754 step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
1755 forOp.setStep(step);
1756}
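
// Worked example (operands invented): with processorId = {%bx, %tx} and
// numProcessors = {%nbx, %ntx}, the linear id is %bx * %ntx + %tx, and a loop
// `scf.for %i = %lb to %ub step %s` is rewritten so that
//   new lb   = %lb + (%bx * %ntx + %tx) * %s
//   new step = %s * %nbx * %ntx
// i.e. each processor starts at its own offset and strides over the whole
// processor grid.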
1757
1758/// Given a memref region, determine the lowest depth at which transfers can be
1759/// placed for it, and return the corresponding block, start and end positions
1760/// in the block for placing incoming (read) and outgoing (write) copies
1761/// respectively. The lowest depth depends on whether the region being accessed
1762/// is hoistable with respect to one or more immediately surrounding loops.
1763static void
1764findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1765 Block::iterator &begin, Block::iterator &end,
1766 Block **copyPlacementBlock,
1767 Block::iterator *copyInPlacementStart,
1768 Block::iterator *copyOutPlacementStart) {
1769 const auto *cst = region.getConstraints();
1770 SmallVector<Value, 4> symbols;
1771 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1772
1773 SmallVector<Operation *, 4> enclosingAffineOps;
1774 getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1775 // Walk up loop parents till we find an IV on which this region is
1776 // symbolic/variant or we hit `hoistGuard`.
1777 auto it = enclosingAffineOps.rbegin();
1778 AffineForOp lastInvariantFor;
1779 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1780 Operation *enclosingOp = *it;
1781 // We can't hoist past the definition of the memref being copied.
1782 Value memref = region.memref;
1783 if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1784 LDBG() << "memref definition will end up not dominating hoist location";
1785 break;
1786 }
1787
1788 auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1789 if (!affineFor)
1790 break;
1791 // TODO: also need to be checking this for regions symbols that
1792 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1793 if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1794 break;
1795 lastInvariantFor = affineFor;
1796 }
1797
1798 if (it != enclosingAffineOps.rbegin()) {
1799 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1800 *copyOutPlacementStart = std::next(*copyInPlacementStart);
1801 *copyPlacementBlock = lastInvariantFor->getBlock();
1802 } else {
1803 *copyInPlacementStart = begin;
1804 *copyOutPlacementStart = end;
1805 *copyPlacementBlock = &block;
1806 }
1807}
1808
1809// Info comprising stride and number of elements transferred every stride.
1810struct StrideInfo {
1811 int64_t stride;
1812 int64_t numEltPerStride;
1813};
1814
1815/// Returns striding information for a copy/transfer of this region with
1816/// potentially multiple striding levels from outermost to innermost. For an
1817/// n-dimensional region, there can be at most n-1 levels of striding
1818/// successively nested.
1819// TODO: make this work with non-identity layout maps.
1820static void getMultiLevelStrides(const MemRefRegion &region,
1821 ArrayRef<int64_t> bufferShape,
1822 SmallVectorImpl<StrideInfo> *strideInfos) {
1823 if (bufferShape.size() <= 1)
1824 return;
1825
1826 int64_t numEltPerStride = 1;
1827 int64_t stride = 1;
1828 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1829 int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1830 stride *= dimSize;
1831 numEltPerStride *= bufferShape[d];
1832 // A stride is needed only if the region has a shorter extent than the
1833 // memref along the dimension *and* has an extent greater than one along the
1834 // next major dimension.
1835 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1836 strideInfos->push_back({stride, numEltPerStride});
1837 }
1838 }
1839}
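
// Worked example (shapes invented): for a memref<64x128xf32> with a region
// buffer shape of 8x32, the walk at d = 1 accumulates stride = 128 elements
// and numEltPerStride = 32; since 32 < 128 and the extent along d0 is 8 > 1,
// a single level {stride = 128, numEltPerStride = 32} is recorded: 8 chunks
// of 32 contiguous elements, each 128 elements apart in the original memref.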
1840
1841/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1842/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1843/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1844/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1845/// contain the expressions to be subtracted out from the respective copy loop
1846/// iterators in order to index the fast buffer. If `copyOut' is true, generates
1847/// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1848/// copy nest is inserted.
1849//
1850/// The copy-in nest is generated as follows as an example for a 2-d region:
1851/// for x = ...
1852/// for y = ...
1853/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1854///
1855static AffineForOp
1856generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1857 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1858 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1859 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1860 OpBuilder b) {
1861 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1862 return lbMap.getNumInputs() == lbOperands.size();
1863 }));
1864 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1865 return ubMap.getNumInputs() == ubOperands.size();
1866 }));
1867
1868 unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1869 // A copy nest can't be generated for 0-ranked memrefs.
1870 assert(rank != 0 && "non-zero rank memref expected");
1871 assert(lbMaps.size() == rank && "wrong number of lb maps");
1872 assert(ubMaps.size() == rank && "wrong number of ub maps");
1873
1874 SmallVector<Value, 4> memIndices;
1875 SmallVector<AffineExpr, 4> fastBufExprs;
1876 SmallVector<Value, 4> fastBufMapOperands;
1877 AffineForOp copyNestRoot;
1878 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1879 for (unsigned d = 0; d < rank; ++d) {
1880 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1881 ubOperands, ubMaps[d]);
1882 if (d == 0)
1883 copyNestRoot = forOp;
1884
1885 b = OpBuilder::atBlockTerminator(forOp.getBody());
1886
1887 auto fastBufOffsetMap =
1888 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1889 auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);
1890
1891 // Construct the subscript for the fast memref being copied into/from:
1892 // x - offset_x.
1893 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1894 b.getAffineDimExpr(2 * d));
1895 fastBufMapOperands.push_back(offset);
1896 fastBufMapOperands.push_back(forOp.getInductionVar());
1897 mayBeDeadApplys.push_back(offset);
1898
1899 // Subscript for the slow memref being copied.
1900 memIndices.push_back(forOp.getInductionVar());
1901 }
1902
1903 auto fastBufMap =
1904 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1905 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1906 fastBufMap = simplifyAffineMap(fastBufMap);
1907 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1908
1909 // Drop any dead affine.applys.
1910 for (auto applyOp : mayBeDeadApplys)
1911 if (applyOp.use_empty())
1912 applyOp.erase();
1913
1914 if (!isCopyOut) {
1915 // Copy in.
1916 auto load = AffineLoadOp::create(b, loc, memref, memIndices);
1917 AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
1918 fastBufMapOperands);
1919 return copyNestRoot;
1920 }
1921
1922 // Copy out.
1923 auto load =
1924 AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
1925 AffineStoreOp::create(b, loc, load, memref, memIndices);
1926 return copyNestRoot;
1927}
1928
1929[[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
1930 return block.getParentOp()->emitRemark();
1931}
1932
1933/// Creates a buffer in the faster memory space for the specified memref region
1934/// (memref has to be non-zero ranked); generates a copy from the lower memory
1935/// space to this one, and replaces all loads/stores in the block range
1936/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1937/// if copies could not be generated due to yet unimplemented cases.
1938/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1939/// specify the insertion points where the incoming copies and outgoing copies,
1940/// respectively, should be inserted (the insertion happens right before the
1941/// insertion point). Since `begin` can itself be invalidated due to the memref
1942/// rewriting done from this method, the output argument `nBegin` is set to its
1943/// replacement (set to `begin` if no invalidation happens). Since outgoing
1944/// copies could have been inserted at `end`, the output argument `nEnd` is set
1945/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1946/// allocated.
1947static LogicalResult generateCopy(
1948 const MemRefRegion &region, Block *block, Block::iterator begin,
1949 Block::iterator end, Block *copyPlacementBlock,
1950 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1951 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1952 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1953 Block::iterator *nBegin, Block::iterator *nEnd) {
1954 *nBegin = begin;
1955 *nEnd = end;
1956
1957 auto f = begin->getParentOfType<FunctionOpInterface>();
1958 OpBuilder topBuilder(f.getFunctionBody());
1959 Value zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);
1960
1961 *sizeInBytes = 0;
1962
1963 if (begin == end)
1964 return success();
1965
1966 // Record the last op in the block for which we are performing copy
1967 // generation. We later do the memref replacement only in [begin, lastCopyOp]
1968 // so that the original memref's used in the data movement code themselves
1969 // don't get replaced.
1970 Operation *lastCopyOp = end->getPrevNode();
1971
1972 // Is the copy out point at the end of the block where we are doing
1973 // explicit copying.
1974 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1975
1976 // Copies for read regions are going to be inserted at 'begin'.
1977 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1978 // Copies for write regions are going to be inserted at 'end'.
1979 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1980 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1981
1982 // Builder to create constants at the top level.
1983 auto func =
1984 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1985 OpBuilder top(func.getFunctionBody());
1986
1987 auto loc = region.loc;
1988 auto memref = region.memref;
1989 auto memRefType = cast<MemRefType>(memref.getType());
1990
1991 if (!memRefType.getLayout().isIdentity()) {
1992 LDBG() << "Non-identity layout map not yet supported";
1993 return failure();
1994 }
1995
1996 // Indices to use for the copying.
1997 // Indices for the original memref being copied from/to.
1998 SmallVector<Value, 4> memIndices;
1999 // Indices for the faster buffer being copied into/from.
2000 SmallVector<Value, 4> bufIndices;
2001
2002 unsigned rank = memRefType.getRank();
2003 if (rank == 0) {
2004 LDBG() << "Only non-zero ranked memrefs supported";
2005 return failure();
2006 }
2007
2008 SmallVector<int64_t, 4> fastBufferShape;
2009
2010 // Compute the extents of the buffer.
2011 SmallVector<AffineMap, 4> lbs;
2012 lbs.reserve(rank);
2013 std::optional<int64_t> numElements =
2014 region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2015 if (!numElements) {
2016 LDBG() << "Non-constant region size not supported";
2017 return failure();
2018 }
2019
2020 if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2021 // This can be supported in the future if needed.
2022 LDBG() << "Max lower bound for memref region start not supported";
2023 return failure();
2024 }
2025
2026 if (*numElements == 0) {
2027 LDBG() << "Nothing to copy";
2028 return success();
2029 }
2030
2031 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2032 for (unsigned i = 0; i < rank; ++i) {
2033 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2034 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2035 LDBG() << "Missing lower or upper bound for region along dimension: "
2036 << i;
2037 return failure();
2038 }
2039 }
2040
2041 const FlatAffineValueConstraints *cst = region.getConstraints();
2042 // 'regionSymbols' hold values that this memory region is symbolic/parametric
2043 // on; these typically include loop IVs surrounding the level at which the
2044 // copy generation is being done or other valid symbols in MLIR.
2045 SmallVector<Value, 8> regionSymbols;
2046 cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2047
2048 // Construct the access expression for the fast memory buffer. The access
2049 // expression for a particular dimension of the fast buffer is obtained by
2050 // subtracting out the lower bound on the original memref's data region
2051 // along the corresponding dimension.
2052
2053 // Index start offsets for faster memory buffer relative to the original.
2054 SmallVector<AffineExpr, 4> fastBufOffsets;
2055 fastBufOffsets.reserve(rank);
2056 for (unsigned d = 0; d < rank; d++) {
2057 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2058 "incorrect bound size");
2059
2060 // Set copy start location for this dimension in the lower memory space
2061 // memref.
2062 if (lbs[d].isSingleConstant()) {
2063 auto indexVal = lbs[d].getSingleConstantResult();
2064 if (indexVal == 0) {
2065 memIndices.push_back(zeroIndex);
2066 } else {
2067 memIndices.push_back(
2068 arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
2069 }
2070 } else {
2071 // The coordinate for the start location is just the lower bound along the
2072 // corresponding dimension on the memory region (stored in 'offset').
2073 // Remap all inputs of the map to dimensions uniformly since in the
2074 // generate IR we need valid affine symbols as opposed to "symbols" for
2075 // the purpose of the memref region.
2076 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2077 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2078 symReplacements[i] = top.getAffineDimExpr(i);
2079 lbs[d] = lbs[d].replaceDimsAndSymbols(
2080 /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2081 /*numResultSyms=*/0);
2082 memIndices.push_back(
2083 AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
2084 }
2085 // The fast buffer is copied into at location zero; addressing is relative.
2086 bufIndices.push_back(zeroIndex);
2087
2088 // Record the offsets since they are needed to remap the memory accesses of
2089 // the original memref further below.
2090 fastBufOffsets.push_back(lbs[d].getResult(0));
2091 }
2092
2093 // The faster memory space buffer.
2094 Value fastMemRef;
2095
2096 // Check if a buffer was already created.
2097 bool existingBuf = fastBufferMap.count(memref) > 0;
2098 if (!existingBuf) {
2099 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2100 auto fastMemRefType =
2101 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2102 fastBufferLayout, copyOptions.fastMemorySpace);
2103
2104 // Create the fast memory space buffer just before the 'affine.for'
2105 // operation.
2106 fastMemRef =
2107 memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
2108 // Record it.
2109 fastBufferMap[memref] = fastMemRef;
2110 // fastMemRefType is a constant shaped memref.
2111 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2112 // We don't account for things of unknown size.
2113 *sizeInBytes = maySizeInBytes.value_or(0);
2114
2115 LLVM_DEBUG(emitRemarkForBlock(*block)
2116 << "Creating fast buffer of type " << fastMemRefType
2117 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2118 << " KiB\n");
2119 } else {
2120 // Reuse the one already created.
2121 fastMemRef = fastBufferMap[memref];
2122 }
2123
2124 auto numElementsSSA = arith::ConstantIndexOp::create(top, loc, *numElements);
2125
2126 Value dmaStride;
2127 Value numEltPerDmaStride;
2128 if (copyOptions.generateDma) {
2129 SmallVector<StrideInfo, 4> dmaStrideInfos;
2130 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2131
2132 // TODO: use all stride levels once DmaStartOp is extended for
2133 // multi-level strides.
2134 if (dmaStrideInfos.size() > 1) {
2135 LDBG() << "Only up to one level of stride supported";
2136 return failure();
2137 }
2138
2139 if (!dmaStrideInfos.empty()) {
2140 dmaStride =
2141 arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
2142 numEltPerDmaStride = arith::ConstantIndexOp::create(
2143 top, loc, dmaStrideInfos[0].numEltPerStride);
2144 }
2145 }
2146
2147 // Create fully composed affine maps for each memref.
2148 auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2149 fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2150 auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2151 fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2152
2153 if (!copyOptions.generateDma) {
2154 // Point-wise copy generation.
2155 auto copyNest =
2156 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2157 /*lbOperands=*/regionSymbols, ubMaps,
2158 /*ubOperands=*/regionSymbols, fastBufOffsets,
2159 /*isCopyOut=*/region.isWrite(), b);
2160
2161 // Record this so that we can skip it from yet another copy.
2162 copyNests.insert(copyNest);
2163
2164 // Since new ops are being appended (for copy out's), adjust the end to
2165 // mark end of block range being processed if necessary.
2166 if (region.isWrite() && isCopyOutAtEndOfBlock)
2167 *nEnd = Block::iterator(copyNest.getOperation());
2168 } else {
2169 // DMA generation.
2170 // Create a tag (single element 1-d memref) for the DMA.
2171 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2172 copyOptions.tagMemorySpace);
2173 auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);
2174
2175 SmallVector<Value, 4> tagIndices({zeroIndex});
2176 auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2177 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2178 if (!region.isWrite()) {
2179 // DMA non-blocking read from original buffer to fast buffer.
2180 AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
2181 fastMemRef, bufAffineMap, bufIndices, tagMemRef,
2182 tagAffineMap, tagIndices, numElementsSSA,
2183 dmaStride, numEltPerDmaStride);
2184 } else {
2185 // DMA non-blocking write from fast buffer to the original memref.
2186 auto op = AffineDmaStartOp::create(
2187 b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2188 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2189 dmaStride, numEltPerDmaStride);
2190 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2191 // end to mark end of block range being processed.
2192 if (isCopyOutAtEndOfBlock)
2193 *nEnd = Block::iterator(op.getOperation());
2194 }
2195
2196 // Matching DMA wait to block on completion; tag always has a 0 index.
2197 AffineDmaWaitOp::create(b, loc, tagMemRef, tagAffineMap, zeroIndex,
2198 numElementsSSA);
2199
2200 // Generate dealloc for the tag.
2201 auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
2202 if (*nEnd == end && isCopyOutAtEndOfBlock)
2203 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2204 // mark end of range of the original.
2205 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2206 }
2207
2208 // Generate dealloc for the buffer.
2209 if (!existingBuf) {
2210 auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
2211 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2212 // the fast buffer (since it marks the new end insertion point).
2213 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2214 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2215 }
2216
2217 // Replace all uses of the old memref with the faster one while remapping
2218 // access indices (subtracting out lower bound offsets for each dimension).
2219 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2220 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2221 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2222 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2223 // d2, d3 correspond to the original indices (%i, %j).
2224 SmallVector<AffineExpr, 4> remapExprs;
2225 remapExprs.reserve(rank);
2226 for (unsigned i = 0; i < rank; i++) {
2227 // The starting operands of indexRemap will be regionSymbols (the symbols on
2228 // which the memref region is parametric); then those corresponding to
2229 // the memref's original indices follow.
2230 auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2231 remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2232 }
2233 auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2234 b.getContext());
2235
2236 // Record the begin since it may be invalidated by memref replacement.
2237 Block::iterator prevOfBegin;
2238 bool isBeginAtStartOfBlock = (begin == block->begin());
2239 if (!isBeginAtStartOfBlock)
2240 prevOfBegin = std::prev(begin);
2241
2242 auto userFilterFn = [&](Operation *user) {
2243 auto *ancestorUser = block->findAncestorOpInBlock(*user);
2244 return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2245 !lastCopyOp->isBeforeInBlock(ancestorUser);
2246 };
2247
2248 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2249 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2250 /*extraIndices=*/{}, indexRemap,
2251 /*extraOperands=*/regionSymbols,
2252 /*symbolOperands=*/{}, userFilterFn);
2253
2254 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2255
2256 return success();
2257}
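
// Rough sketch (types, bounds, and names invented) of what the non-DMA path
// above emits for a read region of a memref %A accessed inside a tile: an
// alloc of the fast buffer at the copy-in placement point, a pointwise
// copy-in nest, the in-range accesses rewritten to the buffer with the
// region's lower bounds subtracted out, and a dealloc at the copy-out
// placement point:
//
//   %buf = memref.alloc() : memref<32x32xf32, 1>
//   affine.for %x = #lb0(%i) to #ub0(%i) {
//     affine.for %y = #lb1(%j) to #ub1(%j) {
//       %v = affine.load %A[%x, %y] : memref<256x256xf32>
//       affine.store %v, %buf[%x - %i, %y - %j] : memref<32x32xf32, 1>
//     }
//   }
//   ... original loads/stores in [begin, end) now use %buf with remapped indices ...
//   memref.dealloc %buf : memref<32x32xf32, 1>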
2258
2259/// Construct the memref region to just include the entire memref. Returns false
2260/// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2261/// enclosing loop IVs of `op` (starting from the outermost) that the region
2262/// is parametric on.
2263static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2264 MemRefRegion *region) {
2265 unsigned rank;
2266 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2267 rank = loadOp.getMemRefType().getRank();
2268 region->memref = loadOp.getMemRef();
2269 region->setWrite(false);
2270 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2271 rank = storeOp.getMemRefType().getRank();
2272 region->memref = storeOp.getMemRef();
2273 region->setWrite(true);
2274 } else {
2275 assert(false && "expected load or store op");
2276 return false;
2277 }
2278 auto memRefType = cast<MemRefType>(region->memref.getType());
2279 if (!memRefType.hasStaticShape())
2280 return false;
2281
2282 auto *regionCst = region->getConstraints();
2283
2284 // Just get the first numSymbols IVs, which the memref region is parametric
2285 // on.
2286 SmallVector<AffineForOp, 4> ivs;
2287 getAffineForIVs(*op, &ivs);
2288 ivs.resize(numParamLoopIVs);
2289 SmallVector<Value, 4> symbols;
2290 extractForInductionVars(ivs, &symbols);
2291 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2292 regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2293
2294 // Memref dim sizes provide the bounds.
2295 for (unsigned d = 0; d < rank; d++) {
2296 auto dimSize = memRefType.getDimSize(d);
2297 assert(dimSize > 0 && "filtered dynamic shapes above");
2298 regionCst->addBound(BoundType::LB, d, 0);
2299 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2300 }
2301 return true;
2302}
2303
2304LogicalResult
2305mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2306 const AffineCopyOptions &copyOptions,
2307 std::optional<Value> filterMemRef,
2308 DenseSet<Operation *> &copyNests) {
2309 if (begin == end)
2310 return success();
2311
2312 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2313 "Inconsistent block begin/end args");
2314 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2315
2316 Block *block = begin->getBlock();
2317
2318 // Copies will be generated for this depth, i.e., symbolic in all loops
2319 // surrounding this block range.
2320 unsigned copyDepth = getNestingDepth(&*begin);
2321
2322 LDBG() << "Generating copies at depth " << copyDepth;
2323 LDBG() << "from begin: "
2324 << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
2325 LDBG() << "to inclusive end: "
2326 << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());
2327
2328 // List of memory regions to copy for. We need a map vector to have a
2329 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2330 // since the alloc's for example are identical except for the SSA id.
2331 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2332 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2333
2334 // Map from original memref's to the fast buffers that their accesses are
2335 // replaced with.
2336 DenseMap<Value, Value> fastBufferMap;
2337
2338 // To check for errors when walking the block.
2339 bool error = false;
2340
2341 // Walk this range of operations to gather all memory regions.
2342 block->walk(begin, end, [&](Operation *opInst) {
2343 Value memref;
2344 MemRefType memrefType;
2345 // Gather regions to allocate to buffers in faster memory space.
2346 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2347 memref = loadOp.getMemRef();
2348 memrefType = loadOp.getMemRefType();
2349 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2350 memref = storeOp.getMemRef();
2351 memrefType = storeOp.getMemRefType();
2352 }
2353 // Not an affine.load/store op.
2354 if (!memref)
2355 return;
2356
2357 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2358 (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2359 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2360 return;
2361
2362 if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2363 LDBG() << "memref definition is inside of the depth at "
2364 << "which copy-in/copy-out would happen";
2365 return;
2366 }
2367
2368 // Compute the MemRefRegion accessed.
2369 auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2370 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2371 /*addMemRefDimBounds=*/false))) {
2372 LDBG() << "Error obtaining memory region: semi-affine maps?";
2373 LDBG() << "over-approximating to the entire memref";
2374 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2375 LDBG() << "non-constant memref sizes not yet supported";
2376 error = true;
2377 return;
2378 }
2379 }
2380
2381 // Each memref has a single buffer associated with it irrespective of how
2382 // many load's and store's happen on it.
2383 // TODO: in the future, when regions don't intersect and satisfy
2384 // other properties (based on load/store regions), we could consider
2385 // multiple buffers per memref.
2386
2387 // Add to the appropriate region if it's not already in it, or take a
2388 // bounding box union with the existing one if it's already in there.
2389 // Note that a memref may have both read and write regions - so update the
2390 // region in the other list if one exists (write in case of read and vice
2391 // versa) since there is a single bounding box for a memref across all reads
2392 // and writes that happen on it.
2393
2394 // Attempts to update; returns true if 'region' exists in targetRegions.
2395 auto updateRegion =
2396 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2397 &targetRegions) {
2398 const auto *const it = targetRegions.find(region->memref);
2399 if (it == targetRegions.end())
2400 return false;
2401
2402 // Perform a union with the existing region.
2403 if (failed(it->second->unionBoundingBox(*region))) {
2404 LDBG() << "Memory region bounding box failed; "
2405 << "over-approximating to the entire memref";
2406 // If the union fails, we will overapproximate.
2407 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2408 LDBG() << "non-constant memref sizes not yet supported";
2409 error = true;
2410 return true;
2411 }
2412 it->second->getConstraints()->clearAndCopyFrom(
2413 *region->getConstraints());
2414 } else {
2415 // Union was computed and stored in 'it->second': copy to 'region'.
2416 region->getConstraints()->clearAndCopyFrom(
2417 *it->second->getConstraints());
2418 }
2419 return true;
2420 };
2421
2422 bool existsInRead = updateRegion(readRegions);
2423 if (error)
2424 return;
2425 bool existsInWrite = updateRegion(writeRegions);
2426 if (error)
2427 return;
2428
2429 // Finally add it to the region list.
2430 if (region->isWrite() && !existsInWrite) {
2431 writeRegions[region->memref] = std::move(region);
2432 } else if (!region->isWrite() && !existsInRead) {
2433 readRegions[region->memref] = std::move(region);
2434 }
2435 });
2436
2437 if (error) {
2438 LDBG() << "copy generation failed for one or more memref's in this block";
2439 return failure();
2440 }
2441
2442 uint64_t totalCopyBuffersSizeInBytes = 0;
2443 bool ret = true;
2444 auto processRegions =
2445 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2446 &regions) {
2447 for (const auto &regionEntry : regions) {
2448 // For each region, hoist copy in/out past all hoistable
2449 // 'affine.for's.
2450 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2451 Block *copyPlacementBlock;
2452 findHighestBlockForPlacement(
2453 *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2454 &copyInPlacementStart, &copyOutPlacementStart);
2455
2456 uint64_t sizeInBytes;
2457 Block::iterator nBegin, nEnd;
2458 LogicalResult iRet = generateCopy(
2459 *regionEntry.second, block, begin, end, copyPlacementBlock,
2460 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2461 fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2462 if (succeeded(iRet)) {
2463 // begin/end could have been invalidated, and need update.
2464 begin = nBegin;
2465 end = nEnd;
2466 totalCopyBuffersSizeInBytes += sizeInBytes;
2467 }
2468 ret = ret & succeeded(iRet);
2469 }
2470 };
2471 processRegions(readRegions);
2472 processRegions(writeRegions);
2473
2474 if (!ret) {
2475 LDBG() << "copy generation failed for one or more memref's in this block";
2476 return failure();
2477 }
2478
2479 // For a range of operations, a note will be emitted at the caller.
2480 AffineForOp forOp;
2481 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2482 LLVM_DEBUG(forOp.emitRemark()
2483 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2484 << " KiB of copy buffers in fast memory space for this block");
2485 }
2486
2487 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2488 block->getParentOp()->emitWarning(
2489 "total size of all copy buffers' for this block exceeds fast memory "
2490 "capacity");
2491 }
2492
2493 return success();
2494}
2495
2496// A convenience version of affineDataCopyGenerate for all ops in the body of
2497// an AffineForOp.
2498LogicalResult mlir::affine::affineDataCopyGenerate(
2499 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2500 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2501 return affineDataCopyGenerate(forOp.getBody()->begin(),
2502 std::prev(forOp.getBody()->end()), copyOptions,
2503 filterMemRef, copyNests);
2504}
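
// Minimal usage sketch (the loop, memory-space ids, and capacity are
// placeholders) for the AffineForOp convenience overload above:
//
//   AffineCopyOptions options;
//   options.generateDma = false;
//   options.slowMemorySpace = 0;
//   options.fastMemorySpace = 1;
//   options.tagMemorySpace = 0;
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   if (failed(affineDataCopyGenerate(tileLoop, options,
//                                     /*filterMemRef=*/std::nullopt, copyNests)))
//     return failure();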
2505
2506LogicalResult mlir::affine::generateCopyForMemRegion(
2507 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2508 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2509 Block *block = analyzedOp->getBlock();
2510 auto begin = analyzedOp->getIterator();
2511 auto end = std::next(begin);
2512 DenseMap<Value, Value> fastBufferMap;
2513 DenseSet<Operation *> copyNests;
2514
2515 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2516 copyOptions, fastBufferMap, copyNests,
2517 &result.sizeInBytes, &begin, &end);
2518 if (failed(err))
2519 return err;
2520
2521 const auto &en = fastBufferMap.find(memrefRegion.memref);
2522 // In some cases (empty loops), no copy generation would have happened.
2523 if (en == fastBufferMap.end())
2524 return failure();
2525 result.alloc = en->second.getDefiningOp();
2526 assert(result.alloc && "fast buffer expected to be locally allocated");
2527 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2528 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2529 return success();
2530}
2531
2532/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2533static void
2534gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2535 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2536 // Add a new empty level to the output if it doesn't already exist.
2537 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2538 if (currLoopDepth == depthToLoops.size())
2539 depthToLoops.emplace_back();
2540
2541 for (auto &op : *block) {
2542 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2543 depthToLoops[currLoopDepth].push_back(forOp);
2544 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2545 }
2546 }
2547}
2548
2549/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2550void mlir::affine::gatherLoops(
2551 func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2552 for (auto &block : func)
2553 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2554
2555 // Remove last loop level from output since it's empty.
2556 if (!depthToLoops.empty()) {
2557 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2558 depthToLoops.pop_back();
2559 }
2560}
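
// Usage sketch (funcOp is a placeholder): gather loops by depth and visit the
// innermost ones first.
//
//   std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
//   gatherLoops(funcOp, depthToLoops);
//   for (auto &loopsAtDepth : llvm::reverse(depthToLoops))
//     for (AffineForOp forOp : loopsAtDepth)
//       (void)forOp; // innermost loops are visited first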
2561
2562AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2563 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2564 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2565 SmallVector<Value, 4> lowerOperands(lbOperands);
2566 SmallVector<Value, 4> upperOperands(ubOperands);
2567
2568 fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2569 canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2570 lbMap = removeDuplicateExprs(lbMap);
2571 fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2572 canonicalizeMapAndOperands(&ubMap, &upperOperands);
2573 ubMap = removeDuplicateExprs(ubMap);
2574
2575 return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
2576 step);
2577}
2578
2579/// Creates an AffineIfOp that encodes the conditional to choose between
2580/// the constant trip count version and an unknown trip count version of this
2581/// nest of loops. This is used to separate partial and full tiles if `loops`
2582/// has the intra-tile loops. The affine.if op is inserted at the builder
2583/// insertion point of `b`.
2584static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2585 OpBuilder b) {
2586 if (loops.empty())
2587 return nullptr;
2588
2589 auto *context = loops[0].getContext();
2590
2591 FlatAffineValueConstraints cst;
2592 SmallVector<Operation *, 8> ops;
2593 llvm::append_range(ops, loops);
2594 (void)getIndexSet(ops, &cst);
2595
2596 // Remove constraints that are independent of these loop IVs.
2597 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2598
2599 // Construct the constraint set representing the guard for full tiles. The
2600 // lower bound (and upper bound) corresponding to the full tile should be
2601 // larger (and resp. smaller) than any other lower (or upper bound).
2602 SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2603 for (auto loop : loops) {
2604 (void)loop;
2605 // TODO: Non-unit stride is not an issue to generalize to.
2606 assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2607 // Mark everything symbols for the purpose of finding a constant diff pair.
2608 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2609 1);
2610 unsigned fullTileLbPos, fullTileUbPos;
2611 if (!((IntegerRelation)cst)
2612 .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2613 /*boundFloorDivisor=*/nullptr,
2614 /*ub=*/nullptr, &fullTileLbPos,
2615 &fullTileUbPos)) {
2616 LDBG() << "Can't get constant diff pair for a loop";
2617 return nullptr;
2618 }
2619
2620 SmallVector<unsigned, 4> lbIndices, ubIndices;
2621 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2622
2623 auto fLb = cst.getInequality(fullTileLbPos);
2624 auto fUb = cst.getInequality(fullTileUbPos);
2625 fullTileLb.assign(fLb.begin(), fLb.end());
2626 fullTileUb.assign(fUb.begin(), fUb.end());
2627
2628 // Full tile lower bound should be >= than any other lower bound.
2629 for (auto lbIndex : lbIndices)
2630 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2631 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2632
2633 // Full tile upper bound should be <= any other upper bound.
2634 for (auto ubIndex : ubIndices)
2635 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2636 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2637
2638 cst.removeVar(0);
2639 }
2640
2641 // The previous step leads to all zeros for the full tile lb and ub position
2642 // itself; remove those and any other duplicates / trivial redundancies.
2643 cst.removeTrivialRedundancy();
2644
2645 // Turn everything into dims conservatively since we earlier turned all
2646 // trailing ids past point loop IV into symbols. Some of these could be outer
2647 // loop IVs; we'll canonicalize anyway.
2648 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2649
2650 IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2651 // ifCondSet can be null if cst was empty -- this can happen if all loops
2652 // in the nest have constant trip counts.
2653 if (!ifCondSet)
2654 return nullptr;
2655
2656 SmallVector<Value, 4> setOperands;
2657 cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2658 canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2659 return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
2660 /*withElseRegion=*/true);
2661}
2662
2663/// Create the full tile loop nest (along with its body).
2664static LogicalResult
2665createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2666 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2667 fullTileLoops.reserve(inputNest.size());
2668
2669 // For each loop in the original nest identify a lower/upper bound pair such
2670 // that their difference is a constant.
2671 FlatAffineValueConstraints cst;
2672 for (auto loop : inputNest) {
2673 // TODO: straightforward to generalize to a non-unit stride.
2674 if (loop.getStepAsInt() != 1) {
2675 LDBG() << "[tile separation] non-unit stride not implemented";
2676 return failure();
2677 }
2678 SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2679 (void)getIndexSet(loopOp, &cst);
2680 // We will mark everything other than this loop IV as symbol for getting a
2681 // pair of <lb, ub> with a constant difference.
2682 cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2683 unsigned lbPos, ubPos;
2684 if (!((IntegerRelation)cst)
2685 .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2686 /*boundFloorDivisor=*/nullptr,
2687 /*ub=*/nullptr, &lbPos, &ubPos) ||
2688 lbPos == ubPos) {
2689 LDBG() << "[tile separation] Can't get constant diff / "
2690 << "equalities not yet handled";
2691 return failure();
2692 }
2693
2694 // Set all variables as dimensions uniformly since some of those marked as
2695 // symbols above could be outer loop IVs (corresponding tile space IVs).
2696 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2697
2698 AffineValueMap lbVmap, ubVmap;
2699 cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2700 cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2701 AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2702 b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2703 ubVmap.getOperands(), ubVmap.getAffineMap());
2704 b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2705 fullTileLoops.push_back(fullTileLoop);
2706 }
2707
2708 // Add the body for the full tile loop nest.
2709 IRMapping operandMap;
2710 for (const auto &loopEn : llvm::enumerate(inputNest))
2711 operandMap.map(loopEn.value().getInductionVar(),
2712 fullTileLoops[loopEn.index()].getInductionVar());
2713 b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2714 for (auto &op : inputNest.back().getBody()->without_terminator())
2715 b.clone(op, operandMap);
2716 return success();
2717}
2718
2719LogicalResult
2720mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2721 SmallVectorImpl<AffineForOp> *fullTileNest) {
2722 if (inputNest.empty())
2723 return success();
2724
2725 auto firstLoop = inputNest[0];
2726
2727 // Each successive for op has to be nested in the other.
2728 auto prevLoop = firstLoop;
2729 for (auto loop : inputNest.drop_front(1)) {
2730 assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2731 prevLoop = loop;
2732 }
2733
2734 // Create the full tile loop nest.
2735 SmallVector<AffineForOp, 4> fullTileLoops;
2736 OpBuilder b(firstLoop);
2737 if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2738 if (!fullTileLoops.empty())
2739 fullTileLoops.front().erase();
2740 return failure();
2741 }
2742
2743 // Create and insert the version select right before the root of the nest.
2744 b = OpBuilder(firstLoop);
2745 AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2746 if (!ifOp) {
2747 fullTileLoops.front().erase();
2748 LDBG() << "All tiles are full tiles, or failure creating "
2749 << "separation condition";
2750 return failure();
2751 }
2752
2753 // Move the full tile into the then block.
2754 Block *thenBlock = ifOp.getThenBlock();
2755 AffineForOp outermostFullTileLoop = fullTileLoops[0];
2756 thenBlock->getOperations().splice(
2757 std::prev(thenBlock->end()),
2758 outermostFullTileLoop->getBlock()->getOperations(),
2759 Block::iterator(outermostFullTileLoop));
2760
2761 // Move the partial tile into the else block. The partial tile is the same as
2762 // the original loop nest.
2763 Block *elseBlock = ifOp.getElseBlock();
2764 elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2765 firstLoop->getBlock()->getOperations(),
2766 Block::iterator(firstLoop));
2767
2768 if (fullTileNest)
2769 *fullTileNest = std::move(fullTileLoops);
2770
2771 return success();
2772}
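
// Rough shape of the result (condition set invented): the intra-tile nest is
// versioned so a constant-trip-count clone runs when the tile is full and
// the original min/max-bounded nest handles the boundary (partial) tiles:
//
//   affine.if #is_full_tile(%tile_ivs ...) {
//     // clone of the nest with constant trip counts
//   } else {
//     // original nest with min/max bounds
//   }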
2773
2774LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2775 LogicalResult result(failure());
2776 SmallVector<AffineForOp> loops;
2777 getPerfectlyNestedLoops(loops, op);
2778 if (loops.size() <= 1)
2779 return success();
2780
2781 // Look for a band of loops that can be coalesced, i.e. perfectly nested
2782 // loops with bounds defined above some loop.
2783 // 1. For each loop, find above which parent loop its operands are
2784 // defined.
2785 SmallVector<unsigned> operandsDefinedAbove(loops.size());
2786 for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2787 operandsDefinedAbove[i] = i;
2788 for (unsigned j = 0; j < i; ++j) {
2789 if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2790 operandsDefinedAbove[i] = j;
2791 break;
2792 }
2793 }
2794 }
2795
2796 // 2. Identify bands of loops such that the operands of all of them are
2797 // defined above the first loop in the band. Traverse the nest bottom-up
2798 // so that modifications don't invalidate the inner loops.
2799 for (unsigned end = loops.size(); end > 0; --end) {
2800 unsigned start = 0;
2801 for (; start < end - 1; ++start) {
2802 auto maxPos =
2803 *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2804 std::next(operandsDefinedAbove.begin(), end));
2805 if (maxPos > start)
2806 continue;
2807 assert(maxPos == start &&
2808 "expected loop bounds to be known at the start of the band");
2809 auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2810 if (succeeded(coalesceLoops(band)))
2811 result = success();
2812 break;
2813 }
2814 // If a band was found and transformed, keep looking at the loops above
2815 // the outermost transformed loop.
2816 if (start != end - 1)
2817 end = start + 1;
2818 }
2819 return result;
2820}
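// Editor's sketch (not part of the upstream file): coalescing every maximal
// perfectly nested affine band under a given root operation. Outermost loops
// are collected first so that the rewrites don't interfere with the walk.
static void exampleCoalesceAllAffineNests(Operation *root) {
  SmallVector<AffineForOp> outermost;
  root->walk([&](AffineForOp forOp) {
    if (!forOp->getParentOfType<AffineForOp>())
      outermost.push_back(forOp);
  });
  for (AffineForOp forOp : outermost)
    (void)coalescePerfectlyNestedAffineLoops(forOp);
}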
2821
2822 int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
2823 int64_t count = 0;
2824 Operation *currentOp = operand.getOwner();
2825 while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2826 if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2827 break;
2828 currentOp = loopOp;
2829 count++;
2830 }
2831 return count;
2832}
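// Editor's sketch (not part of the upstream file): counting, for each operand
// of an operation, the number of enclosing loops in which that operand is
// invariant, e.g. as a cheap profitability signal for hoisting decisions.
static void exampleLogInvariantLoopDepths(Operation *op) {
  for (OpOperand &operand : op->getOpOperands()) {
    int64_t depth = numEnclosingInvariantLoops(operand);
    LDBG() << "operand #" << operand.getOperandNumber() << " is invariant in "
           << depth << " enclosing loop(s)";
  }
}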