1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
24 #include "mlir/IR/IRMapping.h"
25 #include "mlir/IR/IntegerSet.h"
27 #include "llvm/ADT/MapVector.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/raw_ostream.h"
31 #include <optional>
32 
33 #define DEBUG_TYPE "loop-utils"
34 
35 using namespace mlir;
36 using namespace affine;
37 using namespace presburger;
38 using llvm::SmallMapVector;
39 
40 /// Computes the cleanup loop lower bound of the loop being unrolled with
41 /// the specified unroll factor; this bound will also be the upper bound of the main
42 /// part of the unrolled loop. Computes the bound as an AffineMap with its
43 /// operands or a null map when the trip count can't be expressed as an affine
44 /// expression.
45 static void
46 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
47  AffineMap &cleanupLbMap,
48  SmallVectorImpl<Value> &cleanupLbOperands) {
49  AffineMap tripCountMap;
50  SmallVector<Value, 4> tripCountOperands;
51  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
52  // Trip count can't be computed.
53  if (!tripCountMap) {
54  cleanupLbMap = AffineMap();
55  return;
56  }
57 
58  OpBuilder b(forOp);
59  auto lbMap = forOp.getLowerBoundMap();
60  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
61  forOp.getLowerBoundOperands());
62 
63  // For each upper bound expr, get the range.
64  // Eg: affine.for %i = lb to min (ub1, ub2),
65  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
66  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
67  // these affine.apply's make up the cleanup loop lower bound.
68  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
69  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
70  int64_t step = forOp.getStepAsInt();
71  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
72  auto tripCountExpr = tripCountMap.getResult(i);
73  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
74  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
75  tripCountMap.getNumSymbols(), bumpExprs[i]);
76  bumpValues[i] =
77  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
78  }
79 
80  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
81  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
82  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
83 
84  cleanupLbOperands.clear();
85  cleanupLbOperands.push_back(lb);
86  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
87  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
88  b.getContext());
89  // Simplify the cleanupLbMap + cleanupLbOperands.
90  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
92  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
93  // Remove any affine.apply's that became dead from the simplification above.
94  for (auto v : bumpValues)
95  if (v.use_empty())
96  v.getDefiningOp()->erase();
97 
98  if (lb.use_empty())
99  lb.erase();
100 }
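// Worked example (for illustration only): for `affine.for %i = 0 to 41`
// unrolled by a factor of 4, the trip count map yields 41, so the bump
// expression is (41 - 41 mod 4) * 1 = 40 and the cleanup lower bound becomes
// lb + 40 = 40. The unrolled main loop then covers [0, 40) and the cleanup
// loop covers [40, 41).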
101 
102 /// Helper to replace uses of loop carried values (iter_args) and loop
103 /// yield values while promoting single iteration affine.for ops.
104 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
105  // Replace uses of iter arguments with iter operands (initial values).
106  auto iterOperands = forOp.getInits();
107  auto iterArgs = forOp.getRegionIterArgs();
108  for (auto e : llvm::zip(iterOperands, iterArgs))
109  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
110 
111  // Replace uses of loop results with the values yielded by the loop.
112  auto outerResults = forOp.getResults();
113  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
114  for (auto e : llvm::zip(outerResults, innerResults))
115  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
116 }
117 
118 /// Promotes the loop body of a forOp to its containing block if the forOp
119 /// was known to have a single iteration.
120 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
121  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
122  if (!tripCount || *tripCount != 1)
123  return failure();
124 
125  // TODO: extend this for arbitrary affine bounds.
126  if (forOp.getLowerBoundMap().getNumResults() != 1)
127  return failure();
128 
129  // Replaces all IV uses to its single iteration value.
130  auto iv = forOp.getInductionVar();
131  auto *parentBlock = forOp->getBlock();
132  if (!iv.use_empty()) {
133  if (forOp.hasConstantLowerBound()) {
134  OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
135  auto constOp = topBuilder.create<arith::ConstantIndexOp>(
136  forOp.getLoc(), forOp.getConstantLowerBound());
137  iv.replaceAllUsesWith(constOp);
138  } else {
139  auto lbOperands = forOp.getLowerBoundOperands();
140  auto lbMap = forOp.getLowerBoundMap();
141  OpBuilder builder(forOp);
142  if (lbMap == builder.getDimIdentityMap()) {
143  // No need of generating an affine.apply.
144  iv.replaceAllUsesWith(lbOperands[0]);
145  } else {
146  auto affineApplyOp =
147  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
148  iv.replaceAllUsesWith(affineApplyOp);
149  }
150  }
151  }
152 
153  replaceIterArgsAndYieldResults(forOp);
154 
155  // Move the loop body operations, except for its terminator, to the loop's
156  // containing block.
157  forOp.getBody()->back().erase();
158  parentBlock->getOperations().splice(Block::iterator(forOp),
159  forOp.getBody()->getOperations());
160  forOp.erase();
161  return success();
162 }
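// For illustration: a single-iteration loop such as
//   affine.for %i = 5 to 6 {
//     "foo"(%i) : (index) -> ()
//   }
// is promoted by replacing %i with a top-level constant 5 (or with an
// affine.apply of the lower bound map for non-constant bounds) and splicing
// the body into the enclosing block.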
163 
164 /// Generates an affine.for op with the specified lower and upper bounds
165 /// while generating the right IV remappings to realize shifts for operations in
166 /// its body. The operations that go into the loop body are specified in
167 /// opGroupQueue starting from the specified offset, and in that order. The
168 /// first element of the pair specifies the shift applied to that group of
169 /// operations; the shift is multiplied by the loop step before being applied.
170 /// Returns nullptr if the generated loop simplifies to a single iteration one.
171 static AffineForOp generateShiftedLoop(
172  AffineMap lbMap, AffineMap ubMap,
173  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
174  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
175  auto lbOperands = srcForOp.getLowerBoundOperands();
176  auto ubOperands = srcForOp.getUpperBoundOperands();
177 
178  assert(lbMap.getNumInputs() == lbOperands.size());
179  assert(ubMap.getNumInputs() == ubOperands.size());
180 
181  auto loopChunk =
182  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
183  ubMap, srcForOp.getStepAsInt());
184  auto loopChunkIV = loopChunk.getInductionVar();
185  auto srcIV = srcForOp.getInductionVar();
186 
187  IRMapping operandMap;
188 
189  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
190  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
191  uint64_t shift = it.first;
192  auto ops = it.second;
193  // All 'same shift' operations get added with their operands being
194  // remapped to results of cloned operations, and their IV used remapped.
195  // Generate the remapping if the shift is not zero: remappedIV = newIV -
196  // shift.
197  if (!srcIV.use_empty() && shift != 0) {
198  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
199  srcForOp.getLoc(),
200  bodyBuilder.getSingleDimShiftAffineMap(
201  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
202  loopChunkIV);
203  operandMap.map(srcIV, ivRemap);
204  } else {
205  operandMap.map(srcIV, loopChunkIV);
206  }
207  for (auto *op : ops)
208  bodyBuilder.clone(*op, operandMap);
209  };
210  if (succeeded(promoteIfSingleIteration(loopChunk)))
211  return AffineForOp();
212  return loopChunk;
213 }
214 
215 // The skewing of operations with respect to one another can be used for
216 // example to allow overlap of asynchronous operations (such as DMA
217 // communication) with computation, or just relative shifting of operations
218 // for better register reuse, locality or parallelism. As such, the shifts are
219 // typically expected to be at most of the order of the number of operations.
220 // This method should not be used as a substitute for loop distribution/fission.
221 // This method uses an algorithm that runs in time linear in the number of
222 // operations in the body of the for loop (using the 'sweep line' paradigm).
223 // This method asserts preservation of SSA dominance. Checking that, as well as
224 // preservation of memory-based dependences, rests with the users of this
225 // method.
226 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
227  ArrayRef<uint64_t> shifts,
228  bool unrollPrologueEpilogue) {
229  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
230  "too few/many shifts");
231  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
232  return success();
233 
234  // If the trip counts aren't constant, we would need versioning and
235  // conditional guards (or context information to prevent such versioning). The
236  // better way to pipeline for such loops is to first tile them and extract
237  // constant trip count "full tiles" before applying this.
238  auto mayBeConstTripCount = getConstantTripCount(forOp);
239  if (!mayBeConstTripCount) {
240  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
241  return success();
242  }
243  uint64_t tripCount = *mayBeConstTripCount;
244 
245  assert(isOpwiseShiftValid(forOp, shifts) &&
246  "shifts will lead to an invalid transformation\n");
247 
248  int64_t step = forOp.getStepAsInt();
249 
250  unsigned numChildOps = shifts.size();
251 
252  // Do a linear time (counting) sort for the shifts.
253  uint64_t maxShift = *llvm::max_element(shifts);
254  if (maxShift >= numChildOps) {
255  // Large shifts are not the typical use case.
256  forOp.emitWarning("not shifting because shifts are unrealistically large");
257  return success();
258  }
259 
260  // An array of operation groups sorted by shift amount; each group has all
261  // operations with the same shift in the order in which they appear in the
262  // body of the 'affine.for' op.
263  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
264  unsigned pos = 0;
265  for (auto &op : forOp.getBody()->without_terminator()) {
266  auto shift = shifts[pos++];
267  sortedOpGroups[shift].push_back(&op);
268  }
269 
270  // Unless the shifts have a specific pattern (which actually would be the
271  // common use case), prologue and epilogue are not meaningfully defined.
272  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
273  // loop generated as the prologue and the last as epilogue and unroll these
274  // fully.
275  AffineForOp prologue, epilogue;
276 
277  // Do a sweep over the sorted shifts while storing open groups in a
278  // vector, and generating loop portions as necessary during the sweep. A block
279  // of operations is paired with its shift.
280  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
281 
282  auto origLbMap = forOp.getLowerBoundMap();
283  uint64_t lbShift = 0;
284  OpBuilder b(forOp);
285  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
286  // If nothing is shifted by d, continue.
287  if (sortedOpGroups[d].empty())
288  continue;
289  if (!opGroupQueue.empty()) {
290  assert(d > 0 &&
291  "Queue expected to be empty when the first block is found");
292  // The interval for which the loop needs to be generated here is:
293  // [lbShift, min(lbShift + tripCount, d)) and the body of the
294  // loop needs to have all operations in opQueue in that order.
295  AffineForOp res;
296  if (lbShift + tripCount * step < d * step) {
297  res = generateShiftedLoop(
298  b.getShiftedAffineMap(origLbMap, lbShift),
299  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
300  opGroupQueue, /*offset=*/0, forOp, b);
301  // Entire loop for the queued op groups generated, empty it.
302  opGroupQueue.clear();
303  lbShift += tripCount * step;
304  } else {
305  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
306  b.getShiftedAffineMap(origLbMap, d),
307  opGroupQueue, /*offset=*/0, forOp, b);
308  lbShift = d * step;
309  }
310 
311  if (res) {
312  // Simplify/canonicalize the affine.for.
313  RewritePatternSet patterns(res.getContext());
314  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
315  GreedyRewriteConfig config;
316  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
317  bool erased;
318  (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns),
319  config, /*changed=*/nullptr, &erased);
320  if (!erased && !prologue)
321  prologue = res;
322  if (!erased)
323  epilogue = res;
324  }
325  } else {
326  // Start of first interval.
327  lbShift = d * step;
328  }
329  // Augment the list of operations that get into the current open interval.
330  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
331  }
332 
333  // Those operations groups left in the queue now need to be processed (FIFO)
334  // and their loops completed.
335  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
336  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
337  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
338  b.getShiftedAffineMap(origLbMap, ubShift),
339  opGroupQueue, /*offset=*/i, forOp, b);
340  lbShift = ubShift;
341  if (!prologue)
342  prologue = epilogue;
343  }
344 
345  // Erase the original for op.
346  forOp.erase();
347 
348  if (unrollPrologueEpilogue && prologue)
349  (void)loopUnrollFull(prologue);
350  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
351  (void)loopUnrollFull(epilogue);
352 
353  return success();
354 }
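// For illustration, skewing the body of
//   affine.for %i = 0 to 10 { "a"(%i) ; "b"(%i) }
// with shifts {0, 1} peels "a" for iteration 0 (the [0, 1) chunk is promoted
// since it has a single iteration), creates a steady-state loop over [1, 10)
// in which "a" runs at %i and "b" runs for iteration %i - 1 (its IV is
// remapped by -shift * step), and finally runs "b" for iteration 9 in the
// promoted [10, 11) chunk.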
355 
356 /// Checks whether a loop nest is hyper-rectangular or not.
357 static LogicalResult
358 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
359  FlatAffineValueConstraints cst;
360  SmallVector<Operation *, 8> ops(input.begin(), input.end());
361  // 0-d or 1-d is trivially hyper-rectangular.
362  if (input.size() <= 1)
363  return success();
364  if (failed(getIndexSet(ops, &cst))) {
365  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
366  return failure();
367  }
368  if (!cst.isHyperRectangular(0, input.size())) {
369  LLVM_DEBUG(llvm::dbgs()
370  << "Non-hyperrectangular nests not supported for tiling!\n");
371  return failure();
372  }
373  return success();
374 }
375 
376 /// Check if the input nest is supported for tiling and whether tiling would be
377 /// legal or not.
378 template <typename t>
379 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
380  ArrayRef<t> tileSizes) {
381  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
382 
383  if (llvm::any_of(input,
384  [](AffineForOp op) { return op.getNumResults() > 0; })) {
385  LLVM_DEBUG(llvm::dbgs()
386  << "Cannot tile nest where a loop has yield values\n");
387  return failure();
388  }
389 
390  // Check if the supplied `for` ops are all successively nested.
391  if (!isPerfectlyNested(input)) {
392  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
393  return failure();
394  }
395 
396  // TODO: handle non hyper-rectangular spaces.
397  if (failed(checkIfHyperRectangular(input)))
398  return failure();
399 
400  return success();
401 }
402 
403 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
404 /// location in destination's body, ignoring the terminator.
405 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
406  Block::iterator loc) {
407  auto &ops = src.getBody()->getOperations();
408  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
409  std::prev(ops.end()));
410 }
411 
412 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
413 /// body.
414 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
415  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
416 }
417 
418 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
419 /// the body of the original loop nest to the tiled loop nest.
420 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
421  AffineForOp rootAffineForOp, unsigned width,
422  MutableArrayRef<AffineForOp> tiledLoops) {
423  Location loc = rootAffineForOp.getLoc();
424 
425  // The outermost among the loops as we add more..
426  Operation *topLoop = rootAffineForOp.getOperation();
427  AffineForOp innermostPointLoop;
428 
429  // Add intra-tile (or point) loops.
430  for (unsigned i = 0; i < width; i++) {
431  OpBuilder b(topLoop);
432  // Loop bounds will be set later.
433  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
434  pointLoop.getBody()->getOperations().splice(
435  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
436  topLoop);
437  tiledLoops[2 * width - 1 - i] = pointLoop;
438  topLoop = pointLoop.getOperation();
439  if (i == 0)
440  innermostPointLoop = pointLoop;
441  }
442 
443  // Add tile space loops;
444  for (unsigned i = width; i < 2 * width; i++) {
445  OpBuilder b(topLoop);
446  // Loop bounds will be set later.
447  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
448  tileSpaceLoop.getBody()->getOperations().splice(
449  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
450  topLoop);
451  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
452  topLoop = tileSpaceLoop.getOperation();
453  }
454 
455  // Move the loop body of the original nest to the new one.
456  moveLoopBody(origLoops.back(), innermostPointLoop);
457 }
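// For illustration, for a band of width 2 this builds the structure
//   affine.for %it0   (tile-space loop, tiledLoops[0])
//     affine.for %it1   (tile-space loop, tiledLoops[1])
//       affine.for %ip0   (intra-tile loop, tiledLoops[2])
//         affine.for %ip1   (intra-tile loop, tiledLoops[3])
//           <body of the original innermost loop>
// with all bounds still unset; they are filled in by the callers below.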
458 
459 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
460 // TODO: Handle non-constant lower bounds.
461 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
462  AffineForOp newInterTileLoop,
463  AffineForOp newIntraTileLoop,
464  Value tileSize) {
465  // The lower bound for the intra-tile loop is represented by an affine map
466  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
467  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
468  // - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where %i is the loop IV
469  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
470  // parameter, %origlb is the lower bound and %origLoopStep is the loop step of
471  // the corresponding inter-tile loop.
472 
473  assert(origLoop.hasConstantLowerBound() &&
474  "expected input loops to have constant lower bound.");
475 
476  // Get lower bound of original loop as an affine expression.
477  AffineExpr origLowerBoundExpr;
478  origLowerBoundExpr =
479  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
480 
481  // Add dim operands from original lower/upper bound.
482  SmallVector<Value, 4> lbOperands, ubOperands;
483  AffineBound lb = origLoop.getLowerBound();
484  AffineBound ub = origLoop.getUpperBound();
485  lbOperands.reserve(lb.getNumOperands() + 2);
486  ubOperands.reserve(ub.getNumOperands() + 2);
487  AffineMap origLbMap = lb.getMap();
488  AffineMap origUbMap = ub.getMap();
489  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
490  lbOperands.push_back(lb.getOperand(j));
491  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
492  ubOperands.push_back(ub.getOperand(j));
493 
494  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
495  // IV.
496  lbOperands.push_back(newInterTileLoop.getInductionVar());
497  ubOperands.push_back(newInterTileLoop.getInductionVar());
498 
499  // Get loop IV as an affine expression for lower/upper bound. Size of
500  // lb/ubOperands is guaranteed to be at least one.
501  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
502  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
503 
504  // Add symbol operands from original lower/upper bound.
505  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
506  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
507  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
508  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
509 
510  // Add a new symbol operand which is the tile size for this loop.
511  lbOperands.push_back(tileSize);
512  ubOperands.push_back(tileSize);
513 
514  SmallVector<AffineExpr, 4> lbBoundExprs;
515  SmallVector<AffineExpr, 4> ubBoundExprs;
516  lbBoundExprs.reserve(origLbMap.getNumResults());
517  ubBoundExprs.reserve(origUbMap.getNumResults());
518 
519  // Get tiling parameter as an affine expression for lb/ub.
520  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
521  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
522 
523  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
524  lbBoundExprs.push_back(
525  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
526  origLowerBoundExpr);
527 
528  // Get the origLoopStep as an affine expression.
529  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
530 
531  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
532  // (tilingParameter * origLoopStep) + origlb.
533  ubBoundExprs.push_back(
534  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
535  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
536 
537  ubBoundExprs.append(origUbMap.getResults().begin(),
538  origUbMap.getResults().end());
539 
540  AffineMap lbMap =
541  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
542  lbBoundExprs, b.getContext());
543  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
544 
545  AffineMap ubMap =
546  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
547  ubBoundExprs, b.getContext());
548  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
549 
550  // Original loop step must be preserved.
551  newIntraTileLoop.setStep(origLoop.getStepAsInt());
552 }
553 
554 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
555 // TODO: Handle non-constant lower bounds.
556 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
557  AffineForOp newLoop, Value tileSize) {
558  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
559 
560  // The lower bounds for inter-tile loops are the same as the corresponding
561  // lower bounds of the original loops.
562  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
563 
564  // The new upper bound map for inter-tile loops, assuming constant lower
565  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
566  // originalLowerBound), tiling parameter), where the tiling parameter is the
567  // respective tile size for that loop. For e.g. if the original ubmap was
568  // ()->(1024) with lower bound lb, the new map will be
569  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
570  // Therefore a new symbol operand is inserted in the map and the result
571  // expression is overwritten.
572 
573  assert(origLoop.hasConstantLowerBound() &&
574  "expected input loops to have constant lower bound.");
575 
576  // Get lower bound of original loop as an affine expression.
577  AffineExpr origLowerBoundExpr;
578  origLowerBoundExpr =
579  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
580 
581  // Add dim operands from original upper bound.
582  SmallVector<Value, 4> ubOperands;
583  AffineBound ub = origLoop.getUpperBound();
584  ubOperands.reserve(ub.getNumOperands() + 1);
585  AffineMap origUbMap = ub.getMap();
586  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
587  ubOperands.push_back(ub.getOperand(j));
588 
589  // Add symbol operands from original upper bound.
590  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
591  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
592 
593  // Add a new symbol operand which is the tile size for this loop.
594  ubOperands.push_back(tileSize);
595 
596  // Get tiling parameter as an affine expression.
597  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
598 
599  SmallVector<AffineExpr, 4> boundExprs;
600  boundExprs.reserve(origUbMap.getNumResults());
601  int64_t origUpperBound;
602  AffineExpr origUpperBoundExpr;
603 
604  // If upper bound for the original loop is constant, then the constant can
605  // be obtained as an affine expression straight away.
606  if (origLoop.hasConstantUpperBound()) {
607  origUpperBound = origLoop.getConstantUpperBound();
608 
609  // Get original constant upper bound as an affine expression.
610  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
611 
612  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
613  // originalLowerBound), tilingParameter).
614  boundExprs.push_back(
615  origLowerBoundExpr +
616  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
617  } else {
618  // If upper bound for the original loop is not constant then two cases
619  // are possible, although their handling is the same: 1.) The result of
620  // ubmap has only one result expression. For e.g.
621  // affine.for %i = 5 to %ub
622  //
623  // A symbol operand is added which represents the tiling parameter. The
624  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
625  // where 's0' is the original upper bound and 's1' is the tiling
626  // parameter. 2.) When ubMap has more than one result expression. For e.g.
627  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
628  // affine.for %i = 5 to min #map0()[%s0, %s1]
629  //
630  // A symbol operand is added which represents the tiling parameter. The
631  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
632  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
633 
634  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
635  // originalLowerBound), tilingParameter).
636  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
637  boundExprs.push_back(
638  origLowerBoundExpr +
639  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
640  }
641 
642  AffineMap ubMap =
643  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
644  boundExprs, b.getContext());
645  newLoop.setUpperBound(ubOperands, ubMap);
646 
647  // Original loop step must be preserved.
648  newLoop.setStep(origLoop.getStepAsInt());
649 }
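// For illustration, with an original loop `affine.for %i = 0 to 1024` and a
// tiling parameter %ts (an SSA value), the inter-tile loop above iterates over
//   [0, 0 + (1024 - 0) ceildiv %ts)   with step 1,
// and setIntraTileBoundsParametric above makes the intra-tile loop run over
//   [%it * %ts, %it * %ts + %ts)      with the original step,
// where %it is the inter-tile loop IV.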
650 
651 /// Constructs and sets new loop bounds after tiling for the case of
652 /// hyper-rectangular index sets, where the bounds of one dimension do not
653 /// depend on other dimensions and tiling parameters are captured from SSA
654 /// values. Bounds of each dimension can thus be treated independently,
655 /// and deriving the new bounds is much simpler and faster than for the case of
656 /// tiling arbitrary polyhedral shapes.
657 static void constructParametricallyTiledIndexSetHyperRect(
658  MutableArrayRef<AffineForOp> origLoops,
659  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
660  assert(!origLoops.empty() && "expected at least one loop in band");
661  assert(origLoops.size() == tileSizes.size() &&
662  "expected tiling parameter for each loop in band.");
663 
664  OpBuilder b(origLoops[0].getOperation());
665  unsigned width = origLoops.size();
666 
667  // Set bounds for tile space loops.
668  for (unsigned i = 0; i < width; ++i) {
669  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
670  }
671 
672  // Set bounds for intra-tile loops.
673  for (unsigned i = 0; i < width; ++i) {
674  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
675  newLoops[i + width], tileSizes[i]);
676  }
677 }
678 
679 /// Constructs and sets new loop bounds after tiling for the case of
680 /// hyper-rectangular index sets, where the bounds of one dimension do not
681 /// depend on other dimensions. Bounds of each dimension can thus be treated
682 /// independently, and deriving the new bounds is much simpler and faster
683 /// than for the case of tiling arbitrary polyhedral shapes.
684 static void
685 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
686  MutableArrayRef<AffineForOp> newLoops,
687  ArrayRef<unsigned> tileSizes) {
688  assert(!origLoops.empty());
689  assert(origLoops.size() == tileSizes.size());
690 
691  OpBuilder b(origLoops[0].getOperation());
692  unsigned width = origLoops.size();
693 
694  // Bounds for tile space loops.
695  for (unsigned i = 0; i < width; i++) {
696  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
697  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
698  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
699  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
700  // If the step size of original loop is x and tileSize is y then after
701  // tiling the tile space loops' step size becomes x*y.
702  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
703  }
704  // Bounds for intra-tile loops.
705  for (unsigned i = 0; i < width; i++) {
706  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
707  std::optional<uint64_t> mayBeConstantCount =
708  getConstantTripCount(origLoops[i]);
709  // The lower bound is just the tile-space loop.
710  AffineMap lbMap = b.getDimIdentityMap();
711  newLoops[width + i].setLowerBound(
712  /*operands=*/newLoops[i].getInductionVar(), lbMap);
713  // The step size of intra-tile loops is just the original loops' step size.
714  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
715 
716  // Set the upper bound.
717  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
718  // Trip count is less than the tile size: upper bound is lower bound +
719  // trip count * stepSize.
720  AffineMap ubMap = b.getSingleDimShiftAffineMap(
721  *mayBeConstantCount * origLoops[i].getStepAsInt());
722  newLoops[width + i].setUpperBound(
723  /*operands=*/newLoops[i].getInductionVar(), ubMap);
724  } else if (largestDiv % tileSizes[i] != 0) {
725  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
726  // Construct the upper bound map; the operands are the original operands
727  // with 'i' (tile-space loop) appended to it. The new upper bound map is
728  // the original one with an additional expression i + tileSize * stepSize
729  // appended.
730 
731  // Add dim operands from original upper bound.
732  SmallVector<Value, 4> ubOperands;
733  AffineBound ub = origLoops[i].getUpperBound();
734  ubOperands.reserve(ub.getNumOperands() + 1);
735  AffineMap origUbMap = ub.getMap();
736  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
737  ubOperands.push_back(ub.getOperand(j));
738 
739  // Add dim operand for new loop upper bound.
740  ubOperands.push_back(newLoops[i].getInductionVar());
741 
742  // Add symbol operands from original upper bound.
743  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
744  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
745 
746  SmallVector<AffineExpr, 4> boundExprs;
747  boundExprs.reserve(1 + origUbMap.getNumResults());
748  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
749  // The new upper bound map is the original one with an additional
750  // expression i + tileSize * stepSize (of original loop) appended.
751  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
752  boundExprs.append(origUbMap.getResults().begin(),
753  origUbMap.getResults().end());
754  AffineMap ubMap =
755  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
756  boundExprs, b.getContext());
757  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
758  } else {
759  // No need of the min expression.
760  AffineExpr dim = b.getAffineDimExpr(0);
761  AffineMap ubMap = AffineMap::get(
762  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
763  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
764  }
765  }
766 }
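// For illustration, tiling `affine.for %i = 0 to 256 step 2` by a tile size of
// 32 gives a tile-space loop from 0 to 256 with step 32 * 2 = 64, and an
// intra-tile loop from the tile-space IV %it to %it + 32 * 2 with step 2; no
// min expression is needed here since the constant trip count (128) is a
// multiple of the tile size.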
767 
768 LogicalResult
769 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
770  ArrayRef<unsigned> tileSizes,
771  SmallVectorImpl<AffineForOp> *tiledNest) {
772  if (input.empty())
773  return success();
774 
775  if (failed(performPreTilingChecks(input, tileSizes)))
776  return failure();
777 
778  MutableArrayRef<AffineForOp> origLoops = input;
779  AffineForOp rootAffineForOp = origLoops[0];
780 
781  // Note that width is at least one since the band isn't empty.
782  unsigned width = input.size();
783  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
784 
785  // Construct a tiled loop nest without setting their bounds. Bounds are
786  // set later.
787  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
788 
789  SmallVector<Value, 8> origLoopIVs;
790  extractForInductionVars(input, &origLoopIVs);
791 
792  // Set loop bounds for the tiled loop nest.
793  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
794 
795  // Replace original IVs with intra-tile loop IVs.
796  for (unsigned i = 0; i < width; i++)
797  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
798 
799  // Erase the old loop nest.
800  rootAffineForOp.erase();
801 
802  if (tiledNest)
803  *tiledNest = std::move(tiledLoops);
804 
805  return success();
806 }
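// A minimal usage sketch (hypothetical caller; `func` and the variable names
// are not from this file): tile every outermost perfect band by 32 in each
// dimension.
//
//   std::vector<SmallVector<AffineForOp, 6>> bands;
//   getTileableBands(func, &bands);
//   for (auto &band : bands) {
//     SmallVector<unsigned, 4> tileSizes(band.size(), 32);
//     SmallVector<AffineForOp, 6> tiledNest;
//     if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
//       continue;
//   }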
807 
808 /// Tiles the specified band of perfectly nested loops creating tile-space
809 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
810 /// is a contiguous set of loops.
811 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
812  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
813  SmallVectorImpl<AffineForOp> *tiledNest) {
814  if (input.empty())
815  return success();
816 
817  if (failed(performPreTilingChecks(input, tileSizes)))
818  return failure();
819 
820  MutableArrayRef<AffineForOp> origLoops = input;
821  AffineForOp rootAffineForOp = origLoops[0];
822  unsigned width = input.size();
823  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
824 
825  // Construct a tiled loop nest without setting their bounds. Bounds are
826  // set later.
827  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
828 
829  SmallVector<Value, 8> origLoopIVs;
830  extractForInductionVars(input, &origLoopIVs);
831 
832  // Set loop bounds for the tiled loop nest.
833  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
834  tileSizes);
835 
836  // Replace original IVs with intra-tile loop IVs.
837  for (unsigned i = 0; i < width; i++)
838  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
839 
840  // Erase the old loop nest.
841  rootAffineForOp.erase();
842 
843  if (tiledNest)
844  *tiledNest = std::move(tiledLoops);
845 
846  return success();
847 }
848 
849 /// Get the perfectly nested sequence of loops starting at the root of the
850 /// loop nest (inclusive). A loop is perfectly nested iff the first op in
851 /// the loop's body is another AffineForOp and the second op is the
852 /// terminator.
853 void mlir::affine::getPerfectlyNestedLoops(
854  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
855  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
856  nestedLoops.push_back(root);
857  Block &body = root.getRegion().front();
858  if (body.begin() != std::prev(body.end(), 2))
859  return;
860 
861  root = dyn_cast<AffineForOp>(&body.front());
862  if (!root)
863  return;
864  }
865 }
866 
867 /// Identify valid and profitable bands of loops to tile. This is currently just
868 /// a temporary placeholder to test the mechanics of tiled code generation.
869 /// Returns all maximal outermost perfect loop nests to tile.
870 void mlir::affine::getTileableBands(
871  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
872  // Get maximal perfect nest of 'affine.for' insts starting from root
873  // (inclusive).
874  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
875  SmallVector<AffineForOp, 6> band;
876  getPerfectlyNestedLoops(band, forOp);
877  bands->push_back(band);
878  }
879 }
880 
881 /// Unrolls this loop completely.
882 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
883  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
884  if (mayBeConstantTripCount.has_value()) {
885  uint64_t tripCount = *mayBeConstantTripCount;
886  if (tripCount == 0)
887  return success();
888  if (tripCount == 1)
889  return promoteIfSingleIteration(forOp);
890  return loopUnrollByFactor(forOp, tripCount);
891  }
892  return failure();
893 }
894 
895 /// Unrolls this loop by the specified factor or by the trip count (if
896 /// constant), whichever is lower.
897 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
898  uint64_t unrollFactor) {
899  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
900  if (mayBeConstantTripCount.has_value() &&
901  *mayBeConstantTripCount < unrollFactor)
902  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
903  return loopUnrollByFactor(forOp, unrollFactor);
904 }
905 
906 /// Generates unrolled copies of the contents of 'loopBodyBlock', with the
907 /// associated induction variable 'forOpIV', by 'unrollFactor', calling
908 /// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
909 /// annotates the Ops in each unrolled iteration using annotateFn.
910 static void generateUnrolledLoop(
911  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
912  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
913  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
914  ValueRange iterArgs, ValueRange yieldedValues) {
915  // Builder to insert unrolled bodies just before the terminator of the body of
916  // 'forOp'.
917  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
918 
919  if (!annotateFn)
920  annotateFn = [](unsigned, Operation *, OpBuilder) {};
921 
922  // Keep a pointer to the last non-terminator operation in the original block
923  // so that we know what to clone (since we are doing this in-place).
924  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
925 
926  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
927  SmallVector<Value, 4> lastYielded(yieldedValues);
928 
929  for (unsigned i = 1; i < unrollFactor; i++) {
930  IRMapping operandMap;
931 
932  // Prepare operand map.
933  operandMap.map(iterArgs, lastYielded);
934 
935  // If the induction variable is used, create a remapping to the value for
936  // this unrolled instance.
937  if (!forOpIV.use_empty()) {
938  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
939  operandMap.map(forOpIV, ivUnroll);
940  }
941 
942  // Clone the original body of 'forOp'.
943  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
944  Operation *clonedOp = builder.clone(*it, operandMap);
945  annotateFn(i, clonedOp, builder);
946  }
947 
948  // Update yielded values. If the yielded value is defined outside the
949  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
950  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
951  // value with the clone corresponding to the yielded value.
952  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
953  Operation *defOp = yieldedValues[i].getDefiningOp();
954  if (defOp && defOp->getBlock() == loopBodyBlock)
955  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
956  }
957  }
958 
959  // Make sure we annotate the Ops in the original body. We do this last so that
960  // any annotations are not copied into the cloned Ops above.
961  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
962  annotateFn(0, &*it, builder);
963 
964  // Update operands of the yield statement.
965  loopBodyBlock->getTerminator()->setOperands(lastYielded);
966 }
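// For illustration, with unrollFactor = 2 a block
//   %v = "op"(%iv) ; affine.yield %v
// becomes
//   %v  = "op"(%iv)
//   %iv1 = <result of ivRemapFn(1, %iv, builder)>   // e.g. %iv + step
//   %v1 = "op"(%iv1)
//   affine.yield %v1
// i.e. one extra copy of the body with the IV and iter_args remapped, and the
// yield updated to the values produced by the last copy.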
967 
968 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
969 /// count is not a multiple of `unrollFactor`.
970 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
971  uint64_t unrollFactor) {
972  // Insert the cleanup loop right after 'forOp'.
973  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
974  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
975 
976  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
977  // and produce results for the original users of `forOp` results.
978  auto results = forOp.getResults();
979  auto cleanupResults = cleanupForOp.getResults();
980  auto cleanupIterOperands = cleanupForOp.getInits();
981 
982  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
983  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
984  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
985  }
986 
987  AffineMap cleanupMap;
988  SmallVector<Value, 4> cleanupOperands;
989  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
990  if (!cleanupMap)
991  return failure();
992 
993  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
994  // Promote the loop body up if this has turned into a single iteration loop.
995  (void)promoteIfSingleIteration(cleanupForOp);
996 
997  // Adjust upper bound of the original loop; this is the same as the lower
998  // bound of the cleanup loop.
999  forOp.setUpperBound(cleanupOperands, cleanupMap);
1000  return success();
1001 }
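// For illustration: when unrolling `affine.for %i = 0 to 10` by 4, the cleanup
// lower bound is 0 + (10 - 10 mod 4) = 8, so the original loop's upper bound
// is reset to 8 (letting the caller unroll it evenly) while the cloned cleanup
// loop covers [8, 10) and is promoted if it collapses to a single iteration.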
1002 
1003 /// Unrolls this loop by the specified factor. Returns success if the loop
1004 /// is successfully unrolled.
1005 LogicalResult mlir::affine::loopUnrollByFactor(
1006  AffineForOp forOp, uint64_t unrollFactor,
1007  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1008  bool cleanUpUnroll) {
1009  assert(unrollFactor > 0 && "unroll factor should be positive");
1010 
1011  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1012  if (unrollFactor == 1) {
1013  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1014  failed(promoteIfSingleIteration(forOp)))
1015  return failure();
1016  return success();
1017  }
1018 
1019  // Nothing in the loop body other than the terminator.
1020  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1021  return success();
1022 
1023  // If the trip count is lower than the unroll factor, no unrolled body.
1024  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1025  if (cleanUpUnroll) {
1026  // Unroll the cleanup loop if cleanUpUnroll is specified.
1027  return loopUnrollFull(forOp);
1028  }
1029 
1030  return failure();
1031  }
1032 
1033  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1034  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1035  // Loops where the lower bound is a max expression or the upper bound is
1036  // a min expression and the trip count doesn't divide the unroll factor
1037  // can't be unrolled since the lower bound of the cleanup loop in such cases
1038  // cannot be expressed as an affine function or a max over affine functions.
1039  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1040  forOp.getUpperBoundMap().getNumResults() != 1)
1041  return failure();
1042  if (cleanUpUnroll)
1043  // Force unroll including cleanup loop
1044  return loopUnrollFull(forOp);
1045  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1046  assert(false && "cleanup loop lower bound map for single result lower "
1047  "and upper bound maps can always be determined");
1048  }
1049 
1050  ValueRange iterArgs(forOp.getRegionIterArgs());
1051  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1052 
1053  // Scale the step of loop being unrolled by unroll factor.
1054  int64_t step = forOp.getStepAsInt();
1055  forOp.setStep(step * unrollFactor);
1056  generateUnrolledLoop(
1057  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1058  [&](unsigned i, Value iv, OpBuilder b) {
1059  // iv' = iv + i * step
1060  auto d0 = b.getAffineDimExpr(0);
1061  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1062  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1063  },
1064  /*annotateFn=*/annotateFn,
1065  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1066 
1067  // Promote the loop body up if this has turned into a single iteration loop.
1068  (void)promoteIfSingleIteration(forOp);
1069  return success();
1070 }
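// For illustration, unrolling by a factor of 2 rewrites
//   affine.for %i = 0 to 8 { "op"(%i) }
// into
//   affine.for %i = 0 to 8 step 2 {
//     "op"(%i)
//     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//     "op"(%i1)
//   }
// i.e. the step is scaled by the factor and each extra copy applies the bump
// map d0 + i * step to the induction variable.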
1071 
1072 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1073  uint64_t unrollJamFactor) {
1074  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1075  if (mayBeConstantTripCount.has_value() &&
1076  *mayBeConstantTripCount < unrollJamFactor)
1077  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1078  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1079 }
1080 
1081 /// Check if all control operands of all loops are defined outside of `forOp`
1082 /// and return false if not.
1083 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1084  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1085  for (auto controlOperand : aForOp.getControlOperands()) {
1086  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1087  return WalkResult::interrupt();
1088  }
1089  return WalkResult::advance();
1090  });
1091  return !walkResult.wasInterrupted();
1092 }
1093 
1094 /// Unrolls and jams this loop by the specified factor.
1095 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1096  uint64_t unrollJamFactor) {
1097  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1098 
1099  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1100  if (unrollJamFactor == 1) {
1101  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1102  failed(promoteIfSingleIteration(forOp)))
1103  return failure();
1104  return success();
1105  }
1106 
1107  // Nothing in the loop body other than the terminator.
1108  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1109  return success();
1110 
1111  // If the trip count is lower than the unroll jam factor, no unroll jam.
1112  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1113  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1114  return failure();
1115  }
1116 
1117  // If any control operand of any inner loop of `forOp` is defined within
1118  // `forOp`, no unroll jam.
1119  if (!areInnerBoundsInvariant(forOp))
1120  return failure();
1121 
1122  // Gather all sub-blocks to jam upon the loop being unrolled.
1123  JamBlockGatherer jbg;
1124  jbg.walk(forOp);
1125  auto &subBlocks = jbg.subBlocks;
1126 
1127  // Collect loops with iter_args.
1128  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1129  forOp.walk([&](AffineForOp aForOp) {
1130  if (aForOp.getNumIterOperands() > 0)
1131  loopsWithIterArgs.push_back(aForOp);
1132  });
1133 
1134  // Get supported reductions to be used for creating reduction ops at the end.
1135  SmallVector<LoopReduction> reductions;
1136  if (forOp.getNumIterOperands() > 0)
1137  getSupportedReductions(forOp, reductions);
1138 
1139  // Generate the cleanup loop if trip count isn't a multiple of
1140  // unrollJamFactor.
1141  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1142  // Loops where the lower bound is a max expression or the upper bound is
1143  // a min expression and the trip count doesn't divide the unroll factor
1144  // can't be unrolled since the lower bound of the cleanup loop in such cases
1145  // cannot be expressed as an affine function or a max over affine functions.
1146  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1147  forOp.getUpperBoundMap().getNumResults() != 1)
1148  return failure();
1149  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1150  assert(false && "cleanup loop lower bound map for single result lower "
1151  "and upper bound maps can always be determined");
1152  }
1153 
1154  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1155  // iteration. There are (`unrollJamFactor` - 1) iterations.
1156  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1157 
1158  // For any loop with iter_args, replace it with a new loop that has
1159  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1160  // operands.
1161  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1162  IRRewriter rewriter(forOp.getContext());
1163  for (AffineForOp oldForOp : loopsWithIterArgs) {
1164  SmallVector<Value> dupIterOperands, dupYieldOperands;
1165  ValueRange oldIterOperands = oldForOp.getInits();
1166  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1167  ValueRange oldYieldOperands =
1168  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1169  // Get additional iterOperands, iterArgs, and yield operands. We will
1170  // fix iterOperands and yield operands after cloning of sub-blocks.
1171  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1172  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1173  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1174  }
1175  // Create a new loop with additional iterOperands, iter_args and yield
1176  // operands. This new loop will take the loop body of the original loop.
1177  bool forOpReplaced = oldForOp == forOp;
1178  AffineForOp newForOp =
1179  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1180  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1181  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1182  return dupYieldOperands;
1183  }));
1184  newLoopsWithIterArgs.push_back(newForOp);
1185  // `forOp` has been replaced with a new loop.
1186  if (forOpReplaced)
1187  forOp = newForOp;
1188  // Update `operandMaps` for `newForOp` iterArgs and results.
1189  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1190  unsigned oldNumIterArgs = oldIterArgs.size();
1191  ValueRange newResults = newForOp.getResults();
1192  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1193  assert(oldNumIterArgs == oldNumResults &&
1194  "oldNumIterArgs must be the same as oldNumResults");
1195  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1196  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1197  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1198  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1199  // to those in the `i`th new set.
1200  operandMaps[i - 1].map(newIterArgs[j],
1201  newIterArgs[i * oldNumIterArgs + j]);
1202  operandMaps[i - 1].map(newResults[j],
1203  newResults[i * oldNumResults + j]);
1204  }
1205  }
1206  }
1207 
1208  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1209  int64_t step = forOp.getStepAsInt();
1210  forOp.setStep(step * unrollJamFactor);
1211 
1212  auto forOpIV = forOp.getInductionVar();
1213  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1214  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1215  for (auto &subBlock : subBlocks) {
1216  // Builder to insert unroll-jammed bodies. Insert right at the end of
1217  // sub-block.
1218  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1219 
1220  // If the induction variable is used, create a remapping to the value for
1221  // this unrolled instance.
1222  if (!forOpIV.use_empty()) {
1223  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1224  auto d0 = builder.getAffineDimExpr(0);
1225  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1226  auto ivUnroll =
1227  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1228  operandMaps[i - 1].map(forOpIV, ivUnroll);
1229  }
1230  // Clone the sub-block being unroll-jammed.
1231  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1232  builder.clone(*it, operandMaps[i - 1]);
1233  }
1234  // Fix iterOperands and yield op operands of newly created loops.
1235  for (auto newForOp : newLoopsWithIterArgs) {
1236  unsigned oldNumIterOperands =
1237  newForOp.getNumIterOperands() / unrollJamFactor;
1238  unsigned numControlOperands = newForOp.getNumControlOperands();
1239  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1240  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1241  assert(oldNumIterOperands == oldNumYieldOperands &&
1242  "oldNumIterOperands must be the same as oldNumYieldOperands");
1243  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1244  // The `i`th duplication of an old iterOperand or yield op operand
1245  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1246  // if such mapped value exists.
1247  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1248  operandMaps[i - 1].lookupOrDefault(
1249  newForOp.getOperand(numControlOperands + j)));
1250  yieldOp.setOperand(
1251  i * oldNumYieldOperands + j,
1252  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1253  }
1254  }
1255  }
1256  if (forOp.getNumResults() > 0) {
1257  // Create reduction ops to combine every `unrollJamFactor` related results
1258  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1259  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1260  // %1.
1261  rewriter.setInsertionPointAfter(forOp);
1262  auto loc = forOp.getLoc();
1263  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1264  for (LoopReduction &reduction : reductions) {
1265  unsigned pos = reduction.iterArgPosition;
1266  Value lhs = forOp.getResult(pos);
1267  Value rhs;
1268  SmallPtrSet<Operation *, 4> newOps;
1269  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1270  rhs = forOp.getResult(i * oldNumResults + pos);
1271  // Create ops based on reduction type.
1272  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1273  if (!lhs)
1274  return failure();
1275  Operation *op = lhs.getDefiningOp();
1276  assert(op && "Reduction op should have been created");
1277  newOps.insert(op);
1278  }
1279  // Replace all uses except those in newly created reduction ops.
1280  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1281  }
1282  }
1283 
1284  // Promote the loop body up if this has turned into a single iteration loop.
1285  (void)promoteIfSingleIteration(forOp);
1286  return success();
1287 }
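// For illustration, unroll-and-jam by 2 of
//   affine.for %i = 0 to 8 { affine.for %j = 0 to 8 { "op"(%i, %j) } }
// scales the outer step to 2 and duplicates the inner loop's sub-block within
// that same inner loop ("jamming"):
//   affine.for %i = 0 to 8 step 2 {
//     affine.for %j = 0 to 8 {
//       "op"(%i, %j)
//       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//       "op"(%i1, %j)
//     }
//   }
// Results carried through iter_args are duplicated as well and combined after
// the loop with reduction ops (e.g. arith.addf), as described above.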
1288 
1289 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1290 /// nested within 'forOpA' as the only non-terminator operation in its block.
1291 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1292  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1293  auto &forOpABody = forOpA.getBody()->getOperations();
1294  auto &forOpBBody = forOpB.getBody()->getOperations();
1295 
1296  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1297  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1298  // body containing only the terminator.
1299  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1300  forOpABody, forOpABody.begin(),
1301  std::prev(forOpABody.end()));
1302  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1303  // body (this leaves forOpB's body containing only the terminator).
1304  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1305  std::prev(forOpBBody.end()));
1306  // 3) Splice forOpA into the beginning of forOpB's body.
1307  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1308  Block::iterator(forOpA));
1309 }
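// For illustration, interchanging the pair
//   affine.for %i ... { affine.for %j ... { "op"(%i, %j) } }
// yields
//   affine.for %j ... { affine.for %i ... { "op"(%i, %j) } }
// using only the three splices above; no operations are cloned or erased.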
1310 
1311 // Checks each dependence component against the permutation to see if the
1312 // desired loop interchange would violate dependences by making the
1313 // dependence component lexicographically negative.
1314 static bool checkLoopInterchangeDependences(
1315  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1316  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1317  // Invert permutation map.
1318  unsigned maxLoopDepth = loops.size();
1319  SmallVector<unsigned, 4> loopPermMapInv;
1320  loopPermMapInv.resize(maxLoopDepth);
1321  for (unsigned i = 0; i < maxLoopDepth; ++i)
1322  loopPermMapInv[loopPermMap[i]] = i;
1323 
1324  // Check each dependence component against the permutation to see if the
1325  // desired loop interchange permutation would make the dependence vectors
1326  // lexicographically negative.
1327  // Example 1: [-1, 1][0, 0]
1328  // Example 2: [0, 0][-1, 1]
1329  for (const auto &depComps : depCompsVec) {
1330  assert(depComps.size() >= maxLoopDepth);
1331  // Check if the first non-zero dependence component is positive.
1332  // This iterates through loops in the desired order.
1333  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1334  unsigned permIndex = loopPermMapInv[j];
1335  assert(depComps[permIndex].lb);
1336  int64_t depCompLb = *depComps[permIndex].lb;
1337  if (depCompLb > 0)
1338  break;
1339  if (depCompLb < 0)
1340  return false;
1341  }
1342  }
1343  return true;
1344 }
1345 
1346 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1347 /// nested sequence of loops in 'loops' would violate dependences.
1348 bool mlir::affine::isValidLoopInterchangePermutation(
1349  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1350  // Gather dependence components for dependences between all ops in loop nest
1351  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1352  assert(loopPermMap.size() == loops.size());
1353  unsigned maxLoopDepth = loops.size();
1354  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1355  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1356  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1357 }
1358 
1359 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1360 /// in it from outermost to innermost.
1361 bool LLVM_ATTRIBUTE_UNUSED
1362 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1363  assert(!loops.empty() && "no loops provided");
1364 
1365  // We already know that the block can't be empty.
1366  auto hasTwoElements = [](Block *block) {
1367  auto secondOpIt = std::next(block->begin());
1368  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1369  };
1370 
1371  auto enclosingLoop = loops.front();
1372  for (auto loop : loops.drop_front()) {
1373  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1374  // parentForOp's body should be just this loop and the terminator.
1375  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1376  return false;
1377  enclosingLoop = loop;
1378  }
1379  return true;
1380 }
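// For example (hypothetical IR), {%i, %j} below is perfectly nested, whereas
// {%k, %l} is not, since %k's body holds an extra op besides the %l loop:
//
// ```mlir
// affine.for %i = 0 to 10 {
//   affine.for %j = 0 to 10 {
//     "test.foo"() : () -> ()
//   }
// }
// affine.for %k = 0 to 10 {
//   "test.bar"() : () -> ()
//   affine.for %l = 0 to 10 {
//     "test.foo"() : () -> ()
//   }
// }
// ```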
1381 
1382 // input[i] should move from position i -> permMap[i]. Returns the position in
1383 // `input` that becomes the new outermost loop.
1384 unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1385  ArrayRef<unsigned> permMap) {
1386  assert(input.size() == permMap.size() && "invalid permutation map size");
1387  // Check whether the permutation spec is valid. This is a small vector - we'll
1388  // just sort and check if it's iota.
1389  SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1390  llvm::sort(checkPermMap);
1391  if (llvm::any_of(llvm::enumerate(checkPermMap),
1392  [](const auto &en) { return en.value() != en.index(); }))
1393  assert(false && "invalid permutation map");
1394 
1395  // Nothing to do.
1396  if (input.size() < 2)
1397  return 0;
1398 
1399  assert(isPerfectlyNested(input) && "input not perfectly nested");
1400 
1401  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1402  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1403  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1404  for (unsigned i = 0, e = input.size(); i < e; ++i)
1405  invPermMap.push_back({permMap[i], i});
1406  llvm::sort(invPermMap);
1407 
1408  // Move the innermost loop body to the loop that would be the innermost in the
1409  // permuted nest (only if the innermost loop is going to change).
1410  if (permMap.back() != input.size() - 1) {
1411  auto *destBody = input[invPermMap.back().second].getBody();
1412  auto *srcBody = input.back().getBody();
1413  destBody->getOperations().splice(destBody->begin(),
1414  srcBody->getOperations(), srcBody->begin(),
1415  std::prev(srcBody->end()));
1416  }
1417 
1418  // We'll move each loop in `input` in the reverse order so that its body is
1419  // empty when we are moving it; this incurs zero copies and no erasing.
1420  for (int i = input.size() - 1; i >= 0; --i) {
1421  // If this has to become the outermost loop after permutation, add it to the
1422  // parent block of the original root.
1423  if (permMap[i] == 0) {
1424  // If the root remains the same, nothing to do.
1425  if (i == 0)
1426  continue;
1427  // Make input[i] the new outermost loop moving it into parentBlock.
1428  auto *parentBlock = input[0]->getBlock();
1429  parentBlock->getOperations().splice(Block::iterator(input[0]),
1430  input[i]->getBlock()->getOperations(),
1431  Block::iterator(input[i]));
1432  continue;
1433  }
1434 
1435  // If the parent in the permuted order is the same as in the original,
1436  // nothing to do.
1437  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1438  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1439  continue;
1440 
1441  // Move input[i] to its surrounding loop in the transformed nest.
1442  auto *destBody = input[parentPosInInput].getBody();
1443  destBody->getOperations().splice(destBody->begin(),
1444  input[i]->getBlock()->getOperations(),
1445  Block::iterator(input[i]));
1446  }
1447 
1448  return invPermMap[0].second;
1449 }
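// For instance (hypothetical nest), with input = {%i, %j, %k} and
// permMap = {2, 0, 1}, %i sinks to the innermost position, %j becomes the
// new outermost loop, and the function returns 1 (the index of %j in
// `input`):
//
// ```mlir
// // Before:
// affine.for %i = 0 to 8 {
//   affine.for %j = 0 to 16 {
//     affine.for %k = 0 to 32 {
//       "test.use"(%i, %j, %k) : (index, index, index) -> ()
//     }
//   }
// }
// // After permuteLoops(input, {2, 0, 1}):
// affine.for %j = 0 to 16 {
//   affine.for %k = 0 to 32 {
//     affine.for %i = 0 to 8 {
//       "test.use"(%i, %j, %k) : (index, index, index) -> ()
//     }
//   }
// }
// ```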
1450 
1451 // Sinks all sequential loops to the innermost levels (while preserving
1452 // relative order among them) and moves all parallel loops to the
1453 // outermost (while again preserving relative order among them).
1454 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1455  SmallVector<AffineForOp, 4> loops;
1456  getPerfectlyNestedLoops(loops, forOp);
1457  if (loops.size() < 2)
1458  return forOp;
1459 
1460  // Gather dependence components for dependences between all ops in loop nest
1461  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1462  unsigned maxLoopDepth = loops.size();
1463  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1464  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1465 
1466  // Mark loops as either parallel or sequential.
1467  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1468  for (auto &depComps : depCompsVec) {
1469  assert(depComps.size() >= maxLoopDepth);
1470  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1471  DependenceComponent &depComp = depComps[j];
1472  assert(depComp.lb.has_value() && depComp.ub.has_value());
1473  if (*depComp.lb != 0 || *depComp.ub != 0)
1474  isParallelLoop[j] = false;
1475  }
1476  }
1477 
1478  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1479 
1480  // Compute permutation of loops that sinks sequential loops (and thus raises
1481  // parallel loops) while preserving relative order.
1482  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1483  unsigned nextSequentialLoop = numParallelLoops;
1484  unsigned nextParallelLoop = 0;
1485  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1486  if (isParallelLoop[i]) {
1487  loopPermMap[i] = nextParallelLoop++;
1488  } else {
1489  loopPermMap[i] = nextSequentialLoop++;
1490  }
1491  }
1492 
1493  // Check if permutation 'loopPermMap' would violate dependences.
1494  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1495  return forOp;
1496  // Perform loop interchange according to permutation 'loopPermMap'.
1497  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1498  return loops[loopNestRootIndex];
1499 }
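// For example (hypothetical IR), in
//
// ```mlir
// affine.for %i = 1 to 10 {    // sequential: iteration i reads what i-1 wrote
//   affine.for %j = 0 to 20 {  // parallel
//     %0 = affine.load %A[%i - 1, %j] : memref<10x20xf32>
//     affine.store %0, %A[%i, %j] : memref<10x20xf32>
//   }
// }
// ```
//
// the %j loop is parallel while the %i loop carries a dependence, so the
// permutation {1, 0} is computed and, being legal here, the %j loop becomes
// the returned outermost loop.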
1500 
1501 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1502 // lower (resp. upper) loop bound. When called for both the lower and upper
1503 // bounds, the resulting IR resembles:
1504 //
1505 // ```mlir
1506 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1507 // ...
1508 // }
1509 // ```
1510 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1511  SmallVector<Value, 4> *operands,
1512  int64_t offset = 0) {
1513  auto bounds = llvm::to_vector<4>(map->getResults());
1514  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1515  operands->insert(operands->begin() + map->getNumDims(), iv);
1516  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1517  b.getContext());
1518  canonicalizeMapAndOperands(map, operands);
1519 }
1520 
1521 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1522 // Stripmine-sink is a primitive building block for generalized tiling of
1523 // imperfectly nested loops.
1524 // This transformation is purely mechanical and does not check legality,
1525 // profitability or even structural correctness. It is the user's
1526 // responsibility to specify `targets` that are dominated by `forOp`.
1527 // Returns the new AffineForOps, one per `targets`, nested immediately under
1528 // each of the `targets`.
1529 static SmallVector<AffineForOp, 8>
1530 stripmineSink(AffineForOp forOp, uint64_t factor,
1531  ArrayRef<AffineForOp> targets) {
1532  auto originalStep = forOp.getStepAsInt();
1533  auto scaledStep = originalStep * factor;
1534  forOp.setStep(scaledStep);
1535 
1536  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1537 
1538  // Lower-bound map creation.
1539  auto lbMap = forOp.getLowerBoundMap();
1540  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1541  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1542 
1543  // Upper-bound map creation.
1544  auto ubMap = forOp.getUpperBoundMap();
1545  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1546  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1547  /*offset=*/scaledStep);
1548 
1549  auto iv = forOp.getInductionVar();
1550  SmallVector<AffineForOp, 8> innerLoops;
1551  for (auto t : targets) {
1552  // Insert newForOp before the terminator of `t`.
1553  auto b = OpBuilder::atBlockTerminator(t.getBody());
1554  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1555  ubOperands, ubMap, originalStep);
1556  auto begin = t.getBody()->begin();
1557  // Skip terminator and `newForOp` which is just before the terminator.
1558  auto nOps = t.getBody()->getOperations().size() - 2;
1559  newForOp.getBody()->getOperations().splice(
1560  newForOp.getBody()->getOperations().begin(),
1561  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1562  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1563  newForOp.getRegion());
1564  innerLoops.push_back(newForOp);
1565  }
1566 
1567  return innerLoops;
1568 }
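// For instance (hypothetical IR), given
//
// ```mlir
// affine.for %i = 0 to 100 {
//   affine.for %j = 0 to 200 {
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```
//
// strip-mining %i by a factor of 10 with the %j loop as the single target
// roughly produces
//
// ```mlir
// #lb = affine_map<(d0) -> (0, d0)>
// #ub = affine_map<(d0) -> (100, d0 + 10)>
// affine.for %i = 0 to 100 step 10 {
//   affine.for %j = 0 to 200 {
//     affine.for %ii = max #lb(%i) to min #ub(%i) {
//       "test.use"(%ii, %j) : (index, index) -> ()
//     }
//   }
// }
// ```
//
// where the original body (which used %i) now sits under %ii.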
1569 
1570 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1571 // Returns the new AffineForOp, nested immediately under `target`.
1572 template <typename SizeType>
1573 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1574  AffineForOp target) {
1575  // TODO: Use cheap structural assertions that targets are nested under
1576  // forOp and that targets are not nested under each other when DominanceInfo
1577  // exposes the capability. It seems overkill to construct a whole function
1578  // dominance tree at this point.
1579  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1580  assert(res.size() == 1 && "Expected 1 inner forOp");
1581  return res[0];
1582 }
1583 
1584 std::vector<SmallVector<AffineForOp, 8>>
1585 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1586  ArrayRef<AffineForOp> targets) {
1587  std::vector<SmallVector<AffineForOp, 8>> res;
1588  SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
1589  for (auto it : llvm::zip(forOps, sizes)) {
1590  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1591  res.push_back(step);
1592  currentTargets = step;
1593  }
1594  return res;
1595 }
1596 
1597 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1598  ArrayRef<uint64_t> sizes,
1599  AffineForOp target) {
1600  SmallVector<AffineForOp, 8> res;
1601  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1602  assert(loops.size() == 1);
1603  res.push_back(loops[0]);
1604  }
1605  return res;
1606 }
1607 
1608 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1609  if (loops.size() < 2)
1610  return success();
1611 
1612  AffineForOp innermost = loops.back();
1613  AffineForOp outermost = loops.front();
1614  AffineBound ub = outermost.getUpperBound();
1615  AffineMap origUbMap = ub.getMap();
1616  Location loc = outermost.getLoc();
1617  OpBuilder builder(outermost);
1618  for (AffineForOp loop : loops) {
1619  // We only work on normalized loops.
1620  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1621  loop.getConstantLowerBound() != 0)
1622  return failure();
1623  }
1624  SmallVector<Value, 4> upperBoundSymbols;
1625  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1626  ub.getOperands().end());
1627 
1628  // 1. Store the upper bound of the outermost loop in a variable.
1629  Value prev;
1630  if (!llvm::hasSingleElement(origUbMap.getResults()))
1631  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1632  else
1633  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1634  upperBoundSymbols.push_back(prev);
1635 
1636  // 2. Emit code computing the upper bound of the coalesced loop as the
1637  // product of the number of iterations of all loops.
1638  for (AffineForOp loop : loops.drop_front()) {
1639  ub = loop.getUpperBound();
1640  origUbMap = ub.getMap();
1641  ubOperands = ub.getOperands();
1642  Value upperBound;
1643  // If upper bound map has more than one result, take their minimum.
1644  if (!llvm::hasSingleElement(origUbMap.getResults()))
1645  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1646  else
1647  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1648  upperBoundSymbols.push_back(upperBound);
1649  SmallVector<Value, 4> operands;
1650  operands.push_back(prev);
1651  operands.push_back(upperBound);
1652  // Maintain running product of loop upper bounds.
1653  prev = builder.create<AffineApplyOp>(
1654  loc,
1655  AffineMap::get(/*dimCount=*/1,
1656  /*symbolCount=*/1,
1657  builder.getAffineDimExpr(0) *
1658  builder.getAffineSymbolExpr(0)),
1659  operands);
1660  }
1661  // Set upper bound of the coalesced loop.
1662  AffineMap newUbMap = AffineMap::get(
1663  /*dimCount=*/0,
1664  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1665  outermost.setUpperBound(prev, newUbMap);
1666 
1667  builder.setInsertionPointToStart(outermost.getBody());
1668 
1669  // 3. Remap induction variables. For each original loop, the value of its
1670  // induction variable is obtained by taking the floor division of the
1671  // coalesced loop's induction variable by the product of the trip counts of
1672  // the loops nested inside it, modulo this loop's own trip count (this
1673  // strips off the contribution of the outer loops):
1674  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-inside-i) mod range_i.
1675  // Compute these iteratively from the innermost loop outwards by keeping a
1676  // "running quotient" of successive divisions by the ranges.
1677  Value previous = outermost.getInductionVar();
1678  for (unsigned idx = loops.size(); idx > 0; --idx) {
1679  if (idx != loops.size()) {
1680  SmallVector<Value, 4> operands;
1681  operands.push_back(previous);
1682  operands.push_back(upperBoundSymbols[idx]);
1683  previous = builder.create<AffineApplyOp>(
1684  loc,
1685  AffineMap::get(
1686  /*dimCount=*/1, /*symbolCount=*/1,
1687  builder.getAffineDimExpr(0).floorDiv(
1688  builder.getAffineSymbolExpr(0))),
1689  operands);
1690  }
1691  // Modified value of the induction variables of the nested loops after
1692  // coalescing.
1693  Value inductionVariable;
1694  if (idx == 1) {
1695  inductionVariable = previous;
1696  } else {
1697  SmallVector<Value, 4> applyOperands;
1698  applyOperands.push_back(previous);
1699  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1700  inductionVariable = builder.create<AffineApplyOp>(
1701  loc,
1702  AffineMap::get(
1703  /*dimCount=*/1, /*symbolCount=*/1,
1704  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1705  applyOperands);
1706  }
1707  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1708  inductionVariable, loops.back().getRegion());
1709  }
1710 
1711  // 4. Move the innermost loop's operations to just above the second-outermost
1712  // loop, then delete the extra terminator and the second-outermost loop.
1713  AffineForOp secondOutermostLoop = loops[1];
1714  innermost.getBody()->back().erase();
1715  outermost.getBody()->getOperations().splice(
1716  Block::iterator(secondOutermostLoop.getOperation()),
1717  innermost.getBody()->getOperations());
1718  secondOutermostLoop.erase();
1719  return success();
1720 }
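// For example (hypothetical IR), coalescing the normalized 2-d nest
//
// ```mlir
// affine.for %i = 0 to %M {
//   affine.for %j = 0 to %N {
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```
//
// yields, roughly (after simplification of the generated affine.apply ops):
//
// ```mlir
// %ub = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%M)[%N]
// affine.for %iv = 0 to %ub {
//   %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%N]
//   %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%N]
//   "test.use"(%i, %j) : (index, index) -> ()
// }
// ```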
1721 
1722 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1723  ArrayRef<Value> processorId,
1724  ArrayRef<Value> numProcessors) {
1725  assert(processorId.size() == numProcessors.size());
1726  if (processorId.empty())
1727  return;
1728 
1729  OpBuilder b(forOp);
1730  Location loc(forOp.getLoc());
1731  AffineExpr lhs, rhs;
1732  bindSymbols(forOp.getContext(), lhs, rhs);
1733  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1734  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1735 
1736  Value linearIndex = processorId.front();
1737  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1738  auto mulApplyOp = b.create<AffineApplyOp>(
1739  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1740  linearIndex = b.create<AffineApplyOp>(
1741  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1742  }
1743 
1744  auto mulApplyOp = b.create<AffineApplyOp>(
1745  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1746  Value lb = b.create<AffineApplyOp>(
1747  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1748  forOp.setLowerBound(lb);
1749 
1750  Value step = forOp.getStep();
1751  for (auto numProcs : numProcessors)
1752  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1753  forOp.setStep(step);
1754 }
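// For a (hypothetical) `scf.for %i = %lb to %ub step %s` mapped onto a 2-d
// grid with processorId = {%bx, %tx} and numProcessors = {%nbx, %ntx}, the
// loop is rewritten so that
//   new lower bound = %lb + (%bx * %ntx + %tx) * %s
//   new step        = %s * %nbx * %ntx
// i.e. each processor starts at its linearized id (scaled by the step) and
// strides by the total number of processors.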
1755 
1756 /// Given a memref region, determine the lowest depth at which transfers can be
1757 /// placed for it, and return the corresponding block, start and end positions
1758 /// in the block for placing incoming (read) and outgoing (write) copies
1759 /// respectively. The lowest depth depends on whether the region being accessed
1760 /// is hoistable with respect to one or more immediately surrounding loops.
1761 static void
1762 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1763  Block::iterator &begin, Block::iterator &end,
1764  Block **copyPlacementBlock,
1765  Block::iterator *copyInPlacementStart,
1766  Block::iterator *copyOutPlacementStart) {
1767  const auto *cst = region.getConstraints();
1768  SmallVector<Value, 4> symbols;
1769  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1770 
1771  SmallVector<AffineForOp, 4> enclosingFors;
1772  getAffineForIVs(*block.begin(), &enclosingFors);
1773  // Walk up loop parents till we find an IV on which this region is
1774  // symbolic/variant.
1775  auto it = enclosingFors.rbegin();
1776  for (auto e = enclosingFors.rend(); it != e; ++it) {
1777  // TODO: for region symbols that aren't loop IVs, we also need to check
1778  // whether we are within their respective defs' dominance scope.
1779  if (llvm::is_contained(symbols, it->getInductionVar()))
1780  break;
1781  }
1782 
1783  if (it != enclosingFors.rbegin()) {
1784  auto lastInvariantIV = *std::prev(it);
1785  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1786  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1787  *copyPlacementBlock = lastInvariantIV->getBlock();
1788  } else {
1789  *copyInPlacementStart = begin;
1790  *copyOutPlacementStart = end;
1791  *copyPlacementBlock = &block;
1792  }
1793 }
1794 
1795 // Info comprising stride and number of elements transferred every stride.
1796 struct StrideInfo {
1797  int64_t stride;
1798  int64_t numEltPerStride;
1799 };
1800 
1801 /// Returns striding information for a copy/transfer of this region with
1802 /// potentially multiple striding levels from outermost to innermost. For an
1803 /// n-dimensional region, there can be at most n-1 levels of striding
1804 /// successively nested.
1805 // TODO: make this work with non-identity layout maps.
1806 static void getMultiLevelStrides(const MemRefRegion &region,
1807  ArrayRef<int64_t> bufferShape,
1808  SmallVectorImpl<StrideInfo> *strideInfos) {
1809  if (bufferShape.size() <= 1)
1810  return;
1811 
1812  int64_t numEltPerStride = 1;
1813  int64_t stride = 1;
1814  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1815  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1816  stride *= dimSize;
1817  numEltPerStride *= bufferShape[d];
1818  // A stride is needed only if the region has a shorter extent than the
1819  // memref along the dimension *and* has an extent greater than one along the
1820  // next major dimension.
1821  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1822  strideInfos->push_back({stride, numEltPerStride});
1823  }
1824  }
1825 }
1826 
1827 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1828 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1829 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1830 /// bound information for the copy loop nest. `fastBufOffsets` contain the
1831 /// expressions to be subtracted out from the respective copy loop iterators in
1832 /// order to index the fast buffer. If `isCopyOut' is true, generates a copy-out;
1833 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1834 /// inserted.
1835 //
1836 /// The copy-in nest is generated as follows as an example for a 2-d region:
1837 /// for x = ...
1838 /// for y = ...
1839 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1840 ///
1841 static AffineForOp
1842 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1843  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1844  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1845  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1846  OpBuilder b) {
1847  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1848  return lbMap.getNumInputs() == lbOperands.size();
1849  }));
1850  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1851  return ubMap.getNumInputs() == ubOperands.size();
1852  }));
1853 
1854  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1855  assert(lbMaps.size() == rank && "wrong number of lb maps");
1856  assert(ubMaps.size() == rank && "wrong number of ub maps");
1857 
1858  SmallVector<Value, 4> memIndices;
1859  SmallVector<AffineExpr, 4> fastBufExprs;
1860  SmallVector<Value, 4> fastBufMapOperands;
1861  AffineForOp copyNestRoot;
1862  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1863  for (unsigned d = 0; d < rank; ++d) {
1864  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1865  ubOperands, ubMaps[d]);
1866  if (d == 0)
1867  copyNestRoot = forOp;
1868 
1869  b = OpBuilder::atBlockTerminator(forOp.getBody());
1870 
1871  auto fastBufOffsetMap =
1872  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1873  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1874 
1875  // Construct the subscript for the fast memref being copied into/from:
1876  // x - offset_x.
1877  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1878  b.getAffineDimExpr(2 * d));
1879  fastBufMapOperands.push_back(offset);
1880  fastBufMapOperands.push_back(forOp.getInductionVar());
1881  mayBeDeadApplys.push_back(offset);
1882 
1883  // Subscript for the slow memref being copied.
1884  memIndices.push_back(forOp.getInductionVar());
1885  }
1886 
1887  auto fastBufMap =
1888  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1889  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1890  fastBufMap = simplifyAffineMap(fastBufMap);
1891  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1892 
1893  // Drop any dead affine.applys.
1894  for (auto applyOp : mayBeDeadApplys)
1895  if (applyOp.use_empty())
1896  applyOp.erase();
1897 
1898  if (!isCopyOut) {
1899  // Copy in.
1900  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1901  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1902  fastBufMapOperands);
1903  return copyNestRoot;
1904  }
1905 
1906  // Copy out.
1907  auto load =
1908  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1909  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1910  return copyNestRoot;
1911 }
1912 
1913 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1914 emitRemarkForBlock(Block &block) {
1915  return block.getParentOp()->emitRemark();
1916 }
1917 
1918 /// Creates a buffer in the faster memory space for the specified memref region;
1919 /// generates a copy from the lower memory space to this one, and rewrites all
1920 /// loads/stores in the block range [`begin', `end') of `block' to load/store
1921 /// from that buffer. Returns failure if copies could not be generated due to
1922 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
1923 /// in copyPlacementBlock specify the insertion points where the incoming copies
1924 /// and outgoing copies, respectively, should be inserted (the insertion happens
1925 /// right before the insertion point). Since `begin` can itself be invalidated
1926 /// due to the memref rewriting done from this method, the output argument
1927 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
1928 /// happens). Since outgoing copies could have been inserted at `end`, the
1929 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
1930 /// size of the fast buffer allocated.
1931 static LogicalResult generateCopy(
1932  const MemRefRegion &region, Block *block, Block::iterator begin,
1933  Block::iterator end, Block *copyPlacementBlock,
1934  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1935  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1936  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1937  Block::iterator *nBegin, Block::iterator *nEnd) {
1938  *nBegin = begin;
1939  *nEnd = end;
1940 
1941  func::FuncOp f = begin->getParentOfType<func::FuncOp>();
1942  OpBuilder topBuilder(f.getBody());
1943  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1944 
1945  *sizeInBytes = 0;
1946 
1947  if (begin == end)
1948  return success();
1949 
1950  // Check whether the copy-out point is at the end of the block where we are
1951  // doing explicit copying.
1952  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1953 
1954  // Copies for read regions are going to be inserted at 'begin'.
1955  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1956  // Copies for write regions are going to be inserted at 'end'.
1957  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1958  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1959 
1960  // Builder to create constants at the top level.
1961  auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
1962  OpBuilder top(func.getBody());
1963 
1964  auto loc = region.loc;
1965  auto memref = region.memref;
1966  auto memRefType = cast<MemRefType>(memref.getType());
1967 
1968  if (!memRefType.getLayout().isIdentity()) {
1969  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1970  return failure();
1971  }
1972 
1973  // Indices to use for the copying.
1974  // Indices for the original memref being copied from/to.
1975  SmallVector<Value, 4> memIndices;
1976  // Indices for the faster buffer being copied into/from.
1977  SmallVector<Value, 4> bufIndices;
1978 
1979  unsigned rank = memRefType.getRank();
1980  SmallVector<int64_t, 4> fastBufferShape;
1981 
1982  // Compute the extents of the buffer.
1983  std::vector<SmallVector<int64_t, 4>> lbs;
1984  SmallVector<int64_t, 8> lbDivisors;
1985  lbs.reserve(rank);
1986  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
1987  &fastBufferShape, &lbs, &lbDivisors);
1988  if (!numElements) {
1989  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
1990  return failure();
1991  }
1992 
1993  if (*numElements == 0) {
1994  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
1995  return success();
1996  }
1997 
1998  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
1999  for (unsigned i = 0; i < rank; ++i)
2000  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2001 
2002  const FlatAffineValueConstraints *cst = region.getConstraints();
2003  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2004  // on; these typically include loop IVs surrounding the level at which the
2005  // copy generation is being done or other valid symbols in MLIR.
2006  SmallVector<Value, 8> regionSymbols;
2007  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2008 
2009  // Construct the index expressions for the fast memory buffer. The index
2010  // expression for a particular dimension of the fast buffer is obtained by
2011  // subtracting out the lower bound on the original memref's data region
2012  // along the corresponding dimension.
2013 
2014  // Index start offsets for faster memory buffer relative to the original.
2015  SmallVector<AffineExpr, 4> fastBufOffsets;
2016  fastBufOffsets.reserve(rank);
2017  for (unsigned d = 0; d < rank; d++) {
2018  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2019 
2020  AffineExpr offset = top.getAffineConstantExpr(0);
2021  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2022  offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2023  assert(lbDivisors[d] > 0);
2024  offset =
2025  (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2026 
2027  // Set copy start location for this dimension in the lower memory space
2028  // memref.
2029  if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
2030  auto indexVal = caf.getValue();
2031  if (indexVal == 0) {
2032  memIndices.push_back(zeroIndex);
2033  } else {
2034  memIndices.push_back(
2035  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2036  }
2037  } else {
2038  // The coordinate for the start location is just the lower bound along the
2039  // corresponding dimension on the memory region (stored in 'offset').
2040  auto map = AffineMap::get(
2041  cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2042  memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2043  }
2044  // The fast buffer is copied into at location zero; addressing is relative.
2045  bufIndices.push_back(zeroIndex);
2046 
2047  // Record the offsets since they are needed to remap the memory accesses of
2048  // the original memref further below.
2049  fastBufOffsets.push_back(offset);
2050  }
2051 
2052  // The faster memory space buffer.
2053  Value fastMemRef;
2054 
2055  // Check if a buffer was already created.
2056  bool existingBuf = fastBufferMap.count(memref) > 0;
2057  if (!existingBuf) {
2058  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2059  auto fastMemRefType =
2060  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2061  fastBufferLayout, copyOptions.fastMemorySpace);
2062 
2063  // Create the fast memory space buffer just before the 'affine.for'
2064  // operation.
2065  fastMemRef =
2066  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2067  // Record it.
2068  fastBufferMap[memref] = fastMemRef;
2069  // fastMemRefType is a constant shaped memref.
2070  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2071  // We don't account for things of unknown size.
2072  *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0;
2073 
2074  LLVM_DEBUG(emitRemarkForBlock(*block)
2075  << "Creating fast buffer of type " << fastMemRefType
2076  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2077  << " KiB\n");
2078  } else {
2079  // Reuse the one already created.
2080  fastMemRef = fastBufferMap[memref];
2081  }
2082 
2083  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2084 
2085  Value dmaStride = nullptr;
2086  Value numEltPerDmaStride = nullptr;
2087  if (copyOptions.generateDma) {
2088  SmallVector<StrideInfo, 4> dmaStrideInfos;
2089  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2090 
2091  // TODO: use all stride levels once DmaStartOp is extended for
2092  // multi-level strides.
2093  if (dmaStrideInfos.size() > 1) {
2094  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2095  return failure();
2096  }
2097 
2098  if (!dmaStrideInfos.empty()) {
2099  dmaStride =
2100  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2101  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2102  loc, dmaStrideInfos[0].numEltPerStride);
2103  }
2104  }
2105 
2106  // Record the last operation where we want the memref replacement to end. We
2107  // later do the memref replacement only in [begin, postDomFilter] so
2108  // that the original memrefs used in the data movement code themselves don't
2109  // get replaced.
2110  auto postDomFilter = std::prev(end);
2111 
2112  // Create fully composed affine maps for each memref.
2113  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2114  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2115  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2116  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2117 
2118  if (!copyOptions.generateDma) {
2119  // Point-wise copy generation.
2120  auto copyNest =
2121  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2122  /*lbOperands=*/regionSymbols, ubMaps,
2123  /*ubOperands=*/regionSymbols, fastBufOffsets,
2124  /*isCopyOut=*/region.isWrite(), b);
2125 
2126  // Record this so that we can skip it from yet another copy.
2127  copyNests.insert(copyNest);
2128 
2129  // Since new ops are being appended (for copy-outs), adjust the end to
2130  // mark end of block range being processed if necessary.
2131  if (region.isWrite() && isCopyOutAtEndOfBlock)
2132  *nEnd = Block::iterator(copyNest.getOperation());
2133  } else {
2134  // DMA generation.
2135  // Create a tag (single element 1-d memref) for the DMA.
2136  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2137  copyOptions.tagMemorySpace);
2138  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2139 
2140  SmallVector<Value, 4> tagIndices({zeroIndex});
2141  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2142  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2143  if (!region.isWrite()) {
2144  // DMA non-blocking read from original buffer to fast buffer.
2145  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2146  fastMemRef, bufAffineMap, bufIndices,
2147  tagMemRef, tagAffineMap, tagIndices,
2148  numElementsSSA, dmaStride, numEltPerDmaStride);
2149  } else {
2150  // DMA non-blocking write from fast buffer to the original memref.
2151  auto op = b.create<AffineDmaStartOp>(
2152  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2153  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2154  dmaStride, numEltPerDmaStride);
2155  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2156  // end to mark end of block range being processed.
2157  if (isCopyOutAtEndOfBlock)
2158  *nEnd = Block::iterator(op.getOperation());
2159  }
2160 
2161  // Matching DMA wait to block on completion; tag always has a 0 index.
2162  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2163  numElementsSSA);
2164 
2165  // Generate dealloc for the tag.
2166  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2167  if (*nEnd == end && isCopyOutAtEndOfBlock)
2168  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2169  // mark end of range of the original.
2170  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2171  }
2172 
2173  // Generate dealloc for the buffer.
2174  if (!existingBuf) {
2175  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2176  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2177  // the fast buffer (since it marks the new end insertion point).
2178  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2179  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2180  }
2181 
2182  // Replace all uses of the old memref with the faster one while remapping
2183  // access indices (subtracting out lower bound offsets for each dimension).
2184  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2185  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2186  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2187  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2188  // d2, d3 correspond to the original indices (%i, %j).
2189  SmallVector<AffineExpr, 4> remapExprs;
2190  remapExprs.reserve(rank);
2191  for (unsigned i = 0; i < rank; i++) {
2192  // The starting operands of indexRemap will be regionSymbols (the symbols on
2193  // which the memref region is parametric); then those corresponding to
2194  // the memref's original indices follow.
2195  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2196  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2197  }
2198  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2199  b.getContext());
2200 
2201  // Record the begin since it may be invalidated by memref replacement.
2202  Block::iterator prevOfBegin;
2203  bool isBeginAtStartOfBlock = (begin == block->begin());
2204  if (!isBeginAtStartOfBlock)
2205  prevOfBegin = std::prev(begin);
2206 
2207  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2208  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2209  /*extraIndices=*/{}, indexRemap,
2210  /*extraOperands=*/regionSymbols,
2211  /*symbolOperands=*/{},
2212  /*domOpFilter=*/&*begin,
2213  /*postDomOpFilter=*/&*postDomFilter);
2214 
2215  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2216 
2217  return success();
2218 }
2219 
2220 /// Construct the memref region to just include the entire memref. Returns false
2221 /// for dynamically shaped memrefs (for now). `numParamLoopIVs` is the number of
2222 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2223 /// is parametric on.
2224 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2225  MemRefRegion *region) {
2226  unsigned rank;
2227  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2228  rank = loadOp.getMemRefType().getRank();
2229  region->memref = loadOp.getMemRef();
2230  region->setWrite(false);
2231  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2232  rank = storeOp.getMemRefType().getRank();
2233  region->memref = storeOp.getMemRef();
2234  region->setWrite(true);
2235  } else {
2236  assert(false && "expected load or store op");
2237  return false;
2238  }
2239  auto memRefType = cast<MemRefType>(region->memref.getType());
2240  if (!memRefType.hasStaticShape())
2241  return false;
2242 
2243  auto *regionCst = region->getConstraints();
2244 
2245  // Just get the first numSymbols IVs, which the memref region is parametric
2246  // on.
2247  SmallVector<AffineForOp, 4> ivs;
2248  getAffineForIVs(*op, &ivs);
2249  ivs.resize(numParamLoopIVs);
2250  SmallVector<Value, 4> symbols;
2251  extractForInductionVars(ivs, &symbols);
2252  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2253  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2254 
2255  // Memref dim sizes provide the bounds.
2256  for (unsigned d = 0; d < rank; d++) {
2257  auto dimSize = memRefType.getDimSize(d);
2258  assert(dimSize > 0 && "filtered dynamic shapes above");
2259  regionCst->addBound(BoundType::LB, d, 0);
2260  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2261  }
2262  return true;
2263 }
2264 
2265 LogicalResult mlir::affine::affineDataCopyGenerate(Block::iterator begin,
2266  Block::iterator end,
2267  const AffineCopyOptions &copyOptions,
2268  std::optional<Value> filterMemRef,
2269  DenseSet<Operation *> &copyNests) {
2270  if (begin == end)
2271  return success();
2272 
2273  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2274  "Inconsistent block begin/end args");
2275  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2276 
2277  Block *block = begin->getBlock();
2278 
2279  // Copies will be generated for this depth, i.e., symbolic in all loops
2280  // surrounding this block range.
2281  unsigned copyDepth = getNestingDepth(&*begin);
2282 
2283  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2284  << "\n");
2285  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2286  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2287 
2288  // List of memory regions to copy for. We need a map vector to have a
2289  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2290  // since the alloc's for example are identical except for the SSA id.
2291  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2292  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2293 
2294  // Map from original memref's to the fast buffers that their accesses are
2295  // replaced with.
2296  DenseMap<Value, Value> fastBufferMap;
2297 
2298  // To check for errors when walking the block.
2299  bool error = false;
2300 
2301  // Walk this range of operations to gather all memory regions.
2302  block->walk(begin, end, [&](Operation *opInst) {
2303  // Gather regions to allocate to buffers in faster memory space.
2304  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2305  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2306  (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2307  copyOptions.slowMemorySpace))
2308  return;
2309  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2310  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2311  storeOp.getMemRefType().getMemorySpaceAsInt() !=
2312  copyOptions.slowMemorySpace)
2313  return;
2314  } else {
2315  // Neither load nor a store op.
2316  return;
2317  }
2318 
2319  // Compute the MemRefRegion accessed.
2320  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2321  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2322  /*addMemRefDimBounds=*/false))) {
2323  LLVM_DEBUG(llvm::dbgs()
2324  << "Error obtaining memory region: semi-affine maps?\n");
2325  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2326  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2327  LLVM_DEBUG(
2328  opInst->emitError("non-constant memref sizes not yet supported"));
2329  error = true;
2330  return;
2331  }
2332  }
2333 
2334  // Each memref has a single buffer associated with it irrespective of how
2335  // many loads and stores happen on it.
2336  // TODO: in the future, when regions don't intersect and satisfy
2337  // other properties (based on load/store regions), we could consider
2338  // multiple buffers per memref.
2339 
2340  // Add to the appropriate region if it's not already in it, or take a
2341  // bounding box union with the existing one if it's already in there.
2342  // Note that a memref may have both read and write regions - so update the
2343  // region in the other list if one exists (write in case of read and vice
2344  // versa) since there is a single bounding box for a memref across all reads
2345  // and writes that happen on it.
2346 
2347  // Attempts to update; returns true if 'region' exists in targetRegions.
2348  auto updateRegion =
2349  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2350  &targetRegions) {
2351  const auto *const it = targetRegions.find(region->memref);
2352  if (it == targetRegions.end())
2353  return false;
2354 
2355  // Perform a union with the existing region.
2356  if (failed(it->second->unionBoundingBox(*region))) {
2357  LLVM_DEBUG(llvm::dbgs()
2358  << "Memory region bounding box failed; "
2359  "over-approximating to the entire memref\n");
2360  // If the union fails, we will overapproximate.
2361  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2362  LLVM_DEBUG(opInst->emitError(
2363  "non-constant memref sizes not yet supported"));
2364  error = true;
2365  return true;
2366  }
2367  it->second->getConstraints()->clearAndCopyFrom(
2368  *region->getConstraints());
2369  } else {
2370  // Union was computed and stored in 'it->second': copy to 'region'.
2371  region->getConstraints()->clearAndCopyFrom(
2372  *it->second->getConstraints());
2373  }
2374  return true;
2375  };
2376 
2377  bool existsInRead = updateRegion(readRegions);
2378  if (error)
2379  return;
2380  bool existsInWrite = updateRegion(writeRegions);
2381  if (error)
2382  return;
2383 
2384  // Finally add it to the region list.
2385  if (region->isWrite() && !existsInWrite) {
2386  writeRegions[region->memref] = std::move(region);
2387  } else if (!region->isWrite() && !existsInRead) {
2388  readRegions[region->memref] = std::move(region);
2389  }
2390  });
2391 
2392  if (error) {
2393  LLVM_DEBUG(begin->emitError(
2394  "copy generation failed for one or more memref's in this block\n"));
2395  return failure();
2396  }
2397 
2398  uint64_t totalCopyBuffersSizeInBytes = 0;
2399  bool ret = true;
2400  auto processRegions =
2401  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2402  &regions) {
2403  for (const auto &regionEntry : regions) {
2404  // For each region, hoist copy in/out past all hoistable
2405  // 'affine.for's.
2406  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2407  Block *copyPlacementBlock;
2408  findHighestBlockForPlacement(
2409  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2410  &copyInPlacementStart, &copyOutPlacementStart);
2411 
2412  uint64_t sizeInBytes;
2413  Block::iterator nBegin, nEnd;
2414  LogicalResult iRet = generateCopy(
2415  *regionEntry.second, block, begin, end, copyPlacementBlock,
2416  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2417  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2418  if (succeeded(iRet)) {
2419  // begin/end could have been invalidated, and need update.
2420  begin = nBegin;
2421  end = nEnd;
2422  totalCopyBuffersSizeInBytes += sizeInBytes;
2423  }
2424  ret = ret & succeeded(iRet);
2425  }
2426  };
2427  processRegions(readRegions);
2428  processRegions(writeRegions);
2429 
2430  if (!ret) {
2431  LLVM_DEBUG(begin->emitError(
2432  "copy generation failed for one or more memref's in this block\n"));
2433  return failure();
2434  }
2435 
2436  // For a range of operations, a note will be emitted at the caller.
2437  AffineForOp forOp;
2438  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2439  LLVM_DEBUG(forOp.emitRemark()
2440  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2441  << " KiB of copy buffers in fast memory space for this block");
2442  }
2443 
2444  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2445  block->getParentOp()->emitWarning(
2446  "total size of all copy buffers' for this block exceeds fast memory "
2447  "capacity");
2448  }
2449 
2450  return success();
2451 }
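// As a rough sketch (hypothetical input, simplified output, fast memory space
// assumed to be 1 and DMA generation off), for
//
// ```mlir
// affine.for %i = 0 to 256 {
//   %v = affine.load %A[%i] : memref<256xf32>
//   "test.use"(%v) : (f32) -> ()
// }
// ```
//
// copy generation produces something like
//
// ```mlir
// %buf = memref.alloc() : memref<256xf32, 1>
// affine.for %i = 0 to 256 {
//   %v = affine.load %A[%i] : memref<256xf32>
//   affine.store %v, %buf[%i] : memref<256xf32, 1>
// }
// affine.for %i = 0 to 256 {
//   %v = affine.load %buf[%i] : memref<256xf32, 1>
//   "test.use"(%v) : (f32) -> ()
// }
// memref.dealloc %buf : memref<256xf32, 1>
// ```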
2452 
2453 // A convenience version of affineDataCopyGenerate for all ops in the body of
2454 // an AffineForOp.
2455 LogicalResult mlir::affine::affineDataCopyGenerate(
2456  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2457  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2458  return affineDataCopyGenerate(forOp.getBody()->begin(),
2459  std::prev(forOp.getBody()->end()), copyOptions,
2460  filterMemRef, copyNests);
2461 }
2462 
2463 LogicalResult mlir::affine::generateCopyForMemRegion(
2464  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2465  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2466  Block *block = analyzedOp->getBlock();
2467  auto begin = analyzedOp->getIterator();
2468  auto end = std::next(begin);
2469  DenseMap<Value, Value> fastBufferMap;
2470  DenseSet<Operation *> copyNests;
2471 
2472  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2473  copyOptions, fastBufferMap, copyNests,
2474  &result.sizeInBytes, &begin, &end);
2475  if (failed(err))
2476  return err;
2477 
2478  const auto &en = fastBufferMap.find(memrefRegion.memref);
2479  // In some cases (empty loops), no copy generation would have happened.
2480  if (en == fastBufferMap.end())
2481  return failure();
2482  result.alloc = en->second.getDefiningOp();
2483  assert(result.alloc && "fast buffer expected to be locally allocated");
2484  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2485  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2486  return success();
2487 }
2488 
2489 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2490 static void
2491 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2492  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2493  // Add a new empty level to the output if it doesn't already exist.
2494  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2495  if (currLoopDepth == depthToLoops.size())
2496  depthToLoops.emplace_back();
2497 
2498  for (auto &op : *block) {
2499  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2500  depthToLoops[currLoopDepth].push_back(forOp);
2501  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2502  }
2503  }
2504 }
2505 
2506 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2507 void mlir::affine::gatherLoops(
2508  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2509  for (auto &block : func)
2510  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2511 
2512  // Remove last loop level from output since it's empty.
2513  if (!depthToLoops.empty()) {
2514  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2515  depthToLoops.pop_back();
2516  }
2517 }
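// For instance, for a function whose body holds a 2-d loop nest followed by a
// separate 1-d loop, depthToLoops[0] holds the nest's root loop and the
// standalone loop, while depthToLoops[1] holds the nest's inner loop.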
2518 
2519 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2520  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2521  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2522  SmallVector<Value, 4> lowerOperands(lbOperands);
2523  SmallVector<Value, 4> upperOperands(ubOperands);
2524 
2525  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2526  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2527  lbMap = removeDuplicateExprs(lbMap);
2528  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2529  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2530  ubMap = removeDuplicateExprs(ubMap);
2531 
2532  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2533  step);
2534 }
2535 
2536 /// Creates an AffineIfOp that encodes the conditional to choose between
2537 /// the constant trip count version and an unknown trip count version of this
2538 /// nest of loops. This is used to separate partial and full tiles if `loops`
2539 /// has the intra-tile loops. The affine.if op is inserted at the builder
2540 /// insertion point of `b`.
2541 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2542  OpBuilder b) {
2543  if (loops.empty())
2544  return nullptr;
2545 
2546  auto *context = loops[0].getContext();
2547 
2548  FlatAffineValueConstraints cst;
2549  SmallVector<Operation *, 8> ops;
2550  llvm::append_range(ops, loops);
2551  (void)getIndexSet(ops, &cst);
2552 
2553  // Remove constraints that are independent of these loop IVs.
2554  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2555 
2556  // Construct the constraint set representing the guard for full tiles. The
2557  // lower bound (and upper bound) corresponding to the full tile should be
2558  // larger (and resp. smaller) than any other lower (or upper bound).
2559  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2560  for (auto loop : loops) {
2561  (void)loop;
2562  // TODO: generalizing to a non-unit stride should be straightforward.
2563  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2564  // Mark everything symbols for the purpose of finding a constant diff pair.
2565  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2566  1);
2567  unsigned fullTileLbPos, fullTileUbPos;
2568  if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2569  /*boundFloorDivisor=*/nullptr,
2570  /*ub=*/nullptr, &fullTileLbPos,
2571  &fullTileUbPos)) {
2572  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2573  return nullptr;
2574  }
2575 
2576  SmallVector<unsigned, 4> lbIndices, ubIndices;
2577  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2578 
2579  auto fLb = cst.getInequality(fullTileLbPos);
2580  auto fUb = cst.getInequality(fullTileUbPos);
2581  fullTileLb.assign(fLb.begin(), fLb.end());
2582  fullTileUb.assign(fUb.begin(), fUb.end());
2583 
2584  // Full tile lower bound should be >= any other lower bound.
2585  for (auto lbIndex : lbIndices)
2586  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2587  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2588 
2589  // Full tile upper bound should be <= any other upper bound.
2590  for (auto ubIndex : ubIndices)
2591  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2592  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2593 
2594  cst.removeVar(0);
2595  }
2596 
2597  // The previous step leads to all zeros for the full tile lb and ub position
2598  // itself; remove those and any other duplicates / trivial redundancies.
2599  cst.removeTrivialRedundancy();
2600 
2601  // Turn everything into dims conservatively since we earlier turned all
2602  // trailing ids past point loop IV into symbols. Some of these could be outer
2603  // loop IVs; we'll canonicalize anyway.
2604  cst.setDimSymbolSeparation(0);
2605 
2606  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2607  // ifCondSet can be null if cst was empty -- this can happen if all loops
2608  // in the nest have constant trip counts.
2609  if (!ifCondSet)
2610  return nullptr;
2611 
2612  SmallVector<Value, 4> setOperands;
2613  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2614  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2615  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2616  /*withElseRegion=*/true);
2617 }
2618 
2619 /// Create the full tile loop nest (along with its body).
2620 static LogicalResult
2621 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2622  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2623  fullTileLoops.reserve(inputNest.size());
2624 
2625  // For each loop in the original nest identify a lower/upper bound pair such
2626  // that their difference is a constant.
2627  FlatAffineValueConstraints cst;
2628  for (auto loop : inputNest) {
2629  // TODO: straightforward to generalize to a non-unit stride.
2630  if (loop.getStepAsInt() != 1) {
2631  LLVM_DEBUG(llvm::dbgs()
2632  << "[tile separation] non-unit stride not implemented\n");
2633  return failure();
2634  }
2635  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2636  (void)getIndexSet(loopOp, &cst);
2637  // We will mark everything other than this loop IV as symbol for getting a
2638  // pair of <lb, ub> with a constant difference.
2639  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2640  unsigned lbPos, ubPos;
2641  if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2642  /*boundFloorDivisor=*/nullptr,
2643  /*ub=*/nullptr, &lbPos, &ubPos) ||
2644  lbPos == ubPos) {
2645  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2646  "equalities not yet handled\n");
2647  return failure();
2648  }
2649 
2650  // Set all variables as dimensions uniformly since some of those marked as
2651  // symbols above could be outer loop IVs (corresponding tile space IVs).
2652  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2653 
2654  AffineValueMap lbVmap, ubVmap;
2655  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2656  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2657  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2658  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2659  ubVmap.getOperands(), ubVmap.getAffineMap());
2660  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2661  fullTileLoops.push_back(fullTileLoop);
2662  }
2663 
2664  // Add the body for the full tile loop nest.
2665  IRMapping operandMap;
2666  for (const auto &loopEn : llvm::enumerate(inputNest))
2667  operandMap.map(loopEn.value().getInductionVar(),
2668  fullTileLoops[loopEn.index()].getInductionVar());
2669  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2670  for (auto &op : inputNest.back().getBody()->without_terminator())
2671  b.clone(op, operandMap);
2672  return success();
2673 }
2674 
2675 LogicalResult
2676 mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2677  SmallVectorImpl<AffineForOp> *fullTileNest) {
2678  if (inputNest.empty())
2679  return success();
2680 
2681  auto firstLoop = inputNest[0];
2682 
2684  // Each successive for op has to be nested in the previous one.
2684  auto prevLoop = firstLoop;
2685  for (auto loop : inputNest.drop_front(1)) {
2686  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2687  prevLoop = loop;
2688  }
2689 
2690  // Create the full tile loop nest.
2691  SmallVector<AffineForOp, 4> fullTileLoops;
2692  OpBuilder b(firstLoop);
2693  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2694  if (!fullTileLoops.empty())
2695  fullTileLoops.front().erase();
2696  return failure();
2697  }
2698 
2699  // Create and insert the version select right before the root of the nest.
2700  b = OpBuilder(firstLoop);
2701  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2702  if (!ifOp) {
2703  fullTileLoops.front().erase();
2704  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2705  "separation condition\n");
2706  return failure();
2707  }
2708 
2709  // Move the full tile into the then block.
2710  Block *thenBlock = ifOp.getThenBlock();
2711  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2712  thenBlock->getOperations().splice(
2713  std::prev(thenBlock->end()),
2714  outermostFullTileLoop->getBlock()->getOperations(),
2715  Block::iterator(outermostFullTileLoop));
2716 
2717  // Move the partial tile into the else block. The partial tile is the same as
2718  // the original loop nest.
2719  Block *elseBlock = ifOp.getElseBlock();
2720  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2721  firstLoop->getBlock()->getOperations(),
2722  Block::iterator(firstLoop));
2723 
2724  if (fullTileNest)
2725  *fullTileNest = std::move(fullTileLoops);
2726 
2727  return success();
2728 }
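
A hedged usage sketch of this entry point from a hypothetical driver (separateBand, rootForOp, and the vector sizes are illustrative assumptions): gather the perfect nest rooted at the outermost tiled loop and split it into the guarded full-tile version and the original partial-tile version.

// Hypothetical driver: separate full tiles for the band rooted at 'rootForOp'.
static void separateBand(AffineForOp rootForOp) {
  SmallVector<AffineForOp, 6> band;
  getPerfectlyNestedLoops(band, rootForOp);
  SmallVector<AffineForOp, 6> fullTileNest;
  if (failed(separateFullTiles(band, &fullTileNest)))
    return; // Nothing changed, e.g. non-unit stride or non-constant tile extent.
  // 'fullTileNest' now holds the constant trip-count copy placed in the 'then'
  // block of the generated affine.if; the original band forms the 'else' copy.
}

On failure the original nest is left in place (any partially created full-tile loops are erased above), so the call is safe to attempt opportunistically.
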
2729 
2730 LogicalResult mlir::affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2731  LogicalResult result(failure());
2732  SmallVector<AffineForOp> loops;
2733  getPerfectlyNestedLoops(loops, op);
2734  if (loops.size() <= 1)
2735  return success();
2736 
2737  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2738  // loops with bounds defined above some loop.
2739  // 1. For each loop, find above which parent loop its operands are
2740  // defined.
2741  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2742  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2743  operandsDefinedAbove[i] = i;
2744  for (unsigned j = 0; j < i; ++j) {
2745  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2746  operandsDefinedAbove[i] = j;
2747  break;
2748  }
2749  }
2750  }
2751 
2752  // 2. Identify bands of loops such that the operands of all of them are
2753  // defined above the first loop in the band. Traverse the nest bottom-up
2754  // so that modifications don't invalidate the inner loops.
2755  for (unsigned end = loops.size(); end > 0; --end) {
2756  unsigned start = 0;
2757  for (; start < end - 1; ++start) {
2758  auto maxPos =
2759  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2760  std::next(operandsDefinedAbove.begin(), end));
2761  if (maxPos > start)
2762  continue;
2763  assert(maxPos == start &&
2764  "expected loop bounds to be known at the start of the band");
2765  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2766  if (succeeded(coalesceLoops(band)))
2767  result = success();
2768  break;
2769  }
2770  // If a band was found and transformed, keep looking at the loops above
2771  // the outermost transformed loop.
2772  if (start != end - 1)
2773  end = start + 1;
2774  }
2775  return result;
2776 }
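
For completeness, a hedged sketch of how this entry point might be driven over a whole function (coalesceAllBands and the collect-then-rewrite structure are illustrative assumptions, not an existing pass): collect the outermost affine.for roots first so the rewrite does not invalidate the traversal, then coalesce each band.

// Hypothetical helper: try to coalesce every outermost affine.for nest in 'func'.
static void coalesceAllBands(func::FuncOp func) {
  SmallVector<AffineForOp> roots;
  func.walk([&](AffineForOp forOp) {
    // Only outermost loops root a band; inner ones are reached via their root.
    if (!isa_and_nonnull<AffineForOp>(forOp->getParentOp()))
      roots.push_back(forOp);
  });
  for (AffineForOp root : roots)
    (void)coalescePerfectlyNestedAffineLoops(root);
}
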