1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
25 #include "llvm/ADT/MapVector.h"
26 #include "llvm/ADT/SmallPtrSet.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include <optional>
30 
31 #define DEBUG_TYPE "loop-utils"
32 
33 using namespace mlir;
34 using namespace affine;
35 using namespace presburger;
36 using llvm::SmallMapVector;
37 
38 /// Computes the cleanup loop lower bound of the loop being unrolled with
39 /// the specified unroll factor; this bound will also be the upper bound of the
40 /// main part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// operands or a null map when the trip count can't be expressed as an affine
42 /// expression.
43 static void
44 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45  AffineMap &cleanupLbMap,
46  SmallVectorImpl<Value> &cleanupLbOperands) {
47  AffineMap tripCountMap;
48  SmallVector<Value, 4> tripCountOperands;
49  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50  // Trip count can't be computed.
51  if (!tripCountMap) {
52  cleanupLbMap = AffineMap();
53  return;
54  }
55 
56  OpBuilder b(forOp);
57  auto lbMap = forOp.getLowerBoundMap();
58  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59  forOp.getLowerBoundOperands());
60 
61  // For each upper bound expr, get the range.
62  // Eg: affine.for %i = lb to min (ub1, ub2),
63  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65  // these affine.apply's make up the cleanup loop lower bound.
66  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68  int64_t step = forOp.getStepAsInt();
69  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70  auto tripCountExpr = tripCountMap.getResult(i);
71  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
73  tripCountMap.getNumSymbols(), bumpExprs[i]);
74  bumpValues[i] =
75  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76  }
77 
78  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81 
82  cleanupLbOperands.clear();
83  cleanupLbOperands.push_back(lb);
84  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
85  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
86  b.getContext());
87  // Simplify the cleanupLbMap + cleanupLbOperands.
88  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
89  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
90  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  // Remove any affine.apply's that became dead from the simplification above.
92  for (auto v : bumpValues)
93  if (v.use_empty())
94  v.getDefiningOp()->erase();
95 
96  if (lb.use_empty())
97  lb.erase();
98 }
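
// A small worked example (illustrative): for
//   affine.for %i = 0 to 103
// unrolled by a factor of 4, the trip count map yields 103, the bump
// expression is (103 - 103 mod 4) * 1 = 100, and the cleanup lower bound
// simplifies to lb + 100 = 100. The main unrolled loop then covers [0, 100)
// and the cleanup loop covers [100, 103).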
99 
100 /// Helper to replace uses of loop carried values (iter_args) and loop
101 /// yield values while promoting single iteration affine.for ops.
102 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103  // Replace uses of iter arguments with iter operands (initial values).
104  auto iterOperands = forOp.getInits();
105  auto iterArgs = forOp.getRegionIterArgs();
106  for (auto e : llvm::zip(iterOperands, iterArgs))
107  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108 
109  // Replace uses of loop results with the values yielded by the loop.
110  auto outerResults = forOp.getResults();
111  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112  for (auto e : llvm::zip(outerResults, innerResults))
113  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114 }
115 
116 /// Promotes the loop body of a forOp to its containing block if the forOp
117 /// is known to have a single iteration.
118 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120  if (!tripCount || *tripCount != 1)
121  return failure();
122 
123  // TODO: extend this for arbitrary affine bounds.
124  if (forOp.getLowerBoundMap().getNumResults() != 1)
125  return failure();
126 
127  // Replaces all IV uses to its single iteration value.
128  auto iv = forOp.getInductionVar();
129  auto *parentBlock = forOp->getBlock();
130  if (!iv.use_empty()) {
131  if (forOp.hasConstantLowerBound()) {
132  OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
133  auto constOp = topBuilder.create<arith::ConstantIndexOp>(
134  forOp.getLoc(), forOp.getConstantLowerBound());
135  iv.replaceAllUsesWith(constOp);
136  } else {
137  auto lbOperands = forOp.getLowerBoundOperands();
138  auto lbMap = forOp.getLowerBoundMap();
139  OpBuilder builder(forOp);
140  if (lbMap == builder.getDimIdentityMap()) {
141  // No need of generating an affine.apply.
142  iv.replaceAllUsesWith(lbOperands[0]);
143  } else {
144  auto affineApplyOp =
145  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
146  iv.replaceAllUsesWith(affineApplyOp);
147  }
148  }
149  }
150 
151  replaceIterArgsAndYieldResults(forOp);
152 
153  // Move the loop body operations, except for its terminator, to the loop's
154  // containing block.
155  forOp.getBody()->back().erase();
156  parentBlock->getOperations().splice(Block::iterator(forOp),
157  forOp.getBody()->getOperations());
158  forOp.erase();
159  return success();
160 }
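
// For instance (illustrative):
//   affine.for %i = 5 to 6 { "use"(%i) : (index) -> () }
// has a constant trip count of 1, so uses of %i are replaced by a constant 5
// hoisted to the top of the function, iter_args and results are forwarded as
// described above, and the body is spliced into the enclosing block before
// the affine.for op is erased. ("use" is just a placeholder op.)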
161 
162 /// Generates an affine.for op with the specified lower and upper bounds
163 /// while generating the right IV remappings to realize shifts for operations in
164 /// its body. The operations that go into the loop body are specified in
165 /// opGroupQueue starting from the specified offset, and in that order. The
166 /// first element of the pair specifies the shift applied to that group of
167 /// operations; the shift is multiplied by the loop step before being applied.
168 /// Returns nullptr if the generated loop simplifies to a single iteration one.
169 static AffineForOp generateShiftedLoop(
170  AffineMap lbMap, AffineMap ubMap,
171  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
172  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
173  auto lbOperands = srcForOp.getLowerBoundOperands();
174  auto ubOperands = srcForOp.getUpperBoundOperands();
175 
176  assert(lbMap.getNumInputs() == lbOperands.size());
177  assert(ubMap.getNumInputs() == ubOperands.size());
178 
179  auto loopChunk =
180  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
181  ubMap, srcForOp.getStepAsInt());
182  auto loopChunkIV = loopChunk.getInductionVar();
183  auto srcIV = srcForOp.getInductionVar();
184 
185  IRMapping operandMap;
186 
187  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
188  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
189  uint64_t shift = it.first;
190  auto ops = it.second;
191  // All 'same shift' operations get added with their operands being
192  // remapped to results of cloned operations, and any uses of their IV remapped.
193  // Generate the remapping if the shift is not zero: remappedIV = newIV -
194  // shift.
195  if (!srcIV.use_empty() && shift != 0) {
196  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
197  srcForOp.getLoc(),
198  bodyBuilder.getSingleDimShiftAffineMap(
199  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
200  loopChunkIV);
201  operandMap.map(srcIV, ivRemap);
202  } else {
203  operandMap.map(srcIV, loopChunkIV);
204  }
205  for (auto *op : ops)
206  bodyBuilder.clone(*op, operandMap);
207  };
208  if (succeeded(promoteIfSingleIteration(loopChunk)))
209  return AffineForOp();
210  return loopChunk;
211 }
212 
213 // The skewing of operations with respect to one another can be used for
214 // example to allow overlap of asynchronous operations (such as DMA
215 // communication) with computation, or just relative shifting of operations
216 // for better register reuse, locality or parallelism. As such, the shifts are
217 // typically expected to be at most of the order of the number of operations.
218 // This method should not be used as a substitute for loop distribution/fission.
219 // This method uses an algorithm that runs in time linear in the number of
220 // operations in the body of the for loop (using the 'sweep line' paradigm).
221 // This method asserts preservation of SSA dominance. A check for that, as
222 // well as for memory-based dependence preservation, rests with the users of
223 // this method.
224 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
225  ArrayRef<uint64_t> shifts,
226  bool unrollPrologueEpilogue) {
227  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
228  "too few/many shifts");
229  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
230  return success();
231 
232  // If the trip counts aren't constant, we would need versioning and
233  // conditional guards (or context information to prevent such versioning). The
234  // better way to pipeline for such loops is to first tile them and extract
235  // constant trip count "full tiles" before applying this.
236  auto mayBeConstTripCount = getConstantTripCount(forOp);
237  if (!mayBeConstTripCount) {
238  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
239  return success();
240  }
241  uint64_t tripCount = *mayBeConstTripCount;
242 
243  assert(isOpwiseShiftValid(forOp, shifts) &&
244  "shifts will lead to an invalid transformation\n");
245 
246  int64_t step = forOp.getStepAsInt();
247 
248  unsigned numChildOps = shifts.size();
249 
250  // Do a linear time (counting) sort for the shifts.
251  uint64_t maxShift = *llvm::max_element(shifts);
252  if (maxShift >= numChildOps) {
253  // Large shifts are not the typical use case.
254  forOp.emitWarning("not shifting because shifts are unrealistically large");
255  return success();
256  }
257 
258  // An array of operation groups sorted by shift amount; each group has all
259  // operations with the same shift in the order in which they appear in the
260  // body of the 'affine.for' op.
261  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
262  unsigned pos = 0;
263  for (auto &op : forOp.getBody()->without_terminator()) {
264  auto shift = shifts[pos++];
265  sortedOpGroups[shift].push_back(&op);
266  }
267 
268  // Unless the shifts have a specific pattern (which actually would be the
269  // common use case), prologue and epilogue are not meaningfully defined.
270  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
271  // loop generated as the prologue and the last as epilogue and unroll these
272  // fully.
273  AffineForOp prologue, epilogue;
274 
275  // Do a sweep over the sorted shifts while storing open groups in a
276  // vector, and generating loop portions as necessary during the sweep. A block
277  // of operations is paired with its shift.
278  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
279 
280  auto origLbMap = forOp.getLowerBoundMap();
281  uint64_t lbShift = 0;
282  OpBuilder b(forOp);
283  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
284  // If nothing is shifted by d, continue.
285  if (sortedOpGroups[d].empty())
286  continue;
287  if (!opGroupQueue.empty()) {
288  assert(d > 0 &&
289  "Queue expected to be empty when the first block is found");
290  // The interval for which the loop needs to be generated here is:
291  // [lbShift, min(lbShift + tripCount, d)) and the body of the
292  // loop needs to have all operations in opQueue in that order.
293  AffineForOp res;
294  if (lbShift + tripCount * step < d * step) {
295  res = generateShiftedLoop(
296  b.getShiftedAffineMap(origLbMap, lbShift),
297  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
298  opGroupQueue, /*offset=*/0, forOp, b);
299  // Entire loop for the queued op groups generated, empty it.
300  opGroupQueue.clear();
301  lbShift += tripCount * step;
302  } else {
303  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
304  b.getShiftedAffineMap(origLbMap, d),
305  opGroupQueue, /*offset=*/0, forOp, b);
306  lbShift = d * step;
307  }
308 
309  if (res) {
310  // Simplify/canonicalize the affine.for.
311  RewritePatternSet patterns(res.getContext());
312  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
313  GreedyRewriteConfig config;
314  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
315  bool erased;
316  (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns),
317  config, /*changed=*/nullptr, &erased);
318  if (!erased && !prologue)
319  prologue = res;
320  if (!erased)
321  epilogue = res;
322  }
323  } else {
324  // Start of first interval.
325  lbShift = d * step;
326  }
327  // Augment the list of operations that get into the current open interval.
328  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
329  }
330 
331  // Those operations groups left in the queue now need to be processed (FIFO)
332  // and their loops completed.
333  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
334  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
335  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
336  b.getShiftedAffineMap(origLbMap, ubShift),
337  opGroupQueue, /*offset=*/i, forOp, b);
338  lbShift = ubShift;
339  if (!prologue)
340  prologue = epilogue;
341  }
342 
343  // Erase the original for op.
344  forOp.erase();
345 
346  if (unrollPrologueEpilogue && prologue)
347  (void)loopUnrollFull(prologue);
348  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
349  (void)loopUnrollFull(epilogue);
350 
351  return success();
352 }
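
// Illustration (schematic, placeholder ops A/B): for
//   affine.for %i = 0 to 8 { A(%i); B(%i) }
// with A shifted by 0 and B shifted by 1, B is delayed by one iteration
// relative to A, conceptually yielding A(0) on its own, a steady-state loop
// running A(%i) and B(%i - 1) for %i in [1, 8), and finally B(7) on its own.
// This is what lets, e.g., a DMA issued by A overlap with the compute in B.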
353 
354 /// Checks whether a loop nest is hyper-rectangular or not.
355 static LogicalResult
356 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
357  FlatAffineValueConstraints cst;
358  SmallVector<Operation *, 8> ops(input.begin(), input.end());
359  // 0-d or 1-d is trivially hyper-rectangular.
360  if (input.size() <= 1)
361  return success();
362  if (failed(getIndexSet(ops, &cst))) {
363  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
364  return failure();
365  }
366  if (!cst.isHyperRectangular(0, input.size())) {
367  LLVM_DEBUG(llvm::dbgs()
368  << "Non-hyperrectangular nests not supported for tiling!\n");
369  return failure();
370  }
371  return success();
372 }
373 
374 /// Check if the input nest is supported for tiling and whether tiling would be
375 /// legal or not.
376 template <typename t>
377 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
378  ArrayRef<t> tileSizes) {
379  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
380 
381  if (llvm::any_of(input,
382  [](AffineForOp op) { return op.getNumResults() > 0; })) {
383  LLVM_DEBUG(llvm::dbgs()
384  << "Cannot tile nest where a loop has yield values\n");
385  return failure();
386  }
387 
388  // Check if the supplied `for` ops are all successively nested.
389  if (!isPerfectlyNested(input)) {
390  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
391  return failure();
392  }
393 
394  // TODO: handle non hyper-rectangular spaces.
395  if (failed(checkIfHyperRectangular(input)))
396  return failure();
397 
398  return success();
399 }
400 
401 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
402 /// location in destination's body, ignoring the terminator.
403 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
404  Block::iterator loc) {
405  auto &ops = src.getBody()->getOperations();
406  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
407  std::prev(ops.end()));
408 }
409 
410 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
411 /// body.
412 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
413  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
414 }
415 
416 /// Constructs a tiled loop nest without setting the loop bounds, and moves
417 /// the body of the original loop nest to the tiled loop nest.
418 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
419  AffineForOp rootAffineForOp, unsigned width,
420  MutableArrayRef<AffineForOp> tiledLoops) {
421  Location loc = rootAffineForOp.getLoc();
422 
423  // The outermost among the loops as we add more.
424  Operation *topLoop = rootAffineForOp.getOperation();
425  AffineForOp innermostPointLoop;
426 
427  // Add intra-tile (or point) loops.
428  for (unsigned i = 0; i < width; i++) {
429  OpBuilder b(topLoop);
430  // Loop bounds will be set later.
431  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
432  pointLoop.getBody()->getOperations().splice(
433  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
434  topLoop);
435  tiledLoops[2 * width - 1 - i] = pointLoop;
436  topLoop = pointLoop.getOperation();
437  if (i == 0)
438  innermostPointLoop = pointLoop;
439  }
440 
441  // Add tile space loops.
442  for (unsigned i = width; i < 2 * width; i++) {
443  OpBuilder b(topLoop);
444  // Loop bounds will be set later.
445  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
446  tileSpaceLoop.getBody()->getOperations().splice(
447  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
448  topLoop);
449  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
450  topLoop = tileSpaceLoop.getOperation();
451  }
452 
453  // Move the loop body of the original nest to the new one.
454  moveLoopBody(origLoops.back(), innermostPointLoop);
455 }
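
// For example (illustrative): for a band of width 2 rooted at loops (i, j),
// this produces tiledLoops = [i_tile, j_tile, i_point, j_point], i.e.,
// tiledLoops[0] is the outermost tile-space loop and tiledLoops[2 * width - 1]
// the innermost intra-tile (point) loop; all bounds are still the 0..0
// placeholders at this stage.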
456 
457 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
458 // TODO: Handle non-constant lower bounds.
459 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
460  AffineForOp newInterTileLoop,
461  AffineForOp newIntraTileLoop,
462  Value tileSize) {
463  // The lower bound for the intra-tile loop is represented by the affine map
464  // (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
465  // for the intra-tile loop is represented by the affine map
466  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where
467  // %i is the loop IV of the corresponding inter-tile loop, %t0 is the
468  // corresponding tiling parameter, %origlb is the lower bound and
469  // %origLoopStep is the loop step of the corresponding inter-tile loop.
470 
471  assert(origLoop.hasConstantLowerBound() &&
472  "expected input loops to have constant lower bound.");
473 
474  // Get lower bound of original loop as an affine expression.
475  AffineExpr origLowerBoundExpr;
476  origLowerBoundExpr =
477  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
478 
479  // Add dim operands from original lower/upper bound.
480  SmallVector<Value, 4> lbOperands, ubOperands;
481  AffineBound lb = origLoop.getLowerBound();
482  AffineBound ub = origLoop.getUpperBound();
483  lbOperands.reserve(lb.getNumOperands() + 2);
484  ubOperands.reserve(ub.getNumOperands() + 2);
485  AffineMap origLbMap = lb.getMap();
486  AffineMap origUbMap = ub.getMap();
487  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
488  lbOperands.push_back(lb.getOperand(j));
489  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
490  ubOperands.push_back(ub.getOperand(j));
491 
492  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
493  // IV.
494  lbOperands.push_back(newInterTileLoop.getInductionVar());
495  ubOperands.push_back(newInterTileLoop.getInductionVar());
496 
497  // Get loop IV as an affine expression for lower/upper bound. Size of
498  // lb/ubOperands is guaranteed to be at least one.
499  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
500  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
501 
502  // Add symbol operands from original lower/upper bound.
503  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
504  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
505  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
506  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
507 
508  // Add a new symbol operand which is the tile size for this loop.
509  lbOperands.push_back(tileSize);
510  ubOperands.push_back(tileSize);
511 
512  SmallVector<AffineExpr, 4> lbBoundExprs;
513  SmallVector<AffineExpr, 4> ubBoundExprs;
514  lbBoundExprs.reserve(origLbMap.getNumResults());
515  ubBoundExprs.reserve(origUbMap.getNumResults());
516 
517  // Get tiling parameter as an affine expression for lb/ub.
518  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
519  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
520 
521  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
522  lbBoundExprs.push_back(
523  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
524  origLowerBoundExpr);
525 
526  // Get the origLoopStep as an affine expression.
527  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
528 
529  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
530  // (tilingParameter * origLoopStep) + origlb.
531  ubBoundExprs.push_back(
532  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
533  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
534 
535  ubBoundExprs.append(origUbMap.getResults().begin(),
536  origUbMap.getResults().end());
537 
538  AffineMap lbMap =
539  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
540  lbBoundExprs, b.getContext());
541  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
542 
543  AffineMap ubMap =
544  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
545  ubBoundExprs, b.getContext());
546  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
547 
548  // Original loop step must be preserved.
549  newIntraTileLoop.setStep(origLoop.getStepAsInt());
550 }
551 
552 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
553 // TODO: Handle non-constant lower bounds.
554 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
555  AffineForOp newLoop, Value tileSize) {
556  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
557 
558  // The lower bounds for inter-tile loops are same as the corresponding lower
559  // bounds of original loops.
560  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
561 
562  // The new upper bound map for inter-tile loops, assuming constant lower
563  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
564  // originalLowerBound), tiling parameter), where the tiling parameter is the
565  // respective tile size for that loop. For e.g. if the original ubmap was
566  // ()->(1024) and the lower bound was lb, the new map will be
567  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
568  // Therefore a new symbol operand is inserted in the map and the result
569  // expression is overwritten.
570 
571  assert(origLoop.hasConstantLowerBound() &&
572  "expected input loops to have constant lower bound.");
573 
574  // Get lower bound of original loop as an affine expression.
575  AffineExpr origLowerBoundExpr;
576  origLowerBoundExpr =
577  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
578 
579  // Add dim operands from original upper bound.
580  SmallVector<Value, 4> ubOperands;
581  AffineBound ub = origLoop.getUpperBound();
582  ubOperands.reserve(ub.getNumOperands() + 1);
583  AffineMap origUbMap = ub.getMap();
584  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
585  ubOperands.push_back(ub.getOperand(j));
586 
587  // Add symbol operands from original upper bound.
588  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
589  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
590 
591  // Add a new symbol operand which is the tile size for this loop.
592  ubOperands.push_back(tileSize);
593 
594  // Get tiling parameter as an affine expression.
595  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
596 
597  SmallVector<AffineExpr, 4> boundExprs;
598  boundExprs.reserve(origUbMap.getNumResults());
599  int64_t origUpperBound;
600  AffineExpr origUpperBoundExpr;
601 
602  // If upper bound for the original loop is constant, then the constant can
603  // be obtained as an affine expression straight away.
604  if (origLoop.hasConstantUpperBound()) {
605  origUpperBound = origLoop.getConstantUpperBound();
606 
607  // Get original constant upper bound as an affine expression.
608  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
609 
610  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
611  // originalLowerBound), tilingParameter).
612  boundExprs.push_back(
613  origLowerBoundExpr +
614  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
615  } else {
616  // If the upper bound for the original loop is not constant then two cases
617  // are possible, although their handling is the same: 1.) The result of
618  // ubmap has only one result expression. For e.g.
619  // affine.for %i = 5 to %ub
620  //
621  // A symbol operand is added which represents the tiling parameter. The
622  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
623  // where 's0' is the original upper bound and 's1' is the tiling
624  // parameter. 2.) When ubMap has more than one result expression. For e.g.
625  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
626  // affine.for %i = 5 to min #map0()[%s0, %s1]
627  //
628  // A symbol operand is added which represents the tiling parameter. The
629  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
630  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
631 
632  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
633  // originalLowerBound), tilingParameter).
634  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
635  boundExprs.push_back(
636  origLowerBoundExpr +
637  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
638  }
639 
640  AffineMap ubMap =
641  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
642  boundExprs, b.getContext());
643  newLoop.setUpperBound(ubOperands, ubMap);
644 
645  // Original loop step must be preserved.
646  newLoop.setStep(origLoop.getStepAsInt());
647 }
648 
649 /// Constructs and sets new loop bounds after tiling for the case of
650 /// hyper-rectangular index sets, where the bounds of one dimension do not
651 /// depend on other dimensions and tiling parameters are captured from SSA
652 /// values. Bounds of each dimension can thus be treated independently,
653 /// and deriving the new bounds is much simpler and faster than for the case of
654 /// tiling arbitrary polyhedral shapes.
655 static void constructParametricallyTiledIndexSetHyperRect(
656     MutableArrayRef<AffineForOp> origLoops,
657  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
658  assert(!origLoops.empty() && "expected atleast one loop in band");
659  assert(origLoops.size() == tileSizes.size() &&
660  "expected tiling parameter for each loop in band.");
661 
662  OpBuilder b(origLoops[0].getOperation());
663  unsigned width = origLoops.size();
664 
665  // Set bounds for tile space loops.
666  for (unsigned i = 0; i < width; ++i) {
667  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
668  }
669 
670  // Set bounds for intra-tile loops.
671  for (unsigned i = 0; i < width; ++i) {
672  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
673  newLoops[i + width], tileSizes[i]);
674  }
675 }
676 
677 /// Constructs and sets new loop bounds after tiling for the case of
678 /// hyper-rectangular index sets, where the bounds of one dimension do not
679 /// depend on other dimensions. Bounds of each dimension can thus be treated
680 /// independently, and deriving the new bounds is much simpler and faster
681 /// than for the case of tiling arbitrary polyhedral shapes.
682 static void
683 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
684                                 MutableArrayRef<AffineForOp> newLoops,
685  ArrayRef<unsigned> tileSizes) {
686  assert(!origLoops.empty());
687  assert(origLoops.size() == tileSizes.size());
688 
689  OpBuilder b(origLoops[0].getOperation());
690  unsigned width = origLoops.size();
691 
692  // Bounds for tile space loops.
693  for (unsigned i = 0; i < width; i++) {
694  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
695  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
696  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
697  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
698  // If the step size of original loop is x and tileSize is y then after
699  // tiling the tile space loops' step size becomes x*y.
700  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
701  }
702  // Bounds for intra-tile loops.
703  for (unsigned i = 0; i < width; i++) {
704  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
705  std::optional<uint64_t> mayBeConstantCount =
706  getConstantTripCount(origLoops[i]);
707  // The lower bound is just the tile-space loop.
708  AffineMap lbMap = b.getDimIdentityMap();
709  newLoops[width + i].setLowerBound(
710  /*operands=*/newLoops[i].getInductionVar(), lbMap);
711  // The step size of intra-tile loops is just the original loops' step size.
712  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
713 
714  // Set the upper bound.
715  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
716  // Trip count is less than the tile size: upper bound is lower bound +
717  // trip count * stepSize.
718  AffineMap ubMap = b.getSingleDimShiftAffineMap(
719  *mayBeConstantCount * origLoops[i].getStepAsInt());
720  newLoops[width + i].setUpperBound(
721  /*operands=*/newLoops[i].getInductionVar(), ubMap);
722  } else if (largestDiv % tileSizes[i] != 0) {
723  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
724  // Construct the upper bound map; the operands are the original operands
725  // with 'i' (tile-space loop) appended to it. The new upper bound map is
726  // the original one with an additional expression i + tileSize * stepSize
727  // appended.
728 
729  // Add dim operands from original upper bound.
730  SmallVector<Value, 4> ubOperands;
731  AffineBound ub = origLoops[i].getUpperBound();
732  ubOperands.reserve(ub.getNumOperands() + 1);
733  AffineMap origUbMap = ub.getMap();
734  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
735  ubOperands.push_back(ub.getOperand(j));
736 
737  // Add dim operand for new loop upper bound.
738  ubOperands.push_back(newLoops[i].getInductionVar());
739 
740  // Add symbol operands from original upper bound.
741  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
742  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
743 
744  SmallVector<AffineExpr, 4> boundExprs;
745  boundExprs.reserve(1 + origUbMap.getNumResults());
746  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
747  // The new upper bound map is the original one with an additional
748  // expression i + tileSize * stepSize (of original loop) appended.
749  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
750  boundExprs.append(origUbMap.getResults().begin(),
751  origUbMap.getResults().end());
752  AffineMap ubMap =
753  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
754  boundExprs, b.getContext());
755  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
756  } else {
757  // No need of the min expression.
758  AffineExpr dim = b.getAffineDimExpr(0);
759  AffineMap ubMap = AffineMap::get(
760  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
761  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
762  }
763  }
764 }
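
// A concrete shape (illustrative, schematic syntax): tiling
//   affine.for %i = 0 to 100
// by 32 with this scheme gives a tile-space loop
//   affine.for %ii = 0 to 100 step 32
// and an intra-tile loop
//   affine.for %i = %ii to min(%ii + 32, 100)
// since 32 does not divide the trip count; when it does divide it, the min is
// omitted, and when the trip count is smaller than the tile size the intra-tile
// upper bound is simply its lower bound plus tripCount * step.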
765 
766 LogicalResult
767 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
768  ArrayRef<unsigned> tileSizes,
769  SmallVectorImpl<AffineForOp> *tiledNest) {
770  if (input.empty())
771  return success();
772 
773  if (failed(performPreTilingChecks(input, tileSizes)))
774  return failure();
775 
776  MutableArrayRef<AffineForOp> origLoops = input;
777  AffineForOp rootAffineForOp = origLoops[0];
778 
779  // Note that width is at least one since the band isn't empty.
780  unsigned width = input.size();
781  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
782 
783  // Construct a tiled loop nest without setting their bounds. Bounds are
784  // set later.
785  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
786 
787  SmallVector<Value, 8> origLoopIVs;
788  extractForInductionVars(input, &origLoopIVs);
789 
790  // Set loop bounds for the tiled loop nest.
791  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
792 
793  // Replace original IVs with intra-tile loop IVs.
794  for (unsigned i = 0; i < width; i++)
795  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
796 
797  // Erase the old loop nest.
798  rootAffineForOp.erase();
799 
800  if (tiledNest)
801  *tiledNest = std::move(tiledLoops);
802 
803  return success();
804 }
805 
806 /// Tiles the specified band of perfectly nested loops creating tile-space
807 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
808 /// is a contiguous set of loops.
809 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
810     MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
811  SmallVectorImpl<AffineForOp> *tiledNest) {
812  if (input.empty())
813  return success();
814 
815  if (failed(performPreTilingChecks(input, tileSizes)))
816  return failure();
817 
818  MutableArrayRef<AffineForOp> origLoops = input;
819  AffineForOp rootAffineForOp = origLoops[0];
820  unsigned width = input.size();
821  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
822 
823  // Construct a tiled loop nest without setting their bounds. Bounds are
824  // set later.
825  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
826 
827  SmallVector<Value, 8> origLoopIVs;
828  extractForInductionVars(input, &origLoopIVs);
829 
830  // Set loop bounds for the tiled loop nest.
831  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
832  tileSizes);
833 
834  // Replace original IVs with intra-tile loop IVs.
835  for (unsigned i = 0; i < width; i++)
836  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
837 
838  // Erase the old loop nest.
839  rootAffineForOp.erase();
840 
841  if (tiledNest)
842  *tiledNest = std::move(tiledLoops);
843 
844  return success();
845 }
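
// End-to-end shape (illustrative, schematic syntax): with an SSA tile size
// %ts,
//   affine.for %i = 0 to 1024
// becomes an inter-tile loop over tile numbers, roughly
//   affine.for %ii = 0 to ceildiv(1024, %ts)
// and an intra-tile loop
//   affine.for %i = %ii * %ts to min(%ii * %ts + %ts, 1024)
// i.e., unlike the fixed-size scheme above, the inter-tile loop is not strided
// over the original iteration space; it counts tiles, and the mapping back to
// the original space is folded into the intra-tile bounds.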
846 
847 /// Get the perfectly nested sequence of loops starting at the root of the
848 /// loop nest (the first op in each body being another AffineForOp, and the
849 /// second op being a terminator). A loop is perfectly nested iff the first op
850 /// in its body is another AffineForOp and the second op is a terminator.
851 void mlir::affine::getPerfectlyNestedLoops(
852  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
853  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
854  nestedLoops.push_back(root);
855  Block &body = root.getRegion().front();
856  if (body.begin() != std::prev(body.end(), 2))
857  return;
858 
859  root = dyn_cast<AffineForOp>(&body.front());
860  if (!root)
861  return;
862  }
863 }
864 
865 /// Identify valid and profitable bands of loops to tile. This is currently just
866 /// a temporary placeholder to test the mechanics of tiled code generation.
867 /// Returns all maximal outermost perfect loop nests to tile.
868 void mlir::affine::getTileableBands(
869  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
870  // Get maximal perfect nest of 'affine.for' insts starting from root
871  // (inclusive).
872  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
873  SmallVector<AffineForOp, 6> band;
874  getPerfectlyNestedLoops(band, forOp);
875  bands->push_back(band);
876  }
877 }
878 
879 /// Unrolls this loop completely.
880 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
881  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
882  if (mayBeConstantTripCount.has_value()) {
883  uint64_t tripCount = *mayBeConstantTripCount;
884  if (tripCount == 0)
885  return success();
886  if (tripCount == 1)
887  return promoteIfSingleIteration(forOp);
888  return loopUnrollByFactor(forOp, tripCount);
889  }
890  return failure();
891 }
892 
893 /// Unrolls this loop by the specified factor or by the trip count (if
894 /// constant), whichever is lower.
895 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
896  uint64_t unrollFactor) {
897  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
898  if (mayBeConstantTripCount.has_value() &&
899  *mayBeConstantTripCount < unrollFactor)
900  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
901  return loopUnrollByFactor(forOp, unrollFactor);
902 }
903 
904 /// Generates unrolled copies of the loop body block 'loopBodyBlock' (whose
905 /// associated induction variable is 'forOpIV') according to 'unrollFactor',
906 /// calling 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If
907 /// specified, annotates the Ops in each unrolled iteration using annotateFn.
909  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
910  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
911  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
912  ValueRange iterArgs, ValueRange yieldedValues) {
913  // Builder to insert unrolled bodies just before the terminator of the body of
914  // 'forOp'.
915  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
916 
917  if (!annotateFn)
918  annotateFn = [](unsigned, Operation *, OpBuilder) {};
919 
920  // Keep a pointer to the last non-terminator operation in the original block
921  // so that we know what to clone (since we are doing this in-place).
922  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
923 
924  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
925  SmallVector<Value, 4> lastYielded(yieldedValues);
926 
927  for (unsigned i = 1; i < unrollFactor; i++) {
928  IRMapping operandMap;
929 
930  // Prepare operand map.
931  operandMap.map(iterArgs, lastYielded);
932 
933  // If the induction variable is used, create a remapping to the value for
934  // this unrolled instance.
935  if (!forOpIV.use_empty()) {
936  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
937  operandMap.map(forOpIV, ivUnroll);
938  }
939 
940  // Clone the original body of 'forOp'.
941  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
942  Operation *clonedOp = builder.clone(*it, operandMap);
943  annotateFn(i, clonedOp, builder);
944  }
945 
946  // Update yielded values. If the yielded value is defined outside the
947  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
948  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
949  // value with the clone corresponding to the yielded value.
950  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
951  Operation *defOp = yieldedValues[i].getDefiningOp();
952  if (defOp && defOp->getBlock() == loopBodyBlock)
953  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
954  }
955  }
956 
957  // Make sure we annotate the Ops in the original body. We do this last so that
958  // any annotations are not copied into the cloned Ops above.
959  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
960  annotateFn(0, &*it, builder);
961 
962  // Update operands of the yield statement.
963  loopBodyBlock->getTerminator()->setOperands(lastYielded);
964 }
965 
966 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
967 /// count is not a multiple of `unrollFactor`.
968 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
969  uint64_t unrollFactor) {
970  // Insert the cleanup loop right after 'forOp'.
971  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
972  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
973 
974  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
975  // and produce results for the original users of `forOp` results.
976  auto results = forOp.getResults();
977  auto cleanupResults = cleanupForOp.getResults();
978  auto cleanupIterOperands = cleanupForOp.getInits();
979 
980  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
981  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
982  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
983  }
984 
985  AffineMap cleanupMap;
986  SmallVector<Value, 4> cleanupOperands;
987  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
988  if (!cleanupMap)
989  return failure();
990 
991  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
992  // Promote the loop body up if this has turned into a single iteration loop.
993  (void)promoteIfSingleIteration(cleanupForOp);
994 
995  // Adjust upper bound of the original loop; this is the same as the lower
996  // bound of the cleanup loop.
997  forOp.setUpperBound(cleanupOperands, cleanupMap);
998  return success();
999 }
1000 
1001 /// Unrolls this loop by the specified factor. Returns success if the loop
1002 /// is successfully unrolled.
1003 LogicalResult mlir::affine::loopUnrollByFactor(
1004  AffineForOp forOp, uint64_t unrollFactor,
1005  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1006  bool cleanUpUnroll) {
1007  assert(unrollFactor > 0 && "unroll factor should be positive");
1008 
1009  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1010  if (unrollFactor == 1) {
1011  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1012  failed(promoteIfSingleIteration(forOp)))
1013  return failure();
1014  return success();
1015  }
1016 
1017  // Nothing in the loop body other than the terminator.
1018  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1019  return success();
1020 
1021  // If the trip count is lower than the unroll factor, no unrolled body.
1022  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1023  if (cleanUpUnroll) {
1024  // Unroll the cleanup loop if cleanUpUnroll is specified.
1025  return loopUnrollFull(forOp);
1026  }
1027 
1028  return failure();
1029  }
1030 
1031  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1032  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1033  // Loops where the lower bound is a max expression or the upper bound is
1034  // a min expression and the trip count doesn't divide the unroll factor
1035  // can't be unrolled since the lower bound of the cleanup loop in such cases
1036  // cannot be expressed as an affine function or a max over affine functions.
1037  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1038  forOp.getUpperBoundMap().getNumResults() != 1)
1039  return failure();
1040  if (cleanUpUnroll)
1041  // Force unroll including cleanup loop
1042  return loopUnrollFull(forOp);
1043  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1044  assert(false && "cleanup loop lower bound map for single result lower "
1045  "and upper bound maps can always be determined");
1046  }
1047 
1048  ValueRange iterArgs(forOp.getRegionIterArgs());
1049  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1050 
1051  // Scale the step of loop being unrolled by unroll factor.
1052  int64_t step = forOp.getStepAsInt();
1053  forOp.setStep(step * unrollFactor);
1054  generateUnrolledLoop(
1055  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1056  [&](unsigned i, Value iv, OpBuilder b) {
1057  // iv' = iv + i * step
1058  auto d0 = b.getAffineDimExpr(0);
1059  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1060  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1061  },
1062  /*annotateFn=*/annotateFn,
1063  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1064 
1065  // Promote the loop body up if this has turned into a single iteration loop.
1066  (void)promoteIfSingleIteration(forOp);
1067  return success();
1068 }
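
// A small worked example (illustrative, "use" is a placeholder op): unrolling
//   affine.for %i = 0 to 10 { "use"(%i) : (index) -> () }
// by a factor of 4 yields
//   affine.for %i = 0 to 8 step 4 {
//     "use"(%i) ... "use"(%i + 1) ... "use"(%i + 2) ... "use"(%i + 3)
//   }
//   affine.for %i = 8 to 10 { "use"(%i) }
// where the remapped IVs come from affine.apply ops created by the lambda
// above, the cleanup lower bound (8) comes from getCleanupLoopLowerBound, and
// the main loop's step is scaled by the unroll factor.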
1069 
1070 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1071  uint64_t unrollJamFactor) {
1072  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1073  if (mayBeConstantTripCount.has_value() &&
1074  *mayBeConstantTripCount < unrollJamFactor)
1075  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1076  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1077 }
1078 
1079 /// Check if all control operands of all loops are defined outside of `forOp`
1080 /// and return false if not.
1081 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1082  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1083  for (auto controlOperand : aForOp.getControlOperands()) {
1084  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1085  return WalkResult::interrupt();
1086  }
1087  return WalkResult::advance();
1088  });
1089  return !walkResult.wasInterrupted();
1090 }
1091 
1092 /// Unrolls and jams this loop by the specified factor.
1093 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1094  uint64_t unrollJamFactor) {
1095  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1096 
1097  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1098  if (unrollJamFactor == 1) {
1099  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1100  failed(promoteIfSingleIteration(forOp)))
1101  return failure();
1102  return success();
1103  }
1104 
1105  // Nothing in the loop body other than the terminator.
1106  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1107  return success();
1108 
1109  // If the trip count is lower than the unroll jam factor, no unroll jam.
1110  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1111  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1112  return failure();
1113  }
1114 
1115  // If any control operand of any inner loop of `forOp` is defined within
1116  // `forOp`, no unroll jam.
1117  if (!areInnerBoundsInvariant(forOp))
1118  return failure();
1119 
1120  // Gather all sub-blocks to jam upon the loop being unrolled.
1121  JamBlockGatherer<AffineForOp> jbg;
1122  jbg.walk(forOp);
1123  auto &subBlocks = jbg.subBlocks;
1124 
1125  // Collect loops with iter_args.
1126  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1127  forOp.walk([&](AffineForOp aForOp) {
1128  if (aForOp.getNumIterOperands() > 0)
1129  loopsWithIterArgs.push_back(aForOp);
1130  });
1131 
1132  // Get supported reductions to be used for creating reduction ops at the end.
1133  SmallVector<LoopReduction> reductions;
1134  if (forOp.getNumIterOperands() > 0)
1135  getSupportedReductions(forOp, reductions);
1136 
1137  // Generate the cleanup loop if trip count isn't a multiple of
1138  // unrollJamFactor.
1139  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1140  // Loops where the lower bound is a max expression or the upper bound is
1141  // a min expression and the trip count doesn't divide the unroll factor
1142  // can't be unrolled since the lower bound of the cleanup loop in such cases
1143  // cannot be expressed as an affine function or a max over affine functions.
1144  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1145  forOp.getUpperBoundMap().getNumResults() != 1)
1146  return failure();
1147  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1148  assert(false && "cleanup loop lower bound map for single result lower "
1149  "and upper bound maps can always be determined");
1150  }
1151 
1152  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1153  // iteration. There are (`unrollJamFactor` - 1) iterations.
1154  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1155 
1156  // For any loop with iter_args, replace it with a new loop that has
1157  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1158  // operands.
1159  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1160  IRRewriter rewriter(forOp.getContext());
1161  for (AffineForOp oldForOp : loopsWithIterArgs) {
1162  SmallVector<Value> dupIterOperands, dupYieldOperands;
1163  ValueRange oldIterOperands = oldForOp.getInits();
1164  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1165  ValueRange oldYieldOperands =
1166  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1167  // Get additional iterOperands, iterArgs, and yield operands. We will
1168  // fix iterOperands and yield operands after cloning of sub-blocks.
1169  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1170  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1171  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1172  }
1173  // Create a new loop with additional iterOperands, iter_args and yield
1174  // operands. This new loop will take the loop body of the original loop.
1175  bool forOpReplaced = oldForOp == forOp;
1176  AffineForOp newForOp =
1177  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1178  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1179  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1180  return dupYieldOperands;
1181  }));
1182  newLoopsWithIterArgs.push_back(newForOp);
1183  // `forOp` has been replaced with a new loop.
1184  if (forOpReplaced)
1185  forOp = newForOp;
1186  // Update `operandMaps` for `newForOp` iterArgs and results.
1187  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1188  unsigned oldNumIterArgs = oldIterArgs.size();
1189  ValueRange newResults = newForOp.getResults();
1190  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1191  assert(oldNumIterArgs == oldNumResults &&
1192  "oldNumIterArgs must be the same as oldNumResults");
1193  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1194  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1195  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1196  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1197  // to those in the `i`th new set.
1198  operandMaps[i - 1].map(newIterArgs[j],
1199  newIterArgs[i * oldNumIterArgs + j]);
1200  operandMaps[i - 1].map(newResults[j],
1201  newResults[i * oldNumResults + j]);
1202  }
1203  }
1204  }
1205 
1206  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1207  int64_t step = forOp.getStepAsInt();
1208  forOp.setStep(step * unrollJamFactor);
1209 
1210  auto forOpIV = forOp.getInductionVar();
1211  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1212  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1213  for (auto &subBlock : subBlocks) {
1214  // Builder to insert unroll-jammed bodies. Insert right at the end of
1215  // sub-block.
1216  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1217 
1218  // If the induction variable is used, create a remapping to the value for
1219  // this unrolled instance.
1220  if (!forOpIV.use_empty()) {
1221  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1222  auto d0 = builder.getAffineDimExpr(0);
1223  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1224  auto ivUnroll =
1225  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1226  operandMaps[i - 1].map(forOpIV, ivUnroll);
1227  }
1228  // Clone the sub-block being unroll-jammed.
1229  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1230  builder.clone(*it, operandMaps[i - 1]);
1231  }
1232  // Fix iterOperands and yield op operands of newly created loops.
1233  for (auto newForOp : newLoopsWithIterArgs) {
1234  unsigned oldNumIterOperands =
1235  newForOp.getNumIterOperands() / unrollJamFactor;
1236  unsigned numControlOperands = newForOp.getNumControlOperands();
1237  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1238  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1239  assert(oldNumIterOperands == oldNumYieldOperands &&
1240  "oldNumIterOperands must be the same as oldNumYieldOperands");
1241  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1242  // The `i`th duplication of an old iterOperand or yield op operand
1243  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1244  // if such mapped value exists.
1245  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1246  operandMaps[i - 1].lookupOrDefault(
1247  newForOp.getOperand(numControlOperands + j)));
1248  yieldOp.setOperand(
1249  i * oldNumYieldOperands + j,
1250  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1251  }
1252  }
1253  }
1254  if (forOp.getNumResults() > 0) {
1255  // Create reduction ops to combine every `unrollJamFactor` related results
1256  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1257  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1258  // %1.
1259  rewriter.setInsertionPointAfter(forOp);
1260  auto loc = forOp.getLoc();
1261  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1262  for (LoopReduction &reduction : reductions) {
1263  unsigned pos = reduction.iterArgPosition;
1264  Value lhs = forOp.getResult(pos);
1265  Value rhs;
1266  SmallPtrSet<Operation *, 4> newOps;
1267  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1268  rhs = forOp.getResult(i * oldNumResults + pos);
1269  // Create ops based on reduction type.
1270  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1271  if (!lhs)
1272  return failure();
1273  Operation *op = lhs.getDefiningOp();
1274  assert(op && "Reduction op should have been created");
1275  newOps.insert(op);
1276  }
1277  // Replace all uses except those in newly created reduction ops.
1278  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1279  }
1280  }
1281 
1282  // Promote the loop body up if this has turned into a single iteration loop.
1283  (void)promoteIfSingleIteration(forOp);
1284  return success();
1285 }
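
// Sketch of the effect (illustrative, "S" is a placeholder op): unroll-and-jam
// by 2 on
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 8 { "S"(%i, %j) : (index, index) -> () }
//   }
// doubles the outer step and jams the two copies into the shared inner loop:
//   affine.for %i = 0 to 8 step 2 {
//     affine.for %j = 0 to 8 { "S"(%i, %j) ... "S"(%i + 1, %j) ... }
//   }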
1286 
1287 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1288 /// nested within 'forOpA' as the only non-terminator operation in its block.
1289 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1290  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1291  auto &forOpABody = forOpA.getBody()->getOperations();
1292  auto &forOpBBody = forOpB.getBody()->getOperations();
1293 
1294  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1295  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1296  // body containing only the terminator.
1297  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1298  forOpABody, forOpABody.begin(),
1299  std::prev(forOpABody.end()));
1300  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1301  // body (this leaves forOpB's body containing only the terminator).
1302  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1303  std::prev(forOpBBody.end()));
1304  // 3) Splice forOpA into the beginning of forOpB's body.
1305  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1306  Block::iterator(forOpA));
1307 }
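
// Net effect (illustrative):
//   affine.for %i ... { affine.for %j ... { "S"(%i, %j) } }
// becomes
//   affine.for %j ... { affine.for %i ... { "S"(%i, %j) } }
// with no operations cloned; only the three splices above are performed.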
1308 
1309 // Checks each dependence component against the permutation to see if the
1310 // desired loop interchange would violate dependences by making the
1311 // dependence component lexicographically negative.
1312 static bool checkLoopInterchangeDependences(
1313  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1314  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1315  // Invert permutation map.
1316  unsigned maxLoopDepth = loops.size();
1317  SmallVector<unsigned, 4> loopPermMapInv;
1318  loopPermMapInv.resize(maxLoopDepth);
1319  for (unsigned i = 0; i < maxLoopDepth; ++i)
1320  loopPermMapInv[loopPermMap[i]] = i;
1321 
1322  // Check each dependence component against the permutation to see if the
1323  // desired loop interchange permutation would make the dependence vectors
1324  // lexicographically negative.
1325  // Example 1: [-1, 1][0, 0]
1326  // Example 2: [0, 0][-1, 1]
1327  for (const auto &depComps : depCompsVec) {
1328  assert(depComps.size() >= maxLoopDepth);
1329  // Check if the first non-zero dependence component is positive.
1330  // This iterates through loops in the desired order.
1331  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1332  unsigned permIndex = loopPermMapInv[j];
1333  assert(depComps[permIndex].lb);
1334  int64_t depCompLb = *depComps[permIndex].lb;
1335  if (depCompLb > 0)
1336  break;
1337  if (depCompLb < 0)
1338  return false;
1339  }
1340  }
1341  return true;
1342 }
1343 
1344 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1345 /// nested sequence of loops in 'loops' would violate dependences.
 1346 bool mlir::affine::isValidLoopInterchangePermutation(
 1347  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1348  // Gather dependence components for dependences between all ops in loop nest
1349  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1350  assert(loopPermMap.size() == loops.size());
1351  unsigned maxLoopDepth = loops.size();
1352  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1353  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1354  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1355 }
1356 
1357 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1358 /// in it from outermost to innermost.
1359 bool LLVM_ATTRIBUTE_UNUSED
 1360 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
 1361  assert(!loops.empty() && "no loops provided");
1362 
1363  // We already know that the block can't be empty.
1364  auto hasTwoElements = [](Block *block) {
1365  auto secondOpIt = std::next(block->begin());
1366  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1367  };
1368 
1369  auto enclosingLoop = loops.front();
1370  for (auto loop : loops.drop_front()) {
1371  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1372  // parentForOp's body should be just this loop and the terminator.
1373  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1374  return false;
1375  enclosingLoop = loop;
1376  }
1377  return true;
1378 }
1379 
1380 // input[i] should move from position i -> permMap[i]. Returns the position in
1381 // `input` that becomes the new outermost loop.
 1382 unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
 1383  ArrayRef<unsigned> permMap) {
1384  assert(input.size() == permMap.size() && "invalid permutation map size");
1385  // Check whether the permutation spec is valid. This is a small vector - we'll
1386  // just sort and check if it's iota.
1387  SmallVector<unsigned, 4> checkPermMap(permMap);
1388  llvm::sort(checkPermMap);
1389  if (llvm::any_of(llvm::enumerate(checkPermMap),
1390  [](const auto &en) { return en.value() != en.index(); }))
1391  assert(false && "invalid permutation map");
1392 
1393  // Nothing to do.
1394  if (input.size() < 2)
1395  return 0;
1396 
1397  assert(isPerfectlyNested(input) && "input not perfectly nested");
1398 
1399  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1400  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
 1401  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
 1402  for (unsigned i = 0, e = input.size(); i < e; ++i)
1403  invPermMap.push_back({permMap[i], i});
1404  llvm::sort(invPermMap);
1405 
1406  // Move the innermost loop body to the loop that would be the innermost in the
1407  // permuted nest (only if the innermost loop is going to change).
1408  if (permMap.back() != input.size() - 1) {
1409  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1410  Block *srcBody = ((AffineForOp)input.back()).getBody();
1411  destBody->getOperations().splice(destBody->begin(),
1412  srcBody->getOperations(), srcBody->begin(),
1413  std::prev(srcBody->end()));
1414  }
1415 
1416  // We'll move each loop in `input` in the reverse order so that its body is
1417  // empty when we are moving it; this incurs zero copies and no erasing.
1418  for (int i = input.size() - 1; i >= 0; --i) {
1419  // If this has to become the outermost loop after permutation, add it to the
1420  // parent block of the original root.
1421  if (permMap[i] == 0) {
1422  // If the root remains the same, nothing to do.
1423  if (i == 0)
1424  continue;
1425  // Make input[i] the new outermost loop moving it into parentBlock.
1426  auto *parentBlock = input[0]->getBlock();
1427  parentBlock->getOperations().splice(Block::iterator(input[0]),
1428  input[i]->getBlock()->getOperations(),
1429  Block::iterator(input[i]));
1430  continue;
1431  }
1432 
1433  // If the parent in the permuted order is the same as in the original,
1434  // nothing to do.
1435  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1436  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1437  continue;
1438 
1439  // Move input[i] to its surrounding loop in the transformed nest.
1440  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1441  destBody->getOperations().splice(destBody->begin(),
1442  input[i]->getBlock()->getOperations(),
1443  Block::iterator(input[i]));
1444  }
1445 
1446  return invPermMap[0].second;
1447 }
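// Worked example (exposition only, not from the original file): for a 3-d
// nest given outermost-to-innermost as (%i, %j, %k) and permMap = {2, 0, 1},
// %i is sent to the innermost position, %j becomes the new outermost loop,
// and %k ends up in the middle; the return value is 1 because input[1] (%j)
// becomes the new root.
//
// ```mlir
// // before:
// affine.for %i ... { affine.for %j ... { affine.for %k ... { "body"() } } }
// // after permuteLoops(input, {2, 0, 1}):
// affine.for %j ... { affine.for %k ... { affine.for %i ... { "body"() } } }
// ```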
1448 
1449 // Sinks all sequential loops to the innermost levels (while preserving
1450 // relative order among them) and moves all parallel loops to the
1451 // outermost (while again preserving relative order among them).
1452 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
 1453  SmallVector<AffineForOp, 4> loops;
 1454  getPerfectlyNestedLoops(loops, forOp);
1455  if (loops.size() < 2)
1456  return forOp;
1457 
1458  // Gather dependence components for dependences between all ops in loop nest
1459  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1460  unsigned maxLoopDepth = loops.size();
1461  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1462  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1463 
1464  // Mark loops as either parallel or sequential.
1465  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1466  for (auto &depComps : depCompsVec) {
1467  assert(depComps.size() >= maxLoopDepth);
1468  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1469  DependenceComponent &depComp = depComps[j];
1470  assert(depComp.lb.has_value() && depComp.ub.has_value());
1471  if (*depComp.lb != 0 || *depComp.ub != 0)
1472  isParallelLoop[j] = false;
1473  }
1474  }
1475 
1476  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1477 
1478  // Compute permutation of loops that sinks sequential loops (and thus raises
1479  // parallel loops) while preserving relative order.
1480  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1481  unsigned nextSequentialLoop = numParallelLoops;
1482  unsigned nextParallelLoop = 0;
1483  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1484  if (isParallelLoop[i]) {
1485  loopPermMap[i] = nextParallelLoop++;
1486  } else {
1487  loopPermMap[i] = nextSequentialLoop++;
1488  }
1489  }
1490 
1491  // Check if permutation 'loopPermMap' would violate dependences.
1492  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1493  return forOp;
1494  // Perform loop interchange according to permutation 'loopPermMap'.
1495  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1496  return loops[loopNestRootIndex];
1497 }
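// Worked example (exposition only, not from the original file): for a 3-d
// nest (%i, %j, %k) where only %j carries a dependence (isParallelLoop =
// {true, false, true}), numParallelLoops is 2 and the computed permutation is
// loopPermMap = {0, 2, 1}: %i stays outermost, %k moves up to the middle, and
// the sequential %j is sunk innermost, provided the permutation passes the
// dependence check above.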
1498 
1499 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1500 // lower (resp. upper) loop bound. When called for both the lower and upper
1501 // bounds, the resulting IR resembles:
1502 //
1503 // ```mlir
 1504 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1505 // ...
1506 // }
1507 // ```
 1508 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
 1509  SmallVector<Value, 4> *operands,
1510  int64_t offset = 0) {
1511  auto bounds = llvm::to_vector<4>(map->getResults());
1512  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1513  operands->insert(operands->begin() + map->getNumDims(), iv);
1514  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1515  b.getContext());
1516  canonicalizeMapAndOperands(map, operands);
1517 }
1518 
1519 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1520 // Stripmine-sink is a primitive building block for generalized tiling of
1521 // imperfectly nested loops.
1522 // This transformation is purely mechanical and does not check legality,
1523 // profitability or even structural correctness. It is the user's
1524 // responsibility to specify `targets` that are dominated by `forOp`.
1525 // Returns the new AffineForOps, one per `targets`, nested immediately under
1526 // each of the `targets`.
 1527 static SmallVector<AffineForOp, 8>
 1528 stripmineSink(AffineForOp forOp, uint64_t factor,
1529  ArrayRef<AffineForOp> targets) {
1530  auto originalStep = forOp.getStepAsInt();
1531  auto scaledStep = originalStep * factor;
1532  forOp.setStep(scaledStep);
1533 
1534  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1535 
1536  // Lower-bound map creation.
1537  auto lbMap = forOp.getLowerBoundMap();
1538  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1539  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1540 
1541  // Upper-bound map creation.
1542  auto ubMap = forOp.getUpperBoundMap();
1543  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1544  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1545  /*offset=*/scaledStep);
1546 
1547  auto iv = forOp.getInductionVar();
1548  SmallVector<AffineForOp, 8> innerLoops;
1549  for (auto t : targets) {
1550  // Insert newForOp before the terminator of `t`.
1551  auto b = OpBuilder::atBlockTerminator(t.getBody());
1552  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1553  ubOperands, ubMap, originalStep);
1554  auto begin = t.getBody()->begin();
1555  // Skip terminator and `newForOp` which is just before the terminator.
1556  auto nOps = t.getBody()->getOperations().size() - 2;
1557  newForOp.getBody()->getOperations().splice(
1558  newForOp.getBody()->getOperations().begin(),
1559  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1560  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1561  newForOp.getRegion());
1562  innerLoops.push_back(newForOp);
1563  }
1564 
1565  return innerLoops;
1566 }
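// Illustrative sketch (exposition only, not from the original file; min/max
// bounds are written in pseudo-syntax and canonicalization is omitted):
// stripmining %i by a factor of 32 and sinking it under the target %j yields
//
// ```mlir
// affine.for %i = 0 to 256 {     // forOp
//   affine.for %j = 0 to 128 {   // target
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// // roughly becomes:
// affine.for %i = 0 to 256 step 32 {
//   affine.for %j = 0 to 128 {
//     affine.for %ii = max(0, %i) to min(256, %i + 32) {
//       "test.use"(%ii, %j) : (index, index) -> ()
//     }
//   }
// }
// ```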
1567 
1568 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1569 // Returns the new AffineForOps, nested immediately under `target`.
1570 template <typename SizeType>
1571 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1572  AffineForOp target) {
1573  // TODO: Use cheap structural assertions that targets are nested under
1574  // forOp and that targets are not nested under each other when DominanceInfo
1575  // exposes the capability. It seems overkill to construct a whole function
1576  // dominance tree at this point.
1577  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1578  assert(res.size() == 1 && "Expected 1 inner forOp");
1579  return res[0];
1580 }
1581 
 1582 SmallVector<SmallVector<AffineForOp, 8>, 8>
 1583 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
 1584  ArrayRef<AffineForOp> targets) {
 1585  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
 1586  SmallVector<AffineForOp, 8> currentTargets(targets);
1587  for (auto it : llvm::zip(forOps, sizes)) {
1588  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1589  res.push_back(step);
1590  currentTargets = step;
1591  }
1592  return res;
1593 }
1594 
 1595 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
 1596  ArrayRef<uint64_t> sizes,
1597  AffineForOp target) {
 1598  SmallVector<AffineForOp, 8> res;
 1599  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1600  assert(loops.size() == 1);
1601  res.push_back(loops[0]);
1602  }
1603  return res;
1604 }
1605 
 1606 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
 1607  if (loops.size() < 2)
1608  return success();
1609 
1610  AffineForOp innermost = loops.back();
1611  AffineForOp outermost = loops.front();
1612  AffineBound ub = outermost.getUpperBound();
1613  AffineMap origUbMap = ub.getMap();
1614  Location loc = outermost.getLoc();
1615  OpBuilder builder(outermost);
1616  for (AffineForOp loop : loops) {
1617  // We only work on normalized loops.
1618  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1619  loop.getConstantLowerBound() != 0)
1620  return failure();
1621  }
1622  SmallVector<Value, 4> upperBoundSymbols;
1623  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1624  ub.getOperands().end());
1625 
1626  // 1. Store the upper bound of the outermost loop in a variable.
1627  Value prev;
1628  if (!llvm::hasSingleElement(origUbMap.getResults()))
1629  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1630  else
1631  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1632  upperBoundSymbols.push_back(prev);
1633 
1634  // 2. Emit code computing the upper bound of the coalesced loop as product of
1635  // the number of iterations of all loops.
1636  for (AffineForOp loop : loops.drop_front()) {
1637  ub = loop.getUpperBound();
1638  origUbMap = ub.getMap();
1639  ubOperands = ub.getOperands();
1640  Value upperBound;
1641  // If upper bound map has more than one result, take their minimum.
1642  if (!llvm::hasSingleElement(origUbMap.getResults()))
1643  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1644  else
1645  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1646  upperBoundSymbols.push_back(upperBound);
1647  SmallVector<Value, 4> operands;
1648  operands.push_back(prev);
1649  operands.push_back(upperBound);
1650  // Maintain running product of loop upper bounds.
1651  prev = builder.create<AffineApplyOp>(
1652  loc,
1653  AffineMap::get(/*dimCount=*/1,
1654  /*symbolCount=*/1,
1655  builder.getAffineDimExpr(0) *
1656  builder.getAffineSymbolExpr(0)),
1657  operands);
1658  }
1659  // Set upper bound of the coalesced loop.
1660  AffineMap newUbMap = AffineMap::get(
1661  /*dimCount=*/0,
1662  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1663  outermost.setUpperBound(prev, newUbMap);
1664 
1665  builder.setInsertionPointToStart(outermost.getBody());
1666 
1667  // 3. Remap induction variables. For each original loop, the value of the
1668  // induction variable can be obtained by dividing the induction variable of
1669  // the linearized loop by the total number of iterations of the loops nested
1670  // in it modulo the number of iterations in this loop (remove the values
1671  // related to the outer loops):
1672  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1673  // Compute these iteratively from the innermost loop by creating a "running
1674  // quotient" of division by the range.
1675  Value previous = outermost.getInductionVar();
1676  for (unsigned idx = loops.size(); idx > 0; --idx) {
1677  if (idx != loops.size()) {
1678  SmallVector<Value, 4> operands;
1679  operands.push_back(previous);
1680  operands.push_back(upperBoundSymbols[idx]);
1681  previous = builder.create<AffineApplyOp>(
1682  loc,
 1683  AffineMap::get(
 1684  /*dimCount=*/1, /*symbolCount=*/1,
1685  builder.getAffineDimExpr(0).floorDiv(
1686  builder.getAffineSymbolExpr(0))),
1687  operands);
1688  }
1689  // Modified value of the induction variables of the nested loops after
1690  // coalescing.
1691  Value inductionVariable;
1692  if (idx == 1) {
1693  inductionVariable = previous;
1694  } else {
1695  SmallVector<Value, 4> applyOperands;
1696  applyOperands.push_back(previous);
1697  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1698  inductionVariable = builder.create<AffineApplyOp>(
1699  loc,
 1700  AffineMap::get(
 1701  /*dimCount=*/1, /*symbolCount=*/1,
1702  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1703  applyOperands);
1704  }
1705  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1706  inductionVariable, loops.back().getRegion());
1707  }
1708 
 1709  // 4. Move the operations from the innermost loop's body to just above the
 1710  // second-outermost loop; delete the extra terminator and the second-outermost loop.
1711  AffineForOp secondOutermostLoop = loops[1];
1712  innermost.getBody()->back().erase();
1713  outermost.getBody()->getOperations().splice(
1714  Block::iterator(secondOutermostLoop.getOperation()),
1715  innermost.getBody()->getOperations());
1716  secondOutermostLoop.erase();
1717  return success();
1718 }
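// Illustrative sketch (exposition only, not from the original file):
// coalescing two normalized loops produces a single loop of %N * %M
// iterations whose original IVs are recovered with floordiv/mod:
//
// ```mlir
// affine.for %i = 0 to %N {
//   affine.for %j = 0 to %M {
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// // roughly becomes (the affine.apply maps are spelled informally):
// affine.for %iv = 0 to (%N * %M) {
//   %i = affine.apply (%iv floordiv %M)
//   %j = affine.apply (%iv mod %M)
//   "test.use"(%i, %j) : (index, index) -> ()
// }
// ```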
1719 
 1720 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
 1721  ArrayRef<Value> processorId,
1722  ArrayRef<Value> numProcessors) {
1723  assert(processorId.size() == numProcessors.size());
1724  if (processorId.empty())
1725  return;
1726 
1727  OpBuilder b(forOp);
1728  Location loc(forOp.getLoc());
1729  AffineExpr lhs, rhs;
1730  bindSymbols(forOp.getContext(), lhs, rhs);
1731  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1732  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1733 
1734  Value linearIndex = processorId.front();
1735  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1736  auto mulApplyOp = b.create<AffineApplyOp>(
1737  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1738  linearIndex = b.create<AffineApplyOp>(
1739  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1740  }
1741 
1742  auto mulApplyOp = b.create<AffineApplyOp>(
1743  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1744  Value lb = b.create<AffineApplyOp>(
1745  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1746  forOp.setLowerBound(lb);
1747 
1748  Value step = forOp.getStep();
1749  for (auto numProcs : numProcessors)
1750  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1751  forOp.setStep(step);
1752 }
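// Worked example (exposition only, not from the original file): with
// processorId = {%bx, %tx} and numProcessors = {%nbx, %ntx}, the linearized
// processor id is %bx * %ntx + %tx, and a loop
//   scf.for %i = %lb to %ub step %s
// is rewritten to
//   scf.for %i = %lb + (%bx * %ntx + %tx) * %s to %ub step %s * %nbx * %ntx
// so that each processor starts at its own offset and strides over the
// iteration space by the total number of processors.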
1753 
1754 /// Given a memref region, determine the lowest depth at which transfers can be
1755 /// placed for it, and return the corresponding block, start and end positions
1756 /// in the block for placing incoming (read) and outgoing (write) copies
1757 /// respectively. The lowest depth depends on whether the region being accessed
1758 /// is hoistable with respect to one or more immediately surrounding loops.
1759 static void
 1760 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
 1761  Block::iterator &begin, Block::iterator &end,
1762  Block **copyPlacementBlock,
1763  Block::iterator *copyInPlacementStart,
1764  Block::iterator *copyOutPlacementStart) {
1765  const auto *cst = region.getConstraints();
1766  SmallVector<Value, 4> symbols;
1767  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1768 
1769  SmallVector<AffineForOp, 4> enclosingFors;
1770  getAffineForIVs(*block.begin(), &enclosingFors);
1771  // Walk up loop parents till we find an IV on which this region is
1772  // symbolic/variant.
1773  auto it = enclosingFors.rbegin();
1774  for (auto e = enclosingFors.rend(); it != e; ++it) {
1775  // TODO: also need to be checking this for regions symbols that
1776  // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1777  if (llvm::is_contained(symbols, it->getInductionVar()))
1778  break;
1779  }
1780 
1781  if (it != enclosingFors.rbegin()) {
1782  auto lastInvariantIV = *std::prev(it);
1783  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1784  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1785  *copyPlacementBlock = lastInvariantIV->getBlock();
1786  } else {
1787  *copyInPlacementStart = begin;
1788  *copyOutPlacementStart = end;
1789  *copyPlacementBlock = &block;
1790  }
1791 }
1792 
1793 // Info comprising stride and number of elements transferred every stride.
1794 struct StrideInfo {
1795  int64_t stride;
 1796  int64_t numEltPerStride;
 1797 };
1798 
1799 /// Returns striding information for a copy/transfer of this region with
1800 /// potentially multiple striding levels from outermost to innermost. For an
1801 /// n-dimensional region, there can be at most n-1 levels of striding
1802 /// successively nested.
1803 // TODO: make this work with non-identity layout maps.
1804 static void getMultiLevelStrides(const MemRefRegion &region,
1805  ArrayRef<int64_t> bufferShape,
1806  SmallVectorImpl<StrideInfo> *strideInfos) {
1807  if (bufferShape.size() <= 1)
1808  return;
1809 
1810  int64_t numEltPerStride = 1;
1811  int64_t stride = 1;
1812  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1813  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1814  stride *= dimSize;
1815  numEltPerStride *= bufferShape[d];
1816  // A stride is needed only if the region has a shorter extent than the
1817  // memref along the dimension *and* has an extent greater than one along the
1818  // next major dimension.
1819  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1820  strideInfos->push_back({stride, numEltPerStride});
1821  }
1822  }
1823 }
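// Worked example (exposition only, not from the original file): for a
// memref<64x128xf32> region with bufferShape = {64, 32}, the d = 1 iteration
// computes stride = 128 and numEltPerStride = 32; since 32 < 128 and
// bufferShape[0] = 64 > 1, a single StrideInfo{128, 32} entry is recorded,
// i.e. 32 contiguous elements are transferred every 128 elements of the
// original memref.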
1824 
1825 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1826 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1827 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1828 /// bound information for the copy loop nest. `fastBufOffsets` contain the
1829 /// expressions to be subtracted out from the respective copy loop iterators in
 1830 /// order to index the fast buffer. If `isCopyOut` is true, generates a copy-out;
1831 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1832 /// inserted.
1833 //
1834 /// The copy-in nest is generated as follows as an example for a 2-d region:
1835 /// for x = ...
1836 /// for y = ...
1837 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1838 ///
1839 static AffineForOp
1840 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1841  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1842  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1843  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1844  OpBuilder b) {
1845  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1846  return lbMap.getNumInputs() == lbOperands.size();
1847  }));
1848  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1849  return ubMap.getNumInputs() == ubOperands.size();
1850  }));
1851 
1852  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1853  assert(lbMaps.size() == rank && "wrong number of lb maps");
1854  assert(ubMaps.size() == rank && "wrong number of ub maps");
1855 
1856  SmallVector<Value, 4> memIndices;
1857  SmallVector<AffineExpr, 4> fastBufExprs;
1858  SmallVector<Value, 4> fastBufMapOperands;
1859  AffineForOp copyNestRoot;
1860  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1861  for (unsigned d = 0; d < rank; ++d) {
1862  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1863  ubOperands, ubMaps[d]);
1864  if (d == 0)
1865  copyNestRoot = forOp;
1866 
1867  b = OpBuilder::atBlockTerminator(forOp.getBody());
1868 
1869  auto fastBufOffsetMap =
1870  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1871  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1872 
1873  // Construct the subscript for the fast memref being copied into/from:
1874  // x - offset_x.
1875  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1876  b.getAffineDimExpr(2 * d));
1877  fastBufMapOperands.push_back(offset);
1878  fastBufMapOperands.push_back(forOp.getInductionVar());
1879  mayBeDeadApplys.push_back(offset);
1880 
1881  // Subscript for the slow memref being copied.
1882  memIndices.push_back(forOp.getInductionVar());
1883  }
1884 
1885  auto fastBufMap =
1886  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1887  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1888  fastBufMap = simplifyAffineMap(fastBufMap);
1889  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1890 
1891  // Drop any dead affine.applys.
1892  for (auto applyOp : mayBeDeadApplys)
1893  if (applyOp.use_empty())
1894  applyOp.erase();
1895 
1896  if (!isCopyOut) {
1897  // Copy in.
1898  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1899  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1900  fastBufMapOperands);
1901  return copyNestRoot;
1902  }
1903 
1904  // Copy out.
1905  auto load =
1906  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1907  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1908  return copyNestRoot;
1909 }
1910 
1911 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
 1912 emitRemarkForBlock(Block &block) {
 1913  return block.getParentOp()->emitRemark();
1914 }
1915 
1916 /// Creates a buffer in the faster memory space for the specified memref region;
1917 /// generates a copy from the lower memory space to this one, and replaces all
1918 /// loads/stores in the block range [`begin', `end') of `block' to load/store
1919 /// from that buffer. Returns failure if copies could not be generated due to
1920 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
1921 /// in copyPlacementBlock specify the insertion points where the incoming copies
1922 /// and outgoing copies, respectively, should be inserted (the insertion happens
1923 /// right before the insertion point). Since `begin` can itself be invalidated
1924 /// due to the memref rewriting done from this method, the output argument
1925 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
1926 /// happens). Since outgoing copies could have been inserted at `end`, the
1927 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
1928 /// size of the fast buffer allocated.
1929 static LogicalResult generateCopy(
1930  const MemRefRegion &region, Block *block, Block::iterator begin,
1931  Block::iterator end, Block *copyPlacementBlock,
1932  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1933  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1934  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1935  Block::iterator *nBegin, Block::iterator *nEnd) {
1936  *nBegin = begin;
1937  *nEnd = end;
1938 
1939  func::FuncOp f = begin->getParentOfType<func::FuncOp>();
1940  OpBuilder topBuilder(f.getBody());
1941  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1942 
1943  *sizeInBytes = 0;
1944 
1945  if (begin == end)
1946  return success();
1947 
1948  // Is the copy out point at the end of the block where we are doing
1949  // explicit copying.
1950  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1951 
1952  // Copies for read regions are going to be inserted at 'begin'.
1953  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1954  // Copies for write regions are going to be inserted at 'end'.
1955  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1956  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1957 
1958  // Builder to create constants at the top level.
1959  auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
1960  OpBuilder top(func.getBody());
1961 
1962  auto loc = region.loc;
1963  auto memref = region.memref;
1964  auto memRefType = cast<MemRefType>(memref.getType());
1965 
1966  if (!memRefType.getLayout().isIdentity()) {
1967  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1968  return failure();
1969  }
1970 
1971  // Indices to use for the copying.
1972  // Indices for the original memref being copied from/to.
1973  SmallVector<Value, 4> memIndices;
1974  // Indices for the faster buffer being copied into/from.
1975  SmallVector<Value, 4> bufIndices;
1976 
1977  unsigned rank = memRefType.getRank();
1978  SmallVector<int64_t, 4> fastBufferShape;
1979 
1980  // Compute the extents of the buffer.
1981  std::vector<SmallVector<int64_t, 4>> lbs;
1982  SmallVector<int64_t, 8> lbDivisors;
1983  lbs.reserve(rank);
1984  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
1985  &fastBufferShape, &lbs, &lbDivisors);
1986  if (!numElements) {
1987  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
1988  return failure();
1989  }
1990 
1991  if (*numElements == 0) {
1992  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
1993  return success();
1994  }
1995 
1996  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
1997  for (unsigned i = 0; i < rank; ++i)
1998  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
1999 
2000  const FlatAffineValueConstraints *cst = region.getConstraints();
2001  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2002  // on; these typically include loop IVs surrounding the level at which the
2003  // copy generation is being done or other valid symbols in MLIR.
2004  SmallVector<Value, 8> regionSymbols;
2005  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2006 
2007  // Construct the index expressions for the fast memory buffer. The index
2008  // expression for a particular dimension of the fast buffer is obtained by
2009  // subtracting out the lower bound on the original memref's data region
2010  // along the corresponding dimension.
2011 
2012  // Index start offsets for faster memory buffer relative to the original.
2013  SmallVector<AffineExpr, 4> fastBufOffsets;
2014  fastBufOffsets.reserve(rank);
2015  for (unsigned d = 0; d < rank; d++) {
2016  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2017 
2018  AffineExpr offset = top.getAffineConstantExpr(0);
2019  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2020  offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2021  assert(lbDivisors[d] > 0);
2022  offset =
2023  (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2024 
2025  // Set copy start location for this dimension in the lower memory space
2026  // memref.
2027  if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
2028  auto indexVal = caf.getValue();
2029  if (indexVal == 0) {
2030  memIndices.push_back(zeroIndex);
2031  } else {
2032  memIndices.push_back(
2033  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2034  }
2035  } else {
2036  // The coordinate for the start location is just the lower bound along the
2037  // corresponding dimension on the memory region (stored in 'offset').
2038  auto map = AffineMap::get(
2039  cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2040  memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2041  }
2042  // The fast buffer is copied into at location zero; addressing is relative.
2043  bufIndices.push_back(zeroIndex);
2044 
2045  // Record the offsets since they are needed to remap the memory accesses of
2046  // the original memref further below.
2047  fastBufOffsets.push_back(offset);
2048  }
2049 
2050  // The faster memory space buffer.
2051  Value fastMemRef;
2052 
2053  // Check if a buffer was already created.
2054  bool existingBuf = fastBufferMap.count(memref) > 0;
2055  if (!existingBuf) {
2056  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2057  auto fastMemRefType =
2058  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2059  fastBufferLayout, copyOptions.fastMemorySpace);
2060 
2061  // Create the fast memory space buffer just before the 'affine.for'
2062  // operation.
2063  fastMemRef =
2064  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2065  // Record it.
2066  fastBufferMap[memref] = fastMemRef;
2067  // fastMemRefType is a constant shaped memref.
2068  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2069  // We don't account for things of unknown size.
2070  *sizeInBytes = maySizeInBytes.value_or(0);
2071 
2072  LLVM_DEBUG(emitRemarkForBlock(*block)
2073  << "Creating fast buffer of type " << fastMemRefType
2074  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2075  << " KiB\n");
2076  } else {
2077  // Reuse the one already created.
2078  fastMemRef = fastBufferMap[memref];
2079  }
2080 
2081  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2082 
2083  Value dmaStride;
2084  Value numEltPerDmaStride;
2085  if (copyOptions.generateDma) {
2086  SmallVector<StrideInfo, 4> dmaStrideInfos;
2087  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2088 
2089  // TODO: use all stride levels once DmaStartOp is extended for
2090  // multi-level strides.
2091  if (dmaStrideInfos.size() > 1) {
2092  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2093  return failure();
2094  }
2095 
2096  if (!dmaStrideInfos.empty()) {
2097  dmaStride =
2098  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2099  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2100  loc, dmaStrideInfos[0].numEltPerStride);
2101  }
2102  }
2103 
2104  // Record the last operation where we want the memref replacement to end. We
2105  // later do the memref replacement only in [begin, postDomFilter] so
 2106  // that the original memrefs used in the data movement code themselves don't
2107  // get replaced.
2108  auto postDomFilter = std::prev(end);
2109 
2110  // Create fully composed affine maps for each memref.
2111  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2112  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2113  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2114  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2115 
2116  if (!copyOptions.generateDma) {
2117  // Point-wise copy generation.
2118  auto copyNest =
2119  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2120  /*lbOperands=*/regionSymbols, ubMaps,
2121  /*ubOperands=*/regionSymbols, fastBufOffsets,
2122  /*isCopyOut=*/region.isWrite(), b);
2123 
2124  // Record this so that we can skip it from yet another copy.
2125  copyNests.insert(copyNest);
2126 
2127  // Since new ops are being appended (for copy out's), adjust the end to
2128  // mark end of block range being processed if necessary.
2129  if (region.isWrite() && isCopyOutAtEndOfBlock)
2130  *nEnd = Block::iterator(copyNest.getOperation());
2131  } else {
2132  // DMA generation.
2133  // Create a tag (single element 1-d memref) for the DMA.
2134  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2135  copyOptions.tagMemorySpace);
2136  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2137 
2138  SmallVector<Value, 4> tagIndices({zeroIndex});
2139  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2140  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2141  if (!region.isWrite()) {
2142  // DMA non-blocking read from original buffer to fast buffer.
2143  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2144  fastMemRef, bufAffineMap, bufIndices,
2145  tagMemRef, tagAffineMap, tagIndices,
2146  numElementsSSA, dmaStride, numEltPerDmaStride);
2147  } else {
2148  // DMA non-blocking write from fast buffer to the original memref.
2149  auto op = b.create<AffineDmaStartOp>(
2150  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2151  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2152  dmaStride, numEltPerDmaStride);
2153  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2154  // end to mark end of block range being processed.
2155  if (isCopyOutAtEndOfBlock)
2156  *nEnd = Block::iterator(op.getOperation());
2157  }
2158 
2159  // Matching DMA wait to block on completion; tag always has a 0 index.
2160  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2161  numElementsSSA);
2162 
2163  // Generate dealloc for the tag.
2164  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2165  if (*nEnd == end && isCopyOutAtEndOfBlock)
2166  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2167  // mark end of range of the original.
2168  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2169  }
2170 
2171  // Generate dealloc for the buffer.
2172  if (!existingBuf) {
2173  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2174  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2175  // the fast buffer (since it marks the new end insertion point).
2176  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2177  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2178  }
2179 
2180  // Replace all uses of the old memref with the faster one while remapping
2181  // access indices (subtracting out lower bound offsets for each dimension).
2182  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2183  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2184  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2185  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2186  // d2, d3 correspond to the original indices (%i, %j).
2187  SmallVector<AffineExpr, 4> remapExprs;
2188  remapExprs.reserve(rank);
2189  for (unsigned i = 0; i < rank; i++) {
2190  // The starting operands of indexRemap will be regionSymbols (the symbols on
2191  // which the memref region is parametric); then those corresponding to
2192  // the memref's original indices follow.
2193  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2194  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2195  }
2196  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2197  b.getContext());
2198 
2199  // Record the begin since it may be invalidated by memref replacement.
2200  Block::iterator prevOfBegin;
2201  bool isBeginAtStartOfBlock = (begin == block->begin());
2202  if (!isBeginAtStartOfBlock)
2203  prevOfBegin = std::prev(begin);
2204 
2205  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2206  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2207  /*extraIndices=*/{}, indexRemap,
2208  /*extraOperands=*/regionSymbols,
2209  /*symbolOperands=*/{},
2210  /*domOpFilter=*/&*begin,
2211  /*postDomOpFilter=*/&*postDomFilter);
2212 
2213  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2214 
2215  return success();
2216 }
2217 
 2218 /// Construct the memref region to just include the entire memref. Returns false
 2219 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2220 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2221 /// is parametric on.
2222 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2223  MemRefRegion *region) {
2224  unsigned rank;
2225  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2226  rank = loadOp.getMemRefType().getRank();
2227  region->memref = loadOp.getMemRef();
2228  region->setWrite(false);
2229  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2230  rank = storeOp.getMemRefType().getRank();
2231  region->memref = storeOp.getMemRef();
2232  region->setWrite(true);
2233  } else {
2234  assert(false && "expected load or store op");
2235  return false;
2236  }
2237  auto memRefType = cast<MemRefType>(region->memref.getType());
2238  if (!memRefType.hasStaticShape())
2239  return false;
2240 
2241  auto *regionCst = region->getConstraints();
2242 
2243  // Just get the first numSymbols IVs, which the memref region is parametric
2244  // on.
 2245  SmallVector<AffineForOp, 4> ivs;
 2246  getAffineForIVs(*op, &ivs);
2247  ivs.resize(numParamLoopIVs);
2248  SmallVector<Value, 4> symbols;
2249  extractForInductionVars(ivs, &symbols);
2250  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2251  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2252 
2253  // Memref dim sizes provide the bounds.
2254  for (unsigned d = 0; d < rank; d++) {
2255  auto dimSize = memRefType.getDimSize(d);
2256  assert(dimSize > 0 && "filtered dynamic shapes above");
2257  regionCst->addBound(BoundType::LB, d, 0);
2258  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2259  }
2260  return true;
2261 }
2262 
2263 LogicalResult
 2264 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
 2265  const AffineCopyOptions &copyOptions,
2266  std::optional<Value> filterMemRef,
2267  DenseSet<Operation *> &copyNests) {
2268  if (begin == end)
2269  return success();
2270 
2271  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2272  "Inconsistent block begin/end args");
2273  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2274 
2275  Block *block = begin->getBlock();
2276 
2277  // Copies will be generated for this depth, i.e., symbolic in all loops
2278  // surrounding the this block range.
2279  unsigned copyDepth = getNestingDepth(&*begin);
2280 
2281  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2282  << "\n");
2283  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2284  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2285 
2286  // List of memory regions to copy for. We need a map vector to have a
2287  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2288  // since the alloc's for example are identical except for the SSA id.
2289  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2290  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2291 
2292  // Map from original memref's to the fast buffers that their accesses are
2293  // replaced with.
2294  DenseMap<Value, Value> fastBufferMap;
2295 
2296  // To check for errors when walking the block.
2297  bool error = false;
2298 
2299  // Walk this range of operations to gather all memory regions.
2300  block->walk(begin, end, [&](Operation *opInst) {
2301  // Gather regions to allocate to buffers in faster memory space.
2302  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2303  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2304  (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2305  copyOptions.slowMemorySpace))
2306  return;
2307  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2308  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2309  storeOp.getMemRefType().getMemorySpaceAsInt() !=
2310  copyOptions.slowMemorySpace)
2311  return;
2312  } else {
2313  // Neither load nor a store op.
2314  return;
2315  }
2316 
2317  // Compute the MemRefRegion accessed.
2318  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2319  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2320  /*addMemRefDimBounds=*/false))) {
2321  LLVM_DEBUG(llvm::dbgs()
2322  << "Error obtaining memory region: semi-affine maps?\n");
2323  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2324  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2325  LLVM_DEBUG(
2326  opInst->emitError("non-constant memref sizes not yet supported"));
2327  error = true;
2328  return;
2329  }
2330  }
2331 
2332  // Each memref has a single buffer associated with it irrespective of how
2333  // many load's and store's happen on it.
2334  // TODO: in the future, when regions don't intersect and satisfy
2335  // other properties (based on load/store regions), we could consider
2336  // multiple buffers per memref.
2337 
2338  // Add to the appropriate region if it's not already in it, or take a
2339  // bounding box union with the existing one if it's already in there.
2340  // Note that a memref may have both read and write regions - so update the
2341  // region in the other list if one exists (write in case of read and vice
2342  // versa) since there is a single bounding box for a memref across all reads
2343  // and writes that happen on it.
2344 
2345  // Attempts to update; returns true if 'region' exists in targetRegions.
2346  auto updateRegion =
2347  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2348  &targetRegions) {
2349  const auto *const it = targetRegions.find(region->memref);
2350  if (it == targetRegions.end())
2351  return false;
2352 
2353  // Perform a union with the existing region.
2354  if (failed(it->second->unionBoundingBox(*region))) {
2355  LLVM_DEBUG(llvm::dbgs()
2356  << "Memory region bounding box failed; "
2357  "over-approximating to the entire memref\n");
2358  // If the union fails, we will overapproximate.
2359  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2360  LLVM_DEBUG(opInst->emitError(
2361  "non-constant memref sizes not yet supported"));
2362  error = true;
2363  return true;
2364  }
2365  it->second->getConstraints()->clearAndCopyFrom(
2366  *region->getConstraints());
2367  } else {
2368  // Union was computed and stored in 'it->second': copy to 'region'.
2369  region->getConstraints()->clearAndCopyFrom(
2370  *it->second->getConstraints());
2371  }
2372  return true;
2373  };
2374 
2375  bool existsInRead = updateRegion(readRegions);
2376  if (error)
2377  return;
2378  bool existsInWrite = updateRegion(writeRegions);
2379  if (error)
2380  return;
2381 
2382  // Finally add it to the region list.
2383  if (region->isWrite() && !existsInWrite) {
2384  writeRegions[region->memref] = std::move(region);
2385  } else if (!region->isWrite() && !existsInRead) {
2386  readRegions[region->memref] = std::move(region);
2387  }
2388  });
2389 
2390  if (error) {
2391  LLVM_DEBUG(begin->emitError(
2392  "copy generation failed for one or more memref's in this block\n"));
2393  return failure();
2394  }
2395 
2396  uint64_t totalCopyBuffersSizeInBytes = 0;
2397  bool ret = true;
2398  auto processRegions =
2399  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2400  &regions) {
2401  for (const auto &regionEntry : regions) {
2402  // For each region, hoist copy in/out past all hoistable
2403  // 'affine.for's.
2404  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2405  Block *copyPlacementBlock;
 2406  findHighestBlockForPlacement(
 2407  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2408  &copyInPlacementStart, &copyOutPlacementStart);
2409 
2410  uint64_t sizeInBytes;
2411  Block::iterator nBegin, nEnd;
2412  LogicalResult iRet = generateCopy(
2413  *regionEntry.second, block, begin, end, copyPlacementBlock,
2414  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2415  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2416  if (succeeded(iRet)) {
2417  // begin/end could have been invalidated, and need update.
2418  begin = nBegin;
2419  end = nEnd;
2420  totalCopyBuffersSizeInBytes += sizeInBytes;
2421  }
2422  ret = ret & succeeded(iRet);
2423  }
2424  };
2425  processRegions(readRegions);
2426  processRegions(writeRegions);
2427 
2428  if (!ret) {
2429  LLVM_DEBUG(begin->emitError(
2430  "copy generation failed for one or more memref's in this block\n"));
2431  return failure();
2432  }
2433 
2434  // For a range of operations, a note will be emitted at the caller.
2435  AffineForOp forOp;
2436  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2437  LLVM_DEBUG(forOp.emitRemark()
2438  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2439  << " KiB of copy buffers in fast memory space for this block");
2440  }
2441 
2442  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2443  block->getParentOp()->emitWarning(
2444  "total size of all copy buffers' for this block exceeds fast memory "
2445  "capacity");
2446  }
2447 
2448  return success();
2449 }
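// Minimal usage sketch (exposition only, not from the original file; the
// memory-space ids and the 32 KiB capacity below are illustrative
// assumptions):
//
// ```c++
// AffineCopyOptions options{/*generateDma=*/false, /*slowMemorySpace=*/0,
//                           /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
//                           /*fastMemCapacityBytes=*/32 * 1024};
// DenseSet<Operation *> copyNests;
// (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                              copyNests);
// ```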
2450 
2451 // A convenience version of affineDataCopyGenerate for all ops in the body of
2452 // an AffineForOp.
 2453 LogicalResult mlir::affine::affineDataCopyGenerate(
 2454  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2455  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2456  return affineDataCopyGenerate(forOp.getBody()->begin(),
2457  std::prev(forOp.getBody()->end()), copyOptions,
2458  filterMemRef, copyNests);
2459 }
2460 
 2461 LogicalResult mlir::affine::generateCopyForMemRegion(
 2462  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2463  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2464  Block *block = analyzedOp->getBlock();
2465  auto begin = analyzedOp->getIterator();
2466  auto end = std::next(begin);
2467  DenseMap<Value, Value> fastBufferMap;
2468  DenseSet<Operation *> copyNests;
2469 
2470  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2471  copyOptions, fastBufferMap, copyNests,
2472  &result.sizeInBytes, &begin, &end);
2473  if (failed(err))
2474  return err;
2475 
2476  const auto &en = fastBufferMap.find(memrefRegion.memref);
2477  // In some cases (empty loops), no copy generation would have happened.
2478  if (en == fastBufferMap.end())
2479  return failure();
2480  result.alloc = en->second.getDefiningOp();
2481  assert(result.alloc && "fast buffer expected to be locally allocated");
2482  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2483  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2484  return success();
2485 }
2486 
2487 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2488 static void
2489 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2490  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
 2491  // Add a new empty level to the output if it doesn't already exist.
2492  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2493  if (currLoopDepth == depthToLoops.size())
2494  depthToLoops.emplace_back();
2495 
2496  for (auto &op : *block) {
2497  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2498  depthToLoops[currLoopDepth].push_back(forOp);
2499  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2500  }
2501  }
2502 }
2503 
2504 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
 2505 void mlir::affine::gatherLoops(
 2506  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2507  for (auto &block : func)
2508  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2509 
2510  // Remove last loop level from output since it's empty.
2511  if (!depthToLoops.empty()) {
2512  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2513  depthToLoops.pop_back();
2514  }
2515 }
2516 
 2517 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
 2518  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2519  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2520  SmallVector<Value, 4> lowerOperands(lbOperands);
2521  SmallVector<Value, 4> upperOperands(ubOperands);
2522 
2523  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2524  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2525  lbMap = removeDuplicateExprs(lbMap);
2526  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2527  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2528  ubMap = removeDuplicateExprs(ubMap);
2529 
2530  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2531  step);
2532 }
2533 
2534 /// Creates an AffineIfOp that encodes the conditional to choose between
2535 /// the constant trip count version and an unknown trip count version of this
2536 /// nest of loops. This is used to separate partial and full tiles if `loops`
2537 /// has the intra-tile loops. The affine.if op is inserted at the builder
2538 /// insertion point of `b`.
 2539 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
 2540  OpBuilder b) {
2541  if (loops.empty())
2542  return nullptr;
2543 
2544  auto *context = loops[0].getContext();
2545 
 2546  FlatAffineValueConstraints cst;
 2547  SmallVector<Operation *, 8> ops;
 2548  llvm::append_range(ops, loops);
2549  (void)getIndexSet(ops, &cst);
2550 
2551  // Remove constraints that are independent of these loop IVs.
2552  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2553 
2554  // Construct the constraint set representing the guard for full tiles. The
2555  // lower bound (and upper bound) corresponding to the full tile should be
2556  // larger (and resp. smaller) than any other lower (or upper bound).
2557  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2558  for (auto loop : loops) {
2559  (void)loop;
2560  // TODO: Non-unit stride is not an issue to generalize to.
2561  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2562  // Mark everything symbols for the purpose of finding a constant diff pair.
2563  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2564  1);
2565  unsigned fullTileLbPos, fullTileUbPos;
2566  if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2567  /*boundFloorDivisor=*/nullptr,
2568  /*ub=*/nullptr, &fullTileLbPos,
2569  &fullTileUbPos)) {
2570  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2571  return nullptr;
2572  }
2573 
2574  SmallVector<unsigned, 4> lbIndices, ubIndices;
2575  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2576 
2577  auto fLb = cst.getInequality(fullTileLbPos);
2578  auto fUb = cst.getInequality(fullTileUbPos);
2579  fullTileLb.assign(fLb.begin(), fLb.end());
2580  fullTileUb.assign(fUb.begin(), fUb.end());
2581 
2582  // Full tile lower bound should be >= than any other lower bound.
2583  for (auto lbIndex : lbIndices)
2584  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2585  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2586 
2587  // Full tile upper bound should be <= any other upper bound.
2588  for (auto ubIndex : ubIndices)
2589  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2590  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2591 
2592  cst.removeVar(0);
2593  }
2594 
2595  // The previous step leads to all zeros for the full tile lb and ub position
2596  // itself; remove those and any other duplicates / trivial redundancies.
 2597  cst.removeTrivialRedundancy();
 2598 
2599  // Turn everything into dims conservatively since we earlier turned all
2600  // trailing ids past point loop IV into symbols. Some of these could be outer
2601  // loop IVs; we'll canonicalize anyway.
2602  cst.setDimSymbolSeparation(0);
2603 
2604  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2605  // ifCondSet can be null if cst was empty -- this can happen if all loops
2606  // in the nest have constant trip counts.
2607  if (!ifCondSet)
2608  return nullptr;
2609 
2610  SmallVector<Value, 4> setOperands;
2611  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2612  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2613  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2614  /*withElseRegion=*/true);
2615 }
2616 
2617 /// Create the full tile loop nest (along with its body).
2618 static LogicalResult
 2619 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
 2620  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2621  fullTileLoops.reserve(inputNest.size());
2622 
2623  // For each loop in the original nest identify a lower/upper bound pair such
2624  // that their difference is a constant.
 2625  FlatAffineValueConstraints cst;
 2626  for (auto loop : inputNest) {
2627  // TODO: straightforward to generalize to a non-unit stride.
2628  if (loop.getStepAsInt() != 1) {
2629  LLVM_DEBUG(llvm::dbgs()
2630  << "[tile separation] non-unit stride not implemented\n");
2631  return failure();
2632  }
2633  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2634  (void)getIndexSet(loopOp, &cst);
2635  // We will mark everything other than this loop IV as symbol for getting a
2636  // pair of <lb, ub> with a constant difference.
 2637  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
 2638  unsigned lbPos, ubPos;
2639  if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2640  /*boundFloorDivisor=*/nullptr,
2641  /*ub=*/nullptr, &lbPos, &ubPos) ||
2642  lbPos == ubPos) {
2643  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2644  "equalities not yet handled\n");
2645  return failure();
2646  }
2647 
2648  // Set all variables as dimensions uniformly since some of those marked as
2649  // symbols above could be outer loop IVs (corresponding tile space IVs).
2650  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2651 
2652  AffineValueMap lbVmap, ubVmap;
2653  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2654  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2655  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2656  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2657  ubVmap.getOperands(), ubVmap.getAffineMap());
2658  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2659  fullTileLoops.push_back(fullTileLoop);
2660  }
2661 
2662  // Add the body for the full tile loop nest.
2663  IRMapping operandMap;
2664  for (const auto &loopEn : llvm::enumerate(inputNest))
2665  operandMap.map(loopEn.value().getInductionVar(),
2666  fullTileLoops[loopEn.index()].getInductionVar());
2667  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2668  for (auto &op : inputNest.back().getBody()->without_terminator())
2669  b.clone(op, operandMap);
2670  return success();
2671 }
2672 
2673 LogicalResult
 2674 mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
 2675  SmallVectorImpl<AffineForOp> *fullTileNest) {
2676  if (inputNest.empty())
2677  return success();
2678 
2679  auto firstLoop = inputNest[0];
2680 
2681  // Each successive for op has to be nested in the other.
2682  auto prevLoop = firstLoop;
2683  for (auto loop : inputNest.drop_front(1)) {
2684  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2685  prevLoop = loop;
2686  }
2687 
2688  // Create the full tile loop nest.
2689  SmallVector<AffineForOp, 4> fullTileLoops;
2690  OpBuilder b(firstLoop);
2691  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2692  if (!fullTileLoops.empty())
2693  fullTileLoops.front().erase();
2694  return failure();
2695  }
2696 
2697  // Create and insert the version select right before the root of the nest.
2698  b = OpBuilder(firstLoop);
2699  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2700  if (!ifOp) {
2701  fullTileLoops.front().erase();
2702  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2703  "separation condition\n");
2704  return failure();
2705  }
2706 
2707  // Move the full tile into the then block.
2708  Block *thenBlock = ifOp.getThenBlock();
2709  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2710  thenBlock->getOperations().splice(
2711  std::prev(thenBlock->end()),
2712  outermostFullTileLoop->getBlock()->getOperations(),
2713  Block::iterator(outermostFullTileLoop));
2714 
2715  // Move the partial tile into the else block. The partial tile is the same as
2716  // the original loop nest.
2717  Block *elseBlock = ifOp.getElseBlock();
2718  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2719  firstLoop->getBlock()->getOperations(),
2720  Block::iterator(firstLoop));
2721 
2722  if (fullTileNest)
2723  *fullTileNest = std::move(fullTileLoops);
2724 
2725  return success();
2726 }
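
As a usage sketch (not part of this file), a caller would typically gather tileable bands, tile them, and then invoke separateFullTiles on the intra-tile portion of the resulting nest. The driver name tileAndSeparate and the fixed tile size of 32 below are illustrative assumptions, not anything this file prescribes.

// Illustrative sketch only: tile every tileable band of a function, then
// separate full tiles from partial tiles in each intra-tile nest.
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include <vector>

using namespace mlir;
using namespace mlir::affine;

static void tileAndSeparate(func::FuncOp func) {
  std::vector<SmallVector<AffineForOp, 6>> bands;
  getTileableBands(func, &bands);
  for (auto &band : bands) {
    unsigned bandSize = band.size();
    // Tile with an assumed size of 32 along every dimension of the band.
    SmallVector<unsigned, 6> tileSizes(bandSize, 32);
    SmallVector<AffineForOp, 6> tiledNest;
    if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
      continue;
    // The first bandSize loops of the tiled nest are tile-space loops; the
    // remaining ones are the intra-tile loops that separation operates on.
    auto intraTileNest =
        MutableArrayRef<AffineForOp>(tiledNest).drop_front(bandSize);
    (void)separateFullTiles(intraTileNest);
  }
}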
2727 
2728 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2729  LogicalResult result(failure());
2730  SmallVector<AffineForOp> loops;
2731  getPerfectlyNestedLoops(loops, op);
2732  if (loops.size() <= 1)
2733  return success();
2734 
2735  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2736  // loops with bounds defined above some loop.
2737  // 1. For each loop, find above which parent loop its operands are
2738  // defined.
2739  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2740  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2741  operandsDefinedAbove[i] = i;
2742  for (unsigned j = 0; j < i; ++j) {
2743  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2744  operandsDefinedAbove[i] = j;
2745  break;
2746  }
2747  }
2748  }
2749 
2750  // 2. Identify bands of loops such that the operands of all of them are
2751  // defined above the first loop in the band. Traverse the nest bottom-up
2752  // so that modifications don't invalidate the inner loops.
2753  for (unsigned end = loops.size(); end > 0; --end) {
2754  unsigned start = 0;
2755  for (; start < end - 1; ++start) {
2756  auto maxPos =
2757  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2758  std::next(operandsDefinedAbove.begin(), end));
2759  if (maxPos > start)
2760  continue;
2761  assert(maxPos == start &&
2762  "expected loop bounds to be known at the start of the band");
2763  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2764  if (succeeded(coalesceLoops(band)))
2765  result = success();
2766  break;
2767  }
2768  // If a band was found and transformed, keep looking at the loops above
2769  // the outermost transformed loop.
2770  if (start != end - 1)
2771  end = start + 1;
2772  }
2773  return result;
2774 }
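
Similarly, a hedged sketch of how coalescePerfectlyNestedAffineLoops might be driven: walk a function, collect the outermost affine.for ops, and attempt coalescing on each nest rooted at them. The driver name coalesceAffineNests and the walk-then-transform structure are illustrative assumptions.

// Illustrative driver: collect outermost affine.for ops first, then coalesce
// the perfect nest rooted at each of them (the roots themselves survive).
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"

using namespace mlir;
using namespace mlir::affine;

static void coalesceAffineNests(func::FuncOp func) {
  SmallVector<AffineForOp> roots;
  func.walk([&](AffineForOp forOp) {
    // Keep only loops that are not nested inside another affine.for.
    if (!forOp->getParentOfType<AffineForOp>())
      roots.push_back(forOp);
  });
  for (AffineForOp root : roots)
    (void)coalescePerfectlyNestedAffineLoops(root);
}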