//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "loop-utils"

using namespace mlir;
using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;

namespace {
// This structure is to pass and return sets of loop parameters without
// confusing the order.
struct LoopParams {
  Value lowerBound;
  Value upperBound;
  Value step;
};
} // namespace

/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
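///
/// For example (a sketch, not taken from a test): for
///   affine.for %i = 0 to 100
/// unrolled by a factor of 3, the trip count is 100 and 100 mod 3 = 1, so the
/// cleanup lower bound computed here is 0 + (100 - 100 mod 3) = 99; the main
/// unrolled loop then covers [0, 99) and the cleanup loop covers [99, 100).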
static void
getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                         AffineMap &cleanupLbMap,
                         SmallVectorImpl<Value> &cleanupLbOperands) {
  AffineMap tripCountMap;
  SmallVector<Value, 4> tripCountOperands;
  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
  // Trip count can't be computed.
  if (!tripCountMap) {
    cleanupLbMap = AffineMap();
    return;
  }

  OpBuilder b(forOp);
  auto lbMap = forOp.getLowerBoundMap();
  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                    forOp.getLowerBoundOperands());

  // For each upper bound expr, get the range.
  // Eg: affine.for %i = lb to min (ub1, ub2),
  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
  // these affine.apply's make up the cleanup loop lower bound.
  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
  int64_t step = forOp.getStepAsInt();
  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
  }

  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

  cleanupLbOperands.clear();
  cleanupLbOperands.push_back(lb);
  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
                                b.getContext());
  // Simplify the cleanupLbMap + cleanupLbOperands.
  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
  // Remove any affine.apply's that became dead from the simplification above.
  for (auto v : bumpValues)
    if (v.use_empty())
      v.getDefiningOp()->erase();

  if (lb.use_empty())
    lb.erase();
}

/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration affine.for ops.
static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
  // Replace uses of iter arguments with iter operands (initial values).
  auto iterOperands = forOp.getInits();
  auto iterArgs = forOp.getRegionIterArgs();
  for (auto e : llvm::zip(iterOperands, iterArgs))
    std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

  // Replace uses of loop results with the values yielded by the loop.
  auto outerResults = forOp.getResults();
  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
  for (auto e : llvm::zip(outerResults, innerResults))
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}

/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
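///
/// For example (a sketch; "use" is a hypothetical op): the single-iteration
/// loop
///   affine.for %i = 5 to 6 {
///     "use"(%i) : (index) -> ()
///   }
/// becomes a constant (hoisted to the top of the enclosing function) plus the
/// inlined body:
///   %c5 = arith.constant 5 : index
///   "use"(%c5) : (index) -> ()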
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount || *tripCount != 1)
    return failure();

  // TODO: extend this for arbitrary affine bounds.
  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // Replace all uses of the IV with its single iteration value.
  auto iv = forOp.getInductionVar();
  auto *parentBlock = forOp->getBlock();
  if (!iv.use_empty()) {
    if (forOp.hasConstantLowerBound()) {
      OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
      auto constOp = topBuilder.create<arith::ConstantIndexOp>(
          forOp.getLoc(), forOp.getConstantLowerBound());
      iv.replaceAllUsesWith(constOp);
    } else {
      auto lbOperands = forOp.getLowerBoundOperands();
      auto lbMap = forOp.getLowerBoundMap();
      OpBuilder builder(forOp);
      if (lbMap == builder.getDimIdentityMap()) {
        // No need to generate an affine.apply.
        iv.replaceAllUsesWith(lbOperands[0]);
      } else {
        auto affineApplyOp =
            builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
        iv.replaceAllUsesWith(affineApplyOp);
      }
    }
  }

  replaceIterArgsAndYieldResults(forOp);

  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  forOp.getBody()->back().erase();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}

/// Generates an affine.for op with the specified lower and upper bounds
/// while generating the right IV remappings to realize shifts for operations in
/// its body. The operations that go into the loop body are specified in
/// opGroupQueue starting from the specified offset, and in that order. The
/// first element of the pair specifies the shift applied to that group of
/// operations; the shift is multiplied by the loop step before being applied.
/// Returns a null AffineForOp if the generated loop simplifies to a single
/// iteration one.
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) {
  auto lbOperands = srcForOp.getLowerBoundOperands();
  auto ubOperands = srcForOp.getUpperBoundOperands();

  assert(lbMap.getNumInputs() == lbOperands.size());
  assert(ubMap.getNumInputs() == ubOperands.size());

  auto loopChunk =
      b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
                            ubMap, srcForOp.getStepAsInt());
  auto loopChunkIV = loopChunk.getInductionVar();
  auto srcIV = srcForOp.getInductionVar();

  IRMapping operandMap;

  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
    uint64_t shift = it.first;
    auto ops = it.second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV uses remapped.
    // Generate the remapping if the shift is not zero: remappedIV = newIV -
    // shift.
    if (!srcIV.use_empty() && shift != 0) {
      auto ivRemap = bodyBuilder.create<AffineApplyOp>(
          srcForOp.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : ops)
      bodyBuilder.clone(*op, operandMap);
  }
  if (succeeded(promoteIfSingleIteration(loopChunk)))
    return AffineForOp();
  return loopChunk;
}

// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality, or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop distribution/fission.
// This method uses an algorithm that runs in time linear in the number of
// operations in the body of the for loop (the 'sweep line' paradigm). This
// method does not verify that SSA dominance is preserved; that check, as well
// as the check for memory-based dependence preservation, rests with the users
// of this method.
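//
// For example (a conceptual sketch; "opA" and "opB" are hypothetical ops, and
// the exact IR produced may differ): applying shifts = [0, 1] to
//   affine.for %i = 0 to 8 {
//     %v = "opA"(%i)
//     "opB"(%v, %i)
//   }
// produces, roughly, a prologue running "opA" for iteration 0, a steady-state
// loop over [1, 8) running "opB" for iteration %i - 1 followed by "opA" for
// iteration %i, and an epilogue running "opB" for iteration 7.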
LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
                                                ArrayRef<uint64_t> shifts,
                                                bool unrollPrologueEpilogue) {
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return success();

  // If the trip counts aren't constant, we would need versioning and
  // conditional guards (or context information to prevent such versioning). The
  // better way to pipeline for such loops is to first tile them and extract
  // constant trip count "full tiles" before applying this.
  auto mayBeConstTripCount = getConstantTripCount(forOp);
  if (!mayBeConstTripCount) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
    return success();
  }
  uint64_t tripCount = *mayBeConstTripCount;

  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStepAsInt();

  unsigned numChildOps = shifts.size();

  // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
  if (maxShift >= numChildOps) {
    // Large shifts are not the typical use case.
    forOp.emitWarning("not shifting because shifts are unrealistically large");
    return success();
  }

  // An array of operation groups sorted by shift amount; each group has all
  // operations with the same shift in the order in which they appear in the
  // body of the 'affine.for' op.
  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
  unsigned pos = 0;
  for (auto &op : forOp.getBody()->without_terminator()) {
    auto shift = shifts[pos++];
    sortedOpGroups[shift].push_back(&op);
  }

  // Unless the shifts have a specific pattern (which actually would be the
  // common use case), prologue and epilogue are not meaningfully defined.
  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
  // loop generated as the prologue and the last as epilogue and unroll these
  // fully.
  AffineForOp prologue, epilogue;

  // Do a sweep over the sorted shifts while storing open groups in a
  // vector, and generating loop portions as necessary during the sweep. A block
  // of operations is paired with its shift.
  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  OpBuilder b(forOp);
  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
    // If nothing is shifted by d, continue.
    if (sortedOpGroups[d].empty())
      continue;
    if (!opGroupQueue.empty()) {
      assert(d > 0 &&
             "Queue expected to be empty when the first block is found");
      // The interval for which the loop needs to be generated here is:
      // [lbShift, min(lbShift + tripCount, d)) and the body of the
      // loop needs to have all operations in opQueue in that order.
      AffineForOp res;
      if (lbShift + tripCount * step < d * step) {
        res = generateShiftedLoop(
            b.getShiftedAffineMap(origLbMap, lbShift),
            b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
            opGroupQueue, /*offset=*/0, forOp, b);
        // Entire loop for the queued op groups generated, empty it.
        opGroupQueue.clear();
        lbShift += tripCount * step;
      } else {
        res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                  b.getShiftedAffineMap(origLbMap, d),
                                  opGroupQueue, /*offset=*/0, forOp, b);
        lbShift = d * step;
      }

      if (res) {
        // Simplify/canonicalize the affine.for.
        RewritePatternSet patterns(res.getContext());
        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
        GreedyRewriteConfig config;
        config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
        bool erased;
        (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns),
                                     config, /*changed=*/nullptr, &erased);
        if (!erased && !prologue)
          prologue = res;
        if (!erased)
          epilogue = res;
      }
    } else {
      // Start of first interval.
      lbShift = d * step;
    }
    // Augment the list of operations that get into the current open interval.
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
  }

  // Those operations groups left in the queue now need to be processed (FIFO)
  // and their loops completed.
  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
    epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                   b.getShiftedAffineMap(origLbMap, ubShift),
                                   opGroupQueue, /*offset=*/i, forOp, b);
    lbShift = ubShift;
    if (!prologue)
      prologue = epilogue;
  }

  // Erase the original for op.
  forOp.erase();

  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);

  return success();
}

/// Checks whether a loop nest is hyper-rectangular or not.
static LogicalResult
checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
  FlatAffineValueConstraints cst;
  SmallVector<Operation *, 8> ops(input.begin(), input.end());
  // 0-d or 1-d is trivially hyper-rectangular.
  if (input.size() <= 1)
    return success();
  if (failed(getIndexSet(ops, &cst))) {
    LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
    return failure();
  }
  if (!cst.isHyperRectangular(0, input.size())) {
    LLVM_DEBUG(llvm::dbgs()
               << "Non-hyperrectangular nests not supported for tiling!\n");
    return failure();
  }
  return success();
}

/// Check if the input nest is supported for tiling and whether tiling would be
/// legal or not.
template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) {
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

  if (llvm::any_of(input,
                   [](AffineForOp op) { return op.getNumResults() > 0; })) {
    LLVM_DEBUG(llvm::dbgs()
               << "Cannot tile nest where a loop has yield values\n");
    return failure();
  }

  // Check if the supplied `for` ops are all successively nested.
  if (!isPerfectlyNested(input)) {
    LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
    return failure();
  }

  // TODO: handle non hyper-rectangular spaces.
  if (failed(checkIfHyperRectangular(input)))
    return failure();

  return success();
}

/// Move the loop body of AffineForOp 'src' from 'src' into the specified
/// location in destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) {
  auto &ops = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                         std::prev(ops.end()));
}

/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
/// body.
static void moveLoopBody(AffineForOp src, AffineForOp dest) {
  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
}

/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
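///
/// For example (a sketch): for a 2-d band (width = 2), this creates the
/// four-deep structure below, with all bounds still unset at this point:
///   affine.for %i0 (tile-space loop for the outer original loop)
///     affine.for %j0 (tile-space loop for the inner original loop)
///       affine.for %i1 (intra-tile, or point, loop for the outer loop)
///         affine.for %j1 (intra-tile, or point, loop for the inner loop)
///           <original loop body>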
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                                   AffineForOp rootAffineForOp, unsigned width,
                                   MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  // The outermost among the loops as we add more.
  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add tile space loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }

  // Move the loop body of the original nest to the new one.
  moveLoopBody(origLoops.back(), innermostPointLoop);
}

/// Set lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) {
  // The lower bound for the intra-tile loop is represented by an affine map
  // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
  // bound for the intra-tile loop is represented by an affine map as
  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb),
  // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound, and
  // %origLoopStep is the step of the original loop.
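  //
  // For example (a sketch): with %origlb = 0 and a unit step, these reduce to
  // the lb map (%i, %t0) -> (%i * %t0) and the ub map
  // (%i, %t0) -> (%i * %t0 + %t0), i.e., the intra-tile loop sweeps exactly
  // one tile of size %t0 starting at %i * %t0, where %i is the tile index.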

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original lower/upper bound.
  SmallVector<Value, 4> lbOperands, ubOperands;
  AffineBound lb = origLoop.getLowerBound();
  AffineBound ub = origLoop.getUpperBound();
  lbOperands.reserve(lb.getNumOperands() + 2);
  ubOperands.reserve(ub.getNumOperands() + 2);
  AffineMap origLbMap = lb.getMap();
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(j));
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
  // IV.
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  // Get loop IV as an affine expression for lower/upper bound. Size of
  // lb/ubOperands is guaranteed to be at least one.
  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);

  // Add symbol operands from original lower/upper bound.
  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  SmallVector<AffineExpr, 4> lbBoundExprs;
  SmallVector<AffineExpr, 4> ubBoundExprs;
  lbBoundExprs.reserve(origLbMap.getNumResults());
  ubBoundExprs.reserve(origUbMap.getNumResults());

  // Get tiling parameter as an affine expression for lb/ub.
  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Get the origLoopStep as an affine expression.
  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());

  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
  // (tilingParameter * origLoopStep) + origlb.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  AffineMap lbMap =
      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
                     lbBoundExprs, b.getContext());
  newIntraTileLoop.setLowerBound(lbOperands, lbMap);

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
                     ubBoundExprs, b.getContext());
  newIntraTileLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newIntraTileLoop.setStep(origLoop.getStepAsInt());
}

/// Set lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  // The lower bounds for inter-tile loops are the same as the corresponding
  // lower bounds of the original loops.
  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For example, if the original ubmap was
  // () -> (1024), the new map will be
  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original upper bound.
  SmallVector<Value, 4> ubOperands;
  AffineBound ub = origLoop.getUpperBound();
  ubOperands.reserve(ub.getNumOperands() + 1);
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add symbol operands from original upper bound.
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  ubOperands.push_back(tileSize);

  // Get tiling parameter as an affine expression.
  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  SmallVector<AffineExpr, 4> boundExprs;
  boundExprs.reserve(origUbMap.getNumResults());
  int64_t origUpperBound;
  AffineExpr origUpperBoundExpr;

  // If the upper bound for the original loop is constant, then the constant
  // can be obtained as an affine expression straight away.
  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();

    // Get original constant upper bound as an affine expression.
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);

    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  } else {
    // If the upper bound for the original loop is not constant, then two cases
    // are possible, although their handling is the same. 1.) The result of
    // ubmap has only one result expression. For e.g.
    //   affine.for %i = 5 to %ub
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
    // where 's0' is the original upper bound and 's1' is the tiling
    // parameter. 2.) When ubMap has more than one result expression. For e.g.
    //   #map0 = affine_map<()[s0, s1] -> (s0, s1)>
    //   affine.for %i = 5 to min #map0()[%s0, %s1]
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
    // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.

    // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
      boundExprs.push_back(
          origLowerBoundExpr +
          (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  }

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
  newLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newLoop.setStep(origLoop.getStepAsInt());
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
    MutableArrayRef<AffineForOp> origLoops,
    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
  assert(origLoops.size() == tileSizes.size() &&
         "expected tiling parameter for each loop in band.");

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Set bounds for tile space loops.
  for (unsigned i = 0; i < width; ++i) {
    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
  }

  // Set bounds for intra-tile loops.
  for (unsigned i = 0; i < width; ++i) {
    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                                 newLoops[i + width], tileSizes[i]);
  }
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
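///
/// For example (a sketch, written informally): tiling
///   affine.for %i = 0 to 100 step 2
/// with tile size 8 yields a tile-space loop of step 2 * 8 = 16 and an
/// intra-tile loop bounded by a min with the original upper bound (the trip
/// count 50 is not a multiple of 8):
///   affine.for %i0 = 0 to 100 step 16 {
///     affine.for %i1 = %i0 to min(%i0 + 16, 100) step 2 { ... }
///   }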
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) {
  assert(!origLoops.empty());
  assert(origLoops.size() == tileSizes.size());

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Bounds for tile space loops.
  for (unsigned i = 0; i < width; i++) {
    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
    // If the step size of the original loop is x and the tile size is y, then
    // after tiling the tile space loop's step size becomes x*y.
    newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
  }
  // Bounds for intra-tile loops.
  for (unsigned i = 0; i < width; i++) {
    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
    std::optional<uint64_t> mayBeConstantCount =
        getConstantTripCount(origLoops[i]);
    // The lower bound is just the tile-space loop.
    AffineMap lbMap = b.getDimIdentityMap();
    newLoops[width + i].setLowerBound(
        /*operands=*/newLoops[i].getInductionVar(), lbMap);
    // The step size of an intra-tile loop is just the original loop's step
    // size.
    newLoops[width + i].setStep(origLoops[i].getStepAsInt());

    // Set the upper bound.
    if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
      // Trip count is less than the tile size: upper bound is lower bound +
      // trip count * stepSize.
      AffineMap ubMap = b.getSingleDimShiftAffineMap(
          *mayBeConstantCount * origLoops[i].getStepAsInt());
      newLoops[width + i].setUpperBound(
          /*operands=*/newLoops[i].getInductionVar(), ubMap);
    } else if (largestDiv % tileSizes[i] != 0) {
      // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
      // Construct the upper bound map; the operands are the original operands
      // with 'i' (tile-space loop) appended to it. The new upper bound map is
      // the original one with an additional expression i + tileSize * stepSize
      // appended.

      // Add dim operands from original upper bound.
      SmallVector<Value, 4> ubOperands;
      AffineBound ub = origLoops[i].getUpperBound();
      ubOperands.reserve(ub.getNumOperands() + 1);
      AffineMap origUbMap = ub.getMap();
      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(j));

      // Add dim operand for new loop upper bound.
      ubOperands.push_back(newLoops[i].getInductionVar());

      // Add symbol operands from original upper bound.
      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

      SmallVector<AffineExpr, 4> boundExprs;
      boundExprs.reserve(1 + origUbMap.getNumResults());
      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
      // The new upper bound map is the original one with an additional
      // expression i + tileSize * stepSize (of original loop) appended.
      boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      AffineMap ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
    } else {
      // No need for the min expression.
      AffineExpr dim = b.getAffineDimExpr(0);
      AffineMap ubMap = AffineMap::get(
          1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    }
  }
}

LogicalResult
mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                                  ArrayRef<unsigned> tileSizes,
                                  SmallVectorImpl<AffineForOp> *tiledNest) {
  if (input.empty())
    return success();

  if (failed(performPreTilingChecks(input, tileSizes)))
    return failure();

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];

  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting their bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  // Set loop bounds for the tiled loop nest.
  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult mlir::affine::tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest) {
  if (input.empty())
    return success();

  if (failed(performPreTilingChecks(input, tileSizes)))
    return failure();

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting their bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  // Set loop bounds for the tiled loop nest.
  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
                                                tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Get the perfectly nested sequence of loops starting at the root of the loop
/// nest. A loop is perfectly nested iff the first op in the loop's body is
/// another AffineForOp and the second op is a terminator.
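///
/// For example (a sketch): for the nest
///   affine.for %i { affine.for %j { affine.for %k { <body> } } }
/// this collects the %i, %j, and %k loops; if the %j loop's body held a second
/// operation next to the %k loop, collection would stop after the %i and %j
/// loops.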
void mlir::affine::getPerfectlyNestedLoops(
    SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
    nestedLoops.push_back(root);
    Block &body = root.getRegion().front();
    if (body.begin() != std::prev(body.end(), 2))
      return;

    root = dyn_cast<AffineForOp>(&body.front());
    if (!root)
      return;
  }
}

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void mlir::affine::getTileableBands(
    func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
  // Get maximal perfect nest of 'affine.for' ops starting from root
  // (inclusive).
  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
    SmallVector<AffineForOp, 6> band;
    getPerfectlyNestedLoops(band, forOp);
    bands->push_back(band);
  }
}

/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value()) {
    uint64_t tripCount = *mayBeConstantTripCount;
    if (tripCount == 0)
      return success();
    if (tripCount == 1)
      return promoteIfSingleIteration(forOp);
    return loopUnrollByFactor(forOp, tripCount);
  }
  return failure();
}

/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
                                                 uint64_t unrollFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollFactor)
    return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollByFactor(forOp, unrollFactor);
}

/// Generates unrolled copies of the body of 'loopBodyBlock', whose associated
/// induction variable is 'forOpIV', by 'unrollFactor', calling 'ivRemapFn' to
/// remap 'forOpIV' for each unrolled body. If specified, annotates the Ops in
/// each unrolled iteration using annotateFn.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    IRMapping operandMap;

    // Prepare operand map.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values. If the yielded value is defined outside the
    // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
    // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
    // value with the clone corresponding to the yielded value.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
      Operation *defOp = yieldedValues[i].getDefiningOp();
      if (defOp && defOp->getBlock() == loopBodyBlock)
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
    }
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}

/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
/// count is not a multiple of `unrollFactor`.
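///
/// For example (a sketch): when unrolling
///   affine.for %i = 0 to 13
/// by a factor of 4, the trip count 13 is not a multiple of 4, so the original
/// loop is adjusted to cover [0, 12) with step 4 (four body copies per step)
/// and the cleanup loop
///   affine.for %i = 12 to 13
/// covers the remaining iteration.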
LogicalResult mlir::affine::generateCleanupLoopForUnroll(AffineForOp forOp,
                                                         uint64_t unrollFactor) {
  // Insert the cleanup loop right after 'forOp'.
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
  // and produce results for the original users of `forOp` results.
  auto results = forOp.getResults();
  auto cleanupResults = cleanupForOp.getResults();
  auto cleanupIterOperands = cleanupForOp.getInits();

  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
  }

  AffineMap cleanupMap;
  SmallVector<Value, 4> cleanupOperands;
  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
  if (!cleanupMap)
    return failure();

  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(cleanupForOp);

  // Adjust the upper bound of the original loop; this is the same as the lower
  // bound of the cleanup loop.
  forOp.setUpperBound(cleanupOperands, cleanupMap);
  return success();
}

/// Unrolls this loop by the specified factor. Returns success if the loop
/// is successfully unrolled.
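///
/// For example (a sketch; "opS" is a hypothetical op): unrolling
///   affine.for %i = 0 to 8 {
///     "opS"(%i) : (index) -> ()
///   }
/// by a factor of 2 yields
///   affine.for %i = 0 to 8 step 2 {
///     "opS"(%i) : (index) -> ()
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "opS"(%i1) : (index) -> ()
///   }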
LogicalResult mlir::affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) {
  assert(unrollFactor > 0 && "unroll factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll factor, no unrolled body.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
    if (cleanUpUnroll) {
      // Unroll the cleanup loop if cleanUpUnroll is specified.
      return loopUnrollFull(forOp);
    }

    return failure();
  }

  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count doesn't divide the unroll factor
    // can't be unrolled since the lower bound of the cleanup loop in such cases
    // cannot be expressed as an affine function or a max over affine functions.
    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
        forOp.getUpperBoundMap().getNumResults() != 1)
      return failure();
    if (cleanUpUnroll)
      // Force unroll including the cleanup loop.
      return loopUnrollFull(forOp);
    if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
      assert(false && "cleanup loop lower bound map for single result lower "
                      "and upper bound maps can always be determined");
  }

  ValueRange iterArgs(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Scale the step of the loop being unrolled by the unroll factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollFactor);
  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + i * step
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
      },
      /*annotateFn=*/annotateFn,
      /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}

LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
                                                    uint64_t unrollJamFactor) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.has_value() &&
      *mayBeConstantTripCount < unrollJamFactor)
    return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
  return loopUnrollJamByFactor(forOp, unrollJamFactor);
}

/// Check if all control operands of all loops are defined outside of `forOp`
/// and return false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) {
  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
    for (auto controlOperand : aForOp.getControlOperands()) {
      if (!forOp.isDefinedOutsideOfLoop(controlOperand))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  });
  return !walkResult.wasInterrupted();
}

// Gathers all maximal sub-blocks of operations that do not themselves
// include a for op (an operation could have a descendant for op though
// in its tree). Ignores the block terminators.
struct JamBlockGatherer {
  // Store iterators to the first and last op of each sub-block found.
  std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;

  // This is a linear time walk.
  void walk(Operation *op) {
    for (auto &region : op->getRegions())
      for (auto &block : region)
        walk(block);
  }

  void walk(Block &block) {
    for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
      auto subBlockStart = it;
      while (it != e && !isa<AffineForOp>(&*it))
        ++it;
      if (it != subBlockStart)
        subBlocks.emplace_back(subBlockStart, std::prev(it));
      // Process all for ops that appear next.
      while (it != e && isa<AffineForOp>(&*it))
        walk(&*it++);
    }
  }
};

/// Unrolls and jams this loop by the specified factor.
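///
/// For example (a conceptual sketch; "opS" is a hypothetical op): unroll and
/// jam of
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 { "opS"(%i, %j) : (index, index) -> () }
///   }
/// by a factor of 2 roughly yields
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to 8 {
///       "opS"(%i, %j) : (index, index) -> ()
///       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///       "opS"(%i1, %j) : (index, index) -> ()
///     }
///   }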
LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
                                                  uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (unrollJamFactor == 1) {
    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
        failed(promoteIfSingleIteration(forOp)))
      return failure();
    return success();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // If the trip count is lower than the unroll jam factor, no unroll jam.
  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
    LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
    return failure();
  }

  // If any control operand of any inner loop of `forOp` is defined within
  // `forOp`, no unroll jam.
  if (!areInnerBoundsInvariant(forOp))
    return failure();

  // Gather all sub-blocks to jam upon the loop being unrolled.
  JamBlockGatherer jbg;
  jbg.walk(forOp);
  auto &subBlocks = jbg.subBlocks;

  // Collect loops with iter_args.
  SmallVector<AffineForOp, 4> loopsWithIterArgs;
  forOp.walk([&](AffineForOp aForOp) {
    if (aForOp.getNumIterOperands() > 0)
      loopsWithIterArgs.push_back(aForOp);
  });

  // Get supported reductions to be used for creating reduction ops at the end.
  SmallVector<LoopReduction> reductions;
  if (forOp.getNumIterOperands() > 0)
    getSupportedReductions(forOp, reductions);

  // Generate the cleanup loop if trip count isn't a multiple of
  // unrollJamFactor.
  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count doesn't divide the unroll factor
    // can't be unrolled since the lower bound of the cleanup loop in such cases
    // cannot be expressed as an affine function or a max over affine functions.
    if (forOp.getLowerBoundMap().getNumResults() != 1 ||
        forOp.getUpperBoundMap().getNumResults() != 1)
      return failure();
    if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
      assert(false && "cleanup loop lower bound map for single result lower "
                      "and upper bound maps can always be determined");
  }

  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
  // iteration. There are (`unrollJamFactor` - 1) iterations.
  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);

  // For any loop with iter_args, replace it with a new loop that has
  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
  // operands.
  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
  IRRewriter rewriter(forOp.getContext());
  for (AffineForOp oldForOp : loopsWithIterArgs) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    ValueRange oldYieldOperands =
        cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Get additional iterOperands, iterArgs, and yield operands. We will
    // fix iterOperands and yield operands after cloning of sub-blocks.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with additional iterOperands, iter_args and yield
    // operands. This new loop will take the loop body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    AffineForOp newForOp =
        cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newLoopsWithIterArgs.push_back(newForOp);
    // `forOp` has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;
    // Update `operandMaps` for `newForOp` iterArgs and results.
    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
        // results. Update `operandMaps[i - 1]` to map old iterArgs and results
        // to those in the `i`th new set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }

  // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
  int64_t step = forOp.getStepAsInt();
  forOp.setStep(step * unrollJamFactor);

  auto forOpIV = forOp.getInductionVar();
  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies. Insert right at the end of
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, create a remapping to the value for
      // this unrolled instance.
      if (!forOpIV.use_empty()) {
        // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
        auto d0 = builder.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        auto ivUnroll =
            builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
        operandMaps[i - 1].map(forOpIV, ivUnroll);
      }
      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iterOperands and yield op operands of newly created loops.
    for (auto newForOp : newLoopsWithIterArgs) {
      unsigned oldNumIterOperands =
          newForOp.getNumIterOperands() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The `i`th duplication of an old iterOperand or yield op operand
        // needs to be replaced with a mapped value from `operandMaps[i - 1]`
        // if such mapped value exists.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }
  if (forOp.getNumResults() > 0) {
    // Create reduction ops to combine every `unrollJamFactor` related results
    // into one value. For example, for %0:2 = affine.for ... and addf, we add
    // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
    // %1.
    rewriter.setInsertionPointAfter(forOp);
    auto loc = forOp.getLoc();
    unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
    for (LoopReduction &reduction : reductions) {
      unsigned pos = reduction.iterArgPosition;
      Value lhs = forOp.getResult(pos);
      Value rhs;
      SmallPtrSet<Operation *, 4> newOps;
      for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
        rhs = forOp.getResult(i * oldNumResults + pos);
        // Create ops based on reduction type.
        lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
        if (!lhs)
          return failure();
        Operation *op = lhs.getDefiningOp();
        assert(op && "Reduction op should have been created");
        newOps.insert(op);
      }
      // Replace all uses except those in newly created reduction ops.
      forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}

/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
/// nested within 'forOpA' as the only non-terminator operation in its block.
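///
/// For example (a sketch; "opS" is a hypothetical op): interchanging
///   affine.for %i { affine.for %j { "opS"(%i, %j) } }
/// produces
///   affine.for %j { affine.for %i { "opS"(%i, %j) } }
/// with each loop keeping its own bounds and step.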
void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
  auto &forOpABody = forOpA.getBody()->getOperations();
  auto &forOpBBody = forOpB.getBody()->getOperations();

  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA (in forOpA's parent's block); this should leave forOpA's
  // body containing only the terminator.
  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
                                             forOpABody, forOpABody.begin(),
                                             std::prev(forOpABody.end()));
  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
  // body (this leaves forOpB's body containing only the terminator).
  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
                    std::prev(forOpBBody.end()));
  // 3) Splice forOpA into the beginning of forOpB's body.
  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
                    Block::iterator(forOpA));
}

// Checks each dependence component against the permutation to see if the
// desired loop interchange would violate dependences by making the
// dependence component lexicographically negative.
static bool checkLoopInterchangeDependences(
    const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
  // Invert permutation map.
  unsigned maxLoopDepth = loops.size();
  SmallVector<unsigned, 4> loopPermMapInv;
  loopPermMapInv.resize(maxLoopDepth);
  for (unsigned i = 0; i < maxLoopDepth; ++i)
    loopPermMapInv[loopPermMap[i]] = i;

  // Check each dependence component against the permutation to see if the
  // desired loop interchange permutation would make the dependence vectors
  // lexicographically negative.
  // Example 1: [-1, 1][0, 0]
  // Example 2: [0, 0][-1, 1]
  for (const auto &depComps : depCompsVec) {
    assert(depComps.size() >= maxLoopDepth);
    // Check if the first non-zero dependence component is positive.
    // This iterates through loops in the desired order.
    for (unsigned j = 0; j < maxLoopDepth; ++j) {
      unsigned permIndex = loopPermMapInv[j];
      assert(depComps[permIndex].lb);
      int64_t depCompLb = *depComps[permIndex].lb;
      if (depCompLb > 0)
        break;
      if (depCompLb < 0)
        return false;
    }
  }
  return true;
}

1385 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1386 /// nested sequence of loops in 'loops' would violate dependences.
1387 bool mlir::affine::isValidLoopInterchangePermutation(
1388  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1389  // Gather dependence components for dependences between all ops in loop nest
1390  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1391  assert(loopPermMap.size() == loops.size());
1392  unsigned maxLoopDepth = loops.size();
1393  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1394  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1395  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1396 }
1397 
1398 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1399 /// in it from outermost to innermost.
1400 bool LLVM_ATTRIBUTE_UNUSED
1401 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1402  assert(!loops.empty() && "no loops provided");
1403 
1404  // We already know that the block can't be empty.
1405  auto hasTwoElements = [](Block *block) {
1406  auto secondOpIt = std::next(block->begin());
1407  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1408  };
1409 
1410  auto enclosingLoop = loops.front();
1411  for (auto loop : loops.drop_front()) {
1412  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1413  // parentForOp's body should be just this loop and the terminator.
1414  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1415  return false;
1416  enclosingLoop = loop;
1417  }
1418  return true;
1419 }
1420 
1421 // input[i] should move from position i -> permMap[i]. Returns the position in
1422 // `input` that becomes the new outermost loop.
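// Ex. (sketch): with input = {%i, %j, %k} and permMap = {2, 0, 1}, %i moves
// innermost and the permuted nest is (%j, %k, %i) from outermost to
// innermost; the function returns 1, since input[1] (%j) becomes the new
// outermost loop.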
1423 unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1424  ArrayRef<unsigned> permMap) {
1425  assert(input.size() == permMap.size() && "invalid permutation map size");
1426  // Check whether the permutation spec is valid. This is a small vector - we'll
1427  // just sort and check if it's iota.
1428  SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1429  llvm::sort(checkPermMap);
1430  if (llvm::any_of(llvm::enumerate(checkPermMap),
1431  [](const auto &en) { return en.value() != en.index(); }))
1432  assert(false && "invalid permutation map");
1433 
1434  // Nothing to do.
1435  if (input.size() < 2)
1436  return 0;
1437 
1438  assert(isPerfectlyNested(input) && "input not perfectly nested");
1439 
1440  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1441  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1442  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1443  for (unsigned i = 0, e = input.size(); i < e; ++i)
1444  invPermMap.push_back({permMap[i], i});
1445  llvm::sort(invPermMap);
1446 
1447  // Move the innermost loop body to the loop that would be the innermost in the
1448  // permuted nest (only if the innermost loop is going to change).
1449  if (permMap.back() != input.size() - 1) {
1450  auto *destBody = input[invPermMap.back().second].getBody();
1451  auto *srcBody = input.back().getBody();
1452  destBody->getOperations().splice(destBody->begin(),
1453  srcBody->getOperations(), srcBody->begin(),
1454  std::prev(srcBody->end()));
1455  }
1456 
1457  // We'll move each loop in `input` in the reverse order so that its body is
1458  // empty when we are moving it; this incurs zero copies and no erasing.
1459  for (int i = input.size() - 1; i >= 0; --i) {
1460  // If this has to become the outermost loop after permutation, add it to the
1461  // parent block of the original root.
1462  if (permMap[i] == 0) {
1463  // If the root remains the same, nothing to do.
1464  if (i == 0)
1465  continue;
1466  // Make input[i] the new outermost loop moving it into parentBlock.
1467  auto *parentBlock = input[0]->getBlock();
1468  parentBlock->getOperations().splice(Block::iterator(input[0]),
1469  input[i]->getBlock()->getOperations(),
1470  Block::iterator(input[i]));
1471  continue;
1472  }
1473 
1474  // If the parent in the permuted order is the same as in the original,
1475  // nothing to do.
1476  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1477  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1478  continue;
1479 
1480  // Move input[i] to its surrounding loop in the transformed nest.
1481  auto *destBody = input[parentPosInInput].getBody();
1482  destBody->getOperations().splice(destBody->begin(),
1483  input[i]->getBlock()->getOperations(),
1484  Block::iterator(input[i]));
1485  }
1486 
1487  return invPermMap[0].second;
1488 }
1489 
1490 // Sinks all sequential loops to the innermost levels (while preserving
1491 // relative order among them) and moves all parallel loops to the
1492 // outermost levels (while again preserving relative order among them).
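// Ex. (sketch): for a 2-d nest where the outer loop carries a dependence and
// the inner loop is parallel, the computed permutation is {1, 0}, i.e., the
// parallel loop is moved outermost, provided dependences permit the
// interchange.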
1493 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1494  SmallVector<AffineForOp, 4> loops;
1495  getPerfectlyNestedLoops(loops, forOp);
1496  if (loops.size() < 2)
1497  return forOp;
1498 
1499  // Gather dependence components for dependences between all ops in loop nest
1500  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1501  unsigned maxLoopDepth = loops.size();
1502  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1503  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1504 
1505  // Mark loops as either parallel or sequential.
1506  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1507  for (auto &depComps : depCompsVec) {
1508  assert(depComps.size() >= maxLoopDepth);
1509  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1510  DependenceComponent &depComp = depComps[j];
1511  assert(depComp.lb.has_value() && depComp.ub.has_value());
1512  if (*depComp.lb != 0 || *depComp.ub != 0)
1513  isParallelLoop[j] = false;
1514  }
1515  }
1516 
1517  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1518 
1519  // Compute permutation of loops that sinks sequential loops (and thus raises
1520  // parallel loops) while preserving relative order.
1521  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1522  unsigned nextSequentialLoop = numParallelLoops;
1523  unsigned nextParallelLoop = 0;
1524  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1525  if (isParallelLoop[i]) {
1526  loopPermMap[i] = nextParallelLoop++;
1527  } else {
1528  loopPermMap[i] = nextSequentialLoop++;
1529  }
1530  }
1531 
1532  // Check if permutation 'loopPermMap' would violate dependences.
1533  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1534  return forOp;
1535  // Perform loop interchange according to permutation 'loopPermMap'.
1536  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1537  return loops[loopNestRootIndex];
1538 }
1539 
1540 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1541 // lower (resp. upper) loop bound. When called for both the lower and upper
1542 // bounds, the resulting IR resembles:
1543 //
1544 // ```mlir
1545 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1546 // ...
1547 // }
1548 // ```
1549 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1550  SmallVector<Value, 4> *operands,
1551  int64_t offset = 0) {
1552  auto bounds = llvm::to_vector<4>(map->getResults());
1553  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1554  operands->insert(operands->begin() + map->getNumDims(), iv);
1555  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1556  b.getContext());
1557  canonicalizeMapAndOperands(map, operands);
1558 }
1559 
1560 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1561 // Stripmine-sink is a primitive building block for generalized tiling of
1562 // imperfectly nested loops.
1563 // This transformation is purely mechanical and does not check legality,
1564 // profitability or even structural correctness. It is the user's
1565 // responsibility to specify `targets` that are dominated by `forOp`.
1566 // Returns the new AffineForOps, one per `targets`, nested immediately under
1567 // each of the `targets`.
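// Ex. (sketch; the loops shown are hypothetical): stripmineSink(%i-loop,
// /*factor=*/32, {%j-loop}) on
//
//   affine.for %i = 0 to 1024 {
//     affine.for %j = ... { ... }   // target
//   }
//
// scales the %i loop's step to 32 and creates, right inside the %j loop, a
// new loop from max(0, %i) to min(1024, %i + 32) with the original step,
// which takes over the target's body.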
1568 static SmallVector<AffineForOp, 8>
1569 stripmineSink(AffineForOp forOp, uint64_t factor,
1570  ArrayRef<AffineForOp> targets) {
1571  auto originalStep = forOp.getStepAsInt();
1572  auto scaledStep = originalStep * factor;
1573  forOp.setStep(scaledStep);
1574 
1575  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1576 
1577  // Lower-bound map creation.
1578  auto lbMap = forOp.getLowerBoundMap();
1579  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1580  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1581 
1582  // Upper-bound map creation.
1583  auto ubMap = forOp.getUpperBoundMap();
1584  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1585  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1586  /*offset=*/scaledStep);
1587 
1588  auto iv = forOp.getInductionVar();
1589  SmallVector<AffineForOp, 8> innerLoops;
1590  for (auto t : targets) {
1591  // Insert newForOp before the terminator of `t`.
1592  auto b = OpBuilder::atBlockTerminator(t.getBody());
1593  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1594  ubOperands, ubMap, originalStep);
1595  auto begin = t.getBody()->begin();
1596  // Skip terminator and `newForOp` which is just before the terminator.
1597  auto nOps = t.getBody()->getOperations().size() - 2;
1598  newForOp.getBody()->getOperations().splice(
1599  newForOp.getBody()->getOperations().begin(),
1600  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1601  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1602  newForOp.getRegion());
1603  innerLoops.push_back(newForOp);
1604  }
1605 
1606  return innerLoops;
1607 }
1608 
1609 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1610 // Returns the new AffineForOps, nested immediately under `target`.
1611 template <typename SizeType>
1612 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1613  AffineForOp target) {
1614  // TODO: Use cheap structural assertions that targets are nested under
1615  // forOp and that targets are not nested under each other when DominanceInfo
1616  // exposes the capability. It seems overkill to construct a whole function
1617  // dominance tree at this point.
1618  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1619  assert(res.size() == 1 && "Expected 1 inner forOp");
1620  return res[0];
1621 }
1622 
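// Tiles the band of loops `forOps` by the given `sizes`, sinking the
// stripmined point loops under each of the current `targets`; returns, per
// band loop, the new inner loops created (one per target).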
1623 SmallVector<SmallVector<AffineForOp, 8>, 8>
1624 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1625  ArrayRef<AffineForOp> targets) {
1626  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1627  SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
1628  for (auto it : llvm::zip(forOps, sizes)) {
1629  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1630  res.push_back(step);
1631  currentTargets = step;
1632  }
1633  return res;
1634 }
1635 
1636 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1637  ArrayRef<uint64_t> sizes,
1638  AffineForOp target) {
1639  SmallVector<AffineForOp, 8> res;
1640  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1641  assert(loops.size() == 1);
1642  res.push_back(loops[0]);
1643  }
1644  return res;
1645 }
1646 
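// Ex. (sketch): coalescing a normalized 2-d nest with trip counts %M and %N
// yields a single loop of trip count %M * %N whose body recovers the
// original IVs with affine.apply ops along the lines of:
//   %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%N]
//   %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%N]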
1647 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1648  if (loops.size() < 2)
1649  return success();
1650 
1651  AffineForOp innermost = loops.back();
1652  AffineForOp outermost = loops.front();
1653  AffineBound ub = outermost.getUpperBound();
1654  AffineMap origUbMap = ub.getMap();
1655  Location loc = outermost.getLoc();
1656  OpBuilder builder(outermost);
1657  for (AffineForOp loop : loops) {
1658  // We only work on normalized loops.
1659  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1660  loop.getConstantLowerBound() != 0)
1661  return failure();
1662  }
1663  SmallVector<Value, 4> upperBoundSymbols;
1664  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1665  ub.getOperands().end());
1666 
1667  // 1. Store the upper bound of the outermost loop in a variable.
1668  Value prev;
1669  if (!llvm::hasSingleElement(origUbMap.getResults()))
1670  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1671  else
1672  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1673  upperBoundSymbols.push_back(prev);
1674 
1675  // 2. Emit code computing the upper bound of the coalesced loop as the product
1676  // the number of iterations of all loops.
1677  for (AffineForOp loop : loops.drop_front()) {
1678  ub = loop.getUpperBound();
1679  origUbMap = ub.getMap();
1680  ubOperands = ub.getOperands();
1681  Value upperBound;
1682  // If upper bound map has more than one result, take their minimum.
1683  if (!llvm::hasSingleElement(origUbMap.getResults()))
1684  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1685  else
1686  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1687  upperBoundSymbols.push_back(upperBound);
1688  SmallVector<Value, 4> operands;
1689  operands.push_back(prev);
1690  operands.push_back(upperBound);
1691  // Maintain running product of loop upper bounds.
1692  prev = builder.create<AffineApplyOp>(
1693  loc,
1694  AffineMap::get(/*dimCount=*/1,
1695  /*symbolCount=*/1,
1696  builder.getAffineDimExpr(0) *
1697  builder.getAffineSymbolExpr(0)),
1698  operands);
1699  }
1700  // Set upper bound of the coalesced loop.
1701  AffineMap newUbMap = AffineMap::get(
1702  /*dimCount=*/0,
1703  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1704  outermost.setUpperBound(prev, newUbMap);
1705 
1706  builder.setInsertionPointToStart(outermost.getBody());
1707 
1708  // 3. Remap induction variables. For each original loop, the value of the
1709  // induction variable can be obtained by dividing the induction variable of
1710  // the linearized loop by the total number of iterations of the loops nested
1711  // in it modulo the number of iterations in this loop (remove the values
1712  // related to the outer loops):
1713  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1714  // Compute these iteratively from the innermost loop by creating a "running
1715  // quotient" of division by the range.
1716  Value previous = outermost.getInductionVar();
1717  for (unsigned idx = loops.size(); idx > 0; --idx) {
1718  if (idx != loops.size()) {
1719  SmallVector<Value, 4> operands;
1720  operands.push_back(previous);
1721  operands.push_back(upperBoundSymbols[idx]);
1722  previous = builder.create<AffineApplyOp>(
1723  loc,
1724  AffineMap::get(
1725  /*dimCount=*/1, /*symbolCount=*/1,
1726  builder.getAffineDimExpr(0).floorDiv(
1727  builder.getAffineSymbolExpr(0))),
1728  operands);
1729  }
1730  // Modified value of the induction variables of the nested loops after
1731  // coalescing.
1732  Value inductionVariable;
1733  if (idx == 1) {
1734  inductionVariable = previous;
1735  } else {
1736  SmallVector<Value, 4> applyOperands;
1737  applyOperands.push_back(previous);
1738  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1739  inductionVariable = builder.create<AffineApplyOp>(
1740  loc,
1741  AffineMap::get(
1742  /*dimCount=*/1, /*symbolCount=*/1,
1743  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1744  applyOperands);
1745  }
1746  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1747  inductionVariable, loops.back().getRegion());
1748  }
1749 
1750  // 4. Move the operations from the innermost just above the second-outermost
1751  // loop, delete the extra terminator and the second-outermost loop.
1752  AffineForOp secondOutermostLoop = loops[1];
1753  innermost.getBody()->back().erase();
1754  outermost.getBody()->getOperations().splice(
1755  Block::iterator(secondOutermostLoop.getOperation()),
1756  innermost.getBody()->getOperations());
1757  secondOutermostLoop.erase();
1758  return success();
1759 }
1760 
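// Distributes the iterations of `forOp` cyclically over a processor grid:
// the lower bound is bumped by the linearized processor id scaled by the
// step, and the step is scaled by the total processor count. Ex. (sketch)
// for a 1-d grid: %lb' = %lb + %procId * %step and %step' = %step *
// %numProcs, so processor p executes iterations p, p + %numProcs, ...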
1761 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1762  ArrayRef<Value> processorId,
1763  ArrayRef<Value> numProcessors) {
1764  assert(processorId.size() == numProcessors.size());
1765  if (processorId.empty())
1766  return;
1767 
1768  OpBuilder b(forOp);
1769  Location loc(forOp.getLoc());
1770  AffineExpr lhs, rhs;
1771  bindSymbols(forOp.getContext(), lhs, rhs);
1772  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1773  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1774 
1775  Value linearIndex = processorId.front();
1776  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1777  auto mulApplyOp = b.create<AffineApplyOp>(
1778  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1779  linearIndex = b.create<AffineApplyOp>(
1780  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1781  }
1782 
1783  auto mulApplyOp = b.create<AffineApplyOp>(
1784  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1785  Value lb = b.create<AffineApplyOp>(
1786  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1787  forOp.setLowerBound(lb);
1788 
1789  Value step = forOp.getStep();
1790  for (auto numProcs : numProcessors)
1791  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1792  forOp.setStep(step);
1793 }
1794 
1795 /// Given a memref region, determine the lowest depth at which transfers can be
1796 /// placed for it, and return the corresponding block, start and end positions
1797 /// in the block for placing incoming (read) and outgoing (write) copies
1798 /// respectively. The lowest depth depends on whether the region being accessed
1799 /// is hoistable with respect to one or more immediately surrounding loops.
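/// Ex. (sketch): if the region is invariant in the immediately surrounding
/// loop %j (e.g., it only depends on the outer IV %i), the placement is
/// hoisted so that the copy-in lands just before the %j loop and the
/// copy-out just after it, inside the %i loop.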
1800 static void
1801 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1802  Block::iterator &begin, Block::iterator &end,
1803  Block **copyPlacementBlock,
1804  Block::iterator *copyInPlacementStart,
1805  Block::iterator *copyOutPlacementStart) {
1806  const auto *cst = region.getConstraints();
1807  SmallVector<Value, 4> symbols;
1808  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1809 
1810  SmallVector<AffineForOp, 4> enclosingFors;
1811  getAffineForIVs(*block.begin(), &enclosingFors);
1812  // Walk up loop parents till we find an IV on which this region is
1813  // symbolic/variant.
1814  auto it = enclosingFors.rbegin();
1815  for (auto e = enclosingFors.rend(); it != e; ++it) {
1816  // TODO: also need to be checking this for region symbols that
1817  // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1818  if (llvm::is_contained(symbols, it->getInductionVar()))
1819  break;
1820  }
1821 
1822  if (it != enclosingFors.rbegin()) {
1823  auto lastInvariantIV = *std::prev(it);
1824  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1825  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1826  *copyPlacementBlock = lastInvariantIV->getBlock();
1827  } else {
1828  *copyInPlacementStart = begin;
1829  *copyOutPlacementStart = end;
1830  *copyPlacementBlock = &block;
1831  }
1832 }
1833 
1834 // Info comprising stride and number of elements transferred every stride.
1835 struct StrideInfo {
1836  int64_t stride;
1837  int64_t numEltPerStride;
1838 };
1839 
1840 /// Returns striding information for a copy/transfer of this region with
1841 /// potentially multiple striding levels from outermost to innermost. For an
1842 /// n-dimensional region, there can be at most n-1 levels of striding
1843 /// successively nested.
1844 // TODO: make this work with non-identity layout maps.
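// Ex. (sketch): copying a 64x128 region out of a 128x256 memref needs one
// stride level: rows of 128 contiguous elements spaced 256 elements apart,
// i.e., {stride = 256, numEltPerStride = 128}.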
1845 static void getMultiLevelStrides(const MemRefRegion &region,
1846  ArrayRef<int64_t> bufferShape,
1847  SmallVectorImpl<StrideInfo> *strideInfos) {
1848  if (bufferShape.size() <= 1)
1849  return;
1850 
1851  int64_t numEltPerStride = 1;
1852  int64_t stride = 1;
1853  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1854  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1855  stride *= dimSize;
1856  numEltPerStride *= bufferShape[d];
1857  // A stride is needed only if the region has a shorter extent than the
1858  // memref along the dimension *and* has an extent greater than one along the
1859  // next major dimension.
1860  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1861  strideInfos->push_back({stride, numEltPerStride});
1862  }
1863  }
1864 }
1865 
1866 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1867 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1868 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1869 /// bound information for the copy loop nest. `fastBufOffsets` contain the
1870 /// expressions to be subtracted out from the respective copy loop iterators in
1871 /// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
1872 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1873 /// inserted.
1874 //
1875 /// The copy-in nest is generated as follows as an example for a 2-d region:
1876 /// for x = ...
1877 /// for y = ...
1878 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1879 ///
1880 static AffineForOp
1881 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1882  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1883  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1884  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1885  OpBuilder b) {
1886  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1887  return lbMap.getNumInputs() == lbOperands.size();
1888  }));
1889  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1890  return ubMap.getNumInputs() == ubOperands.size();
1891  }));
1892 
1893  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1894  assert(lbMaps.size() == rank && "wrong number of lb maps");
1895  assert(ubMaps.size() == rank && "wrong number of ub maps");
1896 
1897  SmallVector<Value, 4> memIndices;
1898  SmallVector<AffineExpr, 4> fastBufExprs;
1899  SmallVector<Value, 4> fastBufMapOperands;
1900  AffineForOp copyNestRoot;
1901  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1902  for (unsigned d = 0; d < rank; ++d) {
1903  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1904  ubOperands, ubMaps[d]);
1905  if (d == 0)
1906  copyNestRoot = forOp;
1907 
1908  b = OpBuilder::atBlockTerminator(forOp.getBody());
1909 
1910  auto fastBufOffsetMap =
1911  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1912  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1913 
1914  // Construct the subscript for the fast memref being copied into/from:
1915  // x - offset_x.
1916  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1917  b.getAffineDimExpr(2 * d));
1918  fastBufMapOperands.push_back(offset);
1919  fastBufMapOperands.push_back(forOp.getInductionVar());
1920  mayBeDeadApplys.push_back(offset);
1921 
1922  // Subscript for the slow memref being copied.
1923  memIndices.push_back(forOp.getInductionVar());
1924  }
1925 
1926  auto fastBufMap =
1927  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1928  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1929  fastBufMap = simplifyAffineMap(fastBufMap);
1930  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1931 
1932  // Drop any dead affine.applys.
1933  for (auto applyOp : mayBeDeadApplys)
1934  if (applyOp.use_empty())
1935  applyOp.erase();
1936 
1937  if (!isCopyOut) {
1938  // Copy in.
1939  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1940  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1941  fastBufMapOperands);
1942  return copyNestRoot;
1943  }
1944 
1945  // Copy out.
1946  auto load =
1947  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1948  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1949  return copyNestRoot;
1950 }
1951 
1952 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1953 emitRemarkForBlock(Block &block) {
1954  return block.getParentOp()->emitRemark();
1955 }
1956 
1957 /// Creates a buffer in the faster memory space for the specified memref region;
1958 /// generates a copy from the lower memory space to this one, and replaces all
1959 /// loads/stores in the block range [`begin', `end') of `block' to load/store
1960 /// from that buffer. Returns failure if copies could not be generated due to
1961 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
1962 /// in copyPlacementBlock specify the insertion points where the incoming copies
1963 /// and outgoing copies, respectively, should be inserted (the insertion happens
1964 /// right before the insertion point). Since `begin` can itself be invalidated
1965 /// due to the memref rewriting done from this method, the output argument
1966 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
1967 /// happens). Since outgoing copies could have been inserted at `end`, the
1968 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
1969 /// size of the fast buffer allocated.
1970 static LogicalResult generateCopy(
1971  const MemRefRegion &region, Block *block, Block::iterator begin,
1972  Block::iterator end, Block *copyPlacementBlock,
1973  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1974  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1975  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1976  Block::iterator *nBegin, Block::iterator *nEnd) {
1977  *nBegin = begin;
1978  *nEnd = end;
1979 
1980  func::FuncOp f = begin->getParentOfType<func::FuncOp>();
1981  OpBuilder topBuilder(f.getBody());
1982  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1983 
1984  *sizeInBytes = 0;
1985 
1986  if (begin == end)
1987  return success();
1988 
1989  // Whether the copy-out point is at the end of the block where we are
1990  // doing explicit copying.
1991  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1992 
1993  // Copies for read regions are going to be inserted at 'begin'.
1994  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1995  // Copies for write regions are going to be inserted at 'end'.
1996  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1997  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1998 
1999  // Builder to create constants at the top level.
2000  auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
2001  OpBuilder top(func.getBody());
2002 
2003  auto loc = region.loc;
2004  auto memref = region.memref;
2005  auto memRefType = cast<MemRefType>(memref.getType());
2006 
2007  if (!memRefType.getLayout().isIdentity()) {
2008  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2009  return failure();
2010  }
2011 
2012  // Indices to use for the copying.
2013  // Indices for the original memref being copied from/to.
2014  SmallVector<Value, 4> memIndices;
2015  // Indices for the faster buffer being copied into/from.
2016  SmallVector<Value, 4> bufIndices;
2017 
2018  unsigned rank = memRefType.getRank();
2019  SmallVector<int64_t, 4> fastBufferShape;
2020 
2021  // Compute the extents of the buffer.
2022  std::vector<SmallVector<int64_t, 4>> lbs;
2023  SmallVector<int64_t, 8> lbDivisors;
2024  lbs.reserve(rank);
2025  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
2026  &fastBufferShape, &lbs, &lbDivisors);
2027  if (!numElements) {
2028  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2029  return failure();
2030  }
2031 
2032  if (*numElements == 0) {
2033  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2034  return success();
2035  }
2036 
2037  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2038  for (unsigned i = 0; i < rank; ++i)
2039  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2040 
2041  const FlatAffineValueConstraints *cst = region.getConstraints();
2042  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2043  // on; these typically include loop IVs surrounding the level at which the
2044  // copy generation is being done or other valid symbols in MLIR.
2045  SmallVector<Value, 8> regionSymbols;
2046  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2047 
2048  // Construct the index expressions for the fast memory buffer. The index
2049  // expression for a particular dimension of the fast buffer is obtained by
2050  // subtracting out the lower bound on the original memref's data region
2051  // along the corresponding dimension.
2052 
2053  // Index start offsets for faster memory buffer relative to the original.
2054  SmallVector<AffineExpr, 4> fastBufOffsets;
2055  fastBufOffsets.reserve(rank);
2056  for (unsigned d = 0; d < rank; d++) {
2057  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2058 
2059  AffineExpr offset = top.getAffineConstantExpr(0);
2060  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2061  offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2062  assert(lbDivisors[d] > 0);
2063  offset =
2064  (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2065 
2066  // Set copy start location for this dimension in the lower memory space
2067  // memref.
2068  if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
2069  auto indexVal = caf.getValue();
2070  if (indexVal == 0) {
2071  memIndices.push_back(zeroIndex);
2072  } else {
2073  memIndices.push_back(
2074  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2075  }
2076  } else {
2077  // The coordinate for the start location is just the lower bound along the
2078  // corresponding dimension on the memory region (stored in 'offset').
2079  auto map = AffineMap::get(
2080  cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2081  memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2082  }
2083  // The fast buffer is copied into at location zero; addressing is relative.
2084  bufIndices.push_back(zeroIndex);
2085 
2086  // Record the offsets since they are needed to remap the memory accesses of
2087  // the original memref further below.
2088  fastBufOffsets.push_back(offset);
2089  }
2090 
2091  // The faster memory space buffer.
2092  Value fastMemRef;
2093 
2094  // Check if a buffer was already created.
2095  bool existingBuf = fastBufferMap.count(memref) > 0;
2096  if (!existingBuf) {
2097  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2098  auto fastMemRefType =
2099  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2100  fastBufferLayout, copyOptions.fastMemorySpace);
2101 
2102  // Create the fast memory space buffer just before the 'affine.for'
2103  // operation.
2104  fastMemRef =
2105  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2106  // Record it.
2107  fastBufferMap[memref] = fastMemRef;
2108  // fastMemRefType is a constant shaped memref.
2109  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2110  // We don't account for things of unknown size.
2111  *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0;
2112 
2113  LLVM_DEBUG(emitRemarkForBlock(*block)
2114  << "Creating fast buffer of type " << fastMemRefType
2115  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2116  << " KiB\n");
2117  } else {
2118  // Reuse the one already created.
2119  fastMemRef = fastBufferMap[memref];
2120  }
2121 
2122  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2123 
2124  Value dmaStride = nullptr;
2125  Value numEltPerDmaStride = nullptr;
2126  if (copyOptions.generateDma) {
2127  SmallVector<StrideInfo, 4> dmaStrideInfos;
2128  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2129 
2130  // TODO: use all stride levels once DmaStartOp is extended for
2131  // multi-level strides.
2132  if (dmaStrideInfos.size() > 1) {
2133  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2134  return failure();
2135  }
2136 
2137  if (!dmaStrideInfos.empty()) {
2138  dmaStride =
2139  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2140  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2141  loc, dmaStrideInfos[0].numEltPerStride);
2142  }
2143  }
2144 
2145  // Record the last operation where we want the memref replacement to end. We
2146  // later do the memref replacement only in [begin, postDomFilter] so
2147  // that the original memref's used in the data movement code themselves don't
2148  // get replaced.
2149  auto postDomFilter = std::prev(end);
2150 
2151  // Create fully composed affine maps for each memref.
2152  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2153  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2154  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2155  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2156 
2157  if (!copyOptions.generateDma) {
2158  // Point-wise copy generation.
2159  auto copyNest =
2160  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2161  /*lbOperands=*/regionSymbols, ubMaps,
2162  /*ubOperands=*/regionSymbols, fastBufOffsets,
2163  /*isCopyOut=*/region.isWrite(), b);
2164 
2165  // Record this so that we can skip it from yet another copy.
2166  copyNests.insert(copyNest);
2167 
2168  // Since new ops are being appended (for copy-outs), adjust the end to
2169  // mark end of block range being processed if necessary.
2170  if (region.isWrite() && isCopyOutAtEndOfBlock)
2171  *nEnd = Block::iterator(copyNest.getOperation());
2172  } else {
2173  // DMA generation.
2174  // Create a tag (single element 1-d memref) for the DMA.
2175  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2176  copyOptions.tagMemorySpace);
2177  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2178 
2179  SmallVector<Value, 4> tagIndices({zeroIndex});
2180  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2181  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2182  if (!region.isWrite()) {
2183  // DMA non-blocking read from original buffer to fast buffer.
2184  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2185  fastMemRef, bufAffineMap, bufIndices,
2186  tagMemRef, tagAffineMap, tagIndices,
2187  numElementsSSA, dmaStride, numEltPerDmaStride);
2188  } else {
2189  // DMA non-blocking write from fast buffer to the original memref.
2190  auto op = b.create<AffineDmaStartOp>(
2191  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2192  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2193  dmaStride, numEltPerDmaStride);
2194  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2195  // end to mark end of block range being processed.
2196  if (isCopyOutAtEndOfBlock)
2197  *nEnd = Block::iterator(op.getOperation());
2198  }
2199 
2200  // Matching DMA wait to block on completion; tag always has a 0 index.
2201  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2202  numElementsSSA);
2203 
2204  // Generate dealloc for the tag.
2205  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2206  if (*nEnd == end && isCopyOutAtEndOfBlock)
2207  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2208  // mark end of range of the original.
2209  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2210  }
2211 
2212  // Generate dealloc for the buffer.
2213  if (!existingBuf) {
2214  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2215  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2216  // the fast buffer (since it marks the new end insertion point).
2217  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2218  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2219  }
2220 
2221  // Replace all uses of the old memref with the faster one while remapping
2222  // access indices (subtracting out lower bound offsets for each dimension).
2223  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2224  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2225  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2226  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2227  // d2, d3 correspond to the original indices (%i, %j).
2228  SmallVector<AffineExpr, 4> remapExprs;
2229  remapExprs.reserve(rank);
2230  for (unsigned i = 0; i < rank; i++) {
2231  // The starting operands of indexRemap will be regionSymbols (the symbols on
2232  // which the memref region is parametric); then those corresponding to
2233  // the memref's original indices follow.
2234  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2235  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2236  }
2237  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2238  b.getContext());
2239 
2240  // Record the begin since it may be invalidated by memref replacement.
2241  Block::iterator prevOfBegin;
2242  bool isBeginAtStartOfBlock = (begin == block->begin());
2243  if (!isBeginAtStartOfBlock)
2244  prevOfBegin = std::prev(begin);
2245 
2246  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2247  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2248  /*extraIndices=*/{}, indexRemap,
2249  /*extraOperands=*/regionSymbols,
2250  /*symbolOperands=*/{},
2251  /*domOpFilter=*/&*begin,
2252  /*postDomOpFilter=*/&*postDomFilter);
2253 
2254  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2255 
2256  return success();
2257 }
2258 
2259 /// Construct the memref region to just include the entire memref. Returns
2260 /// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2261 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2262 /// is parametric on.
2263 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2264  MemRefRegion *region) {
2265  unsigned rank;
2266  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2267  rank = loadOp.getMemRefType().getRank();
2268  region->memref = loadOp.getMemRef();
2269  region->setWrite(false);
2270  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2271  rank = storeOp.getMemRefType().getRank();
2272  region->memref = storeOp.getMemRef();
2273  region->setWrite(true);
2274  } else {
2275  assert(false && "expected load or store op");
2276  return false;
2277  }
2278  auto memRefType = cast<MemRefType>(region->memref.getType());
2279  if (!memRefType.hasStaticShape())
2280  return false;
2281 
2282  auto *regionCst = region->getConstraints();
2283 
2284  // Just get the first numSymbols IVs, which the memref region is parametric
2285  // on.
2286  SmallVector<AffineForOp, 4> ivs;
2287  getAffineForIVs(*op, &ivs);
2288  ivs.resize(numParamLoopIVs);
2289  SmallVector<Value, 4> symbols;
2290  extractForInductionVars(ivs, &symbols);
2291  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2292  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2293 
2294  // Memref dim sizes provide the bounds.
2295  for (unsigned d = 0; d < rank; d++) {
2296  auto dimSize = memRefType.getDimSize(d);
2297  assert(dimSize > 0 && "filtered dynamic shapes above");
2298  regionCst->addBound(BoundType::LB, d, 0);
2299  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2300  }
2301  return true;
2302 }
2303 
2304 LogicalResult
2305 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2306  const AffineCopyOptions &copyOptions,
2307  std::optional<Value> filterMemRef,
2308  DenseSet<Operation *> &copyNests) {
2309  if (begin == end)
2310  return success();
2311 
2312  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2313  "Inconsistent block begin/end args");
2314  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2315 
2316  Block *block = begin->getBlock();
2317 
2318  // Copies will be generated for this depth, i.e., symbolic in all loops
2319  // surrounding this block range.
2320  unsigned copyDepth = getNestingDepth(&*begin);
2321 
2322  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2323  << "\n");
2324  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2325  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2326 
2327  // List of memory regions to copy for. We need a map vector to have a
2328  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2329  // since the allocs, for example, are identical except for the SSA id.
2330  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2331  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2332 
2333  // Map from original memref's to the fast buffers that their accesses are
2334  // replaced with.
2335  DenseMap<Value, Value> fastBufferMap;
2336 
2337  // To check for errors when walking the block.
2338  bool error = false;
2339 
2340  // Walk this range of operations to gather all memory regions.
2341  block->walk(begin, end, [&](Operation *opInst) {
2342  // Gather regions to allocate to buffers in faster memory space.
2343  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2344  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2345  (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2346  copyOptions.slowMemorySpace))
2347  return;
2348  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2349  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2350  storeOp.getMemRefType().getMemorySpaceAsInt() !=
2351  copyOptions.slowMemorySpace)
2352  return;
2353  } else {
2354  // Neither a load nor a store op.
2355  return;
2356  }
2357 
2358  // Compute the MemRefRegion accessed.
2359  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2360  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2361  /*addMemRefDimBounds=*/false))) {
2362  LLVM_DEBUG(llvm::dbgs()
2363  << "Error obtaining memory region: semi-affine maps?\n");
2364  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2365  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2366  LLVM_DEBUG(
2367  opInst->emitError("non-constant memref sizes not yet supported"));
2368  error = true;
2369  return;
2370  }
2371  }
2372 
2373  // Each memref has a single buffer associated with it irrespective of how
2374  // many load's and store's happen on it.
2375  // TODO: in the future, when regions don't intersect and satisfy
2376  // other properties (based on load/store regions), we could consider
2377  // multiple buffers per memref.
2378 
2379  // Add to the appropriate region if it's not already in it, or take a
2380  // bounding box union with the existing one if it's already in there.
2381  // Note that a memref may have both read and write regions - so update the
2382  // region in the other list if one exists (write in case of read and vice
2383  // versa) since there is a single bounding box for a memref across all reads
2384  // and writes that happen on it.
2385 
2386  // Attempts to update; returns true if 'region' exists in targetRegions.
2387  auto updateRegion =
2388  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2389  &targetRegions) {
2390  const auto *const it = targetRegions.find(region->memref);
2391  if (it == targetRegions.end())
2392  return false;
2393 
2394  // Perform a union with the existing region.
2395  if (failed(it->second->unionBoundingBox(*region))) {
2396  LLVM_DEBUG(llvm::dbgs()
2397  << "Memory region bounding box failed; "
2398  "over-approximating to the entire memref\n");
2399  // If the union fails, we will overapproximate.
2400  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2401  LLVM_DEBUG(opInst->emitError(
2402  "non-constant memref sizes not yet supported"));
2403  error = true;
2404  return true;
2405  }
2406  it->second->getConstraints()->clearAndCopyFrom(
2407  *region->getConstraints());
2408  } else {
2409  // Union was computed and stored in 'it->second': copy to 'region'.
2410  region->getConstraints()->clearAndCopyFrom(
2411  *it->second->getConstraints());
2412  }
2413  return true;
2414  };
2415 
2416  bool existsInRead = updateRegion(readRegions);
2417  if (error)
2418  return;
2419  bool existsInWrite = updateRegion(writeRegions);
2420  if (error)
2421  return;
2422 
2423  // Finally add it to the region list.
2424  if (region->isWrite() && !existsInWrite) {
2425  writeRegions[region->memref] = std::move(region);
2426  } else if (!region->isWrite() && !existsInRead) {
2427  readRegions[region->memref] = std::move(region);
2428  }
2429  });
2430 
2431  if (error) {
2432  LLVM_DEBUG(begin->emitError(
2433  "copy generation failed for one or more memref's in this block\n"));
2434  return failure();
2435  }
2436 
2437  uint64_t totalCopyBuffersSizeInBytes = 0;
2438  bool ret = true;
2439  auto processRegions =
2440  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2441  &regions) {
2442  for (const auto &regionEntry : regions) {
2443  // For each region, hoist copy in/out past all hoistable
2444  // 'affine.for's.
2445  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2446  Block *copyPlacementBlock;
2447  findHighestBlockForPlacement(
2448  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2449  &copyInPlacementStart, &copyOutPlacementStart);
2450 
2451  uint64_t sizeInBytes;
2452  Block::iterator nBegin, nEnd;
2453  LogicalResult iRet = generateCopy(
2454  *regionEntry.second, block, begin, end, copyPlacementBlock,
2455  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2456  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2457  if (succeeded(iRet)) {
2458  // begin/end could have been invalidated, and need update.
2459  begin = nBegin;
2460  end = nEnd;
2461  totalCopyBuffersSizeInBytes += sizeInBytes;
2462  }
2463  ret = ret & succeeded(iRet);
2464  }
2465  };
2466  processRegions(readRegions);
2467  processRegions(writeRegions);
2468 
2469  if (!ret) {
2470  LLVM_DEBUG(begin->emitError(
2471  "copy generation failed for one or more memref's in this block\n"));
2472  return failure();
2473  }
2474 
2475  // For a range of operations, a note will be emitted at the caller.
2476  AffineForOp forOp;
2477  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2478  LLVM_DEBUG(forOp.emitRemark()
2479  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2480  << " KiB of copy buffers in fast memory space for this block");
2481  }
2482 
2483  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2484  block->getParentOp()->emitWarning(
2485  "total size of all copy buffers' for this block exceeds fast memory "
2486  "capacity");
2487  }
2488 
2489  return success();
2490 }
2491 
2492 // A convenience version of affineDataCopyGenerate for all ops in the body of
2493 // an AffineForOp.
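// Ex. (a usage sketch; the option values are hypothetical): generate DMAs
// into memory space 2 for all memrefs accessed in `forOp`'s body:
//
//   AffineCopyOptions options{/*generateDma=*/true, /*slowMemorySpace=*/0,
//                             /*fastMemorySpace=*/2, /*tagMemorySpace=*/4,
//                             /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);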
2494 LogicalResult mlir::affine::affineDataCopyGenerate(
2495  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2496  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2497  return affineDataCopyGenerate(forOp.getBody()->begin(),
2498  std::prev(forOp.getBody()->end()), copyOptions,
2499  filterMemRef, copyNests);
2500 }
2501 
2502 LogicalResult mlir::affine::generateCopyForMemRegion(
2503  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2504  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2505  Block *block = analyzedOp->getBlock();
2506  auto begin = analyzedOp->getIterator();
2507  auto end = std::next(begin);
2508  DenseMap<Value, Value> fastBufferMap;
2509  DenseSet<Operation *> copyNests;
2510 
2511  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2512  copyOptions, fastBufferMap, copyNests,
2513  &result.sizeInBytes, &begin, &end);
2514  if (failed(err))
2515  return err;
2516 
2517  const auto &en = fastBufferMap.find(memrefRegion.memref);
2518  // In some cases (empty loops), no copy generation would have happened.
2519  if (en == fastBufferMap.end())
2520  return failure();
2521  result.alloc = en->second.getDefiningOp();
2522  assert(result.alloc && "fast buffer expected to be locally allocated");
2523  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2524  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2525  return success();
2526 }
2527 
2528 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2529 static void
2530 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2531  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2532  // Add a new empty level to the output if it doesn't already exist.
2533  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2534  if (currLoopDepth == depthToLoops.size())
2535  depthToLoops.emplace_back();
2536 
2537  for (auto &op : *block) {
2538  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2539  depthToLoops[currLoopDepth].push_back(forOp);
2540  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2541  }
2542  }
2543 }
2544 
2545 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
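/// Ex. (sketch): for a function containing a 2-d nest and a separate
/// top-level loop, depthToLoops[0] holds the two outermost loops and
/// depthToLoops[1] holds the inner loop of the nest.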
2546 void mlir::affine::gatherLoops(
2547  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2548  for (auto &block : func)
2549  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2550 
2551  // Remove last loop level from output since it's empty.
2552  if (!depthToLoops.empty()) {
2553  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2554  depthToLoops.pop_back();
2555  }
2556 }
2557 
2558 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2559  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2560  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2561  SmallVector<Value, 4> lowerOperands(lbOperands);
2562  SmallVector<Value, 4> upperOperands(ubOperands);
2563 
2564  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2565  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2566  lbMap = removeDuplicateExprs(lbMap);
2567  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2568  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2569  ubMap = removeDuplicateExprs(ubMap);
2570 
2571  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2572  step);
2573 }
2574 
2575 /// Creates an AffineIfOp that encodes the conditional to choose between
2576 /// the constant trip count version and an unknown trip count version of this
2577 /// nest of loops. This is used to separate partial and full tiles when
2578 /// `loops` contains the intra-tile loops. The affine.if op is inserted at the builder
2579 /// insertion point of `b`.
2580 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2581  OpBuilder b) {
2582  if (loops.empty())
2583  return nullptr;
2584 
2585  auto *context = loops[0].getContext();
2586 
2587  FlatAffineValueConstraints cst;
2588  SmallVector<Operation *, 8> ops;
2589  llvm::append_range(ops, loops);
2590  (void)getIndexSet(ops, &cst);
2591 
2592  // Remove constraints that are independent of these loop IVs.
2593  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2594 
2595  // Construct the constraint set representing the guard for full tiles. The
2596  // lower bound (and upper bound) corresponding to the full tile should be
2597  // larger (and resp. smaller) than any other lower (or upper bound).
2598  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2599  for (auto loop : loops) {
2600  (void)loop;
2601  // TODO: Non-unit stride is not an issue to generalize to.
2602  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2603  // Mark everything as symbols for the purpose of finding a constant diff pair.
2604  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2605  1);
2606  unsigned fullTileLbPos, fullTileUbPos;
2607  if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2608  /*boundFloorDivisor=*/nullptr,
2609  /*ub=*/nullptr, &fullTileLbPos,
2610  &fullTileUbPos)) {
2611  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2612  return nullptr;
2613  }
2614 
2615  SmallVector<unsigned, 4> lbIndices, ubIndices;
2616  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2617 
2618  auto fLb = cst.getInequality(fullTileLbPos);
2619  auto fUb = cst.getInequality(fullTileUbPos);
2620  fullTileLb.assign(fLb.begin(), fLb.end());
2621  fullTileUb.assign(fUb.begin(), fUb.end());
2622 
2623  // Full tile lower bound should be >= any other lower bound.
2624  for (auto lbIndex : lbIndices)
2625  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2626  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2627 
2628  // Full tile upper bound should be <= any other upper bound.
2629  for (auto ubIndex : ubIndices)
2630  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2631  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2632 
2633  cst.removeVar(0);
2634  }
2635 
2636  // The previous step leads to all zeros for the full tile lb and ub position
2637  // itself; remove those and any other duplicates / trivial redundancies.
2638  cst.removeTrivialRedundancy();
2639 
2640  // Turn everything into dims conservatively since we earlier turned all
2641  // trailing ids past point loop IV into symbols. Some of these could be outer
2642  // loop IVs; we'll canonicalize anyway.
2643  cst.setDimSymbolSeparation(0);
2644 
2645  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2646  // ifCondSet can be null if cst was empty -- this can happen if all loops
2647  // in the nest have constant trip counts.
2648  if (!ifCondSet)
2649  return nullptr;
2650 
2651  SmallVector<Value, 4> setOperands;
2652  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2653  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2654  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2655  /*withElseRegion=*/true);
2656 }
2657 
2658 /// Create the full tile loop nest (along with its body).
2659 static LogicalResult
2660 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2661  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2662  fullTileLoops.reserve(inputNest.size());
2663 
2664  // For each loop in the original nest identify a lower/upper bound pair such
2665  // that their difference is a constant.
2666  FlatAffineValueConstraints cst;
2667  for (auto loop : inputNest) {
2668  // TODO: straightforward to generalize to a non-unit stride.
2669  if (loop.getStepAsInt() != 1) {
2670  LLVM_DEBUG(llvm::dbgs()
2671  << "[tile separation] non-unit stride not implemented\n");
2672  return failure();
2673  }
2674  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2675  (void)getIndexSet(loopOp, &cst);
2676  // We will mark everything other than this loop IV as symbol for getting a
2677  // pair of <lb, ub> with a constant difference.
2678  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - 1);
2679  unsigned lbPos, ubPos;
2680  if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2681  /*boundFloorDivisor=*/nullptr,
2682  /*ub=*/nullptr, &lbPos, &ubPos) ||
2683  lbPos == ubPos) {
2684  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2685  "equalities not yet handled\n");
2686  return failure();
2687  }
2688 
2689  // Set all variables as dimensions uniformly since some of those marked as
2690  // symbols above could be outer loop IVs (corresponding tile space IVs).
2691  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2692 
2693  AffineValueMap lbVmap, ubVmap;
2694  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2695  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2696  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2697  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2698  ubVmap.getOperands(), ubVmap.getAffineMap());
2699  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2700  fullTileLoops.push_back(fullTileLoop);
2701  }
2702 
2703  // Add the body for the full tile loop nest.
2704  IRMapping operandMap;
2705  for (const auto &loopEn : llvm::enumerate(inputNest))
2706  operandMap.map(loopEn.value().getInductionVar(),
2707  fullTileLoops[loopEn.index()].getInductionVar());
2708  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2709  for (auto &op : inputNest.back().getBody()->without_terminator())
2710  b.clone(op, operandMap);
2711  return success();
2712 }
2713 
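// Separates full tiles from partial tiles in the nest `inputNest` by
// versioning it under an affine.if: the "then" region receives the
// constant-trip-count full-tile nest and the "else" region keeps the
// original nest as the partial tile. Resulting structure (sketch; the set
// name #full_tile_guard is hypothetical):
//
//   affine.if #full_tile_guard(...) {
//     <full-tile loop nest>
//   } else {
//     <original (partial-tile) loop nest>
//   }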
2714 LogicalResult
2715 mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2716  SmallVectorImpl<AffineForOp> *fullTileNest) {
2717  if (inputNest.empty())
2718  return success();
2719 
2720  auto firstLoop = inputNest[0];
2721 
2722  // Each successive for op has to be nested in the other.
2723  auto prevLoop = firstLoop;
2724  for (auto loop : inputNest.drop_front(1)) {
2725  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2726  prevLoop = loop;
2727  }
2728 
2729  // Create the full tile loop nest.
2730  SmallVector<AffineForOp, 4> fullTileLoops;
2731  OpBuilder b(firstLoop);
2732  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2733  if (!fullTileLoops.empty())
2734  fullTileLoops.front().erase();
2735  return failure();
2736  }
2737 
2738  // Create and insert the version select right before the root of the nest.
2739  b = OpBuilder(firstLoop);
2740  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2741  if (!ifOp) {
2742  fullTileLoops.front().erase();
2743  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2744  "separation condition\n");
2745  return failure();
2746  }
2747 
2748  // Move the full tile into the then block.
2749  Block *thenBlock = ifOp.getThenBlock();
2750  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2751  thenBlock->getOperations().splice(
2752  std::prev(thenBlock->end()),
2753  outermostFullTileLoop->getBlock()->getOperations(),
2754  Block::iterator(outermostFullTileLoop));
2755 
2756  // Move the partial tile into the else block. The partial tile is the same as
2757  // the original loop nest.
2758  Block *elseBlock = ifOp.getElseBlock();
2759  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2760  firstLoop->getBlock()->getOperations(),
2761  Block::iterator(firstLoop));
2762 
2763  if (fullTileNest)
2764  *fullTileNest = std::move(fullTileLoops);
2765 
2766  return success();
2767 }
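// Example driver (hypothetical; shown only to illustrate intended use, with
// 'rootForOp' assumed to be the outermost loop of a tiled band):
//
//   SmallVector<AffineForOp, 6> band;
//   getPerfectlyNestedLoops(band, rootForOp);
//   SmallVector<AffineForOp, 6> fullTileNest;
//   if (succeeded(separateFullTiles(band, &fullTileNest))) {
//     // The original nest now forms the else region of the guarding
//     // affine.if; 'fullTileNest' holds the constant trip count version in
//     // the then region.
//   }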