1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
24 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
25 #include "llvm/ADT/MapVector.h"
26 #include "llvm/ADT/SmallPtrSet.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include <optional>
30 
31 #define DEBUG_TYPE "loop-utils"
32 
33 using namespace mlir;
34 using namespace affine;
35 using namespace presburger;
36 using llvm::SmallMapVector;
37 
38 /// Computes the cleanup loop lower bound of the loop being unrolled with
39 /// the specified unroll factor; this bound will also be the upper bound of the
40 /// main part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// operands or a null map when the trip count can't be expressed as an affine
42 /// expression.
43 static void
44 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45  AffineMap &cleanupLbMap,
46  SmallVectorImpl<Value> &cleanupLbOperands) {
47  AffineMap tripCountMap;
48  SmallVector<Value, 4> tripCountOperands;
49  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50  // Trip count can't be computed.
51  if (!tripCountMap) {
52  cleanupLbMap = AffineMap();
53  return;
54  }
55 
56  OpBuilder b(forOp);
57  auto lbMap = forOp.getLowerBoundMap();
58  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59  forOp.getLowerBoundOperands());
60 
61  // For each upper bound expr, get the range.
62  // Eg: affine.for %i = lb to min (ub1, ub2),
63  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65  // these affine.apply's make up the cleanup loop lower bound.
66  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68  int64_t step = forOp.getStepAsInt();
69  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70  auto tripCountExpr = tripCountMap.getResult(i);
71  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
73  tripCountMap.getNumSymbols(), bumpExprs[i]);
74  bumpValues[i] =
75  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76  }
77 
78  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81 
82  cleanupLbOperands.clear();
83  cleanupLbOperands.push_back(lb);
84  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
85  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
86  b.getContext());
87  // Simplify the cleanupLbMap + cleanupLbOperands.
88  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
89  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
90  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  // Remove any affine.apply's that became dead from the simplification above.
92  for (auto v : bumpValues)
93  if (v.use_empty())
94  v.getDefiningOp()->erase();
95 
96  if (lb.use_empty())
97  lb.erase();
98 }
99 
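// As an illustrative sketch (hypothetical loop, not from a test case):
// unrolling
//   affine.for %i = 0 to 17
// by a factor of 4 gives a cleanup lower bound of 16, so the main unrolled
// loop covers [0, 16) and the cleanup loop covers [16, 17).
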
100 /// Helper to replace uses of loop carried values (iter_args) and loop
101 /// yield values while promoting single iteration affine.for ops.
102 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103  // Replace uses of iter arguments with iter operands (initial values).
104  auto iterOperands = forOp.getInits();
105  auto iterArgs = forOp.getRegionIterArgs();
106  for (auto e : llvm::zip(iterOperands, iterArgs))
107  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108 
109  // Replace uses of loop results with the values yielded by the loop.
110  auto outerResults = forOp.getResults();
111  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112  for (auto e : llvm::zip(outerResults, innerResults))
113  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114 }
115 
116 /// Promotes the loop body of a forOp to its containing block if the forOp
117 /// is known to have a single iteration.
118 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120  if (!tripCount || *tripCount != 1)
121  return failure();
122 
123  // TODO: extend this for arbitrary affine bounds.
124  if (forOp.getLowerBoundMap().getNumResults() != 1)
125  return failure();
126 
127  // Replaces all IV uses to its single iteration value.
128  auto iv = forOp.getInductionVar();
129  auto *parentBlock = forOp->getBlock();
130  if (!iv.use_empty()) {
131  if (forOp.hasConstantLowerBound()) {
132  auto func = forOp->getParentOfType<FunctionOpInterface>();
133  OpBuilder builder(forOp->getContext());
134  if (func)
135  builder.setInsertionPointToStart(&func.getFunctionBody().front());
136  else
137  builder.setInsertionPoint(forOp);
138  auto constOp = builder.create<arith::ConstantIndexOp>(
139  forOp.getLoc(), forOp.getConstantLowerBound());
140  iv.replaceAllUsesWith(constOp);
141  } else {
142  auto lbOperands = forOp.getLowerBoundOperands();
143  auto lbMap = forOp.getLowerBoundMap();
144  OpBuilder builder(forOp);
145  if (lbMap == builder.getDimIdentityMap()) {
146  // No need of generating an affine.apply.
147  iv.replaceAllUsesWith(lbOperands[0]);
148  } else {
149  auto affineApplyOp =
150  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
151  iv.replaceAllUsesWith(affineApplyOp);
152  }
153  }
154  }
155 
156  replaceIterArgsAndYieldResults(forOp);
157 
158  // Move the loop body operations, except for its terminator, to the loop's
159  // containing block.
160  forOp.getBody()->back().erase();
161  parentBlock->getOperations().splice(Block::iterator(forOp),
162  forOp.getBody()->getOperations());
163  forOp.erase();
164  return success();
165 }
166 
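// For instance (with "foo" a placeholder op), the single-iteration loop
//   affine.for %i = 4 to 5 {
//     "foo"(%i) : (index) -> ()
//   }
// is promoted by replacing all uses of %i with a constant 4 and splicing the
// body into the enclosing block.
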
167 /// Generates an affine.for op with the specified lower and upper bounds
168 /// while generating the right IV remappings to realize shifts for operations in
169 /// its body. The operations that go into the loop body are specified in
170 /// opGroupQueue starting from the specified offset, and in that order. The
171 /// first element of the pair specifies the shift applied to that group of
172 /// operations; the shift is multiplied by the loop step before being applied.
173 /// Returns nullptr if the generated loop simplifies to a single iteration one.
174 static AffineForOp generateShiftedLoop(
175  AffineMap lbMap, AffineMap ubMap,
176  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178  auto lbOperands = srcForOp.getLowerBoundOperands();
179  auto ubOperands = srcForOp.getUpperBoundOperands();
180 
181  assert(lbMap.getNumInputs() == lbOperands.size());
182  assert(ubMap.getNumInputs() == ubOperands.size());
183 
184  auto loopChunk =
185  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
186  ubMap, srcForOp.getStepAsInt());
187  auto loopChunkIV = loopChunk.getInductionVar();
188  auto srcIV = srcForOp.getInductionVar();
189 
190  IRMapping operandMap;
191 
192  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
193  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
194  uint64_t shift = it.first;
195  auto ops = it.second;
196  // All 'same shift' operations get added with their operands being
197  // remapped to results of cloned operations, and their uses of the IV remapped.
198  // Generate the remapping if the shift is not zero: remappedIV = newIV -
199  // shift.
200  if (!srcIV.use_empty() && shift != 0) {
201  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202  srcForOp.getLoc(),
203  bodyBuilder.getSingleDimShiftAffineMap(
204  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205  loopChunkIV);
206  operandMap.map(srcIV, ivRemap);
207  } else {
208  operandMap.map(srcIV, loopChunkIV);
209  }
210  for (auto *op : ops)
211  bodyBuilder.clone(*op, operandMap);
212  }
213  if (succeeded(promoteIfSingleIteration(loopChunk)))
214  return AffineForOp();
215  return loopChunk;
216 }
217 
218 // The skewing of operations with respect to one another can be used for
219 // example to allow overlap of asynchronous operations (such as DMA
220 // communication) with computation, or just relative shifting of operations
221 // for better register reuse, locality or parallelism. As such, the shifts are
222 // typically expected to be at most of the order of the number of operations.
223 // This method should not be used as a substitute for loop distribution/fission.
224 // This method uses an algorithm that runs in time linear in the number of
225 // operations in the body of the for loop (using the 'sweep line' paradigm).
226 // This method asserts preservation of SSA dominance. Checks for that, as well
227 // as for memory-based dependence preservation, rest with the users of this
228 // method.
229 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230  ArrayRef<uint64_t> shifts,
231  bool unrollPrologueEpilogue) {
232  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233  "too few/many shifts");
234  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
235  return success();
236 
237  // If the trip counts aren't constant, we would need versioning and
238  // conditional guards (or context information to prevent such versioning). The
239  // better way to pipeline for such loops is to first tile them and extract
240  // constant trip count "full tiles" before applying this.
241  auto mayBeConstTripCount = getConstantTripCount(forOp);
242  if (!mayBeConstTripCount) {
243  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244  return success();
245  }
246  uint64_t tripCount = *mayBeConstTripCount;
247 
248  assert(isOpwiseShiftValid(forOp, shifts) &&
249  "shifts will lead to an invalid transformation\n");
250 
251  int64_t step = forOp.getStepAsInt();
252 
253  unsigned numChildOps = shifts.size();
254 
255  // Do a linear time (counting) sort for the shifts.
256  uint64_t maxShift = *llvm::max_element(shifts);
257  if (maxShift >= numChildOps) {
258  // Large shifts are not the typical use case.
259  forOp.emitWarning("not shifting because shifts are unrealistically large");
260  return success();
261  }
262 
263  // An array of operation groups sorted by shift amount; each group has all
264  // operations with the same shift in the order in which they appear in the
265  // body of the 'affine.for' op.
266  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267  unsigned pos = 0;
268  for (auto &op : forOp.getBody()->without_terminator()) {
269  auto shift = shifts[pos++];
270  sortedOpGroups[shift].push_back(&op);
271  }
272 
273  // Unless the shifts have a specific pattern (which actually would be the
274  // common use case), prologue and epilogue are not meaningfully defined.
275  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276  // loop generated as the prologue and the last as epilogue and unroll these
277  // fully.
278  AffineForOp prologue, epilogue;
279 
280  // Do a sweep over the sorted shifts while storing open groups in a
281  // vector, and generating loop portions as necessary during the sweep. A block
282  // of operations is paired with its shift.
283  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284 
285  auto origLbMap = forOp.getLowerBoundMap();
286  uint64_t lbShift = 0;
287  OpBuilder b(forOp);
288  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289  // If nothing is shifted by d, continue.
290  if (sortedOpGroups[d].empty())
291  continue;
292  if (!opGroupQueue.empty()) {
293  assert(d > 0 &&
294  "Queue expected to be empty when the first block is found");
295  // The interval for which the loop needs to be generated here is:
296  // [lbShift, min(lbShift + tripCount, d)) and the body of the
297  // loop needs to have all operations in opQueue in that order.
298  AffineForOp res;
299  if (lbShift + tripCount * step < d * step) {
300  res = generateShiftedLoop(
301  b.getShiftedAffineMap(origLbMap, lbShift),
302  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
303  opGroupQueue, /*offset=*/0, forOp, b);
304  // Entire loop for the queued op groups generated, empty it.
305  opGroupQueue.clear();
306  lbShift += tripCount * step;
307  } else {
308  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
309  b.getShiftedAffineMap(origLbMap, d),
310  opGroupQueue, /*offset=*/0, forOp, b);
311  lbShift = d * step;
312  }
313 
314  if (res) {
315  // Simplify/canonicalize the affine.for.
316  RewritePatternSet patterns(res.getContext());
317  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
318  GreedyRewriteConfig config;
319  config.setStrictness(GreedyRewriteStrictness::ExistingAndNewOps);
320  bool erased;
321  (void)applyOpPatternsGreedily(res.getOperation(), std::move(patterns),
322  config, /*changed=*/nullptr, &erased);
323  if (!erased && !prologue)
324  prologue = res;
325  if (!erased)
326  epilogue = res;
327  }
328  } else {
329  // Start of first interval.
330  lbShift = d * step;
331  }
332  // Augment the list of operations that get into the current open interval.
333  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
334  }
335 
336  // Those operations groups left in the queue now need to be processed (FIFO)
337  // and their loops completed.
338  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
339  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
340  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
341  b.getShiftedAffineMap(origLbMap, ubShift),
342  opGroupQueue, /*offset=*/i, forOp, b);
343  lbShift = ubShift;
344  if (!prologue)
345  prologue = epilogue;
346  }
347 
348  // Erase the original for op.
349  forOp.erase();
350 
351  if (unrollPrologueEpilogue && prologue)
352  (void)loopUnrollFull(prologue);
353  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
354  (void)loopUnrollFull(epilogue);
355 
356  return success();
357 }
358 
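// As a rough sketch (hypothetical two-op body), skewing
//   affine.for %i = 0 to 64 { op1; op2 }
// with shifts [0, 1] staggers op2 by one iteration: a prologue loop runs only
// op1, a steady-state loop runs op2 of iteration i-1 together with op1 of
// iteration i, and an epilogue loop runs the remaining op2.
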
359 /// Checks whether a loop nest is hyper-rectangular or not.
360 static LogicalResult
361 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
362  FlatAffineValueConstraints cst;
363  SmallVector<Operation *, 8> ops(input.begin(), input.end());
364  // 0-d or 1-d is trivially hyper-rectangular.
365  if (input.size() <= 1)
366  return success();
367  if (failed(getIndexSet(ops, &cst))) {
368  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
369  return failure();
370  }
371  if (!cst.isHyperRectangular(0, input.size())) {
372  LLVM_DEBUG(llvm::dbgs()
373  << "Non-hyperrectangular nests not supported for tiling!\n");
374  return failure();
375  }
376  return success();
377 }
378 
379 /// Check if the input nest is supported for tiling and whether tiling would be
380 /// legal or not.
381 template <typename t>
382 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
383  ArrayRef<t> tileSizes) {
384  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
385 
386  if (llvm::any_of(input,
387  [](AffineForOp op) { return op.getNumResults() > 0; })) {
388  LLVM_DEBUG(llvm::dbgs()
389  << "Cannot tile nest where a loop has yield values\n");
390  return failure();
391  }
392 
393  // Check if the supplied `for` ops are all successively nested.
394  if (!isPerfectlyNested(input)) {
395  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
396  return failure();
397  }
398 
399  // TODO: handle non hyper-rectangular spaces.
400  if (failed(checkIfHyperRectangular(input)))
401  return failure();
402 
403  return success();
404 }
405 
406 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
407 /// location in destination's body, ignoring the terminator.
408 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409  Block::iterator loc) {
410  auto &ops = src.getBody()->getOperations();
411  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412  std::prev(ops.end()));
413 }
414 
415 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416 /// body.
417 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419 }
420 
421 /// Constructs a tiled loop nest without setting the loop bounds, and moves
422 /// the body of the original loop nest to the tiled loop nest.
423 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
424  AffineForOp rootAffineForOp, unsigned width,
425  MutableArrayRef<AffineForOp> tiledLoops) {
426  Location loc = rootAffineForOp.getLoc();
427 
428  // The outermost among the loops as we add more..
429  Operation *topLoop = rootAffineForOp.getOperation();
430  AffineForOp innermostPointLoop;
431 
432  // Add intra-tile (or point) loops.
433  for (unsigned i = 0; i < width; i++) {
434  OpBuilder b(topLoop);
435  // Loop bounds will be set later.
436  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
437  pointLoop.getBody()->getOperations().splice(
438  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439  topLoop);
440  tiledLoops[2 * width - 1 - i] = pointLoop;
441  topLoop = pointLoop.getOperation();
442  if (i == 0)
443  innermostPointLoop = pointLoop;
444  }
445 
446  // Add tile space loops;
447  for (unsigned i = width; i < 2 * width; i++) {
448  OpBuilder b(topLoop);
449  // Loop bounds will be set later.
450  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
451  tileSpaceLoop.getBody()->getOperations().splice(
452  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453  topLoop);
454  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455  topLoop = tileSpaceLoop.getOperation();
456  }
457 
458  // Move the loop body of the original nest to the new one.
459  moveLoopBody(origLoops.back(), innermostPointLoop);
460 }
461 
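// As an illustrative sketch, for a 2-d band (width = 2) this creates a 4-d
// nest of the form
//   affine.for %i0 {        // tile-space loop
//     affine.for %i1 {      // tile-space loop
//       affine.for %i2 {    // intra-tile (point) loop
//         affine.for %i3 {  // intra-tile (point) loop
//           <original body>
// with all bounds still unset; they are filled in afterwards by the index-set
// construction helpers below.
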
462 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
463 // TODO: Handle non-constant lower bounds.
464 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465  AffineForOp newInterTileLoop,
466  AffineForOp newIntraTileLoop,
467  Value tileSize) {
468  // The lower bound for the intra-tile loop is represented by an affine map
469  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471  // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
473  // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
474  // corresponding inter-tile loop.
475 
476  assert(origLoop.hasConstantLowerBound() &&
477  "expected input loops to have constant lower bound.");
478 
479  // Get lower bound of original loop as an affine expression.
480  AffineExpr origLowerBoundExpr;
481  origLowerBoundExpr =
482  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483 
484  // Add dim operands from original lower/upper bound.
485  SmallVector<Value, 4> lbOperands, ubOperands;
486  AffineBound lb = origLoop.getLowerBound();
487  AffineBound ub = origLoop.getUpperBound();
488  lbOperands.reserve(lb.getNumOperands() + 2);
489  ubOperands.reserve(ub.getNumOperands() + 2);
490  AffineMap origLbMap = lb.getMap();
491  AffineMap origUbMap = ub.getMap();
492  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493  lbOperands.push_back(lb.getOperand(j));
494  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495  ubOperands.push_back(ub.getOperand(j));
496 
497  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498  // IV.
499  lbOperands.push_back(newInterTileLoop.getInductionVar());
500  ubOperands.push_back(newInterTileLoop.getInductionVar());
501 
502  // Get loop IV as an affine expression for lower/upper bound. Size of
503  // lb/ubOperands is guaranteed to be at least one.
504  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506 
507  // Add symbol operands from original lower/upper bound.
508  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512 
513  // Add a new symbol operand which is the tile size for this loop.
514  lbOperands.push_back(tileSize);
515  ubOperands.push_back(tileSize);
516 
517  SmallVector<AffineExpr, 4> lbBoundExprs;
518  SmallVector<AffineExpr, 4> ubBoundExprs;
519  lbBoundExprs.reserve(origLbMap.getNumResults());
520  ubBoundExprs.reserve(origUbMap.getNumResults());
521 
522  // Get tiling parameter as an affine expression for lb/ub.
523  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525 
526  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527  lbBoundExprs.push_back(
528  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529  origLowerBoundExpr);
530 
531  // Get the origLoopStep as an affine expression.
532  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533 
534  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535  // (tilingParameter * origLoopStep) + origlb.
536  ubBoundExprs.push_back(
537  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539 
540  ubBoundExprs.append(origUbMap.getResults().begin(),
541  origUbMap.getResults().end());
542 
543  AffineMap lbMap =
544  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545  lbBoundExprs, b.getContext());
546  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547 
548  AffineMap ubMap =
549  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550  ubBoundExprs, b.getContext());
551  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552 
553  // Original loop step must be preserved.
554  newIntraTileLoop.setStep(origLoop.getStepAsInt());
555 }
556 
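// Concretely (a sketch assuming %origlb = 0 and step 1), the maps above reduce
// to lb: (%i)[%t0] -> (%i * %t0) and ub: (%i)[%t0] -> ((%i + 1) * %t0), i.e.
// iteration %i of the inter-tile loop covers exactly %t0 iterations of the
// original loop.
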
557 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
558 // TODO: Handle non-constant lower bounds.
559 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560  AffineForOp newLoop, Value tileSize) {
561  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562 
563  // The lower bounds for inter-tile loops are same as the corresponding lower
564  // bounds of original loops.
565  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566 
567  // The new upper bound map for inter-tile loops, assuming constant lower
568  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
569  // originalLowerBound), tilingParameter), where the tiling parameter is the
570  // respective tile size for that loop. E.g., if the original ubmap was
571  // ()->(1024), the new map will be
572  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
573  // Therefore a new symbol operand is inserted in the map and the result
574  // expression is overwritten.
575 
576  assert(origLoop.hasConstantLowerBound() &&
577  "expected input loops to have constant lower bound.");
578 
579  // Get lower bound of original loop as an affine expression.
580  AffineExpr origLowerBoundExpr;
581  origLowerBoundExpr =
582  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583 
584  // Add dim operands from original upper bound.
585  SmallVector<Value, 4> ubOperands;
586  AffineBound ub = origLoop.getUpperBound();
587  ubOperands.reserve(ub.getNumOperands() + 1);
588  AffineMap origUbMap = ub.getMap();
589  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590  ubOperands.push_back(ub.getOperand(j));
591 
592  // Add symbol operands from original upper bound.
593  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595 
596  // Add a new symbol operand which is the tile size for this loop.
597  ubOperands.push_back(tileSize);
598 
599  // Get tiling parameter as an affine expression.
600  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601 
602  SmallVector<AffineExpr, 4> boundExprs;
603  boundExprs.reserve(origUbMap.getNumResults());
604  int64_t origUpperBound;
605  AffineExpr origUpperBoundExpr;
606 
607  // If upper bound for the original loop is constant, then the constant can
608  // be obtained as an affine expression straight away.
609  if (origLoop.hasConstantUpperBound()) {
610  origUpperBound = origLoop.getConstantUpperBound();
611 
612  // Get original constant upper bound as an affine expression.
613  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614 
615  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
616  // originalLowerBound), tilingParameter).
617  boundExprs.push_back(
618  origLowerBoundExpr +
619  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620  } else {
621  // If the upper bound for the original loop is not constant, then two cases
622  // are possible, although their handling is the same: 1.) The result of
623  // ubmap has only one result expression. For e.g.
624  // affine.for %i = 5 to %ub
625  //
626  // A symbol operand is added which represents the tiling parameter. The
627  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628  // where 's0' is the original upper bound and 's1' is the tiling
629  // parameter. 2.) When ubMap has more than one result expression. For e.g.
630  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
631  // affine.for %i = 5 to min #map0()[%s0, %s1]
632  //
633  // A symbol operand is added which represents the tiling parameter. The
634  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636 
637  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638  // originalLowerBound), tilingParameter).
639  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640  boundExprs.push_back(
641  origLowerBoundExpr +
642  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643  }
644 
645  AffineMap ubMap =
646  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647  boundExprs, b.getContext());
648  newLoop.setUpperBound(ubOperands, ubMap);
649 
650  // Original loop step must be preserved.
651  newLoop.setStep(origLoop.getStepAsInt());
652 }
653 
654 /// Constructs and sets new loop bounds after tiling for the case of
655 /// hyper-rectangular index sets, where the bounds of one dimension do not
656 /// depend on other dimensions and tiling parameters are captured from SSA
657 /// values. Bounds of each dimension can thus be treated independently,
658 /// and deriving the new bounds is much simpler and faster than for the case of
659 /// tiling arbitrary polyhedral shapes.
660 static void constructParametricallyTiledIndexSetHyperRect(
661  MutableArrayRef<AffineForOp> origLoops,
662  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663  assert(!origLoops.empty() && "expected at least one loop in band");
664  assert(origLoops.size() == tileSizes.size() &&
665  "expected tiling parameter for each loop in band.");
666 
667  OpBuilder b(origLoops[0].getOperation());
668  unsigned width = origLoops.size();
669 
670  // Set bounds for tile space loops.
671  for (unsigned i = 0; i < width; ++i) {
672  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673  }
674 
675  // Set bounds for intra-tile loops.
676  for (unsigned i = 0; i < width; ++i) {
677  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678  newLoops[i + width], tileSizes[i]);
679  }
680 }
681 
682 /// Constructs and sets new loop bounds after tiling for the case of
683 /// hyper-rectangular index sets, where the bounds of one dimension do not
684 /// depend on other dimensions. Bounds of each dimension can thus be treated
685 /// independently, and deriving the new bounds is much simpler and faster
686 /// than for the case of tiling arbitrary polyhedral shapes.
687 static void
688 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
689  MutableArrayRef<AffineForOp> newLoops,
690  ArrayRef<unsigned> tileSizes) {
691  assert(!origLoops.empty());
692  assert(origLoops.size() == tileSizes.size());
693 
694  OpBuilder b(origLoops[0].getOperation());
695  unsigned width = origLoops.size();
696 
697  // Bounds for tile space loops.
698  for (unsigned i = 0; i < width; i++) {
699  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703  // If the step size of original loop is x and tileSize is y then after
704  // tiling the tile space loops' step size becomes x*y.
705  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706  }
707  // Bounds for intra-tile loops.
708  for (unsigned i = 0; i < width; i++) {
709  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710  std::optional<uint64_t> mayBeConstantCount =
711  getConstantTripCount(origLoops[i]);
712  // The lower bound is just the tile-space loop.
713  AffineMap lbMap = b.getDimIdentityMap();
714  newLoops[width + i].setLowerBound(
715  /*operands=*/newLoops[i].getInductionVar(), lbMap);
716  // The step sizes of intra-tile loops is just the original loops' step size.
717  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
718 
719  // Set the upper bound.
720  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
721  // Trip count is less than the tile size: upper bound is lower bound +
722  // trip count * stepSize.
723  AffineMap ubMap = b.getSingleDimShiftAffineMap(
724  *mayBeConstantCount * origLoops[i].getStepAsInt());
725  newLoops[width + i].setUpperBound(
726  /*operands=*/newLoops[i].getInductionVar(), ubMap);
727  } else if (largestDiv % tileSizes[i] != 0) {
728  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
729  // Construct the upper bound map; the operands are the original operands
730  // with 'i' (tile-space loop) appended to it. The new upper bound map is
731  // the original one with an additional expression i + tileSize * stepSize
732  // appended.
733 
734  // Add dim operands from original upper bound.
735  SmallVector<Value, 4> ubOperands;
736  AffineBound ub = origLoops[i].getUpperBound();
737  ubOperands.reserve(ub.getNumOperands() + 1);
738  AffineMap origUbMap = ub.getMap();
739  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
740  ubOperands.push_back(ub.getOperand(j));
741 
742  // Add dim operand for new loop upper bound.
743  ubOperands.push_back(newLoops[i].getInductionVar());
744 
745  // Add symbol operands from original upper bound.
746  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
747  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
748 
749  SmallVector<AffineExpr, 4> boundExprs;
750  boundExprs.reserve(1 + origUbMap.getNumResults());
751  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
752  // The new upper bound map is the original one with an additional
753  // expression i + tileSize * stepSize (of original loop) appended.
754  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
755  boundExprs.append(origUbMap.getResults().begin(),
756  origUbMap.getResults().end());
757  AffineMap ubMap =
758  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
759  boundExprs, b.getContext());
760  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
761  } else {
762  // No need of the min expression.
763  AffineExpr dim = b.getAffineDimExpr(0);
764  AffineMap ubMap = AffineMap::get(
765  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
766  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
767  }
768  }
769 }
770 
771 LogicalResult
772 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
773  ArrayRef<unsigned> tileSizes,
774  SmallVectorImpl<AffineForOp> *tiledNest) {
775  if (input.empty())
776  return success();
777 
778  if (failed(performPreTilingChecks(input, tileSizes)))
779  return failure();
780 
781  MutableArrayRef<AffineForOp> origLoops = input;
782  AffineForOp rootAffineForOp = origLoops[0];
783 
784  // Note that width is at least one since the band isn't empty.
785  unsigned width = input.size();
786  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
787 
788  // Construct a tiled loop nest without setting their bounds. Bounds are
789  // set later.
790  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
791 
792  SmallVector<Value, 8> origLoopIVs;
793  extractForInductionVars(input, &origLoopIVs);
794 
795  // Set loop bounds for the tiled loop nest.
796  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
797 
798  // Replace original IVs with intra-tile loop IVs.
799  for (unsigned i = 0; i < width; i++)
800  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
801 
802  // Erase the old loop nest.
803  rootAffineForOp.erase();
804 
805  if (tiledNest)
806  *tiledNest = std::move(tiledLoops);
807 
808  return success();
809 }
810 
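// As an illustrative sketch, tiling the band
//   affine.for %i = 0 to 1024 {
//     affine.for %j = 0 to 1024 { ... }
//   }
// with tile sizes {32, 32} yields tile-space loops over [0, 1024) with step 32
// and intra-tile loops running from the tile-space IV to IV + 32 (no min is
// needed here since 32 divides the constant trip count).
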
811 /// Tiles the specified band of perfectly nested loops creating tile-space
812 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
813 /// is a contiguous set of loops.
814 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
815  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
816  SmallVectorImpl<AffineForOp> *tiledNest) {
817  if (input.empty())
818  return success();
819 
820  if (failed(performPreTilingChecks(input, tileSizes)))
821  return failure();
822 
823  MutableArrayRef<AffineForOp> origLoops = input;
824  AffineForOp rootAffineForOp = origLoops[0];
825  unsigned width = input.size();
826  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
827 
828  // Construct a tiled loop nest without setting their bounds. Bounds are
829  // set later.
830  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
831 
832  SmallVector<Value, 8> origLoopIVs;
833  extractForInductionVars(input, &origLoopIVs);
834 
835  // Set loop bounds for the tiled loop nest.
836  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
837  tileSizes);
838 
839  // Replace original IVs with intra-tile loop IVs.
840  for (unsigned i = 0; i < width; i++)
841  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
842 
843  // Erase the old loop nest.
844  rootAffineForOp.erase();
845 
846  if (tiledNest)
847  *tiledNest = std::move(tiledLoops);
848 
849  return success();
850 }
851 
852 /// Gets the perfectly nested sequence of loops starting at the root of the
853 /// loop nest (the first op in each loop's body being another AffineForOp and
854 /// the second op being the terminator). A loop is perfectly nested iff the
855 /// first op in its body is another AffineForOp and the second op is the terminator.
856 void mlir::affine::getPerfectlyNestedLoops(
857  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
858  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
859  nestedLoops.push_back(root);
860  Block &body = root.getRegion().front();
861  if (body.begin() != std::prev(body.end(), 2))
862  return;
863 
864  root = dyn_cast<AffineForOp>(&body.front());
865  if (!root)
866  return;
867  }
868 }
869 
870 /// Identify valid and profitable bands of loops to tile. This is currently just
871 /// a temporary placeholder to test the mechanics of tiled code generation.
872 /// Returns all maximal outermost perfect loop nests to tile.
873 void mlir::affine::getTileableBands(
874  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
875  // Get maximal perfect nest of 'affine.for' insts starting from root
876  // (inclusive).
877  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
878  SmallVector<AffineForOp, 6> band;
879  getPerfectlyNestedLoops(band, forOp);
880  bands->push_back(band);
881  }
882 }
883 
884 /// Unrolls this loop completely.
885 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
886  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
887  if (mayBeConstantTripCount.has_value()) {
888  uint64_t tripCount = *mayBeConstantTripCount;
889  if (tripCount == 0)
890  return success();
891  if (tripCount == 1)
892  return promoteIfSingleIteration(forOp);
893  return loopUnrollByFactor(forOp, tripCount);
894  }
895  return failure();
896 }
897 
898 /// Unrolls this loop by the specified factor or by the trip count (if constant),
899 /// whichever is lower.
900 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
901  uint64_t unrollFactor) {
902  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
903  if (mayBeConstantTripCount.has_value() &&
904  *mayBeConstantTripCount < unrollFactor)
905  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
906  return loopUnrollByFactor(forOp, unrollFactor);
907 }
908 
909 /// Generates unrolled copies of the loop body in 'loopBodyBlock', using the
910 /// associated induction variable 'forOpIV', by 'unrollFactor', calling
911 /// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
912 /// annotates the Ops in each unrolled iteration using annotateFn.
913 static void generateUnrolledLoop(
914  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
915  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
916  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
917  ValueRange iterArgs, ValueRange yieldedValues) {
918  // Builder to insert unrolled bodies just before the terminator of the body of
919  // 'forOp'.
920  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
921 
922  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
923  if (!annotateFn)
924  annotateFn = defaultAnnotateFn;
925 
926  // Keep a pointer to the last non-terminator operation in the original block
927  // so that we know what to clone (since we are doing this in-place).
928  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
929 
930  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
931  SmallVector<Value, 4> lastYielded(yieldedValues);
932 
933  for (unsigned i = 1; i < unrollFactor; i++) {
934  IRMapping operandMap;
935 
936  // Prepare operand map.
937  operandMap.map(iterArgs, lastYielded);
938 
939  // If the induction variable is used, create a remapping to the value for
940  // this unrolled instance.
941  if (!forOpIV.use_empty()) {
942  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
943  operandMap.map(forOpIV, ivUnroll);
944  }
945 
946  // Clone the original body of 'forOp'.
947  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
948  Operation *clonedOp = builder.clone(*it, operandMap);
949  annotateFn(i, clonedOp, builder);
950  }
951 
952  // Update yielded values. If the yielded value is defined outside the
953  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
954  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
955  // value with the clone corresponding to the yielded value.
956  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
957  Operation *defOp = yieldedValues[i].getDefiningOp();
958  if (defOp && defOp->getBlock() == loopBodyBlock)
959  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
960  }
961  }
962 
963  // Make sure we annotate the Ops in the original body. We do this last so that
964  // any annotations are not copied into the cloned Ops above.
965  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
966  annotateFn(0, &*it, builder);
967 
968  // Update operands of the yield statement.
969  loopBodyBlock->getTerminator()->setOperands(lastYielded);
970 }
971 
972 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
973 /// count is not a multiple of `unrollFactor`.
974 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
975  uint64_t unrollFactor) {
976  // Insert the cleanup loop right after 'forOp'.
977  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
978  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
979 
980  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
981  // and produce results for the original users of `forOp` results.
982  auto results = forOp.getResults();
983  auto cleanupResults = cleanupForOp.getResults();
984  auto cleanupIterOperands = cleanupForOp.getInits();
985 
986  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
987  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
988  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
989  }
990 
991  AffineMap cleanupMap;
992  SmallVector<Value, 4> cleanupOperands;
993  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
994  if (!cleanupMap)
995  return failure();
996 
997  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
998  // Promote the loop body up if this has turned into a single iteration loop.
999  (void)promoteIfSingleIteration(cleanupForOp);
1000 
1001  // Adjust upper bound of the original loop; this is the same as the lower
1002  // bound of the cleanup loop.
1003  forOp.setUpperBound(cleanupOperands, cleanupMap);
1004  return success();
1005 }
1006 
1007 /// Unrolls this loop by the specified factor. Returns success if the loop
1008 /// is successfully unrolled.
1009 LogicalResult mlir::affine::loopUnrollByFactor(
1010  AffineForOp forOp, uint64_t unrollFactor,
1011  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1012  bool cleanUpUnroll) {
1013  assert(unrollFactor > 0 && "unroll factor should be positive");
1014 
1015  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1016  if (unrollFactor == 1) {
1017  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1018  failed(promoteIfSingleIteration(forOp)))
1019  return failure();
1020  return success();
1021  }
1022 
1023  // Nothing in the loop body other than the terminator.
1024  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1025  return success();
1026 
1027  // If the trip count is lower than the unroll factor, no unrolled body.
1028  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1029  if (cleanUpUnroll) {
1030  // Unroll the cleanup loop if cleanUpUnroll is specified.
1031  return loopUnrollFull(forOp);
1032  }
1033 
1034  return failure();
1035  }
1036 
1037  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1038  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1039  // Loops where the lower bound is a max expression or the upper bound is
1040  // a min expression and the unroll factor doesn't divide the trip count
1041  // can't be unrolled since the lower bound of the cleanup loop in such cases
1042  // cannot be expressed as an affine function or a max over affine functions.
1043  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1044  forOp.getUpperBoundMap().getNumResults() != 1)
1045  return failure();
1046  if (cleanUpUnroll)
1047  // Force unroll including cleanup loop
1048  return loopUnrollFull(forOp);
1049  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1050  assert(false && "cleanup loop lower bound map for single result lower "
1051  "and upper bound maps can always be determined");
1052  }
1053 
1054  ValueRange iterArgs(forOp.getRegionIterArgs());
1055  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1056 
1057  // Scale the step of loop being unrolled by unroll factor.
1058  int64_t step = forOp.getStepAsInt();
1059  forOp.setStep(step * unrollFactor);
1060  generateUnrolledLoop(
1061  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1062  [&](unsigned i, Value iv, OpBuilder b) {
1063  // iv' = iv + i * step
1064  auto d0 = b.getAffineDimExpr(0);
1065  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1066  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1067  },
1068  /*annotateFn=*/annotateFn,
1069  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1070 
1071  // Promote the loop body up if this has turned into a single iteration loop.
1072  (void)promoteIfSingleIteration(forOp);
1073  return success();
1074 }
1075 
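// As a sketch (with "S" a placeholder op), unrolling
//   affine.for %i = 0 to 32 step 2 { "S"(%i) }
// by a factor of 4 scales the step to 8 and emits "S"(%i), "S"(%i + 2),
// "S"(%i + 4) and "S"(%i + 6) in each iteration, with the shifted IVs
// produced by affine.apply of the maps d0 + i * 2 for i = 1, 2, 3.
// Since the trip count (16) is a multiple of 4, no cleanup loop is needed.
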
1076 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1077  uint64_t unrollJamFactor) {
1078  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1079  if (mayBeConstantTripCount.has_value() &&
1080  *mayBeConstantTripCount < unrollJamFactor)
1081  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1082  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1083 }
1084 
1085 /// Check if all control operands of all loops are defined outside of `forOp`
1086 /// and return false if not.
1087 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1088  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1089  for (auto controlOperand : aForOp.getControlOperands()) {
1090  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1091  return WalkResult::interrupt();
1092  }
1093  return WalkResult::advance();
1094  });
1095  return !walkResult.wasInterrupted();
1096 }
1097 
1098 /// Unrolls and jams this loop by the specified factor.
1099 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1100  uint64_t unrollJamFactor) {
1101  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1102 
1103  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1104  if (unrollJamFactor == 1) {
1105  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1106  failed(promoteIfSingleIteration(forOp)))
1107  return failure();
1108  return success();
1109  }
1110 
1111  // Nothing in the loop body other than the terminator.
1112  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1113  return success();
1114 
1115  // If the trip count is lower than the unroll jam factor, no unroll jam.
1116  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1117  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1118  return failure();
1119  }
1120 
1121  // If any control operand of any inner loop of `forOp` is defined within
1122  // `forOp`, no unroll jam.
1123  if (!areInnerBoundsInvariant(forOp))
1124  return failure();
1125 
1126  // Gather all sub-blocks to jam upon the loop being unrolled.
1127  JamBlockGatherer<AffineForOp> jbg;
1128  jbg.walk(forOp);
1129  auto &subBlocks = jbg.subBlocks;
1130 
1131  // Collect loops with iter_args.
1132  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1133  forOp.walk([&](AffineForOp aForOp) {
1134  if (aForOp.getNumIterOperands() > 0)
1135  loopsWithIterArgs.push_back(aForOp);
1136  });
1137 
1138  // Get supported reductions to be used for creating reduction ops at the end.
1139  SmallVector<LoopReduction> reductions;
1140  if (forOp.getNumIterOperands() > 0)
1141  getSupportedReductions(forOp, reductions);
1142 
1143  // Generate the cleanup loop if trip count isn't a multiple of
1144  // unrollJamFactor.
1145  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1146  // Loops where the lower bound is a max expression or the upper bound is
1147  // a min expression and the unroll factor doesn't divide the trip count
1148  // can't be unrolled since the lower bound of the cleanup loop in such cases
1149  // cannot be expressed as an affine function or a max over affine functions.
1150  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1151  forOp.getUpperBoundMap().getNumResults() != 1)
1152  return failure();
1153  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1154  assert(false && "cleanup loop lower bound map for single result lower "
1155  "and upper bound maps can always be determined");
1156  }
1157 
1158  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1159  // iteration. There are (`unrollJamFactor` - 1) iterations.
1160  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1161 
1162  // For any loop with iter_args, replace it with a new loop that has
1163  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1164  // operands.
1165  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1166  IRRewriter rewriter(forOp.getContext());
1167  for (AffineForOp oldForOp : loopsWithIterArgs) {
1168  SmallVector<Value> dupIterOperands, dupYieldOperands;
1169  ValueRange oldIterOperands = oldForOp.getInits();
1170  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1171  ValueRange oldYieldOperands =
1172  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1173  // Get additional iterOperands, iterArgs, and yield operands. We will
1174  // fix iterOperands and yield operands after cloning of sub-blocks.
1175  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1176  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1177  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1178  }
1179  // Create a new loop with additional iterOperands, iter_args and yield
1180  // operands. This new loop will take the loop body of the original loop.
1181  bool forOpReplaced = oldForOp == forOp;
1182  AffineForOp newForOp =
1183  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1184  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1185  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1186  return dupYieldOperands;
1187  }));
1188  newLoopsWithIterArgs.push_back(newForOp);
1189  // `forOp` has been replaced with a new loop.
1190  if (forOpReplaced)
1191  forOp = newForOp;
1192  // Update `operandMaps` for `newForOp` iterArgs and results.
1193  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1194  unsigned oldNumIterArgs = oldIterArgs.size();
1195  ValueRange newResults = newForOp.getResults();
1196  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1197  assert(oldNumIterArgs == oldNumResults &&
1198  "oldNumIterArgs must be the same as oldNumResults");
1199  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1200  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1201  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1202  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1203  // to those in the `i`th new set.
1204  operandMaps[i - 1].map(newIterArgs[j],
1205  newIterArgs[i * oldNumIterArgs + j]);
1206  operandMaps[i - 1].map(newResults[j],
1207  newResults[i * oldNumResults + j]);
1208  }
1209  }
1210  }
1211 
1212  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1213  int64_t step = forOp.getStepAsInt();
1214  forOp.setStep(step * unrollJamFactor);
1215 
1216  auto forOpIV = forOp.getInductionVar();
1217  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1218  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1219  for (auto &subBlock : subBlocks) {
1220  // Builder to insert unroll-jammed bodies. Insert right at the end of
1221  // sub-block.
1222  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1223 
1224  // If the induction variable is used, create a remapping to the value for
1225  // this unrolled instance.
1226  if (!forOpIV.use_empty()) {
1227  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1228  auto d0 = builder.getAffineDimExpr(0);
1229  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1230  auto ivUnroll =
1231  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1232  operandMaps[i - 1].map(forOpIV, ivUnroll);
1233  }
1234  // Clone the sub-block being unroll-jammed.
1235  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1236  builder.clone(*it, operandMaps[i - 1]);
1237  }
1238  // Fix iterOperands and yield op operands of newly created loops.
1239  for (auto newForOp : newLoopsWithIterArgs) {
1240  unsigned oldNumIterOperands =
1241  newForOp.getNumIterOperands() / unrollJamFactor;
1242  unsigned numControlOperands = newForOp.getNumControlOperands();
1243  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1244  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1245  assert(oldNumIterOperands == oldNumYieldOperands &&
1246  "oldNumIterOperands must be the same as oldNumYieldOperands");
1247  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1248  // The `i`th duplication of an old iterOperand or yield op operand
1249  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1250  // if such mapped value exists.
1251  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1252  operandMaps[i - 1].lookupOrDefault(
1253  newForOp.getOperand(numControlOperands + j)));
1254  yieldOp.setOperand(
1255  i * oldNumYieldOperands + j,
1256  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1257  }
1258  }
1259  }
1260  if (forOp.getNumResults() > 0) {
1261  // Create reduction ops to combine every `unrollJamFactor` related results
1262  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1263  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1264  // %1.
1265  rewriter.setInsertionPointAfter(forOp);
1266  auto loc = forOp.getLoc();
1267  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1268  for (LoopReduction &reduction : reductions) {
1269  unsigned pos = reduction.iterArgPosition;
1270  Value lhs = forOp.getResult(pos);
1271  Value rhs;
1272  SmallPtrSet<Operation *, 4> newOps;
1273  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1274  rhs = forOp.getResult(i * oldNumResults + pos);
1275  // Create ops based on reduction type.
1276  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1277  if (!lhs)
1278  return failure();
1279  Operation *op = lhs.getDefiningOp();
1280  assert(op && "Reduction op should have been created");
1281  newOps.insert(op);
1282  }
1283  // Replace all uses except those in newly created reduction ops.
1284  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1285  }
1286  }
1287 
1288  // Promote the loop body up if this has turned into a single iteration loop.
1289  (void)promoteIfSingleIteration(forOp);
1290  return success();
1291 }
1292 
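// As a sketch (with "S" a placeholder op), unroll-jamming
//   affine.for %i = 0 to 128 {
//     affine.for %j = 0 to 64 { "S"(%i, %j) }
//   }
// by a factor of 2 scales the outer step to 2 and places both "S"(%i, %j) and
// "S"(%i + 1, %j) inside the same inner loop body (the "jam"), rather than
// duplicating the inner loop.
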
1293 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1294 /// nested within 'forOpA' as the only non-terminator operation in its block.
1295 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1296  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1297  auto &forOpABody = forOpA.getBody()->getOperations();
1298  auto &forOpBBody = forOpB.getBody()->getOperations();
1299 
1300  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1301  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1302  // body containing only the terminator.
1303  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1304  forOpABody, forOpABody.begin(),
1305  std::prev(forOpABody.end()));
1306  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1307  // body (this leaves forOpB's body containing only the terminator).
1308  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1309  std::prev(forOpBBody.end()));
1310  // 3) Splice forOpA into the beginning of forOpB's body.
1311  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1312  Block::iterator(forOpA));
1313 }
1314 
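// For example, interchanging the two loops of
//   affine.for %i ... { affine.for %j ... { "S"(%i, %j) } }
// yields
//   affine.for %j ... { affine.for %i ... { "S"(%i, %j) } }
// with the bounds of each loop left untouched; legality (e.g. via
// isValidLoopInterchangePermutation) is the caller's responsibility.
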
1315 // Checks each dependence component against the permutation to see if the
1316 // desired loop interchange would violate dependences by making the
1317 // dependence component lexicographically negative.
1318 static bool checkLoopInterchangeDependences(
1319  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1320  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1321  // Invert permutation map.
1322  unsigned maxLoopDepth = loops.size();
1323  SmallVector<unsigned, 4> loopPermMapInv;
1324  loopPermMapInv.resize(maxLoopDepth);
1325  for (unsigned i = 0; i < maxLoopDepth; ++i)
1326  loopPermMapInv[loopPermMap[i]] = i;
1327 
1328  // Check each dependence component against the permutation to see if the
1329  // desired loop interchange permutation would make the dependence vectors
1330  // lexicographically negative.
1331  // Example 1: [-1, 1][0, 0]
1332  // Example 2: [0, 0][-1, 1]
1333  for (const auto &depComps : depCompsVec) {
1334  assert(depComps.size() >= maxLoopDepth);
1335  // Check if the first non-zero dependence component is positive.
1336  // This iterates through loops in the desired order.
1337  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1338  unsigned permIndex = loopPermMapInv[j];
1339  assert(depComps[permIndex].lb);
1340  int64_t depCompLb = *depComps[permIndex].lb;
1341  if (depCompLb > 0)
1342  break;
1343  if (depCompLb < 0)
1344  return false;
1345  }
1346  }
1347  return true;
1348 }
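
// Worked example (editorial addition): with two loops and a single dependence
// whose per-loop components are [1, -1] (carried by the outer loop), the
// identity permutation is legal: the first non-zero component in loop order
// is +1. For loopPermMap = {1, 0} the components are visited as [-1, 1], the
// first non-zero entry is negative, and the function returns false.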
1349 
1350 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1351 /// nested sequence of loops in 'loops' would violate dependences.
1352 bool mlir::affine::isValidLoopInterchangePermutation(
1353  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1354  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1355  unsigned maxLoopDepth = loops.size();
1356  if (maxLoopDepth == 1)
1357  return true;
1358  // Gather dependence components for dependences between all ops in loop nest
1359  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1360  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1361  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1362  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1363 }
1364 
1365 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1366 /// in it from outermost to innermost.
1367 bool LLVM_ATTRIBUTE_UNUSED
1368 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1369  assert(!loops.empty() && "no loops provided");
1370 
1371  // We already know that the block can't be empty.
1372  auto hasTwoElements = [](Block *block) {
1373  auto secondOpIt = std::next(block->begin());
1374  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1375  };
1376 
1377  auto enclosingLoop = loops.front();
1378  for (auto loop : loops.drop_front()) {
1379  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1380  // parentForOp's body should be just this loop and the terminator.
1381  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1382  return false;
1383  enclosingLoop = loop;
1384  }
1385  return true;
1386 }
1387 
1388 // input[i] should move from position i -> permMap[i]. Returns the position in
1389 // `input` that becomes the new outermost loop.
1390 unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1391  ArrayRef<unsigned> permMap) {
1392  assert(input.size() == permMap.size() && "invalid permutation map size");
1393  // Check whether the permutation spec is valid. This is a small vector - we'll
1394  // just sort and check if it's iota.
1395  SmallVector<unsigned, 4> checkPermMap(permMap);
1396  llvm::sort(checkPermMap);
1397  if (llvm::any_of(llvm::enumerate(checkPermMap),
1398  [](const auto &en) { return en.value() != en.index(); }))
1399  assert(false && "invalid permutation map");
1400 
1401  // Nothing to do.
1402  if (input.size() < 2)
1403  return 0;
1404 
1405  assert(isPerfectlyNested(input) && "input not perfectly nested");
1406 
1407  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1408  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1409  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1410  for (unsigned i = 0, e = input.size(); i < e; ++i)
1411  invPermMap.push_back({permMap[i], i});
1412  llvm::sort(invPermMap);
1413 
1414  // Move the innermost loop body to the loop that would be the innermost in the
1415  // permuted nest (only if the innermost loop is going to change).
1416  if (permMap.back() != input.size() - 1) {
1417  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1418  Block *srcBody = ((AffineForOp)input.back()).getBody();
1419  destBody->getOperations().splice(destBody->begin(),
1420  srcBody->getOperations(), srcBody->begin(),
1421  std::prev(srcBody->end()));
1422  }
1423 
1424  // We'll move each loop in `input` in the reverse order so that its body is
1425  // empty when we are moving it; this incurs zero copies and no erasing.
1426  for (int i = input.size() - 1; i >= 0; --i) {
1427  // If this has to become the outermost loop after permutation, add it to the
1428  // parent block of the original root.
1429  if (permMap[i] == 0) {
1430  // If the root remains the same, nothing to do.
1431  if (i == 0)
1432  continue;
1433  // Make input[i] the new outermost loop moving it into parentBlock.
1434  auto *parentBlock = input[0]->getBlock();
1435  parentBlock->getOperations().splice(Block::iterator(input[0]),
1436  input[i]->getBlock()->getOperations(),
1437  Block::iterator(input[i]));
1438  continue;
1439  }
1440 
1441  // If the parent in the permuted order is the same as in the original,
1442  // nothing to do.
1443  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1444  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1445  continue;
1446 
1447  // Move input[i] to its surrounding loop in the transformed nest.
1448  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1449  destBody->getOperations().splice(destBody->begin(),
1450  input[i]->getBlock()->getOperations(),
1451  Block::iterator(input[i]));
1452  }
1453 
1454  return invPermMap[0].second;
1455 }
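
// Usage sketch (editorial addition; `rootForOp` is a hypothetical root of a
// perfectly nested three-loop band): permMap = {2, 0, 1} sends loop 0 to the
// innermost position and loop 1 to the outermost; the call returns 1 because
// input[1] becomes the new outermost loop.
//
//   SmallVector<AffineForOp, 4> nest;
//   getPerfectlyNestedLoops(nest, rootForOp);
//   unsigned newRootIdx = permuteLoops(nest, {2, 0, 1});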
1456 
1457 // Sinks all sequential loops to the innermost levels (while preserving
1458 // relative order among them) and moves all parallel loops to the
1459 // outermost (while again preserving relative order among them).
1460 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1461  SmallVector<AffineForOp, 4> loops;
1462  getPerfectlyNestedLoops(loops, forOp);
1463  if (loops.size() < 2)
1464  return forOp;
1465 
1466  // Gather dependence components for dependences between all ops in loop nest
1467  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1468  unsigned maxLoopDepth = loops.size();
1469  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1470  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1471 
1472  // Mark loops as either parallel or sequential.
1473  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1474  for (auto &depComps : depCompsVec) {
1475  assert(depComps.size() >= maxLoopDepth);
1476  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1477  DependenceComponent &depComp = depComps[j];
1478  assert(depComp.lb.has_value() && depComp.ub.has_value());
1479  if (*depComp.lb != 0 || *depComp.ub != 0)
1480  isParallelLoop[j] = false;
1481  }
1482  }
1483 
1484  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1485 
1486  // Compute permutation of loops that sinks sequential loops (and thus raises
1487  // parallel loops) while preserving relative order.
1488  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1489  unsigned nextSequentialLoop = numParallelLoops;
1490  unsigned nextParallelLoop = 0;
1491  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1492  if (isParallelLoop[i]) {
1493  loopPermMap[i] = nextParallelLoop++;
1494  } else {
1495  loopPermMap[i] = nextSequentialLoop++;
1496  }
1497  }
1498 
1499  // Check if permutation 'loopPermMap' would violate dependences.
1500  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1501  return forOp;
1502  // Perform loop interchange according to permutation 'loopPermMap'.
1503  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1504  return loops[loopNestRootIndex];
1505 }
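
// Illustrative example (editorial addition): if the outer loop of a 2-d nest
// carries a dependence (say A[i][j] is computed from A[i-1][j]) while the
// inner loop is parallel, the permutation computed above is {1, 0}: the
// parallel %j loop is raised to the outermost position and the sequential %i
// loop is sunk below it, provided the dependence check above allows it.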
1506 
1507 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1508 // lower (resp. upper) loop bound. When called for both the lower and upper
1509 // bounds, the resulting IR resembles:
1510 //
1511 // ```mlir
1512 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1513 // ...
1514 // }
1515 // ```
1516 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1517  SmallVector<Value, 4> *operands,
1518  int64_t offset = 0) {
1519  auto bounds = llvm::to_vector<4>(map->getResults());
1520  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1521  operands->insert(operands->begin() + map->getNumDims(), iv);
1522  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1523  b.getContext());
1524  canonicalizeMapAndOperands(map, operands);
1525 }
1526 
1527 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1528 // Stripmine-sink is a primitive building block for generalized tiling of
1529 // imperfectly nested loops.
1530 // This transformation is purely mechanical and does not check legality,
1531 // profitability or even structural correctness. It is the user's
1532 // responsibility to specify `targets` that are dominated by `forOp`.
1533 // Returns the new AffineForOps, one per `targets`, nested immediately under
1534 // each of the `targets`.
1535 static SmallVector<AffineForOp, 8>
1536 stripmineSink(AffineForOp forOp, uint64_t factor,
1537  ArrayRef<AffineForOp> targets) {
1538  auto originalStep = forOp.getStepAsInt();
1539  auto scaledStep = originalStep * factor;
1540  forOp.setStep(scaledStep);
1541 
1542  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1543 
1544  // Lower-bound map creation.
1545  auto lbMap = forOp.getLowerBoundMap();
1546  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1547  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1548 
1549  // Upper-bound map creation.
1550  auto ubMap = forOp.getUpperBoundMap();
1551  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1552  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1553  /*offset=*/scaledStep);
1554 
1555  auto iv = forOp.getInductionVar();
1556  SmallVector<AffineForOp, 8> innerLoops;
1557  for (auto t : targets) {
1558  // Insert newForOp before the terminator of `t`.
1559  auto b = OpBuilder::atBlockTerminator(t.getBody());
1560  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1561  ubOperands, ubMap, originalStep);
1562  auto begin = t.getBody()->begin();
1563  // Skip terminator and `newForOp` which is just before the terminator.
1564  auto nOps = t.getBody()->getOperations().size() - 2;
1565  newForOp.getBody()->getOperations().splice(
1566  newForOp.getBody()->getOperations().begin(),
1567  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1568  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1569  newForOp.getRegion());
1570  innerLoops.push_back(newForOp);
1571  }
1572 
1573  return innerLoops;
1574 }
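
// Illustrative sketch (editorial addition): stripmine-sinking an %i loop of
// trip count 128 by a factor of 4 under a single target %j scales %i's step
// to 4 and creates a new point loop immediately under %j, roughly:
//
// ```mlir
// #lb = affine_map<(d0) -> (0, d0)>
// #ub = affine_map<(d0) -> (128, d0 + 4)>
// affine.for %i = 0 to 128 step 4 {
//   affine.for %j = 0 to 32 {
//     affine.for %ii = max #lb(%i) to min #ub(%i) {
//       ... original body, with uses of %i replaced by %ii ...
//     }
//   }
// }
// ```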
1575 
1576 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1577 // Returns the new AffineForOps, nested immediately under `target`.
1578 template <typename SizeType>
1579 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1580  AffineForOp target) {
1581  // TODO: Use cheap structural assertions that targets are nested under
1582  // forOp and that targets are not nested under each other when DominanceInfo
1583  // exposes the capability. It seems overkill to construct a whole function
1584  // dominance tree at this point.
1585  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1586  assert(res.size() == 1 && "Expected 1 inner forOp");
1587  return res[0];
1588 }
1589 
1590 std::vector<SmallVector<AffineForOp, 8>>
1591 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1592  ArrayRef<AffineForOp> targets) {
1594  SmallVector<AffineForOp, 8> currentTargets(targets);
1595  for (auto it : llvm::zip(forOps, sizes)) {
1596  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1597  res.push_back(step);
1598  currentTargets = step;
1599  }
1600  return res;
1601 }
1602 
1603 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1604  ArrayRef<uint64_t> sizes,
1605  AffineForOp target) {
1607  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1608  res.push_back(llvm::getSingleElement(loops));
1609  return res;
1610 }
1611 
1612 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1613  if (loops.size() < 2)
1614  return success();
1615 
1616  AffineForOp innermost = loops.back();
1617  AffineForOp outermost = loops.front();
1618  AffineBound ub = outermost.getUpperBound();
1619  AffineMap origUbMap = ub.getMap();
1620  Location loc = outermost.getLoc();
1621  OpBuilder builder(outermost);
1622  for (AffineForOp loop : loops) {
1623  // We only work on normalized loops.
1624  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1625  loop.getConstantLowerBound() != 0)
1626  return failure();
1627  }
1628  SmallVector<Value, 4> upperBoundSymbols;
1629  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1630  ub.getOperands().end());
1631 
1632  // 1. Store the upper bound of the outermost loop in a variable.
1633  Value prev;
1634  if (!llvm::hasSingleElement(origUbMap.getResults()))
1635  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1636  else
1637  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1638  upperBoundSymbols.push_back(prev);
1639 
1640  // 2. Emit code computing the upper bound of the coalesced loop as product of
1641  // the number of iterations of all loops.
1642  for (AffineForOp loop : loops.drop_front()) {
1643  ub = loop.getUpperBound();
1644  origUbMap = ub.getMap();
1645  ubOperands = ub.getOperands();
1646  Value upperBound;
1647  // If upper bound map has more than one result, take their minimum.
1648  if (!llvm::hasSingleElement(origUbMap.getResults()))
1649  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1650  else
1651  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1652  upperBoundSymbols.push_back(upperBound);
1653  SmallVector<Value, 4> operands;
1654  operands.push_back(prev);
1655  operands.push_back(upperBound);
1656  // Maintain running product of loop upper bounds.
1657  prev = builder.create<AffineApplyOp>(
1658  loc,
1659  AffineMap::get(/*dimCount=*/1,
1660  /*symbolCount=*/1,
1661  builder.getAffineDimExpr(0) *
1662  builder.getAffineSymbolExpr(0)),
1663  operands);
1664  }
1665  // Set upper bound of the coalesced loop.
1666  AffineMap newUbMap = AffineMap::get(
1667  /*dimCount=*/0,
1668  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1669  outermost.setUpperBound(prev, newUbMap);
1670 
1671  builder.setInsertionPointToStart(outermost.getBody());
1672 
1673  // 3. Remap induction variables. For each original loop, the value of the
1674  // induction variable can be obtained by dividing the induction variable of
1675  // the linearized loop by the total number of iterations of the loops nested
1676  // in it modulo the number of iterations in this loop (remove the values
1677  // related to the outer loops):
1678  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1679  // Compute these iteratively from the innermost loop by creating a "running
1680  // quotient" of division by the range.
1681  Value previous = outermost.getInductionVar();
1682  for (unsigned idx = loops.size(); idx > 0; --idx) {
1683  if (idx != loops.size()) {
1684  SmallVector<Value, 4> operands;
1685  operands.push_back(previous);
1686  operands.push_back(upperBoundSymbols[idx]);
1687  previous = builder.create<AffineApplyOp>(
1688  loc,
1689  AffineMap::get(
1690  /*dimCount=*/1, /*symbolCount=*/1,
1691  builder.getAffineDimExpr(0).floorDiv(
1692  builder.getAffineSymbolExpr(0))),
1693  operands);
1694  }
1695  // Modified value of the induction variables of the nested loops after
1696  // coalescing.
1697  Value inductionVariable;
1698  if (idx == 1) {
1699  inductionVariable = previous;
1700  } else {
1701  SmallVector<Value, 4> applyOperands;
1702  applyOperands.push_back(previous);
1703  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1704  inductionVariable = builder.create<AffineApplyOp>(
1705  loc,
1706  AffineMap::get(
1707  /*dimCount=*/1, /*symbolCount=*/1,
1708  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1709  applyOperands);
1710  }
1711  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1712  inductionVariable, loops.back().getRegion());
1713  }
1714 
1715  // 4. Move the operations from the innermost just above the second-outermost
1716  // loop, delete the extra terminator and the second-outermost loop.
1717  AffineForOp secondOutermostLoop = loops[1];
1718  innermost.getBody()->back().erase();
1719  outermost.getBody()->getOperations().splice(
1720  Block::iterator(secondOutermostLoop.getOperation()),
1721  innermost.getBody()->getOperations());
1722  secondOutermostLoop.erase();
1723  return success();
1724 }
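
// Illustrative sketch (editorial addition): coalescing a normalized 2-d nest
// with upper bounds %N (outer) and %M (inner) produces one loop of %N * %M
// iterations whose IV is de-linearized back into the original IVs with
// floordiv / mod, roughly:
//
// ```mlir
// #prod = affine_map<()[s0, s1] -> (s0 * s1)>
// affine.for %iv = 0 to #prod()[%N, %M] {
//   %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%M]
//   %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%M]
//   ... original body using %i and %j ...
// }
// ```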
1725 
1726 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1727  ArrayRef<Value> processorId,
1728  ArrayRef<Value> numProcessors) {
1729  assert(processorId.size() == numProcessors.size());
1730  if (processorId.empty())
1731  return;
1732 
1733  OpBuilder b(forOp);
1734  Location loc(forOp.getLoc());
1735  AffineExpr lhs, rhs;
1736  bindSymbols(forOp.getContext(), lhs, rhs);
1737  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1738  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1739 
1740  Value linearIndex = processorId.front();
1741  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1742  auto mulApplyOp = b.create<AffineApplyOp>(
1743  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1744  linearIndex = b.create<AffineApplyOp>(
1745  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1746  }
1747 
1748  auto mulApplyOp = b.create<AffineApplyOp>(
1749  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1750  Value lb = b.create<AffineApplyOp>(
1751  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1752  forOp.setLowerBound(lb);
1753 
1754  Value step = forOp.getStep();
1755  for (auto numProcs : numProcessors)
1756  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1757  forOp.setStep(step);
1758 }
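
// Worked example (editorial addition): for a single processor dimension with
// id %p and count %np, a loop `for %i = %lb to %ub step %s` becomes
//   lower bound: %lb + %p * %s
//   step:        %s * %np
// so each processor starts at its own offset and strides over all the others.
// With several processor dimensions, the ids are first linearized as
// p0 * n1 * n2 + p1 * n2 + p2 (for three dimensions) before the same rewrite.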
1759 
1760 /// Given a memref region, determine the lowest depth at which transfers can be
1761 /// placed for it, and return the corresponding block, start and end positions
1762 /// in the block for placing incoming (read) and outgoing (write) copies
1763 /// respectively. The lowest depth depends on whether the region being accessed
1764 /// is hoistable with respect to one or more immediately surrounding loops.
1765 static void
1766 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1767  Block::iterator &begin, Block::iterator &end,
1768  Block **copyPlacementBlock,
1769  Block::iterator *copyInPlacementStart,
1770  Block::iterator *copyOutPlacementStart) {
1771  const auto *cst = region.getConstraints();
1772  SmallVector<Value, 4> symbols;
1773  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1774 
1775  SmallVector<Operation *, 4> enclosingAffineOps;
1776  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1777  // Walk up loop parents till we find an IV on which this region is
1778  // symbolic/variant or we hit `hoistGuard`.
1779  auto it = enclosingAffineOps.rbegin();
1780  AffineForOp lastInvariantFor;
1781  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1782  Operation *enclosingOp = *it;
1783  // We can't hoist past the definition of the memref being copied.
1784  Value memref = region.memref;
1785  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1786  LLVM_DEBUG(
1787  llvm::dbgs()
1788  << "memref definition will end up not dominating hoist location\n");
1789  break;
1790  }
1791 
1792  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1793  if (!affineFor)
1794  break;
1795  // TODO: we also need to check, for region symbols that aren't loop IVs,
1796  // whether we are within their respective defs' dominance scope.
1797  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1798  break;
1799  lastInvariantFor = affineFor;
1800  }
1801 
1802  if (it != enclosingAffineOps.rbegin()) {
1803  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1804  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1805  *copyPlacementBlock = lastInvariantFor->getBlock();
1806  } else {
1807  *copyInPlacementStart = begin;
1808  *copyOutPlacementStart = end;
1809  *copyPlacementBlock = &block;
1810  }
1811 }
1812 
1813 // Info comprising stride and number of elements transferred every stride.
1814 struct StrideInfo {
1815  int64_t stride;
1816  int64_t numEltPerStride;
1817 };
1818 
1819 /// Returns striding information for a copy/transfer of this region with
1820 /// potentially multiple striding levels from outermost to innermost. For an
1821 /// n-dimensional region, there can be at most n-1 levels of striding
1822 /// successively nested.
1823 // TODO: make this work with non-identity layout maps.
1824 static void getMultiLevelStrides(const MemRefRegion &region,
1825  ArrayRef<int64_t> bufferShape,
1826  SmallVectorImpl<StrideInfo> *strideInfos) {
1827  if (bufferShape.size() <= 1)
1828  return;
1829 
1830  int64_t numEltPerStride = 1;
1831  int64_t stride = 1;
1832  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1833  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1834  stride *= dimSize;
1835  numEltPerStride *= bufferShape[d];
1836  // A stride is needed only if the region has a shorter extent than the
1837  // memref along the dimension *and* has an extent greater than one along the
1838  // next major dimension.
1839  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1840  strideInfos->push_back({stride, numEltPerStride});
1841  }
1842  }
1843 }
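
// Worked example (editorial addition): for a memref<4x8xf32> and a buffer
// shape of [2, 4], the d = 1 iteration computes dimSize = 8, stride = 8 and
// numEltPerStride = 4; since bufferShape[1] = 4 < 8 and bufferShape[0] = 2 > 1,
// a single {stride = 8, numEltPerStride = 4} entry is recorded, i.e. the
// transfer copies 4 contiguous elements every 8 elements of the memref.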
1844 
1845 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1846 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1847 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1848 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1849 /// contain the expressions to be subtracted out from the respective copy loop
1850 /// iterators in order to index the fast buffer. If `isCopyOut' is true, generates
1851 /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1852 /// copy nest is inserted.
1853 //
1854 /// The copy-in nest is generated as follows as an example for a 2-d region:
1855 /// for x = ...
1856 /// for y = ...
1857 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1858 ///
1859 static AffineForOp
1860 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1861  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1862  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1863  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1864  OpBuilder b) {
1865  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1866  return lbMap.getNumInputs() == lbOperands.size();
1867  }));
1868  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1869  return ubMap.getNumInputs() == ubOperands.size();
1870  }));
1871 
1872  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1873  // A copy nest can't be generated for 0-ranked memrefs.
1874  assert(rank != 0 && "non-zero rank memref expected");
1875  assert(lbMaps.size() == rank && "wrong number of lb maps");
1876  assert(ubMaps.size() == rank && "wrong number of ub maps");
1877 
1878  SmallVector<Value, 4> memIndices;
1879  SmallVector<AffineExpr, 4> fastBufExprs;
1880  SmallVector<Value, 4> fastBufMapOperands;
1881  AffineForOp copyNestRoot;
1882  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1883  for (unsigned d = 0; d < rank; ++d) {
1884  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1885  ubOperands, ubMaps[d]);
1886  if (d == 0)
1887  copyNestRoot = forOp;
1888 
1889  b = OpBuilder::atBlockTerminator(forOp.getBody());
1890 
1891  auto fastBufOffsetMap =
1892  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1893  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1894 
1895  // Construct the subscript for the fast memref being copied into/from:
1896  // x - offset_x.
1897  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1898  b.getAffineDimExpr(2 * d));
1899  fastBufMapOperands.push_back(offset);
1900  fastBufMapOperands.push_back(forOp.getInductionVar());
1901  mayBeDeadApplys.push_back(offset);
1902 
1903  // Subscript for the slow memref being copied.
1904  memIndices.push_back(forOp.getInductionVar());
1905  }
1906 
1907  auto fastBufMap =
1908  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1909  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1910  fastBufMap = simplifyAffineMap(fastBufMap);
1911  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1912 
1913  // Drop any dead affine.applys.
1914  for (auto applyOp : mayBeDeadApplys)
1915  if (applyOp.use_empty())
1916  applyOp.erase();
1917 
1918  if (!isCopyOut) {
1919  // Copy in.
1920  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1921  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1922  fastBufMapOperands);
1923  return copyNestRoot;
1924  }
1925 
1926  // Copy out.
1927  auto load =
1928  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1929  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1930  return copyNestRoot;
1931 }
1932 
1933 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1934 emitRemarkForBlock(Block &block) {
1935  return block.getParentOp()->emitRemark();
1936 }
1937 
1938 /// Creates a buffer in the faster memory space for the specified memref region
1939 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1940 /// space to this one, and replaces all loads/stores in the block range
1941 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1942 /// if copies could not be generated due to yet unimplemented cases.
1943 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1944 /// specify the insertion points where the incoming copies and outgoing copies,
1945 /// respectively, should be inserted (the insertion happens right before the
1946 /// insertion point). Since `begin` can itself be invalidated due to the memref
1947 /// rewriting done from this method, the output argument `nBegin` is set to its
1948 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1949 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1950 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1951 /// allocated.
1952 static LogicalResult generateCopy(
1953  const MemRefRegion &region, Block *block, Block::iterator begin,
1954  Block::iterator end, Block *copyPlacementBlock,
1955  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1956  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1957  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1958  Block::iterator *nBegin, Block::iterator *nEnd) {
1959  *nBegin = begin;
1960  *nEnd = end;
1961 
1962  auto f = begin->getParentOfType<FunctionOpInterface>();
1963  OpBuilder topBuilder(f.getFunctionBody());
1964  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1965 
1966  *sizeInBytes = 0;
1967 
1968  if (begin == end)
1969  return success();
1970 
1971  // Whether the copy-out point is at the end of the block where we are doing
1972  // explicit copying.
1973  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1974 
1975  // Copies for read regions are going to be inserted at 'begin'.
1976  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1977  // Copies for write regions are going to be inserted at 'end'.
1978  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1979  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1980 
1981  // Builder to create constants at the top level.
1982  auto func =
1983  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1984  OpBuilder top(func.getFunctionBody());
1985 
1986  auto loc = region.loc;
1987  auto memref = region.memref;
1988  auto memRefType = cast<MemRefType>(memref.getType());
1989 
1990  if (!memRefType.getLayout().isIdentity()) {
1991  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1992  return failure();
1993  }
1994 
1995  // Indices to use for the copying.
1996  // Indices for the original memref being copied from/to.
1997  SmallVector<Value, 4> memIndices;
1998  // Indices for the faster buffer being copied into/from.
1999  SmallVector<Value, 4> bufIndices;
2000 
2001  unsigned rank = memRefType.getRank();
2002  if (rank == 0) {
2003  LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs supported\n");
2004  return failure();
2005  }
2006 
2007  SmallVector<int64_t, 4> fastBufferShape;
2008 
2009  // Compute the extents of the buffer.
2010  SmallVector<AffineMap, 4> lbs;
2011  lbs.reserve(rank);
2012  std::optional<int64_t> numElements =
2013  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2014  if (!numElements) {
2015  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2016  return failure();
2017  }
2018 
2019  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2020  // This can be supported in the future if needed.
2021  LLVM_DEBUG(llvm::dbgs()
2022  << "Max lower bound for memref region start not supported\n");
2023  return failure();
2024  }
2025 
2026  if (*numElements == 0) {
2027  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2028  return success();
2029  }
2030 
2031  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2032  for (unsigned i = 0; i < rank; ++i) {
2033  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2034  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2035  LLVM_DEBUG(llvm::dbgs()
2036  << "Missing lower or upper bound for region along dimension: "
2037  << i << '\n');
2038  return failure();
2039  }
2040  }
2041 
2042  const FlatAffineValueConstraints *cst = region.getConstraints();
2043  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2044  // on; these typically include loop IVs surrounding the level at which the
2045  // copy generation is being done or other valid symbols in MLIR.
2046  SmallVector<Value, 8> regionSymbols;
2047  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2048 
2049  // Construct the access expression for the fast memory buffer. The access
2050  // expression for a particular dimension of the fast buffer is obtained by
2051  // subtracting out the lower bound on the original memref's data region
2052  // along the corresponding dimension.
2053 
2054  // Index start offsets for faster memory buffer relative to the original.
2055  SmallVector<AffineExpr, 4> fastBufOffsets;
2056  fastBufOffsets.reserve(rank);
2057  for (unsigned d = 0; d < rank; d++) {
2058  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2059  "incorrect bound size");
2060 
2061  // Set copy start location for this dimension in the lower memory space
2062  // memref.
2063  if (lbs[d].isSingleConstant()) {
2064  auto indexVal = lbs[d].getSingleConstantResult();
2065  if (indexVal == 0) {
2066  memIndices.push_back(zeroIndex);
2067  } else {
2068  memIndices.push_back(
2069  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2070  }
2071  } else {
2072  // The coordinate for the start location is just the lower bound along the
2073  // corresponding dimension on the memory region (stored in 'offset').
2074  // Remap all inputs of the map to dimensions uniformly since in the
2075  // generated IR we need valid affine symbols as opposed to "symbols" for
2076  // the purpose of the memref region.
2077  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2078  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2079  symReplacements[i] = top.getAffineDimExpr(i);
2080  lbs[d] = lbs[d].replaceDimsAndSymbols(
2081  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2082  /*numResultSyms=*/0);
2083  memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
2084  }
2085  // The fast buffer is copied into at location zero; addressing is relative.
2086  bufIndices.push_back(zeroIndex);
2087 
2088  // Record the offsets since they are needed to remap the memory accesses of
2089  // the original memref further below.
2090  fastBufOffsets.push_back(lbs[d].getResult(0));
2091  }
2092 
2093  // The faster memory space buffer.
2094  Value fastMemRef;
2095 
2096  // Check if a buffer was already created.
2097  bool existingBuf = fastBufferMap.count(memref) > 0;
2098  if (!existingBuf) {
2099  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2100  auto fastMemRefType =
2101  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2102  fastBufferLayout, copyOptions.fastMemorySpace);
2103 
2104  // Create the fast memory space buffer just before the 'affine.for'
2105  // operation.
2106  fastMemRef =
2107  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2108  // Record it.
2109  fastBufferMap[memref] = fastMemRef;
2110  // fastMemRefType is a constant shaped memref.
2111  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2112  // We don't account for things of unknown size.
2113  *sizeInBytes = maySizeInBytes.value_or(0);
2114 
2115  LLVM_DEBUG(emitRemarkForBlock(*block)
2116  << "Creating fast buffer of type " << fastMemRefType
2117  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2118  << " KiB\n");
2119  } else {
2120  // Reuse the one already created.
2121  fastMemRef = fastBufferMap[memref];
2122  }
2123 
2124  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2125 
2126  Value dmaStride;
2127  Value numEltPerDmaStride;
2128  if (copyOptions.generateDma) {
2129  SmallVector<StrideInfo, 4> dmaStrideInfos;
2130  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2131 
2132  // TODO: use all stride levels once DmaStartOp is extended for
2133  // multi-level strides.
2134  if (dmaStrideInfos.size() > 1) {
2135  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2136  return failure();
2137  }
2138 
2139  if (!dmaStrideInfos.empty()) {
2140  dmaStride =
2141  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2142  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2143  loc, dmaStrideInfos[0].numEltPerStride);
2144  }
2145  }
2146 
2147  // Record the last operation where we want the memref replacement to end. We
2148  // later do the memref replacement only in [begin, postDomFilter] so
2149  // that the original memref's used in the data movement code themselves don't
2150  // get replaced.
2151  auto postDomFilter = std::prev(end);
2152 
2153  // Create fully composed affine maps for each memref.
2154  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2155  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2156  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2157  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2158 
2159  if (!copyOptions.generateDma) {
2160  // Point-wise copy generation.
2161  auto copyNest =
2162  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2163  /*lbOperands=*/regionSymbols, ubMaps,
2164  /*ubOperands=*/regionSymbols, fastBufOffsets,
2165  /*isCopyOut=*/region.isWrite(), b);
2166 
2167  // Record this so that we can skip it from yet another copy.
2168  copyNests.insert(copyNest);
2169 
2170  // Since new ops are being appended (for copy-outs), adjust the end to
2171  // mark end of block range being processed if necessary.
2172  if (region.isWrite() && isCopyOutAtEndOfBlock)
2173  *nEnd = Block::iterator(copyNest.getOperation());
2174  } else {
2175  // DMA generation.
2176  // Create a tag (single element 1-d memref) for the DMA.
2177  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2178  copyOptions.tagMemorySpace);
2179  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2180 
2181  SmallVector<Value, 4> tagIndices({zeroIndex});
2182  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2183  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2184  if (!region.isWrite()) {
2185  // DMA non-blocking read from original buffer to fast buffer.
2186  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2187  fastMemRef, bufAffineMap, bufIndices,
2188  tagMemRef, tagAffineMap, tagIndices,
2189  numElementsSSA, dmaStride, numEltPerDmaStride);
2190  } else {
2191  // DMA non-blocking write from fast buffer to the original memref.
2192  auto op = b.create<AffineDmaStartOp>(
2193  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2194  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2195  dmaStride, numEltPerDmaStride);
2196  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2197  // end to mark end of block range being processed.
2198  if (isCopyOutAtEndOfBlock)
2199  *nEnd = Block::iterator(op.getOperation());
2200  }
2201 
2202  // Matching DMA wait to block on completion; tag always has a 0 index.
2203  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2204  numElementsSSA);
2205 
2206  // Generate dealloc for the tag.
2207  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2208  if (*nEnd == end && isCopyOutAtEndOfBlock)
2209  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2210  // mark end of range of the original.
2211  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2212  }
2213 
2214  // Generate dealloc for the buffer.
2215  if (!existingBuf) {
2216  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2217  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2218  // the fast buffer (since it marks the new end insertion point).
2219  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2220  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2221  }
2222 
2223  // Replace all uses of the old memref with the faster one while remapping
2224  // access indices (subtracting out lower bound offsets for each dimension).
2225  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2226  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2227  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2228  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2229  // d2, d3 correspond to the original indices (%i, %j).
2230  SmallVector<AffineExpr, 4> remapExprs;
2231  remapExprs.reserve(rank);
2232  for (unsigned i = 0; i < rank; i++) {
2233  // The starting operands of indexRemap will be regionSymbols (the symbols on
2234  // which the memref region is parametric); then those corresponding to
2235  // the memref's original indices follow.
2236  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2237  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2238  }
2239  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2240  b.getContext());
2241 
2242  // Record the begin since it may be invalidated by memref replacement.
2243  Block::iterator prevOfBegin;
2244  bool isBeginAtStartOfBlock = (begin == block->begin());
2245  if (!isBeginAtStartOfBlock)
2246  prevOfBegin = std::prev(begin);
2247 
2248  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2249  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2250  /*extraIndices=*/{}, indexRemap,
2251  /*extraOperands=*/regionSymbols,
2252  /*symbolOperands=*/{},
2253  /*domOpFilter=*/&*begin,
2254  /*postDomOpFilter=*/&*postDomFilter);
2255 
2256  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2257 
2258  return success();
2259 }
2260 
2261 /// Construct the memref region to just include the entire memref. Returns
2262 /// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2263 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2264 /// is parametric on.
2265 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2266  MemRefRegion *region) {
2267  unsigned rank;
2268  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2269  rank = loadOp.getMemRefType().getRank();
2270  region->memref = loadOp.getMemRef();
2271  region->setWrite(false);
2272  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2273  rank = storeOp.getMemRefType().getRank();
2274  region->memref = storeOp.getMemRef();
2275  region->setWrite(true);
2276  } else {
2277  assert(false && "expected load or store op");
2278  return false;
2279  }
2280  auto memRefType = cast<MemRefType>(region->memref.getType());
2281  if (!memRefType.hasStaticShape())
2282  return false;
2283 
2284  auto *regionCst = region->getConstraints();
2285 
2286  // Just get the first numSymbols IVs, which the memref region is parametric
2287  // on.
2288  SmallVector<AffineForOp, 4> ivs;
2289  getAffineForIVs(*op, &ivs);
2290  ivs.resize(numParamLoopIVs);
2291  SmallVector<Value, 4> symbols;
2292  extractForInductionVars(ivs, &symbols);
2293  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2294  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2295 
2296  // Memref dim sizes provide the bounds.
2297  for (unsigned d = 0; d < rank; d++) {
2298  auto dimSize = memRefType.getDimSize(d);
2299  assert(dimSize > 0 && "filtered dynamic shapes above");
2300  regionCst->addBound(BoundType::LB, d, 0);
2301  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2302  }
2303  return true;
2304 }
2305 
2306 LogicalResult
2307 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2308  const AffineCopyOptions &copyOptions,
2309  std::optional<Value> filterMemRef,
2310  DenseSet<Operation *> &copyNests) {
2311  if (begin == end)
2312  return success();
2313 
2314  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2315  "Inconsistent block begin/end args");
2316  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2317 
2318  Block *block = begin->getBlock();
2319 
2320  // Copies will be generated for this depth, i.e., symbolic in all loops
2321  // surrounding this block range.
2322  unsigned copyDepth = getNestingDepth(&*begin);
2323 
2324  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2325  << "\n");
2326  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2327  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2328 
2329  // List of memory regions to copy for. We need a map vector to have a
2330  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2331  // since the alloc's for example are identical except for the SSA id.
2332  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2333  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2334 
2335  // Map from original memref's to the fast buffers that their accesses are
2336  // replaced with.
2337  DenseMap<Value, Value> fastBufferMap;
2338 
2339  // To check for errors when walking the block.
2340  bool error = false;
2341 
2342  // Walk this range of operations to gather all memory regions.
2343  block->walk(begin, end, [&](Operation *opInst) {
2344  Value memref;
2345  MemRefType memrefType;
2346  // Gather regions to allocate to buffers in faster memory space.
2347  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2348  memref = loadOp.getMemRef();
2349  memrefType = loadOp.getMemRefType();
2350  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2351  memref = storeOp.getMemRef();
2352  memrefType = storeOp.getMemRefType();
2353  }
2354  // Not an affine.load/store op.
2355  if (!memref)
2356  return;
2357 
2358  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2359  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2360  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2361  return;
2362 
2363  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2364  LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
2365  "which copy-in/copy-out would happen\n");
2366  return;
2367  }
2368 
2369  // Compute the MemRefRegion accessed.
2370  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2371  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2372  /*addMemRefDimBounds=*/false))) {
2373  LLVM_DEBUG(llvm::dbgs()
2374  << "Error obtaining memory region: semi-affine maps?\n");
2375  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2376  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2377  LLVM_DEBUG(
2378  opInst->emitError("non-constant memref sizes not yet supported"));
2379  error = true;
2380  return;
2381  }
2382  }
2383 
2384  // Each memref has a single buffer associated with it irrespective of how
2385  // many loads and stores happen on it.
2386  // TODO: in the future, when regions don't intersect and satisfy
2387  // other properties (based on load/store regions), we could consider
2388  // multiple buffers per memref.
2389 
2390  // Add to the appropriate region if it's not already in it, or take a
2391  // bounding box union with the existing one if it's already in there.
2392  // Note that a memref may have both read and write regions - so update the
2393  // region in the other list if one exists (write in case of read and vice
2394  // versa) since there is a single bounding box for a memref across all reads
2395  // and writes that happen on it.
2396 
2397  // Attempts to update; returns true if 'region' exists in targetRegions.
2398  auto updateRegion =
2399  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2400  &targetRegions) {
2401  const auto *const it = targetRegions.find(region->memref);
2402  if (it == targetRegions.end())
2403  return false;
2404 
2405  // Perform a union with the existing region.
2406  if (failed(it->second->unionBoundingBox(*region))) {
2407  LLVM_DEBUG(llvm::dbgs()
2408  << "Memory region bounding box failed; "
2409  "over-approximating to the entire memref\n");
2410  // If the union fails, we will overapproximate.
2411  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2412  LLVM_DEBUG(opInst->emitError(
2413  "non-constant memref sizes not yet supported"));
2414  error = true;
2415  return true;
2416  }
2417  it->second->getConstraints()->clearAndCopyFrom(
2418  *region->getConstraints());
2419  } else {
2420  // Union was computed and stored in 'it->second': copy to 'region'.
2421  region->getConstraints()->clearAndCopyFrom(
2422  *it->second->getConstraints());
2423  }
2424  return true;
2425  };
2426 
2427  bool existsInRead = updateRegion(readRegions);
2428  if (error)
2429  return;
2430  bool existsInWrite = updateRegion(writeRegions);
2431  if (error)
2432  return;
2433 
2434  // Finally add it to the region list.
2435  if (region->isWrite() && !existsInWrite) {
2436  writeRegions[region->memref] = std::move(region);
2437  } else if (!region->isWrite() && !existsInRead) {
2438  readRegions[region->memref] = std::move(region);
2439  }
2440  });
2441 
2442  if (error) {
2443  LLVM_DEBUG(begin->emitError(
2444  "copy generation failed for one or more memref's in this block\n"));
2445  return failure();
2446  }
2447 
2448  uint64_t totalCopyBuffersSizeInBytes = 0;
2449  bool ret = true;
2450  auto processRegions =
2451  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2452  &regions) {
2453  for (const auto &regionEntry : regions) {
2454  // For each region, hoist copy in/out past all hoistable
2455  // 'affine.for's.
2456  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2457  Block *copyPlacementBlock;
2458  findHighestBlockForPlacement(
2459  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2460  &copyInPlacementStart, &copyOutPlacementStart);
2461 
2462  uint64_t sizeInBytes;
2463  Block::iterator nBegin, nEnd;
2464  LogicalResult iRet = generateCopy(
2465  *regionEntry.second, block, begin, end, copyPlacementBlock,
2466  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2467  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2468  if (succeeded(iRet)) {
2469  // begin/end could have been invalidated, and need update.
2470  begin = nBegin;
2471  end = nEnd;
2472  totalCopyBuffersSizeInBytes += sizeInBytes;
2473  }
2474  ret = ret & succeeded(iRet);
2475  }
2476  };
2477  processRegions(readRegions);
2478  processRegions(writeRegions);
2479 
2480  if (!ret) {
2481  LLVM_DEBUG(begin->emitError(
2482  "copy generation failed for one or more memref's in this block\n"));
2483  return failure();
2484  }
2485 
2486  // For a range of operations, a note will be emitted at the caller.
2487  AffineForOp forOp;
2488  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2489  LLVM_DEBUG(forOp.emitRemark()
2490  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2491  << " KiB of copy buffers in fast memory space for this block");
2492  }
2493 
2494  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2495  block->getParentOp()->emitWarning(
2496  "total size of all copy buffers' for this block exceeds fast memory "
2497  "capacity");
2498  }
2499 
2500  return success();
2501 }
2502 
2503 // A convenience version of affineDataCopyGenerate for all ops in the body of
2504 // an AffineForOp.
2505 LogicalResult mlir::affine::affineDataCopyGenerate(
2506  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2507  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2508  return affineDataCopyGenerate(forOp.getBody()->begin(),
2509  std::prev(forOp.getBody()->end()), copyOptions,
2510  filterMemRef, copyNests);
2511 }
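
// Usage sketch (editorial addition; the option values are arbitrary,
// illustrative assumptions and the field order is annotated for clarity):
// generating point-wise copies into memory space 1 for everything accessed in
// the body of a loop `forOp` could look like
//
//   AffineCopyOptions options{/*generateDma=*/false, /*slowMemorySpace=*/0,
//                             /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
//                             /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);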
2512 
2513 LogicalResult mlir::affine::generateCopyForMemRegion(
2514  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2515  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2516  Block *block = analyzedOp->getBlock();
2517  auto begin = analyzedOp->getIterator();
2518  auto end = std::next(begin);
2519  DenseMap<Value, Value> fastBufferMap;
2520  DenseSet<Operation *> copyNests;
2521 
2522  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2523  copyOptions, fastBufferMap, copyNests,
2524  &result.sizeInBytes, &begin, &end);
2525  if (failed(err))
2526  return err;
2527 
2528  const auto &en = fastBufferMap.find(memrefRegion.memref);
2529  // In some cases (empty loops), no copy generation would have happened.
2530  if (en == fastBufferMap.end())
2531  return failure();
2532  result.alloc = en->second.getDefiningOp();
2533  assert(result.alloc && "fast buffer expected to be locally allocated");
2534  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2535  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2536  return success();
2537 }
2538 
2539 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2540 static void
2541 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2542  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2543  // Add a new empty level to the output if it doesn't already exist.
2544  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2545  if (currLoopDepth == depthToLoops.size())
2546  depthToLoops.emplace_back();
2547 
2548  for (auto &op : *block) {
2549  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2550  depthToLoops[currLoopDepth].push_back(forOp);
2551  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2552  }
2553  }
2554 }
2555 
2556 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2557 void mlir::affine::gatherLoops(
2558  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2559  for (auto &block : func)
2560  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2561 
2562  // Remove last loop level from output since it's empty.
2563  if (!depthToLoops.empty()) {
2564  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2565  depthToLoops.pop_back();
2566  }
2567 }
2568 
2569 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2570  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2571  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2572  SmallVector<Value, 4> lowerOperands(lbOperands);
2573  SmallVector<Value, 4> upperOperands(ubOperands);
2574 
2575  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2576  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2577  lbMap = removeDuplicateExprs(lbMap);
2578  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2579  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2580  ubMap = removeDuplicateExprs(ubMap);
2581 
2582  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2583  step);
2584 }
2585 
2586 /// Creates an AffineIfOp that encodes the conditional to choose between
2587 /// the constant trip count version and an unknown trip count version of this
2588 /// nest of loops. This is used to separate partial and full tiles if `loops`
2589 /// has the intra-tile loops. The affine.if op is inserted at the builder
2590 /// insertion point of `b`.
2591 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2592  OpBuilder b) {
2593  if (loops.empty())
2594  return nullptr;
2595 
2596  auto *context = loops[0].getContext();
2597 
2598  FlatAffineValueConstraints cst;
2599  SmallVector<Operation *, 8> ops;
2600  llvm::append_range(ops, loops);
2601  (void)getIndexSet(ops, &cst);
2602 
2603  // Remove constraints that are independent of these loop IVs.
2604  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2605 
2606  // Construct the constraint set representing the guard for full tiles. The
2607  // lower bound (and upper bound) corresponding to the full tile should be
2608  // larger (and resp. smaller) than any other lower (or upper bound).
2609  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2610  for (auto loop : loops) {
2611  (void)loop;
2612  // TODO: Non-unit stride is not an issue to generalize to.
2613  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2614  // Mark everything as symbols for the purpose of finding a constant diff pair.
2615  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2616  1);
2617  unsigned fullTileLbPos, fullTileUbPos;
2618  if (!((IntegerRelation)cst)
2619  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2620  /*boundFloorDivisor=*/nullptr,
2621  /*ub=*/nullptr, &fullTileLbPos,
2622  &fullTileUbPos)) {
2623  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2624  return nullptr;
2625  }
2626 
2627  SmallVector<unsigned, 4> lbIndices, ubIndices;
2628  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2629 
2630  auto fLb = cst.getInequality(fullTileLbPos);
2631  auto fUb = cst.getInequality(fullTileUbPos);
2632  fullTileLb.assign(fLb.begin(), fLb.end());
2633  fullTileUb.assign(fUb.begin(), fUb.end());
2634 
2635  // Full tile lower bound should be >= any other lower bound.
2636  for (auto lbIndex : lbIndices)
2637  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2638  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2639 
2640  // Full tile upper bound should be <= any other upper bound.
2641  for (auto ubIndex : ubIndices)
2642  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2643  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2644 
2645  cst.removeVar(0);
2646  }
2647 
2648  // The previous step leads to all zeros for the full tile lb and ub position
2649  // itself; remove those and any other duplicates / trivial redundancies.
2650  cst.removeTrivialRedundancy();
2651 
2652  // Turn everything into dims conservatively since we earlier turned all
2653  // trailing ids past the point loop IV into symbols. Some of these could be outer
2654  // loop IVs; we'll canonicalize anyway.
2655  cst.setDimSymbolSeparation(0);
2656 
2657  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2658  // ifCondSet can be null if cst was empty -- this can happen if all loops
2659  // in the nest have constant trip counts.
2660  if (!ifCondSet)
2661  return nullptr;
2662 
2663  SmallVector<Value, 4> setOperands;
2664  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2665  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2666  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2667  /*withElseRegion=*/true);
2668 }
2669 
2670 /// Create the full tile loop nest (along with its body).
2671 static LogicalResult
2672 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2673  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2674  fullTileLoops.reserve(inputNest.size());
2675 
2676  // For each loop in the original nest identify a lower/upper bound pair such
2677  // that their difference is a constant.
2678  FlatAffineValueConstraints cst;
2679  for (auto loop : inputNest) {
2680  // TODO: straightforward to generalize to a non-unit stride.
2681  if (loop.getStepAsInt() != 1) {
2682  LLVM_DEBUG(llvm::dbgs()
2683  << "[tile separation] non-unit stride not implemented\n");
2684  return failure();
2685  }
2686  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2687  (void)getIndexSet(loopOp, &cst);
2688  // We will mark everything other than this loop IV as a symbol for getting a
2689  // pair of <lb, ub> with a constant difference.
2690  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2691  unsigned lbPos, ubPos;
2692  if (!((IntegerRelation)cst)
2693  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2694  /*boundFloorDivisor=*/nullptr,
2695  /*ub=*/nullptr, &lbPos, &ubPos) ||
2696  lbPos == ubPos) {
2697  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2698  "equalities not yet handled\n");
2699  return failure();
2700  }
2701 
2702  // Set all variables as dimensions uniformly since some of those marked as
2703  // symbols above could be outer loop IVs (corresponding tile space IVs).
2704  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2705 
2706  AffineValueMap lbVmap, ubVmap;
2707  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2708  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2709  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2710  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2711  ubVmap.getOperands(), ubVmap.getAffineMap());
2712  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2713  fullTileLoops.push_back(fullTileLoop);
2714  }
2715 
2716  // Add the body for the full tile loop nest.
2717  IRMapping operandMap;
2718  for (const auto &loopEn : llvm::enumerate(inputNest))
2719  operandMap.map(loopEn.value().getInductionVar(),
2720  fullTileLoops[loopEn.index()].getInductionVar());
2721  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2722  for (auto &op : inputNest.back().getBody()->without_terminator())
2723  b.clone(op, operandMap);
2724  return success();
2725 }
2726 
2727 LogicalResult
2728 affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2729  SmallVectorImpl<AffineForOp> *fullTileNest) {
2730  if (inputNest.empty())
2731  return success();
2732 
2733  auto firstLoop = inputNest[0];
2734 
2735  // Each successive for op has to be nested in the other.
2736  auto prevLoop = firstLoop;
2737  for (auto loop : inputNest.drop_front(1)) {
2738  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2739  prevLoop = loop;
2740  }
2741 
2742  // Create the full tile loop nest.
2743  SmallVector<AffineForOp, 4> fullTileLoops;
2744  OpBuilder b(firstLoop);
2745  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2746  if (!fullTileLoops.empty())
2747  fullTileLoops.front().erase();
2748  return failure();
2749  }
2750 
2751  // Create and insert the version select right before the root of the nest.
2752  b = OpBuilder(firstLoop);
2753  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2754  if (!ifOp) {
2755  fullTileLoops.front().erase();
2756  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2757  "separation condition\n");
2758  return failure();
2759  }
2760 
2761  // Move the full tile into the then block.
2762  Block *thenBlock = ifOp.getThenBlock();
2763  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2764  thenBlock->getOperations().splice(
2765  std::prev(thenBlock->end()),
2766  outermostFullTileLoop->getBlock()->getOperations(),
2767  Block::iterator(outermostFullTileLoop));
2768 
2769  // Move the partial tile into the else block. The partial tile is the same as
2770  // the original loop nest.
2771  Block *elseBlock = ifOp.getElseBlock();
2772  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2773  firstLoop->getBlock()->getOperations(),
2774  Block::iterator(firstLoop));
2775 
2776  if (fullTileNest)
2777  *fullTileNest = std::move(fullTileLoops);
2778 
2779  return success();
2780 }
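A minimal usage sketch (not part of this file) of how separateFullTiles is typically driven after tiling: the band collection, the uniform tile size of 32, and the helper name tileAndSeparate are illustrative assumptions, error handling is elided, and header paths reflect the current MLIR layout.

// Illustrative sketch only: tile a perfectly nested affine band rooted at
// `root`, then version the intra-tile (point) loops into full/partial tiles.
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"

using namespace mlir;
using namespace mlir::affine;

static void tileAndSeparate(AffineForOp root) {
  // Collect the perfectly nested band starting at `root`.
  SmallVector<AffineForOp, 6> band;
  getPerfectlyNestedLoops(band, root);

  // Tile every loop in the band by 32 (hypothetical tile size).
  SmallVector<unsigned, 6> tileSizes(band.size(), 32);
  SmallVector<AffineForOp, 6> tiledNest;
  if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
    return;

  // The innermost band.size() loops of the tiled nest are the intra-tile
  // (point) loops; guard them so full tiles run a constant-trip-count version.
  MutableArrayRef<AffineForOp> pointLoops =
      MutableArrayRef<AffineForOp>(tiledNest).take_back(band.size());
  SmallVector<AffineForOp, 6> fullTileNest;
  if (failed(separateFullTiles(pointLoops, &fullTileNest)))
    return; // Separation was not possible for this nest.
}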
2781 
2782 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2783  LogicalResult result(failure());
2784  SmallVector<AffineForOp> loops;
2785  getPerfectlyNestedLoops(loops, op);
2786  if (loops.size() <= 1)
2787  return success();
2788 
2789  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2790  // loops with bounds defined above some loop.
2791  // 1. For each loop, find above which parent loop its operands are
2792  // defined.
2793  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2794  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2795  operandsDefinedAbove[i] = i;
2796  for (unsigned j = 0; j < i; ++j) {
2797  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2798  operandsDefinedAbove[i] = j;
2799  break;
2800  }
2801  }
2802  }
2803 
2804  // 2. Identify bands of loops such that the operands of all of them are
2805  // defined above the first loop in the band. Traverse the nest bottom-up
2806  // so that modifications don't invalidate the inner loops.
2807  for (unsigned end = loops.size(); end > 0; --end) {
2808  unsigned start = 0;
2809  for (; start < end - 1; ++start) {
2810  auto maxPos =
2811  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2812  std::next(operandsDefinedAbove.begin(), end));
2813  if (maxPos > start)
2814  continue;
2815  assert(maxPos == start &&
2816  "expected loop bounds to be known at the start of the band");
2817  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2818  if (succeeded(coalesceLoops(band)))
2819  result = success();
2820  break;
2821  }
2822  // If a band was found and transformed, keep looking at the loops above
2823  // the outermost transformed loop.
2824  if (start != end - 1)
2825  end = start + 1;
2826  }
2827  return result;
2828 }
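A minimal sketch (an assumed driver, not part of this file) of invoking this entry point from a pass: it collects the outermost affine.for ops of a func.func first, since coalescing rewrites the nests being walked. The helper name coalesceOutermostBands is hypothetical.

// Illustrative sketch only: coalesce every outermost perfectly nested band.
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"

using namespace mlir;

static void coalesceOutermostBands(func::FuncOp func) {
  // Collect outermost affine.for ops before transforming anything, since
  // coalescePerfectlyNestedAffineLoops modifies the loops nested under them.
  SmallVector<affine::AffineForOp> outermost;
  func.walk([&](affine::AffineForOp forOp) {
    if (!isa<affine::AffineForOp>(forOp->getParentOp()))
      outermost.push_back(forOp);
  });
  for (affine::AffineForOp forOp : outermost)
    (void)affine::coalescePerfectlyNestedAffineLoops(forOp);
}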
2829 
2830 int64_t affine::numEnclosingInvariantLoops(OpOperand &operand) {
2831  int64_t count = 0;
2832  Operation *currentOp = operand.getOwner();
2833  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2834  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2835  break;
2836  currentOp = loopOp;
2837  count++;
2838  }
2839  return count;
2840 }
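A small sketch (assumed, for illustration; reportHoistableDepths is a hypothetical helper) of querying this utility: for each operand of an operation, it reports how many enclosing loop-like ops define the operand outside themselves, i.e., how far the use could be hoisted.

// Illustrative sketch only.
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

static void reportHoistableDepths(Operation *op) {
  for (OpOperand &operand : op->getOpOperands()) {
    int64_t depth = affine::numEnclosingInvariantLoops(operand);
    op->emitRemark() << "operand #" << operand.getOperandNumber()
                     << " is defined outside " << depth
                     << " enclosing loop(s)";
  }
}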
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:382
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:660
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs a tiled loop nest, without setting the loop bounds, and moves the body of the original loop ne...
Definition: LoopUtils.cpp:423
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2265
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1536
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1934
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:361
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1766
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:174
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:408
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:559
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:464
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2541
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:913
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
Definition: LoopUtils.cpp:1952
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2672
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1824
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:688
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:974
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1087
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1318
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:417
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2591
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:44
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1516
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
Definition: LoopUtils.cpp:1860
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:102
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:921
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:398
unsigned getNumDims() const
Definition: AffineMap.cpp:394
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:407
unsigned getNumResults() const
Definition: AffineMap.cpp:402
unsigned getNumInputs() const
Definition: AffineMap.cpp:403
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:411
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:29
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition: Block.h:305
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:246
OpListType & getOperations()
Definition: Block.h:137
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:33
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:397
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:403
AffineMap getDimIdentityMap()
Definition: Builders.cpp:379
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:383
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:364
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:368
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:360
MLIRContext * getContext() const
Definition: Builders.h:56
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:734
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:314
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:549
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:250
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:453
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:243
Operation * getOperation()
Inherit getOperation from OpState.
Definition: OpDefinition.h:111
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:43
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:717
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:268
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:272
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition: Operation.h:230
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:289
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
Definition: Region.h:222
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:194
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:41
static WalkResult advance()
Definition: Visitors.h:51
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:523
Value getOperand(unsigned idx)
Definition: AffineOps.h:529
operand_range getOperands()
Definition: AffineOps.h:538
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
Definition: AffineOps.h:106
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
Definition: AffineOps.h:315
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:93
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1612
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1157
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:885
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:118
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2307
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1076
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2670
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:1009
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition: Utils.cpp:776
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2557
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1368
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2569
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:856
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:229
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1352
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2513
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1508
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:900
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1390
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1099
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:814
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1513
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:762
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1382
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2830
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1985
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1726
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1591
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1460
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:772
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1295
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2782
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
Definition: LoopUtils.cpp:873
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1370
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2728
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2661
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:773
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:35
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:783
const FrozenRewritePatternSet GreedyRewriteConfig config
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
@ ExistingOps
Only pre-existing ops are processed.
int64_t stride
Definition: LoopUtils.cpp:1815
int64_t numEltPerStride
Definition: LoopUtils.cpp:1816
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:165
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:205
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:481
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:1071
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:527
bool isWrite() const
Definition: Utils.h:529
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:1127
void setWrite(bool flag)
Definition: Utils.h:530
Value memref
Memref that this region corresponds to.
Definition: Utils.h:562
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:569
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.