LoopUtils.cpp
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
25 #include "llvm/ADT/MapVector.h"
26 #include "llvm/ADT/SmallPtrSet.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include <optional>
30 
31 #define DEBUG_TYPE "loop-utils"
32 
33 using namespace mlir;
34 using namespace affine;
35 using namespace presburger;
36 using llvm::SmallMapVector;
37 
38 /// Computes the cleanup loop lower bound of the loop being unrolled with
39 /// the specified unroll factor; this bound will also be the upper bound of the main
40 /// part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// operands or a null map when the trip count can't be expressed as an affine
42 /// expression.
43 static void
44 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45  AffineMap &cleanupLbMap,
46  SmallVectorImpl<Value> &cleanupLbOperands) {
47  AffineMap tripCountMap;
48  SmallVector<Value, 4> tripCountOperands;
49  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50  // Trip count can't be computed.
51  if (!tripCountMap) {
52  cleanupLbMap = AffineMap();
53  return;
54  }
55 
56  OpBuilder b(forOp);
57  auto lbMap = forOp.getLowerBoundMap();
58  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59  forOp.getLowerBoundOperands());
60 
61  // For each upper bound expr, get the range.
62  // Eg: affine.for %i = lb to min (ub1, ub2),
63  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65  // these affine.apply's make up the cleanup loop lower bound.
66  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68  int64_t step = forOp.getStepAsInt();
69  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70  auto tripCountExpr = tripCountMap.getResult(i);
71  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
73  tripCountMap.getNumSymbols(), bumpExprs[i]);
74  bumpValues[i] =
75  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76  }
77 
78  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81 
82  cleanupLbOperands.clear();
83  cleanupLbOperands.push_back(lb);
84  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
85  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
86  b.getContext());
87  // Simplify the cleanupLbMap + cleanupLbOperands.
88  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
89  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
90  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  // Remove any affine.apply's that became dead from the simplification above.
92  for (auto v : bumpValues)
93  if (v.use_empty())
94  v.getDefiningOp()->erase();
95 
96  if (lb.use_empty())
97  lb.erase();
98 }
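// Illustrative example (not part of the original source) for
// getCleanupLoopLowerBound above: unrolling
//   affine.for %i = 0 to 257
// by a factor of 4 gives a trip count of 257, so the bound computed above is
// 0 + (257 - 257 mod 4) = 256; the main unrolled loop then covers [0, 256)
// and the cleanup loop covers [256, 257).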
99 
100 /// Helper to replace uses of loop carried values (iter_args) and loop
101 /// yield values while promoting single iteration affine.for ops.
102 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103  // Replace uses of iter arguments with iter operands (initial values).
104  auto iterOperands = forOp.getInits();
105  auto iterArgs = forOp.getRegionIterArgs();
106  for (auto e : llvm::zip(iterOperands, iterArgs))
107  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108 
109  // Replace uses of loop results with the values yielded by the loop.
110  auto outerResults = forOp.getResults();
111  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112  for (auto e : llvm::zip(outerResults, innerResults))
113  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114 }
115 
116 /// Promotes the loop body of a forOp to its containing block if the forOp
117 /// is known to have a single iteration.
118 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120  if (!tripCount || *tripCount != 1)
121  return failure();
122 
123  // TODO: extend this for arbitrary affine bounds.
124  if (forOp.getLowerBoundMap().getNumResults() != 1)
125  return failure();
126 
127  // Replaces all IV uses to its single iteration value.
128  auto iv = forOp.getInductionVar();
129  auto *parentBlock = forOp->getBlock();
130  if (!iv.use_empty()) {
131  if (forOp.hasConstantLowerBound()) {
132  auto func = forOp->getParentOfType<FunctionOpInterface>();
133  OpBuilder builder(forOp->getContext());
134  if (func)
135  builder.setInsertionPointToStart(&func.getFunctionBody().front());
136  else
137  builder.setInsertionPoint(forOp);
138  auto constOp = builder.create<arith::ConstantIndexOp>(
139  forOp.getLoc(), forOp.getConstantLowerBound());
140  iv.replaceAllUsesWith(constOp);
141  } else {
142  auto lbOperands = forOp.getLowerBoundOperands();
143  auto lbMap = forOp.getLowerBoundMap();
144  OpBuilder builder(forOp);
145  if (lbMap == builder.getDimIdentityMap()) {
146  // No need of generating an affine.apply.
147  iv.replaceAllUsesWith(lbOperands[0]);
148  } else {
149  auto affineApplyOp =
150  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
151  iv.replaceAllUsesWith(affineApplyOp);
152  }
153  }
154  }
155 
156  replaceIterArgsAndYieldResults(forOp);
157 
158  // Move the loop body operations, except for its terminator, to the loop's
159  // containing block.
160  forOp.getBody()->back().erase();
161  parentBlock->getOperations().splice(Block::iterator(forOp),
162  forOp.getBody()->getOperations());
163  forOp.erase();
164  return success();
165 }
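// Illustrative example (not part of the original source) for
// promoteIfSingleIteration above: a single-iteration loop such as
//   affine.for %i = 42 to 43 { "use"(%i) : (index) -> () }
// is promoted by replacing all uses of %i with an arith.constant 42 (created
// at the start of the enclosing function when the lower bound is constant),
// splicing the remaining body ops into the parent block, and erasing the
// affine.for op.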
166 
167 /// Generates an affine.for op with the specified lower and upper bounds
168 /// while generating the right IV remappings to realize shifts for operations in
169 /// its body. The operations that go into the loop body are specified in
170 /// opGroupQueue starting from the specified offset, and in that order. The
171 /// first element of the pair specifies the shift applied to that group of
172 /// operations; the shift is multiplied by the loop step before being applied.
173 /// Returns nullptr if the generated loop simplifies to a single iteration one.
174 static AffineForOp generateShiftedLoop(
175  AffineMap lbMap, AffineMap ubMap,
176  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178  auto lbOperands = srcForOp.getLowerBoundOperands();
179  auto ubOperands = srcForOp.getUpperBoundOperands();
180 
181  assert(lbMap.getNumInputs() == lbOperands.size());
182  assert(ubMap.getNumInputs() == ubOperands.size());
183 
184  auto loopChunk =
185  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
186  ubMap, srcForOp.getStepAsInt());
187  auto loopChunkIV = loopChunk.getInductionVar();
188  auto srcIV = srcForOp.getInductionVar();
189 
190  IRMapping operandMap;
191 
192  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
193  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
194  uint64_t shift = it.first;
195  auto ops = it.second;
196  // All 'same shift' operations get added with their operands being
197  // remapped to results of cloned operations, and their IV uses remapped.
198  // Generate the remapping if the shift is not zero: remappedIV = newIV -
199  // shift.
200  if (!srcIV.use_empty() && shift != 0) {
201  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202  srcForOp.getLoc(),
203  bodyBuilder.getSingleDimShiftAffineMap(
204  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205  loopChunkIV);
206  operandMap.map(srcIV, ivRemap);
207  } else {
208  operandMap.map(srcIV, loopChunkIV);
209  }
210  for (auto *op : ops)
211  bodyBuilder.clone(*op, operandMap);
212  };
213  if (succeeded(promoteIfSingleIteration(loopChunk)))
214  return AffineForOp();
215  return loopChunk;
216 }
217 
218 // The skewing of operations with respect to one another can be used for
219 // example to allow overlap of asynchronous operations (such as DMA
220 // communication) with computation, or just relative shifting of operations
221 // for better register reuse, locality or parallelism. As such, the shifts are
222 // typically expected to be at most of the order of the number of operations.
223 // This method should not be used as a substitute for loop distribution/fission.
224 // This method uses an algorithm linear in time in the number of operations
225 // in the body of the for loop (using the 'sweep line' paradigm). This method
226 // asserts preservation of SSA dominance. A check for that, as well as for
227 // memory-based dependence preservation, rests with the users of this
228 // method.
229 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230  ArrayRef<uint64_t> shifts,
231  bool unrollPrologueEpilogue) {
232  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233  "too few/many shifts");
234  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
235  return success();
236 
237  // If the trip counts aren't constant, we would need versioning and
238  // conditional guards (or context information to prevent such versioning). The
239  // better way to pipeline for such loops is to first tile them and extract
240  // constant trip count "full tiles" before applying this.
241  auto mayBeConstTripCount = getConstantTripCount(forOp);
242  if (!mayBeConstTripCount) {
243  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244  return success();
245  }
246  uint64_t tripCount = *mayBeConstTripCount;
247 
248  assert(isOpwiseShiftValid(forOp, shifts) &&
249  "shifts will lead to an invalid transformation\n");
250 
251  int64_t step = forOp.getStepAsInt();
252 
253  unsigned numChildOps = shifts.size();
254 
255  // Do a linear time (counting) sort for the shifts.
256  uint64_t maxShift = *llvm::max_element(shifts);
257  if (maxShift >= numChildOps) {
258  // Large shifts are not the typical use case.
259  forOp.emitWarning("not shifting because shifts are unrealistically large");
260  return success();
261  }
262 
263  // An array of operation groups sorted by shift amount; each group has all
264  // operations with the same shift in the order in which they appear in the
265  // body of the 'affine.for' op.
266  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267  unsigned pos = 0;
268  for (auto &op : forOp.getBody()->without_terminator()) {
269  auto shift = shifts[pos++];
270  sortedOpGroups[shift].push_back(&op);
271  }
272 
273  // Unless the shifts have a specific pattern (which actually would be the
274  // common use case), prologue and epilogue are not meaningfully defined.
275  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276  // loop generated as the prologue and the last as epilogue and unroll these
277  // fully.
278  AffineForOp prologue, epilogue;
279 
280  // Do a sweep over the sorted shifts while storing open groups in a
281  // vector, and generating loop portions as necessary during the sweep. A block
282  // of operations is paired with its shift.
283  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284 
285  auto origLbMap = forOp.getLowerBoundMap();
286  uint64_t lbShift = 0;
287  OpBuilder b(forOp);
288  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289  // If nothing is shifted by d, continue.
290  if (sortedOpGroups[d].empty())
291  continue;
292  if (!opGroupQueue.empty()) {
293  assert(d > 0 &&
294  "Queue expected to be empty when the first block is found");
295  // The interval for which the loop needs to be generated here is:
296  // [lbShift, min(lbShift + tripCount, d)) and the body of the
297  // loop needs to have all operations in opQueue in that order.
298  AffineForOp res;
299  if (lbShift + tripCount * step < d * step) {
300  res = generateShiftedLoop(
301  b.getShiftedAffineMap(origLbMap, lbShift),
302  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
303  opGroupQueue, /*offset=*/0, forOp, b);
304  // Entire loop for the queued op groups generated, empty it.
305  opGroupQueue.clear();
306  lbShift += tripCount * step;
307  } else {
308  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
309  b.getShiftedAffineMap(origLbMap, d),
310  opGroupQueue, /*offset=*/0, forOp, b);
311  lbShift = d * step;
312  }
313 
314  if (res) {
315  // Simplify/canonicalize the affine.for.
316  RewritePatternSet patterns(res.getContext());
317  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
318  bool erased;
319  (void)applyOpPatternsGreedily(
320  res.getOperation(), std::move(patterns),
321  GreedyRewriteConfig().setStrictness(
322  GreedyRewriteStrictness::ExistingAndNewOps),
323  /*changed=*/nullptr, &erased);
324  if (!erased && !prologue)
325  prologue = res;
326  if (!erased)
327  epilogue = res;
328  }
329  } else {
330  // Start of first interval.
331  lbShift = d * step;
332  }
333  // Augment the list of operations that get into the current open interval.
334  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
335  }
336 
337  // The operation groups left in the queue now need to be processed (FIFO)
338  // and their loops completed.
339  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
342  b.getShiftedAffineMap(origLbMap, ubShift),
343  opGroupQueue, /*offset=*/i, forOp, b);
344  lbShift = ubShift;
345  if (!prologue)
346  prologue = epilogue;
347  }
348 
349  // Erase the original for op.
350  forOp.erase();
351 
352  if (unrollPrologueEpilogue && prologue)
353  (void)loopUnrollFull(prologue);
354  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
355  (void)loopUnrollFull(epilogue);
356 
357  return success();
358 }
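// Illustrative example (not part of the original source) for
// affineForOpBodySkew above: with shifts = [0, 1] on a body holding ops A and
// B inside
//   affine.for %i = 0 to 8 { A(%i); B(%i) }
// the sweep yields a prologue loop over [0, 1) running only A, a steady-state
// loop over [1, 8) running A(%i) and B(%i - 1), and an epilogue loop over
// [8, 9) running B(%i - 1); single-iteration prologue/epilogue loops are then
// promoted, and each shift is multiplied by the loop step before being
// applied.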
359 
360 /// Checks whether a loop nest is hyper-rectangular or not.
361 static LogicalResult
362 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363  FlatAffineValueConstraints cst;
364  SmallVector<Operation *, 8> ops(input.begin(), input.end());
365  // 0-d or 1-d is trivially hyper-rectangular.
366  if (input.size() <= 1)
367  return success();
368  if (failed(getIndexSet(ops, &cst))) {
369  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
370  return failure();
371  }
372  if (!cst.isHyperRectangular(0, input.size())) {
373  LLVM_DEBUG(llvm::dbgs()
374  << "Non-hyperrectangular nests not supported for tiling!\n");
375  return failure();
376  }
377  return success();
378 }
379 
380 /// Check if the input nest is supported for tiling and whether tiling would be
381 /// legal or not.
382 template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384  ArrayRef<t> tileSizes) {
385  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386 
387  if (llvm::any_of(input,
388  [](AffineForOp op) { return op.getNumResults() > 0; })) {
389  LLVM_DEBUG(llvm::dbgs()
390  << "Cannot tile nest where a loop has yield values\n");
391  return failure();
392  }
393 
394  // Check if the supplied `for` ops are all successively nested.
395  if (!isPerfectlyNested(input)) {
396  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397  return failure();
398  }
399 
400  // TODO: handle non hyper-rectangular spaces.
401  if (failed(checkIfHyperRectangular(input)))
402  return failure();
403 
404  return success();
405 }
406 
407 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
408 /// location in destination's body, ignoring the terminator.
409 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410  Block::iterator loc) {
411  auto &ops = src.getBody()->getOperations();
412  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
413  std::prev(ops.end()));
414 }
415 
416 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417 /// body.
418 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
420 }
421 
422 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
423 /// the body of the original loop nest to the tiled loop nest.
424 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425  AffineForOp rootAffineForOp, unsigned width,
426  MutableArrayRef<AffineForOp> tiledLoops) {
427  Location loc = rootAffineForOp.getLoc();
428 
429  // The outermost among the loops as we add more..
430  Operation *topLoop = rootAffineForOp.getOperation();
431  AffineForOp innermostPointLoop;
432 
433  // Add intra-tile (or point) loops.
434  for (unsigned i = 0; i < width; i++) {
435  OpBuilder b(topLoop);
436  // Loop bounds will be set later.
437  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
438  pointLoop.getBody()->getOperations().splice(
439  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
440  topLoop);
441  tiledLoops[2 * width - 1 - i] = pointLoop;
442  topLoop = pointLoop.getOperation();
443  if (i == 0)
444  innermostPointLoop = pointLoop;
445  }
446 
447  // Add tile space loops;
448  for (unsigned i = width; i < 2 * width; i++) {
449  OpBuilder b(topLoop);
450  // Loop bounds will be set later.
451  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
452  tileSpaceLoop.getBody()->getOperations().splice(
453  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
454  topLoop);
455  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456  topLoop = tileSpaceLoop.getOperation();
457  }
458 
459  // Move the loop body of the original nest to the new one.
460  moveLoopBody(origLoops.back(), innermostPointLoop);
461 }
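// Illustrative example (not part of the original source) for
// constructTiledLoopNest above: for a 2-d band (width == 2) rooted at
//   affine.for %i { affine.for %j { <body> } }
// four empty loops are created around the original nest, nested as
//   %i_tile { %j_tile { %i_point { %j_point { ... } } } }
// with tiledLoops = [%i_tile, %j_tile, %i_point, %j_point], and <body> is
// moved into the innermost point loop. The bounds of all four loops are set
// by the callers afterwards.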
462 
463 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
464 // TODO: Handle non-constant lower bounds.
465 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466  AffineForOp newInterTileLoop,
467  AffineForOp newIntraTileLoop,
468  Value tileSize) {
469  // The lower bound for the intra-tile loop is represented by the affine map
470  // (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
471  // for the intra-tile loop is represented by the affine map
472  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb),
473  // where %i is the IV of the corresponding inter-tile loop, %t0 is the
474  // corresponding tiling parameter, %origlb is the original constant lower
475  // bound, and %origLoopStep is the step of the original loop.
476 
477  assert(origLoop.hasConstantLowerBound() &&
478  "expected input loops to have constant lower bound.");
479 
480  // Get lower bound of original loop as an affine expression.
481  AffineExpr origLowerBoundExpr;
482  origLowerBoundExpr =
483  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
484 
485  // Add dim operands from original lower/upper bound.
486  SmallVector<Value, 4> lbOperands, ubOperands;
487  AffineBound lb = origLoop.getLowerBound();
488  AffineBound ub = origLoop.getUpperBound();
489  lbOperands.reserve(lb.getNumOperands() + 2);
490  ubOperands.reserve(ub.getNumOperands() + 2);
491  AffineMap origLbMap = lb.getMap();
492  AffineMap origUbMap = ub.getMap();
493  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
494  lbOperands.push_back(lb.getOperand(j));
495  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
496  ubOperands.push_back(ub.getOperand(j));
497 
498  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
499  // IV.
500  lbOperands.push_back(newInterTileLoop.getInductionVar());
501  ubOperands.push_back(newInterTileLoop.getInductionVar());
502 
503  // Get loop IV as an affine expression for lower/upper bound. Size of
504  // lb/ubOperands is guaranteed to be at least one.
505  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
506  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
507 
508  // Add symbol operands from original lower/upper bound.
509  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
510  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
511  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
512  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
513 
514  // Add a new symbol operand which is the tile size for this loop.
515  lbOperands.push_back(tileSize);
516  ubOperands.push_back(tileSize);
517 
518  SmallVector<AffineExpr, 4> lbBoundExprs;
519  SmallVector<AffineExpr, 4> ubBoundExprs;
520  lbBoundExprs.reserve(origLbMap.getNumResults());
521  ubBoundExprs.reserve(origUbMap.getNumResults());
522 
523  // Get tiling parameter as an affine expression for lb/ub.
524  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
525  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
526 
527  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528  lbBoundExprs.push_back(
529  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
530  origLowerBoundExpr);
531 
532  // Get the origLoopStep as an affine expression.
533  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
534 
535  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
536  // (tilingParameter * origLoopStep) + origlb.
537  ubBoundExprs.push_back(
538  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
539  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
540 
541  ubBoundExprs.append(origUbMap.getResults().begin(),
542  origUbMap.getResults().end());
543 
544  AffineMap lbMap =
545  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
546  lbBoundExprs, b.getContext());
547  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
548 
549  AffineMap ubMap =
550  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
551  ubBoundExprs, b.getContext());
552  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
553 
554  // Original loop step must be preserved.
555  newIntraTileLoop.setStep(origLoop.getStepAsInt());
556 }
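// Illustrative example (not part of the original source) for
// setIntraTileBoundsParametric above: for an original loop
//   affine.for %i = 0 to 1024 step 1
// with inter-tile IV %ii and tile-size SSA value %t0, the maps constructed
// above give the intra-tile loop
//   lb: (%ii - 0) * %t0 + 0                      i.e. %ii * %t0
//   ub: min((%ii - 0) * %t0 + %t0 * 1 + 0, 1024)
// while the original step of 1 is preserved.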
557 
558 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
559 // TODO: Handle non-constant lower bounds.
560 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561  AffineForOp newLoop, Value tileSize) {
562  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563 
564  // The lower bounds for inter-tile loops are the same as the corresponding lower
565  // bounds of original loops.
566  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
567 
568  // The new upper bound map for inter-tile loops, assuming constant lower
569  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
570  // originalLowerBound), tilingParameter), where the tiling parameter is the
571  // respective tile size for that loop. E.g., if the original ubMap was
572  // ()->(1024), the new map will be
573  // ()[s0]->((1024 - lb) ceildiv s0 + lb), where s0 is the tiling parameter.
574  // Therefore a new symbol operand is inserted in the map and the result
575  // expression is overwritten.
576 
577  assert(origLoop.hasConstantLowerBound() &&
578  "expected input loops to have constant lower bound.");
579 
580  // Get lower bound of original loop as an affine expression.
581  AffineExpr origLowerBoundExpr;
582  origLowerBoundExpr =
583  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
584 
585  // Add dim operands from original upper bound.
586  SmallVector<Value, 4> ubOperands;
587  AffineBound ub = origLoop.getUpperBound();
588  ubOperands.reserve(ub.getNumOperands() + 1);
589  AffineMap origUbMap = ub.getMap();
590  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
591  ubOperands.push_back(ub.getOperand(j));
592 
593  // Add symbol operands from original upper bound.
594  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
595  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
596 
597  // Add a new symbol operand which is the tile size for this loop.
598  ubOperands.push_back(tileSize);
599 
600  // Get tiling parameter as an affine expression.
601  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
602 
603  SmallVector<AffineExpr, 4> boundExprs;
604  boundExprs.reserve(origUbMap.getNumResults());
605  int64_t origUpperBound;
606  AffineExpr origUpperBoundExpr;
607 
608  // If upper bound for the original loop is constant, then the constant can
609  // be obtained as an affine expression straight away.
610  if (origLoop.hasConstantUpperBound()) {
611  origUpperBound = origLoop.getConstantUpperBound();
612 
613  // Get original constant upper bound as an affine expression.
614  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
615 
616  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
617  // originalLowerBound), tilingParameter).
618  boundExprs.push_back(
619  origLowerBoundExpr +
620  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
621  } else {
622  // If the upper bound for the original loop is not constant, then two cases
623  // are possible, although their handling is the same: 1.) The result of
624  // ubmap has only one result expression. For e.g.
625  // affine.for %i = 5 to %ub
626  //
627  // A symbol operand is added which represents the tiling parameter. The
628  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629  // where 's0' is the original upper bound and 's1' is the tiling
630  // parameter. 2.) When ubMap has more than one result expression. For e.g.
631  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
632  // affine.for %i = 5 to min #map0()[%s0, %s1]
633  //
634  // A symbol operand is added which represents the tiling parameter. The
635  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636  // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
637 
638  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639  // originalLowerBound), tilingParameter).
640  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641  boundExprs.push_back(
642  origLowerBoundExpr +
643  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
644  }
645 
646  AffineMap ubMap =
647  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
648  boundExprs, b.getContext());
649  newLoop.setUpperBound(ubOperands, ubMap);
650 
651  // Original loop step must be preserved.
652  newLoop.setStep(origLoop.getStepAsInt());
653 }
654 
655 /// Constructs and sets new loop bounds after tiling for the case of
656 /// hyper-rectangular index sets, where the bounds of one dimension do not
657 /// depend on other dimensions and tiling parameters are captured from SSA
658 /// values. Bounds of each dimension can thus be treated independently,
659 /// and deriving the new bounds is much simpler and faster than for the case of
660 /// tiling arbitrary polyhedral shapes.
661 static void constructParametricallyTiledIndexSetHyperRect(
662  MutableArrayRef<AffineForOp> origLoops,
663  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
664  assert(!origLoops.empty() && "expected at least one loop in band");
665  assert(origLoops.size() == tileSizes.size() &&
666  "expected tiling parameter for each loop in band.");
667 
668  OpBuilder b(origLoops[0].getOperation());
669  unsigned width = origLoops.size();
670 
671  // Set bounds for tile space loops.
672  for (unsigned i = 0; i < width; ++i) {
673  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
674  }
675 
676  // Set bounds for intra-tile loops.
677  for (unsigned i = 0; i < width; ++i) {
678  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
679  newLoops[i + width], tileSizes[i]);
680  }
681 }
682 
683 /// Constructs and sets new loop bounds after tiling for the case of
684 /// hyper-rectangular index sets, where the bounds of one dimension do not
685 /// depend on other dimensions. Bounds of each dimension can thus be treated
686 /// independently, and deriving the new bounds is much simpler and faster
687 /// than for the case of tiling arbitrary polyhedral shapes.
688 static void
689 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690  MutableArrayRef<AffineForOp> newLoops,
691  ArrayRef<unsigned> tileSizes) {
692  assert(!origLoops.empty());
693  assert(origLoops.size() == tileSizes.size());
694 
695  OpBuilder b(origLoops[0].getOperation());
696  unsigned width = origLoops.size();
697 
698  // Bounds for tile space loops.
699  for (unsigned i = 0; i < width; i++) {
700  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
703  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
704  // If the step size of original loop is x and tileSize is y then after
705  // tiling the tile space loops' step size becomes x*y.
706  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707  }
708  // Bounds for intra-tile loops.
709  for (unsigned i = 0; i < width; i++) {
710  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
711  std::optional<uint64_t> mayBeConstantCount =
712  getConstantTripCount(origLoops[i]);
713  // The lower bound is just the tile-space loop.
714  AffineMap lbMap = b.getDimIdentityMap();
715  newLoops[width + i].setLowerBound(
716  /*operands=*/newLoops[i].getInductionVar(), lbMap);
717  // The step size of intra-tile loops is just the original loops' step size.
718  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719 
720  // Set the upper bound.
721  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722  // Trip count is less than the tile size: upper bound is lower bound +
723  // trip count * stepSize.
724  AffineMap ubMap = b.getSingleDimShiftAffineMap(
725  *mayBeConstantCount * origLoops[i].getStepAsInt());
726  newLoops[width + i].setUpperBound(
727  /*operands=*/newLoops[i].getInductionVar(), ubMap);
728  } else if (largestDiv % tileSizes[i] != 0) {
729  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730  // Construct the upper bound map; the operands are the original operands
731  // with 'i' (tile-space loop) appended to it. The new upper bound map is
732  // the original one with an additional expression i + tileSize * stepSize
733  // appended.
734 
735  // Add dim operands from original upper bound.
736  SmallVector<Value, 4> ubOperands;
737  AffineBound ub = origLoops[i].getUpperBound();
738  ubOperands.reserve(ub.getNumOperands() + 1);
739  AffineMap origUbMap = ub.getMap();
740  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
741  ubOperands.push_back(ub.getOperand(j));
742 
743  // Add dim operand for new loop upper bound.
744  ubOperands.push_back(newLoops[i].getInductionVar());
745 
746  // Add symbol operands from original upper bound.
747  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
748  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
749 
750  SmallVector<AffineExpr, 4> boundExprs;
751  boundExprs.reserve(1 + origUbMap.getNumResults());
752  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
753  // The new upper bound map is the original one with an additional
754  // expression i + tileSize * stepSize (of original loop) appended.
755  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
756  boundExprs.append(origUbMap.getResults().begin(),
757  origUbMap.getResults().end());
758  AffineMap ubMap =
759  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
760  boundExprs, b.getContext());
761  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
762  } else {
763  // No need of the min expression.
764  AffineExpr dim = b.getAffineDimExpr(0);
765  AffineMap ubMap = AffineMap::get(
766  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
767  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
768  }
769  }
770 }
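// Illustrative example (not part of the original source) for
// constructTiledIndexSetHyperRect above: tiling
//   affine.for %i = 0 to 256 step 1
// with a tile size of 32 gives the tile-space loop
//   affine.for %ii = 0 to 256 step 32
// and the intra-tile loop
//   affine.for %iv = %ii to %ii + 32 step 1
// with no min in the upper bound since 32 divides the trip count of 256.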
771 
772 LogicalResult
773 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774  ArrayRef<unsigned> tileSizes,
775  SmallVectorImpl<AffineForOp> *tiledNest) {
776  if (input.empty())
777  return success();
778 
779  if (failed(performPreTilingChecks(input, tileSizes)))
780  return failure();
781 
782  MutableArrayRef<AffineForOp> origLoops = input;
783  AffineForOp rootAffineForOp = origLoops[0];
784 
785  // Note that width is at least one since the band isn't empty.
786  unsigned width = input.size();
787  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788 
789  // Construct a tiled loop nest without setting their bounds. Bounds are
790  // set later.
791  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792 
793  SmallVector<Value, 8> origLoopIVs;
794  extractForInductionVars(input, &origLoopIVs);
795 
796  // Set loop bounds for the tiled loop nest.
797  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
798 
799  // Replace original IVs with intra-tile loop IVs.
800  for (unsigned i = 0; i < width; i++)
801  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
802 
803  // Erase the old loop nest.
804  rootAffineForOp.erase();
805 
806  if (tiledNest)
807  *tiledNest = std::move(tiledLoops);
808 
809  return success();
810 }
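// Illustrative usage sketch (assumed driver code, not part of the original
// source) for tilePerfectlyNested above, assuming a func::FuncOp `funcOp`:
//   std::vector<SmallVector<AffineForOp, 6>> bands;
//   getTileableBands(funcOp, &bands);
//   for (auto &band : bands) {
//     SmallVector<unsigned, 6> tileSizes(band.size(), 32);
//     (void)tilePerfectlyNested(band, tileSizes, /*tiledNest=*/nullptr);
//   }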
811 
812 /// Tiles the specified band of perfectly nested loops creating tile-space
813 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
814 /// is a contiguous set of loops.
815 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817  SmallVectorImpl<AffineForOp> *tiledNest) {
818  if (input.empty())
819  return success();
820 
821  if (failed(performPreTilingChecks(input, tileSizes)))
822  return failure();
823 
824  MutableArrayRef<AffineForOp> origLoops = input;
825  AffineForOp rootAffineForOp = origLoops[0];
826  unsigned width = input.size();
827  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828 
829  // Construct a tiled loop nest without setting their bounds. Bounds are
830  // set later.
831  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832 
833  SmallVector<Value, 8> origLoopIVs;
834  extractForInductionVars(input, &origLoopIVs);
835 
836  // Set loop bounds for the tiled loop nest.
837  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
838  tileSizes);
839 
840  // Replace original IVs with intra-tile loop IVs.
841  for (unsigned i = 0; i < width; i++)
842  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
843 
844  // Erase the old loop nest.
845  rootAffineForOp.erase();
846 
847  if (tiledNest)
848  *tiledNest = std::move(tiledLoops);
849 
850  return success();
851 }
852 
853 /// Get the perfectly nested sequence of loops starting at the root of the
854 /// loop nest (the first op being another AffineForOp, and the second op being
855 /// the terminator). A loop is perfectly nested iff the first op in its body is
856 /// another AffineForOp and the second op is the terminator.
857 void mlir::affine::getPerfectlyNestedLoops(
858  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860  nestedLoops.push_back(root);
861  Block &body = root.getRegion().front();
862  if (body.begin() != std::prev(body.end(), 2))
863  return;
864 
865  root = dyn_cast<AffineForOp>(&body.front());
866  if (!root)
867  return;
868  }
869 }
870 
871 /// Identify valid and profitable bands of loops to tile. This is currently just
872 /// a temporary placeholder to test the mechanics of tiled code generation.
873 /// Returns all maximal outermost perfect loop nests to tile.
874 void mlir::affine::getTileableBands(
875  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876  // Get maximal perfect nest of 'affine.for' insts starting from root
877  // (inclusive).
878  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879  SmallVector<AffineForOp, 6> band;
880  getPerfectlyNestedLoops(band, forOp);
881  bands->push_back(band);
882  }
883 }
884 
885 /// Unrolls this loop completely.
886 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888  if (mayBeConstantTripCount.has_value()) {
889  uint64_t tripCount = *mayBeConstantTripCount;
890  if (tripCount == 0)
891  return success();
892  if (tripCount == 1)
893  return promoteIfSingleIteration(forOp);
894  return loopUnrollByFactor(forOp, tripCount);
895  }
896  return failure();
897 }
898 
899 /// Unrolls this loop by the specified factor or by the trip count (if
900 /// constant), whichever is lower.
901 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902  uint64_t unrollFactor) {
903  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904  if (mayBeConstantTripCount.has_value() &&
905  *mayBeConstantTripCount < unrollFactor)
906  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
907  return loopUnrollByFactor(forOp, unrollFactor);
908 }
909 
910 /// Generates unrolled copies of the contents of 'loopBodyBlock', using the
911 /// associated induction variable 'forOpIV', by 'unrollFactor', calling
912 /// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
913 /// annotates the Ops in each unrolled iteration using annotateFn.
914 static void generateUnrolledLoop(
915  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918  ValueRange iterArgs, ValueRange yieldedValues) {
919  // Builder to insert unrolled bodies just before the terminator of the body of
920  // 'forOp'.
921  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
922 
923  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924  if (!annotateFn)
925  annotateFn = defaultAnnotateFn;
926 
927  // Keep a pointer to the last non-terminator operation in the original block
928  // so that we know what to clone (since we are doing this in-place).
929  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
930 
931  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932  SmallVector<Value, 4> lastYielded(yieldedValues);
933 
934  for (unsigned i = 1; i < unrollFactor; i++) {
935  IRMapping operandMap;
936 
937  // Prepare operand map.
938  operandMap.map(iterArgs, lastYielded);
939 
940  // If the induction variable is used, create a remapping to the value for
941  // this unrolled instance.
942  if (!forOpIV.use_empty()) {
943  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
944  operandMap.map(forOpIV, ivUnroll);
945  }
946 
947  // Clone the original body of 'forOp'.
948  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
949  Operation *clonedOp = builder.clone(*it, operandMap);
950  annotateFn(i, clonedOp, builder);
951  }
952 
953  // Update yielded values. If the yielded value is defined outside the
954  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956  // value with the clone corresponding to the yielded value.
957  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958  Operation *defOp = yieldedValues[i].getDefiningOp();
959  if (defOp && defOp->getBlock() == loopBodyBlock)
960  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
961  }
962  }
963 
964  // Make sure we annotate the Ops in the original body. We do this last so that
965  // any annotations are not copied into the cloned Ops above.
966  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
967  annotateFn(0, &*it, builder);
968 
969  // Update operands of the yield statement.
970  loopBodyBlock->getTerminator()->setOperands(lastYielded);
971 }
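// Illustrative example (not part of the original source) for
// generateUnrolledLoop above: with unrollFactor == 3, a body <ops(%iv)> ending
// in a yield becomes
//   <ops(%iv)>; <ops(ivRemapFn(1, %iv))>; <ops(ivRemapFn(2, %iv))>
// and the terminating yield is updated to forward the values yielded by the
// last copy, so loop-carried values are chained through all three copies.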
972 
973 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974 /// count is not a multiple of `unrollFactor`.
975 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976  uint64_t unrollFactor) {
977  // Insert the cleanup loop right after 'forOp'.
978  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
979  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
980 
981  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
982  // and produce results for the original users of `forOp` results.
983  auto results = forOp.getResults();
984  auto cleanupResults = cleanupForOp.getResults();
985  auto cleanupIterOperands = cleanupForOp.getInits();
986 
987  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
988  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
989  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
990  }
991 
992  AffineMap cleanupMap;
993  SmallVector<Value, 4> cleanupOperands;
994  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
995  if (!cleanupMap)
996  return failure();
997 
998  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
999  // Promote the loop body up if this has turned into a single iteration loop.
1000  (void)promoteIfSingleIteration(cleanupForOp);
1001 
1002  // Adjust upper bound of the original loop; this is the same as the lower
1003  // bound of the cleanup loop.
1004  forOp.setUpperBound(cleanupOperands, cleanupMap);
1005  return success();
1006 }
1007 
1008 /// Unrolls this loop by the specified factor. Returns success if the loop
1009 /// is successfully unrolled.
1010 LogicalResult mlir::affine::loopUnrollByFactor(
1011  AffineForOp forOp, uint64_t unrollFactor,
1012  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013  bool cleanUpUnroll) {
1014  assert(unrollFactor > 0 && "unroll factor should be positive");
1015 
1016  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017  if (unrollFactor == 1) {
1018  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1019  return failure();
1020  return success();
1021  }
1022 
1023  // Nothing in the loop body other than the terminator.
1024  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1025  return success();
1026 
1027  // If the trip count is lower than the unroll factor, no unrolled body.
1028  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1029  if (cleanUpUnroll) {
1030  // Unroll the cleanup loop if cleanUpUnroll is specified.
1031  return loopUnrollFull(forOp);
1032  }
1033 
1034  return failure();
1035  }
1036 
1037  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1038  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1039  // Loops where the lower bound is a max expression or the upper bound is
1040  // a min expression and the trip count doesn't divide the unroll factor
1041  // can't be unrolled since the lower bound of the cleanup loop in such cases
1042  // cannot be expressed as an affine function or a max over affine functions.
1043  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1044  forOp.getUpperBoundMap().getNumResults() != 1)
1045  return failure();
1046  if (cleanUpUnroll)
1047  // Force unroll including cleanup loop
1048  return loopUnrollFull(forOp);
1049  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1050  assert(false && "cleanup loop lower bound map for single result lower "
1051  "and upper bound maps can always be determined");
1052  }
1053 
1054  ValueRange iterArgs(forOp.getRegionIterArgs());
1055  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1056 
1057  // Scale the step of loop being unrolled by unroll factor.
1058  int64_t step = forOp.getStepAsInt();
1059  forOp.setStep(step * unrollFactor);
1060  generateUnrolledLoop(
1061  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1062  [&](unsigned i, Value iv, OpBuilder b) {
1063  // iv' = iv + i * step
1064  auto d0 = b.getAffineDimExpr(0);
1065  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1066  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1067  },
1068  /*annotateFn=*/annotateFn,
1069  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1070 
1071  // Promote the loop body up if this has turned into a single iteration loop.
1072  (void)promoteIfSingleIteration(forOp);
1073  return success();
1074 }
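// Illustrative example (not part of the original source) for
// loopUnrollByFactor above: unrolling
//   affine.for %i = 0 to 16 {
//     "op"(%i) : (index) -> ()
//   }
// by a factor of 2 produces
//   affine.for %i = 0 to 16 step 2 {
//     "op"(%i) : (index) -> ()
//     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//     "op"(%i1) : (index) -> ()
//   }
// with no cleanup loop since the trip count of 16 is a multiple of 2.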
1075 
1076 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1077  uint64_t unrollJamFactor) {
1078  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1079  if (mayBeConstantTripCount.has_value() &&
1080  *mayBeConstantTripCount < unrollJamFactor)
1081  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1082  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1083 }
1084 
1085 /// Check if all control operands of all loops are defined outside of `forOp`
1086 /// and return false if not.
1087 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1088  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1089  for (auto controlOperand : aForOp.getControlOperands()) {
1090  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1091  return WalkResult::interrupt();
1092  }
1093  return WalkResult::advance();
1094  });
1095  return !walkResult.wasInterrupted();
1096 }
1097 
1098 /// Unrolls and jams this loop by the specified factor.
1099 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1100  uint64_t unrollJamFactor) {
1101  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1102 
1103  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1104  if (unrollJamFactor == 1) {
1105  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1106  return failure();
1107  return success();
1108  }
1109 
1110  // Nothing in the loop body other than the terminator.
1111  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1112  return success();
1113 
1114  // If the trip count is lower than the unroll jam factor, no unroll jam.
1115  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1116  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1117  return failure();
1118  }
1119 
1120  // If any control operand of any inner loop of `forOp` is defined within
1121  // `forOp`, no unroll jam.
1122  if (!areInnerBoundsInvariant(forOp))
1123  return failure();
1124 
1125  // Gather all sub-blocks to jam upon the loop being unrolled.
1126  JamBlockGatherer<AffineForOp> jbg;
1127  jbg.walk(forOp);
1128  auto &subBlocks = jbg.subBlocks;
1129 
1130  // Collect loops with iter_args.
1131  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1132  forOp.walk([&](AffineForOp aForOp) {
1133  if (aForOp.getNumIterOperands() > 0)
1134  loopsWithIterArgs.push_back(aForOp);
1135  });
1136 
1137  // Get supported reductions to be used for creating reduction ops at the end.
1138  SmallVector<LoopReduction> reductions;
1139  if (forOp.getNumIterOperands() > 0)
1140  getSupportedReductions(forOp, reductions);
1141 
1142  // Generate the cleanup loop if trip count isn't a multiple of
1143  // unrollJamFactor.
1144  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1145  // Loops where the lower bound is a max expression or the upper bound is
1146  // a min expression and the trip count doesn't divide the unroll factor
1147  // can't be unrolled since the lower bound of the cleanup loop in such cases
1148  // cannot be expressed as an affine function or a max over affine functions.
1149  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1150  forOp.getUpperBoundMap().getNumResults() != 1)
1151  return failure();
1152  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1153  assert(false && "cleanup loop lower bound map for single result lower "
1154  "and upper bound maps can always be determined");
1155  }
1156 
1157  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1158  // iteration. There are (`unrollJamFactor` - 1) iterations.
1159  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1160 
1161  // For any loop with iter_args, replace it with a new loop that has
1162  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1163  // operands.
1164  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1165  IRRewriter rewriter(forOp.getContext());
1166  for (AffineForOp oldForOp : loopsWithIterArgs) {
1167  SmallVector<Value> dupIterOperands, dupYieldOperands;
1168  ValueRange oldIterOperands = oldForOp.getInits();
1169  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1170  ValueRange oldYieldOperands =
1171  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1172  // Get additional iterOperands, iterArgs, and yield operands. We will
1173  // fix iterOperands and yield operands after cloning of sub-blocks.
1174  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1175  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1176  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1177  }
1178  // Create a new loop with additional iterOperands, iter_args and yield
1179  // operands. This new loop will take the loop body of the original loop.
1180  bool forOpReplaced = oldForOp == forOp;
1181  AffineForOp newForOp =
1182  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1183  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1184  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1185  return dupYieldOperands;
1186  }));
1187  newLoopsWithIterArgs.push_back(newForOp);
1188  // `forOp` has been replaced with a new loop.
1189  if (forOpReplaced)
1190  forOp = newForOp;
1191  // Update `operandMaps` for `newForOp` iterArgs and results.
1192  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1193  unsigned oldNumIterArgs = oldIterArgs.size();
1194  ValueRange newResults = newForOp.getResults();
1195  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1196  assert(oldNumIterArgs == oldNumResults &&
1197  "oldNumIterArgs must be the same as oldNumResults");
1198  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1199  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1200  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1201  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1202  // to those in the `i`th new set.
1203  operandMaps[i - 1].map(newIterArgs[j],
1204  newIterArgs[i * oldNumIterArgs + j]);
1205  operandMaps[i - 1].map(newResults[j],
1206  newResults[i * oldNumResults + j]);
1207  }
1208  }
1209  }
1210 
1211  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1212  int64_t step = forOp.getStepAsInt();
1213  forOp.setStep(step * unrollJamFactor);
1214 
1215  auto forOpIV = forOp.getInductionVar();
1216  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1217  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1218  for (auto &subBlock : subBlocks) {
1219  // Builder to insert unroll-jammed bodies. Insert right at the end of
1220  // sub-block.
1221  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1222 
1223  // If the induction variable is used, create a remapping to the value for
1224  // this unrolled instance.
1225  if (!forOpIV.use_empty()) {
1226  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1227  auto d0 = builder.getAffineDimExpr(0);
1228  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1229  auto ivUnroll =
1230  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1231  operandMaps[i - 1].map(forOpIV, ivUnroll);
1232  }
1233  // Clone the sub-block being unroll-jammed.
1234  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1235  builder.clone(*it, operandMaps[i - 1]);
1236  }
1237  // Fix iterOperands and yield op operands of newly created loops.
1238  for (auto newForOp : newLoopsWithIterArgs) {
1239  unsigned oldNumIterOperands =
1240  newForOp.getNumIterOperands() / unrollJamFactor;
1241  unsigned numControlOperands = newForOp.getNumControlOperands();
1242  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1243  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1244  assert(oldNumIterOperands == oldNumYieldOperands &&
1245  "oldNumIterOperands must be the same as oldNumYieldOperands");
1246  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1247  // The `i`th duplication of an old iterOperand or yield op operand
1248  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1249  // if such mapped value exists.
1250  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1251  operandMaps[i - 1].lookupOrDefault(
1252  newForOp.getOperand(numControlOperands + j)));
1253  yieldOp.setOperand(
1254  i * oldNumYieldOperands + j,
1255  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1256  }
1257  }
1258  }
1259  if (forOp.getNumResults() > 0) {
1260  // Create reduction ops to combine every `unrollJamFactor` related results
1261  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1262  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1263  // %1.
1264  rewriter.setInsertionPointAfter(forOp);
1265  auto loc = forOp.getLoc();
1266  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1267  for (LoopReduction &reduction : reductions) {
1268  unsigned pos = reduction.iterArgPosition;
1269  Value lhs = forOp.getResult(pos);
1270  Value rhs;
1271  SmallPtrSet<Operation *, 4> newOps;
1272  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1273  rhs = forOp.getResult(i * oldNumResults + pos);
1274  // Create ops based on reduction type.
1275  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1276  if (!lhs)
1277  return failure();
1278  Operation *op = lhs.getDefiningOp();
1279  assert(op && "Reduction op should have been created");
1280  newOps.insert(op);
1281  }
1282  // Replace all uses except those in newly created reduction ops.
1283  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1284  }
1285  }
1286 
1287  // Promote the loop body up if this has turned into a single iteration loop.
1288  (void)promoteIfSingleIteration(forOp);
1289  return success();
1290 }
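// Illustrative example (not part of the original source) for
// loopUnrollJamByFactor above: unroll-and-jam by 2 rewrites
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 8 { "S"(%i, %j) : (index, index) -> () }
//   }
// into
//   affine.for %i = 0 to 8 step 2 {
//     affine.for %j = 0 to 8 {
//       "S"(%i, %j) : (index, index) -> ()
//       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//       "S"(%i1, %j) : (index, index) -> ()
//     }
//   }
// i.e. the unrolled copies of the outer iteration are jammed into the inner
// loop body rather than appended after the inner loop.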
1291 
1292 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1293 /// nested within 'forOpA' as the only non-terminator operation in its block.
1294 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1295  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1296  auto &forOpABody = forOpA.getBody()->getOperations();
1297  auto &forOpBBody = forOpB.getBody()->getOperations();
1298 
1299  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1300  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1301  // body containing only the terminator.
1302  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1303  forOpABody, forOpABody.begin(),
1304  std::prev(forOpABody.end()));
1305  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1306  // body (this leaves forOpB's body containing only the terminator).
1307  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1308  std::prev(forOpBBody.end()));
1309  // 3) Splice forOpA into the beginning of forOpB's body.
1310  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1311  Block::iterator(forOpA));
1312 }
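// Illustrative example (not part of the original source) for interchangeLoops
// above: interchanging
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 16 { "S"(%i, %j) : (index, index) -> () }
//   }
// yields
//   affine.for %j = 0 to 16 {
//     affine.for %i = 0 to 8 { "S"(%i, %j) : (index, index) -> () }
//   }
// Only the two loop ops are moved; uses of %i and %j inside the body remain
// valid since each body keeps its own induction variable.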
1313 
1314 // Checks each dependence component against the permutation to see if the
1315 // desired loop interchange would violate dependences by making the
1316 // dependence component lexicographically negative.
1317 static bool checkLoopInterchangeDependences(
1318  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1319  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1320  // Invert permutation map.
1321  unsigned maxLoopDepth = loops.size();
1322  SmallVector<unsigned, 4> loopPermMapInv;
1323  loopPermMapInv.resize(maxLoopDepth);
1324  for (unsigned i = 0; i < maxLoopDepth; ++i)
1325  loopPermMapInv[loopPermMap[i]] = i;
1326 
1327  // Check each dependence component against the permutation to see if the
1328  // desired loop interchange permutation would make the dependence vectors
1329  // lexicographically negative.
1330  // Example 1: [-1, 1][0, 0]
1331  // Example 2: [0, 0][-1, 1]
1332  for (const auto &depComps : depCompsVec) {
1333  assert(depComps.size() >= maxLoopDepth);
1334  // Check if the first non-zero dependence component is positive.
1335  // This iterates through loops in the desired order.
1336  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1337  unsigned permIndex = loopPermMapInv[j];
1338  assert(depComps[permIndex].lb);
1339  int64_t depCompLb = *depComps[permIndex].lb;
1340  if (depCompLb > 0)
1341  break;
1342  if (depCompLb < 0)
1343  return false;
1344  }
1345  }
1346  return true;
1347 }
1348 
1349 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1350 /// nested sequence of loops in 'loops' would violate dependences.
1351 bool mlir::affine::isValidLoopInterchangePermutation(
1352  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1353  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1354  unsigned maxLoopDepth = loops.size();
1355  if (maxLoopDepth == 1)
1356  return true;
1357  // Gather dependence components for dependences between all ops in loop nest
1358  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1359  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1360  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1361  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1362 }
1363 
1364 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1365 /// in it from outermost to innermost.
1366 bool LLVM_ATTRIBUTE_UNUSED
1367 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1368  assert(!loops.empty() && "no loops provided");
1369 
1370  // We already know that the block can't be empty.
1371  auto hasTwoElements = [](Block *block) {
1372  auto secondOpIt = std::next(block->begin());
1373  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1374  };
1375 
1376  auto enclosingLoop = loops.front();
1377  for (auto loop : loops.drop_front()) {
1378  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1379  // parentForOp's body should be just this loop and the terminator.
1380  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1381  return false;
1382  enclosingLoop = loop;
1383  }
1384  return true;
1385 }
1386 
1387 // input[i] should move from position i -> permMap[i]. Returns the position in
1388 // `input` that becomes the new outermost loop.
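// For illustration (hypothetical example): with `input` = (%i, %j, %k) listed
// outermost to innermost and permMap = {2, 0, 1}, %i moves to the innermost
// position, %j becomes the new outermost loop, and %k ends up in the middle;
// the function returns 1, the index of %j in `input`.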
1389 unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1390  ArrayRef<unsigned> permMap) {
1391  assert(input.size() == permMap.size() && "invalid permutation map size");
1392  // Check whether the permutation spec is valid. This is a small vector - we'll
1393  // just sort and check if it's iota.
1394  SmallVector<unsigned, 4> checkPermMap(permMap);
1395  llvm::sort(checkPermMap);
1396  if (llvm::any_of(llvm::enumerate(checkPermMap),
1397  [](const auto &en) { return en.value() != en.index(); }))
1398  assert(false && "invalid permutation map");
1399 
1400  // Nothing to do.
1401  if (input.size() < 2)
1402  return 0;
1403 
1404  assert(isPerfectlyNested(input) && "input not perfectly nested");
1405 
1406  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1407  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1408  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1409  for (unsigned i = 0, e = input.size(); i < e; ++i)
1410  invPermMap.push_back({permMap[i], i});
1411  llvm::sort(invPermMap);
1412 
1413  // Move the innermost loop body to the loop that would be the innermost in the
1414  // permuted nest (only if the innermost loop is going to change).
1415  if (permMap.back() != input.size() - 1) {
1416  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1417  Block *srcBody = ((AffineForOp)input.back()).getBody();
1418  destBody->getOperations().splice(destBody->begin(),
1419  srcBody->getOperations(), srcBody->begin(),
1420  std::prev(srcBody->end()));
1421  }
1422 
1423  // We'll move each loop in `input` in the reverse order so that its body is
1424  // empty when we are moving it; this incurs zero copies and no erasing.
1425  for (int i = input.size() - 1; i >= 0; --i) {
1426  // If this has to become the outermost loop after permutation, add it to the
1427  // parent block of the original root.
1428  if (permMap[i] == 0) {
1429  // If the root remains the same, nothing to do.
1430  if (i == 0)
1431  continue;
1432  // Make input[i] the new outermost loop moving it into parentBlock.
1433  auto *parentBlock = input[0]->getBlock();
1434  parentBlock->getOperations().splice(Block::iterator(input[0]),
1435  input[i]->getBlock()->getOperations(),
1436  Block::iterator(input[i]));
1437  continue;
1438  }
1439 
1440  // If the parent in the permuted order is the same as in the original,
1441  // nothing to do.
1442  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1443  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1444  continue;
1445 
1446  // Move input[i] to its surrounding loop in the transformed nest.
1447  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1448  destBody->getOperations().splice(destBody->begin(),
1449  input[i]->getBlock()->getOperations(),
1450  Block::iterator(input[i]));
1451  }
1452 
1453  return invPermMap[0].second;
1454 }
1455 
1456 // Sinks all sequential loops to the innermost levels (while preserving
1457 // relative order among them) and moves all parallel loops to the outermost
1458 // levels (while again preserving relative order among them).
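// For illustration (hypothetical example): if the middle loop of a 3-d nest
// (%i, %s, %j) is the only one carrying a dependence, the permutation computed
// below is {0, 2, 1}, and the nest becomes (%i, %j, %s) with the sequential
// loop innermost.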
1459 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1460  SmallVector<AffineForOp, 4> loops;
1461  getPerfectlyNestedLoops(loops, forOp);
1462  if (loops.size() < 2)
1463  return forOp;
1464 
1465  // Gather dependence components for dependences between all ops in loop nest
1466  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1467  unsigned maxLoopDepth = loops.size();
1468  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1469  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1470 
1471  // Mark loops as either parallel or sequential.
1472  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1473  for (auto &depComps : depCompsVec) {
1474  assert(depComps.size() >= maxLoopDepth);
1475  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1476  DependenceComponent &depComp = depComps[j];
1477  assert(depComp.lb.has_value() && depComp.ub.has_value());
1478  if (*depComp.lb != 0 || *depComp.ub != 0)
1479  isParallelLoop[j] = false;
1480  }
1481  }
1482 
1483  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1484 
1485  // Compute permutation of loops that sinks sequential loops (and thus raises
1486  // parallel loops) while preserving relative order.
1487  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1488  unsigned nextSequentialLoop = numParallelLoops;
1489  unsigned nextParallelLoop = 0;
1490  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1491  if (isParallelLoop[i]) {
1492  loopPermMap[i] = nextParallelLoop++;
1493  } else {
1494  loopPermMap[i] = nextSequentialLoop++;
1495  }
1496  }
1497 
1498  // Check if permutation 'loopPermMap' would violate dependences.
1499  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1500  return forOp;
1501  // Perform loop interchange according to permutation 'loopPermMap'.
1502  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1503  return loops[loopNestRootIndex];
1504 }
1505 
1506 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1507 // lower (resp. upper) loop bound. When called for both the lower and upper
1508 // bounds, the resulting IR resembles:
1509 //
1510 // ```mlir
1511 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1512 // ...
1513 // }
1514 // ```
1515 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1516  SmallVector<Value, 4> *operands,
1517  int64_t offset = 0) {
1518  auto bounds = llvm::to_vector<4>(map->getResults());
1519  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1520  operands->insert(operands->begin() + map->getNumDims(), iv);
1521  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1522  b.getContext());
1523  canonicalizeMapAndOperands(map, operands);
1524 }
1525 
1526 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1527 // Stripmine-sink is a primitive building block for generalized tiling of
1528 // imperfectly nested loops.
1529 // This transformation is purely mechanical and does not check legality,
1530 // profitability or even structural correctness. It is the user's
1531 // responsibility to specify `targets` that are dominated by `forOp`.
1532 // Returns the new AffineForOps, one per `targets`, nested immediately under
1533 // each of the `targets`.
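// For illustration (a sketch with made-up bounds): stripmining %i by a factor
// of 32 and sinking it under a target loop %j turns
//
// ```mlir
// affine.for %i = 0 to 1024 {
//   affine.for %j = 0 to 128 { ... body using %i, %j ... }
// }
// ```
//
// into roughly
//
// ```mlir
// affine.for %i = 0 to 1024 step 32 {
//   affine.for %j = 0 to 128 {
//     affine.for %ii = max (%i, ...) to min (%i + 32, ...) {
//       ... body using %ii, %j ...
//     }
//   }
// }
// ```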
1534 static SmallVector<AffineForOp, 8>
1535 stripmineSink(AffineForOp forOp, uint64_t factor,
1536  ArrayRef<AffineForOp> targets) {
1537  auto originalStep = forOp.getStepAsInt();
1538  auto scaledStep = originalStep * factor;
1539  forOp.setStep(scaledStep);
1540 
1541  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1542 
1543  // Lower-bound map creation.
1544  auto lbMap = forOp.getLowerBoundMap();
1545  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1546  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1547 
1548  // Upper-bound map creation.
1549  auto ubMap = forOp.getUpperBoundMap();
1550  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1551  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1552  /*offset=*/scaledStep);
1553 
1554  auto iv = forOp.getInductionVar();
1555  SmallVector<AffineForOp, 8> innerLoops;
1556  for (auto t : targets) {
1557  // Insert newForOp before the terminator of `t`.
1558  auto b = OpBuilder::atBlockTerminator(t.getBody());
1559  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1560  ubOperands, ubMap, originalStep);
1561  auto begin = t.getBody()->begin();
1562  // Skip terminator and `newForOp` which is just before the terminator.
1563  auto nOps = t.getBody()->getOperations().size() - 2;
1564  newForOp.getBody()->getOperations().splice(
1565  newForOp.getBody()->getOperations().begin(),
1566  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1567  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1568  newForOp.getRegion());
1569  innerLoops.push_back(newForOp);
1570  }
1571 
1572  return innerLoops;
1573 }
1574 
1575 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1576 // Returns the new AffineForOps, nested immediately under `target`.
1577 template <typename SizeType>
1578 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1579  AffineForOp target) {
1580  // TODO: Use cheap structural assertions that targets are nested under
1581  // forOp and that targets are not nested under each other when DominanceInfo
1582  // exposes the capability. It seems overkill to construct a whole function
1583  // dominance tree at this point.
1584  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1585  assert(res.size() == 1 && "Expected 1 inner forOp");
1586  return res[0];
1587 }
1588 
1589 std::vector<SmallVector<AffineForOp, 8>>
1590 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1591  ArrayRef<AffineForOp> targets) {
1592  std::vector<SmallVector<AffineForOp, 8>> res;
1593  SmallVector<AffineForOp, 8> currentTargets(targets);
1594  for (auto it : llvm::zip(forOps, sizes)) {
1595  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1596  res.push_back(step);
1597  currentTargets = step;
1598  }
1599  return res;
1600 }
1601 
1601 SmallVector<AffineForOp, 8>
1602 mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1603  ArrayRef<uint64_t> sizes,
1604  AffineForOp target) {
1605  SmallVector<AffineForOp, 8> res;
1606  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1607  res.push_back(llvm::getSingleElement(loops));
1608  return res;
1609 }
1610 
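// For illustration, coalescing a normalized 2-d nest (a sketch; %N and %M are
// illustrative symbols):
//
// ```mlir
// affine.for %i = 0 to %N {
//   affine.for %j = 0 to %M { ... body using %i, %j ... }
// }
// ```
//
// becomes a single loop over %N * %M iterations whose induction variable %k is
// de-linearized in the body as %i = %k floordiv %M and %j = %k mod %M.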
1611 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1612  if (loops.size() < 2)
1613  return success();
1614 
1615  AffineForOp innermost = loops.back();
1616  AffineForOp outermost = loops.front();
1617  AffineBound ub = outermost.getUpperBound();
1618  AffineMap origUbMap = ub.getMap();
1619  Location loc = outermost.getLoc();
1620  OpBuilder builder(outermost);
1621  for (AffineForOp loop : loops) {
1622  // We only work on normalized loops.
1623  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1624  loop.getConstantLowerBound() != 0)
1625  return failure();
1626  }
1627  SmallVector<Value, 4> upperBoundSymbols;
1628  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1629  ub.getOperands().end());
1630 
1631  // 1. Store the upper bound of the outermost loop in a variable.
1632  Value prev;
1633  if (!llvm::hasSingleElement(origUbMap.getResults()))
1634  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1635  else
1636  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1637  upperBoundSymbols.push_back(prev);
1638 
1639  // 2. Emit code computing the upper bound of the coalesced loop as product of
1640  // the number of iterations of all loops.
1641  for (AffineForOp loop : loops.drop_front()) {
1642  ub = loop.getUpperBound();
1643  origUbMap = ub.getMap();
1644  ubOperands = ub.getOperands();
1645  Value upperBound;
1646  // If upper bound map has more than one result, take their minimum.
1647  if (!llvm::hasSingleElement(origUbMap.getResults()))
1648  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1649  else
1650  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1651  upperBoundSymbols.push_back(upperBound);
1652  SmallVector<Value, 4> operands;
1653  operands.push_back(prev);
1654  operands.push_back(upperBound);
1655  // Maintain running product of loop upper bounds.
1656  prev = builder.create<AffineApplyOp>(
1657  loc,
1658  AffineMap::get(/*dimCount=*/1,
1659  /*symbolCount=*/1,
1660  builder.getAffineDimExpr(0) *
1661  builder.getAffineSymbolExpr(0)),
1662  operands);
1663  }
1664  // Set upper bound of the coalesced loop.
1665  AffineMap newUbMap = AffineMap::get(
1666  /*dimCount=*/0,
1667  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1668  outermost.setUpperBound(prev, newUbMap);
1669 
1670  builder.setInsertionPointToStart(outermost.getBody());
1671 
1672  // 3. Remap induction variables. For each original loop, the value of the
1673  // induction variable can be obtained by dividing the induction variable of
1674  // the linearized loop by the total number of iterations of the loops nested
1675  // in it modulo the number of iterations in this loop (remove the values
1676  // related to the outer loops):
1677  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1678  // Compute these iteratively from the innermost loop by creating a "running
1679  // quotient" of division by the range.
1680  Value previous = outermost.getInductionVar();
1681  for (unsigned idx = loops.size(); idx > 0; --idx) {
1682  if (idx != loops.size()) {
1683  SmallVector<Value, 4> operands;
1684  operands.push_back(previous);
1685  operands.push_back(upperBoundSymbols[idx]);
1686  previous = builder.create<AffineApplyOp>(
1687  loc,
1688  AffineMap::get(
1689  /*dimCount=*/1, /*symbolCount=*/1,
1690  builder.getAffineDimExpr(0).floorDiv(
1691  builder.getAffineSymbolExpr(0))),
1692  operands);
1693  }
1694  // Modified value of the induction variables of the nested loops after
1695  // coalescing.
1696  Value inductionVariable;
1697  if (idx == 1) {
1698  inductionVariable = previous;
1699  } else {
1700  SmallVector<Value, 4> applyOperands;
1701  applyOperands.push_back(previous);
1702  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1703  inductionVariable = builder.create<AffineApplyOp>(
1704  loc,
1705  AffineMap::get(
1706  /*dimCount=*/1, /*symbolCount=*/1,
1707  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1708  applyOperands);
1709  }
1710  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1711  inductionVariable, loops.back().getRegion());
1712  }
1713 
1714  // 4. Move the operations from the innermost loop to just above the
1715  // second-outermost loop, delete the extra terminator and the second-outermost loop.
1716  AffineForOp secondOutermostLoop = loops[1];
1717  innermost.getBody()->back().erase();
1718  outermost.getBody()->getOperations().splice(
1719  Block::iterator(secondOutermostLoop.getOperation()),
1720  innermost.getBody()->getOperations());
1721  secondOutermostLoop.erase();
1722  return success();
1723 }
1724 
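// For illustration (hypothetical SSA names): for a 1-d mapping with
// processorId = {%pid} and numProcessors = {%np}, the loop below is rewritten
// so that its lower bound becomes lb + %pid * step and its step becomes
// step * %np, i.e., iterations are distributed cyclically across processors.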
1725 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1726  ArrayRef<Value> processorId,
1727  ArrayRef<Value> numProcessors) {
1728  assert(processorId.size() == numProcessors.size());
1729  if (processorId.empty())
1730  return;
1731 
1732  OpBuilder b(forOp);
1733  Location loc(forOp.getLoc());
1734  AffineExpr lhs, rhs;
1735  bindSymbols(forOp.getContext(), lhs, rhs);
1736  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1737  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1738 
1739  Value linearIndex = processorId.front();
1740  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1741  auto mulApplyOp = b.create<AffineApplyOp>(
1742  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1743  linearIndex = b.create<AffineApplyOp>(
1744  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1745  }
1746 
1747  auto mulApplyOp = b.create<AffineApplyOp>(
1748  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1749  Value lb = b.create<AffineApplyOp>(
1750  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1751  forOp.setLowerBound(lb);
1752 
1753  Value step = forOp.getStep();
1754  for (auto numProcs : numProcessors)
1755  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1756  forOp.setStep(step);
1757 }
1758 
1759 /// Given a memref region, determine the lowest depth at which transfers can be
1760 /// placed for it, and return the corresponding block, start and end positions
1761 /// in the block for placing incoming (read) and outgoing (write) copies
1762 /// respectively. The lowest depth depends on whether the region being accessed
1763 /// is hoistable with respect to one or more immediately surrounding loops.
1764 static void
1765 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1766  Block::iterator &begin, Block::iterator &end,
1767  Block **copyPlacementBlock,
1768  Block::iterator *copyInPlacementStart,
1769  Block::iterator *copyOutPlacementStart) {
1770  const auto *cst = region.getConstraints();
1771  SmallVector<Value, 4> symbols;
1772  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1773 
1774  SmallVector<Operation *, 4> enclosingAffineOps;
1775  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1776  // Walk up loop parents till we find an IV on which this region is
1777  // symbolic/variant or we hit `hoistGuard`.
1778  auto it = enclosingAffineOps.rbegin();
1779  AffineForOp lastInvariantFor;
1780  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1781  Operation *enclosingOp = *it;
1782  // We can't hoist past the definition of the memref being copied.
1783  Value memref = region.memref;
1784  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1785  LLVM_DEBUG(
1786  llvm::dbgs()
1787  << "memref definition will end up not dominating hoist location\n");
1788  break;
1789  }
1790 
1791  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1792  if (!affineFor)
1793  break;
1794  // TODO: also need to be checking this for regions symbols that
1795  // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1796  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1797  break;
1798  lastInvariantFor = affineFor;
1799  }
1800 
1801  if (it != enclosingAffineOps.rbegin()) {
1802  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1803  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1804  *copyPlacementBlock = lastInvariantFor->getBlock();
1805  } else {
1806  *copyInPlacementStart = begin;
1807  *copyOutPlacementStart = end;
1808  *copyPlacementBlock = &block;
1809  }
1810 }
1811 
1812 // Info comprising stride and number of elements transferred every stride.
1813 struct StrideInfo {
1814  int64_t stride;
1815  int64_t numEltPerStride;
1816 };
1817 
1818 /// Returns striding information for a copy/transfer of this region with
1819 /// potentially multiple striding levels from outermost to innermost. For an
1820 /// n-dimensional region, there can be at most n-1 levels of striding
1821 /// successively nested.
1822 // TODO: make this work with non-identity layout maps.
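// For illustration (hypothetical shapes): copying a 64x32 window out of a
// memref<64x128xf32> records a single stride level with stride 128 (elements
// between successive rows of the window along the contiguous dimension) and
// numEltPerStride 32 (contiguous elements transferred per stride).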
1823 static void getMultiLevelStrides(const MemRefRegion &region,
1824  ArrayRef<int64_t> bufferShape,
1825  SmallVectorImpl<StrideInfo> *strideInfos) {
1826  if (bufferShape.size() <= 1)
1827  return;
1828 
1829  int64_t numEltPerStride = 1;
1830  int64_t stride = 1;
1831  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1832  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1833  stride *= dimSize;
1834  numEltPerStride *= bufferShape[d];
1835  // A stride is needed only if the region has a shorter extent than the
1836  // memref along the dimension *and* has an extent greater than one along the
1837  // next major dimension.
1838  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1839  strideInfos->push_back({stride, numEltPerStride});
1840  }
1841  }
1842 }
1843 
1844 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1845 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1846 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1847 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1848 /// contain the expressions to be subtracted out from the respective copy loop
1849 /// iterators in order to index the fast buffer. If `isCopyOut' is true, generates
1850 /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point where
1851 /// the copy nest is to be inserted.
1852 //
1853 /// The copy-in nest is generated as follows as an example for a 2-d region:
1854 /// for x = ...
1855 /// for y = ...
1856 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1857 ///
1858 static AffineForOp
1859 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1860  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1861  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1862  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1863  OpBuilder b) {
1864  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1865  return lbMap.getNumInputs() == lbOperands.size();
1866  }));
1867  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1868  return ubMap.getNumInputs() == ubOperands.size();
1869  }));
1870 
1871  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1872  // A copy nest can't be generated for 0-ranked memrefs.
1873  assert(rank != 0 && "non-zero rank memref expected");
1874  assert(lbMaps.size() == rank && "wrong number of lb maps");
1875  assert(ubMaps.size() == rank && "wrong number of ub maps");
1876 
1877  SmallVector<Value, 4> memIndices;
1878  SmallVector<AffineExpr, 4> fastBufExprs;
1879  SmallVector<Value, 4> fastBufMapOperands;
1880  AffineForOp copyNestRoot;
1881  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1882  for (unsigned d = 0; d < rank; ++d) {
1883  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1884  ubOperands, ubMaps[d]);
1885  if (d == 0)
1886  copyNestRoot = forOp;
1887 
1888  b = OpBuilder::atBlockTerminator(forOp.getBody());
1889 
1890  auto fastBufOffsetMap =
1891  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1892  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1893 
1894  // Construct the subscript for the fast memref being copied into/from:
1895  // x - offset_x.
1896  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1897  b.getAffineDimExpr(2 * d));
1898  fastBufMapOperands.push_back(offset);
1899  fastBufMapOperands.push_back(forOp.getInductionVar());
1900  mayBeDeadApplys.push_back(offset);
1901 
1902  // Subscript for the slow memref being copied.
1903  memIndices.push_back(forOp.getInductionVar());
1904  }
1905 
1906  auto fastBufMap =
1907  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1908  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1909  fastBufMap = simplifyAffineMap(fastBufMap);
1910  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1911 
1912  // Drop any dead affine.applys.
1913  for (auto applyOp : mayBeDeadApplys)
1914  if (applyOp.use_empty())
1915  applyOp.erase();
1916 
1917  if (!isCopyOut) {
1918  // Copy in.
1919  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1920  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1921  fastBufMapOperands);
1922  return copyNestRoot;
1923  }
1924 
1925  // Copy out.
1926  auto load =
1927  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1928  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1929  return copyNestRoot;
1930 }
1931 
1932 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1933 emitRemarkForBlock(Block &block) {
1934  return block.getParentOp()->emitRemark();
1935 }
1936 
1937 /// Creates a buffer in the faster memory space for the specified memref region
1938 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1939 /// space to this one, and replaces all loads/stores in the block range
1940 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1941 /// if copies could not be generated due to yet unimplemented cases.
1942 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1943 /// specify the insertion points where the incoming copies and outgoing copies,
1944 /// respectively, should be inserted (the insertion happens right before the
1945 /// insertion point). Since `begin` can itself be invalidated due to the memref
1946 /// rewriting done from this method, the output argument `nBegin` is set to its
1947 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1948 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1949 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1950 /// allocated.
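/// For a read region, the generated structure is roughly the following sketch
/// (names, shapes, and the fast memory space are illustrative):
///
/// ```mlir
/// %buf = memref.alloc() : memref<64x64xf32, 1>
/// affine.for %x = ... {          // copy-in nest
///   affine.for %y = ... {
///     %v = affine.load %A[%x, %y] : memref<256x256xf32>
///     affine.store %v, %buf[%x - %x0, %y - %y0] : memref<64x64xf32, 1>
///   }
/// }
/// ... accesses to %A in [`begin', `end') now use %buf with remapped indices ...
/// memref.dealloc %buf : memref<64x64xf32, 1>
/// ```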
1951 static LogicalResult generateCopy(
1952  const MemRefRegion &region, Block *block, Block::iterator begin,
1953  Block::iterator end, Block *copyPlacementBlock,
1954  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1955  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1956  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1957  Block::iterator *nBegin, Block::iterator *nEnd) {
1958  *nBegin = begin;
1959  *nEnd = end;
1960 
1961  auto f = begin->getParentOfType<FunctionOpInterface>();
1962  OpBuilder topBuilder(f.getFunctionBody());
1963  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1964 
1965  *sizeInBytes = 0;
1966 
1967  if (begin == end)
1968  return success();
1969 
1970  // Whether the copy-out point is at the end of the block where we are doing
1971  // explicit copying.
1972  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1973 
1974  // Copies for read regions are going to be inserted at 'begin'.
1975  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1976  // Copies for write regions are going to be inserted at 'end'.
1977  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1978  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1979 
1980  // Builder to create constants at the top level.
1981  auto func =
1982  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1983  OpBuilder top(func.getFunctionBody());
1984 
1985  auto loc = region.loc;
1986  auto memref = region.memref;
1987  auto memRefType = cast<MemRefType>(memref.getType());
1988 
1989  if (!memRefType.getLayout().isIdentity()) {
1990  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1991  return failure();
1992  }
1993 
1994  // Indices to use for the copying.
1995  // Indices for the original memref being copied from/to.
1996  SmallVector<Value, 4> memIndices;
1997  // Indices for the faster buffer being copied into/from.
1998  SmallVector<Value, 4> bufIndices;
1999 
2000  unsigned rank = memRefType.getRank();
2001  if (rank == 0) {
2002  LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs supported\n");
2003  return failure();
2004  }
2005 
2006  SmallVector<int64_t, 4> fastBufferShape;
2007 
2008  // Compute the extents of the buffer.
2009  SmallVector<AffineMap, 4> lbs;
2010  lbs.reserve(rank);
2011  std::optional<int64_t> numElements =
2012  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2013  if (!numElements) {
2014  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2015  return failure();
2016  }
2017 
2018  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2019  // This can be supported in the future if needed.
2020  LLVM_DEBUG(llvm::dbgs()
2021  << "Max lower bound for memref region start not supported\n");
2022  return failure();
2023  }
2024 
2025  if (*numElements == 0) {
2026  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2027  return success();
2028  }
2029 
2030  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2031  for (unsigned i = 0; i < rank; ++i) {
2032  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2033  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2034  LLVM_DEBUG(llvm::dbgs()
2035  << "Missing lower or upper bound for region along dimension: "
2036  << i << '\n');
2037  return failure();
2038  }
2039  }
2040 
2041  const FlatAffineValueConstraints *cst = region.getConstraints();
2042  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2043  // on; these typically include loop IVs surrounding the level at which the
2044  // copy generation is being done or other valid symbols in MLIR.
2045  SmallVector<Value, 8> regionSymbols;
2046  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2047 
2048  // Construct the access expression for the fast memory buffer. The access
2049  // expression for a particular dimension of the fast buffer is obtained by
2050  // subtracting out the lower bound on the original memref's data region
2051  // along the corresponding dimension.
2052 
2053  // Index start offsets for faster memory buffer relative to the original.
2054  SmallVector<AffineExpr, 4> fastBufOffsets;
2055  fastBufOffsets.reserve(rank);
2056  for (unsigned d = 0; d < rank; d++) {
2057  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2058  "incorrect bound size");
2059 
2060  // Set copy start location for this dimension in the lower memory space
2061  // memref.
2062  if (lbs[d].isSingleConstant()) {
2063  auto indexVal = lbs[d].getSingleConstantResult();
2064  if (indexVal == 0) {
2065  memIndices.push_back(zeroIndex);
2066  } else {
2067  memIndices.push_back(
2068  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2069  }
2070  } else {
2071  // The coordinate for the start location is just the lower bound along the
2072  // corresponding dimension on the memory region (stored in 'offset').
2073  // Remap all inputs of the map to dimensions uniformly since in the
2074  // generated IR we need valid affine symbols as opposed to "symbols" for
2075  // the purpose of the memref region.
2076  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2077  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2078  symReplacements[i] = top.getAffineDimExpr(i);
2079  lbs[d] = lbs[d].replaceDimsAndSymbols(
2080  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2081  /*numResultSyms=*/0);
2082  memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
2083  }
2084  // The fast buffer is copied into at location zero; addressing is relative.
2085  bufIndices.push_back(zeroIndex);
2086 
2087  // Record the offsets since they are needed to remap the memory accesses of
2088  // the original memref further below.
2089  fastBufOffsets.push_back(lbs[d].getResult(0));
2090  }
2091 
2092  // The faster memory space buffer.
2093  Value fastMemRef;
2094 
2095  // Check if a buffer was already created.
2096  bool existingBuf = fastBufferMap.count(memref) > 0;
2097  if (!existingBuf) {
2098  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2099  auto fastMemRefType =
2100  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2101  fastBufferLayout, copyOptions.fastMemorySpace);
2102 
2103  // Create the fast memory space buffer just before the 'affine.for'
2104  // operation.
2105  fastMemRef =
2106  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2107  // Record it.
2108  fastBufferMap[memref] = fastMemRef;
2109  // fastMemRefType is a constant shaped memref.
2110  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2111  // We don't account for things of unknown size.
2112  *sizeInBytes = maySizeInBytes.value_or(0);
2113 
2114  LLVM_DEBUG(emitRemarkForBlock(*block)
2115  << "Creating fast buffer of type " << fastMemRefType
2116  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2117  << " KiB\n");
2118  } else {
2119  // Reuse the one already created.
2120  fastMemRef = fastBufferMap[memref];
2121  }
2122 
2123  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2124 
2125  Value dmaStride;
2126  Value numEltPerDmaStride;
2127  if (copyOptions.generateDma) {
2128  SmallVector<StrideInfo, 4> dmaStrideInfos;
2129  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2130 
2131  // TODO: use all stride levels once DmaStartOp is extended for
2132  // multi-level strides.
2133  if (dmaStrideInfos.size() > 1) {
2134  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2135  return failure();
2136  }
2137 
2138  if (!dmaStrideInfos.empty()) {
2139  dmaStride =
2140  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2141  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2142  loc, dmaStrideInfos[0].numEltPerStride);
2143  }
2144  }
2145 
2146  // Record the last operation where we want the memref replacement to end. We
2147  // later do the memref replacement only in [begin, postDomFilter] so
2148  // that the original memref's uses in the data movement code themselves don't
2149  // get replaced.
2150  auto postDomFilter = std::prev(end);
2151 
2152  // Create fully composed affine maps for each memref.
2153  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2154  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2155  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2156  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2157 
2158  if (!copyOptions.generateDma) {
2159  // Point-wise copy generation.
2160  auto copyNest =
2161  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2162  /*lbOperands=*/regionSymbols, ubMaps,
2163  /*ubOperands=*/regionSymbols, fastBufOffsets,
2164  /*isCopyOut=*/region.isWrite(), b);
2165 
2166  // Record this so that this copy nest is skipped when generating further copies.
2167  copyNests.insert(copyNest);
2168 
2169  // Since new ops are being appended (for copy out's), adjust the end to
2170  // mark end of block range being processed if necessary.
2171  if (region.isWrite() && isCopyOutAtEndOfBlock)
2172  *nEnd = Block::iterator(copyNest.getOperation());
2173  } else {
2174  // DMA generation.
2175  // Create a tag (single element 1-d memref) for the DMA.
2176  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2177  copyOptions.tagMemorySpace);
2178  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2179 
2180  SmallVector<Value, 4> tagIndices({zeroIndex});
2181  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2182  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2183  if (!region.isWrite()) {
2184  // DMA non-blocking read from original buffer to fast buffer.
2185  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2186  fastMemRef, bufAffineMap, bufIndices,
2187  tagMemRef, tagAffineMap, tagIndices,
2188  numElementsSSA, dmaStride, numEltPerDmaStride);
2189  } else {
2190  // DMA non-blocking write from fast buffer to the original memref.
2191  auto op = b.create<AffineDmaStartOp>(
2192  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2193  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2194  dmaStride, numEltPerDmaStride);
2195  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2196  // end to mark end of block range being processed.
2197  if (isCopyOutAtEndOfBlock)
2198  *nEnd = Block::iterator(op.getOperation());
2199  }
2200 
2201  // Matching DMA wait to block on completion; tag always has a 0 index.
2202  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2203  numElementsSSA);
2204 
2205  // Generate dealloc for the tag.
2206  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2207  if (*nEnd == end && isCopyOutAtEndOfBlock)
2208  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2209  // mark end of range of the original.
2210  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2211  }
2212 
2213  // Generate dealloc for the buffer.
2214  if (!existingBuf) {
2215  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2216  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2217  // the fast buffer (since it marks the new end insertion point).
2218  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2219  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2220  }
2221 
2222  // Replace all uses of the old memref with the faster one while remapping
2223  // access indices (subtracting out lower bound offsets for each dimension).
2224  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2225  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2226  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2227  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2228  // d2, d3 correspond to the original indices (%i, %j).
2229  SmallVector<AffineExpr, 4> remapExprs;
2230  remapExprs.reserve(rank);
2231  for (unsigned i = 0; i < rank; i++) {
2232  // The starting operands of indexRemap will be regionSymbols (the symbols on
2233  // which the memref region is parametric); then those corresponding to
2234  // the memref's original indices follow.
2235  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2236  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2237  }
2238  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2239  b.getContext());
2240 
2241  // Record the begin since it may be invalidated by memref replacement.
2242  Block::iterator prevOfBegin;
2243  bool isBeginAtStartOfBlock = (begin == block->begin());
2244  if (!isBeginAtStartOfBlock)
2245  prevOfBegin = std::prev(begin);
2246 
2247  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2248  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2249  /*extraIndices=*/{}, indexRemap,
2250  /*extraOperands=*/regionSymbols,
2251  /*symbolOperands=*/{},
2252  /*domOpFilter=*/&*begin,
2253  /*postDomOpFilter=*/&*postDomFilter);
2254 
2255  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2256 
2257  return success();
2258 }
2259 
2260 /// Construct the memref region to just include the entire memref. Returns false
2261 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2262 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2263 /// is parametric on.
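/// For example (illustrative shape): for an access to a memref<16x32xf32>, the
/// constructed region's constraints are 0 <= d0 <= 15 and 0 <= d1 <= 31,
/// parametric on the first `numParamLoopIVs` enclosing loop IVs.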
2264 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2265  MemRefRegion *region) {
2266  unsigned rank;
2267  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2268  rank = loadOp.getMemRefType().getRank();
2269  region->memref = loadOp.getMemRef();
2270  region->setWrite(false);
2271  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2272  rank = storeOp.getMemRefType().getRank();
2273  region->memref = storeOp.getMemRef();
2274  region->setWrite(true);
2275  } else {
2276  assert(false && "expected load or store op");
2277  return false;
2278  }
2279  auto memRefType = cast<MemRefType>(region->memref.getType());
2280  if (!memRefType.hasStaticShape())
2281  return false;
2282 
2283  auto *regionCst = region->getConstraints();
2284 
2285  // Just get the first numSymbols IVs, which the memref region is parametric
2286  // on.
2287  SmallVector<AffineForOp, 4> ivs;
2288  getAffineForIVs(*op, &ivs);
2289  ivs.resize(numParamLoopIVs);
2290  SmallVector<Value, 4> symbols;
2291  extractForInductionVars(ivs, &symbols);
2292  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2293  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2294 
2295  // Memref dim sizes provide the bounds.
2296  for (unsigned d = 0; d < rank; d++) {
2297  auto dimSize = memRefType.getDimSize(d);
2298  assert(dimSize > 0 && "filtered dynamic shapes above");
2299  regionCst->addBound(BoundType::LB, d, 0);
2300  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2301  }
2302  return true;
2303 }
2304 
2305 LogicalResult
2306 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2307  const AffineCopyOptions &copyOptions,
2308  std::optional<Value> filterMemRef,
2309  DenseSet<Operation *> &copyNests) {
2310  if (begin == end)
2311  return success();
2312 
2313  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2314  "Inconsistent block begin/end args");
2315  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2316 
2317  Block *block = begin->getBlock();
2318 
2319  // Copies will be generated for this depth, i.e., symbolic in all loops
2320  // surrounding this block range.
2321  unsigned copyDepth = getNestingDepth(&*begin);
2322 
2323  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2324  << "\n");
2325  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2326  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2327 
2328  // List of memory regions to copy for. We need a map vector to have a
2329  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2330  // since the allocs, for example, are identical except for the SSA id.
2331  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2332  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2333 
2334  // Map from original memref's to the fast buffers that their accesses are
2335  // replaced with.
2336  DenseMap<Value, Value> fastBufferMap;
2337 
2338  // To check for errors when walking the block.
2339  bool error = false;
2340 
2341  // Walk this range of operations to gather all memory regions.
2342  block->walk(begin, end, [&](Operation *opInst) {
2343  Value memref;
2344  MemRefType memrefType;
2345  // Gather regions to allocate to buffers in faster memory space.
2346  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2347  memref = loadOp.getMemRef();
2348  memrefType = loadOp.getMemRefType();
2349  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2350  memref = storeOp.getMemRef();
2351  memrefType = storeOp.getMemRefType();
2352  }
2353  // Not an affine.load/store op.
2354  if (!memref)
2355  return;
2356 
2357  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2358  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2359  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2360  return;
2361 
2362  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2363  LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
2364  "which copy-in/copy-out would happen\n");
2365  return;
2366  }
2367 
2368  // Compute the MemRefRegion accessed.
2369  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2370  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2371  /*addMemRefDimBounds=*/false))) {
2372  LLVM_DEBUG(llvm::dbgs()
2373  << "Error obtaining memory region: semi-affine maps?\n");
2374  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2375  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2376  LLVM_DEBUG(
2377  opInst->emitError("non-constant memref sizes not yet supported"));
2378  error = true;
2379  return;
2380  }
2381  }
2382 
2383  // Each memref has a single buffer associated with it irrespective of how
2384  // many loads and stores happen on it.
2385  // TODO: in the future, when regions don't intersect and satisfy
2386  // other properties (based on load/store regions), we could consider
2387  // multiple buffers per memref.
2388 
2389  // Add to the appropriate region if it's not already in it, or take a
2390  // bounding box union with the existing one if it's already in there.
2391  // Note that a memref may have both read and write regions - so update the
2392  // region in the other list if one exists (write in case of read and vice
2393  // versa) since there is a single bounding box for a memref across all reads
2394  // and writes that happen on it.
2395 
2396  // Attempts to update; returns true if 'region' exists in targetRegions.
2397  auto updateRegion =
2398  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2399  &targetRegions) {
2400  const auto *const it = targetRegions.find(region->memref);
2401  if (it == targetRegions.end())
2402  return false;
2403 
2404  // Perform a union with the existing region.
2405  if (failed(it->second->unionBoundingBox(*region))) {
2406  LLVM_DEBUG(llvm::dbgs()
2407  << "Memory region bounding box failed; "
2408  "over-approximating to the entire memref\n");
2409  // If the union fails, we will overapproximate.
2410  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2411  LLVM_DEBUG(opInst->emitError(
2412  "non-constant memref sizes not yet supported"));
2413  error = true;
2414  return true;
2415  }
2416  it->second->getConstraints()->clearAndCopyFrom(
2417  *region->getConstraints());
2418  } else {
2419  // Union was computed and stored in 'it->second': copy to 'region'.
2420  region->getConstraints()->clearAndCopyFrom(
2421  *it->second->getConstraints());
2422  }
2423  return true;
2424  };
2425 
2426  bool existsInRead = updateRegion(readRegions);
2427  if (error)
2428  return;
2429  bool existsInWrite = updateRegion(writeRegions);
2430  if (error)
2431  return;
2432 
2433  // Finally add it to the region list.
2434  if (region->isWrite() && !existsInWrite) {
2435  writeRegions[region->memref] = std::move(region);
2436  } else if (!region->isWrite() && !existsInRead) {
2437  readRegions[region->memref] = std::move(region);
2438  }
2439  });
2440 
2441  if (error) {
2442  LLVM_DEBUG(begin->emitError(
2443  "copy generation failed for one or more memref's in this block\n"));
2444  return failure();
2445  }
2446 
2447  uint64_t totalCopyBuffersSizeInBytes = 0;
2448  bool ret = true;
2449  auto processRegions =
2450  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2451  &regions) {
2452  for (const auto &regionEntry : regions) {
2453  // For each region, hoist copy in/out past all hoistable
2454  // 'affine.for's.
2455  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2456  Block *copyPlacementBlock;
2457  findHighestBlockForPlacement(
2458  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2459  &copyInPlacementStart, &copyOutPlacementStart);
2460 
2461  uint64_t sizeInBytes;
2462  Block::iterator nBegin, nEnd;
2463  LogicalResult iRet = generateCopy(
2464  *regionEntry.second, block, begin, end, copyPlacementBlock,
2465  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2466  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2467  if (succeeded(iRet)) {
2468  // begin/end could have been invalidated, and need update.
2469  begin = nBegin;
2470  end = nEnd;
2471  totalCopyBuffersSizeInBytes += sizeInBytes;
2472  }
2473  ret = ret & succeeded(iRet);
2474  }
2475  };
2476  processRegions(readRegions);
2477  processRegions(writeRegions);
2478 
2479  if (!ret) {
2480  LLVM_DEBUG(begin->emitError(
2481  "copy generation failed for one or more memref's in this block\n"));
2482  return failure();
2483  }
2484 
2485  // For a range of operations, a note will be emitted at the caller.
2486  AffineForOp forOp;
2487  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2488  LLVM_DEBUG(forOp.emitRemark()
2489  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2490  << " KiB of copy buffers in fast memory space for this block");
2491  }
2492 
2493  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2494  block->getParentOp()->emitWarning(
2495  "total size of all copy buffers' for this block exceeds fast memory "
2496  "capacity");
2497  }
2498 
2499  return success();
2500 }
2501 
2502 // A convenience version of affineDataCopyGenerate for all ops in the body of
2503 // an AffineForOp.
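// Illustrative call from a pass (a sketch; the option values are hypothetical
// and the designated-field comments reflect the AffineCopyOptions layout
// assumed here):
//
//   AffineCopyOptions options{/*generateDma=*/false, /*slowMemorySpace=*/0,
//                             /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
//                             /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   if (failed(affineDataCopyGenerate(forOp, options,
//                                     /*filterMemRef=*/std::nullopt, copyNests)))
//     signalPassFailure();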
2504 LogicalResult mlir::affine::affineDataCopyGenerate(
2505  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2506  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2507  return affineDataCopyGenerate(forOp.getBody()->begin(),
2508  std::prev(forOp.getBody()->end()), copyOptions,
2509  filterMemRef, copyNests);
2510 }
2511 
2512 LogicalResult mlir::affine::generateCopyForMemRegion(
2513  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2514  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2515  Block *block = analyzedOp->getBlock();
2516  auto begin = analyzedOp->getIterator();
2517  auto end = std::next(begin);
2518  DenseMap<Value, Value> fastBufferMap;
2519  DenseSet<Operation *> copyNests;
2520 
2521  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2522  copyOptions, fastBufferMap, copyNests,
2523  &result.sizeInBytes, &begin, &end);
2524  if (failed(err))
2525  return err;
2526 
2527  const auto &en = fastBufferMap.find(memrefRegion.memref);
2528  // In some cases (empty loops), no copy generation would have happened.
2529  if (en == fastBufferMap.end())
2530  return failure();
2531  result.alloc = en->second.getDefiningOp();
2532  assert(result.alloc && "fast buffer expected to be locally allocated");
2533  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2534  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2535  return success();
2536 }
2537 
2538 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2539 static void
2540 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2541  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2542  // Add a new empty level to output if such a level doesn't already exist.
2543  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2544  if (currLoopDepth == depthToLoops.size())
2545  depthToLoops.emplace_back();
2546 
2547  for (auto &op : *block) {
2548  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2549  depthToLoops[currLoopDepth].push_back(forOp);
2550  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2551  }
2552  }
2553 }
2554 
2555 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
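/// For illustration (hypothetical structure): for a function body containing
/// two top-level loops where only the second contains a nested loop,
/// depthToLoops[0] holds both top-level loops and depthToLoops[1] holds the
/// nested one.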
2556 void mlir::affine::gatherLoops(
2557  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2558  for (auto &block : func)
2559  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2560 
2561  // Remove last loop level from output since it's empty.
2562  if (!depthToLoops.empty()) {
2563  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2564  depthToLoops.pop_back();
2565  }
2566 }
2567 
2568 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2569  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2570  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2571  SmallVector<Value, 4> lowerOperands(lbOperands);
2572  SmallVector<Value, 4> upperOperands(ubOperands);
2573 
2574  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2575  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2576  lbMap = removeDuplicateExprs(lbMap);
2577  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2578  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2579  ubMap = removeDuplicateExprs(ubMap);
2580 
2581  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2582  step);
2583 }
2584 
2585 /// Creates an AffineIfOp that encodes the conditional to choose between
2586 /// the constant trip count version and an unknown trip count version of this
2587 /// nest of loops. This is used to separate partial and full tiles if `loops`
2588 /// has the intra-tile loops. The affine.if op is inserted at the builder
2589 /// insertion point of `b`.
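/// For illustration (a sketch): for a point loop %i with bounds
/// [%it, min(%it + 32, %N)), the full-tile condition is essentially
/// %it + 32 <= %N; the caller places the constant-trip-count version of the
/// nest in the "then" region and the original nest in the "else" region.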
2590 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2591  OpBuilder b) {
2592  if (loops.empty())
2593  return nullptr;
2594 
2595  auto *context = loops[0].getContext();
2596 
2597  FlatAffineValueConstraints cst;
2598  SmallVector<Operation *, 8> ops;
2599  llvm::append_range(ops, loops);
2600  (void)getIndexSet(ops, &cst);
2601 
2602  // Remove constraints that are independent of these loop IVs.
2603  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2604 
2605  // Construct the constraint set representing the guard for full tiles. The
2606  // lower bound (and upper bound) corresponding to the full tile should be
2607  // larger (and resp. smaller) than any other lower (or upper bound).
2608  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2609  for (auto loop : loops) {
2610  (void)loop;
2611  // TODO: Non-unit stride is not an issue to generalize to.
2612  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2613  // Mark everything symbols for the purpose of finding a constant diff pair.
2614  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2615  1);
2616  unsigned fullTileLbPos, fullTileUbPos;
2617  if (!((IntegerRelation)cst)
2618  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2619  /*boundFloorDivisor=*/nullptr,
2620  /*ub=*/nullptr, &fullTileLbPos,
2621  &fullTileUbPos)) {
2622  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2623  return nullptr;
2624  }
2625 
2626  SmallVector<unsigned, 4> lbIndices, ubIndices;
2627  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2628 
2629  auto fLb = cst.getInequality(fullTileLbPos);
2630  auto fUb = cst.getInequality(fullTileUbPos);
2631  fullTileLb.assign(fLb.begin(), fLb.end());
2632  fullTileUb.assign(fUb.begin(), fUb.end());
2633 
2634  // Full tile lower bound should be >= than any other lower bound.
2635  for (auto lbIndex : lbIndices)
2636  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2637  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2638 
2639  // Full tile upper bound should be <= any other upper bound.
2640  for (auto ubIndex : ubIndices)
2641  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2642  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2643 
2644  cst.removeVar(0);
2645  }
2646 
2647  // The previous step leads to all zeros for the full tile lb and ub position
2648  // itself; remove those and any other duplicates / trivial redundancies.
2649  cst.removeTrivialRedundancy();
2650 
2651  // Turn everything into dims conservatively since we earlier turned all
2652 // trailing ids past the point loop IVs into symbols. Some of these could be outer
2653  // loop IVs; we'll canonicalize anyway.
2654  cst.setDimSymbolSeparation(0);
2655 
2656  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2657  // ifCondSet can be null if cst was empty -- this can happen if all loops
2658  // in the nest have constant trip counts.
2659  if (!ifCondSet)
2660  return nullptr;
2661 
2662  SmallVector<Value, 4> setOperands;
2663  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2664  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2665  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2666  /*withElseRegion=*/true);
2667 }
2668 
2669 /// Create the full tile loop nest (along with its body).
2670 static LogicalResult
2671 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2672  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2673  fullTileLoops.reserve(inputNest.size());
2674 
2675  // For each loop in the original nest identify a lower/upper bound pair such
2676  // that their difference is a constant.
2677  FlatAffineValueConstraints cst;
2678  for (auto loop : inputNest) {
2679  // TODO: straightforward to generalize to a non-unit stride.
2680  if (loop.getStepAsInt() != 1) {
2681  LLVM_DEBUG(llvm::dbgs()
2682  << "[tile separation] non-unit stride not implemented\n");
2683  return failure();
2684  }
2685  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2686  (void)getIndexSet(loopOp, &cst);
2687  // We will mark everything other than this loop IV as symbol for getting a
2688  // pair of <lb, ub> with a constant difference.
2689  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - 1);
2690  unsigned lbPos, ubPos;
2691  if (!((IntegerRelation)cst)
2692  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2693  /*boundFloorDivisor=*/nullptr,
2694  /*ub=*/nullptr, &lbPos, &ubPos) ||
2695  lbPos == ubPos) {
2696  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2697  "equalities not yet handled\n");
2698  return failure();
2699  }
2700 
2701  // Set all variables as dimensions uniformly since some of those marked as
2702  // symbols above could be outer loop IVs (corresponding tile space IVs).
2703  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2704 
2705  AffineValueMap lbVmap, ubVmap;
2706  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2707  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2708  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2709  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2710  ubVmap.getOperands(), ubVmap.getAffineMap());
2711  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2712  fullTileLoops.push_back(fullTileLoop);
2713  }
2714 
2715  // Add the body for the full tile loop nest.
2716  IRMapping operandMap;
2717  for (const auto &loopEn : llvm::enumerate(inputNest))
2718  operandMap.map(loopEn.value().getInductionVar(),
2719  fullTileLoops[loopEn.index()].getInductionVar());
2720  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2721  for (auto &op : inputNest.back().getBody()->without_terminator())
2722  b.clone(op, operandMap);
2723  return success();
2724 }
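The body-cloning step above is the standard IRMapping plus OpBuilder::clone idiom; a minimal sketch of that idiom in isolation, assuming hypothetical sibling loops `srcLoop` and `dstLoop` with compatible bodies:

// Remap the source IV to the destination IV, then clone every
// non-terminator op of the source body before the destination terminator.
IRMapping mapping;
mapping.map(srcLoop.getInductionVar(), dstLoop.getInductionVar());
OpBuilder bodyBuilder = OpBuilder::atBlockTerminator(dstLoop.getBody());
for (Operation &op : srcLoop.getBody()->without_terminator())
  bodyBuilder.clone(op, mapping);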
2725 
2726 LogicalResult
2727 mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2728                                 SmallVectorImpl<AffineForOp> *fullTileNest) {
2729  if (inputNest.empty())
2730  return success();
2731 
2732  auto firstLoop = inputNest[0];
2733 
2734  // Each successive for op has to be nested inside the previous one.
2735  auto prevLoop = firstLoop;
2736  for (auto loop : inputNest.drop_front(1)) {
2737  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2738  prevLoop = loop;
2739  }
2740 
2741  // Create the full tile loop nest.
2742  SmallVector<AffineForOp, 4> fullTileLoops;
2743  OpBuilder b(firstLoop);
2744  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2745  if (!fullTileLoops.empty())
2746  fullTileLoops.front().erase();
2747  return failure();
2748  }
2749 
2750  // Create and insert the version select right before the root of the nest.
2751  b = OpBuilder(firstLoop);
2752  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2753  if (!ifOp) {
2754  fullTileLoops.front().erase();
2755  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2756  "separation condition\n");
2757  return failure();
2758  }
2759 
2760  // Move the full tile into the then block.
2761  Block *thenBlock = ifOp.getThenBlock();
2762  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2763  thenBlock->getOperations().splice(
2764  std::prev(thenBlock->end()),
2765  outermostFullTileLoop->getBlock()->getOperations(),
2766  Block::iterator(outermostFullTileLoop));
2767 
2768  // Move the partial tile into the else block. The partial tile is the same as
2769  // the original loop nest.
2770  Block *elseBlock = ifOp.getElseBlock();
2771  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2772  firstLoop->getBlock()->getOperations(),
2773  Block::iterator(firstLoop));
2774 
2775  if (fullTileNest)
2776  *fullTileNest = std::move(fullTileLoops);
2777 
2778  return success();
2779 }
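A hypothetical driver for this entry point; the function op `funcOp`, the tile size of 32, and separating only the intra-tile half of the tiled nest are illustrative assumptions rather than anything fixed by this file:

// Tile every identified band, then guard out the full tiles of the
// resulting intra-tile loops.
std::vector<SmallVector<AffineForOp, 6>> bands;
getTileableBands(funcOp, &bands);
for (auto &band : bands) {
  SmallVector<unsigned, 6> tileSizes(band.size(), 32);
  SmallVector<AffineForOp, 6> tiledNest;
  if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
    continue;
  // The outer band.size() loops of the tiled nest are the tile-space loops;
  // the remaining inner ones are the intra-tile loops to separate.
  auto intraTileLoops =
      MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size());
  (void)separateFullTiles(intraTileLoops);
}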
2780 
2781 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2782  LogicalResult result(failure());
2783  SmallVector<AffineForOp> loops;
2784  getPerfectlyNestedLoops(loops, op);
2785  if (loops.size() <= 1)
2786  return success();
2787 
2788  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2789  // loops with bounds defined above some loop.
2790  // 1. For each loop, find above which parent loop its operands are
2791  // defined.
2792  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2793  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2794  operandsDefinedAbove[i] = i;
2795  for (unsigned j = 0; j < i; ++j) {
2796  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2797  operandsDefinedAbove[i] = j;
2798  break;
2799  }
2800  }
2801  }
2802 
2803  // 2. Identify bands of loops such that the operands of all of them are
2804  // defined above the first loop in the band. Traverse the nest bottom-up
2805  // so that modifications don't invalidate the inner loops.
2806  for (unsigned end = loops.size(); end > 0; --end) {
2807  unsigned start = 0;
2808  for (; start < end - 1; ++start) {
2809  auto maxPos =
2810  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2811  std::next(operandsDefinedAbove.begin(), end));
2812  if (maxPos > start)
2813  continue;
2814  assert(maxPos == start &&
2815  "expected loop bounds to be known at the start of the band");
2816  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2817  if (succeeded(coalesceLoops(band)))
2818  result = success();
2819  break;
2820  }
2821  // If a band was found and transformed, keep looking at the loops above
2822  // the outermost transformed loop.
2823  if (start != end - 1)
2824  end = start + 1;
2825  }
2826  return result;
2827 }
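A hypothetical driver over a whole function; `funcOp` is assumed to be a func::FuncOp:

// Attempt to coalesce the band rooted at each affine.for. The utility makes
// no changes when no coalescable band starts at the given loop.
funcOp.walk([](AffineForOp forOp) {
  (void)coalescePerfectlyNestedAffineLoops(forOp);
});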
2828 
2829 int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
2830  int64_t count = 0;
2831  Operation *currentOp = operand.getOwner();
2832  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2833  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2834  break;
2835  currentOp = loopOp;
2836  count++;
2837  }
2838  return count;
2839 }
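A usage sketch for numEnclosingInvariantLoops; `op` and the selection policy below are purely illustrative:

// Pick the operand of `op` that is invariant in the most enclosing loops,
// i.e. the one that could be hoisted the furthest.
OpOperand *bestOperand = nullptr;
int64_t bestDepth = 0;
for (OpOperand &operand : op->getOpOperands()) {
  int64_t depth = numEnclosingInvariantLoops(operand);
  if (depth > bestDepth) {
    bestDepth = depth;
    bestOperand = &operand;
  }
}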
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:383
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:661
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
Definition: LoopUtils.cpp:424
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2264
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1535
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1933
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:362
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1765
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:174
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:409
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:560
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:465
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2540
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:914
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
Definition: LoopUtils.cpp:1951
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2671
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1823
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:689
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:975
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1087
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1317
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:418
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2590
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:44
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1515
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
Definition: LoopUtils.cpp:1859
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:102
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:921
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:398
unsigned getNumDims() const
Definition: AffineMap.cpp:394
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:407
unsigned getNumResults() const
Definition: AffineMap.cpp:402
unsigned getNumInputs() const
Definition: AffineMap.cpp:403
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:411
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:29
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition: Block.h:305
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:246
OpListType & getOperations()
Definition: Block.h:137
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:33
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:399
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:405
AffineMap getDimIdentityMap()
Definition: Builders.cpp:381
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:385
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:366
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:370
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:362
MLIRContext * getContext() const
Definition: Builders.h:55
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:729
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:314
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:551
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:250
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:455
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:257
Operation * getOperation()
Inherit getOperation from OpState.
Definition: OpDefinition.h:111
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:43
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:719
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:268
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:272
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition: Operation.h:230
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:289
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
Definition: Region.h:222
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:208
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:41
static WalkResult advance()
Definition: Visitors.h:51
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:529
Value getOperand(unsigned idx)
Definition: AffineOps.h:535
operand_range getOperands()
Definition: AffineOps.h:544
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
Definition: AffineOps.h:106
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
Definition: AffineOps.h:315
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:93
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1611
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:886
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:118
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2306
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1076
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2773
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:1010
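A hypothetical call site for loopUnrollByFactor; the loop `forOp`, the factor of 4, and the remark text are illustrative:

// Unroll by 4; when the trip count is not a multiple of the factor, the
// utility generates the cleanup loop itself.
if (failed(loopUnrollByFactor(forOp, /*unrollFactor=*/4)))
  forOp.emitRemark("unable to unroll by factor 4");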
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition: Utils.cpp:776
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2556
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1367
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2568
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:857
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:229
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1351
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2512
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1612
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:901
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1389
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1099
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:815
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1259
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1617
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:762
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1376
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2829
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1979
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1725
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1590
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1459
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:773
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1294
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2781
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
Definition: LoopUtils.cpp:874
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1305
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest 'nest' by generating a conditional guard tha...
Definition: LoopUtils.cpp:2727
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2685
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:770
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:35
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:780
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
int64_t stride
Definition: LoopUtils.cpp:1814
int64_t numEltPerStride
Definition: LoopUtils.cpp:1815
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:165
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:205
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:481
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:1071
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:527
bool isWrite() const
Definition: Utils.h:529
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:1127
void setWrite(bool flag)
Definition: Utils.h:530
Value memref
Memref that this region corresponds to.
Definition: Utils.h:562
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:569
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.