LoopUtils.cpp
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Affine/LoopUtils.h"
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
25 #include "llvm/ADT/MapVector.h"
26 #include "llvm/ADT/SmallPtrSet.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include <optional>
30 
31 #define DEBUG_TYPE "loop-utils"
32 
33 using namespace mlir;
34 using namespace affine;
35 using namespace presburger;
36 using llvm::SmallMapVector;
37 
38 /// Computes the cleanup loop lower bound of the loop being unrolled with
39 /// the specified unroll factor; this bound will also be the upper bound of the main
40 /// part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// operands or a null map when the trip count can't be expressed as an affine
42 /// expression.
43 static void
44 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45  AffineMap &cleanupLbMap,
46  SmallVectorImpl<Value> &cleanupLbOperands) {
47  AffineMap tripCountMap;
48  SmallVector<Value, 4> tripCountOperands;
49  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50  // Trip count can't be computed.
51  if (!tripCountMap) {
52  cleanupLbMap = AffineMap();
53  return;
54  }
55 
56  OpBuilder b(forOp);
57  auto lbMap = forOp.getLowerBoundMap();
58  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59  forOp.getLowerBoundOperands());
60 
61  // For each upper bound expr, get the range.
62  // Eg: affine.for %i = lb to min (ub1, ub2),
63  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65  // these affine.apply's make up the cleanup loop lower bound.
66  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68  int64_t step = forOp.getStepAsInt();
69  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70  auto tripCountExpr = tripCountMap.getResult(i);
71  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
73  tripCountMap.getNumSymbols(), bumpExprs[i]);
74  bumpValues[i] =
75  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76  }
77 
78  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81 
82  cleanupLbOperands.clear();
83  cleanupLbOperands.push_back(lb);
84  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
85  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
86  b.getContext());
87  // Simplify the cleanupLbMap + cleanupLbOperands.
88  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
89  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
90  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  // Remove any affine.apply's that became dead from the simplification above.
92  for (auto v : bumpValues)
93  if (v.use_empty())
94  v.getDefiningOp()->erase();
95 
96  if (lb.use_empty())
97  lb.erase();
98 }
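// Illustrative sketch (not from this file): for a hypothetical loop
// `affine.for %i = 0 to 37` with step 1 unrolled by a factor of 4, the trip
// count map yields 37, so the bump expression above is (37 - 37 mod 4) * 1 = 36
// and the cleanup lower bound becomes lb + 36 = 36; the main unrolled loop then
// covers [0, 36) and the cleanup loop covers [36, 37).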
99 
100 /// Helper to replace uses of loop carried values (iter_args) and loop
101 /// yield values while promoting single iteration affine.for ops.
102 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103  // Replace uses of iter arguments with iter operands (initial values).
104  auto iterOperands = forOp.getInits();
105  auto iterArgs = forOp.getRegionIterArgs();
106  for (auto e : llvm::zip(iterOperands, iterArgs))
107  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108 
109  // Replace uses of loop results with the values yielded by the loop.
110  auto outerResults = forOp.getResults();
111  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112  for (auto e : llvm::zip(outerResults, innerResults))
113  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114 }
115 
116 /// Promotes the loop body of a forOp to its containing block if the forOp
117 /// was known to have a single iteration.
118 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120  if (!tripCount || *tripCount != 1)
121  return failure();
122 
123  // TODO: extend this for arbitrary affine bounds.
124  if (forOp.getLowerBoundMap().getNumResults() != 1)
125  return failure();
126 
127  // Replaces all IV uses to its single iteration value.
128  auto iv = forOp.getInductionVar();
129  auto *parentBlock = forOp->getBlock();
130  if (!iv.use_empty()) {
131  if (forOp.hasConstantLowerBound()) {
132  auto func = forOp->getParentOfType<FunctionOpInterface>();
133  OpBuilder builder(forOp->getContext());
134  if (func)
135  builder.setInsertionPointToStart(&func.getFunctionBody().front());
136  else
137  builder.setInsertionPoint(forOp);
138  auto constOp = builder.create<arith::ConstantIndexOp>(
139  forOp.getLoc(), forOp.getConstantLowerBound());
140  iv.replaceAllUsesWith(constOp);
141  } else {
142  auto lbOperands = forOp.getLowerBoundOperands();
143  auto lbMap = forOp.getLowerBoundMap();
144  OpBuilder builder(forOp);
145  if (lbMap == builder.getDimIdentityMap()) {
146  // No need of generating an affine.apply.
147  iv.replaceAllUsesWith(lbOperands[0]);
148  } else {
149  auto affineApplyOp =
150  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
151  iv.replaceAllUsesWith(affineApplyOp);
152  }
153  }
154  }
155 
156  replaceIterArgsAndYieldResults(forOp);
157 
158  // Move the loop body operations, except for its terminator, to the loop's
159  // containing block.
160  forOp.getBody()->back().erase();
161  parentBlock->getOperations().splice(Block::iterator(forOp),
162  forOp.getBody()->getOperations());
163  forOp.erase();
164  return success();
165 }
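// Usage sketch (hedged): promoteIfSingleIteration is typically applied over a
// whole region, e.g. from a pass, where `func` below stands for whatever
// function-like op is being processed:
//
//   func.walk([](AffineForOp forOp) {
//     (void)promoteIfSingleIteration(forOp);
//   });
//
// Loops whose trip count is unknown or not equal to 1 are left untouched, since
// the function returns failure() without modifying them.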
166 
167 /// Generates an affine.for op with the specified lower and upper bounds
168 /// while generating the right IV remappings to realize shifts for operations in
169 /// its body. The operations that go into the loop body are specified in
170 /// opGroupQueue starting from the specified offset, and in that order. The
171 /// first element of the pair specifies the shift applied to that group of
172 /// operations; the shift is multiplied by the loop step before being applied.
173 /// Returns nullptr if the generated loop simplifies to a single iteration one.
174 static AffineForOp generateShiftedLoop(
175  AffineMap lbMap, AffineMap ubMap,
176  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178  auto lbOperands = srcForOp.getLowerBoundOperands();
179  auto ubOperands = srcForOp.getUpperBoundOperands();
180 
181  assert(lbMap.getNumInputs() == lbOperands.size());
182  assert(ubMap.getNumInputs() == ubOperands.size());
183 
184  auto loopChunk =
185  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
186  ubMap, srcForOp.getStepAsInt());
187  auto loopChunkIV = loopChunk.getInductionVar();
188  auto srcIV = srcForOp.getInductionVar();
189 
190  IRMapping operandMap;
191 
192  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
193  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
194  uint64_t shift = it.first;
195  auto ops = it.second;
196  // All 'same shift' operations get added with their operands being
197  // remapped to results of cloned operations, and any uses of their IV remapped.
198  // Generate the remapping if the shift is not zero: remappedIV = newIV -
199  // shift.
200  if (!srcIV.use_empty() && shift != 0) {
201  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202  srcForOp.getLoc(),
203  bodyBuilder.getSingleDimShiftAffineMap(
204  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205  loopChunkIV);
206  operandMap.map(srcIV, ivRemap);
207  } else {
208  operandMap.map(srcIV, loopChunkIV);
209  }
210  for (auto *op : ops)
211  bodyBuilder.clone(*op, operandMap);
212  };
213  if (succeeded(promoteIfSingleIteration(loopChunk)))
214  return AffineForOp();
215  return loopChunk;
216 }
217 
218 // The skewing of operations with respect to one another can be used for
219 // example to allow overlap of asynchronous operations (such as DMA
220 // communication) with computation, or just relative shifting of operations
221 // for better register reuse, locality or parallelism. As such, the shifts are
222 // typically expected to be at most of the order of the number of operations.
223 // This method should not be used as a substitute for loop distribution/fission.
224 // This method uses an algorithm that runs in time linear in the number of
225 // operations in the body of the for loop (using the 'sweep line' paradigm).
226 // This method asserts preservation of SSA dominance. A check for that, as
227 // well as for memory-based dependence preservation, rests with the users of this
228 // method.
229 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230  ArrayRef<uint64_t> shifts,
231  bool unrollPrologueEpilogue) {
232  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233  "too few/many shifts");
234  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
235  return success();
236 
237  // If the trip counts aren't constant, we would need versioning and
238  // conditional guards (or context information to prevent such versioning). The
239  // better way to pipeline for such loops is to first tile them and extract
240  // constant trip count "full tiles" before applying this.
241  auto mayBeConstTripCount = getConstantTripCount(forOp);
242  if (!mayBeConstTripCount) {
243  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244  return success();
245  }
246  uint64_t tripCount = *mayBeConstTripCount;
247 
248  assert(isOpwiseShiftValid(forOp, shifts) &&
249  "shifts will lead to an invalid transformation\n");
250 
251  int64_t step = forOp.getStepAsInt();
252 
253  unsigned numChildOps = shifts.size();
254 
255  // Do a linear time (counting) sort for the shifts.
256  uint64_t maxShift = *llvm::max_element(shifts);
257  if (maxShift >= numChildOps) {
258  // Large shifts are not the typical use case.
259  forOp.emitWarning("not shifting because shifts are unrealistically large");
260  return success();
261  }
262 
263  // An array of operation groups sorted by shift amount; each group has all
264  // operations with the same shift in the order in which they appear in the
265  // body of the 'affine.for' op.
266  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267  unsigned pos = 0;
268  for (auto &op : forOp.getBody()->without_terminator()) {
269  auto shift = shifts[pos++];
270  sortedOpGroups[shift].push_back(&op);
271  }
272 
273  // Unless the shifts have a specific pattern (which actually would be the
274  // common use case), prologue and epilogue are not meaningfully defined.
275  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276  // loop generated as the prologue and the last as epilogue and unroll these
277  // fully.
278  AffineForOp prologue, epilogue;
279 
280  // Do a sweep over the sorted shifts while storing open groups in a
281  // vector, and generating loop portions as necessary during the sweep. A block
282  // of operations is paired with its shift.
283  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284 
285  auto origLbMap = forOp.getLowerBoundMap();
286  uint64_t lbShift = 0;
287  OpBuilder b(forOp);
288  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289  // If nothing is shifted by d, continue.
290  if (sortedOpGroups[d].empty())
291  continue;
292  if (!opGroupQueue.empty()) {
293  assert(d > 0 &&
294  "Queue expected to be empty when the first block is found");
295  // The interval for which the loop needs to be generated here is:
296  // [lbShift, min(lbShift + tripCount, d)) and the body of the
297  // loop needs to have all operations in opQueue in that order.
298  AffineForOp res;
299  if (lbShift + tripCount * step < d * step) {
300  res = generateShiftedLoop(
301  b.getShiftedAffineMap(origLbMap, lbShift),
302  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
303  opGroupQueue, /*offset=*/0, forOp, b);
304  // Entire loop for the queued op groups generated, empty it.
305  opGroupQueue.clear();
306  lbShift += tripCount * step;
307  } else {
308  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
309  b.getShiftedAffineMap(origLbMap, d),
310  opGroupQueue, /*offset=*/0, forOp, b);
311  lbShift = d * step;
312  }
313 
314  if (res) {
315  // Simplify/canonicalize the affine.for.
316  RewritePatternSet patterns(res.getContext());
317  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
318  bool erased;
319  (void)applyOpPatternsGreedily(
320  res.getOperation(), std::move(patterns),
321  GreedyRewriteConfig().setStrictness(
322  GreedyRewriteStrictness::ExistingAndNewOps),
323  /*changed=*/nullptr, &erased);
324  if (!erased && !prologue)
325  prologue = res;
326  if (!erased)
327  epilogue = res;
328  }
329  } else {
330  // Start of first interval.
331  lbShift = d * step;
332  }
333  // Augment the list of operations that get into the current open interval.
334  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
335  }
336 
337  // Those operations groups left in the queue now need to be processed (FIFO)
338  // and their loops completed.
339  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
342  b.getShiftedAffineMap(origLbMap, ubShift),
343  opGroupQueue, /*offset=*/i, forOp, b);
344  lbShift = ubShift;
345  if (!prologue)
346  prologue = epilogue;
347  }
348 
349  // Erase the original for op.
350  forOp.erase();
351 
352  if (unrollPrologueEpilogue && prologue)
353  (void)loopUnrollFull(prologue);
354  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
355  (void)loopUnrollFull(epilogue);
356 
357  return success();
358 }
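// Worked example (a sketch, not taken from a test): for a loop with constant
// trip count N, two operations op0 and op1 in its body, and shifts = [0, 1],
// the sweep above produces three loops: a prologue executing only op0 for the
// first iteration, a steady-state loop executing op1 (with its IV shifted back
// by one iteration) together with op0, and an epilogue executing only op1 for
// the last iteration. With unrollPrologueEpilogue set, the prologue and the
// epilogue are then fully unrolled.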
359 
360 /// Checks whether a loop nest is hyper-rectangular or not.
361 static LogicalResult
362 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363  FlatAffineValueConstraints cst;
364  SmallVector<Operation *, 8> ops(input.begin(), input.end());
365  // 0-d or 1-d is trivially hyper-rectangular.
366  if (input.size() <= 1)
367  return success();
368  if (failed(getIndexSet(ops, &cst))) {
369  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
370  return failure();
371  }
372  if (!cst.isHyperRectangular(0, input.size())) {
373  LLVM_DEBUG(llvm::dbgs()
374  << "Non-hyperrectangular nests not supported for tiling!\n");
375  return failure();
376  }
377  return success();
378 }
379 
380 /// Check if the input nest is supported for tiling and whether tiling would be
381 /// legal or not.
382 template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384  ArrayRef<t> tileSizes) {
385  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386 
387  if (llvm::any_of(input,
388  [](AffineForOp op) { return op.getNumResults() > 0; })) {
389  LLVM_DEBUG(llvm::dbgs()
390  << "Cannot tile nest where a loop has yield values\n");
391  return failure();
392  }
393 
394  // Check if the supplied `for` ops are all successively nested.
395  if (!isPerfectlyNested(input)) {
396  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397  return failure();
398  }
399 
400  // TODO: handle non hyper-rectangular spaces.
401  if (failed(checkIfHyperRectangular(input)))
402  return failure();
403 
404  return success();
405 }
406 
407 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
408 /// location in destination's body, ignoring the terminator.
409 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410  Block::iterator loc) {
411  auto &ops = src.getBody()->getOperations();
412  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
413  std::prev(ops.end()));
414 }
415 
416 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417 /// body.
418 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
420 }
421 
422 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
423 /// the body of the original loop nest to the tiled loop nest.
424 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425  AffineForOp rootAffineForOp, unsigned width,
426  MutableArrayRef<AffineForOp> tiledLoops) {
427  Location loc = rootAffineForOp.getLoc();
428 
429  // The outermost among the loops as we add more..
430  Operation *topLoop = rootAffineForOp.getOperation();
431  AffineForOp innermostPointLoop;
432 
433  // Add intra-tile (or point) loops.
434  for (unsigned i = 0; i < width; i++) {
435  OpBuilder b(topLoop);
436  // Loop bounds will be set later.
437  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
438  pointLoop.getBody()->getOperations().splice(
439  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
440  topLoop);
441  tiledLoops[2 * width - 1 - i] = pointLoop;
442  topLoop = pointLoop.getOperation();
443  if (i == 0)
444  innermostPointLoop = pointLoop;
445  }
446 
447  // Add tile space loops;
448  for (unsigned i = width; i < 2 * width; i++) {
449  OpBuilder b(topLoop);
450  // Loop bounds will be set later.
451  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
452  tileSpaceLoop.getBody()->getOperations().splice(
453  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
454  topLoop);
455  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456  topLoop = tileSpaceLoop.getOperation();
457  }
458 
459  // Move the loop body of the original nest to the new one.
460  moveLoopBody(origLoops.back(), innermostPointLoop);
461 }
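// For example (sketch), given a 2-d band (i, j) and width == 2, the code above
// builds four empty loops and records them in `tiledLoops` in the order
// [tile-space i, tile-space j, intra-tile i, intra-tile j]; the original body
// ends up inside the innermost intra-tile loop, and all bounds are still the
// placeholder 0..0 until the callers below set them.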
462 
463 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
464 // TODO: Handle non-constant lower bounds.
465 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466  AffineForOp newInterTileLoop,
467  AffineForOp newIntraTileLoop,
468  Value tileSize) {
469  // The lower bound for the intra-tile loop is represented by an affine map
470  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
471  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
472  // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
473  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
474  // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
475  // corresponding inter-tile loop.
476 
477  assert(origLoop.hasConstantLowerBound() &&
478  "expected input loops to have constant lower bound.");
479 
480  // Get lower bound of original loop as an affine expression.
481  AffineExpr origLowerBoundExpr;
482  origLowerBoundExpr =
483  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
484 
485  // Add dim operands from original lower/upper bound.
486  SmallVector<Value, 4> lbOperands, ubOperands;
487  AffineBound lb = origLoop.getLowerBound();
488  AffineBound ub = origLoop.getUpperBound();
489  lbOperands.reserve(lb.getNumOperands() + 2);
490  ubOperands.reserve(ub.getNumOperands() + 2);
491  AffineMap origLbMap = lb.getMap();
492  AffineMap origUbMap = ub.getMap();
493  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
494  lbOperands.push_back(lb.getOperand(j));
495  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
496  ubOperands.push_back(ub.getOperand(j));
497 
498  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
499  // IV.
500  lbOperands.push_back(newInterTileLoop.getInductionVar());
501  ubOperands.push_back(newInterTileLoop.getInductionVar());
502 
503  // Get loop IV as an affine expression for lower/upper bound. Size of
504  // lb/ubOperands is guaranteed to be at least one.
505  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
506  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
507 
508  // Add symbol operands from original lower/upper bound.
509  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
510  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
511  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
512  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
513 
514  // Add a new symbol operand which is the tile size for this loop.
515  lbOperands.push_back(tileSize);
516  ubOperands.push_back(tileSize);
517 
518  SmallVector<AffineExpr, 4> lbBoundExprs;
519  SmallVector<AffineExpr, 4> ubBoundExprs;
520  lbBoundExprs.reserve(origLbMap.getNumResults());
521  ubBoundExprs.reserve(origUbMap.getNumResults());
522 
523  // Get tiling parameter as an affine expression for lb/ub.
524  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
525  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
526 
527  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528  lbBoundExprs.push_back(
529  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
530  origLowerBoundExpr);
531 
532  // Get the origLoopStep as an affine expression.
533  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
534 
535  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
536  // (tilingParameter * origLoopStep) + origlb.
537  ubBoundExprs.push_back(
538  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
539  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
540 
541  ubBoundExprs.append(origUbMap.getResults().begin(),
542  origUbMap.getResults().end());
543 
544  AffineMap lbMap =
545  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
546  lbBoundExprs, b.getContext());
547  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
548 
549  AffineMap ubMap =
550  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
551  ubBoundExprs, b.getContext());
552  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
553 
554  // Original loop step must be preserved.
555  newIntraTileLoop.setStep(origLoop.getStepAsInt());
556 }
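// As a concrete sketch: for an original loop `affine.for %i = 0 to %N step 1`
// tiled with an SSA tile size %t0, the maps built above make the intra-tile
// loop run from %iv * %t0 to min(%iv * %t0 + %t0, %N), where %iv is the
// inter-tile loop IV; the extra min operand comes from appending the original
// upper bound results to `ubBoundExprs`.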
557 
558 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
559 // TODO: Handle non-constant lower bounds.
560 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561  AffineForOp newLoop, Value tileSize) {
562  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563 
564  // The lower bounds for inter-tile loops are same as the corresponding lower
565  // bounds of original loops.
566  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
567 
568  // The new upper bound map for inter-tile loops, assuming constant lower
569  // bounds, are now originalLowerBound + ceildiv((originalUpperBound -
570  // originalLowerBound), tiling parameter); where tiling parameter is the
571  // respective tile size for that loop. For e.g. if the original ubmap was
572  // ()->(1024), the new map will be
573  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
574  // Therefore a new symbol operand is inserted in the map and the result
575  // expression is overwritten.
576 
577  assert(origLoop.hasConstantLowerBound() &&
578  "expected input loops to have constant lower bound.");
579 
580  // Get lower bound of original loop as an affine expression.
581  AffineExpr origLowerBoundExpr;
582  origLowerBoundExpr =
583  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
584 
585  // Add dim operands from original upper bound.
586  SmallVector<Value, 4> ubOperands;
587  AffineBound ub = origLoop.getUpperBound();
588  ubOperands.reserve(ub.getNumOperands() + 1);
589  AffineMap origUbMap = ub.getMap();
590  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
591  ubOperands.push_back(ub.getOperand(j));
592 
593  // Add symbol operands from original upper bound.
594  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
595  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
596 
597  // Add a new symbol operand which is the tile size for this loop.
598  ubOperands.push_back(tileSize);
599 
600  // Get tiling parameter as an affine expression.
601  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
602 
603  SmallVector<AffineExpr, 4> boundExprs;
604  boundExprs.reserve(origUbMap.getNumResults());
605  int64_t origUpperBound;
606  AffineExpr origUpperBoundExpr;
607 
608  // If upper bound for the original loop is constant, then the constant can
609  // be obtained as an affine expression straight away.
610  if (origLoop.hasConstantUpperBound()) {
611  origUpperBound = origLoop.getConstantUpperBound();
612 
613  // Get original constant upper bound as an affine expression.
614  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
615 
616  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
617  // originalLowerBound), tilingParameter).
618  boundExprs.push_back(
619  origLowerBoundExpr +
620  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
621  } else {
622  // If upper bound for the original loop is not constant then two cases
623  // are possible, although their handling is the same. 1.) The result of
624  // ubmap has only one result expression. For e.g.
625  // affine.for %i = 5 to %ub
626  //
627  // A symbol operand is added which represents the tiling parameter. The
628  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629  // where 's0' is the original upper bound and 's1' is the tiling
630  // parameter. 2.) When ubMap has more than one result expression. For e.g.
631  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
632  // affine.for %i = 5 to min #map0()[%s0, %s1]
633  //
634  // A symbol operand is added which represents the tiling parameter. The
635  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
637 
638  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639  // originalLowerBound), tilingParameter).
640  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641  boundExprs.push_back(
642  origLowerBoundExpr +
643  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
644  }
645 
646  AffineMap ubMap =
647  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
648  boundExprs, b.getContext());
649  newLoop.setUpperBound(ubOperands, ubMap);
650 
651  // Original loop step must be preserved.
652  newLoop.setStep(origLoop.getStepAsInt());
653 }
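// For instance (sketch): with a constant original loop `affine.for %i = 0 to
// 1024` and an SSA tile size %t0, the inter-tile loop built above keeps the
// lower bound 0 and gets the upper bound map ()[s0] -> (1024 ceildiv s0), i.e.
// it iterates over tile indices 0 .. ceil(1024 / %t0) - 1, with its step left
// equal to the original step.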
654 
655 /// Constructs and sets new loop bounds after tiling for the case of
656 /// hyper-rectangular index sets, where the bounds of one dimension do not
657 /// depend on other dimensions and tiling parameters are captured from SSA
658 /// values. Bounds of each dimension can thus be treated independently,
659 /// and deriving the new bounds is much simpler and faster than for the case of
660 /// tiling arbitrary polyhedral shapes.
661 static void constructParametricallyTiledIndexSetHyperRect(
662  MutableArrayRef<AffineForOp> origLoops,
663  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
664  assert(!origLoops.empty() && "expected at least one loop in band");
665  assert(origLoops.size() == tileSizes.size() &&
666  "expected tiling parameter for each loop in band.");
667 
668  OpBuilder b(origLoops[0].getOperation());
669  unsigned width = origLoops.size();
670 
671  // Set bounds for tile space loops.
672  for (unsigned i = 0; i < width; ++i) {
673  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
674  }
675 
676  // Set bounds for intra-tile loops.
677  for (unsigned i = 0; i < width; ++i) {
678  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
679  newLoops[i + width], tileSizes[i]);
680  }
681 }
682 
683 /// Constructs and sets new loop bounds after tiling for the case of
684 /// hyper-rectangular index sets, where the bounds of one dimension do not
685 /// depend on other dimensions. Bounds of each dimension can thus be treated
686 /// independently, and deriving the new bounds is much simpler and faster
687 /// than for the case of tiling arbitrary polyhedral shapes.
688 static void
689 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690  MutableArrayRef<AffineForOp> newLoops,
691  ArrayRef<unsigned> tileSizes) {
692  assert(!origLoops.empty());
693  assert(origLoops.size() == tileSizes.size());
694 
695  OpBuilder b(origLoops[0].getOperation());
696  unsigned width = origLoops.size();
697 
698  // Bounds for tile space loops.
699  for (unsigned i = 0; i < width; i++) {
700  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
703  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
704  // If the step size of original loop is x and tileSize is y then after
705  // tiling the tile space loops' step size becomes x*y.
706  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707  }
708  // Bounds for intra-tile loops.
709  for (unsigned i = 0; i < width; i++) {
710  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
711  std::optional<uint64_t> mayBeConstantCount =
712  getConstantTripCount(origLoops[i]);
713  // The lower bound is just the tile-space loop.
714  AffineMap lbMap = b.getDimIdentityMap();
715  newLoops[width + i].setLowerBound(
716  /*operands=*/newLoops[i].getInductionVar(), lbMap);
717  // The step size of intra-tile loops is just the original loops' step size.
718  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719 
720  // Set the upper bound.
721  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722  // Trip count is less than the tile size: upper bound is lower bound +
723  // trip count * stepSize.
724  AffineMap ubMap = b.getSingleDimShiftAffineMap(
725  *mayBeConstantCount * origLoops[i].getStepAsInt());
726  newLoops[width + i].setUpperBound(
727  /*operands=*/newLoops[i].getInductionVar(), ubMap);
728  } else if (largestDiv % tileSizes[i] != 0) {
729  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730  // Construct the upper bound map; the operands are the original operands
731  // with 'i' (tile-space loop) appended to it. The new upper bound map is
732  // the original one with an additional expression i + tileSize * stepSize
733  // appended.
734 
735  // Add dim operands from original upper bound.
736  SmallVector<Value, 4> ubOperands;
737  AffineBound ub = origLoops[i].getUpperBound();
738  ubOperands.reserve(ub.getNumOperands() + 1);
739  AffineMap origUbMap = ub.getMap();
740  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
741  ubOperands.push_back(ub.getOperand(j));
742 
743  // Add dim operand for new loop upper bound.
744  ubOperands.push_back(newLoops[i].getInductionVar());
745 
746  // Add symbol operands from original upper bound.
747  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
748  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
749 
750  SmallVector<AffineExpr, 4> boundExprs;
751  boundExprs.reserve(1 + origUbMap.getNumResults());
752  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
753  // The new upper bound map is the original one with an additional
754  // expression i + tileSize * stepSize (of original loop) appended.
755  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
756  boundExprs.append(origUbMap.getResults().begin(),
757  origUbMap.getResults().end());
758  AffineMap ubMap =
759  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
760  boundExprs, b.getContext());
761  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
762  } else {
763  // No need of the min expression.
764  AffineExpr dim = b.getAffineDimExpr(0);
765  AffineMap ubMap = AffineMap::get(
766  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
767  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
768  }
769  }
770 }
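// Putting it together (sketch): tiling `affine.for %i = 0 to 256 step 1` with a
// constant tile size of 32 gives a tile-space loop `affine.for %ii = 0 to 256
// step 32` and, since 32 divides the trip count, an intra-tile loop
// `affine.for %i = %ii to %ii + 32 step 1` with no min in its upper bound.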
771 
772 LogicalResult
773 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774  ArrayRef<unsigned> tileSizes,
775  SmallVectorImpl<AffineForOp> *tiledNest) {
776  if (input.empty())
777  return success();
778 
779  if (failed(performPreTilingChecks(input, tileSizes)))
780  return failure();
781 
782  MutableArrayRef<AffineForOp> origLoops = input;
783  AffineForOp rootAffineForOp = origLoops[0];
784 
785  // Note that width is at least one since the band isn't empty.
786  unsigned width = input.size();
787  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788 
789  // Construct a tiled loop nest without setting their bounds. Bounds are
790  // set later.
791  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792 
793  SmallVector<Value, 8> origLoopIVs;
794  extractForInductionVars(input, &origLoopIVs);
795 
796  // Set loop bounds for the tiled loop nest.
797  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
798 
799  // Replace original IVs with intra-tile loop IVs.
800  for (unsigned i = 0; i < width; i++)
801  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
802 
803  // Erase the old loop nest.
804  rootAffineForOp.erase();
805 
806  if (tiledNest)
807  *tiledNest = std::move(tiledLoops);
808 
809  return success();
810 }
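// Usage sketch (hedged; `funcOp` and the tile size of 32 are made up for the
// example): tileable bands are usually gathered first and then tiled, e.g.
//
//   std::vector<SmallVector<AffineForOp, 6>> bands;
//   getTileableBands(funcOp, &bands);
//   for (auto &band : bands) {
//     SmallVector<unsigned, 4> tileSizes(band.size(), 32);
//     (void)tilePerfectlyNested(band, tileSizes);
//   }
//
// The last parameter is optional; pass a SmallVector to receive the 2*width
// loops of the tiled nest.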
811 
812 /// Tiles the specified band of perfectly nested loops creating tile-space
813 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
814 /// is a contiguous set of loops.
815 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817  SmallVectorImpl<AffineForOp> *tiledNest) {
818  if (input.empty())
819  return success();
820 
821  if (failed(performPreTilingChecks(input, tileSizes)))
822  return failure();
823 
824  MutableArrayRef<AffineForOp> origLoops = input;
825  AffineForOp rootAffineForOp = origLoops[0];
826  unsigned width = input.size();
827  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828 
829  // Construct a tiled loop nest without setting their bounds. Bounds are
830  // set later.
831  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832 
833  SmallVector<Value, 8> origLoopIVs;
834  extractForInductionVars(input, &origLoopIVs);
835 
836  // Set loop bounds for the tiled loop nest.
837  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
838  tileSizes);
839 
840  // Replace original IVs with intra-tile loop IVs.
841  for (unsigned i = 0; i < width; i++)
842  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
843 
844  // Erase the old loop nest.
845  rootAffineForOp.erase();
846 
847  if (tiledNest)
848  *tiledNest = std::move(tiledLoops);
849 
850  return success();
851 }
852 
853 /// Get the perfectly nested sequence of loops starting at the root of the
854 /// loop nest (the first op in each body being another AffineForOp, and the
855 /// second op being the terminator). That is, a loop is perfectly nested iff
856 /// the first op in its body is another AffineForOp and the second is the terminator.
857 void mlir::affine::getPerfectlyNestedLoops(
858  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860  nestedLoops.push_back(root);
861  Block &body = root.getRegion().front();
862  if (body.begin() != std::prev(body.end(), 2))
863  return;
864 
865  root = dyn_cast<AffineForOp>(&body.front());
866  if (!root)
867  return;
868  }
869 }
870 
871 /// Identify valid and profitable bands of loops to tile. This is currently just
872 /// a temporary placeholder to test the mechanics of tiled code generation.
873 /// Returns all maximal outermost perfect loop nests to tile.
874 void mlir::affine::getTileableBands(
875  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876  // Get maximal perfect nest of 'affine.for' insts starting from root
877  // (inclusive).
878  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879  SmallVector<AffineForOp, 6> band;
880  getPerfectlyNestedLoops(band, forOp);
881  bands->push_back(band);
882  }
883 }
884 
885 /// Unrolls this loop completely.
886 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888  if (mayBeConstantTripCount.has_value()) {
889  uint64_t tripCount = *mayBeConstantTripCount;
890  if (tripCount == 0)
891  return success();
892  if (tripCount == 1)
893  return promoteIfSingleIteration(forOp);
894  return loopUnrollByFactor(forOp, tripCount);
895  }
896  return failure();
897 }
898 
899 /// Unrolls this loop by the specified factor or by the trip count (if constant),
900 /// whichever is lower.
901 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902  uint64_t unrollFactor) {
903  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904  if (mayBeConstantTripCount.has_value() &&
905  *mayBeConstantTripCount < unrollFactor)
906  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
907  return loopUnrollByFactor(forOp, unrollFactor);
908 }
909 
910 /// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
911 /// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
912 /// unrolled body. If specified, annotates the Ops in each unrolled iteration
913 /// using annotateFn.
914 static void generateUnrolledLoop(
915  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918  ValueRange iterArgs, ValueRange yieldedValues) {
919  // Builder to insert unrolled bodies just before the terminator of the body of
920  // 'forOp'.
921  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
922 
923  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924  if (!annotateFn)
925  annotateFn = defaultAnnotateFn;
926 
927  // Keep a pointer to the last non-terminator operation in the original block
928  // so that we know what to clone (since we are doing this in-place).
929  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
930 
931  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932  SmallVector<Value, 4> lastYielded(yieldedValues);
933 
934  for (unsigned i = 1; i < unrollFactor; i++) {
935  IRMapping operandMap;
936 
937  // Prepare operand map.
938  operandMap.map(iterArgs, lastYielded);
939 
940  // If the induction variable is used, create a remapping to the value for
941  // this unrolled instance.
942  if (!forOpIV.use_empty()) {
943  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
944  operandMap.map(forOpIV, ivUnroll);
945  }
946 
947  // Clone the original body of 'forOp'.
948  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
949  Operation *clonedOp = builder.clone(*it, operandMap);
950  annotateFn(i, clonedOp, builder);
951  }
952 
953  // Update yielded values. If the yielded value is defined outside the
954  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956  // value with the clone corresponding to the yielded value.
957  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958  Operation *defOp = yieldedValues[i].getDefiningOp();
959  if (defOp && defOp->getBlock() == loopBodyBlock)
960  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
961  }
962  }
963 
964  // Make sure we annotate the Ops in the original body. We do this last so that
965  // any annotations are not copied into the cloned Ops above.
966  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
967  annotateFn(0, &*it, builder);
968 
969  // Update operands of the yield statement.
970  loopBodyBlock->getTerminator()->setOperands(lastYielded);
971 }
972 
973 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974 /// count is not a multiple of `unrollFactor`.
975 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976  uint64_t unrollFactor) {
977  // Insert the cleanup loop right after 'forOp'.
978  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
979  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
980 
981  // Update uses of `forOp` results: `cleanupForOp` takes `forOp`'s results as
982  // its iter_args' initial values and produces the results for `forOp`'s original users.
983  auto results = forOp.getResults();
984  auto cleanupResults = cleanupForOp.getResults();
985  auto cleanupIterOperands = cleanupForOp.getInits();
986 
987  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
988  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
989  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
990  }
991 
992  AffineMap cleanupMap;
993  SmallVector<Value, 4> cleanupOperands;
994  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
995  if (!cleanupMap)
996  return failure();
997 
998  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
999  // Promote the loop body up if this has turned into a single iteration loop.
1000  (void)promoteIfSingleIteration(cleanupForOp);
1001 
1002  // Adjust upper bound of the original loop; this is the same as the lower
1003  // bound of the cleanup loop.
1004  forOp.setUpperBound(cleanupOperands, cleanupMap);
1005  return success();
1006 }
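// Sketch of the effect: unrolling `affine.for %i = 0 to 37` by 4 first clones
// the loop right after itself as the cleanup loop, then (via
// getCleanupLoopLowerBound) sets the cleanup loop to run over [36, 37) and
// shrinks the original loop's upper bound to 36 so that it covers the unrolled
// multiple [0, 36); the single-iteration cleanup loop is then promoted.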
1007 
1008 /// Unrolls this loop by the specified factor. Returns success if the loop
1009 /// is successfully unrolled.
1010 LogicalResult mlir::affine::loopUnrollByFactor(
1011  AffineForOp forOp, uint64_t unrollFactor,
1012  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013  bool cleanUpUnroll) {
1014  assert(unrollFactor > 0 && "unroll factor should be positive");
1015 
1016  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017  if (unrollFactor == 1) {
1018  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1019  return failure();
1020  return success();
1021  }
1022 
1023  // Nothing in the loop body other than the terminator.
1024  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1025  return success();
1026 
1027  // If the trip count is lower than the unroll factor, no unrolled body.
1028  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1029  if (cleanUpUnroll) {
1030  // Unroll the cleanup loop if cleanUpUnroll is specified.
1031  return loopUnrollFull(forOp);
1032  }
1033 
1034  return failure();
1035  }
1036 
1037  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1038  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1039  // Loops where the lower bound is a max expression or the upper bound is
1040  // a min expression and the trip count doesn't divide the unroll factor
1041  // can't be unrolled since the lower bound of the cleanup loop in such cases
1042  // cannot be expressed as an affine function or a max over affine functions.
1043  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1044  forOp.getUpperBoundMap().getNumResults() != 1)
1045  return failure();
1046  if (cleanUpUnroll)
1047  // Force unroll including cleanup loop
1048  return loopUnrollFull(forOp);
1049  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1050  assert(false && "cleanup loop lower bound map for single result lower "
1051  "and upper bound maps can always be determined");
1052  }
1053 
1054  ValueRange iterArgs(forOp.getRegionIterArgs());
1055  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1056 
1057  // Scale the step of loop being unrolled by unroll factor.
1058  int64_t step = forOp.getStepAsInt();
1059  forOp.setStep(step * unrollFactor);
1060  generateUnrolledLoop(
1061  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1062  [&](unsigned i, Value iv, OpBuilder b) {
1063  // iv' = iv + i * step
1064  auto d0 = b.getAffineDimExpr(0);
1065  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1066  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1067  },
1068  /*annotateFn=*/annotateFn,
1069  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1070 
1071  // Promote the loop body up if this has turned into a single iteration loop.
1072  (void)promoteIfSingleIteration(forOp);
1073  return success();
1074 }
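// Usage sketch (hedged; the loop below is hypothetical):
//
//   // before: affine.for %i = 0 to 128 { "use"(%i) }
//   (void)loopUnrollByFactor(forOp, /*unrollFactor=*/4);
//   // after:  affine.for %i = 0 to 128 step 4 {
//   //           "use"(%i); "use"(%i + 1); "use"(%i + 2); "use"(%i + 3)
//   //         }
//
// The %i + k values are materialized as affine.apply ops by the IV remap
// callback passed to generateUnrolledLoop above.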
1075 
1076 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1077  uint64_t unrollJamFactor) {
1078  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1079  if (mayBeConstantTripCount.has_value() &&
1080  *mayBeConstantTripCount < unrollJamFactor)
1081  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1082  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1083 }
1084 
1085 /// Check if all control operands of all loops are defined outside of `forOp`
1086 /// and return false if not.
1087 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1088  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1089  for (auto controlOperand : aForOp.getControlOperands()) {
1090  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1091  return WalkResult::interrupt();
1092  }
1093  return WalkResult::advance();
1094  });
1095  return !walkResult.wasInterrupted();
1096 }
1097 
1098 /// Unrolls and jams this loop by the specified factor.
1099 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1100  uint64_t unrollJamFactor) {
1101  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1102 
1103  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1104  if (unrollJamFactor == 1) {
1105  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1106  return failure();
1107  return success();
1108  }
1109 
1110  // Nothing in the loop body other than the terminator.
1111  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1112  return success();
1113 
1114  // If the trip count is lower than the unroll jam factor, no unroll jam.
1115  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1116  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1117  return failure();
1118  }
1119 
1120  // If any control operand of any inner loop of `forOp` is defined within
1121  // `forOp`, no unroll jam.
1122  if (!areInnerBoundsInvariant(forOp))
1123  return failure();
1124 
1125  // Gather all sub-blocks to jam upon the loop being unrolled.
1126  JamBlockGatherer<AffineForOp> jbg;
1127  jbg.walk(forOp);
1128  auto &subBlocks = jbg.subBlocks;
1129 
1130  // Collect loops with iter_args.
1131  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1132  forOp.walk([&](AffineForOp aForOp) {
1133  if (aForOp.getNumIterOperands() > 0)
1134  loopsWithIterArgs.push_back(aForOp);
1135  });
1136 
1137  // Get supported reductions to be used for creating reduction ops at the end.
1138  SmallVector<LoopReduction> reductions;
1139  if (forOp.getNumIterOperands() > 0)
1140  getSupportedReductions(forOp, reductions);
1141 
1142  // Generate the cleanup loop if trip count isn't a multiple of
1143  // unrollJamFactor.
1144  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1145  // Loops where the lower bound is a max expression or the upper bound is
1146  // a min expression and the trip count doesn't divide the unroll factor
1147  // can't be unrolled since the lower bound of the cleanup loop in such cases
1148  // cannot be expressed as an affine function or a max over affine functions.
1149  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1150  forOp.getUpperBoundMap().getNumResults() != 1)
1151  return failure();
1152  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1153  assert(false && "cleanup loop lower bound map for single result lower "
1154  "and upper bound maps can always be determined");
1155  }
1156 
1157  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1158  // iteration. There are (`unrollJamFactor` - 1) iterations.
1159  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1160 
1161  // For any loop with iter_args, replace it with a new loop that has
1162  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1163  // operands.
1164  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1165  IRRewriter rewriter(forOp.getContext());
1166  for (AffineForOp oldForOp : loopsWithIterArgs) {
1167  SmallVector<Value> dupIterOperands, dupYieldOperands;
1168  ValueRange oldIterOperands = oldForOp.getInits();
1169  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1170  ValueRange oldYieldOperands =
1171  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1172  // Get additional iterOperands, iterArgs, and yield operands. We will
1173  // fix iterOperands and yield operands after cloning of sub-blocks.
1174  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1175  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1176  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1177  }
1178  // Create a new loop with additional iterOperands, iter_args and yield
1179  // operands. This new loop will take the loop body of the original loop.
1180  bool forOpReplaced = oldForOp == forOp;
1181  AffineForOp newForOp =
1182  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1183  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1184  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1185  return dupYieldOperands;
1186  }));
1187  newLoopsWithIterArgs.push_back(newForOp);
1188  // `forOp` has been replaced with a new loop.
1189  if (forOpReplaced)
1190  forOp = newForOp;
1191  // Update `operandMaps` for `newForOp` iterArgs and results.
1192  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1193  unsigned oldNumIterArgs = oldIterArgs.size();
1194  ValueRange newResults = newForOp.getResults();
1195  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1196  assert(oldNumIterArgs == oldNumResults &&
1197  "oldNumIterArgs must be the same as oldNumResults");
1198  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1199  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1200  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1201  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1202  // to those in the `i`th new set.
1203  operandMaps[i - 1].map(newIterArgs[j],
1204  newIterArgs[i * oldNumIterArgs + j]);
1205  operandMaps[i - 1].map(newResults[j],
1206  newResults[i * oldNumResults + j]);
1207  }
1208  }
1209  }
1210 
1211  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1212  int64_t step = forOp.getStepAsInt();
1213  forOp.setStep(step * unrollJamFactor);
1214 
1215  auto forOpIV = forOp.getInductionVar();
1216  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1217  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1218  for (auto &subBlock : subBlocks) {
1219  // Builder to insert unroll-jammed bodies. Insert right at the end of
1220  // sub-block.
1221  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1222 
1223  // If the induction variable is used, create a remapping to the value for
1224  // this unrolled instance.
1225  if (!forOpIV.use_empty()) {
1226  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1227  auto d0 = builder.getAffineDimExpr(0);
1228  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1229  auto ivUnroll =
1230  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1231  operandMaps[i - 1].map(forOpIV, ivUnroll);
1232  }
1233  // Clone the sub-block being unroll-jammed.
1234  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1235  builder.clone(*it, operandMaps[i - 1]);
1236  }
1237  // Fix iterOperands and yield op operands of newly created loops.
1238  for (auto newForOp : newLoopsWithIterArgs) {
1239  unsigned oldNumIterOperands =
1240  newForOp.getNumIterOperands() / unrollJamFactor;
1241  unsigned numControlOperands = newForOp.getNumControlOperands();
1242  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1243  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1244  assert(oldNumIterOperands == oldNumYieldOperands &&
1245  "oldNumIterOperands must be the same as oldNumYieldOperands");
1246  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1247  // The `i`th duplication of an old iterOperand or yield op operand
1248  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1249  // if such mapped value exists.
1250  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1251  operandMaps[i - 1].lookupOrDefault(
1252  newForOp.getOperand(numControlOperands + j)));
1253  yieldOp.setOperand(
1254  i * oldNumYieldOperands + j,
1255  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1256  }
1257  }
1258  }
1259  if (forOp.getNumResults() > 0) {
1260  // Create reduction ops to combine every `unrollJamFactor` related results
1261  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1262  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1263  // %1.
1264  rewriter.setInsertionPointAfter(forOp);
1265  auto loc = forOp.getLoc();
1266  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1267  for (LoopReduction &reduction : reductions) {
1268  unsigned pos = reduction.iterArgPosition;
1269  Value lhs = forOp.getResult(pos);
1270  Value rhs;
1271  SmallPtrSet<Operation *, 4> newOps;
1272  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1273  rhs = forOp.getResult(i * oldNumResults + pos);
1274  // Create ops based on reduction type.
1275  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1276  if (!lhs)
1277  return failure();
1278  Operation *op = lhs.getDefiningOp();
1279  assert(op && "Reduction op should have been created");
1280  newOps.insert(op);
1281  }
1282  // Replace all uses except those in newly created reduction ops.
1283  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1284  }
1285  }
1286 
1287  // Promote the loop body up if this has turned into a single iteration loop.
1288  (void)promoteIfSingleIteration(forOp);
1289  return success();
1290 }
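// Sketch of unroll-and-jam by a factor of 2 on a hypothetical 2-d nest:
//
//   // before:
//   affine.for %i = 0 to 64 {
//     affine.for %j = 0 to 64 {
//       "S"(%i, %j)
//     }
//   }
//
//   // after:
//   affine.for %i = 0 to 64 step 2 {
//     affine.for %j = 0 to 64 {
//       "S"(%i, %j)
//       "S"(%i + 1, %j)   // %i + 1 materialized via affine.apply
//     }
//   }
//
// The outer step is scaled by the factor and each jammed sub-block is cloned
// inside its own enclosing block, rather than the inner loop being duplicated.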
1291 
1292 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1293 /// nested within 'forOpA' as the only non-terminator operation in its block.
1294 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1295  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1296  auto &forOpABody = forOpA.getBody()->getOperations();
1297  auto &forOpBBody = forOpB.getBody()->getOperations();
1298 
1299  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1300  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1301  // body containing only the terminator.
1302  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1303  forOpABody, forOpABody.begin(),
1304  std::prev(forOpABody.end()));
1305  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1306  // body (this leaves forOpB's body containing only the terminator).
1307  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1308  std::prev(forOpBBody.end()));
1309  // 3) Splice forOpA into the beginning of forOpB's body.
1310  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1311  Block::iterator(forOpA));
1312 }
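// Effect sketch on a hypothetical 2-d nest: given
//   affine.for %i ... { affine.for %j ... { "S"(%i, %j) } }
// interchangeLoops(%i-loop, %j-loop) re-splices the same blocks of operations
// so that the nest becomes
//   affine.for %j ... { affine.for %i ... { "S"(%i, %j) } }
// Callers are expected to have checked legality beforehand, e.g. with
// isValidLoopInterchangePermutation below.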
1313 
1314 // Checks each dependence component against the permutation to see if the
1315 // desired loop interchange would violate dependences by making the
1316 // dependence component lexicographically negative.
1317 static bool checkLoopInterchangeDependences(
1318  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1319  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1320  // Invert permutation map.
1321  unsigned maxLoopDepth = loops.size();
1322  SmallVector<unsigned, 4> loopPermMapInv;
1323  loopPermMapInv.resize(maxLoopDepth);
1324  for (unsigned i = 0; i < maxLoopDepth; ++i)
1325  loopPermMapInv[loopPermMap[i]] = i;
1326 
1327  // Check each dependence component against the permutation to see if the
1328  // desired loop interchange permutation would make the dependence vectors
1329  // lexicographically negative.
1330  // Example 1: [-1, 1][0, 0]
1331  // Example 2: [0, 0][-1, 1]
1332  for (const auto &depComps : depCompsVec) {
1333  assert(depComps.size() >= maxLoopDepth);
1334  // Check if the first non-zero dependence component is positive.
1335  // This iterates through loops in the desired order.
1336  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1337  unsigned permIndex = loopPermMapInv[j];
1338  assert(depComps[permIndex].lb);
1339  int64_t depCompLb = *depComps[permIndex].lb;
1340  if (depCompLb > 0)
1341  break;
1342  if (depCompLb < 0)
1343  return false;
1344  }
1345  }
1346  return true;
1347 }
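// Worked example (illustrative only): for a two-loop nest with a single
// dependence vector (1, -1), the interchange permutation {1, 0} (swap the
// loops) reorders it to (-1, 1); its first non-zero component is negative,
// so this returns false. The identity permutation keeps (1, -1), whose first
// non-zero component is positive, so it returns true.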
1348 
1349 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1350 /// nested sequence of loops in 'loops' would violate dependences.
1351 bool mlir::affine::isValidLoopInterchangePermutation(
1352  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1353  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1354  unsigned maxLoopDepth = loops.size();
1355  if (maxLoopDepth == 1)
1356  return true;
1357  // Gather dependence components for dependences between all ops in loop nest
1358  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1359  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1360  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1361  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1362 }
1363 
1364 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1365 /// in it from outermost to innermost.
1366 bool LLVM_ATTRIBUTE_UNUSED
1367 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1368  assert(!loops.empty() && "no loops provided");
1369 
1370  // We already know that the block can't be empty.
1371  auto hasTwoElements = [](Block *block) {
1372  auto secondOpIt = std::next(block->begin());
1373  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1374  };
1375 
1376  auto enclosingLoop = loops.front();
1377  for (auto loop : loops.drop_front()) {
1378  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1379  // parentForOp's body should be just this loop and the terminator.
1380  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1381  return false;
1382  enclosingLoop = loop;
1383  }
1384  return true;
1385 }
1386 
1387 // input[i] should move from position i -> permMap[i]. Returns the position in
1388 // `input` that becomes the new outermost loop.
1389 unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1390  ArrayRef<unsigned> permMap) {
1391  assert(input.size() == permMap.size() && "invalid permutation map size");
1392  // Check whether the permutation spec is valid. This is a small vector - we'll
1393  // just sort and check if it's iota.
1394  SmallVector<unsigned, 4> checkPermMap(permMap);
1395  llvm::sort(checkPermMap);
1396  if (llvm::any_of(llvm::enumerate(checkPermMap),
1397  [](const auto &en) { return en.value() != en.index(); }))
1398  assert(false && "invalid permutation map");
1399 
1400  // Nothing to do.
1401  if (input.size() < 2)
1402  return 0;
1403 
1404  assert(isPerfectlyNested(input) && "input not perfectly nested");
1405 
1406  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1407  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1408  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1409  for (unsigned i = 0, e = input.size(); i < e; ++i)
1410  invPermMap.push_back({permMap[i], i});
1411  llvm::sort(invPermMap);
1412 
1413  // Move the innermost loop body to the loop that would be the innermost in the
1414  // permuted nest (only if the innermost loop is going to change).
1415  if (permMap.back() != input.size() - 1) {
1416  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1417  Block *srcBody = ((AffineForOp)input.back()).getBody();
1418  destBody->getOperations().splice(destBody->begin(),
1419  srcBody->getOperations(), srcBody->begin(),
1420  std::prev(srcBody->end()));
1421  }
1422 
1423  // We'll move each loop in `input` in the reverse order so that its body is
1424  // empty when we are moving it; this incurs zero copies and no erasing.
1425  for (int i = input.size() - 1; i >= 0; --i) {
1426  // If this has to become the outermost loop after permutation, add it to the
1427  // parent block of the original root.
1428  if (permMap[i] == 0) {
1429  // If the root remains the same, nothing to do.
1430  if (i == 0)
1431  continue;
1432  // Make input[i] the new outermost loop moving it into parentBlock.
1433  auto *parentBlock = input[0]->getBlock();
1434  parentBlock->getOperations().splice(Block::iterator(input[0]),
1435  input[i]->getBlock()->getOperations(),
1436  Block::iterator(input[i]));
1437  continue;
1438  }
1439 
1440  // If the parent in the permuted order is the same as in the original,
1441  // nothing to do.
1442  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1443  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1444  continue;
1445 
1446  // Move input[i] to its surrounding loop in the transformed nest.
1447  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1448  destBody->getOperations().splice(destBody->begin(),
1449  input[i]->getBlock()->getOperations(),
1450  Block::iterator(input[i]));
1451  }
1452 
1453  return invPermMap[0].second;
1454 }
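// Example (illustrative only): for input = {%i, %j, %k} listed outermost to
// innermost and permMap = {2, 0, 1}, %i moves to the innermost position,
// %j becomes outermost and %k ends up in the middle, i.e. the permuted nest
// is %j { %k { %i { ... } } }. The return value is 1, the index in `input`
// of the loop (%j) that became the new outermost loop.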
1455 
1456 // Sinks all sequential loops to the innermost levels (while preserving
1457 // relative order among them) and moves all parallel loops to the
1458 // outermost (while again preserving relative order among them).
1459 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1460  SmallVector<AffineForOp, 4> loops;
1461  getPerfectlyNestedLoops(loops, forOp);
1462  if (loops.size() < 2)
1463  return forOp;
1464 
1465  // Gather dependence components for dependences between all ops in loop nest
1466  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1467  unsigned maxLoopDepth = loops.size();
1468  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1469  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1470 
1471  // Mark loops as either parallel or sequential.
1472  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1473  for (auto &depComps : depCompsVec) {
1474  assert(depComps.size() >= maxLoopDepth);
1475  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1476  DependenceComponent &depComp = depComps[j];
1477  assert(depComp.lb.has_value() && depComp.ub.has_value());
1478  if (*depComp.lb != 0 || *depComp.ub != 0)
1479  isParallelLoop[j] = false;
1480  }
1481  }
1482 
1483  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1484 
1485  // Compute permutation of loops that sinks sequential loops (and thus raises
1486  // parallel loops) while preserving relative order.
1487  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1488  unsigned nextSequentialLoop = numParallelLoops;
1489  unsigned nextParallelLoop = 0;
1490  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1491  if (isParallelLoop[i]) {
1492  loopPermMap[i] = nextParallelLoop++;
1493  } else {
1494  loopPermMap[i] = nextSequentialLoop++;
1495  }
1496  }
1497 
1498  // Check if permutation 'loopPermMap' would violate dependences.
1499  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1500  return forOp;
1501  // Perform loop interchange according to permutation 'loopPermMap'.
1502  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1503  return loops[loopNestRootIndex];
1504 }
1505 
1506 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1507 // lower (resp. upper) loop bound. When called for both the lower and upper
1508 // bounds, the resulting IR resembles:
1509 //
1510 // ```mlir
1511 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1512 // ...
1513 // }
1514 // ```
1515 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1516  SmallVector<Value, 4> *operands,
1517  int64_t offset = 0) {
1518  auto bounds = llvm::to_vector<4>(map->getResults());
1519  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1520  operands->insert(operands->begin() + map->getNumDims(), iv);
1521  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1522  b.getContext());
1523  canonicalizeMapAndOperands(map, operands);
1524 }
1525 
1526 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1527 // Stripmine-sink is a primitive building block for generalized tiling of
1528 // imperfectly nested loops.
1529 // This transformation is purely mechanical and does not check legality,
1530 // profitability or even structural correctness. It is the user's
1531 // responsibility to specify `targets` that are dominated by `forOp`.
1532 // Returns the new AffineForOps, one per `targets`, nested immediately under
1533 // each of the `targets`.
1534 static SmallVector<AffineForOp, 8>
1535 stripmineSink(AffineForOp forOp, uint64_t factor,
1536  ArrayRef<AffineForOp> targets) {
1537  auto originalStep = forOp.getStepAsInt();
1538  auto scaledStep = originalStep * factor;
1539  forOp.setStep(scaledStep);
1540 
1541  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1542 
1543  // Lower-bound map creation.
1544  auto lbMap = forOp.getLowerBoundMap();
1545  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1546  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1547 
1548  // Upper-bound map creation.
1549  auto ubMap = forOp.getUpperBoundMap();
1550  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1551  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1552  /*offset=*/scaledStep);
1553 
1554  auto iv = forOp.getInductionVar();
1555  SmallVector<AffineForOp, 8> innerLoops;
1556  for (auto t : targets) {
1557  // Insert newForOp before the terminator of `t`.
1558  auto b = OpBuilder::atBlockTerminator(t.getBody());
1559  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1560  ubOperands, ubMap, originalStep);
1561  auto begin = t.getBody()->begin();
1562  // Skip terminator and `newForOp` which is just before the terminator.
1563  auto nOps = t.getBody()->getOperations().size() - 2;
1564  newForOp.getBody()->getOperations().splice(
1565  newForOp.getBody()->getOperations().begin(),
1566  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1567  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1568  newForOp.getRegion());
1569  innerLoops.push_back(newForOp);
1570  }
1571 
1572  return innerLoops;
1573 }
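// Illustrative sketch (bounds are assumptions, not from this file):
// stripmining the outer loop of
//
//   affine.for %i = 0 to 128 {
//     affine.for %j = 0 to 64 {
//       "use"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// by a factor of 4 with targets = {%j-loop} scales the %i step to 4 and
// creates a point loop under the target (shown schematically):
//
//   affine.for %i = 0 to 128 step 4 {
//     affine.for %j = 0 to 64 {
//       affine.for %ii = max(0, %i) to min(128, %i + 4) {
//         "use"(%ii, %j) : (index, index) -> ()
//       }
//     }
//   }
//
// where the max/min bounds come from augmenting the original bound maps with
// the outer induction variable.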
1574 
1575 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1576 // Returns the new AffineForOp, nested immediately under `target`.
1577 template <typename SizeType>
1578 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1579  AffineForOp target) {
1580  // TODO: Use cheap structural assertions that targets are nested under
1581  // forOp and that targets are not nested under each other when DominanceInfo
1582  // exposes the capability. It seems overkill to construct a whole function
1583  // dominance tree at this point.
1584  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1585  assert(res.size() == 1 && "Expected 1 inner forOp");
1586  return res[0];
1587 }
1588 
1589 SmallVector<SmallVector<AffineForOp, 8>, 8>
1590 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1591  ArrayRef<AffineForOp> targets) {
1592  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1593  SmallVector<AffineForOp, 8> currentTargets(targets);
1594  for (auto it : llvm::zip(forOps, sizes)) {
1595  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1596  res.push_back(step);
1597  currentTargets = step;
1598  }
1599  return res;
1600 }
1601 
1602 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1603  ArrayRef<uint64_t> sizes,
1604  AffineForOp target) {
1605  SmallVector<AffineForOp, 8> res;
1606  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1607  res.push_back(llvm::getSingleElement(loops));
1608  return res;
1609 }
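// Usage sketch (hypothetical loops, not from this file): for a perfect 2-d
// nest with outer loop `i` and inner loop `j`, tiling both by 32 and sinking
// the point loops under `j`:
//
//   SmallVector<AffineForOp, 8> pointLoops =
//       tile(/*forOps=*/{i, j}, /*sizes=*/{32, 32}, /*target=*/j);
//
// returns the two point loops, the first nested immediately under `j` and
// the second under the first, while `i` and `j` become the tile-space loops
// with their steps scaled by 32.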
1610 
1611 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1612  if (loops.size() < 2)
1613  return success();
1614 
1615  AffineForOp innermost = loops.back();
1616  AffineForOp outermost = loops.front();
1617  AffineBound ub = outermost.getUpperBound();
1618  AffineMap origUbMap = ub.getMap();
1619  Location loc = outermost.getLoc();
1620  OpBuilder builder(outermost);
1621  for (AffineForOp loop : loops) {
1622  // We only work on normalized loops.
1623  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1624  loop.getConstantLowerBound() != 0)
1625  return failure();
1626  }
1627  SmallVector<Value, 4> upperBoundSymbols;
1628  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1629  ub.getOperands().end());
1630 
1631  // 1. Store the upper bound of the outermost loop in a variable.
1632  Value prev;
1633  if (!llvm::hasSingleElement(origUbMap.getResults()))
1634  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1635  else
1636  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1637  upperBoundSymbols.push_back(prev);
1638 
1639  // 2. Emit code computing the upper bound of the coalesced loop as the product of
1640  // the number of iterations of all loops.
1641  for (AffineForOp loop : loops.drop_front()) {
1642  ub = loop.getUpperBound();
1643  origUbMap = ub.getMap();
1644  ubOperands = ub.getOperands();
1645  Value upperBound;
1646  // If upper bound map has more than one result, take their minimum.
1647  if (!llvm::hasSingleElement(origUbMap.getResults()))
1648  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1649  else
1650  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1651  upperBoundSymbols.push_back(upperBound);
1652  SmallVector<Value, 4> operands;
1653  operands.push_back(prev);
1654  operands.push_back(upperBound);
1655  // Maintain running product of loop upper bounds.
1656  prev = builder.create<AffineApplyOp>(
1657  loc,
1658  AffineMap::get(/*dimCount=*/1,
1659  /*symbolCount=*/1,
1660  builder.getAffineDimExpr(0) *
1661  builder.getAffineSymbolExpr(0)),
1662  operands);
1663  }
1664  // Set upper bound of the coalesced loop.
1665  AffineMap newUbMap = AffineMap::get(
1666  /*dimCount=*/0,
1667  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1668  outermost.setUpperBound(prev, newUbMap);
1669 
1670  builder.setInsertionPointToStart(outermost.getBody());
1671 
1672  // 3. Remap induction variables. For each original loop, the value of the
1673  // induction variable can be obtained by dividing the induction variable of
1674  // the linearized loop by the total number of iterations of the loops nested
1675  // in it modulo the number of iterations in this loop (remove the values
1676  // related to the outer loops):
1677  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1678  // Compute these iteratively from the innermost loop by creating a "running
1679  // quotient" of division by the range.
1680  Value previous = outermost.getInductionVar();
1681  for (unsigned idx = loops.size(); idx > 0; --idx) {
1682  if (idx != loops.size()) {
1683  SmallVector<Value, 4> operands;
1684  operands.push_back(previous);
1685  operands.push_back(upperBoundSymbols[idx]);
1686  previous = builder.create<AffineApplyOp>(
1687  loc,
1688  AffineMap::get(
1689  /*dimCount=*/1, /*symbolCount=*/1,
1690  builder.getAffineDimExpr(0).floorDiv(
1691  builder.getAffineSymbolExpr(0))),
1692  operands);
1693  }
1694  // Modified value of the induction variables of the nested loops after
1695  // coalescing.
1696  Value inductionVariable;
1697  if (idx == 1) {
1698  inductionVariable = previous;
1699  } else {
1700  SmallVector<Value, 4> applyOperands;
1701  applyOperands.push_back(previous);
1702  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1703  inductionVariable = builder.create<AffineApplyOp>(
1704  loc,
1705  AffineMap::get(
1706  /*dimCount=*/1, /*symbolCount=*/1,
1707  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1708  applyOperands);
1709  }
1710  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1711  inductionVariable, loops.back().getRegion());
1712  }
1713 
1714  // 4. Move the operations from the innermost just above the second-outermost
1715  // loop, delete the extra terminator and the second-outermost loop.
1716  AffineForOp secondOutermostLoop = loops[1];
1717  innermost.getBody()->back().erase();
1718  outermost.getBody()->getOperations().splice(
1719  Block::iterator(secondOutermostLoop.getOperation()),
1720  innermost.getBody()->getOperations());
1721  secondOutermostLoop.erase();
1722  return success();
1723 }
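// Schematic result (illustrative only) of coalescing a normalized 2-d nest
// with upper bounds %N (outer) and %M (inner):
//
//   %ub = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%N)[%M]
//   affine.for %iv = 0 to %ub {
//     %j = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%M]
//     %i = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%M]
//     // original body, with the old IVs replaced by %i and %j
//   }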
1724 
1725 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1726  ArrayRef<Value> processorId,
1727  ArrayRef<Value> numProcessors) {
1728  assert(processorId.size() == numProcessors.size());
1729  if (processorId.empty())
1730  return;
1731 
1732  OpBuilder b(forOp);
1733  Location loc(forOp.getLoc());
1734  AffineExpr lhs, rhs;
1735  bindSymbols(forOp.getContext(), lhs, rhs);
1736  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1737  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1738 
1739  Value linearIndex = processorId.front();
1740  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1741  auto mulApplyOp = b.create<AffineApplyOp>(
1742  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1743  linearIndex = b.create<AffineApplyOp>(
1744  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1745  }
1746 
1747  auto mulApplyOp = b.create<AffineApplyOp>(
1748  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1749  Value lb = b.create<AffineApplyOp>(
1750  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1751  forOp.setLowerBound(lb);
1752 
1753  Value step = forOp.getStep();
1754  for (auto numProcs : numProcessors)
1755  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1756  forOp.setStep(step);
1757 }
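// Example (illustrative; the GPU-style names are placeholders): with
// processorId = {%blockIdx, %threadIdx} and numProcessors = {%gridDim,
// %blockDim}, a loop `scf.for %i = %lb to %ub step %s` is rewritten via
// affine.apply ops so that
//
//   %linear   = %blockIdx * %blockDim + %threadIdx
//   new lb    = %lb + %linear * %s
//   new step  = %s * %gridDim * %blockDim
//
// i.e. the iterations are distributed cyclically over all processors.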
1758 
1759 /// Given a memref region, determine the lowest depth at which transfers can be
1760 /// placed for it, and return the corresponding block, start and end positions
1761 /// in the block for placing incoming (read) and outgoing (write) copies
1762 /// respectively. The lowest depth depends on whether the region being accessed
1763 /// is hoistable with respect to one or more immediately surrounding loops.
1764 static void
1765 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1766  Block::iterator &begin, Block::iterator &end,
1767  Block **copyPlacementBlock,
1768  Block::iterator *copyInPlacementStart,
1769  Block::iterator *copyOutPlacementStart) {
1770  const auto *cst = region.getConstraints();
1771  SmallVector<Value, 4> symbols;
1772  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1773 
1774  SmallVector<Operation *, 4> enclosingAffineOps;
1775  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1776  // Walk up loop parents till we find an IV on which this region is
1777  // symbolic/variant or we hit `hoistGuard`.
1778  auto it = enclosingAffineOps.rbegin();
1779  AffineForOp lastInvariantFor;
1780  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1781  Operation *enclosingOp = *it;
1782  // We can't hoist past the definition of the memref being copied.
1783  Value memref = region.memref;
1784  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1785  LLVM_DEBUG(
1786  llvm::dbgs()
1787  << "memref definition will end up not dominating hoist location\n");
1788  break;
1789  }
1790 
1791  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1792  if (!affineFor)
1793  break;
1794  // TODO: also need to be checking this for region symbols that
1795  // aren't loop IVs, i.e., whether we are within their respective defs' dominance scope.
1796  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1797  break;
1798  lastInvariantFor = affineFor;
1799  }
1800 
1801  if (it != enclosingAffineOps.rbegin()) {
1802  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1803  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1804  *copyPlacementBlock = lastInvariantFor->getBlock();
1805  } else {
1806  *copyInPlacementStart = begin;
1807  *copyOutPlacementStart = end;
1808  *copyPlacementBlock = &block;
1809  }
1810 }
1811 
1812 // Info comprising stride and number of elements transferred every stride.
1813 struct StrideInfo {
1814  int64_t stride;
1815  int64_t numEltPerStride;
1816 };
1817 
1818 /// Returns striding information for a copy/transfer of this region with
1819 /// potentially multiple striding levels from outermost to innermost. For an
1820 /// n-dimensional region, there can be at most n-1 levels of striding
1821 /// successively nested.
1822 // TODO: make this work with non-identity layout maps.
1823 static void getMultiLevelStrides(const MemRefRegion &region,
1824  ArrayRef<int64_t> bufferShape,
1825  SmallVectorImpl<StrideInfo> *strideInfos) {
1826  if (bufferShape.size() <= 1)
1827  return;
1828 
1829  int64_t numEltPerStride = 1;
1830  int64_t stride = 1;
1831  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1832  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1833  stride *= dimSize;
1834  numEltPerStride *= bufferShape[d];
1835  // A stride is needed only if the region has a shorter extent than the
1836  // memref along the dimension *and* has an extent greater than one along the
1837  // next major dimension.
1838  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1839  strideInfos->push_back({stride, numEltPerStride});
1840  }
1841  }
1842 }
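// Worked example (illustrative only): copying a 64x32 region out of a
// memref<256x128xf32> gives, for the innermost dimension, stride = 128 (the
// memref's minor dimension size) and numEltPerStride = 32; since 32 < 128
// and the outer extent 64 > 1, a single StrideInfo {128, 32} is recorded:
// 64 bursts of 32 contiguous elements, each burst starting 128 elements
// apart.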
1843 
1844 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1845 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1846 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1847 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1848 /// contain the expressions to be subtracted out from the respective copy loop
1849  /// iterators in order to index the fast buffer. If `isCopyOut` is true, generates
1850  /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point where the
1851 /// copy nest is inserted.
1852 //
1853 /// The copy-in nest is generated as follows as an example for a 2-d region:
1854 /// for x = ...
1855 /// for y = ...
1856 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1857 ///
1858 static AffineForOp
1859 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1860  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1861  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1862  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1863  OpBuilder b) {
1864  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1865  return lbMap.getNumInputs() == lbOperands.size();
1866  }));
1867  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1868  return ubMap.getNumInputs() == ubOperands.size();
1869  }));
1870 
1871  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1872  // A copy nest can't be generated for 0-ranked memrefs.
1873  assert(rank != 0 && "non-zero rank memref expected");
1874  assert(lbMaps.size() == rank && "wrong number of lb maps");
1875  assert(ubMaps.size() == rank && "wrong number of ub maps");
1876 
1877  SmallVector<Value, 4> memIndices;
1878  SmallVector<AffineExpr, 4> fastBufExprs;
1879  SmallVector<Value, 4> fastBufMapOperands;
1880  AffineForOp copyNestRoot;
1881  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1882  for (unsigned d = 0; d < rank; ++d) {
1883  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1884  ubOperands, ubMaps[d]);
1885  if (d == 0)
1886  copyNestRoot = forOp;
1887 
1888  b = OpBuilder::atBlockTerminator(forOp.getBody());
1889 
1890  auto fastBufOffsetMap =
1891  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1892  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1893 
1894  // Construct the subscript for the fast memref being copied into/from:
1895  // x - offset_x.
1896  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1897  b.getAffineDimExpr(2 * d));
1898  fastBufMapOperands.push_back(offset);
1899  fastBufMapOperands.push_back(forOp.getInductionVar());
1900  mayBeDeadApplys.push_back(offset);
1901 
1902  // Subscript for the slow memref being copied.
1903  memIndices.push_back(forOp.getInductionVar());
1904  }
1905 
1906  auto fastBufMap =
1907  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1908  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1909  fastBufMap = simplifyAffineMap(fastBufMap);
1910  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1911 
1912  // Drop any dead affine.applys.
1913  for (auto applyOp : mayBeDeadApplys)
1914  if (applyOp.use_empty())
1915  applyOp.erase();
1916 
1917  if (!isCopyOut) {
1918  // Copy in.
1919  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1920  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1921  fastBufMapOperands);
1922  return copyNestRoot;
1923  }
1924 
1925  // Copy out.
1926  auto load =
1927  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1928  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1929  return copyNestRoot;
1930 }
1931 
1932 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1933 emitRemarkForBlock(Block &block) {
1934  return block.getParentOp()->emitRemark();
1935 }
1936 
1937 /// Creates a buffer in the faster memory space for the specified memref region
1938 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1939 /// space to this one, and replaces all loads/stores in the block range
1940 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1941 /// if copies could not be generated due to yet unimplemented cases.
1942 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1943 /// specify the insertion points where the incoming copies and outgoing copies,
1944 /// respectively, should be inserted (the insertion happens right before the
1945 /// insertion point). Since `begin` can itself be invalidated due to the memref
1946 /// rewriting done from this method, the output argument `nBegin` is set to its
1947 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1948 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1949 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1950 /// allocated.
1951 static LogicalResult generateCopy(
1952  const MemRefRegion &region, Block *block, Block::iterator begin,
1953  Block::iterator end, Block *copyPlacementBlock,
1954  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1955  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1956  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1957  Block::iterator *nBegin, Block::iterator *nEnd) {
1958  *nBegin = begin;
1959  *nEnd = end;
1960 
1961  auto f = begin->getParentOfType<FunctionOpInterface>();
1962  OpBuilder topBuilder(f.getFunctionBody());
1963  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1964 
1965  *sizeInBytes = 0;
1966 
1967  if (begin == end)
1968  return success();
1969 
1970  // Record the last op in the block for which we are performing copy
1971  // generation. We later do the memref replacement only in [begin, lastCopyOp]
1972  // so that the original memrefs used in the data movement code themselves
1973  // don't get replaced.
1974  Operation *lastCopyOp = end->getPrevNode();
1975 
1976  // Whether the copy-out point is at the end of the block where we are doing
1977  // explicit copying.
1978  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1979 
1980  // Copies for read regions are going to be inserted at 'begin'.
1981  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1982  // Copies for write regions are going to be inserted at 'end'.
1983  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1984  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1985 
1986  // Builder to create constants at the top level.
1987  auto func =
1988  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1989  OpBuilder top(func.getFunctionBody());
1990 
1991  auto loc = region.loc;
1992  auto memref = region.memref;
1993  auto memRefType = cast<MemRefType>(memref.getType());
1994 
1995  if (!memRefType.getLayout().isIdentity()) {
1996  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1997  return failure();
1998  }
1999 
2000  // Indices to use for the copying.
2001  // Indices for the original memref being copied from/to.
2002  SmallVector<Value, 4> memIndices;
2003  // Indices for the faster buffer being copied into/from.
2004  SmallVector<Value, 4> bufIndices;
2005 
2006  unsigned rank = memRefType.getRank();
2007  if (rank == 0) {
2008  LLVM_DEBUG(llvm::dbgs() << "Non-zero ranked memrefs supported\n");
2009  return failure();
2010  }
2011 
2012  SmallVector<int64_t, 4> fastBufferShape;
2013 
2014  // Compute the extents of the buffer.
2015  SmallVector<AffineMap, 4> lbs;
2016  lbs.reserve(rank);
2017  std::optional<int64_t> numElements =
2018  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2019  if (!numElements) {
2020  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2021  return failure();
2022  }
2023 
2024  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2025  // This can be supported in the future if needed.
2026  LLVM_DEBUG(llvm::dbgs()
2027  << "Max lower bound for memref region start not supported\n");
2028  return failure();
2029  }
2030 
2031  if (*numElements == 0) {
2032  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2033  return success();
2034  }
2035 
2036  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2037  for (unsigned i = 0; i < rank; ++i) {
2038  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2039  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2040  LLVM_DEBUG(llvm::dbgs()
2041  << "Missing lower or upper bound for region along dimension: "
2042  << i << '\n');
2043  return failure();
2044  }
2045  }
2046 
2047  const FlatAffineValueConstraints *cst = region.getConstraints();
2048  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2049  // on; these typically include loop IVs surrounding the level at which the
2050  // copy generation is being done or other valid symbols in MLIR.
2051  SmallVector<Value, 8> regionSymbols;
2052  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2053 
2054  // Construct the access expression for the fast memory buffer. The access
2055  // expression for a particular dimension of the fast buffer is obtained by
2056  // subtracting out the lower bound on the original memref's data region
2057  // along the corresponding dimension.
2058 
2059  // Index start offsets for faster memory buffer relative to the original.
2060  SmallVector<AffineExpr, 4> fastBufOffsets;
2061  fastBufOffsets.reserve(rank);
2062  for (unsigned d = 0; d < rank; d++) {
2063  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2064  "incorrect bound size");
2065 
2066  // Set copy start location for this dimension in the lower memory space
2067  // memref.
2068  if (lbs[d].isSingleConstant()) {
2069  auto indexVal = lbs[d].getSingleConstantResult();
2070  if (indexVal == 0) {
2071  memIndices.push_back(zeroIndex);
2072  } else {
2073  memIndices.push_back(
2074  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2075  }
2076  } else {
2077  // The coordinate for the start location is just the lower bound along the
2078  // corresponding dimension on the memory region (stored in 'offset').
2079  // Remap all inputs of the map to dimensions uniformly since in the
2080  // generated IR we need valid affine symbols as opposed to "symbols" for
2081  // the purpose of the memref region.
2082  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2083  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2084  symReplacements[i] = top.getAffineDimExpr(i);
2085  lbs[d] = lbs[d].replaceDimsAndSymbols(
2086  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2087  /*numResultSyms=*/0);
2088  memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
2089  }
2090  // The fast buffer is copied into at location zero; addressing is relative.
2091  bufIndices.push_back(zeroIndex);
2092 
2093  // Record the offsets since they are needed to remap the memory accesses of
2094  // the original memref further below.
2095  fastBufOffsets.push_back(lbs[d].getResult(0));
2096  }
2097 
2098  // The faster memory space buffer.
2099  Value fastMemRef;
2100 
2101  // Check if a buffer was already created.
2102  bool existingBuf = fastBufferMap.count(memref) > 0;
2103  if (!existingBuf) {
2104  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2105  auto fastMemRefType =
2106  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2107  fastBufferLayout, copyOptions.fastMemorySpace);
2108 
2109  // Create the fast memory space buffer just before the 'affine.for'
2110  // operation.
2111  fastMemRef =
2112  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2113  // Record it.
2114  fastBufferMap[memref] = fastMemRef;
2115  // fastMemRefType is a constant shaped memref.
2116  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2117  // We don't account for things of unknown size.
2118  *sizeInBytes = maySizeInBytes.value_or(0);
2119 
2120  LLVM_DEBUG(emitRemarkForBlock(*block)
2121  << "Creating fast buffer of type " << fastMemRefType
2122  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2123  << " KiB\n");
2124  } else {
2125  // Reuse the one already created.
2126  fastMemRef = fastBufferMap[memref];
2127  }
2128 
2129  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2130 
2131  Value dmaStride;
2132  Value numEltPerDmaStride;
2133  if (copyOptions.generateDma) {
2134  SmallVector<StrideInfo, 4> dmaStrideInfos;
2135  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2136 
2137  // TODO: use all stride levels once DmaStartOp is extended for
2138  // multi-level strides.
2139  if (dmaStrideInfos.size() > 1) {
2140  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2141  return failure();
2142  }
2143 
2144  if (!dmaStrideInfos.empty()) {
2145  dmaStride =
2146  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2147  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2148  loc, dmaStrideInfos[0].numEltPerStride);
2149  }
2150  }
2151 
2152  // Create fully composed affine maps for each memref.
2153  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2154  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2155  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2156  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2157 
2158  if (!copyOptions.generateDma) {
2159  // Point-wise copy generation.
2160  auto copyNest =
2161  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2162  /*lbOperands=*/regionSymbols, ubMaps,
2163  /*ubOperands=*/regionSymbols, fastBufOffsets,
2164  /*isCopyOut=*/region.isWrite(), b);
2165 
2166  // Record this so that we can skip it from yet another copy.
2167  copyNests.insert(copyNest);
2168 
2169  // Since new ops are being appended (for copy out's), adjust the end to
2170  // mark end of block range being processed if necessary.
2171  if (region.isWrite() && isCopyOutAtEndOfBlock)
2172  *nEnd = Block::iterator(copyNest.getOperation());
2173  } else {
2174  // DMA generation.
2175  // Create a tag (single element 1-d memref) for the DMA.
2176  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2177  copyOptions.tagMemorySpace);
2178  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2179 
2180  SmallVector<Value, 4> tagIndices({zeroIndex});
2181  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2182  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2183  if (!region.isWrite()) {
2184  // DMA non-blocking read from original buffer to fast buffer.
2185  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2186  fastMemRef, bufAffineMap, bufIndices,
2187  tagMemRef, tagAffineMap, tagIndices,
2188  numElementsSSA, dmaStride, numEltPerDmaStride);
2189  } else {
2190  // DMA non-blocking write from fast buffer to the original memref.
2191  auto op = b.create<AffineDmaStartOp>(
2192  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2193  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2194  dmaStride, numEltPerDmaStride);
2195  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2196  // end to mark end of block range being processed.
2197  if (isCopyOutAtEndOfBlock)
2198  *nEnd = Block::iterator(op.getOperation());
2199  }
2200 
2201  // Matching DMA wait to block on completion; tag always has a 0 index.
2202  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2203  numElementsSSA);
2204 
2205  // Generate dealloc for the tag.
2206  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2207  if (*nEnd == end && isCopyOutAtEndOfBlock)
2208  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2209  // mark end of range of the original.
2210  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2211  }
2212 
2213  // Generate dealloc for the buffer.
2214  if (!existingBuf) {
2215  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2216  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2217  // the fast buffer (since it marks the new end insertion point).
2218  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2219  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2220  }
2221 
2222  // Replace all uses of the old memref with the faster one while remapping
2223  // access indices (subtracting out lower bound offsets for each dimension).
2224  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2225  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2226  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2227  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2228  // d2, d3 correspond to the original indices (%i, %j).
2229  SmallVector<AffineExpr, 4> remapExprs;
2230  remapExprs.reserve(rank);
2231  for (unsigned i = 0; i < rank; i++) {
2232  // The starting operands of indexRemap will be regionSymbols (the symbols on
2233  // which the memref region is parametric); then those corresponding to
2234  // the memref's original indices follow.
2235  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2236  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2237  }
2238  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2239  b.getContext());
2240 
2241  // Record the begin since it may be invalidated by memref replacement.
2242  Block::iterator prevOfBegin;
2243  bool isBeginAtStartOfBlock = (begin == block->begin());
2244  if (!isBeginAtStartOfBlock)
2245  prevOfBegin = std::prev(begin);
2246 
2247  auto userFilterFn = [&](Operation *user) {
2248  auto *ancestorUser = block->findAncestorOpInBlock(*user);
2249  return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2250  !lastCopyOp->isBeforeInBlock(ancestorUser);
2251  };
2252 
2253  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2254  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2255  /*extraIndices=*/{}, indexRemap,
2256  /*extraOperands=*/regionSymbols,
2257  /*symbolOperands=*/{}, userFilterFn);
2258 
2259  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2260 
2261  return success();
2262 }
2263 
2264 /// Construct the memref region to just include the entire memref. Returns false
2265 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2266 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2267 /// is parametric on.
2268 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2269  MemRefRegion *region) {
2270  unsigned rank;
2271  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2272  rank = loadOp.getMemRefType().getRank();
2273  region->memref = loadOp.getMemRef();
2274  region->setWrite(false);
2275  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2276  rank = storeOp.getMemRefType().getRank();
2277  region->memref = storeOp.getMemRef();
2278  region->setWrite(true);
2279  } else {
2280  assert(false && "expected load or store op");
2281  return false;
2282  }
2283  auto memRefType = cast<MemRefType>(region->memref.getType());
2284  if (!memRefType.hasStaticShape())
2285  return false;
2286 
2287  auto *regionCst = region->getConstraints();
2288 
2289  // Just get the first numSymbols IVs, which the memref region is parametric
2290  // on.
2291  SmallVector<AffineForOp, 4> ivs;
2292  getAffineForIVs(*op, &ivs);
2293  ivs.resize(numParamLoopIVs);
2294  SmallVector<Value, 4> symbols;
2295  extractForInductionVars(ivs, &symbols);
2296  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2297  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2298 
2299  // Memref dim sizes provide the bounds.
2300  for (unsigned d = 0; d < rank; d++) {
2301  auto dimSize = memRefType.getDimSize(d);
2302  assert(dimSize > 0 && "filtered dynamic shapes above");
2303  regionCst->addBound(BoundType::LB, d, 0);
2304  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2305  }
2306  return true;
2307 }
2308 
2309 LogicalResult
2310 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2311  const AffineCopyOptions &copyOptions,
2312  std::optional<Value> filterMemRef,
2313  DenseSet<Operation *> &copyNests) {
2314  if (begin == end)
2315  return success();
2316 
2317  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2318  "Inconsistent block begin/end args");
2319  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2320 
2321  Block *block = begin->getBlock();
2322 
2323  // Copies will be generated for this depth, i.e., symbolic in all loops
2324  // surrounding this block range.
2325  unsigned copyDepth = getNestingDepth(&*begin);
2326 
2327  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2328  << "\n");
2329  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2330  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2331 
2332  // List of memory regions to copy for. We need a map vector to have a
2333  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2334  // since the alloc's for example are identical except for the SSA id.
2335  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2336  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2337 
2338  // Map from original memref's to the fast buffers that their accesses are
2339  // replaced with.
2340  DenseMap<Value, Value> fastBufferMap;
2341 
2342  // To check for errors when walking the block.
2343  bool error = false;
2344 
2345  // Walk this range of operations to gather all memory regions.
2346  block->walk(begin, end, [&](Operation *opInst) {
2347  Value memref;
2348  MemRefType memrefType;
2349  // Gather regions to allocate to buffers in faster memory space.
2350  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2351  memref = loadOp.getMemRef();
2352  memrefType = loadOp.getMemRefType();
2353  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2354  memref = storeOp.getMemRef();
2355  memrefType = storeOp.getMemRefType();
2356  }
2357  // Not an affine.load/store op.
2358  if (!memref)
2359  return;
2360 
2361  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2362  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2363  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2364  return;
2365 
2366  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2367  LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
2368  "which copy-in/copy-out would happen\n");
2369  return;
2370  }
2371 
2372  // Compute the MemRefRegion accessed.
2373  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2374  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2375  /*addMemRefDimBounds=*/false))) {
2376  LLVM_DEBUG(llvm::dbgs()
2377  << "Error obtaining memory region: semi-affine maps?\n");
2378  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2379  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2380  LLVM_DEBUG(
2381  opInst->emitError("non-constant memref sizes not yet supported"));
2382  error = true;
2383  return;
2384  }
2385  }
2386 
2387  // Each memref has a single buffer associated with it irrespective of how
2388  // many load's and store's happen on it.
2389  // TODO: in the future, when regions don't intersect and satisfy
2390  // other properties (based on load/store regions), we could consider
2391  // multiple buffers per memref.
2392 
2393  // Add to the appropriate region if it's not already in it, or take a
2394  // bounding box union with the existing one if it's already in there.
2395  // Note that a memref may have both read and write regions - so update the
2396  // region in the other list if one exists (write in case of read and vice
2397  // versa) since there is a single bounding box for a memref across all reads
2398  // and writes that happen on it.
2399 
2400  // Attempts to update; returns true if 'region' exists in targetRegions.
2401  auto updateRegion =
2402  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2403  &targetRegions) {
2404  const auto *const it = targetRegions.find(region->memref);
2405  if (it == targetRegions.end())
2406  return false;
2407 
2408  // Perform a union with the existing region.
2409  if (failed(it->second->unionBoundingBox(*region))) {
2410  LLVM_DEBUG(llvm::dbgs()
2411  << "Memory region bounding box failed; "
2412  "over-approximating to the entire memref\n");
2413  // If the union fails, we will overapproximate.
2414  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2415  LLVM_DEBUG(opInst->emitError(
2416  "non-constant memref sizes not yet supported"));
2417  error = true;
2418  return true;
2419  }
2420  it->second->getConstraints()->clearAndCopyFrom(
2421  *region->getConstraints());
2422  } else {
2423  // Union was computed and stored in 'it->second': copy to 'region'.
2424  region->getConstraints()->clearAndCopyFrom(
2425  *it->second->getConstraints());
2426  }
2427  return true;
2428  };
2429 
2430  bool existsInRead = updateRegion(readRegions);
2431  if (error)
2432  return;
2433  bool existsInWrite = updateRegion(writeRegions);
2434  if (error)
2435  return;
2436 
2437  // Finally add it to the region list.
2438  if (region->isWrite() && !existsInWrite) {
2439  writeRegions[region->memref] = std::move(region);
2440  } else if (!region->isWrite() && !existsInRead) {
2441  readRegions[region->memref] = std::move(region);
2442  }
2443  });
2444 
2445  if (error) {
2446  LLVM_DEBUG(begin->emitError(
2447  "copy generation failed for one or more memref's in this block\n"));
2448  return failure();
2449  }
2450 
2451  uint64_t totalCopyBuffersSizeInBytes = 0;
2452  bool ret = true;
2453  auto processRegions =
2454  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2455  &regions) {
2456  for (const auto &regionEntry : regions) {
2457  // For each region, hoist copy in/out past all hoistable
2458  // 'affine.for's.
2459  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2460  Block *copyPlacementBlock;
2461  findHighestBlockForPlacement(
2462  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2463  &copyInPlacementStart, &copyOutPlacementStart);
2464 
2465  uint64_t sizeInBytes;
2466  Block::iterator nBegin, nEnd;
2467  LogicalResult iRet = generateCopy(
2468  *regionEntry.second, block, begin, end, copyPlacementBlock,
2469  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2470  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2471  if (succeeded(iRet)) {
2472  // begin/end could have been invalidated, and need update.
2473  begin = nBegin;
2474  end = nEnd;
2475  totalCopyBuffersSizeInBytes += sizeInBytes;
2476  }
2477  ret = ret & succeeded(iRet);
2478  }
2479  };
2480  processRegions(readRegions);
2481  processRegions(writeRegions);
2482 
2483  if (!ret) {
2484  LLVM_DEBUG(begin->emitError(
2485  "copy generation failed for one or more memref's in this block\n"));
2486  return failure();
2487  }
2488 
2489  // For a range of operations, a note will be emitted at the caller.
2490  AffineForOp forOp;
2491  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2492  LLVM_DEBUG(forOp.emitRemark()
2493  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2494  << " KiB of copy buffers in fast memory space for this block");
2495  }
2496 
2497  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2498  block->getParentOp()->emitWarning(
2499  "total size of all copy buffers' for this block exceeds fast memory "
2500  "capacity");
2501  }
2502 
2503  return success();
2504 }
2505 
2506 // A convenience version of affineDataCopyGenerate for all ops in the body of
2507 // an AffineForOp.
2508 LogicalResult mlir::affine::affineDataCopyGenerate(
2509  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2510  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2511  return affineDataCopyGenerate(forOp.getBody()->begin(),
2512  std::prev(forOp.getBody()->end()), copyOptions,
2513  filterMemRef, copyNests);
2514 }
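// Usage sketch (the option values are assumptions, not from this file):
// generate point-wise copies (no DMAs) into memory space 2 for all affine
// loads/stores in the body of `forOp`, with a 32 KiB fast-memory budget:
//
//   AffineCopyOptions options;
//   options.generateDma = false;
//   options.slowMemorySpace = 0;
//   options.fastMemorySpace = 2;
//   options.tagMemorySpace = 0;
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);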
2515 
2516 LogicalResult mlir::affine::generateCopyForMemRegion(
2517  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2518  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2519  Block *block = analyzedOp->getBlock();
2520  auto begin = analyzedOp->getIterator();
2521  auto end = std::next(begin);
2522  DenseMap<Value, Value> fastBufferMap;
2523  DenseSet<Operation *> copyNests;
2524 
2525  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2526  copyOptions, fastBufferMap, copyNests,
2527  &result.sizeInBytes, &begin, &end);
2528  if (failed(err))
2529  return err;
2530 
2531  const auto &en = fastBufferMap.find(memrefRegion.memref);
2532  // In some cases (empty loops), no copy generation would have happened.
2533  if (en == fastBufferMap.end())
2534  return failure();
2535  result.alloc = en->second.getDefiningOp();
2536  assert(result.alloc && "fast buffer expected to be locally allocated");
2537  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2538  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2539  return success();
2540 }
2541 
2542 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2543 static void
2544 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2545  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2546  // Add a new empty level to the output if it doesn't already exist.
2547  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2548  if (currLoopDepth == depthToLoops.size())
2549  depthToLoops.emplace_back();
2550 
2551  for (auto &op : *block) {
2552  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2553  depthToLoops[currLoopDepth].push_back(forOp);
2554  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2555  }
2556  }
2557 }
2558 
2559 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2560 void mlir::affine::gatherLoops(
2561  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2562  for (auto &block : func)
2563  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2564 
2565  // Remove last loop level from output since it's empty.
2566  if (!depthToLoops.empty()) {
2567  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2568  depthToLoops.pop_back();
2569  }
2570 }
2571 
2572 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2573  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2574  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2575  SmallVector<Value, 4> lowerOperands(lbOperands);
2576  SmallVector<Value, 4> upperOperands(ubOperands);
2577 
2578  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2579  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2580  lbMap = removeDuplicateExprs(lbMap);
2581  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2582  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2583  ubMap = removeDuplicateExprs(ubMap);
2584 
2585  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2586  step);
2587 }
2588 
2589 /// Creates an AffineIfOp that encodes the conditional to choose between
2590 /// the constant trip count version and an unknown trip count version of this
2591 /// nest of loops. This is used to separate partial and full tiles if `loops`
2592 /// has the intra-tile loops. The affine.if op is inserted at the builder
2593 /// insertion point of `b`.
2594 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2595  OpBuilder b) {
2596  if (loops.empty())
2597  return nullptr;
2598 
2599  auto *context = loops[0].getContext();
2600 
2601  FlatAffineValueConstraints cst;
2602  SmallVector<Operation *, 8> ops;
2603  llvm::append_range(ops, loops);
2604  (void)getIndexSet(ops, &cst);
2605 
2606  // Remove constraints that are independent of these loop IVs.
2607  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2608 
2609  // Construct the constraint set representing the guard for full tiles. The
2610  // lower bound (and upper bound) corresponding to the full tile should be
2611  // larger (and resp. smaller) than any other lower (or upper bound).
2612  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2613  for (auto loop : loops) {
2614  (void)loop;
2615  // TODO: Non-unit stride is not an issue to generalize to.
2616  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2617  // Mark everything symbols for the purpose of finding a constant diff pair.
2618  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2619  1);
2620  unsigned fullTileLbPos, fullTileUbPos;
2621  if (!((IntegerRelation)cst)
2622  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2623  /*boundFloorDivisor=*/nullptr,
2624  /*ub=*/nullptr, &fullTileLbPos,
2625  &fullTileUbPos)) {
2626  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2627  return nullptr;
2628  }
2629 
2630  SmallVector<unsigned, 4> lbIndices, ubIndices;
2631  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2632 
2633  auto fLb = cst.getInequality(fullTileLbPos);
2634  auto fUb = cst.getInequality(fullTileUbPos);
2635  fullTileLb.assign(fLb.begin(), fLb.end());
2636  fullTileUb.assign(fUb.begin(), fUb.end());
2637 
2638  // Full tile lower bound should be >= than any other lower bound.
2639  for (auto lbIndex : lbIndices)
2640  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2641  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2642 
2643  // Full tile upper bound should be <= any other upper bound.
2644  for (auto ubIndex : ubIndices)
2645  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2646  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2647 
2648  cst.removeVar(0);
2649  }
2650 
2651  // The previous step leads to all zeros for the full tile lb and ub position
2652  // itself; remove those and any other duplicates / trivial redundancies.
2653  cst.removeTrivialRedundancy();
2654 
2655  // Turn everything into dims conservatively since we earlier turned all
2656  // trailing ids past point loop IV into symbols. Some of these could be outer
2657  // loop IVs; we'll canonicalize anyway.
2658  cst.setDimSymbolSeparation(0);
2659 
2660  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2661  // ifCondSet can be null if cst was empty -- this can happen if all loops
2662  // in the nest have constant trip counts.
2663  if (!ifCondSet)
2664  return nullptr;
2665 
2666  SmallVector<Value, 4> setOperands;
2667  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2668  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2669  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2670  /*withElseRegion=*/true);
2671 }
2672 
2673 /// Create the full tile loop nest (along with its body).
2674 static LogicalResult
2675 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2676  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2677  fullTileLoops.reserve(inputNest.size());
2678 
2679  // For each loop in the original nest identify a lower/upper bound pair such
2680  // that their difference is a constant.
2681  FlatAffineValueConstraints cst;
2682  for (auto loop : inputNest) {
2683  // TODO: straightforward to generalize to a non-unit stride.
2684  if (loop.getStepAsInt() != 1) {
2685  LLVM_DEBUG(llvm::dbgs()
2686  << "[tile separation] non-unit stride not implemented\n");
2687  return failure();
2688  }
2689  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2690  (void)getIndexSet(loopOp, &cst);
2691  // We will mark everything other than this loop IV as symbol for getting a
2692  // pair of <lb, ub> with a constant difference.
2693  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2694  unsigned lbPos, ubPos;
2695  if (!((IntegerRelation)cst)
2696  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2697  /*boundFloorDivisor=*/nullptr,
2698  /*ub=*/nullptr, &lbPos, &ubPos) ||
2699  lbPos == ubPos) {
2700  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2701  "equalities not yet handled\n");
2702  return failure();
2703  }
2704 
2705  // Set all variables as dimensions uniformly since some of those marked as
2706  // symbols above could be outer loop IVs (corresponding tile space IVs).
2707  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2708 
2709  AffineValueMap lbVmap, ubVmap;
2710  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2711  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2712  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2713  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2714  ubVmap.getOperands(), ubVmap.getAffineMap());
2715  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2716  fullTileLoops.push_back(fullTileLoop);
2717  }
2718 
2719  // Add the body for the full tile loop nest.
2720  IRMapping operandMap;
2721  for (const auto &loopEn : llvm::enumerate(inputNest))
2722  operandMap.map(loopEn.value().getInductionVar(),
2723  fullTileLoops[loopEn.index()].getInductionVar());
2724  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2725  for (auto &op : inputNest.back().getBody()->without_terminator())
2726  b.clone(op, operandMap);
2727  return success();
2728 }
2729 
2730 LogicalResult
2731 affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2732  SmallVectorImpl<AffineForOp> *fullTileNest) {
2733  if (inputNest.empty())
2734  return success();
2735 
2736  auto firstLoop = inputNest[0];
2737 
2738  // Each successive for op has to be nested in the other.
2739  auto prevLoop = firstLoop;
2740  for (auto loop : inputNest.drop_front(1)) {
2741  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2742  prevLoop = loop;
2743  }
2744 
2745  // Create the full tile loop nest.
2746  SmallVector<AffineForOp, 4> fullTileLoops;
2747  OpBuilder b(firstLoop);
2748  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2749  if (!fullTileLoops.empty())
2750  fullTileLoops.front().erase();
2751  return failure();
2752  }
2753 
2754  // Create and insert the version select right before the root of the nest.
2755  b = OpBuilder(firstLoop);
2756  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2757  if (!ifOp) {
2758  fullTileLoops.front().erase();
2759  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2760  "separation condition\n");
2761  return failure();
2762  }
2763 
2764  // Move the full tile into the then block.
2765  Block *thenBlock = ifOp.getThenBlock();
2766  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2767  thenBlock->getOperations().splice(
2768  std::prev(thenBlock->end()),
2769  outermostFullTileLoop->getBlock()->getOperations(),
2770  Block::iterator(outermostFullTileLoop));
2771 
2772  // Move the partial tile into the else block. The partial tile is the same as
2773  // the original loop nest.
2774  Block *elseBlock = ifOp.getElseBlock();
2775  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2776  firstLoop->getBlock()->getOperations(),
2777  Block::iterator(firstLoop));
2778 
2779  if (fullTileNest)
2780  *fullTileNest = std::move(fullTileLoops);
2781 
2782  return success();
2783 }
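A minimal usage sketch, assuming a caller that has already formed a perfectly nested band rooted at an affine.for named rootForOp (rootForOp and the uniform tile size of 32 are illustrative, not part of this file):

  SmallVector<AffineForOp, 6> band;
  getPerfectlyNestedLoops(band, rootForOp);
  SmallVector<unsigned, 6> tileSizes(band.size(), 32);
  SmallVector<AffineForOp, 8> tiledNest;
  if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
    return;
  // The intra-tile (point) loops are the innermost band.size() loops of the
  // tiled nest; split their full tiles from the partial ones.
  SmallVector<AffineForOp, 4> fullTileNest;
  (void)separateFullTiles(
      MutableArrayRef<AffineForOp>(tiledNest).take_back(band.size()),
      &fullTileNest);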
2784 
2785 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2786  LogicalResult result(failure());
2787  SmallVector<AffineForOp> loops;
2788  getPerfectlyNestedLoops(loops, op);
2789  if (loops.size() <= 1)
2790  return success();
2791 
2792  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2793  // loops with bounds defined above some loop.
2794  // 1. For each loop, find above which parent loop its operands are
2795  // defined.
2796  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2797  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2798  operandsDefinedAbove[i] = i;
2799  for (unsigned j = 0; j < i; ++j) {
2800  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2801  operandsDefinedAbove[i] = j;
2802  break;
2803  }
2804  }
2805  }
2806 
2807  // 2. Identify bands of loops such that the operands of all of them are
2808  // defined above the first loop in the band. Traverse the nest bottom-up
2809  // so that modifications don't invalidate the inner loops.
2810  for (unsigned end = loops.size(); end > 0; --end) {
2811  unsigned start = 0;
2812  for (; start < end - 1; ++start) {
2813  auto maxPos =
2814  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2815  std::next(operandsDefinedAbove.begin(), end));
2816  if (maxPos > start)
2817  continue;
2818  assert(maxPos == start &&
2819  "expected loop bounds to be known at the start of the band");
2820  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2821  if (succeeded(coalesceLoops(band)))
2822  result = success();
2823  break;
2824  }
2825  // If a band was found and transformed, keep looking at the loops above
2826  // the outermost transformed loop.
2827  if (start != end - 1)
2828  end = start + 1;
2829  }
2830  return result;
2831 }
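A minimal sketch of driving this entry point over a whole function, assuming an enclosing func::FuncOp named funcOp (illustrative, not part of this file):

  // Visit every affine.for; the callee gathers the perfectly nested band
  // rooted at each loop and coalesces it when the bounds permit.
  funcOp.walk([](AffineForOp forOp) {
    (void)coalescePerfectlyNestedAffineLoops(forOp);
  });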
2832 
2833 int64_t affine::numEnclosingInvariantLoops(OpOperand &operand) {
2834  int64_t count = 0;
2835  Operation *currentOp = operand.getOwner();
2836  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2837  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2838  break;
2839  currentOp = loopOp;
2840  count++;
2841  }
2842  return count;
2843 }
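For illustration, a hypothetical caller that reports, for each operand of some operation op (an assumed Operation *), how many enclosing loops it is invariant in:

  for (OpOperand &operand : op->getOpOperands())
    llvm::dbgs() << "operand #" << operand.getOperandNumber()
                 << " is invariant in " << numEnclosingInvariantLoops(operand)
                 << " enclosing loop(s)\n";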
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:383
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:661
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs the tiled loop nest without setting the loop bounds, and moves the body of the original loop ne...
Definition: LoopUtils.cpp:424
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2268
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1535
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1933
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:362
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1765
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:174
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:409
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:560
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:465
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2544
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:914
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
Definition: LoopUtils.cpp:1951
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2675
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1823
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:689
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:975
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1087
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1317
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:418
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2594
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:44
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1515
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
Definition: LoopUtils.cpp:1859
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:102
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:961
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:394
unsigned getNumDims() const
Definition: AffineMap.cpp:390
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:403
unsigned getNumResults() const
Definition: AffineMap.cpp:398
unsigned getNumInputs() const
Definition: AffineMap.cpp:399
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:407
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Definition: Block.cpp:74
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:27
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition: Block.h:305
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:244
OpListType & getOperations()
Definition: Block.h:137
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:31
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:396
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:402
AffineMap getDimIdentityMap()
Definition: Builders.cpp:378
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:382
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:363
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:367
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:359
MLIRContext * getContext() const
Definition: Builders.h:55
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:729
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:314
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:548
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:250
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:452
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:257
Operation * getOperation()
Inherit getOperation from OpState.
Definition: OpDefinition.h:111
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:43
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Definition: Operation.cpp:385
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:718
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:267
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:272
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:236
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition: Operation.h:230
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:288
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
Definition: Region.h:222
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:208
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:41
static WalkResult advance()
Definition: WalkResult.h:47
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:529
Value getOperand(unsigned idx)
Definition: AffineOps.h:535
operand_range getOperands()
Definition: AffineOps.h:544
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
Definition: AffineOps.h:106
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
Definition: AffineOps.h:315
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:97
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1611
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:886
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:118
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2310
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1076
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2782
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:1010
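A hedged usage sketch (forOp is an assumed affine.for): unroll by a factor of 4; the utility itself emits a cleanup loop when the trip count is not a multiple of 4.

  if (failed(loopUnrollByFactor(forOp, /*unrollFactor=*/4)))
    llvm::dbgs() << "unroll by 4 was not performed\n";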
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition: Utils.cpp:773
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2560
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1367
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2572
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:857
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:229
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1351
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2516
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1621
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:901
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1389
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1099
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:815
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1262
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1626
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:759
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1373
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2833
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1976
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1725
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks where SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1590
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1459
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:773
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1294
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, llvm::function_ref< bool(Operation *)> userFilterFn=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1304
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2785
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
Definition: LoopUtils.cpp:874
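A minimal sketch combining this with tilePerfectlyNested, assuming a func::FuncOp named funcOp and a uniform illustrative tile size of 32:

  std::vector<SmallVector<AffineForOp, 6>> bands;
  getTileableBands(funcOp, &bands);
  for (auto &band : bands) {
    SmallVector<unsigned, 6> tileSizes(band.size(), 32);
    (void)tilePerfectlyNested(band, tileSizes);
  }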
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2731
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2701
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:766
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:32
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:776
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
int64_t stride
Definition: LoopUtils.cpp:1814
int64_t numEltPerStride
Definition: LoopUtils.cpp:1815
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:165
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:205
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:481
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:1068
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:527
bool isWrite() const
Definition: Utils.h:529
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:1124
void setWrite(bool flag)
Definition: Utils.h:530
Value memref
Memref that this region corresponds to.
Definition: Utils.h:562
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:569
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.