1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
25 #include "llvm/ADT/MapVector.h"
26 #include "llvm/ADT/SmallPtrSet.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include <optional>
30 
31 #define DEBUG_TYPE "loop-utils"
32 
33 using namespace mlir;
34 using namespace affine;
35 using namespace presburger;
36 using llvm::SmallMapVector;
37 
38 /// Computes the cleanup loop lower bound of the loop being unrolled with
39 /// the specified unroll factor; this bound will also be the upper bound of the main
40 /// part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// operands or a null map when the trip count can't be expressed as an affine
42 /// expression.
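/// Illustrative example (not from the original source): for `affine.for %i = 0
/// to 17` unrolled by a factor of 4, the trip count is 17, so the cleanup lower
/// bound evaluates to 0 + (17 - 17 mod 4) = 16; the main unrolled loop then
/// covers [0, 16) and the cleanup loop covers [16, 17).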
43 static void
44 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45  AffineMap &cleanupLbMap,
46  SmallVectorImpl<Value> &cleanupLbOperands) {
47  AffineMap tripCountMap;
48  SmallVector<Value, 4> tripCountOperands;
49  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50  // Trip count can't be computed.
51  if (!tripCountMap) {
52  cleanupLbMap = AffineMap();
53  return;
54  }
55 
56  OpBuilder b(forOp);
57  auto lbMap = forOp.getLowerBoundMap();
58  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59  forOp.getLowerBoundOperands());
60 
61  // For each upper bound expr, get the range.
62  // Eg: affine.for %i = lb to min (ub1, ub2),
63  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65  // these affine.apply's make up the cleanup loop lower bound.
66  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68  int64_t step = forOp.getStepAsInt();
69  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70  auto tripCountExpr = tripCountMap.getResult(i);
71  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
73  tripCountMap.getNumSymbols(), bumpExprs[i]);
74  bumpValues[i] =
75  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76  }
77 
78  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81 
82  cleanupLbOperands.clear();
83  cleanupLbOperands.push_back(lb);
84  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
85  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
86  b.getContext());
87  // Simplify the cleanupLbMap + cleanupLbOperands.
88  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
89  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
90  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91  // Remove any affine.apply's that became dead from the simplification above.
92  for (auto v : bumpValues)
93  if (v.use_empty())
94  v.getDefiningOp()->erase();
95 
96  if (lb.use_empty())
97  lb.erase();
98 }
99 
100 /// Helper to replace uses of loop carried values (iter_args) and loop
101 /// yield values while promoting single iteration affine.for ops.
102 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103  // Replace uses of iter arguments with iter operands (initial values).
104  auto iterOperands = forOp.getInits();
105  auto iterArgs = forOp.getRegionIterArgs();
106  for (auto e : llvm::zip(iterOperands, iterArgs))
107  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108 
109  // Replace uses of loop results with the values yielded by the loop.
110  auto outerResults = forOp.getResults();
111  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112  for (auto e : llvm::zip(outerResults, innerResults))
113  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114 }
115 
116 /// Promotes the loop body of a forOp to its containing block if the forOp
117 /// was known to have a single iteration.
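/// Illustrative example (not from the original source): for
///   affine.for %i = 5 to 6 { "test.use"(%i) : (index) -> () }
/// the loop is erased, uses of %i are replaced with an arith.constant 5 : index
/// (created at the function entry when possible), and the body is spliced into
/// the enclosing block.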
118 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120  if (!tripCount || *tripCount != 1)
121  return failure();
122 
123  // TODO: extend this for arbitrary affine bounds.
124  if (forOp.getLowerBoundMap().getNumResults() != 1)
125  return failure();
126 
127  // Replace all uses of the IV with its single-iteration value.
128  auto iv = forOp.getInductionVar();
129  auto *parentBlock = forOp->getBlock();
130  if (!iv.use_empty()) {
131  if (forOp.hasConstantLowerBound()) {
132  auto func = forOp->getParentOfType<FunctionOpInterface>();
133  OpBuilder builder(forOp->getContext());
134  if (func)
135  builder.setInsertionPointToStart(&func.getFunctionBody().front());
136  else
137  builder.setInsertionPoint(forOp);
138  auto constOp = builder.create<arith::ConstantIndexOp>(
139  forOp.getLoc(), forOp.getConstantLowerBound());
140  iv.replaceAllUsesWith(constOp);
141  } else {
142  auto lbOperands = forOp.getLowerBoundOperands();
143  auto lbMap = forOp.getLowerBoundMap();
144  OpBuilder builder(forOp);
145  if (lbMap == builder.getDimIdentityMap()) {
146  // No need to generate an affine.apply.
147  iv.replaceAllUsesWith(lbOperands[0]);
148  } else {
149  auto affineApplyOp =
150  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
151  iv.replaceAllUsesWith(affineApplyOp);
152  }
153  }
154  }
155 
156  replaceIterArgsAndYieldResults(forOp);
157 
158  // Move the loop body operations, except for its terminator, to the loop's
159  // containing block.
160  forOp.getBody()->back().erase();
161  parentBlock->getOperations().splice(Block::iterator(forOp),
162  forOp.getBody()->getOperations());
163  forOp.erase();
164  return success();
165 }
166 
167 /// Generates an affine.for op with the specified lower and upper bounds
168 /// while generating the right IV remappings to realize shifts for operations in
169 /// its body. The operations that go into the loop body are specified in
170 /// opGroupQueue starting from the specified offset, and in that order. The
171 /// first element of the pair specifies the shift applied to that group of
172 /// operations; the shift is multiplied by the loop step before being applied.
173 /// Returns nullptr if the generated loop simplifies to a single iteration one.
174 static AffineForOp generateShiftedLoop(
175  AffineMap lbMap, AffineMap ubMap,
176  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178  auto lbOperands = srcForOp.getLowerBoundOperands();
179  auto ubOperands = srcForOp.getUpperBoundOperands();
180 
181  assert(lbMap.getNumInputs() == lbOperands.size());
182  assert(ubMap.getNumInputs() == ubOperands.size());
183 
184  auto loopChunk =
185  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
186  ubMap, srcForOp.getStepAsInt());
187  auto loopChunkIV = loopChunk.getInductionVar();
188  auto srcIV = srcForOp.getInductionVar();
189 
190  IRMapping operandMap;
191 
192  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
193  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
194  uint64_t shift = it.first;
195  auto ops = it.second;
196  // All 'same shift' operations get added with their operands being
197  // remapped to results of cloned operations, and any use of the IV remapped.
198  // Generate the remapping if the shift is not zero: remappedIV = newIV -
199  // shift.
200  if (!srcIV.use_empty() && shift != 0) {
201  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202  srcForOp.getLoc(),
203  bodyBuilder.getSingleDimShiftAffineMap(
204  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205  loopChunkIV);
206  operandMap.map(srcIV, ivRemap);
207  } else {
208  operandMap.map(srcIV, loopChunkIV);
209  }
210  for (auto *op : ops)
211  bodyBuilder.clone(*op, operandMap);
212  };
213  if (succeeded(promoteIfSingleIteration(loopChunk)))
214  return AffineForOp();
215  return loopChunk;
216 }
217 
218 // The skewing of operations with respect to one another can be used for
219 // example to allow overlap of asynchronous operations (such as DMA
220 // communication) with computation, or just relative shifting of operations
221 // for better register reuse, locality or parallelism. As such, the shifts are
222 // typically expected to be at most of the order of the number of operations.
223 // This method should not be used as a substitute for loop distribution/fission.
224 // This method uses an algorithm that runs in time linear in the number of
225 // operations in the body of the for loop (using the 'sweep line' paradigm).
226 // This method asserts preservation of SSA dominance. A check for that, as well
227 // as for memory-based dependence preservation, rests with the users of this
228 // method.
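// Illustrative example (not from the original source): for a loop of constant
// trip count 4 containing two ops A and B, and shifts = [0, 1], the result is
// roughly:
//   - a prologue loop over [lb, lb + step) running only A,
//   - a steady-state loop over [lb + step, lb + 4 * step) running A at the
//     current IV and B with its IV remapped to iv - step (B lags A by one
//     iteration),
//   - an epilogue loop over [lb + 4 * step, lb + 5 * step) running only B.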
229 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230  ArrayRef<uint64_t> shifts,
231  bool unrollPrologueEpilogue) {
232  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233  "too few/many shifts");
234  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
235  return success();
236 
237  // If the trip counts aren't constant, we would need versioning and
238  // conditional guards (or context information to prevent such versioning). The
239  // better way to pipeline for such loops is to first tile them and extract
240  // constant trip count "full tiles" before applying this.
241  auto mayBeConstTripCount = getConstantTripCount(forOp);
242  if (!mayBeConstTripCount) {
243  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244  return success();
245  }
246  uint64_t tripCount = *mayBeConstTripCount;
247 
248  assert(isOpwiseShiftValid(forOp, shifts) &&
249  "shifts will lead to an invalid transformation\n");
250 
251  int64_t step = forOp.getStepAsInt();
252 
253  unsigned numChildOps = shifts.size();
254 
255  // Do a linear time (counting) sort for the shifts.
256  uint64_t maxShift = *llvm::max_element(shifts);
257  if (maxShift >= numChildOps) {
258  // Large shifts are not the typical use case.
259  forOp.emitWarning("not shifting because shifts are unrealistically large");
260  return success();
261  }
262 
263  // An array of operation groups sorted by shift amount; each group has all
264  // operations with the same shift in the order in which they appear in the
265  // body of the 'affine.for' op.
266  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267  unsigned pos = 0;
268  for (auto &op : forOp.getBody()->without_terminator()) {
269  auto shift = shifts[pos++];
270  sortedOpGroups[shift].push_back(&op);
271  }
272 
273  // Unless the shifts have a specific pattern (which actually would be the
274  // common use case), prologue and epilogue are not meaningfully defined.
275  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276  // loop generated as the prologue and the last as epilogue and unroll these
277  // fully.
278  AffineForOp prologue, epilogue;
279 
280  // Do a sweep over the sorted shifts while storing open groups in a
281  // vector, and generating loop portions as necessary during the sweep. A block
282  // of operations is paired with its shift.
283  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284 
285  auto origLbMap = forOp.getLowerBoundMap();
286  uint64_t lbShift = 0;
287  OpBuilder b(forOp);
288  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289  // If nothing is shifted by d, continue.
290  if (sortedOpGroups[d].empty())
291  continue;
292  if (!opGroupQueue.empty()) {
293  assert(d > 0 &&
294  "Queue expected to be empty when the first block is found");
295  // The interval for which the loop needs to be generated here is:
296  // [lbShift, min(lbShift + tripCount, d)) and the body of the
297  // loop needs to have all operations in opQueue in that order.
298  AffineForOp res;
299  if (lbShift + tripCount * step < d * step) {
300  res = generateShiftedLoop(
301  b.getShiftedAffineMap(origLbMap, lbShift),
302  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
303  opGroupQueue, /*offset=*/0, forOp, b);
304  // Entire loop for the queued op groups generated, empty it.
305  opGroupQueue.clear();
306  lbShift += tripCount * step;
307  } else {
308  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
309  b.getShiftedAffineMap(origLbMap, d),
310  opGroupQueue, /*offset=*/0, forOp, b);
311  lbShift = d * step;
312  }
313 
314  if (res) {
315  // Simplify/canonicalize the affine.for.
316  RewritePatternSet patterns(res.getContext());
317  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
318  bool erased;
319  (void)applyOpPatternsGreedily(
320  res.getOperation(), std::move(patterns),
321  GreedyRewriteConfig().setStrictness(
322  GreedyRewriteStrictness::ExistingAndNewOps),
323  /*changed=*/nullptr, &erased);
324  if (!erased && !prologue)
325  prologue = res;
326  if (!erased)
327  epilogue = res;
328  }
329  } else {
330  // Start of first interval.
331  lbShift = d * step;
332  }
333  // Augment the list of operations that get into the current open interval.
334  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
335  }
336 
337  // Those operations groups left in the queue now need to be processed (FIFO)
338  // and their loops completed.
339  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
342  b.getShiftedAffineMap(origLbMap, ubShift),
343  opGroupQueue, /*offset=*/i, forOp, b);
344  lbShift = ubShift;
345  if (!prologue)
346  prologue = epilogue;
347  }
348 
349  // Erase the original for op.
350  forOp.erase();
351 
352  if (unrollPrologueEpilogue && prologue)
353  (void)loopUnrollFull(prologue);
354  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
355  (void)loopUnrollFull(epilogue);
356 
357  return success();
358 }
359 
360 /// Checks whether a loop nest is hyper-rectangular or not.
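/// Illustrative example (not from the original source): a nest such as
///   affine.for %i = 0 to 10 { affine.for %j = 0 to 20 { ... } }
/// is hyper-rectangular, whereas
///   affine.for %i = 0 to 10 { affine.for %j = 0 to %i { ... } }
/// is not, since the inner loop's bounds depend on the outer IV.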
361 static LogicalResult
362 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363  FlatAffineValueConstraints cst;
364  SmallVector<Operation *, 8> ops(input.begin(), input.end());
365  // 0-d or 1-d is trivially hyper-rectangular.
366  if (input.size() <= 1)
367  return success();
368  if (failed(getIndexSet(ops, &cst))) {
369  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
370  return failure();
371  }
372  if (!cst.isHyperRectangular(0, input.size())) {
373  LLVM_DEBUG(llvm::dbgs()
374  << "Non-hyperrectangular nests not supported for tiling!\n");
375  return failure();
376  }
377  return success();
378 }
379 
380 /// Check if the input nest is supported for tiling and whether tiling would be
381 /// legal or not.
382 template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384  ArrayRef<t> tileSizes) {
385  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386 
387  if (llvm::any_of(input,
388  [](AffineForOp op) { return op.getNumResults() > 0; })) {
389  LLVM_DEBUG(llvm::dbgs()
390  << "Cannot tile nest where a loop has yield values\n");
391  return failure();
392  }
393 
394  // Check if the supplied `for` ops are all successively nested.
395  if (!isPerfectlyNested(input)) {
396  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397  return failure();
398  }
399 
400  // TODO: handle non hyper-rectangular spaces.
401  if (failed(checkIfHyperRectangular(input)))
402  return failure();
403 
404  return success();
405 }
406 
407 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
408 /// location in destination's body, ignoring the terminator.
409 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410  Block::iterator loc) {
411  auto &ops = src.getBody()->getOperations();
412  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
413  std::prev(ops.end()));
414 }
415 
416 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417 /// body.
418 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
420 }
421 
422 /// Constructs a tiled loop nest, without setting the loop bounds, and moves the
423 /// body of the original loop nest into the tiled loop nest.
424 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425  AffineForOp rootAffineForOp, unsigned width,
426  MutableArrayRef<AffineForOp> tiledLoops) {
427  Location loc = rootAffineForOp.getLoc();
428 
429  // The outermost among the loops as we add more..
430  Operation *topLoop = rootAffineForOp.getOperation();
431  AffineForOp innermostPointLoop;
432 
433  // Add intra-tile (or point) loops.
434  for (unsigned i = 0; i < width; i++) {
435  OpBuilder b(topLoop);
436  // Loop bounds will be set later.
437  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
438  pointLoop.getBody()->getOperations().splice(
439  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
440  topLoop);
441  tiledLoops[2 * width - 1 - i] = pointLoop;
442  topLoop = pointLoop.getOperation();
443  if (i == 0)
444  innermostPointLoop = pointLoop;
445  }
446 
447  // Add tile space loops.
448  for (unsigned i = width; i < 2 * width; i++) {
449  OpBuilder b(topLoop);
450  // Loop bounds will be set later.
451  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
452  tileSpaceLoop.getBody()->getOperations().splice(
453  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
454  topLoop);
455  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456  topLoop = tileSpaceLoop.getOperation();
457  }
458 
459  // Move the loop body of the original nest to the new one.
460  moveLoopBody(origLoops.back(), innermostPointLoop);
461 }
462 
463 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
464 // TODO: Handle non-constant lower bounds.
465 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466  AffineForOp newInterTileLoop,
467  AffineForOp newIntraTileLoop,
468  Value tileSize) {
469  // The lower bound for the intra-tile loop is represented by an affine map
470  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
471  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
472  // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
473  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
474  // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
475  // corresponding inter-tile loop.
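  // For instance (illustrative, not from the original source): with %origlb = 0
  // and an original step of 1, these maps reduce to
  //   lb: (%i, %t0) -> (%i * %t0)  and  ub: (%i, %t0) -> (%i * %t0 + %t0),
  // i.e. each intra-tile loop spans one tile of size %t0 starting at %i * %t0.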
476 
477  assert(origLoop.hasConstantLowerBound() &&
478  "expected input loops to have constant lower bound.");
479 
480  // Get lower bound of original loop as an affine expression.
481  AffineExpr origLowerBoundExpr;
482  origLowerBoundExpr =
483  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
484 
485  // Add dim operands from original lower/upper bound.
486  SmallVector<Value, 4> lbOperands, ubOperands;
487  AffineBound lb = origLoop.getLowerBound();
488  AffineBound ub = origLoop.getUpperBound();
489  lbOperands.reserve(lb.getNumOperands() + 2);
490  ubOperands.reserve(ub.getNumOperands() + 2);
491  AffineMap origLbMap = lb.getMap();
492  AffineMap origUbMap = ub.getMap();
493  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
494  lbOperands.push_back(lb.getOperand(j));
495  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
496  ubOperands.push_back(ub.getOperand(j));
497 
498  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
499  // IV.
500  lbOperands.push_back(newInterTileLoop.getInductionVar());
501  ubOperands.push_back(newInterTileLoop.getInductionVar());
502 
503  // Get loop IV as an affine expression for lower/upper bound. Size of
504  // lb/ubOperands is guaranteed to be at least one.
505  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
506  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
507 
508  // Add symbol operands from original lower/upper bound.
509  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
510  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
511  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
512  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
513 
514  // Add a new symbol operand which is the tile size for this loop.
515  lbOperands.push_back(tileSize);
516  ubOperands.push_back(tileSize);
517 
518  SmallVector<AffineExpr, 4> lbBoundExprs;
519  SmallVector<AffineExpr, 4> ubBoundExprs;
520  lbBoundExprs.reserve(origLbMap.getNumResults());
521  ubBoundExprs.reserve(origUbMap.getNumResults());
522 
523  // Get tiling parameter as an affine expression for lb/ub.
524  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
525  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
526 
527  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528  lbBoundExprs.push_back(
529  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
530  origLowerBoundExpr);
531 
532  // Get the origLoopStep as an affine expression.
533  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
534 
535  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
536  // (tilingParameter * origLoopStep) + origlb.
537  ubBoundExprs.push_back(
538  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
539  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
540 
541  ubBoundExprs.append(origUbMap.getResults().begin(),
542  origUbMap.getResults().end());
543 
544  AffineMap lbMap =
545  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
546  lbBoundExprs, b.getContext());
547  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
548 
549  AffineMap ubMap =
550  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
551  ubBoundExprs, b.getContext());
552  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
553 
554  // Original loop step must be preserved.
555  newIntraTileLoop.setStep(origLoop.getStepAsInt());
556 }
557 
558 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
559 // TODO: Handle non-constant lower bounds.
560 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561  AffineForOp newLoop, Value tileSize) {
562  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563 
564  // The lower bounds for the inter-tile loops are the same as the
565  // corresponding lower bounds of the original loops.
566  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
567 
568  // The new upper bound map for inter-tile loops, assuming constant lower
569  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
570  // originalLowerBound), tiling parameter), where the tiling parameter is the
571  // respective tile size for that loop. For example, if the original ubmap was
572  // ()->(1024), the new map will be
573  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
574  // Therefore a new symbol operand is inserted in the map and the result
575  // expression is overwritten.
576 
577  assert(origLoop.hasConstantLowerBound() &&
578  "expected input loops to have constant lower bound.");
579 
580  // Get lower bound of original loop as an affine expression.
581  AffineExpr origLowerBoundExpr;
582  origLowerBoundExpr =
583  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
584 
585  // Add dim operands from original upper bound.
586  SmallVector<Value, 4> ubOperands;
587  AffineBound ub = origLoop.getUpperBound();
588  ubOperands.reserve(ub.getNumOperands() + 1);
589  AffineMap origUbMap = ub.getMap();
590  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
591  ubOperands.push_back(ub.getOperand(j));
592 
593  // Add symbol operands from original upper bound.
594  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
595  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
596 
597  // Add a new symbol operand which is the tile size for this loop.
598  ubOperands.push_back(tileSize);
599 
600  // Get tiling parameter as an affine expression.
601  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
602 
603  SmallVector<AffineExpr, 4> boundExprs;
604  boundExprs.reserve(origUbMap.getNumResults());
605  int64_t origUpperBound;
606  AffineExpr origUpperBoundExpr;
607 
608  // If upper bound for the original loop is constant, then the constant can
609  // be obtained as an affine expression straight away.
610  if (origLoop.hasConstantUpperBound()) {
611  origUpperBound = origLoop.getConstantUpperBound();
612 
613  // Get original constant upper bound as an affine expression.
614  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
615 
616  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
617  // originalLowerBound), tilingParameter).
618  boundExprs.push_back(
619  origLowerBoundExpr +
620  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
621  } else {
622  // If the upper bound for the original loop is not constant, then two cases
623  // are possible, although their handling is the same. 1.) The ubmap has only
624  // one result expression. For example,
625  // affine.for %i = 5 to %ub
626  //
627  // A symbol operand is added which represents the tiling parameter. The
628  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629  // where 's0' is the original upper bound and 's1' is the tiling
630  // parameter. 2.) When ubMap has more than one result expression. For example,
631  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
632  // affine.for %i = 5 to min #map0()[%s0, %s1]
633  //
634  // A symbol operand is added which represents the tiling parameter. The
635  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
637 
638  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639  // originalLowerBound), tilingParameter).
640  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641  boundExprs.push_back(
642  origLowerBoundExpr +
643  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
644  }
645 
646  AffineMap ubMap =
647  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
648  boundExprs, b.getContext());
649  newLoop.setUpperBound(ubOperands, ubMap);
650 
651  // Original loop step must be preserved.
652  newLoop.setStep(origLoop.getStepAsInt());
653 }
654 
655 /// Constructs and sets new loop bounds after tiling for the case of
656 /// hyper-rectangular index sets, where the bounds of one dimension do not
657 /// depend on other dimensions and tiling parameters are captured from SSA
658 /// values. Bounds of each dimension can thus be treated independently,
659 /// and deriving the new bounds is much simpler and faster than for the case of
660 /// tiling arbitrary polyhedral shapes.
661 static void constructParametricallyTiledIndexSetHyperRect(
662  MutableArrayRef<AffineForOp> origLoops,
663  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
664  assert(!origLoops.empty() && "expected at least one loop in band");
665  assert(origLoops.size() == tileSizes.size() &&
666  "expected tiling parameter for each loop in band.");
667 
668  OpBuilder b(origLoops[0].getOperation());
669  unsigned width = origLoops.size();
670 
671  // Set bounds for tile space loops.
672  for (unsigned i = 0; i < width; ++i) {
673  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
674  }
675 
676  // Set bounds for intra-tile loops.
677  for (unsigned i = 0; i < width; ++i) {
678  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
679  newLoops[i + width], tileSizes[i]);
680  }
681 }
682 
683 /// Constructs and sets new loop bounds after tiling for the case of
684 /// hyper-rectangular index sets, where the bounds of one dimension do not
685 /// depend on other dimensions. Bounds of each dimension can thus be treated
686 /// independently, and deriving the new bounds is much simpler and faster
687 /// than for the case of tiling arbitrary polyhedral shapes.
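/// Illustrative example (not from the original source): tiling
///   affine.for %i = 0 to 100 { ... }
/// with a tile size of 32 yields a tile-space loop
///   affine.for %ii = 0 to 100 step 32
/// and an intra-tile loop running from %ii to min(%ii + 32, 100); the min is
/// omitted when the tile size divides the largest known divisor of the trip
/// count.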
688 static void
689 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690  MutableArrayRef<AffineForOp> newLoops,
691  ArrayRef<unsigned> tileSizes) {
692  assert(!origLoops.empty());
693  assert(origLoops.size() == tileSizes.size());
694 
695  OpBuilder b(origLoops[0].getOperation());
696  unsigned width = origLoops.size();
697 
698  // Bounds for tile space loops.
699  for (unsigned i = 0; i < width; i++) {
700  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
703  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
704  // If the step size of original loop is x and tileSize is y then after
705  // tiling the tile space loops' step size becomes x*y.
706  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707  }
708  // Bounds for intra-tile loops.
709  for (unsigned i = 0; i < width; i++) {
710  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
711  std::optional<uint64_t> mayBeConstantCount =
712  getConstantTripCount(origLoops[i]);
713  // The lower bound is just the tile-space loop.
714  AffineMap lbMap = b.getDimIdentityMap();
715  newLoops[width + i].setLowerBound(
716  /*operands=*/newLoops[i].getInductionVar(), lbMap);
717  // The step size of an intra-tile loop is just the original loop's step size.
718  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719 
720  // Set the upper bound.
721  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722  // Trip count is less than the tile size: upper bound is lower bound +
723  // trip count * stepSize.
724  AffineMap ubMap = b.getSingleDimShiftAffineMap(
725  *mayBeConstantCount * origLoops[i].getStepAsInt());
726  newLoops[width + i].setUpperBound(
727  /*operands=*/newLoops[i].getInductionVar(), ubMap);
728  } else if (largestDiv % tileSizes[i] != 0) {
729  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730  // Construct the upper bound map; the operands are the original operands
731  // with 'i' (tile-space loop) appended to it. The new upper bound map is
732  // the original one with an additional expression i + tileSize * stepSize
733  // appended.
734 
735  // Add dim operands from original upper bound.
736  SmallVector<Value, 4> ubOperands;
737  AffineBound ub = origLoops[i].getUpperBound();
738  ubOperands.reserve(ub.getNumOperands() + 1);
739  AffineMap origUbMap = ub.getMap();
740  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
741  ubOperands.push_back(ub.getOperand(j));
742 
743  // Add dim operand for new loop upper bound.
744  ubOperands.push_back(newLoops[i].getInductionVar());
745 
746  // Add symbol operands from original upper bound.
747  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
748  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
749 
750  SmallVector<AffineExpr, 4> boundExprs;
751  boundExprs.reserve(1 + origUbMap.getNumResults());
752  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
753  // The new upper bound map is the original one with an additional
754  // expression i + tileSize * stepSize (of original loop) appended.
755  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
756  boundExprs.append(origUbMap.getResults().begin(),
757  origUbMap.getResults().end());
758  AffineMap ubMap =
759  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
760  boundExprs, b.getContext());
761  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
762  } else {
763  // No need for the min expression.
764  AffineExpr dim = b.getAffineDimExpr(0);
765  AffineMap ubMap = AffineMap::get(
766  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
767  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
768  }
769  }
770 }
771 
772 LogicalResult
773 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774  ArrayRef<unsigned> tileSizes,
775  SmallVectorImpl<AffineForOp> *tiledNest) {
776  if (input.empty())
777  return success();
778 
779  if (failed(performPreTilingChecks(input, tileSizes)))
780  return failure();
781 
782  MutableArrayRef<AffineForOp> origLoops = input;
783  AffineForOp rootAffineForOp = origLoops[0];
784 
785  // Note that width is at least one since the band isn't empty.
786  unsigned width = input.size();
787  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788 
789  // Construct a tiled loop nest without setting their bounds. Bounds are
790  // set later.
791  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792 
793  SmallVector<Value, 8> origLoopIVs;
794  extractForInductionVars(input, &origLoopIVs);
795 
796  // Set loop bounds for the tiled loop nest.
797  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
798 
799  // Replace original IVs with intra-tile loop IVs.
800  for (unsigned i = 0; i < width; i++)
801  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
802 
803  // Erase the old loop nest.
804  rootAffineForOp.erase();
805 
806  if (tiledNest)
807  *tiledNest = std::move(tiledLoops);
808 
809  return success();
810 }
811 
812 /// Tiles the specified band of perfectly nested loops creating tile-space
813 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
814 /// is a contiguous set of loops.
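/// Illustrative usage sketch (not from the original source): given a perfectly
/// nested 2-d band `band` and tile sizes `ts0`, `ts1` (Values of index type):
///   SmallVector<AffineForOp> tiled;
///   if (failed(tilePerfectlyNestedParametric(band, {ts0, ts1}, &tiled)))
///     return failure();
/// On success, tiled[0..1] are the tile-space loops and tiled[2..3] the
/// intra-tile loops.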
815 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817  SmallVectorImpl<AffineForOp> *tiledNest) {
818  if (input.empty())
819  return success();
820 
821  if (failed(performPreTilingChecks(input, tileSizes)))
822  return failure();
823 
824  MutableArrayRef<AffineForOp> origLoops = input;
825  AffineForOp rootAffineForOp = origLoops[0];
826  unsigned width = input.size();
827  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828 
829  // Construct a tiled loop nest without setting their bounds. Bounds are
830  // set later.
831  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832 
833  SmallVector<Value, 8> origLoopIVs;
834  extractForInductionVars(input, &origLoopIVs);
835 
836  // Set loop bounds for the tiled loop nest.
837  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
838  tileSizes);
839 
840  // Replace original IVs with intra-tile loop IVs.
841  for (unsigned i = 0; i < width; i++)
842  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
843 
844  // Erase the old loop nest.
845  rootAffineForOp.erase();
846 
847  if (tiledNest)
848  *tiledNest = std::move(tiledLoops);
849 
850  return success();
851 }
852 
853 /// Gets the perfectly nested sequence of loops starting at the root of the
854 /// loop nest. A loop is perfectly nested within another loop iff the first op
855 /// in the outer loop's body is the inner AffineForOp and the second op is the
856 /// terminator.
857 void mlir::affine::getPerfectlyNestedLoops(
858  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860  nestedLoops.push_back(root);
861  Block &body = root.getRegion().front();
862  if (body.begin() != std::prev(body.end(), 2))
863  return;
864 
865  root = dyn_cast<AffineForOp>(&body.front());
866  if (!root)
867  return;
868  }
869 }
870 
871 /// Identify valid and profitable bands of loops to tile. This is currently just
872 /// a temporary placeholder to test the mechanics of tiled code generation.
873 /// Returns all maximal outermost perfect loop nests to tile.
874 void mlir::affine::getTileableBands(
875  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876  // Get maximal perfect nest of 'affine.for' insts starting from root
877  // (inclusive).
878  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879  SmallVector<AffineForOp, 6> band;
880  getPerfectlyNestedLoops(band, forOp);
881  bands->push_back(band);
882  }
883 }
884 
885 /// Unrolls this loop completely.
886 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888  if (mayBeConstantTripCount.has_value()) {
889  uint64_t tripCount = *mayBeConstantTripCount;
890  if (tripCount == 0)
891  return success();
892  if (tripCount == 1)
893  return promoteIfSingleIteration(forOp);
894  return loopUnrollByFactor(forOp, tripCount);
895  }
896  return failure();
897 }
898 
899 /// Unrolls this loop by the specified factor or by the trip count (if
900 /// constant), whichever is lower.
901 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902  uint64_t unrollFactor) {
903  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904  if (mayBeConstantTripCount.has_value() &&
905  *mayBeConstantTripCount < unrollFactor)
906  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
907  return loopUnrollByFactor(forOp, unrollFactor);
908 }
909 
910 /// Generates unrolled copies of the loop body in 'loopBodyBlock' (whose
911 /// induction variable is 'forOpIV') 'unrollFactor' times, calling 'ivRemapFn' to remap 'forOpIV' for each
912 /// unrolled body. If specified, annotates the Ops in each unrolled iteration
913 /// using annotateFn.
914 static void generateUnrolledLoop(
915  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918  ValueRange iterArgs, ValueRange yieldedValues) {
919  // Builder to insert unrolled bodies just before the terminator of the body of
920  // 'forOp'.
921  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
922 
923  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924  if (!annotateFn)
925  annotateFn = defaultAnnotateFn;
926 
927  // Keep a pointer to the last non-terminator operation in the original block
928  // so that we know what to clone (since we are doing this in-place).
929  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
930 
931  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932  SmallVector<Value, 4> lastYielded(yieldedValues);
933 
934  for (unsigned i = 1; i < unrollFactor; i++) {
935  IRMapping operandMap;
936 
937  // Prepare operand map.
938  operandMap.map(iterArgs, lastYielded);
939 
940  // If the induction variable is used, create a remapping to the value for
941  // this unrolled instance.
942  if (!forOpIV.use_empty()) {
943  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
944  operandMap.map(forOpIV, ivUnroll);
945  }
946 
947  // Clone the original body of 'forOp'.
948  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
949  Operation *clonedOp = builder.clone(*it, operandMap);
950  annotateFn(i, clonedOp, builder);
951  }
952 
953  // Update yielded values. If the yielded value is defined outside the
954  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956  // value with the clone corresponding to the yielded value.
957  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958  Operation *defOp = yieldedValues[i].getDefiningOp();
959  if (defOp && defOp->getBlock() == loopBodyBlock)
960  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
961  }
962  }
963 
964  // Make sure we annotate the Ops in the original body. We do this last so that
965  // any annotations are not copied into the cloned Ops above.
966  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
967  annotateFn(0, &*it, builder);
968 
969  // Update operands of the yield statement.
970  loopBodyBlock->getTerminator()->setOperands(lastYielded);
971 }
972 
973 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974 /// count is not a multiple of `unrollFactor`.
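/// Illustrative example (not from the original source): unrolling
///   affine.for %i = 0 to 10
/// by a factor of 4 restricts the original loop to [0, 8) (the largest multiple
/// of 4 iterations) and generates a cleanup loop over [8, 10) right after it
/// for the two remaining iterations.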
975 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976  uint64_t unrollFactor) {
977  // Insert the cleanup loop right after 'forOp'.
978  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
979  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
980 
981  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
982  // and produce results for the original users of `forOp` results.
983  auto results = forOp.getResults();
984  auto cleanupResults = cleanupForOp.getResults();
985  auto cleanupIterOperands = cleanupForOp.getInits();
986 
987  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
988  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
989  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
990  }
991 
992  AffineMap cleanupMap;
993  SmallVector<Value, 4> cleanupOperands;
994  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
995  if (!cleanupMap)
996  return failure();
997 
998  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
999  // Promote the loop body up if this has turned into a single iteration loop.
1000  (void)promoteIfSingleIteration(cleanupForOp);
1001 
1002  // Adjust upper bound of the original loop; this is the same as the lower
1003  // bound of the cleanup loop.
1004  forOp.setUpperBound(cleanupOperands, cleanupMap);
1005  return success();
1006 }
1007 
1008 /// Unrolls this loop by the specified factor. Returns success if the loop
1009 /// is successfully unrolled.
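/// Illustrative example (not from the original source): unrolling
///   affine.for %i = 0 to 8 { S(%i) }
/// by a factor of 2 scales the step and clones the body once per unrolled
/// instance with the IV bumped, roughly:
///   affine.for %i = 0 to 8 step 2 { S(%i); S(%i + 1) }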
1010 LogicalResult mlir::affine::loopUnrollByFactor(
1011  AffineForOp forOp, uint64_t unrollFactor,
1012  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013  bool cleanUpUnroll) {
1014  assert(unrollFactor > 0 && "unroll factor should be positive");
1015 
1016  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017  if (unrollFactor == 1) {
1018  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1019  failed(promoteIfSingleIteration(forOp)))
1020  return failure();
1021  return success();
1022  }
1023 
1024  // Nothing in the loop body other than the terminator.
1025  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1026  return success();
1027 
1028  // If the trip count is lower than the unroll factor, no unrolled body.
1029  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1030  if (cleanUpUnroll) {
1031  // Unroll the cleanup loop if cleanUpUnroll is specified.
1032  return loopUnrollFull(forOp);
1033  }
1034 
1035  return failure();
1036  }
1037 
1038  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1039  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1040  // Loops where the lower bound is a max expression or the upper bound is
1041  // a min expression and the trip count doesn't divide the unroll factor
1042  // can't be unrolled since the lower bound of the cleanup loop in such cases
1043  // cannot be expressed as an affine function or a max over affine functions.
1044  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1045  forOp.getUpperBoundMap().getNumResults() != 1)
1046  return failure();
1047  if (cleanUpUnroll)
1048  // Force unroll including cleanup loop
1049  return loopUnrollFull(forOp);
1050  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1051  assert(false && "cleanup loop lower bound map for single result lower "
1052  "and upper bound maps can always be determined");
1053  }
1054 
1055  ValueRange iterArgs(forOp.getRegionIterArgs());
1056  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1057 
1058  // Scale the step of loop being unrolled by unroll factor.
1059  int64_t step = forOp.getStepAsInt();
1060  forOp.setStep(step * unrollFactor);
1061  generateUnrolledLoop(
1062  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1063  [&](unsigned i, Value iv, OpBuilder b) {
1064  // iv' = iv + i * step
1065  auto d0 = b.getAffineDimExpr(0);
1066  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1067  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1068  },
1069  /*annotateFn=*/annotateFn,
1070  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1071 
1072  // Promote the loop body up if this has turned into a single iteration loop.
1073  (void)promoteIfSingleIteration(forOp);
1074  return success();
1075 }
1076 
1077 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1078  uint64_t unrollJamFactor) {
1079  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1080  if (mayBeConstantTripCount.has_value() &&
1081  *mayBeConstantTripCount < unrollJamFactor)
1082  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1083  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1084 }
1085 
1086 /// Check if all control operands of all loops are defined outside of `forOp`
1087 /// and return false if not.
1088 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1089  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1090  for (auto controlOperand : aForOp.getControlOperands()) {
1091  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1092  return WalkResult::interrupt();
1093  }
1094  return WalkResult::advance();
1095  });
1096  return !walkResult.wasInterrupted();
1097 }
1098 
1099 /// Unrolls and jams this loop by the specified factor.
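/// Illustrative example (not from the original source): unroll-and-jam by 2 of
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to %N { S(%i, %j) }
///   }
/// roughly yields
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to %N { S(%i, %j); S(%i + 1, %j) }
///   }
/// i.e. the unrolled copies of the outer body are fused ("jammed") into the
/// inner loop rather than duplicating the inner loop itself.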
1100 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1101  uint64_t unrollJamFactor) {
1102  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1103 
1104  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1105  if (unrollJamFactor == 1) {
1106  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1107  failed(promoteIfSingleIteration(forOp)))
1108  return failure();
1109  return success();
1110  }
1111 
1112  // Nothing in the loop body other than the terminator.
1113  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1114  return success();
1115 
1116  // If the trip count is lower than the unroll jam factor, no unroll jam.
1117  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1118  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1119  return failure();
1120  }
1121 
1122  // If any control operand of any inner loop of `forOp` is defined within
1123  // `forOp`, no unroll jam.
1124  if (!areInnerBoundsInvariant(forOp))
1125  return failure();
1126 
1127  // Gather all sub-blocks to jam upon the loop being unrolled.
1128  JamBlockGatherer<AffineForOp> jbg;
1129  jbg.walk(forOp);
1130  auto &subBlocks = jbg.subBlocks;
1131 
1132  // Collect loops with iter_args.
1133  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1134  forOp.walk([&](AffineForOp aForOp) {
1135  if (aForOp.getNumIterOperands() > 0)
1136  loopsWithIterArgs.push_back(aForOp);
1137  });
1138 
1139  // Get supported reductions to be used for creating reduction ops at the end.
1140  SmallVector<LoopReduction> reductions;
1141  if (forOp.getNumIterOperands() > 0)
1142  getSupportedReductions(forOp, reductions);
1143 
1144  // Generate the cleanup loop if trip count isn't a multiple of
1145  // unrollJamFactor.
1146  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1147  // Loops where the lower bound is a max expression or the upper bound is
1148  // a min expression and the trip count doesn't divide the unroll factor
1149  // can't be unrolled since the lower bound of the cleanup loop in such cases
1150  // cannot be expressed as an affine function or a max over affine functions.
1151  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1152  forOp.getUpperBoundMap().getNumResults() != 1)
1153  return failure();
1154  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1155  assert(false && "cleanup loop lower bound map for single result lower "
1156  "and upper bound maps can always be determined");
1157  }
1158 
1159  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1160  // iteration. There are (`unrollJamFactor` - 1) iterations.
1161  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1162 
1163  // For any loop with iter_args, replace it with a new loop that has
1164  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1165  // operands.
1166  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1167  IRRewriter rewriter(forOp.getContext());
1168  for (AffineForOp oldForOp : loopsWithIterArgs) {
1169  SmallVector<Value> dupIterOperands, dupYieldOperands;
1170  ValueRange oldIterOperands = oldForOp.getInits();
1171  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1172  ValueRange oldYieldOperands =
1173  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1174  // Get additional iterOperands, iterArgs, and yield operands. We will
1175  // fix iterOperands and yield operands after cloning of sub-blocks.
1176  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1177  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1178  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1179  }
1180  // Create a new loop with additional iterOperands, iter_args and yield
1181  // operands. This new loop will take the loop body of the original loop.
1182  bool forOpReplaced = oldForOp == forOp;
1183  AffineForOp newForOp =
1184  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1185  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1186  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1187  return dupYieldOperands;
1188  }));
1189  newLoopsWithIterArgs.push_back(newForOp);
1190  // `forOp` has been replaced with a new loop.
1191  if (forOpReplaced)
1192  forOp = newForOp;
1193  // Update `operandMaps` for `newForOp` iterArgs and results.
1194  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1195  unsigned oldNumIterArgs = oldIterArgs.size();
1196  ValueRange newResults = newForOp.getResults();
1197  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1198  assert(oldNumIterArgs == oldNumResults &&
1199  "oldNumIterArgs must be the same as oldNumResults");
1200  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1201  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1202  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1203  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1204  // to those in the `i`th new set.
1205  operandMaps[i - 1].map(newIterArgs[j],
1206  newIterArgs[i * oldNumIterArgs + j]);
1207  operandMaps[i - 1].map(newResults[j],
1208  newResults[i * oldNumResults + j]);
1209  }
1210  }
1211  }
1212 
1213  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1214  int64_t step = forOp.getStepAsInt();
1215  forOp.setStep(step * unrollJamFactor);
1216 
1217  auto forOpIV = forOp.getInductionVar();
1218  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1219  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1220  for (auto &subBlock : subBlocks) {
1221  // Builder to insert unroll-jammed bodies. Insert right at the end of
1222  // sub-block.
1223  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1224 
1225  // If the induction variable is used, create a remapping to the value for
1226  // this unrolled instance.
1227  if (!forOpIV.use_empty()) {
1228  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1229  auto d0 = builder.getAffineDimExpr(0);
1230  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1231  auto ivUnroll =
1232  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1233  operandMaps[i - 1].map(forOpIV, ivUnroll);
1234  }
1235  // Clone the sub-block being unroll-jammed.
1236  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1237  builder.clone(*it, operandMaps[i - 1]);
1238  }
1239  // Fix iterOperands and yield op operands of newly created loops.
1240  for (auto newForOp : newLoopsWithIterArgs) {
1241  unsigned oldNumIterOperands =
1242  newForOp.getNumIterOperands() / unrollJamFactor;
1243  unsigned numControlOperands = newForOp.getNumControlOperands();
1244  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1245  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1246  assert(oldNumIterOperands == oldNumYieldOperands &&
1247  "oldNumIterOperands must be the same as oldNumYieldOperands");
1248  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1249  // The `i`th duplication of an old iterOperand or yield op operand
1250  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1251  // if such mapped value exists.
1252  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1253  operandMaps[i - 1].lookupOrDefault(
1254  newForOp.getOperand(numControlOperands + j)));
1255  yieldOp.setOperand(
1256  i * oldNumYieldOperands + j,
1257  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1258  }
1259  }
1260  }
1261  if (forOp.getNumResults() > 0) {
1262  // Create reduction ops to combine every `unrollJamFactor` related results
1263  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1264  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1265  // %1.
1266  rewriter.setInsertionPointAfter(forOp);
1267  auto loc = forOp.getLoc();
1268  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1269  for (LoopReduction &reduction : reductions) {
1270  unsigned pos = reduction.iterArgPosition;
1271  Value lhs = forOp.getResult(pos);
1272  Value rhs;
1273  SmallPtrSet<Operation *, 4> newOps;
1274  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1275  rhs = forOp.getResult(i * oldNumResults + pos);
1276  // Create ops based on reduction type.
1277  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1278  if (!lhs)
1279  return failure();
1280  Operation *op = lhs.getDefiningOp();
1281  assert(op && "Reduction op should have been created");
1282  newOps.insert(op);
1283  }
1284  // Replace all uses except those in newly created reduction ops.
1285  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1286  }
1287  }
1288 
1289  // Promote the loop body up if this has turned into a single iteration loop.
1290  (void)promoteIfSingleIteration(forOp);
1291  return success();
1292 }
1293 
1294 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1295 /// nested within 'forOpA' as the only non-terminator operation in its block.
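/// Illustrative example (not from the original source): interchanging
///   affine.for %i = 0 to 10 { affine.for %j = 0 to 20 { S(%i, %j) } }
/// yields
///   affine.for %j = 0 to 20 { affine.for %i = 0 to 10 { S(%i, %j) } }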
1296 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1297  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1298  auto &forOpABody = forOpA.getBody()->getOperations();
1299  auto &forOpBBody = forOpB.getBody()->getOperations();
1300 
1301  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1302  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1303  // body containing only the terminator.
1304  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1305  forOpABody, forOpABody.begin(),
1306  std::prev(forOpABody.end()));
1307  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1308  // body (this leaves forOpB's body containing only the terminator).
1309  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1310  std::prev(forOpBBody.end()));
1311  // 3) Splice forOpA into the beginning of forOpB's body.
1312  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1313  Block::iterator(forOpA));
1314 }
1315 
1316 // Checks each dependence component against the permutation to see if the
1317 // desired loop interchange would violate dependences by making the
1318 // dependence component lexicographically negative.
1319 static bool checkLoopInterchangeDependences(
1320  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1321  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1322  // Invert permutation map.
1323  unsigned maxLoopDepth = loops.size();
1324  SmallVector<unsigned, 4> loopPermMapInv;
1325  loopPermMapInv.resize(maxLoopDepth);
1326  for (unsigned i = 0; i < maxLoopDepth; ++i)
1327  loopPermMapInv[loopPermMap[i]] = i;
1328 
1329  // Check each dependence component against the permutation to see if the
1330  // desired loop interchange permutation would make the dependence vectors
1331  // lexicographically negative.
1332  // Example 1: [-1, 1][0, 0]
1333  // Example 2: [0, 0][-1, 1]
1334  for (const auto &depComps : depCompsVec) {
1335  assert(depComps.size() >= maxLoopDepth);
1336  // Check if the first non-zero dependence component is positive.
1337  // This iterates through loops in the desired order.
1338  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1339  unsigned permIndex = loopPermMapInv[j];
1340  assert(depComps[permIndex].lb);
1341  int64_t depCompLb = *depComps[permIndex].lb;
1342  if (depCompLb > 0)
1343  break;
1344  if (depCompLb < 0)
1345  return false;
1346  }
1347  }
1348  return true;
1349 }
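// Worked example (editorial sketch, not from the original source): for a 2-d
// nest with a single dependence whose components are (+1, -1), the identity
// permutation {0, 1} is accepted since the first component scanned in the new
// loop order is +1, i.e. positive. The swap permutation {1, 0} scans the -1
// component first, which would make the dependence vector lexicographically
// negative, so the function returns false for it.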
1350 
1351 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1352 /// nested sequence of loops in 'loops' would violate dependences.
1353 bool mlir::affine::isValidLoopInterchangePermutation(
1354  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1355  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1356  unsigned maxLoopDepth = loops.size();
1357  if (maxLoopDepth == 1)
1358  return true;
1359  // Gather dependence components for dependences between all ops in loop nest
1360  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1361  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1362  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1363  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1364 }
1365 
1366 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1367 /// in it from outermost to innermost.
1368 bool LLVM_ATTRIBUTE_UNUSED
1369 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1370  assert(!loops.empty() && "no loops provided");
1371 
1372  // We already know that the block can't be empty.
1373  auto hasTwoElements = [](Block *block) {
1374  auto secondOpIt = std::next(block->begin());
1375  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1376  };
1377 
1378  auto enclosingLoop = loops.front();
1379  for (auto loop : loops.drop_front()) {
1380  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1381  // parentForOp's body should be just this loop and the terminator.
1382  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1383  return false;
1384  enclosingLoop = loop;
1385  }
1386  return true;
1387 }
1388 
1389 // input[i] should move from position i -> permMap[i]. Returns the position in
1390 // `input` that becomes the new outermost loop.
1391 unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1392  ArrayRef<unsigned> permMap) {
1393  assert(input.size() == permMap.size() && "invalid permutation map size");
1394  // Check whether the permutation spec is valid. This is a small vector - we'll
1395  // just sort and check if it's iota.
1396  SmallVector<unsigned, 4> checkPermMap(permMap);
1397  llvm::sort(checkPermMap);
1398  if (llvm::any_of(llvm::enumerate(checkPermMap),
1399  [](const auto &en) { return en.value() != en.index(); }))
1400  assert(false && "invalid permutation map");
1401 
1402  // Nothing to do.
1403  if (input.size() < 2)
1404  return 0;
1405 
1406  assert(isPerfectlyNested(input) && "input not perfectly nested");
1407 
1408  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1409  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1410  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1411  for (unsigned i = 0, e = input.size(); i < e; ++i)
1412  invPermMap.push_back({permMap[i], i});
1413  llvm::sort(invPermMap);
1414 
1415  // Move the innermost loop body to the loop that would be the innermost in the
1416  // permuted nest (only if the innermost loop is going to change).
1417  if (permMap.back() != input.size() - 1) {
1418  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1419  Block *srcBody = ((AffineForOp)input.back()).getBody();
1420  destBody->getOperations().splice(destBody->begin(),
1421  srcBody->getOperations(), srcBody->begin(),
1422  std::prev(srcBody->end()));
1423  }
1424 
1425  // We'll move each loop in `input` in the reverse order so that its body is
1426  // empty when we are moving it; this incurs zero copies and no erasing.
1427  for (int i = input.size() - 1; i >= 0; --i) {
1428  // If this has to become the outermost loop after permutation, add it to the
1429  // parent block of the original root.
1430  if (permMap[i] == 0) {
1431  // If the root remains the same, nothing to do.
1432  if (i == 0)
1433  continue;
1434  // Make input[i] the new outermost loop moving it into parentBlock.
1435  auto *parentBlock = input[0]->getBlock();
1436  parentBlock->getOperations().splice(Block::iterator(input[0]),
1437  input[i]->getBlock()->getOperations(),
1438  Block::iterator(input[i]));
1439  continue;
1440  }
1441 
1442  // If the parent in the permuted order is the same as in the original,
1443  // nothing to do.
1444  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1445  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1446  continue;
1447 
1448  // Move input[i] to its surrounding loop in the transformed nest.
1449  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1450  destBody->getOperations().splice(destBody->begin(),
1451  input[i]->getBlock()->getOperations(),
1452  Block::iterator(input[i]));
1453  }
1454 
1455  return invPermMap[0].second;
1456 }
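// Usage sketch (editorial; assumes `loops` holds a perfectly nested i/j/k band
// gathered with getPerfectlyNestedLoops): the permutation below sends the
// outermost loop to the innermost position and shifts the other two outward,
// i.e. (i, j, k) -> (j, k, i). The return value is the index into `loops` of
// the loop that ends up outermost (here, loops[1]).
//
//   SmallVector<unsigned, 4> permMap = {2, 0, 1};
//   unsigned newRootIdx = permuteLoops(loops, permMap);
//   AffineForOp newRoot = loops[newRootIdx];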
1457 
1458 // Sinks all sequential loops to the innermost levels (while preserving
1459 // relative order among them) and moves all parallel loops to the
1460 // outermost (while again preserving relative order among them).
1461 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1462  SmallVector<AffineForOp, 4> loops;
1463  getPerfectlyNestedLoops(loops, forOp);
1464  if (loops.size() < 2)
1465  return forOp;
1466 
1467  // Gather dependence components for dependences between all ops in loop nest
1468  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1469  unsigned maxLoopDepth = loops.size();
1470  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1471  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1472 
1473  // Mark loops as either parallel or sequential.
1474  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1475  for (auto &depComps : depCompsVec) {
1476  assert(depComps.size() >= maxLoopDepth);
1477  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1478  DependenceComponent &depComp = depComps[j];
1479  assert(depComp.lb.has_value() && depComp.ub.has_value());
1480  if (*depComp.lb != 0 || *depComp.ub != 0)
1481  isParallelLoop[j] = false;
1482  }
1483  }
1484 
1485  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1486 
1487  // Compute permutation of loops that sinks sequential loops (and thus raises
1488  // parallel loops) while preserving relative order.
1489  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1490  unsigned nextSequentialLoop = numParallelLoops;
1491  unsigned nextParallelLoop = 0;
1492  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1493  if (isParallelLoop[i]) {
1494  loopPermMap[i] = nextParallelLoop++;
1495  } else {
1496  loopPermMap[i] = nextSequentialLoop++;
1497  }
1498  }
1499 
1500  // Check if permutation 'loopPermMap' would violate dependences.
1501  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1502  return forOp;
1503  // Perform loop interchange according to permutation 'loopPermMap'.
1504  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1505  return loops[loopNestRootIndex];
1506 }
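// Editorial illustration (not from the original source): if dependence
// analysis finds that only the middle loop of an (i, j, k) nest carries a
// dependence, i and k are parallel and the computed permutation is
// loopPermMap = {0, 2, 1}, i.e. (i, j, k) becomes (i, k, j) with the
// sequential j loop sunk innermost, provided the interchange check above
// deems it dependence-safe.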
1507 
1508 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1509 // lower (resp. upper) loop bound. When called for both the lower and upper
1510 // bounds, the resulting IR resembles:
1511 //
1512 // ```mlir
1513 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1514 // ...
1515 // }
1516 // ```
1517 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1518  SmallVector<Value, 4> *operands,
1519  int64_t offset = 0) {
1520  auto bounds = llvm::to_vector<4>(map->getResults());
1521  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1522  operands->insert(operands->begin() + map->getNumDims(), iv);
1523  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1524  b.getContext());
1525  canonicalizeMapAndOperands(map, operands);
1526 }
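// Example (editorial sketch): given the upper-bound map (d0) -> (d0 + 4) over
// the single operand %N, a new induction variable %iv, and offset = 32, this
// helper produces the map (d0, d1) -> (d0 + 4, d1 + 32) over operands
// (%N, %iv); used as an upper bound, that corresponds to
// min(%N + 4, %iv + 32) for the strip-mined inner loop.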
1527 
1528 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1529 // Stripmine-sink is a primitive building block for generalized tiling of
1530 // imperfectly nested loops.
1531 // This transformation is purely mechanical and does not check legality,
1532 // profitability or even structural correctness. It is the user's
1533 // responsibility to specify `targets` that are dominated by `forOp`.
1534 // Returns the new AffineForOps, one per `targets`, nested immediately under
1535 // each of the `targets`.
1536 static SmallVector<AffineForOp, 8>
1537 stripmineSink(AffineForOp forOp, uint64_t factor,
1538  ArrayRef<AffineForOp> targets) {
1539  auto originalStep = forOp.getStepAsInt();
1540  auto scaledStep = originalStep * factor;
1541  forOp.setStep(scaledStep);
1542 
1543  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1544 
1545  // Lower-bound map creation.
1546  auto lbMap = forOp.getLowerBoundMap();
1547  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1548  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1549 
1550  // Upper-bound map creation.
1551  auto ubMap = forOp.getUpperBoundMap();
1552  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1553  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1554  /*offset=*/scaledStep);
1555 
1556  auto iv = forOp.getInductionVar();
1557  SmallVector<AffineForOp, 8> innerLoops;
1558  for (auto t : targets) {
1559  // Insert newForOp before the terminator of `t`.
1560  auto b = OpBuilder::atBlockTerminator(t.getBody());
1561  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1562  ubOperands, ubMap, originalStep);
1563  auto begin = t.getBody()->begin();
1564  // Skip terminator and `newForOp` which is just before the terminator.
1565  auto nOps = t.getBody()->getOperations().size() - 2;
1566  newForOp.getBody()->getOperations().splice(
1567  newForOp.getBody()->getOperations().begin(),
1568  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1569  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1570  newForOp.getRegion());
1571  innerLoops.push_back(newForOp);
1572  }
1573 
1574  return innerLoops;
1575 }
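// Editorial illustration (not from the original source): strip-mining %i by a
// factor of 4 and sinking the point loop under a target %j nested inside it
// roughly rewrites
//
//   affine.for %i = 0 to %N {
//     affine.for %j = ... {
//       <body uses %i>
//     }
//   }
//
// into
//
//   affine.for %i = 0 to %N step 4 {
//     affine.for %j = ... {
//       affine.for %ii = max(0, %i) to min(%N, %i + 4) {
//         <body uses %ii>
//       }
//     }
//   }
//
// The outer step is scaled by the factor, the new point loop keeps the
// original step, and uses of the old induction variable inside each target
// are redirected to the new one.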
1576 
1577 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1578 // Returns the new AffineForOps, nested immediately under `target`.
1579 template <typename SizeType>
1580 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1581  AffineForOp target) {
1582  // TODO: Use cheap structural assertions that targets are nested under
1583  // forOp and that targets are not nested under each other when DominanceInfo
1584  // exposes the capability. It seems overkill to construct a whole function
1585  // dominance tree at this point.
1586  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1587  assert(res.size() == 1 && "Expected 1 inner forOp");
1588  return res[0];
1589 }
1590 
1591 SmallVector<SmallVector<AffineForOp, 8>, 8>
1592 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1593  ArrayRef<AffineForOp> targets) {
1594  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1595  SmallVector<AffineForOp, 8> currentTargets(targets);
1596  for (auto it : llvm::zip(forOps, sizes)) {
1597  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1598  res.push_back(step);
1599  currentTargets = step;
1600  }
1601  return res;
1602 }
1603 
1604 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1605  ArrayRef<uint64_t> sizes,
1606  AffineForOp target) {
1607  SmallVector<AffineForOp, 8> res;
1608  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1609  res.push_back(llvm::getSingleElement(loops));
1610  return res;
1611 }
1612 
1613 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1614  if (loops.size() < 2)
1615  return success();
1616 
1617  AffineForOp innermost = loops.back();
1618  AffineForOp outermost = loops.front();
1619  AffineBound ub = outermost.getUpperBound();
1620  AffineMap origUbMap = ub.getMap();
1621  Location loc = outermost.getLoc();
1622  OpBuilder builder(outermost);
1623  for (AffineForOp loop : loops) {
1624  // We only work on normalized loops.
1625  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1626  loop.getConstantLowerBound() != 0)
1627  return failure();
1628  }
1629  SmallVector<Value, 4> upperBoundSymbols;
1630  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1631  ub.getOperands().end());
1632 
1633  // 1. Store the upper bound of the outermost loop in a variable.
1634  Value prev;
1635  if (!llvm::hasSingleElement(origUbMap.getResults()))
1636  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1637  else
1638  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1639  upperBoundSymbols.push_back(prev);
1640 
1641  // 2. Emit code computing the upper bound of the coalesced loop as product of
1642  // the number of iterations of all loops.
1643  for (AffineForOp loop : loops.drop_front()) {
1644  ub = loop.getUpperBound();
1645  origUbMap = ub.getMap();
1646  ubOperands = ub.getOperands();
1647  Value upperBound;
1648  // If upper bound map has more than one result, take their minimum.
1649  if (!llvm::hasSingleElement(origUbMap.getResults()))
1650  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1651  else
1652  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1653  upperBoundSymbols.push_back(upperBound);
1654  SmallVector<Value, 4> operands;
1655  operands.push_back(prev);
1656  operands.push_back(upperBound);
1657  // Maintain running product of loop upper bounds.
1658  prev = builder.create<AffineApplyOp>(
1659  loc,
1660  AffineMap::get(/*dimCount=*/1,
1661  /*symbolCount=*/1,
1662  builder.getAffineDimExpr(0) *
1663  builder.getAffineSymbolExpr(0)),
1664  operands);
1665  }
1666  // Set upper bound of the coalesced loop.
1667  AffineMap newUbMap = AffineMap::get(
1668  /*dimCount=*/0,
1669  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1670  outermost.setUpperBound(prev, newUbMap);
1671 
1672  builder.setInsertionPointToStart(outermost.getBody());
1673 
1674  // 3. Remap induction variables. For each original loop, the value of the
1675  // induction variable can be obtained by dividing the induction variable of
1676  // the linearized loop by the total number of iterations of the loops nested
1677  // in it modulo the number of iterations in this loop (remove the values
1678  // related to the outer loops):
1679  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1680  // Compute these iteratively from the innermost loop by creating a "running
1681  // quotient" of division by the range.
1682  Value previous = outermost.getInductionVar();
1683  for (unsigned idx = loops.size(); idx > 0; --idx) {
1684  if (idx != loops.size()) {
1685  SmallVector<Value, 4> operands;
1686  operands.push_back(previous);
1687  operands.push_back(upperBoundSymbols[idx]);
1688  previous = builder.create<AffineApplyOp>(
1689  loc,
1690  AffineMap::get(
1691  /*dimCount=*/1, /*symbolCount=*/1,
1692  builder.getAffineDimExpr(0).floorDiv(
1693  builder.getAffineSymbolExpr(0))),
1694  operands);
1695  }
1696  // Modified value of the induction variables of the nested loops after
1697  // coalescing.
1698  Value inductionVariable;
1699  if (idx == 1) {
1700  inductionVariable = previous;
1701  } else {
1702  SmallVector<Value, 4> applyOperands;
1703  applyOperands.push_back(previous);
1704  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1705  inductionVariable = builder.create<AffineApplyOp>(
1706  loc,
1707  AffineMap::get(
1708  /*dimCount=*/1, /*symbolCount=*/1,
1709  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1710  applyOperands);
1711  }
1712  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1713  inductionVariable, loops.back().getRegion());
1714  }
1715 
1716  // 4. Move the operations from the innermost just above the second-outermost
1717  // loop, delete the extra terminator and the second-outermost loop.
1718  AffineForOp secondOutermostLoop = loops[1];
1719  innermost.getBody()->back().erase();
1720  outermost.getBody()->getOperations().splice(
1721  Block::iterator(secondOutermostLoop.getOperation()),
1722  innermost.getBody()->getOperations());
1723  secondOutermostLoop.erase();
1724  return success();
1725 }
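// Editorial illustration (not from the original source): coalescing a
// normalized 2-d nest
//
//   affine.for %i = 0 to %M {
//     affine.for %j = 0 to %N {
//       use(%i, %j)
//     }
//   }
//
// conceptually yields a single loop of %M * %N iterations whose original
// induction variables are reconstructed with floordiv/mod (emitted as
// affine.apply ops in the actual IR):
//
//   affine.for %k = 0 to %M * %N {
//     %i = %k floordiv %N
//     %j = %k mod %N
//     use(%i, %j)
//   }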
1726 
1727 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1728  ArrayRef<Value> processorId,
1729  ArrayRef<Value> numProcessors) {
1730  assert(processorId.size() == numProcessors.size());
1731  if (processorId.empty())
1732  return;
1733 
1734  OpBuilder b(forOp);
1735  Location loc(forOp.getLoc());
1736  AffineExpr lhs, rhs;
1737  bindSymbols(forOp.getContext(), lhs, rhs);
1738  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1739  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1740 
1741  Value linearIndex = processorId.front();
1742  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1743  auto mulApplyOp = b.create<AffineApplyOp>(
1744  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1745  linearIndex = b.create<AffineApplyOp>(
1746  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1747  }
1748 
1749  auto mulApplyOp = b.create<AffineApplyOp>(
1750  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1751  Value lb = b.create<AffineApplyOp>(
1752  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1753  forOp.setLowerBound(lb);
1754 
1755  Value step = forOp.getStep();
1756  for (auto numProcs : numProcessors)
1757  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1758  forOp.setStep(step);
1759 }
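// Worked example (editorial sketch): for a 2-d processor grid with ids
// (%px, %py) and sizes (%nx, %ny), the linearized id is %px * %ny + %py, and a
// loop `for %i = %lb to %ub step %s` is rewritten so that each processor
// starts at its own offset and strides over the whole grid:
//
//   for %i = %lb + (%px * %ny + %py) * %s  to %ub  step %s * %nx * %ny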
1760 
1761 /// Given a memref region, determine the lowest depth at which transfers can be
1762 /// placed for it, and return the corresponding block, start and end positions
1763 /// in the block for placing incoming (read) and outgoing (write) copies
1764 /// respectively. The lowest depth depends on whether the region being accessed
1765 /// is hoistable with respect to one or more immediately surrounding loops.
1766 static void
1767 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1768  Block::iterator &begin, Block::iterator &end,
1769  Block **copyPlacementBlock,
1770  Block::iterator *copyInPlacementStart,
1771  Block::iterator *copyOutPlacementStart) {
1772  const auto *cst = region.getConstraints();
1773  SmallVector<Value, 4> symbols;
1774  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1775 
1776  SmallVector<Operation *, 4> enclosingAffineOps;
1777  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1778  // Walk up loop parents till we find an IV on which this region is
1779  // symbolic/variant or we hit `hoistGuard`.
1780  auto it = enclosingAffineOps.rbegin();
1781  AffineForOp lastInvariantFor;
1782  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1783  Operation *enclosingOp = *it;
1784  // We can't hoist past the definition of the memref being copied.
1785  Value memref = region.memref;
1786  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1787  LLVM_DEBUG(
1788  llvm::dbgs()
1789  << "memref definition will end up not dominating hoist location\n");
1790  break;
1791  }
1792 
1793  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1794  if (!affineFor)
1795  break;
1796  // TODO: also need to be checking this for regions symbols that
1797  // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1798  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1799  break;
1800  lastInvariantFor = affineFor;
1801  }
1802 
1803  if (it != enclosingAffineOps.rbegin()) {
1804  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1805  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1806  *copyPlacementBlock = lastInvariantFor->getBlock();
1807  } else {
1808  *copyInPlacementStart = begin;
1809  *copyOutPlacementStart = end;
1810  *copyPlacementBlock = &block;
1811  }
1812 }
1813 
1814 // Info comprising stride and number of elements transferred every stride.
1815 struct StrideInfo {
1816  int64_t stride;
1817  int64_t numEltPerStride;
1818 };
1819 
1820 /// Returns striding information for a copy/transfer of this region with
1821 /// potentially multiple striding levels from outermost to innermost. For an
1822 /// n-dimensional region, there can be at most n-1 levels of striding
1823 /// successively nested.
1824 // TODO: make this work with non-identity layout maps.
1825 static void getMultiLevelStrides(const MemRefRegion &region,
1826  ArrayRef<int64_t> bufferShape,
1827  SmallVectorImpl<StrideInfo> *strideInfos) {
1828  if (bufferShape.size() <= 1)
1829  return;
1830 
1831  int64_t numEltPerStride = 1;
1832  int64_t stride = 1;
1833  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1834  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1835  stride *= dimSize;
1836  numEltPerStride *= bufferShape[d];
1837  // A stride is needed only if the region has a shorter extent than the
1838  // memref along the dimension *and* has an extent greater than one along the
1839  // next major dimension.
1840  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1841  strideInfos->push_back({stride, numEltPerStride});
1842  }
1843  }
1844 }
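// Worked example (editorial sketch): for a memref of type memref<64x128xf32>,
// a buffer shape of 32x128 covers the full extent of the innermost dimension,
// so no stride entry is emitted and the transfer is contiguous. For a buffer
// shape of 32x64, dimension 1 is shorter than the memref (64 < 128) and
// dimension 0 has extent greater than one, so one striding level is recorded:
// {stride = 128, numEltPerStride = 64} (in elements).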
1845 
1846 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1847 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1848 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1849 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1850 /// contain the expressions to be subtracted out from the respective copy loop
1851 /// iterators in order to index the fast buffer. If `copyOut' is true, generates
1852 /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1853 /// copy nest is inserted.
1854 //
1855 /// The copy-in nest is generated as follows as an example for a 2-d region:
1856 /// for x = ...
1857 /// for y = ...
1858 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1859 ///
1860 static AffineForOp
1861 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1862  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1863  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1864  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1865  OpBuilder b) {
1866  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1867  return lbMap.getNumInputs() == lbOperands.size();
1868  }));
1869  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1870  return ubMap.getNumInputs() == ubOperands.size();
1871  }));
1872 
1873  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1874  // A copy nest can't be generated for 0-ranked memrefs.
1875  assert(rank != 0 && "non-zero rank memref expected");
1876  assert(lbMaps.size() == rank && "wrong number of lb maps");
1877  assert(ubMaps.size() == rank && "wrong number of ub maps");
1878 
1879  SmallVector<Value, 4> memIndices;
1880  SmallVector<AffineExpr, 4> fastBufExprs;
1881  SmallVector<Value, 4> fastBufMapOperands;
1882  AffineForOp copyNestRoot;
1883  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1884  for (unsigned d = 0; d < rank; ++d) {
1885  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1886  ubOperands, ubMaps[d]);
1887  if (d == 0)
1888  copyNestRoot = forOp;
1889 
1890  b = OpBuilder::atBlockTerminator(forOp.getBody());
1891 
1892  auto fastBufOffsetMap =
1893  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1894  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1895 
1896  // Construct the subscript for the fast memref being copied into/from:
1897  // x - offset_x.
1898  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1899  b.getAffineDimExpr(2 * d));
1900  fastBufMapOperands.push_back(offset);
1901  fastBufMapOperands.push_back(forOp.getInductionVar());
1902  mayBeDeadApplys.push_back(offset);
1903 
1904  // Subscript for the slow memref being copied.
1905  memIndices.push_back(forOp.getInductionVar());
1906  }
1907 
1908  auto fastBufMap =
1909  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1910  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1911  fastBufMap = simplifyAffineMap(fastBufMap);
1912  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1913 
1914  // Drop any dead affine.applys.
1915  for (auto applyOp : mayBeDeadApplys)
1916  if (applyOp.use_empty())
1917  applyOp.erase();
1918 
1919  if (!isCopyOut) {
1920  // Copy in.
1921  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1922  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1923  fastBufMapOperands);
1924  return copyNestRoot;
1925  }
1926 
1927  // Copy out.
1928  auto load =
1929  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1930  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1931  return copyNestRoot;
1932 }
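// Editorial illustration (not from the original source): for a 2-d read
// region with lower bounds (%o0, %o1) and constant extents 32 and 64, the
// generated copy-in nest has the form
//
//   affine.for %x = %o0 to %o0 + 32 {
//     affine.for %y = %o1 to %o1 + 64 {
//       %v = affine.load %memref[%x, %y]
//       affine.store %v, %fast_buf[%x - %o0, %y - %o1]
//     }
//   }
//
// with the subtractions folded into the store's access map rather than
// emitted as separate ops.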
1933 
1934 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1935 emitRemarkForBlock(Block &block) {
1936  return block.getParentOp()->emitRemark();
1937 }
1938 
1939 /// Creates a buffer in the faster memory space for the specified memref region
1940 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1941 /// space to this one, and replaces all loads/stores in the block range
1942 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1943 /// if copies could not be generated due to yet unimplemented cases.
1944 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1945 /// specify the insertion points where the incoming copies and outgoing copies,
1946 /// respectively, should be inserted (the insertion happens right before the
1947 /// insertion point). Since `begin` can itself be invalidated due to the memref
1948 /// rewriting done from this method, the output argument `nBegin` is set to its
1949 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1950 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1951 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1952 /// allocated.
1953 static LogicalResult generateCopy(
1954  const MemRefRegion &region, Block *block, Block::iterator begin,
1955  Block::iterator end, Block *copyPlacementBlock,
1956  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1957  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1958  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1959  Block::iterator *nBegin, Block::iterator *nEnd) {
1960  *nBegin = begin;
1961  *nEnd = end;
1962 
1963  auto f = begin->getParentOfType<FunctionOpInterface>();
1964  OpBuilder topBuilder(f.getFunctionBody());
1965  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1966 
1967  *sizeInBytes = 0;
1968 
1969  if (begin == end)
1970  return success();
1971 
1972  // Is the copy out point at the end of the block where we are doing
1973  // explicit copying.
1974  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1975 
1976  // Copies for read regions are going to be inserted at 'begin'.
1977  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1978  // Copies for write regions are going to be inserted at 'end'.
1979  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1980  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1981 
1982  // Builder to create constants at the top level.
1983  auto func =
1984  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1985  OpBuilder top(func.getFunctionBody());
1986 
1987  auto loc = region.loc;
1988  auto memref = region.memref;
1989  auto memRefType = cast<MemRefType>(memref.getType());
1990 
1991  if (!memRefType.getLayout().isIdentity()) {
1992  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1993  return failure();
1994  }
1995 
1996  // Indices to use for the copying.
1997  // Indices for the original memref being copied from/to.
1998  SmallVector<Value, 4> memIndices;
1999  // Indices for the faster buffer being copied into/from.
2000  SmallVector<Value, 4> bufIndices;
2001 
2002  unsigned rank = memRefType.getRank();
2003  if (rank == 0) {
2004  LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs are supported\n");
2005  return failure();
2006  }
2007 
2008  SmallVector<int64_t, 4> fastBufferShape;
2009 
2010  // Compute the extents of the buffer.
2011  SmallVector<AffineMap, 2> lbs;
2012  lbs.reserve(rank);
2013  std::optional<int64_t> numElements =
2014  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2015  if (!numElements) {
2016  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2017  return failure();
2018  }
2019 
2020  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2021  // This can be supported in the future if needed.
2022  LLVM_DEBUG(llvm::dbgs()
2023  << "Max lower bound for memref region start not supported\n");
2024  return failure();
2025  }
2026 
2027  if (*numElements == 0) {
2028  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2029  return success();
2030  }
2031 
2032  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2033  for (unsigned i = 0; i < rank; ++i) {
2034  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2035  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2036  LLVM_DEBUG(llvm::dbgs()
2037  << "Missing lower or upper bound for region along dimension: "
2038  << i << '\n');
2039  return failure();
2040  }
2041  }
2042 
2043  const FlatAffineValueConstraints *cst = region.getConstraints();
2044  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2045  // on; these typically include loop IVs surrounding the level at which the
2046  // copy generation is being done or other valid symbols in MLIR.
2047  SmallVector<Value, 8> regionSymbols;
2048  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2049 
2050  // Construct the access expression for the fast memory buffer. The access
2051  // expression for a particular dimension of the fast buffer is obtained by
2052  // subtracting out the lower bound on the original memref's data region
2053  // along the corresponding dimension.
2054 
2055  // Index start offsets for faster memory buffer relative to the original.
2056  SmallVector<AffineExpr, 4> fastBufOffsets;
2057  fastBufOffsets.reserve(rank);
2058  for (unsigned d = 0; d < rank; d++) {
2059  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2060  "incorrect bound size");
2061 
2062  // Set copy start location for this dimension in the lower memory space
2063  // memref.
2064  if (lbs[d].isSingleConstant()) {
2065  auto indexVal = lbs[d].getSingleConstantResult();
2066  if (indexVal == 0) {
2067  memIndices.push_back(zeroIndex);
2068  } else {
2069  memIndices.push_back(
2070  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2071  }
2072  } else {
2073  // The coordinate for the start location is just the lower bound along the
2074  // corresponding dimension on the memory region (stored in 'offset').
2075  // Remap all inputs of the map to dimensions uniformly since in the
2076  // generated IR we need valid affine symbols as opposed to "symbols" for
2077  // the purpose of the memref region.
2078  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2079  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2080  symReplacements[i] = top.getAffineDimExpr(i);
2081  lbs[d] = lbs[d].replaceDimsAndSymbols(
2082  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2083  /*numResultSyms=*/0);
2084  memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
2085  }
2086  // The fast buffer is copied into at location zero; addressing is relative.
2087  bufIndices.push_back(zeroIndex);
2088 
2089  // Record the offsets since they are needed to remap the memory accesses of
2090  // the original memref further below.
2091  fastBufOffsets.push_back(lbs[d].getResult(0));
2092  }
2093 
2094  // The faster memory space buffer.
2095  Value fastMemRef;
2096 
2097  // Check if a buffer was already created.
2098  bool existingBuf = fastBufferMap.count(memref) > 0;
2099  if (!existingBuf) {
2100  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2101  auto fastMemRefType =
2102  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2103  fastBufferLayout, copyOptions.fastMemorySpace);
2104 
2105  // Create the fast memory space buffer just before the 'affine.for'
2106  // operation.
2107  fastMemRef =
2108  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2109  // Record it.
2110  fastBufferMap[memref] = fastMemRef;
2111  // fastMemRefType is a constant shaped memref.
2112  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2113  // We don't account for things of unknown size.
2114  *sizeInBytes = maySizeInBytes.value_or(0);
2115 
2116  LLVM_DEBUG(emitRemarkForBlock(*block)
2117  << "Creating fast buffer of type " << fastMemRefType
2118  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2119  << " KiB\n");
2120  } else {
2121  // Reuse the one already created.
2122  fastMemRef = fastBufferMap[memref];
2123  }
2124 
2125  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2126 
2127  Value dmaStride;
2128  Value numEltPerDmaStride;
2129  if (copyOptions.generateDma) {
2130  SmallVector<StrideInfo, 4> dmaStrideInfos;
2131  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2132 
2133  // TODO: use all stride levels once DmaStartOp is extended for
2134  // multi-level strides.
2135  if (dmaStrideInfos.size() > 1) {
2136  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2137  return failure();
2138  }
2139 
2140  if (!dmaStrideInfos.empty()) {
2141  dmaStride =
2142  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2143  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2144  loc, dmaStrideInfos[0].numEltPerStride);
2145  }
2146  }
2147 
2148  // Record the last operation where we want the memref replacement to end. We
2149  // later do the memref replacement only in [begin, postDomFilter] so
2150  // that the original memref's used in the data movement code themselves don't
2151  // get replaced.
2152  auto postDomFilter = std::prev(end);
2153 
2154  // Create fully composed affine maps for each memref.
2155  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2156  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2157  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2158  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2159 
2160  if (!copyOptions.generateDma) {
2161  // Point-wise copy generation.
2162  auto copyNest =
2163  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2164  /*lbOperands=*/regionSymbols, ubMaps,
2165  /*ubOperands=*/regionSymbols, fastBufOffsets,
2166  /*isCopyOut=*/region.isWrite(), b);
2167 
2168  // Record this so that we can skip it from yet another copy.
2169  copyNests.insert(copyNest);
2170 
2171  // Since new ops are being appended (for copy out's), adjust the end to
2172  // mark end of block range being processed if necessary.
2173  if (region.isWrite() && isCopyOutAtEndOfBlock)
2174  *nEnd = Block::iterator(copyNest.getOperation());
2175  } else {
2176  // DMA generation.
2177  // Create a tag (single element 1-d memref) for the DMA.
2178  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2179  copyOptions.tagMemorySpace);
2180  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2181 
2182  SmallVector<Value, 4> tagIndices({zeroIndex});
2183  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2184  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2185  if (!region.isWrite()) {
2186  // DMA non-blocking read from original buffer to fast buffer.
2187  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2188  fastMemRef, bufAffineMap, bufIndices,
2189  tagMemRef, tagAffineMap, tagIndices,
2190  numElementsSSA, dmaStride, numEltPerDmaStride);
2191  } else {
2192  // DMA non-blocking write from fast buffer to the original memref.
2193  auto op = b.create<AffineDmaStartOp>(
2194  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2195  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2196  dmaStride, numEltPerDmaStride);
2197  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2198  // end to mark end of block range being processed.
2199  if (isCopyOutAtEndOfBlock)
2200  *nEnd = Block::iterator(op.getOperation());
2201  }
2202 
2203  // Matching DMA wait to block on completion; tag always has a 0 index.
2204  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2205  numElementsSSA);
2206 
2207  // Generate dealloc for the tag.
2208  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2209  if (*nEnd == end && isCopyOutAtEndOfBlock)
2210  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2211  // mark end of range of the original.
2212  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2213  }
2214 
2215  // Generate dealloc for the buffer.
2216  if (!existingBuf) {
2217  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2218  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2219  // the fast buffer (since it marks the new end insertion point).
2220  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2221  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2222  }
2223 
2224  // Replace all uses of the old memref with the faster one while remapping
2225  // access indices (subtracting out lower bound offsets for each dimension).
2226  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2227  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2228  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2229  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2230  // d2, d3 correspond to the original indices (%i, %j).
2231  SmallVector<AffineExpr, 4> remapExprs;
2232  remapExprs.reserve(rank);
2233  for (unsigned i = 0; i < rank; i++) {
2234  // The starting operands of indexRemap will be regionSymbols (the symbols on
2235  // which the memref region is parametric); then those corresponding to
2236  // the memref's original indices follow.
2237  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2238  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2239  }
2240  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2241  b.getContext());
2242 
2243  // Record the begin since it may be invalidated by memref replacement.
2244  Block::iterator prevOfBegin;
2245  bool isBeginAtStartOfBlock = (begin == block->begin());
2246  if (!isBeginAtStartOfBlock)
2247  prevOfBegin = std::prev(begin);
2248 
2249  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2250  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2251  /*extraIndices=*/{}, indexRemap,
2252  /*extraOperands=*/regionSymbols,
2253  /*symbolOperands=*/{},
2254  /*domOpFilter=*/&*begin,
2255  /*postDomOpFilter=*/&*postDomFilter);
2256 
2257  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2258 
2259  return success();
2260 }
2261 
2262 /// Construct the memref region to just include the entire memref. Returns
2263 /// false for dynamic shaped memrefs for now. `numParamLoopIVs` is the number of
2264 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2265 /// is parametric on.
2266 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2267  MemRefRegion *region) {
2268  unsigned rank;
2269  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2270  rank = loadOp.getMemRefType().getRank();
2271  region->memref = loadOp.getMemRef();
2272  region->setWrite(false);
2273  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2274  rank = storeOp.getMemRefType().getRank();
2275  region->memref = storeOp.getMemRef();
2276  region->setWrite(true);
2277  } else {
2278  assert(false && "expected load or store op");
2279  return false;
2280  }
2281  auto memRefType = cast<MemRefType>(region->memref.getType());
2282  if (!memRefType.hasStaticShape())
2283  return false;
2284 
2285  auto *regionCst = region->getConstraints();
2286 
2287  // Just get the first numSymbols IVs, which the memref region is parametric
2288  // on.
2289  SmallVector<AffineForOp, 4> ivs;
2290  getAffineForIVs(*op, &ivs);
2291  ivs.resize(numParamLoopIVs);
2292  SmallVector<Value, 4> symbols;
2293  extractForInductionVars(ivs, &symbols);
2294  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2295  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2296 
2297  // Memref dim sizes provide the bounds.
2298  for (unsigned d = 0; d < rank; d++) {
2299  auto dimSize = memRefType.getDimSize(d);
2300  assert(dimSize > 0 && "filtered dynamic shapes above");
2301  regionCst->addBound(BoundType::LB, d, 0);
2302  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2303  }
2304  return true;
2305 }
2306 
2307 LogicalResult
2308 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2309  const AffineCopyOptions &copyOptions,
2310  std::optional<Value> filterMemRef,
2311  DenseSet<Operation *> &copyNests) {
2312  if (begin == end)
2313  return success();
2314 
2315  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2316  "Inconsistent block begin/end args");
2317  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2318 
2319  Block *block = begin->getBlock();
2320 
2321  // Copies will be generated for this depth, i.e., symbolic in all loops
2322  // surrounding this block range.
2323  unsigned copyDepth = getNestingDepth(&*begin);
2324 
2325  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2326  << "\n");
2327  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2328  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2329 
2330  // List of memory regions to copy for. We need a map vector to have a
2331  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2332  // since the allocs, for example, are identical except for the SSA id.
2333  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2334  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2335 
2336  // Map from original memref's to the fast buffers that their accesses are
2337  // replaced with.
2338  DenseMap<Value, Value> fastBufferMap;
2339 
2340  // To check for errors when walking the block.
2341  bool error = false;
2342 
2343  // Walk this range of operations to gather all memory regions.
2344  block->walk(begin, end, [&](Operation *opInst) {
2345  Value memref;
2346  MemRefType memrefType;
2347  // Gather regions to allocate to buffers in faster memory space.
2348  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2349  memref = loadOp.getMemRef();
2350  memrefType = loadOp.getMemRefType();
2351  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2352  memref = storeOp.getMemRef();
2353  memrefType = storeOp.getMemRefType();
2354  }
2355  // Not an affine.load/store op.
2356  if (!memref)
2357  return;
2358 
2359  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2360  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2361  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2362  return;
2363 
2364  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2365  LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
2366  "which copy-in/copy-out would happen\n");
2367  return;
2368  }
2369 
2370  // Compute the MemRefRegion accessed.
2371  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2372  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2373  /*addMemRefDimBounds=*/false))) {
2374  LLVM_DEBUG(llvm::dbgs()
2375  << "Error obtaining memory region: semi-affine maps?\n");
2376  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2377  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2378  LLVM_DEBUG(
2379  opInst->emitError("non-constant memref sizes not yet supported"));
2380  error = true;
2381  return;
2382  }
2383  }
2384 
2385  // Each memref has a single buffer associated with it irrespective of how
2386  // many load's and store's happen on it.
2387  // TODO: in the future, when regions don't intersect and satisfy
2388  // other properties (based on load/store regions), we could consider
2389  // multiple buffers per memref.
2390 
2391  // Add to the appropriate region if it's not already in it, or take a
2392  // bounding box union with the existing one if it's already in there.
2393  // Note that a memref may have both read and write regions - so update the
2394  // region in the other list if one exists (write in case of read and vice
2395  // versa) since there is a single bounding box for a memref across all reads
2396  // and writes that happen on it.
2397 
2398  // Attempts to update; returns true if 'region' exists in targetRegions.
2399  auto updateRegion =
2400  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2401  &targetRegions) {
2402  const auto *const it = targetRegions.find(region->memref);
2403  if (it == targetRegions.end())
2404  return false;
2405 
2406  // Perform a union with the existing region.
2407  if (failed(it->second->unionBoundingBox(*region))) {
2408  LLVM_DEBUG(llvm::dbgs()
2409  << "Memory region bounding box failed; "
2410  "over-approximating to the entire memref\n");
2411  // If the union fails, we will overapproximate.
2412  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2413  LLVM_DEBUG(opInst->emitError(
2414  "non-constant memref sizes not yet supported"));
2415  error = true;
2416  return true;
2417  }
2418  it->second->getConstraints()->clearAndCopyFrom(
2419  *region->getConstraints());
2420  } else {
2421  // Union was computed and stored in 'it->second': copy to 'region'.
2422  region->getConstraints()->clearAndCopyFrom(
2423  *it->second->getConstraints());
2424  }
2425  return true;
2426  };
2427 
2428  bool existsInRead = updateRegion(readRegions);
2429  if (error)
2430  return;
2431  bool existsInWrite = updateRegion(writeRegions);
2432  if (error)
2433  return;
2434 
2435  // Finally add it to the region list.
2436  if (region->isWrite() && !existsInWrite) {
2437  writeRegions[region->memref] = std::move(region);
2438  } else if (!region->isWrite() && !existsInRead) {
2439  readRegions[region->memref] = std::move(region);
2440  }
2441  });
2442 
2443  if (error) {
2444  LLVM_DEBUG(begin->emitError(
2445  "copy generation failed for one or more memref's in this block\n"));
2446  return failure();
2447  }
2448 
2449  uint64_t totalCopyBuffersSizeInBytes = 0;
2450  bool ret = true;
2451  auto processRegions =
2452  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2453  &regions) {
2454  for (const auto &regionEntry : regions) {
2455  // For each region, hoist copy in/out past all hoistable
2456  // 'affine.for's.
2457  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2458  Block *copyPlacementBlock;
2459  findHighestBlockForPlacement(
2460  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2461  &copyInPlacementStart, &copyOutPlacementStart);
2462 
2463  uint64_t sizeInBytes;
2464  Block::iterator nBegin, nEnd;
2465  LogicalResult iRet = generateCopy(
2466  *regionEntry.second, block, begin, end, copyPlacementBlock,
2467  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2468  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2469  if (succeeded(iRet)) {
2470  // begin/end could have been invalidated, and need update.
2471  begin = nBegin;
2472  end = nEnd;
2473  totalCopyBuffersSizeInBytes += sizeInBytes;
2474  }
2475  ret = ret & succeeded(iRet);
2476  }
2477  };
2478  processRegions(readRegions);
2479  processRegions(writeRegions);
2480 
2481  if (!ret) {
2482  LLVM_DEBUG(begin->emitError(
2483  "copy generation failed for one or more memref's in this block\n"));
2484  return failure();
2485  }
2486 
2487  // For a range of operations, a note will be emitted at the caller.
2488  AffineForOp forOp;
2489  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2490  LLVM_DEBUG(forOp.emitRemark()
2491  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2492  << " KiB of copy buffers in fast memory space for this block");
2493  }
2494 
2495  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2496  block->getParentOp()->emitWarning(
2497  "total size of all copy buffers for this block exceeds fast memory "
2498  "capacity");
2499  }
2500 
2501  return success();
2502 }
2503 
2504 // A convenience version of affineDataCopyGenerate for all ops in the body of
2505 // an AffineForOp.
2506 LogicalResult mlir::affine::affineDataCopyGenerate(
2507  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2508  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2509  return affineDataCopyGenerate(forOp.getBody()->begin(),
2510  std::prev(forOp.getBody()->end()), copyOptions,
2511  filterMemRef, copyNests);
2512 }
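// Usage sketch (editorial; the option values are illustrative only): generate
// point-wise copies into memory space 1 for all memrefs accessed in the body
// of `forOp`:
//
//   AffineCopyOptions options;
//   options.generateDma = false;
//   options.slowMemorySpace = 0;
//   options.fastMemorySpace = 1;
//   options.tagMemorySpace = 0;
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);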
2513 
2514 LogicalResult mlir::affine::generateCopyForMemRegion(
2515  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2516  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2517  Block *block = analyzedOp->getBlock();
2518  auto begin = analyzedOp->getIterator();
2519  auto end = std::next(begin);
2520  DenseMap<Value, Value> fastBufferMap;
2521  DenseSet<Operation *> copyNests;
2522 
2523  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2524  copyOptions, fastBufferMap, copyNests,
2525  &result.sizeInBytes, &begin, &end);
2526  if (failed(err))
2527  return err;
2528 
2529  const auto &en = fastBufferMap.find(memrefRegion.memref);
2530  // In some cases (empty loops), no copy generation would have happened.
2531  if (en == fastBufferMap.end())
2532  return failure();
2533  result.alloc = en->second.getDefiningOp();
2534  assert(result.alloc && "fast buffer expected to be locally allocated");
2535  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2536  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2537  return success();
2538 }
2539 
2540 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2541 static void
2542 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2543  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2544  // Add a new empty level to the output if it doesn't already exist.
2545  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2546  if (currLoopDepth == depthToLoops.size())
2547  depthToLoops.emplace_back();
2548 
2549  for (auto &op : *block) {
2550  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2551  depthToLoops[currLoopDepth].push_back(forOp);
2552  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2553  }
2554  }
2555 }
2556 
2557 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2558 void mlir::affine::gatherLoops(
2559  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2560  for (auto &block : func)
2561  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2562 
2563  // Remove last loop level from output since it's empty.
2564  if (!depthToLoops.empty()) {
2565  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2566  depthToLoops.pop_back();
2567  }
2568 }
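// Editorial illustration (not from the original source): for a function body
// containing
//
//   affine.for %i { affine.for %j { ... } }
//   affine.for %k { ... }
//
// this fills depthToLoops with two levels:
//   depthToLoops[0] = {the %i loop, the %k loop}
//   depthToLoops[1] = {the %j loop}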
2569 
2570 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2571  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2572  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2573  SmallVector<Value, 4> lowerOperands(lbOperands);
2574  SmallVector<Value, 4> upperOperands(ubOperands);
2575 
2576  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2577  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2578  lbMap = removeDuplicateExprs(lbMap);
2579  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2580  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2581  ubMap = removeDuplicateExprs(ubMap);
2582 
2583  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2584  step);
2585 }
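// Usage sketch (editorial; assumes `builder`, `loc`, and bound maps/operands
// are already in scope):
//
//   AffineForOp loop = createCanonicalizedAffineForOp(
//       builder, loc, lbOperands, lbMap, ubOperands, ubMap, /*step=*/1);
//
// Compared with constructing an AffineForOp directly, this first composes and
// canonicalizes the bound maps and drops duplicate bound expressions, so the
// emitted loop bounds come out in a simplified form.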
2586 
2587 /// Creates an AffineIfOp that encodes the conditional to choose between
2588 /// the constant trip count version and an unknown trip count version of this
2589 /// nest of loops. This is used to separate partial and full tiles if `loops`
2590 /// has the intra-tile loops. The affine.if op is inserted at the builder
2591 /// insertion point of `b`.
2592 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2593  OpBuilder b) {
2594  if (loops.empty())
2595  return nullptr;
2596 
2597  auto *context = loops[0].getContext();
2598 
2599  FlatAffineValueConstraints cst;
2600  SmallVector<Operation *, 8> ops;
2601  llvm::append_range(ops, loops);
2602  (void)getIndexSet(ops, &cst);
2603 
2604  // Remove constraints that are independent of these loop IVs.
2605  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2606 
2607  // Construct the constraint set representing the guard for full tiles. The
2608  // lower bound (and upper bound) corresponding to the full tile should be
2609  // larger (and resp. smaller) than any other lower (or upper bound).
2610  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2611  for (auto loop : loops) {
2612  (void)loop;
2613  // TODO: Non-unit stride is not an issue to generalize to.
2614  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2615  // Mark everything symbols for the purpose of finding a constant diff pair.
2616  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2617  1);
2618  unsigned fullTileLbPos, fullTileUbPos;
2619  if (!((IntegerRelation)cst)
2620  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2621  /*boundFloorDivisor=*/nullptr,
2622  /*ub=*/nullptr, &fullTileLbPos,
2623  &fullTileUbPos)) {
2624  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2625  return nullptr;
2626  }
2627 
2628  SmallVector<unsigned, 4> lbIndices, ubIndices;
2629  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2630 
2631  auto fLb = cst.getInequality(fullTileLbPos);
2632  auto fUb = cst.getInequality(fullTileUbPos);
2633  fullTileLb.assign(fLb.begin(), fLb.end());
2634  fullTileUb.assign(fUb.begin(), fUb.end());
2635 
2636  // Full tile lower bound should be >= than any other lower bound.
2637  for (auto lbIndex : lbIndices)
2638  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2639  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2640 
2641  // Full tile upper bound should be <= any other upper bound.
2642  for (auto ubIndex : ubIndices)
2643  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2644  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2645 
2646  cst.removeVar(0);
2647  }
2648 
2649  // The previous step leads to all zeros for the full tile lb and ub position
2650  // itself; remove those and any other duplicates / trivial redundancies.
2651  cst.removeTrivialRedundancy();
2652 
2653  // Turn everything into dims conservatively since we earlier turned all
2654  // trailing ids past point loop IV into symbols. Some of these could be outer
2655  // loop IVs; we'll canonicalize anyway.
2656  cst.setDimSymbolSeparation(0);
2657 
2658  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2659  // ifCondSet can be null if cst was empty -- this can happen if all loops
2660  // in the nest have constant trip counts.
2661  if (!ifCondSet)
2662  return nullptr;
2663 
2664  SmallVector<Value, 4> setOperands;
2665  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2666  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2667  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2668  /*withElseRegion=*/true);
2669 }
2670 
2671 /// Create the full tile loop nest (along with its body).
2672 static LogicalResult
2673 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2674  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2675  fullTileLoops.reserve(inputNest.size());
2676 
2677  // For each loop in the original nest identify a lower/upper bound pair such
2678  // that their difference is a constant.
2679  FlatAffineValueConstraints cst;
2680  for (auto loop : inputNest) {
2681  // TODO: straightforward to generalize to a non-unit stride.
2682  if (loop.getStepAsInt() != 1) {
2683  LLVM_DEBUG(llvm::dbgs()
2684  << "[tile separation] non-unit stride not implemented\n");
2685  return failure();
2686  }
2687  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2688  (void)getIndexSet(loopOp, &cst);
2689  // We will mark everything other than this loop IV as symbol for getting a
2690  // pair of <lb, ub> with a constant difference.
2691  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2692  unsigned lbPos, ubPos;
2693  if (!((IntegerRelation)cst)
2694  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2695  /*boundFloorDivisor=*/nullptr,
2696  /*ub=*/nullptr, &lbPos, &ubPos) ||
2697  lbPos == ubPos) {
2698  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2699  "equalities not yet handled\n");
2700  return failure();
2701  }
2702 
2703  // Set all variables as dimensions uniformly since some of those marked as
2704  // symbols above could be outer loop IVs (corresponding tile space IVs).
2705  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2706 
2707  AffineValueMap lbVmap, ubVmap;
2708  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2709  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2710  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2711  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2712  ubVmap.getOperands(), ubVmap.getAffineMap());
2713  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2714  fullTileLoops.push_back(fullTileLoop);
2715  }
2716 
2717  // Add the body for the full tile loop nest.
2718  IRMapping operandMap;
2719  for (const auto &loopEn : llvm::enumerate(inputNest))
2720  operandMap.map(loopEn.value().getInductionVar(),
2721  fullTileLoops[loopEn.index()].getInductionVar());
2722  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2723  for (auto &op : inputNest.back().getBody()->without_terminator())
2724  b.clone(op, operandMap);
2725  return success();
2726 }
2727 
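As a point of reference, here is a minimal sketch (not part of LoopUtils.cpp; `b`, `loc`, and the index-typed Value `lbValue` are hypothetical) of the createCanonicalizedAffineForOp helper used above, building a loop whose lower/upper bound pair differs by the constant 32:

  // Both maps take the same single dim operand, so the difference between the
  // bounds -- and hence the trip count with the default unit step -- is the
  // constant 32, independent of lbValue.
  AffineMap lbMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
                                   b.getAffineDimExpr(0));
  AffineMap ubMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
                                   b.getAffineDimExpr(0) + 32);
  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
      b, loc, /*lbOperands=*/lbValue, lbMap, /*ubOperands=*/lbValue, ubMap);
  // The body can then be populated with a builder placed at its terminator.
  OpBuilder bodyBuilder = OpBuilder::atBlockTerminator(fullTileLoop.getBody());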
2728 LogicalResult
2729 affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2730                           SmallVectorImpl<AffineForOp> *fullTileNest) {
2731  if (inputNest.empty())
2732  return success();
2733 
2734  auto firstLoop = inputNest[0];
2735 
2736  // Each successive for op has to be nested in the previous one.
2737  auto prevLoop = firstLoop;
2738  for (auto loop : inputNest.drop_front(1)) {
2739  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2740  prevLoop = loop;
2741  }
2742 
2743  // Create the full tile loop nest.
2744  SmallVector<AffineForOp, 4> fullTileLoops;
2745  OpBuilder b(firstLoop);
2746  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2747  if (!fullTileLoops.empty())
2748  fullTileLoops.front().erase();
2749  return failure();
2750  }
2751 
2752  // Create and insert the version select right before the root of the nest.
2753  b = OpBuilder(firstLoop);
2754  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2755  if (!ifOp) {
2756  fullTileLoops.front().erase();
2757  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2758  "separation condition\n");
2759  return failure();
2760  }
2761 
2762  // Move the full tile into the then block.
2763  Block *thenBlock = ifOp.getThenBlock();
2764  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2765  thenBlock->getOperations().splice(
2766  std::prev(thenBlock->end()),
2767  outermostFullTileLoop->getBlock()->getOperations(),
2768  Block::iterator(outermostFullTileLoop));
2769 
2770  // Move the partial tile into the else block. The partial tile is the same as
2771  // the original loop nest.
2772  Block *elseBlock = ifOp.getElseBlock();
2773  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2774  firstLoop->getBlock()->getOperations(),
2775  Block::iterator(firstLoop));
2776 
2777  if (fullTileNest)
2778  *fullTileNest = std::move(fullTileLoops);
2779 
2780  return success();
2781 }
2782 
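A minimal usage sketch (not part of LoopUtils.cpp; `rootForOp`, the outermost loop of an already tiled, perfectly nested intra-tile band, is a hypothetical input) of driving the separateFullTiles entry point defined above:

  SmallVector<AffineForOp, 4> band;
  getPerfectlyNestedLoops(band, rootForOp);

  SmallVector<AffineForOp, 4> fullTileNest;
  if (succeeded(separateFullTiles(band, &fullTileNest))) {
    // An affine.if now guards the two versions: its then-block holds the
    // constant trip count (full tile) nest returned in `fullTileNest`, and
    // its else-block holds the original nest, which now only runs for
    // partial tiles.
  }

Separation can only succeed when every loop in the band has unit step and a lower/upper bound pair with a constant difference, as checked in createFullTiles above.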
2783 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2784  LogicalResult result(failure());
2785  SmallVector<AffineForOp> loops;
2786  getPerfectlyNestedLoops(loops, op);
2787  if (loops.size() <= 1)
2788  return success();
2789 
2790  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2791  // loops with bounds defined above some loop.
2792  // 1. For each loop, find above which parent loop its operands are
2793  // defined.
2794  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2795  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2796  operandsDefinedAbove[i] = i;
2797  for (unsigned j = 0; j < i; ++j) {
2798  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2799  operandsDefinedAbove[i] = j;
2800  break;
2801  }
2802  }
2803  }
2804 
2805  // 2. Identify bands of loops such that the operands of all of them are
2806  // defined above the first loop in the band. Traverse the nest bottom-up
2807  // so that modifications don't invalidate the inner loops.
2808  for (unsigned end = loops.size(); end > 0; --end) {
2809  unsigned start = 0;
2810  for (; start < end - 1; ++start) {
2811  auto maxPos =
2812  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2813  std::next(operandsDefinedAbove.begin(), end));
2814  if (maxPos > start)
2815  continue;
2816  assert(maxPos == start &&
2817  "expected loop bounds to be known at the start of the band");
2818  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2819  if (succeeded(coalesceLoops(band)))
2820  result = success();
2821  break;
2822  }
2823  // If a band was found and transformed, keep looking at the loops above
2824  // the outermost transformed loop.
2825  if (start != end - 1)
2826  end = start + 1;
2827  }
2828  return result;
2829 }
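A minimal sketch (not part of LoopUtils.cpp; `funcOp`, a func::FuncOp, is hypothetical) of applying the coalescing entry point above to every outermost affine.for in a function:

  funcOp.walk([](AffineForOp forOp) {
    // Start only from loops that are not themselves nested in another
    // affine.for; the helper gathers the perfect nest below the given root
    // and coalesces bands whose bounds are defined above the band's first
    // loop.
    if (!forOp->getParentOfType<AffineForOp>())
      (void)coalescePerfectlyNestedAffineLoops(forOp);
  });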
2830 
2831 int64_t affine::numEnclosingInvariantLoops(OpOperand &operand) {
2832  int64_t count = 0;
2833  Operation *currentOp = operand.getOwner();
2834  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2835  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2836  break;
2837  currentOp = loopOp;
2838  count++;
2839  }
2840  return count;
2841 }
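A minimal sketch (not part of LoopUtils.cpp; `op`, an Operation nested inside one or more loop-like ops, is hypothetical) of querying numEnclosingInvariantLoops for each operand:

  for (OpOperand &operand : op->getOpOperands()) {
    // Number of enclosing loops outside of which this operand's value is
    // defined, i.e., loops the operand could be hoisted above.
    int64_t depth = numEnclosingInvariantLoops(operand);
    llvm::errs() << "operand #" << operand.getOperandNumber()
                 << " is invariant in " << depth << " enclosing loop(s)\n";
  }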