LoopUtils.cpp
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/DebugLog.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include <optional>
31 
32 #define DEBUG_TYPE "loop-utils"
33 
34 using namespace mlir;
35 using namespace affine;
36 using namespace presburger;
37 using llvm::SmallMapVector;
38 
39 /// Computes the cleanup loop lower bound of the loop being unrolled with
40 /// the specified unroll factor; this bound will also be the upper bound of the main
41 /// part of the unrolled loop. Computes the bound as an AffineMap with its
42 /// operands or a null map when the trip count can't be expressed as an affine
43 /// expression.
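/// As an arithmetic sketch (hypothetical values): for a loop
/// "affine.for %i = 0 to 13" unrolled by a factor of 4, the trip count is 13,
/// so the cleanup lower bound evaluates to 0 + (13 - 13 mod 4) = 12; the
/// unrolled main loop then covers [0, 12) and the cleanup loop covers [12, 13).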
44 static void
45 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
46  AffineMap &cleanupLbMap,
47  SmallVectorImpl<Value> &cleanupLbOperands) {
48  AffineMap tripCountMap;
49  SmallVector<Value, 4> tripCountOperands;
50  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
51  // Trip count can't be computed.
52  if (!tripCountMap) {
53  cleanupLbMap = AffineMap();
54  return;
55  }
56 
57  OpBuilder b(forOp);
58  auto lbMap = forOp.getLowerBoundMap();
59  auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
60  forOp.getLowerBoundOperands());
61 
62  // For each upper bound expr, get the range.
63  // Eg: affine.for %i = lb to min (ub1, ub2),
64  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
65  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
66  // these affine.apply's make up the cleanup loop lower bound.
67  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
68  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
69  int64_t step = forOp.getStepAsInt();
70  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
71  auto tripCountExpr = tripCountMap.getResult(i);
72  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
73  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
74  tripCountMap.getNumSymbols(), bumpExprs[i]);
75  bumpValues[i] =
76  AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
77  }
78 
79  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
80  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
81  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
82 
83  cleanupLbOperands.clear();
84  cleanupLbOperands.push_back(lb);
85  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
86  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
87  b.getContext());
88  // Simplify the cleanupLbMap + cleanupLbOperands.
89  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
90  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
91  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
92  // Remove any affine.apply's that became dead from the simplification above.
93  for (auto v : bumpValues)
94  if (v.use_empty())
95  v.getDefiningOp()->erase();
96 
97  if (lb.use_empty())
98  lb.erase();
99 }
100 
101 /// Helper to replace uses of loop carried values (iter_args) and loop
102 /// yield values while promoting single iteration affine.for ops.
103 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
104  // Replace uses of iter arguments with iter operands (initial values).
105  auto iterOperands = forOp.getInits();
106  auto iterArgs = forOp.getRegionIterArgs();
107  for (auto e : llvm::zip(iterOperands, iterArgs))
108  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
109 
110  // Replace uses of loop results with the values yielded by the loop.
111  auto outerResults = forOp.getResults();
112  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
113  for (auto e : llvm::zip(outerResults, innerResults))
114  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
115 }
116 
117 /// Promotes the loop body of a forOp to its containing block if the forOp
118 /// was known to have a single iteration.
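/// As a rough illustration (hypothetical IR): a loop such as
///   affine.for %i = 0 to 1 {
///     "foo"(%i) : (index) -> ()
///   }
/// is replaced by its body, with %i substituted by the constant 0 (or by an
/// affine.apply of the lower bound map when the lower bound is not constant).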
119 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
120  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
121  if (!tripCount || *tripCount != 1)
122  return failure();
123 
124  // TODO: extend this for arbitrary affine bounds.
125  if (forOp.getLowerBoundMap().getNumResults() != 1)
126  return failure();
127 
128  // Replace all uses of the IV with its single iteration value.
129  auto iv = forOp.getInductionVar();
130  auto *parentBlock = forOp->getBlock();
131  if (!iv.use_empty()) {
132  if (forOp.hasConstantLowerBound()) {
133  auto func = forOp->getParentOfType<FunctionOpInterface>();
134  OpBuilder builder(forOp->getContext());
135  if (func)
136  builder.setInsertionPointToStart(&func.getFunctionBody().front());
137  else
138  builder.setInsertionPoint(forOp);
139  auto constOp = arith::ConstantIndexOp::create(
140  builder, forOp.getLoc(), forOp.getConstantLowerBound());
141  iv.replaceAllUsesWith(constOp);
142  } else {
143  auto lbOperands = forOp.getLowerBoundOperands();
144  auto lbMap = forOp.getLowerBoundMap();
145  OpBuilder builder(forOp);
146  if (lbMap == builder.getDimIdentityMap()) {
147  // No need of generating an affine.apply.
148  iv.replaceAllUsesWith(lbOperands[0]);
149  } else {
150  auto affineApplyOp =
151  AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
152  iv.replaceAllUsesWith(affineApplyOp);
153  }
154  }
155  }
156 
157  replaceIterArgsAndYieldResults(forOp);
158 
159  // Move the loop body operations, except for its terminator, to the loop's
160  // containing block.
161  forOp.getBody()->back().erase();
162  parentBlock->getOperations().splice(Block::iterator(forOp),
163  forOp.getBody()->getOperations());
164  forOp.erase();
165  return success();
166 }
167 
168 /// Generates an affine.for op with the specified lower and upper bounds
169 /// while generating the right IV remappings to realize shifts for operations in
170 /// its body. The operations that go into the loop body are specified in
171 /// opGroupQueue starting from the specified offset, and in that order. The
172 /// first element of the pair specifies the shift applied to that group of
173 /// operations; the shift is multiplied by the loop step before being applied.
174 /// Returns nullptr if the generated loop simplifies to a single iteration one.
175 static AffineForOp generateShiftedLoop(
176  AffineMap lbMap, AffineMap ubMap,
177  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
178  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
179  auto lbOperands = srcForOp.getLowerBoundOperands();
180  auto ubOperands = srcForOp.getUpperBoundOperands();
181 
182  assert(lbMap.getNumInputs() == lbOperands.size());
183  assert(ubMap.getNumInputs() == ubOperands.size());
184 
185  auto loopChunk =
186  AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
187  ubMap, srcForOp.getStepAsInt());
188  auto loopChunkIV = loopChunk.getInductionVar();
189  auto srcIV = srcForOp.getInductionVar();
190 
191  IRMapping operandMap;
192 
193  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
194  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
195  uint64_t shift = it.first;
196  auto ops = it.second;
197  // All 'same shift' operations get added with their operands being
198  // remapped to results of cloned operations, and their IV uses remapped.
199  // Generate the remapping if the shift is not zero: remappedIV = newIV -
200  // shift.
201  if (!srcIV.use_empty() && shift != 0) {
202  auto ivRemap = AffineApplyOp::create(
203  bodyBuilder, srcForOp.getLoc(),
204  bodyBuilder.getSingleDimShiftAffineMap(
205  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
206  loopChunkIV);
207  operandMap.map(srcIV, ivRemap);
208  } else {
209  operandMap.map(srcIV, loopChunkIV);
210  }
211  for (auto *op : ops)
212  bodyBuilder.clone(*op, operandMap);
213  };
214  if (succeeded(promoteIfSingleIteration(loopChunk)))
215  return AffineForOp();
216  return loopChunk;
217 }
218 
219 // The skewing of operations with respect to one another can be used for
220 // example to allow overlap of asynchronous operations (such as DMA
221 // communication) with computation, or just relative shifting of operations
222 // for better register reuse, locality or parallelism. As such, the shifts are
223 // typically expected to be at most of the order of the number of operations.
224 // This method should not be used as a substitute for loop distribution/fission.
225 // This method uses an algorithm that runs in time linear in the number of
226 // operations in the body of the for loop (using the 'sweep line' paradigm).
227 // This method asserts preservation of SSA dominance. A check for that, as
228 // well as for memory-based dependence preservation, rests with the users of
229 // this method.
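// As a sketch of the effect (hypothetical shifts, not from a test): for a body
// with two operations A and B and shifts {0, 1}, the result is a prologue loop
// running only A for the first iteration, a steady-state loop in which A of
// iteration i runs together with B of iteration i - 1, and an epilogue loop
// running only B for the last iteration.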
230 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
231  ArrayRef<uint64_t> shifts,
232  bool unrollPrologueEpilogue) {
233  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234  "too few/many shifts");
235  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
236  return success();
237 
238  // If the trip counts aren't constant, we would need versioning and
239  // conditional guards (or context information to prevent such versioning). The
240  // better way to pipeline for such loops is to first tile them and extract
241  // constant trip count "full tiles" before applying this.
242  auto mayBeConstTripCount = getConstantTripCount(forOp);
243  if (!mayBeConstTripCount) {
244  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
245  return success();
246  }
247  uint64_t tripCount = *mayBeConstTripCount;
248 
249  assert(isOpwiseShiftValid(forOp, shifts) &&
250  "shifts will lead to an invalid transformation\n");
251 
252  int64_t step = forOp.getStepAsInt();
253 
254  unsigned numChildOps = shifts.size();
255 
256  // Do a linear time (counting) sort for the shifts.
257  uint64_t maxShift = *llvm::max_element(shifts);
258  if (maxShift >= numChildOps) {
259  // Large shifts are not the typical use case.
260  forOp.emitWarning("not shifting because shifts are unrealistically large");
261  return success();
262  }
263 
264  // An array of operation groups sorted by shift amount; each group has all
265  // operations with the same shift in the order in which they appear in the
266  // body of the 'affine.for' op.
267  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
268  unsigned pos = 0;
269  for (auto &op : forOp.getBody()->without_terminator()) {
270  auto shift = shifts[pos++];
271  sortedOpGroups[shift].push_back(&op);
272  }
273 
274  // Unless the shifts have a specific pattern (which actually would be the
275  // common use case), prologue and epilogue are not meaningfully defined.
276  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
277  // loop generated as the prologue and the last as epilogue and unroll these
278  // fully.
279  AffineForOp prologue, epilogue;
280 
281  // Do a sweep over the sorted shifts while storing open groups in a
282  // vector, and generating loop portions as necessary during the sweep. A block
283  // of operations is paired with its shift.
284  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
285 
286  auto origLbMap = forOp.getLowerBoundMap();
287  uint64_t lbShift = 0;
288  OpBuilder b(forOp);
289  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
290  // If nothing is shifted by d, continue.
291  if (sortedOpGroups[d].empty())
292  continue;
293  if (!opGroupQueue.empty()) {
294  assert(d > 0 &&
295  "Queue expected to be empty when the first block is found");
296  // The interval for which the loop needs to be generated here is:
297  // [lbShift, min(lbShift + tripCount, d)) and the body of the
298  // loop needs to have all operations in opQueue in that order.
299  AffineForOp res;
300  if (lbShift + tripCount * step < d * step) {
301  res = generateShiftedLoop(
302  b.getShiftedAffineMap(origLbMap, lbShift),
303  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
304  opGroupQueue, /*offset=*/0, forOp, b);
305  // Entire loop for the queued op groups generated, empty it.
306  opGroupQueue.clear();
307  lbShift += tripCount * step;
308  } else {
309  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
310  b.getShiftedAffineMap(origLbMap, d),
311  opGroupQueue, /*offset=*/0, forOp, b);
312  lbShift = d * step;
313  }
314 
315  if (res) {
316  // Simplify/canonicalize the affine.for.
317  RewritePatternSet patterns(res.getContext());
318  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
319  bool erased;
320  (void)applyOpPatternsGreedily(
321  res.getOperation(), std::move(patterns),
322  GreedyRewriteConfig().setStrictness(
323  GreedyRewriteStrictness::ExistingAndNewOps),
324  /*changed=*/nullptr, &erased);
325  if (!erased && !prologue)
326  prologue = res;
327  if (!erased)
328  epilogue = res;
329  }
330  } else {
331  // Start of first interval.
332  lbShift = d * step;
333  }
334  // Augment the list of operations that get into the current open interval.
335  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
336  }
337 
338  // Those operations groups left in the queue now need to be processed (FIFO)
339  // and their loops completed.
340  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
341  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
342  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
343  b.getShiftedAffineMap(origLbMap, ubShift),
344  opGroupQueue, /*offset=*/i, forOp, b);
345  lbShift = ubShift;
346  if (!prologue)
347  prologue = epilogue;
348  }
349 
350  // Erase the original for op.
351  forOp.erase();
352 
353  if (unrollPrologueEpilogue && prologue)
354  (void)loopUnrollFull(prologue);
355  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
356  (void)loopUnrollFull(epilogue);
357 
358  return success();
359 }
360 
361 /// Checks whether a loop nest is hyper-rectangular or not.
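/// For instance (illustrative only), a triangular nest in which the inner
/// loop's upper bound is the outer loop's IV is rejected, whereas a nest with
/// independent constant bounds such as 0..10 and 0..20 is accepted.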
362 static LogicalResult
363 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
364  FlatAffineValueConstraints cst;
365  SmallVector<Operation *, 8> ops(input.begin(), input.end());
366  // 0-d or 1-d is trivially hyper-rectangular.
367  if (input.size() <= 1)
368  return success();
369  if (failed(getIndexSet(ops, &cst))) {
370  LDBG() << "Index set computation failed!";
371  return failure();
372  }
373  if (!cst.isHyperRectangular(0, input.size())) {
374  LDBG() << "Non-hyperrectangular nests not supported for tiling!";
375  return failure();
376  }
377  return success();
378 }
379 
380 /// Check if the input nest is supported for tiling and whether tiling would be
381 /// legal or not.
382 template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384  ArrayRef<t> tileSizes) {
385  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386 
387  if (llvm::any_of(input,
388  [](AffineForOp op) { return op.getNumResults() > 0; })) {
389  LDBG() << "Cannot tile nest where a loop has yield values";
390  return failure();
391  }
392 
393  // Check if the supplied `for` ops are all successively nested.
394  if (!isPerfectlyNested(input)) {
395  LDBG() << "input loops not perfectly nested";
396  return failure();
397  }
398 
399  // TODO: handle non hyper-rectangular spaces.
400  if (failed(checkIfHyperRectangular(input)))
401  return failure();
402 
403  return success();
404 }
405 
406 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
407 /// location in destination's body, ignoring the terminator.
408 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409  Block::iterator loc) {
410  auto &ops = src.getBody()->getOperations();
411  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412  std::prev(ops.end()));
413 }
414 
415 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416 /// body.
417 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419 }
420 
421 /// Constructs a tiled loop nest without setting the loop bounds, and moves the
422 /// body of the original loop nest to the tiled loop nest.
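/// As a sketch (hypothetical band of width 2): this creates four empty loops
/// ordered as tile-space loops {i0, j0} enclosing intra-tile loops {i1, j1},
/// i.e. tiledLoops = {i0, j0, i1, j1}, with the original innermost body ending
/// up inside j1; all loop bounds are still unset at this point.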
423 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
424  AffineForOp rootAffineForOp, unsigned width,
425  MutableArrayRef<AffineForOp> tiledLoops) {
426  Location loc = rootAffineForOp.getLoc();
427 
428  // The outermost loop so far; updated as more enclosing loops are added.
429  Operation *topLoop = rootAffineForOp.getOperation();
430  AffineForOp innermostPointLoop;
431 
432  // Add intra-tile (or point) loops.
433  for (unsigned i = 0; i < width; i++) {
434  OpBuilder b(topLoop);
435  // Loop bounds will be set later.
436  AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
437  pointLoop.getBody()->getOperations().splice(
438  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439  topLoop);
440  tiledLoops[2 * width - 1 - i] = pointLoop;
441  topLoop = pointLoop.getOperation();
442  if (i == 0)
443  innermostPointLoop = pointLoop;
444  }
445 
446  // Add tile space loops.
447  for (unsigned i = width; i < 2 * width; i++) {
448  OpBuilder b(topLoop);
449  // Loop bounds will be set later.
450  AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
451  tileSpaceLoop.getBody()->getOperations().splice(
452  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453  topLoop);
454  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455  topLoop = tileSpaceLoop.getOperation();
456  }
457 
458  // Move the loop body of the original nest to the new one.
459  moveLoopBody(origLoops.back(), innermostPointLoop);
460 }
461 
462 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
463 // TODO: Handle non-constant lower bounds.
464 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465  AffineForOp newInterTileLoop,
466  AffineForOp newIntraTileLoop,
467  Value tileSize) {
468  // The lower bound for the intra-tile loop is represented by an affine map
469  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470  // for the intra-tile loop is represented by an affine map as
471  // (%i, %t0) -> ((%i - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb),
472  // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
473  // corresponding tiling parameter, %origlb is the lower bound, and
474  // %origLoopStep is the loop step of the corresponding inter-tile loop.
475 
476  assert(origLoop.hasConstantLowerBound() &&
477  "expected input loops to have constant lower bound.");
478 
479  // Get lower bound of original loop as an affine expression.
480  AffineExpr origLowerBoundExpr;
481  origLowerBoundExpr =
482  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483 
484  // Add dim operands from original lower/upper bound.
485  SmallVector<Value, 4> lbOperands, ubOperands;
486  AffineBound lb = origLoop.getLowerBound();
487  AffineBound ub = origLoop.getUpperBound();
488  lbOperands.reserve(lb.getNumOperands() + 2);
489  ubOperands.reserve(ub.getNumOperands() + 2);
490  AffineMap origLbMap = lb.getMap();
491  AffineMap origUbMap = ub.getMap();
492  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493  lbOperands.push_back(lb.getOperand(j));
494  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495  ubOperands.push_back(ub.getOperand(j));
496 
497  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498  // IV.
499  lbOperands.push_back(newInterTileLoop.getInductionVar());
500  ubOperands.push_back(newInterTileLoop.getInductionVar());
501 
502  // Get loop IV as an affine expression for lower/upper bound. Size of
503  // lb/ubOperands is guaranteed to be at least one.
504  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506 
507  // Add symbol operands from original lower/upper bound.
508  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512 
513  // Add a new symbol operand which is the tile size for this loop.
514  lbOperands.push_back(tileSize);
515  ubOperands.push_back(tileSize);
516 
517  SmallVector<AffineExpr, 4> lbBoundExprs;
518  SmallVector<AffineExpr, 4> ubBoundExprs;
519  lbBoundExprs.reserve(origLbMap.getNumResults());
520  ubBoundExprs.reserve(origUbMap.getNumResults());
521 
522  // Get tiling parameter as an affine expression for lb/ub.
523  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525 
526  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527  lbBoundExprs.push_back(
528  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529  origLowerBoundExpr);
530 
531  // Get the origLoopStep as an affine expression.
532  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533 
534  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535  // (tilingParameter * origLoopStep) + origlb.
536  ubBoundExprs.push_back(
537  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539 
540  ubBoundExprs.append(origUbMap.getResults().begin(),
541  origUbMap.getResults().end());
542 
543  AffineMap lbMap =
544  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545  lbBoundExprs, b.getContext());
546  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547 
548  AffineMap ubMap =
549  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550  ubBoundExprs, b.getContext());
551  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552 
553  // Original loop step must be preserved.
554  newIntraTileLoop.setStep(origLoop.getStepAsInt());
555 }
556 
557 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
558 // TODO: Handle non-constant lower bounds.
559 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560  AffineForOp newLoop, Value tileSize) {
561  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562 
563  // The lower bounds for inter-tile loops are same as the corresponding lower
564  // bounds of original loops.
565  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566 
567  // The new upper bound map for inter-tile loops, assuming constant lower
568  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
569  // originalLowerBound), tilingParameter), where the tiling parameter is the
570  // respective tile size for that loop. E.g., if the original ubmap was
571  // ()->(1024), the new map will be
572  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
573  // Therefore a new symbol operand is inserted in the map and the result
574  // expression is overwritten.
575 
576  assert(origLoop.hasConstantLowerBound() &&
577  "expected input loops to have constant lower bound.");
578 
579  // Get lower bound of original loop as an affine expression.
580  AffineExpr origLowerBoundExpr;
581  origLowerBoundExpr =
582  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583 
584  // Add dim operands from original upper bound.
585  SmallVector<Value, 4> ubOperands;
586  AffineBound ub = origLoop.getUpperBound();
587  ubOperands.reserve(ub.getNumOperands() + 1);
588  AffineMap origUbMap = ub.getMap();
589  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590  ubOperands.push_back(ub.getOperand(j));
591 
592  // Add symbol operands from original upper bound.
593  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595 
596  // Add a new symbol operand which is the tile size for this loop.
597  ubOperands.push_back(tileSize);
598 
599  // Get tiling parameter as an affine expression.
600  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601 
602  SmallVector<AffineExpr, 4> boundExprs;
603  boundExprs.reserve(origUbMap.getNumResults());
604  int64_t origUpperBound;
605  AffineExpr origUpperBoundExpr;
606 
607  // If upper bound for the original loop is constant, then the constant can
608  // be obtained as an affine expression straight away.
609  if (origLoop.hasConstantUpperBound()) {
610  origUpperBound = origLoop.getConstantUpperBound();
611 
612  // Get original constant upper bound as an affine expression.
613  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614 
615  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
616  // originalLowerBound), tilingParameter).
617  boundExprs.push_back(
618  origLowerBoundExpr +
619  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620  } else {
621  // If the upper bound of the original loop is not constant, then two cases
622  // are possible, although their handling is the same: 1.) The result of
623  // ubmap has only one result expression. For e.g.
624  // affine.for %i = 5 to %ub
625  //
626  // A symbol operand is added which represents the tiling parameter. The
627  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628  // where 's0' is the original upper bound and 's1' is the tiling
629  // parameter. 2.) When ubMap has more than one result expression. For e.g.
630  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
631  // affine.for %i = 5 to min #map0()[%s0, %s1]
632  //
633  // A symbol operand is added which represents the tiling parameter. The
634  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636 
637  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638  // originalLowerBound), tilingParameter).
639  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640  boundExprs.push_back(
641  origLowerBoundExpr +
642  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643  }
644 
645  AffineMap ubMap =
646  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647  boundExprs, b.getContext());
648  newLoop.setUpperBound(ubOperands, ubMap);
649 
650  // Original loop step must be preserved.
651  newLoop.setStep(origLoop.getStepAsInt());
652 }
653 
654 /// Constructs and sets new loop bounds after tiling for the case of
655 /// hyper-rectangular index sets, where the bounds of one dimension do not
656 /// depend on other dimensions and tiling parameters are captured from SSA
657 /// values. Bounds of each dimension can thus be treated independently,
658 /// and deriving the new bounds is much simpler and faster than for the case of
659 /// tiling arbitrary polyhedral shapes.
660 static void constructParametricallyTiledIndexSetHyperRect(
661  MutableArrayRef<AffineForOp> origLoops,
662  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663  assert(!origLoops.empty() && "expected at least one loop in band");
664  assert(origLoops.size() == tileSizes.size() &&
665  "expected tiling parameter for each loop in band.");
666 
667  OpBuilder b(origLoops[0].getOperation());
668  unsigned width = origLoops.size();
669 
670  // Set bounds for tile space loops.
671  for (unsigned i = 0; i < width; ++i) {
672  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673  }
674 
675  // Set bounds for intra-tile loops.
676  for (unsigned i = 0; i < width; ++i) {
677  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678  newLoops[i + width], tileSizes[i]);
679  }
680 }
681 
682 /// Constructs and sets new loop bounds after tiling for the case of
683 /// hyper-rectangular index sets, where the bounds of one dimension do not
684 /// depend on other dimensions. Bounds of each dimension can thus be treated
685 /// independently, and deriving the new bounds is much simpler and faster
686 /// than for the case of tiling arbitrary polyhedral shapes.
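/// For example (illustrative bounds): tiling "affine.for %i = 0 to 256" with a
/// tile size of 32 yields a tile-space loop %i0 = 0 to 256 step 32 and an
/// intra-tile loop %i1 = %i0 to %i0 + 32, with a min against the original
/// upper bound only when the tile size does not evenly divide the trip count.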
687 static void
688 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
689  MutableArrayRef<AffineForOp> newLoops,
690  ArrayRef<unsigned> tileSizes) {
691  assert(!origLoops.empty());
692  assert(origLoops.size() == tileSizes.size());
693 
694  OpBuilder b(origLoops[0].getOperation());
695  unsigned width = origLoops.size();
696 
697  // Bounds for tile space loops.
698  for (unsigned i = 0; i < width; i++) {
699  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703  // If the step size of the original loop is x and the tile size is y, then after
704  // tiling the tile space loops' step size becomes x*y.
705  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706  }
707  // Bounds for intra-tile loops.
708  for (unsigned i = 0; i < width; i++) {
709  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710  std::optional<uint64_t> mayBeConstantCount =
711  getConstantTripCount(origLoops[i]);
712  // The lower bound is just the tile-space loop.
713  AffineMap lbMap = b.getDimIdentityMap();
714  newLoops[width + i].setLowerBound(
715  /*operands=*/newLoops[i].getInductionVar(), lbMap);
716  // The step size of the intra-tile loops is just the original loops' step size.
717  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
718 
719  // Set the upper bound.
720  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
721  // Trip count is less than the tile size: upper bound is lower bound +
722  // trip count * stepSize.
723  AffineMap ubMap = b.getSingleDimShiftAffineMap(
724  *mayBeConstantCount * origLoops[i].getStepAsInt());
725  newLoops[width + i].setUpperBound(
726  /*operands=*/newLoops[i].getInductionVar(), ubMap);
727  } else if (largestDiv % tileSizes[i] != 0) {
728  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
729  // Construct the upper bound map; the operands are the original operands
730  // with 'i' (tile-space loop) appended to it. The new upper bound map is
731  // the original one with an additional expression i + tileSize * stepSize
732  // appended.
733 
734  // Add dim operands from original upper bound.
735  SmallVector<Value, 4> ubOperands;
736  AffineBound ub = origLoops[i].getUpperBound();
737  ubOperands.reserve(ub.getNumOperands() + 1);
738  AffineMap origUbMap = ub.getMap();
739  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
740  ubOperands.push_back(ub.getOperand(j));
741 
742  // Add dim operand for new loop upper bound.
743  ubOperands.push_back(newLoops[i].getInductionVar());
744 
745  // Add symbol operands from original upper bound.
746  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
747  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
748 
749  SmallVector<AffineExpr, 4> boundExprs;
750  boundExprs.reserve(1 + origUbMap.getNumResults());
751  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
752  // The new upper bound map is the original one with an additional
753  // expression i + tileSize * stepSize (of original loop) appended.
754  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
755  boundExprs.append(origUbMap.getResults().begin(),
756  origUbMap.getResults().end());
757  AffineMap ubMap =
758  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
759  boundExprs, b.getContext());
760  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
761  } else {
762  // No need of the min expression.
763  AffineExpr dim = b.getAffineDimExpr(0);
764  AffineMap ubMap = AffineMap::get(
765  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
766  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
767  }
768  }
769 }
770 
771 LogicalResult
772 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
773  ArrayRef<unsigned> tileSizes,
774  SmallVectorImpl<AffineForOp> *tiledNest) {
775  if (input.empty())
776  return success();
777 
778  if (failed(performPreTilingChecks(input, tileSizes)))
779  return failure();
780 
781  MutableArrayRef<AffineForOp> origLoops = input;
782  AffineForOp rootAffineForOp = origLoops[0];
783 
784  // Note that width is at least one since the band isn't empty.
785  unsigned width = input.size();
786  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
787 
788  // Construct a tiled loop nest without setting their bounds. Bounds are
789  // set later.
790  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
791 
792  SmallVector<Value, 8> origLoopIVs;
793  extractForInductionVars(input, &origLoopIVs);
794 
795  // Set loop bounds for the tiled loop nest.
796  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
797 
798  // Replace original IVs with intra-tile loop IVs.
799  for (unsigned i = 0; i < width; i++)
800  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
801 
802  // Erase the old loop nest.
803  rootAffineForOp.erase();
804 
805  if (tiledNest)
806  *tiledNest = std::move(tiledLoops);
807 
808  return success();
809 }
810 
811 /// Tiles the specified band of perfectly nested loops creating tile-space
812 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
813 /// is a contiguous set of loops.
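/// A minimal usage sketch (assuming `band` holds a perfectly nested band and
/// `tileSizeValues` holds one index-typed SSA value per loop in the band):
///   SmallVector<AffineForOp, 4> tiled;
///   if (failed(tilePerfectlyNestedParametric(band, tileSizeValues, &tiled)))
///     return failure();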
814 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
815  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
816  SmallVectorImpl<AffineForOp> *tiledNest) {
817  if (input.empty())
818  return success();
819 
820  if (failed(performPreTilingChecks(input, tileSizes)))
821  return failure();
822 
823  MutableArrayRef<AffineForOp> origLoops = input;
824  AffineForOp rootAffineForOp = origLoops[0];
825  unsigned width = input.size();
826  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
827 
828  // Construct a tiled loop nest without setting their bounds. Bounds are
829  // set later.
830  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
831 
832  SmallVector<Value, 8> origLoopIVs;
833  extractForInductionVars(input, &origLoopIVs);
834 
835  // Set loop bounds for the tiled loop nest.
836  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
837  tileSizes);
838 
839  // Replace original IVs with intra-tile loop IVs.
840  for (unsigned i = 0; i < width; i++)
841  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
842 
843  // Erase the old loop nest.
844  rootAffineForOp.erase();
845 
846  if (tiledNest)
847  *tiledNest = std::move(tiledLoops);
848 
849  return success();
850 }
851 
852 /// Get the perfectly nested sequence of loops starting at the root of the
853 /// loop nest. A loop is perfectly nested iff the first op in the loop's body
854 /// is another AffineForOp and the second op is a terminator; collection stops
855 /// at the first loop that is not perfectly nested.
856 void mlir::affine::getPerfectlyNestedLoops(
857  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
858  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
859  nestedLoops.push_back(root);
860  Block &body = root.getRegion().front();
861  if (body.begin() != std::prev(body.end(), 2))
862  return;
863 
864  root = dyn_cast<AffineForOp>(&body.front());
865  if (!root)
866  return;
867  }
868 }
869 
870 /// Unrolls this loop completely.
871 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
872  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
873  if (mayBeConstantTripCount.has_value()) {
874  uint64_t tripCount = *mayBeConstantTripCount;
875  if (tripCount == 0)
876  return success();
877  if (tripCount == 1)
878  return promoteIfSingleIteration(forOp);
879  return loopUnrollByFactor(forOp, tripCount);
880  }
881  return failure();
882 }
883 
884 /// Unrolls this loop by the specified factor or by the trip count (if
885 /// constant), whichever is lower.
886 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
887  uint64_t unrollFactor) {
888  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
889  if (mayBeConstantTripCount.has_value() &&
890  *mayBeConstantTripCount < unrollFactor)
891  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
892  return loopUnrollByFactor(forOp, unrollFactor);
893 }
894 
895 /// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
896 /// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
897 /// unrolled body. If specified, annotates the Ops in each unrolled iteration
898 /// using annotateFn.
899 static void generateUnrolledLoop(
900  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
901  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
902  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
903  ValueRange iterArgs, ValueRange yieldedValues) {
904  // Builder to insert unrolled bodies just before the terminator of the body of
905  // 'forOp'.
906  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
907 
908  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
909  if (!annotateFn)
910  annotateFn = defaultAnnotateFn;
911 
912  // Keep a pointer to the last non-terminator operation in the original block
913  // so that we know what to clone (since we are doing this in-place).
914  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
915 
916  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
917  SmallVector<Value, 4> lastYielded(yieldedValues);
918 
919  for (unsigned i = 1; i < unrollFactor; i++) {
920  IRMapping operandMap;
921 
922  // Prepare operand map.
923  operandMap.map(iterArgs, lastYielded);
924 
925  // If the induction variable is used, create a remapping to the value for
926  // this unrolled instance.
927  if (!forOpIV.use_empty()) {
928  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
929  operandMap.map(forOpIV, ivUnroll);
930  }
931 
932  // Clone the original body of 'forOp'.
933  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
934  Operation *clonedOp = builder.clone(*it, operandMap);
935  annotateFn(i, clonedOp, builder);
936  }
937 
938  // Update yielded values. If the yielded value is defined outside the
939  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
940  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
941  // value with the clone corresponding to the yielded value.
942  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
943  Operation *defOp = yieldedValues[i].getDefiningOp();
944  if (defOp && defOp->getBlock() == loopBodyBlock)
945  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
946  }
947  }
948 
949  // Make sure we annotate the Ops in the original body. We do this last so that
950  // any annotations are not copied into the cloned Ops above.
951  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
952  annotateFn(0, &*it, builder);
953 
954  // Update operands of the yield statement.
955  loopBodyBlock->getTerminator()->setOperands(lastYielded);
956 }
957 
958 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
959 /// count is not a multiple of `unrollFactor`.
960 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
961  uint64_t unrollFactor) {
962  // Insert the cleanup loop right after 'forOp'.
963  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
964  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
965 
966  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
967  // and produce results for the original users of `forOp` results.
968  auto results = forOp.getResults();
969  auto cleanupResults = cleanupForOp.getResults();
970  auto cleanupIterOperands = cleanupForOp.getInits();
971 
972  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
973  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
974  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
975  }
976 
977  AffineMap cleanupMap;
978  SmallVector<Value, 4> cleanupOperands;
979  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
980  if (!cleanupMap)
981  return failure();
982 
983  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
984  // Promote the loop body up if this has turned into a single iteration loop.
985  (void)promoteIfSingleIteration(cleanupForOp);
986 
987  // Adjust upper bound of the original loop; this is the same as the lower
988  // bound of the cleanup loop.
989  forOp.setUpperBound(cleanupOperands, cleanupMap);
990  return success();
991 }
992 
993 /// Unrolls this loop by the specified factor. Returns success if the loop
994 /// is successfully unrolled.
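/// As an illustration (hypothetical IR), unrolling by a factor of 2 rewrites
///   affine.for %i = 0 to 16 {
///     "op"(%i) : (index) -> ()
///   }
/// into
///   affine.for %i = 0 to 16 step 2 {
///     "op"(%i) : (index) -> ()
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "op"(%i1) : (index) -> ()
///   }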
995 LogicalResult mlir::affine::loopUnrollByFactor(
996  AffineForOp forOp, uint64_t unrollFactor,
997  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
998  bool cleanUpUnroll) {
999  assert(unrollFactor > 0 && "unroll factor should be positive");
1000 
1001  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1002  if (unrollFactor == 1) {
1003  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1004  return failure();
1005  return success();
1006  }
1007 
1008  // Nothing in the loop body other than the terminator.
1009  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1010  return success();
1011 
1012  // If the trip count is lower than the unroll factor, no unrolled body.
1013  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1014  if (cleanUpUnroll) {
1015  // Unroll the cleanup loop if cleanUpUnroll is specified.
1016  return loopUnrollFull(forOp);
1017  }
1018 
1019  return failure();
1020  }
1021 
1022  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1023  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1024  // Loops where the lower bound is a max expression or the upper bound is
1025  // a min expression and the trip count isn't a multiple of the unroll factor
1026  // can't be unrolled since the lower bound of the cleanup loop in such cases
1027  // cannot be expressed as an affine function or a max over affine functions.
1028  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1029  forOp.getUpperBoundMap().getNumResults() != 1)
1030  return failure();
1031  if (cleanUpUnroll)
1032  // Force unroll including cleanup loop
1033  return loopUnrollFull(forOp);
1034  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1035  assert(false && "cleanup loop lower bound map for single result lower "
1036  "and upper bound maps can always be determined");
1037  }
1038 
1039  ValueRange iterArgs(forOp.getRegionIterArgs());
1040  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1041 
1042  // Scale the step of loop being unrolled by unroll factor.
1043  int64_t step = forOp.getStepAsInt();
1044  forOp.setStep(step * unrollFactor);
1045  generateUnrolledLoop(
1046  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1047  [&](unsigned i, Value iv, OpBuilder b) {
1048  // iv' = iv + i * step
1049  auto d0 = b.getAffineDimExpr(0);
1050  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1051  return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
1052  },
1053  /*annotateFn=*/annotateFn,
1054  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1055 
1056  // Promote the loop body up if this has turned into a single iteration loop.
1057  (void)promoteIfSingleIteration(forOp);
1058  return success();
1059 }
1060 
1061 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1062  uint64_t unrollJamFactor) {
1063  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1064  if (mayBeConstantTripCount.has_value() &&
1065  *mayBeConstantTripCount < unrollJamFactor)
1066  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1067  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1068 }
1069 
1070 /// Check if all control operands of all loops are defined outside of `forOp`
1071 /// and return false if not.
1072 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1073  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1074  for (auto controlOperand : aForOp.getControlOperands()) {
1075  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1076  return WalkResult::interrupt();
1077  }
1078  return WalkResult::advance();
1079  });
1080  return !walkResult.wasInterrupted();
1081 }
1082 
1083 /// Unrolls and jams this loop by the specified factor.
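/// As an illustration (hypothetical IR), unroll-and-jam of the outer loop by 2
/// rewrites
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 { "op"(%i, %j) : (index, index) -> () }
///   }
/// into
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to 8 {
///       "op"(%i, %j) : (index, index) -> ()
///       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///       "op"(%i1, %j) : (index, index) -> ()
///     }
///   }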
1084 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1085  uint64_t unrollJamFactor) {
1086  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1087 
1088  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1089  if (unrollJamFactor == 1) {
1090  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1091  return failure();
1092  return success();
1093  }
1094 
1095  // Nothing in the loop body other than the terminator.
1096  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1097  return success();
1098 
1099  // If the trip count is lower than the unroll jam factor, no unroll jam.
1100  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1101  LDBG() << "[failed] trip count < unroll-jam factor";
1102  return failure();
1103  }
1104 
1105  // If any control operand of any inner loop of `forOp` is defined within
1106  // `forOp`, no unroll jam.
1107  if (!areInnerBoundsInvariant(forOp))
1108  return failure();
1109 
1110  // Gather all sub-blocks to jam upon the loop being unrolled.
1111  JamBlockGatherer<AffineForOp> jbg;
1112  jbg.walk(forOp);
1113  auto &subBlocks = jbg.subBlocks;
1114 
1115  // Collect loops with iter_args.
1116  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1117  forOp.walk([&](AffineForOp aForOp) {
1118  if (aForOp.getNumIterOperands() > 0)
1119  loopsWithIterArgs.push_back(aForOp);
1120  });
1121 
1122  // Get supported reductions to be used for creating reduction ops at the end.
1123  SmallVector<LoopReduction> reductions;
1124  if (forOp.getNumIterOperands() > 0)
1125  getSupportedReductions(forOp, reductions);
1126 
1127  // Generate the cleanup loop if trip count isn't a multiple of
1128  // unrollJamFactor.
1129  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1130  // Loops where the lower bound is a max expression or the upper bound is
1131  // a min expression and the trip count isn't a multiple of the unroll factor
1132  // can't be unrolled since the lower bound of the cleanup loop in such cases
1133  // cannot be expressed as an affine function or a max over affine functions.
1134  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1135  forOp.getUpperBoundMap().getNumResults() != 1)
1136  return failure();
1137  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1138  assert(false && "cleanup loop lower bound map for single result lower "
1139  "and upper bound maps can always be determined");
1140  }
1141 
1142  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1143  // iteration. There are (`unrollJamFactor` - 1) iterations.
1144  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1145 
1146  // For any loop with iter_args, replace it with a new loop that has
1147  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1148  // operands.
1149  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1150  IRRewriter rewriter(forOp.getContext());
1151  for (AffineForOp oldForOp : loopsWithIterArgs) {
1152  SmallVector<Value> dupIterOperands, dupYieldOperands;
1153  ValueRange oldIterOperands = oldForOp.getInits();
1154  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1155  ValueRange oldYieldOperands =
1156  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1157  // Get additional iterOperands, iterArgs, and yield operands. We will
1158  // fix iterOperands and yield operands after cloning of sub-blocks.
1159  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1160  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1161  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1162  }
1163  // Create a new loop with additional iterOperands, iter_args and yield
1164  // operands. This new loop will take the loop body of the original loop.
1165  bool forOpReplaced = oldForOp == forOp;
1166  AffineForOp newForOp =
1167  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1168  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1169  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1170  return dupYieldOperands;
1171  }));
1172  newLoopsWithIterArgs.push_back(newForOp);
1173  // `forOp` has been replaced with a new loop.
1174  if (forOpReplaced)
1175  forOp = newForOp;
1176  // Update `operandMaps` for `newForOp` iterArgs and results.
1177  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1178  unsigned oldNumIterArgs = oldIterArgs.size();
1179  ValueRange newResults = newForOp.getResults();
1180  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1181  assert(oldNumIterArgs == oldNumResults &&
1182  "oldNumIterArgs must be the same as oldNumResults");
1183  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1184  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1185  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1186  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1187  // to those in the `i`th new set.
1188  operandMaps[i - 1].map(newIterArgs[j],
1189  newIterArgs[i * oldNumIterArgs + j]);
1190  operandMaps[i - 1].map(newResults[j],
1191  newResults[i * oldNumResults + j]);
1192  }
1193  }
1194  }
1195 
1196  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1197  int64_t step = forOp.getStepAsInt();
1198  forOp.setStep(step * unrollJamFactor);
1199 
1200  auto forOpIV = forOp.getInductionVar();
1201  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1202  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1203  for (auto &subBlock : subBlocks) {
1204  // Builder to insert unroll-jammed bodies. Insert right at the end of
1205  // sub-block.
1206  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1207 
1208  // If the induction variable is used, create a remapping to the value for
1209  // this unrolled instance.
1210  if (!forOpIV.use_empty()) {
1211  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1212  auto d0 = builder.getAffineDimExpr(0);
1213  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1214  auto ivUnroll =
1215  AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
1216  operandMaps[i - 1].map(forOpIV, ivUnroll);
1217  }
1218  // Clone the sub-block being unroll-jammed.
1219  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1220  builder.clone(*it, operandMaps[i - 1]);
1221  }
1222  // Fix iterOperands and yield op operands of newly created loops.
1223  for (auto newForOp : newLoopsWithIterArgs) {
1224  unsigned oldNumIterOperands =
1225  newForOp.getNumIterOperands() / unrollJamFactor;
1226  unsigned numControlOperands = newForOp.getNumControlOperands();
1227  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1228  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1229  assert(oldNumIterOperands == oldNumYieldOperands &&
1230  "oldNumIterOperands must be the same as oldNumYieldOperands");
1231  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1232  // The `i`th duplication of an old iterOperand or yield op operand
1233  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1234  // if such mapped value exists.
1235  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1236  operandMaps[i - 1].lookupOrDefault(
1237  newForOp.getOperand(numControlOperands + j)));
1238  yieldOp.setOperand(
1239  i * oldNumYieldOperands + j,
1240  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1241  }
1242  }
1243  }
1244  if (forOp.getNumResults() > 0) {
1245  // Create reduction ops to combine every `unrollJamFactor` related results
1246  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1247  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1248  // %1.
1249  rewriter.setInsertionPointAfter(forOp);
1250  auto loc = forOp.getLoc();
1251  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1252  for (LoopReduction &reduction : reductions) {
1253  unsigned pos = reduction.iterArgPosition;
1254  Value lhs = forOp.getResult(pos);
1255  Value rhs;
1256  SmallPtrSet<Operation *, 4> newOps;
1257  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1258  rhs = forOp.getResult(i * oldNumResults + pos);
1259  // Create ops based on reduction type.
1260  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1261  if (!lhs)
1262  return failure();
1263  Operation *op = lhs.getDefiningOp();
1264  assert(op && "Reduction op should have been created");
1265  newOps.insert(op);
1266  }
1267  // Replace all uses except those in newly created reduction ops.
1268  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1269  }
1270  }
1271 
1272  // Promote the loop body up if this has turned into a single iteration loop.
1273  (void)promoteIfSingleIteration(forOp);
1274  return success();
1275 }
1276 
1277 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1278 /// nested within 'forOpA' as the only non-terminator operation in its block.
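/// For example (illustrative IR), interchanging the two loops of
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 16 { "op"(%i, %j) : (index, index) -> () }
///   }
/// produces
///   affine.for %j = 0 to 16 {
///     affine.for %i = 0 to 8 { "op"(%i, %j) : (index, index) -> () }
///   }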
1279 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1280  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1281  auto &forOpABody = forOpA.getBody()->getOperations();
1282  auto &forOpBBody = forOpB.getBody()->getOperations();
1283 
1284  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1285  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1286  // body containing only the terminator.
1287  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1288  forOpABody, forOpABody.begin(),
1289  std::prev(forOpABody.end()));
1290  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1291  // body (this leaves forOpB's body containing only the terminator).
1292  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1293  std::prev(forOpBBody.end()));
1294  // 3) Splice forOpA into the beginning of forOpB's body.
1295  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1296  Block::iterator(forOpA));
1297 }
1298 
1299 // Checks each dependence component against the permutation to see if the
1300 // desired loop interchange would violate dependences by making the
1301 // dependence component lexicographically negative.
1302 static bool checkLoopInterchangeDependences(
1303  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1304  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1305  // Invert permutation map.
1306  unsigned maxLoopDepth = loops.size();
1307  SmallVector<unsigned, 4> loopPermMapInv;
1308  loopPermMapInv.resize(maxLoopDepth);
1309  for (unsigned i = 0; i < maxLoopDepth; ++i)
1310  loopPermMapInv[loopPermMap[i]] = i;
1311 
1312  // Check each dependence component against the permutation to see if the
1313  // desired loop interchange permutation would make the dependence vectors
1314  // lexicographically negative.
1315  // Example 1: [-1, 1][0, 0]
1316  // Example 2: [0, 0][-1, 1]
1317  for (const auto &depComps : depCompsVec) {
1318  assert(depComps.size() >= maxLoopDepth);
1319  // Check if the first non-zero dependence component is positive.
1320  // This iterates through loops in the desired order.
1321  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1322  unsigned permIndex = loopPermMapInv[j];
1323  assert(depComps[permIndex].lb);
1324  int64_t depCompLb = *depComps[permIndex].lb;
1325  if (depCompLb > 0)
1326  break;
1327  if (depCompLb < 0)
1328  return false;
1329  }
1330  }
1331  return true;
1332 }
1333 
1334 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1335 /// nested sequence of loops in 'loops' would violate dependences.
1336 bool mlir::affine::isValidLoopInterchangePermutation(
1337  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1338  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1339  unsigned maxLoopDepth = loops.size();
1340  if (maxLoopDepth == 1)
1341  return true;
1342 
1343  // We cannot guarantee the validity of the interchange if the loops have
1344  // iter_args, since the dependence analysis does not take them into account.
1345  // Conservatively return false in such cases.
1346  if (llvm::any_of(loops, [](AffineForOp loop) {
1347  return loop.getNumIterOperands() > 0;
1348  }))
1349  return false;
1350 
1351  // Gather dependence components for dependences between all ops in loop nest
1352  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1353  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1354  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1355  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1356 }
1357 
1358 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1359 /// in it from outermost to innermost.
1360 [[maybe_unused]] bool
 1361 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
 1362   assert(!loops.empty() && "no loops provided");
1363 
1364  // We already know that the block can't be empty.
1365  auto hasTwoElements = [](Block *block) {
1366  auto secondOpIt = std::next(block->begin());
1367  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1368  };
1369 
1370  auto enclosingLoop = loops.front();
1371  for (auto loop : loops.drop_front()) {
1372  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1373  // parentForOp's body should be just this loop and the terminator.
1374  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1375  return false;
1376  enclosingLoop = loop;
1377  }
1378  return true;
1379 }
1380 
1381 // input[i] should move from position i -> permMap[i]. Returns the position in
1382 // `input` that becomes the new outermost loop.
 1383 unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
 1384                                     ArrayRef<unsigned> permMap) {
1385  assert(input.size() == permMap.size() && "invalid permutation map size");
1386  // Check whether the permutation spec is valid. This is a small vector - we'll
1387  // just sort and check if it's iota.
1388  SmallVector<unsigned, 4> checkPermMap(permMap);
1389  llvm::sort(checkPermMap);
1390  if (llvm::any_of(llvm::enumerate(checkPermMap),
1391  [](const auto &en) { return en.value() != en.index(); }))
1392  assert(false && "invalid permutation map");
1393 
1394  // Nothing to do.
1395  if (input.size() < 2)
1396  return 0;
1397 
1398  assert(isPerfectlyNested(input) && "input not perfectly nested");
1399 
1400  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1401  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
 1402   SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
 1403   for (unsigned i = 0, e = input.size(); i < e; ++i)
1404  invPermMap.push_back({permMap[i], i});
1405  llvm::sort(invPermMap);
1406 
1407  // Move the innermost loop body to the loop that would be the innermost in the
1408  // permuted nest (only if the innermost loop is going to change).
1409  if (permMap.back() != input.size() - 1) {
1410  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1411  Block *srcBody = ((AffineForOp)input.back()).getBody();
1412  destBody->getOperations().splice(destBody->begin(),
1413  srcBody->getOperations(), srcBody->begin(),
1414  std::prev(srcBody->end()));
1415  }
1416 
1417  // We'll move each loop in `input` in the reverse order so that its body is
1418  // empty when we are moving it; this incurs zero copies and no erasing.
1419  for (int i = input.size() - 1; i >= 0; --i) {
1420  // If this has to become the outermost loop after permutation, add it to the
1421  // parent block of the original root.
1422  if (permMap[i] == 0) {
1423  // If the root remains the same, nothing to do.
1424  if (i == 0)
1425  continue;
1426  // Make input[i] the new outermost loop moving it into parentBlock.
1427  auto *parentBlock = input[0]->getBlock();
1428  parentBlock->getOperations().splice(Block::iterator(input[0]),
1429  input[i]->getBlock()->getOperations(),
1430  Block::iterator(input[i]));
1431  continue;
1432  }
1433 
1434  // If the parent in the permuted order is the same as in the original,
1435  // nothing to do.
1436  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1437  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1438  continue;
1439 
1440  // Move input[i] to its surrounding loop in the transformed nest.
1441  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1442  destBody->getOperations().splice(destBody->begin(),
1443  input[i]->getBlock()->getOperations(),
1444  Block::iterator(input[i]));
1445  }
1446 
1447  return invPermMap[0].second;
1448 }
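// For example, with input = (%i, %j, %k) and permMap = {2, 0, 1}, %i moves to
// the innermost position, %j becomes outermost, and %k becomes the middle
// loop, i.e. the permuted order is (%j, %k, %i). invPermMap sorts to
// {(0,1), (1,2), (2,0)}, and the function returns 1, the index of %j (the new
// outermost loop) in `input`.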
1449 
1450 // Sinks all sequential loops to the innermost levels (while preserving
1451 // relative order among them) and moves all parallel loops to the
1452 // outermost (while again preserving relative order among them).
1453 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
 1454   SmallVector<AffineForOp, 4> loops;
 1455   getPerfectlyNestedLoops(loops, forOp);
1456  if (loops.size() < 2)
1457  return forOp;
1458 
1459  // Gather dependence components for dependences between all ops in loop nest
1460  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1461  unsigned maxLoopDepth = loops.size();
1462  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1463  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1464 
1465  // Mark loops as either parallel or sequential.
1466  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1467  for (auto &depComps : depCompsVec) {
1468  assert(depComps.size() >= maxLoopDepth);
1469  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1470  DependenceComponent &depComp = depComps[j];
1471  assert(depComp.lb.has_value() && depComp.ub.has_value());
1472  if (*depComp.lb != 0 || *depComp.ub != 0)
1473  isParallelLoop[j] = false;
1474  }
1475  }
1476 
1477  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1478 
1479  // Compute permutation of loops that sinks sequential loops (and thus raises
1480  // parallel loops) while preserving relative order.
1481  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1482  unsigned nextSequentialLoop = numParallelLoops;
1483  unsigned nextParallelLoop = 0;
1484  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1485  if (isParallelLoop[i]) {
1486  loopPermMap[i] = nextParallelLoop++;
1487  } else {
1488  loopPermMap[i] = nextSequentialLoop++;
1489  }
1490  }
1491 
1492  // Check if permutation 'loopPermMap' would violate dependences.
1493  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1494  return forOp;
1495  // Perform loop interchange according to permutation 'loopPermMap'.
1496  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1497  return loops[loopNestRootIndex];
1498 }
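// For illustration (a minimal sketch; `%A` and its type are assumed), the %i
// loop below carries a dependence of distance one while the %j loop is
// parallel:
//
// ```mlir
// affine.for %i = 1 to 10 {
//   affine.for %j = 0 to 20 {
//     %v = affine.load %A[%i - 1, %j] : memref<10x20xf32>
//     affine.store %v, %A[%i, %j] : memref<10x20xf32>
//   }
// }
// ```
//
// Sinking the sequential %i loop gives a nest with %j outermost and the same
// body, and the returned AffineForOp is the new outermost (%j) loop.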
1499 
1500 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1501 // lower (resp. upper) loop bound. When called for both the lower and upper
1502 // bounds, the resulting IR resembles:
1503 //
1504 // ```mlir
 1505 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1506 // ...
1507 // }
1508 // ```
 1509 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
 1510                                 SmallVector<Value, 4> *operands,
1511  int64_t offset = 0) {
1512  auto bounds = llvm::to_vector<4>(map->getResults());
1513  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1514  operands->insert(operands->begin() + map->getNumDims(), iv);
1515  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1516  b.getContext());
1517  canonicalizeMapAndOperands(map, operands);
1518 }
1519 
1520 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1521 // Stripmine-sink is a primitive building block for generalized tiling of
1522 // imperfectly nested loops.
1523 // This transformation is purely mechanical and does not check legality,
1524 // profitability or even structural correctness. It is the user's
1525 // responsibility to specify `targets` that are dominated by `forOp`.
1526 // Returns the new AffineForOps, one per `targets`, nested immediately under
1527 // each of the `targets`.
 1528 static SmallVector<AffineForOp, 8>
 1529 stripmineSink(AffineForOp forOp, uint64_t factor,
1530  ArrayRef<AffineForOp> targets) {
1531  auto originalStep = forOp.getStepAsInt();
1532  auto scaledStep = originalStep * factor;
1533  forOp.setStep(scaledStep);
1534 
1535  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1536 
1537  // Lower-bound map creation.
1538  auto lbMap = forOp.getLowerBoundMap();
1539  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1540  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1541 
1542  // Upper-bound map creation.
1543  auto ubMap = forOp.getUpperBoundMap();
1544  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1545  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1546  /*offset=*/scaledStep);
1547 
1548  auto iv = forOp.getInductionVar();
1549  SmallVector<AffineForOp, 8> innerLoops;
1550  for (auto t : targets) {
1551  // Insert newForOp before the terminator of `t`.
1552  auto b = OpBuilder::atBlockTerminator(t.getBody());
1553  auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
1554  ubOperands, ubMap, originalStep);
1555  auto begin = t.getBody()->begin();
1556  // Skip terminator and `newForOp` which is just before the terminator.
1557  auto nOps = t.getBody()->getOperations().size() - 2;
1558  newForOp.getBody()->getOperations().splice(
1559  newForOp.getBody()->getOperations().begin(),
1560  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1561  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1562  newForOp.getRegion());
1563  innerLoops.push_back(newForOp);
1564  }
1565 
1566  return innerLoops;
1567 }
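// For illustration (a hedged sketch with assumed constant bounds and unit
// original step), strip-mining the %i loop by a factor of 32 and sinking it
// under the %j target roughly yields:
//
// ```mlir
// #lb = affine_map<(d0) -> (0, d0)>
// #ub = affine_map<(d0) -> (1024, d0 + 32)>
// affine.for %i = 0 to 1024 step 32 {
//   affine.for %j = 0 to 128 {
//     affine.for %ii = max #lb(%i) to min #ub(%i) {
//       // original body of the %j loop, now using %ii in place of %i
//     }
//   }
// }
// ```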
1568 
1569 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1570 // Returns the new AffineForOps, nested immediately under `target`.
1571 template <typename SizeType>
1572 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1573  AffineForOp target) {
1574  // TODO: Use cheap structural assertions that targets are nested under
1575  // forOp and that targets are not nested under each other when DominanceInfo
1576  // exposes the capability. It seems overkill to construct a whole function
1577  // dominance tree at this point.
1578  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1579  assert(res.size() == 1 && "Expected 1 inner forOp");
1580  return res[0];
1581 }
1582 
 1583 SmallVector<SmallVector<AffineForOp, 8>, 8>
 1584 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
 1585                    ArrayRef<AffineForOp> targets) {
 1586   SmallVector<SmallVector<AffineForOp, 8>, 8> res;
 1587   SmallVector<AffineForOp, 8> currentTargets(targets);
1588  for (auto it : llvm::zip(forOps, sizes)) {
1589  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1590  res.push_back(step);
1591  currentTargets = step;
1592  }
1593  return res;
1594 }
1595 
 1596 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
 1597                                                ArrayRef<uint64_t> sizes,
1598  AffineForOp target) {
 1599   SmallVector<AffineForOp, 8> res;
 1600   for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1601  res.push_back(llvm::getSingleElement(loops));
1602  return res;
1603 }
1604 
 1605 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
 1606   if (loops.size() < 2)
1607  return success();
1608 
1609  AffineForOp innermost = loops.back();
1610  AffineForOp outermost = loops.front();
1611  AffineBound ub = outermost.getUpperBound();
1612  AffineMap origUbMap = ub.getMap();
1613  Location loc = outermost.getLoc();
1614  OpBuilder builder(outermost);
1615  for (AffineForOp loop : loops) {
1616  // We only work on normalized loops.
1617  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1618  loop.getConstantLowerBound() != 0)
1619  return failure();
1620  }
1621  SmallVector<Value, 4> upperBoundSymbols;
1622  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1623  ub.getOperands().end());
1624 
1625  // 1. Store the upper bound of the outermost loop in a variable.
1626  Value prev;
1627  if (!llvm::hasSingleElement(origUbMap.getResults()))
1628  prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1629  else
1630  prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1631  upperBoundSymbols.push_back(prev);
1632 
1633  // 2. Emit code computing the upper bound of the coalesced loop as product of
1634  // the number of iterations of all loops.
1635  for (AffineForOp loop : loops.drop_front()) {
1636  ub = loop.getUpperBound();
1637  origUbMap = ub.getMap();
1638  ubOperands = ub.getOperands();
1639  Value upperBound;
1640  // If upper bound map has more than one result, take their minimum.
1641  if (!llvm::hasSingleElement(origUbMap.getResults()))
1642  upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1643  else
1644  upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1645  upperBoundSymbols.push_back(upperBound);
1646  SmallVector<Value, 4> operands;
1647  operands.push_back(prev);
1648  operands.push_back(upperBound);
1649  // Maintain running product of loop upper bounds.
1650  prev = AffineApplyOp::create(
1651  builder, loc,
1652  AffineMap::get(/*dimCount=*/1,
1653  /*symbolCount=*/1,
1654  builder.getAffineDimExpr(0) *
1655  builder.getAffineSymbolExpr(0)),
1656  operands);
1657  }
1658  // Set upper bound of the coalesced loop.
1659  AffineMap newUbMap = AffineMap::get(
1660  /*dimCount=*/0,
1661  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1662  outermost.setUpperBound(prev, newUbMap);
1663 
1664  builder.setInsertionPointToStart(outermost.getBody());
1665 
1666  // 3. Remap induction variables. For each original loop, the value of the
1667  // induction variable can be obtained by dividing the induction variable of
1668  // the linearized loop by the total number of iterations of the loops nested
1669  // in it modulo the number of iterations in this loop (remove the values
1670  // related to the outer loops):
1671  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1672  // Compute these iteratively from the innermost loop by creating a "running
1673  // quotient" of division by the range.
1674  Value previous = outermost.getInductionVar();
1675  for (unsigned idx = loops.size(); idx > 0; --idx) {
1676  if (idx != loops.size()) {
1677  SmallVector<Value, 4> operands;
1678  operands.push_back(previous);
1679  operands.push_back(upperBoundSymbols[idx]);
1680  previous = AffineApplyOp::create(builder, loc,
 1681                                        AffineMap::get(
 1682                                            /*dimCount=*/1, /*symbolCount=*/1,
1683  builder.getAffineDimExpr(0).floorDiv(
1684  builder.getAffineSymbolExpr(0))),
1685  operands);
1686  }
1687  // Modified value of the induction variables of the nested loops after
1688  // coalescing.
1689  Value inductionVariable;
1690  if (idx == 1) {
1691  inductionVariable = previous;
1692  } else {
1693  SmallVector<Value, 4> applyOperands;
1694  applyOperands.push_back(previous);
1695  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1696  inductionVariable = AffineApplyOp::create(
1697  builder, loc,
 1698           AffineMap::get(
 1699               /*dimCount=*/1, /*symbolCount=*/1,
1700  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1701  applyOperands);
1702  }
1703  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1704  inductionVariable, loops.back().getRegion());
1705  }
1706 
1707  // 4. Move the operations from the innermost just above the second-outermost
1708  // loop, delete the extra terminator and the second-outermost loop.
1709  AffineForOp secondOutermostLoop = loops[1];
1710  innermost.getBody()->back().erase();
1711  outermost.getBody()->getOperations().splice(
1712  Block::iterator(secondOutermostLoop.getOperation()),
1713  innermost.getBody()->getOperations());
1714  secondOutermostLoop.erase();
1715  return success();
1716 }
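// For example, coalescing a 2-d nest with IVs %i in [0, %M) and %j in [0, %N)
// (unit steps, zero lower bounds) produces a single loop with IV %k in
// [0, %M * %N), and the original IVs are recovered inside the body as
//   %i = %k floordiv %N
//   %j = %k mod %N
// so each (i, j) pair is visited exactly once and in the original order.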
1717 
 1718 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
 1719                                          ArrayRef<Value> processorId,
1720  ArrayRef<Value> numProcessors) {
1721  assert(processorId.size() == numProcessors.size());
1722  if (processorId.empty())
1723  return;
1724 
1725  OpBuilder b(forOp);
1726  Location loc(forOp.getLoc());
1727  AffineExpr lhs, rhs;
1728  bindSymbols(forOp.getContext(), lhs, rhs);
1729  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1730  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1731 
1732  Value linearIndex = processorId.front();
1733  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1734  auto mulApplyOp = AffineApplyOp::create(
1735  b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1736  linearIndex = AffineApplyOp::create(b, loc, addMap,
1737  ValueRange{mulApplyOp, processorId[i]});
1738  }
1739 
1740  auto mulApplyOp = AffineApplyOp::create(
1741  b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1742  Value lb = AffineApplyOp::create(
1743  b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1744  forOp.setLowerBound(lb);
1745 
1746  Value step = forOp.getStep();
1747  for (auto numProcs : numProcessors)
1748  step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
1749  forOp.setStep(step);
1750 }
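// For example, with processorId = {%bx, %tx} and numProcessors = {%nbx, %ntx}
// (e.g. GPU block and thread ids/counts), the loop is rewritten so that
//   linearIndex = %bx * %ntx + %tx
//   new lower bound = old lower bound + linearIndex * old step
//   new step = old step * %nbx * %ntx
// i.e. each processor starts at its own offset and strides over the iteration
// space by the total number of processors (a cyclic distribution).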
1751 
1752 /// Given a memref region, determine the lowest depth at which transfers can be
1753 /// placed for it, and return the corresponding block, start and end positions
1754 /// in the block for placing incoming (read) and outgoing (write) copies
1755 /// respectively. The lowest depth depends on whether the region being accessed
1756 /// is hoistable with respect to one or more immediately surrounding loops.
1757 static void
 1758 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
 1759                              Block::iterator &begin, Block::iterator &end,
1760  Block **copyPlacementBlock,
1761  Block::iterator *copyInPlacementStart,
1762  Block::iterator *copyOutPlacementStart) {
1763  const auto *cst = region.getConstraints();
1764  SmallVector<Value, 4> symbols;
1765  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1766 
1767  SmallVector<Operation *, 4> enclosingAffineOps;
1768  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1769  // Walk up loop parents till we find an IV on which this region is
1770  // symbolic/variant or we hit `hoistGuard`.
1771  auto it = enclosingAffineOps.rbegin();
1772  AffineForOp lastInvariantFor;
1773  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1774  Operation *enclosingOp = *it;
1775  // We can't hoist past the definition of the memref being copied.
1776  Value memref = region.memref;
1777  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1778  LDBG() << "memref definition will end up not dominating hoist location";
1779  break;
1780  }
1781 
1782  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1783  if (!affineFor)
1784  break;
 1785  // TODO: we also need to check this for region symbols that aren't loop
 1786  // IVs, i.e., whether we are within their respective defs' dominance scope.
1787  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1788  break;
1789  lastInvariantFor = affineFor;
1790  }
1791 
1792  if (it != enclosingAffineOps.rbegin()) {
1793  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1794  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1795  *copyPlacementBlock = lastInvariantFor->getBlock();
1796  } else {
1797  *copyInPlacementStart = begin;
1798  *copyOutPlacementStart = end;
1799  *copyPlacementBlock = &block;
1800  }
1801 }
1802 
1803 // Info comprising stride and number of elements transferred every stride.
1804 struct StrideInfo {
1805  int64_t stride;
 1806   int64_t numEltPerStride;
 1807 };
1808 
1809 /// Returns striding information for a copy/transfer of this region with
1810 /// potentially multiple striding levels from outermost to innermost. For an
1811 /// n-dimensional region, there can be at most n-1 levels of striding
1812 /// successively nested.
1813 // TODO: make this work with non-identity layout maps.
1814 static void getMultiLevelStrides(const MemRefRegion &region,
1815  ArrayRef<int64_t> bufferShape,
1816  SmallVectorImpl<StrideInfo> *strideInfos) {
1817  if (bufferShape.size() <= 1)
1818  return;
1819 
1820  int64_t numEltPerStride = 1;
1821  int64_t stride = 1;
1822  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1823  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1824  stride *= dimSize;
1825  numEltPerStride *= bufferShape[d];
1826  // A stride is needed only if the region has a shorter extent than the
1827  // memref along the dimension *and* has an extent greater than one along the
1828  // next major dimension.
1829  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1830  strideInfos->push_back({stride, numEltPerStride});
1831  }
1832  }
1833 }
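// For example, for a 64x32 region of a memref<64x128xf32>, the innermost
// dimension transfers 32 contiguous elements out of each row of 128, so a
// single entry {stride = 128, numEltPerStride = 32} is recorded; the
// outermost dimension never contributes a stride entry.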
1834 
1835 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1836 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1837 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1838 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1839 /// contain the expressions to be subtracted out from the respective copy loop
1840 /// iterators in order to index the fast buffer. If `copyOut' is true, generates
1841 /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1842 /// copy nest is inserted.
1843 //
1844 /// The copy-in nest is generated as follows as an example for a 2-d region:
1845 /// for x = ...
1846 /// for y = ...
1847 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1848 ///
1849 static AffineForOp
1850 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1851  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1852  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1853  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1854  OpBuilder b) {
1855  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1856  return lbMap.getNumInputs() == lbOperands.size();
1857  }));
1858  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1859  return ubMap.getNumInputs() == ubOperands.size();
1860  }));
1861 
1862  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1863  // A copy nest can't be generated for 0-ranked memrefs.
1864  assert(rank != 0 && "non-zero rank memref expected");
1865  assert(lbMaps.size() == rank && "wrong number of lb maps");
1866  assert(ubMaps.size() == rank && "wrong number of ub maps");
1867 
1868  SmallVector<Value, 4> memIndices;
1869  SmallVector<AffineExpr, 4> fastBufExprs;
1870  SmallVector<Value, 4> fastBufMapOperands;
1871  AffineForOp copyNestRoot;
1872  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1873  for (unsigned d = 0; d < rank; ++d) {
1874  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1875  ubOperands, ubMaps[d]);
1876  if (d == 0)
1877  copyNestRoot = forOp;
1878 
1879  b = OpBuilder::atBlockTerminator(forOp.getBody());
1880 
1881  auto fastBufOffsetMap =
1882  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1883  auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);
1884 
1885  // Construct the subscript for the fast memref being copied into/from:
1886  // x - offset_x.
1887  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1888  b.getAffineDimExpr(2 * d));
1889  fastBufMapOperands.push_back(offset);
1890  fastBufMapOperands.push_back(forOp.getInductionVar());
1891  mayBeDeadApplys.push_back(offset);
1892 
1893  // Subscript for the slow memref being copied.
1894  memIndices.push_back(forOp.getInductionVar());
1895  }
1896 
1897  auto fastBufMap =
1898  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1899  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1900  fastBufMap = simplifyAffineMap(fastBufMap);
1901  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1902 
1903  // Drop any dead affine.applys.
1904  for (auto applyOp : mayBeDeadApplys)
1905  if (applyOp.use_empty())
1906  applyOp.erase();
1907 
1908  if (!isCopyOut) {
1909  // Copy in.
1910  auto load = AffineLoadOp::create(b, loc, memref, memIndices);
1911  AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
1912  fastBufMapOperands);
1913  return copyNestRoot;
1914  }
1915 
1916  // Copy out.
1917  auto load =
1918  AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
1919  AffineStoreOp::create(b, loc, load, memref, memIndices);
1920  return copyNestRoot;
1921 }
1922 
1923 [[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
1924  return block.getParentOp()->emitRemark();
1925 }
1926 
1927 /// Creates a buffer in the faster memory space for the specified memref region
1928 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1929 /// space to this one, and replaces all loads/stores in the block range
1930 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1931 /// if copies could not be generated due to yet unimplemented cases.
1932 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1933 /// specify the insertion points where the incoming copies and outgoing copies,
1934 /// respectively, should be inserted (the insertion happens right before the
1935 /// insertion point). Since `begin` can itself be invalidated due to the memref
1936 /// rewriting done from this method, the output argument `nBegin` is set to its
1937 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1938 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1939 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1940 /// allocated.
1941 static LogicalResult generateCopy(
1942  const MemRefRegion &region, Block *block, Block::iterator begin,
1943  Block::iterator end, Block *copyPlacementBlock,
1944  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1945  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1946  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1947  Block::iterator *nBegin, Block::iterator *nEnd) {
1948  *nBegin = begin;
1949  *nEnd = end;
1950 
1951  auto f = begin->getParentOfType<FunctionOpInterface>();
1952  OpBuilder topBuilder(f.getFunctionBody());
1953  Value zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);
1954 
1955  *sizeInBytes = 0;
1956 
1957  if (begin == end)
1958  return success();
1959 
1960  // Record the last op in the block for which we are performing copy
1961  // generation. We later do the memref replacement only in [begin, lastCopyOp]
1962  // so that the original memref's used in the data movement code themselves
1963  // don't get replaced.
1964  Operation *lastCopyOp = end->getPrevNode();
1965 
1966  // Is the copy out point at the end of the block where we are doing
1967  // explicit copying.
1968  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1969 
1970  // Copies for read regions are going to be inserted at 'begin'.
1971  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1972  // Copies for write regions are going to be inserted at 'end'.
1973  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1974  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1975 
1976  // Builder to create constants at the top level.
1977  auto func =
1978  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1979  OpBuilder top(func.getFunctionBody());
1980 
1981  auto loc = region.loc;
1982  auto memref = region.memref;
1983  auto memRefType = cast<MemRefType>(memref.getType());
1984 
1985  if (!memRefType.getLayout().isIdentity()) {
1986  LDBG() << "Non-identity layout map not yet supported";
1987  return failure();
1988  }
1989 
1990  // Indices to use for the copying.
1991  // Indices for the original memref being copied from/to.
1992  SmallVector<Value, 4> memIndices;
1993  // Indices for the faster buffer being copied into/from.
1994  SmallVector<Value, 4> bufIndices;
1995 
1996  unsigned rank = memRefType.getRank();
1997  if (rank == 0) {
 1998  LDBG() << "Only non-zero ranked memrefs are supported";
1999  return failure();
2000  }
2001 
2002  SmallVector<int64_t, 4> fastBufferShape;
2003 
2004  // Compute the extents of the buffer.
 2005   SmallVector<AffineMap, 2> lbs;
 2006   lbs.reserve(rank);
2007  std::optional<int64_t> numElements =
2008  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2009  if (!numElements) {
2010  LDBG() << "Non-constant region size not supported";
2011  return failure();
2012  }
2013 
2014  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2015  // This can be supported in the future if needed.
2016  LDBG() << "Max lower bound for memref region start not supported";
2017  return failure();
2018  }
2019 
2020  if (*numElements == 0) {
2021  LDBG() << "Nothing to copy";
2022  return success();
2023  }
2024 
2025  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2026  for (unsigned i = 0; i < rank; ++i) {
2027  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2028  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2029  LDBG() << "Missing lower or upper bound for region along dimension: "
2030  << i;
2031  return failure();
2032  }
2033  }
2034 
2035  const FlatAffineValueConstraints *cst = region.getConstraints();
2036  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2037  // on; these typically include loop IVs surrounding the level at which the
2038  // copy generation is being done or other valid symbols in MLIR.
2039  SmallVector<Value, 8> regionSymbols;
2040  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2041 
2042  // Construct the access expression for the fast memory buffer. The access
2043  // expression for a particular dimension of the fast buffer is obtained by
2044  // subtracting out the lower bound on the original memref's data region
2045  // along the corresponding dimension.
2046 
2047  // Index start offsets for faster memory buffer relative to the original.
2048  SmallVector<AffineExpr, 4> fastBufOffsets;
2049  fastBufOffsets.reserve(rank);
2050  for (unsigned d = 0; d < rank; d++) {
2051  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2052  "incorrect bound size");
2053 
2054  // Set copy start location for this dimension in the lower memory space
2055  // memref.
2056  if (lbs[d].isSingleConstant()) {
2057  auto indexVal = lbs[d].getSingleConstantResult();
2058  if (indexVal == 0) {
2059  memIndices.push_back(zeroIndex);
2060  } else {
2061  memIndices.push_back(
2062  arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
2063  }
2064  } else {
2065  // The coordinate for the start location is just the lower bound along the
2066  // corresponding dimension on the memory region (stored in 'offset').
2067  // Remap all inputs of the map to dimensions uniformly since in the
 2068  // generated IR we need valid affine symbols as opposed to "symbols" for
2069  // the purpose of the memref region.
2070  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2071  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2072  symReplacements[i] = top.getAffineDimExpr(i);
2073  lbs[d] = lbs[d].replaceDimsAndSymbols(
2074  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2075  /*numResultSyms=*/0);
2076  memIndices.push_back(
2077  AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
2078  }
2079  // The fast buffer is copied into at location zero; addressing is relative.
2080  bufIndices.push_back(zeroIndex);
2081 
2082  // Record the offsets since they are needed to remap the memory accesses of
2083  // the original memref further below.
2084  fastBufOffsets.push_back(lbs[d].getResult(0));
2085  }
2086 
2087  // The faster memory space buffer.
2088  Value fastMemRef;
2089 
2090  // Check if a buffer was already created.
2091  bool existingBuf = fastBufferMap.count(memref) > 0;
2092  if (!existingBuf) {
2093  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2094  auto fastMemRefType =
2095  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2096  fastBufferLayout, copyOptions.fastMemorySpace);
2097 
2098  // Create the fast memory space buffer just before the 'affine.for'
2099  // operation.
2100  fastMemRef =
2101  memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
2102  // Record it.
2103  fastBufferMap[memref] = fastMemRef;
2104  // fastMemRefType is a constant shaped memref.
2105  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2106  // We don't account for things of unknown size.
2107  *sizeInBytes = maySizeInBytes.value_or(0);
2108 
2109  LLVM_DEBUG(emitRemarkForBlock(*block)
2110  << "Creating fast buffer of type " << fastMemRefType
2111  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2112  << " KiB\n");
2113  } else {
2114  // Reuse the one already created.
2115  fastMemRef = fastBufferMap[memref];
2116  }
2117 
2118  auto numElementsSSA = arith::ConstantIndexOp::create(top, loc, *numElements);
2119 
2120  Value dmaStride;
2121  Value numEltPerDmaStride;
2122  if (copyOptions.generateDma) {
2123  SmallVector<StrideInfo, 4> dmaStrideInfos;
2124  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2125 
2126  // TODO: use all stride levels once DmaStartOp is extended for
2127  // multi-level strides.
2128  if (dmaStrideInfos.size() > 1) {
2129  LDBG() << "Only up to one level of stride supported";
2130  return failure();
2131  }
2132 
2133  if (!dmaStrideInfos.empty()) {
2134  dmaStride =
2135  arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
2136  numEltPerDmaStride = arith::ConstantIndexOp::create(
2137  top, loc, dmaStrideInfos[0].numEltPerStride);
2138  }
2139  }
2140 
2141  // Create fully composed affine maps for each memref.
2142  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2143  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2144  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2145  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2146 
2147  if (!copyOptions.generateDma) {
2148  // Point-wise copy generation.
2149  auto copyNest =
2150  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2151  /*lbOperands=*/regionSymbols, ubMaps,
2152  /*ubOperands=*/regionSymbols, fastBufOffsets,
2153  /*isCopyOut=*/region.isWrite(), b);
2154 
2155  // Record this so that we can skip it from yet another copy.
2156  copyNests.insert(copyNest);
2157 
2158  // Since new ops are being appended (for copy out's), adjust the end to
2159  // mark end of block range being processed if necessary.
2160  if (region.isWrite() && isCopyOutAtEndOfBlock)
2161  *nEnd = Block::iterator(copyNest.getOperation());
2162  } else {
2163  // DMA generation.
2164  // Create a tag (single element 1-d memref) for the DMA.
2165  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2166  copyOptions.tagMemorySpace);
2167  auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);
2168 
2169  SmallVector<Value, 4> tagIndices({zeroIndex});
2170  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2171  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2172  if (!region.isWrite()) {
2173  // DMA non-blocking read from original buffer to fast buffer.
2174  AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
2175  fastMemRef, bufAffineMap, bufIndices, tagMemRef,
2176  tagAffineMap, tagIndices, numElementsSSA,
2177  dmaStride, numEltPerDmaStride);
2178  } else {
2179  // DMA non-blocking write from fast buffer to the original memref.
2180  auto op = AffineDmaStartOp::create(
2181  b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2182  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2183  dmaStride, numEltPerDmaStride);
2184  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2185  // end to mark end of block range being processed.
2186  if (isCopyOutAtEndOfBlock)
2187  *nEnd = Block::iterator(op.getOperation());
2188  }
2189 
2190  // Matching DMA wait to block on completion; tag always has a 0 index.
2191  AffineDmaWaitOp::create(b, loc, tagMemRef, tagAffineMap, zeroIndex,
2192  numElementsSSA);
2193 
2194  // Generate dealloc for the tag.
2195  auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
2196  if (*nEnd == end && isCopyOutAtEndOfBlock)
2197  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2198  // mark end of range of the original.
2199  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2200  }
2201 
2202  // Generate dealloc for the buffer.
2203  if (!existingBuf) {
2204  auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
2205  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2206  // the fast buffer (since it marks the new end insertion point).
2207  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2208  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2209  }
2210 
2211  // Replace all uses of the old memref with the faster one while remapping
2212  // access indices (subtracting out lower bound offsets for each dimension).
2213  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2214  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2215  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2216  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2217  // d2, d3 correspond to the original indices (%i, %j).
2218  SmallVector<AffineExpr, 4> remapExprs;
2219  remapExprs.reserve(rank);
2220  for (unsigned i = 0; i < rank; i++) {
2221  // The starting operands of indexRemap will be regionSymbols (the symbols on
2222  // which the memref region is parametric); then those corresponding to
2223  // the memref's original indices follow.
2224  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2225  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2226  }
2227  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2228  b.getContext());
2229 
2230  // Record the begin since it may be invalidated by memref replacement.
2231  Block::iterator prevOfBegin;
2232  bool isBeginAtStartOfBlock = (begin == block->begin());
2233  if (!isBeginAtStartOfBlock)
2234  prevOfBegin = std::prev(begin);
2235 
2236  auto userFilterFn = [&](Operation *user) {
2237  auto *ancestorUser = block->findAncestorOpInBlock(*user);
2238  return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2239  !lastCopyOp->isBeforeInBlock(ancestorUser);
2240  };
2241 
2242  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2243  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2244  /*extraIndices=*/{}, indexRemap,
2245  /*extraOperands=*/regionSymbols,
2246  /*symbolOperands=*/{}, userFilterFn);
2247 
2248  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2249 
2250  return success();
2251 }
2252 
2253 /// Construct the memref region to just include the entire memref. Returns false
 2254 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2255 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2256 /// is parametric on.
2257 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2258  MemRefRegion *region) {
2259  unsigned rank;
2260  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2261  rank = loadOp.getMemRefType().getRank();
2262  region->memref = loadOp.getMemRef();
2263  region->setWrite(false);
2264  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2265  rank = storeOp.getMemRefType().getRank();
2266  region->memref = storeOp.getMemRef();
2267  region->setWrite(true);
2268  } else {
2269  assert(false && "expected load or store op");
2270  return false;
2271  }
2272  auto memRefType = cast<MemRefType>(region->memref.getType());
2273  if (!memRefType.hasStaticShape())
2274  return false;
2275 
2276  auto *regionCst = region->getConstraints();
2277 
2278  // Just get the first numSymbols IVs, which the memref region is parametric
2279  // on.
 2280   SmallVector<AffineForOp, 4> ivs;
 2281   getAffineForIVs(*op, &ivs);
2282  ivs.resize(numParamLoopIVs);
2283  SmallVector<Value, 4> symbols;
2284  extractForInductionVars(ivs, &symbols);
2285  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2286  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2287 
2288  // Memref dim sizes provide the bounds.
2289  for (unsigned d = 0; d < rank; d++) {
2290  auto dimSize = memRefType.getDimSize(d);
2291  assert(dimSize > 0 && "filtered dynamic shapes above");
2292  regionCst->addBound(BoundType::LB, d, 0);
2293  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2294  }
2295  return true;
2296 }
2297 
2298 LogicalResult
 2299 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
 2300                                      const AffineCopyOptions &copyOptions,
2301  std::optional<Value> filterMemRef,
2302  DenseSet<Operation *> &copyNests) {
2303  if (begin == end)
2304  return success();
2305 
2306  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2307  "Inconsistent block begin/end args");
2308  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2309 
2310  Block *block = begin->getBlock();
2311 
2312  // Copies will be generated for this depth, i.e., symbolic in all loops
 2313  // surrounding this block range.
2314  unsigned copyDepth = getNestingDepth(&*begin);
2315 
2316  LDBG() << "Generating copies at depth " << copyDepth;
2317  LDBG() << "from begin: "
2318  << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
2319  LDBG() << "to inclusive end: "
2320  << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());
2321 
2322  // List of memory regions to copy for. We need a map vector to have a
2323  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2324  // since the alloc's for example are identical except for the SSA id.
2325  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2326  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2327 
2328  // Map from original memref's to the fast buffers that their accesses are
2329  // replaced with.
2330  DenseMap<Value, Value> fastBufferMap;
2331 
2332  // To check for errors when walking the block.
2333  bool error = false;
2334 
2335  // Walk this range of operations to gather all memory regions.
2336  block->walk(begin, end, [&](Operation *opInst) {
2337  Value memref;
2338  MemRefType memrefType;
2339  // Gather regions to allocate to buffers in faster memory space.
2340  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2341  memref = loadOp.getMemRef();
2342  memrefType = loadOp.getMemRefType();
2343  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2344  memref = storeOp.getMemRef();
2345  memrefType = storeOp.getMemRefType();
2346  }
2347  // Not an affine.load/store op.
2348  if (!memref)
2349  return;
2350 
2351  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2352  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2353  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2354  return;
2355 
2356  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2357  LDBG() << "memref definition is inside of the depth at "
2358  << "which copy-in/copy-out would happen";
2359  return;
2360  }
2361 
2362  // Compute the MemRefRegion accessed.
2363  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2364  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2365  /*addMemRefDimBounds=*/false))) {
2366  LDBG() << "Error obtaining memory region: semi-affine maps?";
2367  LDBG() << "over-approximating to the entire memref";
2368  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2369  LDBG() << "non-constant memref sizes not yet supported";
2370  error = true;
2371  return;
2372  }
2373  }
2374 
2375  // Each memref has a single buffer associated with it irrespective of how
2376  // many load's and store's happen on it.
2377  // TODO: in the future, when regions don't intersect and satisfy
2378  // other properties (based on load/store regions), we could consider
2379  // multiple buffers per memref.
2380 
2381  // Add to the appropriate region if it's not already in it, or take a
2382  // bounding box union with the existing one if it's already in there.
2383  // Note that a memref may have both read and write regions - so update the
2384  // region in the other list if one exists (write in case of read and vice
2385  // versa) since there is a single bounding box for a memref across all reads
2386  // and writes that happen on it.
2387 
2388  // Attempts to update; returns true if 'region' exists in targetRegions.
2389  auto updateRegion =
2390  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2391  &targetRegions) {
2392  const auto *const it = targetRegions.find(region->memref);
2393  if (it == targetRegions.end())
2394  return false;
2395 
2396  // Perform a union with the existing region.
2397  if (failed(it->second->unionBoundingBox(*region))) {
2398  LDBG() << "Memory region bounding box failed; "
2399  << "over-approximating to the entire memref";
2400  // If the union fails, we will overapproximate.
2401  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2402  LDBG() << "non-constant memref sizes not yet supported";
2403  error = true;
2404  return true;
2405  }
2406  it->second->getConstraints()->clearAndCopyFrom(
2407  *region->getConstraints());
2408  } else {
2409  // Union was computed and stored in 'it->second': copy to 'region'.
2410  region->getConstraints()->clearAndCopyFrom(
2411  *it->second->getConstraints());
2412  }
2413  return true;
2414  };
2415 
2416  bool existsInRead = updateRegion(readRegions);
2417  if (error)
2418  return;
2419  bool existsInWrite = updateRegion(writeRegions);
2420  if (error)
2421  return;
2422 
2423  // Finally add it to the region list.
2424  if (region->isWrite() && !existsInWrite) {
2425  writeRegions[region->memref] = std::move(region);
2426  } else if (!region->isWrite() && !existsInRead) {
2427  readRegions[region->memref] = std::move(region);
2428  }
2429  });
2430 
2431  if (error) {
2432  LDBG() << "copy generation failed for one or more memref's in this block";
2433  return failure();
2434  }
2435 
2436  uint64_t totalCopyBuffersSizeInBytes = 0;
2437  bool ret = true;
2438  auto processRegions =
2439  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2440  &regions) {
2441  for (const auto &regionEntry : regions) {
2442  // For each region, hoist copy in/out past all hoistable
2443  // 'affine.for's.
2444  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2445  Block *copyPlacementBlock;
 2446           findHighestBlockForPlacement(
 2447               *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2448  &copyInPlacementStart, &copyOutPlacementStart);
2449 
2450  uint64_t sizeInBytes;
2451  Block::iterator nBegin, nEnd;
2452  LogicalResult iRet = generateCopy(
2453  *regionEntry.second, block, begin, end, copyPlacementBlock,
2454  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2455  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2456  if (succeeded(iRet)) {
2457  // begin/end could have been invalidated, and need update.
2458  begin = nBegin;
2459  end = nEnd;
2460  totalCopyBuffersSizeInBytes += sizeInBytes;
2461  }
2462  ret = ret & succeeded(iRet);
2463  }
2464  };
2465  processRegions(readRegions);
2466  processRegions(writeRegions);
2467 
2468  if (!ret) {
2469  LDBG() << "copy generation failed for one or more memref's in this block";
2470  return failure();
2471  }
2472 
2473  // For a range of operations, a note will be emitted at the caller.
2474  AffineForOp forOp;
2475  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2476  LLVM_DEBUG(forOp.emitRemark()
2477  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2478  << " KiB of copy buffers in fast memory space for this block");
2479  }
2480 
2481  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2482  block->getParentOp()->emitWarning(
 2483        "total size of all copy buffers for this block exceeds fast memory "
2484  "capacity");
2485  }
2486 
2487  return success();
2488 }
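// For illustration (a hedged sketch; names, shapes, and the fast memory space
// `1` are assumed), generating pointwise copies for a block range that
// consists of just this loop:
//
// ```mlir
// affine.for %i = 0 to 256 {
//   %v = affine.load %A[%i] : memref<256xf32>
//   "use"(%v) : (f32) -> ()
// }
// ```
//
// roughly produces a fast buffer, a copy-in nest, rewritten accesses, and a
// dealloc:
//
// ```mlir
// %buf = memref.alloc() : memref<256xf32, 1>
// affine.for %i = 0 to 256 {
//   %v = affine.load %A[%i] : memref<256xf32>
//   affine.store %v, %buf[%i] : memref<256xf32, 1>
// }
// affine.for %i = 0 to 256 {
//   %v = affine.load %buf[%i] : memref<256xf32, 1>
//   "use"(%v) : (f32) -> ()
// }
// memref.dealloc %buf : memref<256xf32, 1>
// ```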
2489 
2490 // A convenience version of affineDataCopyGenerate for all ops in the body of
2491 // an AffineForOp.
 2492 LogicalResult mlir::affine::affineDataCopyGenerate(
 2493     AffineForOp forOp, const AffineCopyOptions &copyOptions,
2494  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2495  return affineDataCopyGenerate(forOp.getBody()->begin(),
2496  std::prev(forOp.getBody()->end()), copyOptions,
2497  filterMemRef, copyNests);
2498 }
2499 
 2500 LogicalResult mlir::affine::generateCopyForMemRegion(
 2501     const MemRefRegion &memrefRegion, Operation *analyzedOp,
2502  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2503  Block *block = analyzedOp->getBlock();
2504  auto begin = analyzedOp->getIterator();
2505  auto end = std::next(begin);
2506  DenseMap<Value, Value> fastBufferMap;
2507  DenseSet<Operation *> copyNests;
2508 
2509  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2510  copyOptions, fastBufferMap, copyNests,
2511  &result.sizeInBytes, &begin, &end);
2512  if (failed(err))
2513  return err;
2514 
2515  const auto &en = fastBufferMap.find(memrefRegion.memref);
2516  // In some cases (empty loops), no copy generation would have happened.
2517  if (en == fastBufferMap.end())
2518  return failure();
2519  result.alloc = en->second.getDefiningOp();
2520  assert(result.alloc && "fast buffer expected to be locally allocated");
2521  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2522  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2523  return success();
2524 }
2525 
2526 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2527 static void
2528 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2529  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
 2530  // Add a new empty level to output if it doesn't already exist.
2531  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2532  if (currLoopDepth == depthToLoops.size())
2533  depthToLoops.emplace_back();
2534 
2535  for (auto &op : *block) {
2536  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2537  depthToLoops[currLoopDepth].push_back(forOp);
2538  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2539  }
2540  }
2541 }
2542 
2543 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
 2544 void mlir::affine::gatherLoops(
 2545     func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2546  for (auto &block : func)
2547  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2548 
2549  // Remove last loop level from output since it's empty.
2550  if (!depthToLoops.empty()) {
2551  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2552  depthToLoops.pop_back();
2553  }
2554 }
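// For example, for a function body containing
//   affine.for %i { affine.for %j { ... } affine.for %k { ... } }
// the result is depthToLoops[0] = {%i loop} and depthToLoops[1] = {%j loop,
// %k loop}.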
2555 
 2556 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
 2557     OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2558  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2559  SmallVector<Value, 4> lowerOperands(lbOperands);
2560  SmallVector<Value, 4> upperOperands(ubOperands);
2561 
2562  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2563  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2564  lbMap = removeDuplicateExprs(lbMap);
2565  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2566  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2567  ubMap = removeDuplicateExprs(ubMap);
2568 
2569  return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
2570  step);
2571 }
2572 
2573 /// Creates an AffineIfOp that encodes the conditional to choose between
2574 /// the constant trip count version and an unknown trip count version of this
2575 /// nest of loops. This is used to separate partial and full tiles if `loops`
2576 /// has the intra-tile loops. The affine.if op is inserted at the builder
2577 /// insertion point of `b`.
 2578 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
 2579                                             OpBuilder b) {
2580  if (loops.empty())
2581  return nullptr;
2582 
2583  auto *context = loops[0].getContext();
2584 
 2585   FlatAffineValueConstraints cst;
 2586   SmallVector<Operation *, 8> ops;
 2587   llvm::append_range(ops, loops);
2588  (void)getIndexSet(ops, &cst);
2589 
2590  // Remove constraints that are independent of these loop IVs.
2591  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2592 
2593  // Construct the constraint set representing the guard for full tiles. The
2594  // lower bound (and upper bound) corresponding to the full tile should be
2595  // larger (and resp. smaller) than any other lower (or upper bound).
2596  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2597  for (auto loop : loops) {
2598  (void)loop;
2599  // TODO: Non-unit stride is not an issue to generalize to.
2600  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2601  // Mark everything symbols for the purpose of finding a constant diff pair.
2602  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2603  1);
2604  unsigned fullTileLbPos, fullTileUbPos;
2605  if (!((IntegerRelation)cst)
2606  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2607  /*boundFloorDivisor=*/nullptr,
2608  /*ub=*/nullptr, &fullTileLbPos,
2609  &fullTileUbPos)) {
2610  LDBG() << "Can't get constant diff pair for a loop";
2611  return nullptr;
2612  }
2613 
2614  SmallVector<unsigned, 4> lbIndices, ubIndices;
2615  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2616 
2617  auto fLb = cst.getInequality(fullTileLbPos);
2618  auto fUb = cst.getInequality(fullTileUbPos);
2619  fullTileLb.assign(fLb.begin(), fLb.end());
2620  fullTileUb.assign(fUb.begin(), fUb.end());
2621 
2622  // Full tile lower bound should be >= than any other lower bound.
2623  for (auto lbIndex : lbIndices)
2624  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2625  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2626 
2627  // Full tile upper bound should be <= any other upper bound.
2628  for (auto ubIndex : ubIndices)
2629  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2630  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2631 
2632  cst.removeVar(0);
2633  }
2634 
2635  // The previous step leads to all zeros for the full tile lb and ub position
2636  // itself; remove those and any other duplicates / trivial redundancies.
 2637   cst.removeTrivialRedundancy();
 2638 
2639  // Turn everything into dims conservatively since we earlier turned all
2640  // trailing ids past point loop IV into symbols. Some of these could be outer
2641  // loop IVs; we'll canonicalize anyway.
2642  cst.setDimSymbolSeparation(0);
2643 
2644  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2645  // ifCondSet can be null if cst was empty -- this can happen if all loops
2646  // in the nest have constant trip counts.
2647  if (!ifCondSet)
2648  return nullptr;
2649 
2650  SmallVector<Value, 4> setOperands;
2651  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2652  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2653  return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
2654  /*withElseRegion=*/true);
2655 }
2656 
2657 /// Create the full tile loop nest (along with its body).
2658 static LogicalResult
 2659 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
 2660                 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2661  fullTileLoops.reserve(inputNest.size());
2662 
2663  // For each loop in the original nest identify a lower/upper bound pair such
2664  // that their difference is a constant.
 2665   FlatAffineValueConstraints cst;
 2666   for (auto loop : inputNest) {
2667  // TODO: straightforward to generalize to a non-unit stride.
2668  if (loop.getStepAsInt() != 1) {
2669  LDBG() << "[tile separation] non-unit stride not implemented";
2670  return failure();
2671  }
2672  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2673  (void)getIndexSet(loopOp, &cst);
2674  // We will mark everything other than this loop IV as symbol for getting a
2675  // pair of <lb, ub> with a constant difference.
 2676     cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
 2677     unsigned lbPos, ubPos;
2678  if (!((IntegerRelation)cst)
2679  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2680  /*boundFloorDivisor=*/nullptr,
2681  /*ub=*/nullptr, &lbPos, &ubPos) ||
2682  lbPos == ubPos) {
2683  LDBG() << "[tile separation] Can't get constant diff / "
2684  << "equalities not yet handled";
2685  return failure();
2686  }
2687 
2688  // Set all variables as dimensions uniformly since some of those marked as
2689  // symbols above could be outer loop IVs (corresponding tile space IVs).
2690  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2691 
2692  AffineValueMap lbVmap, ubVmap;
2693  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2694  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2695  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2696  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2697  ubVmap.getOperands(), ubVmap.getAffineMap());
2698  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2699  fullTileLoops.push_back(fullTileLoop);
2700  }
2701 
2702  // Add the body for the full tile loop nest.
2703  IRMapping operandMap;
2704  for (const auto &loopEn : llvm::enumerate(inputNest))
2705  operandMap.map(loopEn.value().getInductionVar(),
2706  fullTileLoops[loopEn.index()].getInductionVar());
2707  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2708  for (auto &op : inputNest.back().getBody()->without_terminator())
2709  b.clone(op, operandMap);
2710  return success();
2711 }
2712 
2713 LogicalResult
2714 affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2715  SmallVectorImpl<AffineForOp> *fullTileNest) {
2716  if (inputNest.empty())
2717  return success();
2718 
2719  auto firstLoop = inputNest[0];
2720 
2721  // Each successive for op has to be nested in the other.
2722  auto prevLoop = firstLoop;
2723  for (auto loop : inputNest.drop_front(1)) {
2724  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2725  prevLoop = loop;
2726  }
2727 
2728  // Create the full tile loop nest.
2729  SmallVector<AffineForOp, 4> fullTileLoops;
2730  OpBuilder b(firstLoop);
2731  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2732  if (!fullTileLoops.empty())
2733  fullTileLoops.front().erase();
2734  return failure();
2735  }
2736 
2737  // Create and insert the version select right before the root of the nest.
2738  b = OpBuilder(firstLoop);
2739  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2740  if (!ifOp) {
2741  fullTileLoops.front().erase();
2742  LDBG() << "All tiles are full tiles, or failure creating "
2743  << "separation condition";
2744  return failure();
2745  }
2746 
2747  // Move the full tile into the then block.
2748  Block *thenBlock = ifOp.getThenBlock();
2749  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2750  thenBlock->getOperations().splice(
2751  std::prev(thenBlock->end()),
2752  outermostFullTileLoop->getBlock()->getOperations(),
2753  Block::iterator(outermostFullTileLoop));
2754 
2755  // Move the partial tile into the else block. The partial tile is the same as
2756  // the original loop nest.
2757  Block *elseBlock = ifOp.getElseBlock();
2758  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2759  firstLoop->getBlock()->getOperations(),
2760  Block::iterator(firstLoop));
2761 
2762  if (fullTileNest)
2763  *fullTileNest = std::move(fullTileLoops);
2764 
2765  return success();
2766 }
2767 
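As a usage illustration only, here is a minimal sketch (assuming a 2-d hyper-rectangular band rooted at a hypothetical `root` loop, the declarations from mlir/Dialect/Affine/LoopUtils.h, and hypothetical driver/tile-size choices) of applying separateFullTiles to the intra-tile loops produced by tiling:

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"

using namespace mlir;
using namespace mlir::affine;

// Hypothetical driver: tile a 2-d band and guard its intra-tile loops with a
// full-tile / partial-tile version select.
static void tileAndSeparate(AffineForOp root) {
  SmallVector<AffineForOp, 4> band;
  getPerfectlyNestedLoops(band, root);
  if (band.size() != 2)
    return;
  SmallVector<AffineForOp, 4> tiledNest;
  if (failed(tilePerfectlyNested(band, /*tileSizes=*/{32, 32}, &tiledNest)))
    return;
  // The tiled nest is [inter-tile loops, intra-tile loops]; version only the
  // intra-tile (point) loops.
  MutableArrayRef<AffineForOp> intraTileLoops =
      MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size());
  (void)separateFullTiles(intraTileLoops);
}

The guard, when it can be created, carries the full-tile nest in its then region and the original (partial-tile) nest in its else region, as implemented above.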
2768 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2769  LogicalResult result(failure());
2770  SmallVector<AffineForOp> loops;
2771  getPerfectlyNestedLoops(loops, op);
2772  if (loops.size() <= 1)
2773  return success();
2774 
2775  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2776  // loops with bounds defined above some loop.
2777  // 1. For each loop, find above which parent loop its operands are
2778  // defined.
2779  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2780  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2781  operandsDefinedAbove[i] = i;
2782  for (unsigned j = 0; j < i; ++j) {
2783  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2784  operandsDefinedAbove[i] = j;
2785  break;
2786  }
2787  }
2788  }
2789 
2790  // 2. Identify bands of loops such that the operands of all of them are
2791  // defined above the first loop in the band. Traverse the nest bottom-up
2792  // so that modifications don't invalidate the inner loops.
2793  for (unsigned end = loops.size(); end > 0; --end) {
2794  unsigned start = 0;
2795  for (; start < end - 1; ++start) {
2796  auto maxPos =
2797  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2798  std::next(operandsDefinedAbove.begin(), end));
2799  if (maxPos > start)
2800  continue;
2801  assert(maxPos == start &&
2802  "expected loop bounds to be known at the start of the band");
2803  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2804  if (succeeded(coalesceLoops(band)))
2805  result = success();
2806  break;
2807  }
2808  // If a band was found and transformed, keep looking at the loops above
2809  // the outermost transformed loop.
2810  if (start != end - 1)
2811  end = start + 1;
2812  }
2813  return result;
2814 }
2815 
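For illustration, a minimal sketch (a hypothetical helper, assuming the func dialect headers are available) of applying the coalescing entry point above to every outermost affine.for in a function:

#include "mlir/Dialect/Func/IR/FuncOps.h"

// Hypothetical driver: attempt coalescing on each top-level affine.for root.
static void coalesceAllAffineBands(func::FuncOp func) {
  SmallVector<AffineForOp> roots;
  // Collect the roots first; coalescing erases and recreates loops, so avoid
  // mutating the IR while iterating over it.
  for (Block &block : func.getBody())
    for (auto forOp : block.getOps<AffineForOp>())
      roots.push_back(forOp);
  for (AffineForOp root : roots)
    (void)coalescePerfectlyNestedAffineLoops(root);
}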
2816 int64_t affine::numEnclosingInvariantLoops(OpOperand &operand) {
2817  int64_t count = 0;
2818  Operation *currentOp = operand.getOwner();
2819  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2820  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2821  break;
2822  currentOp = loopOp;
2823  count++;
2824  }
2825  return count;
2826 }
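As a usage illustration of numEnclosingInvariantLoops, a small hedged sketch (a hypothetical helper, not a library API) that derives how many enclosing loops an operation could conservatively be hoisted past, based on its least-invariant operand:

// Hypothetical helper: `op` can be hoisted past at most k enclosing loops only
// if every one of its operands is invariant in at least k of them. Returns 0
// conservatively when `op` has no operands.
static int64_t hoistableLoopDepth(Operation *op) {
  int64_t depth = -1; // -1 means no operand-imposed bound seen yet.
  for (OpOperand &operand : op->getOpOperands()) {
    int64_t k = numEnclosingInvariantLoops(operand);
    depth = (depth < 0 || k < depth) ? k : depth;
  }
  return depth < 0 ? 0 : depth;
}

Side effects and speculatability would also need to be checked before actually hoisting; the sketch only accounts for operand invariance.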
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:383
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:660
static InFlightDiagnostic emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1923
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
Definition: LoopUtils.cpp:423
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2257
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1529
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:363
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1758
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:175
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:408
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:559
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:464
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2528
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
Definition: LoopUtils.cpp:1941
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2659
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1814
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:688
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:960
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1072
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1302
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:417
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2578
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:45
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1509
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
Definition: LoopUtils.cpp:1850
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:103
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:959
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:394
unsigned getNumDims() const
Definition: AffineMap.cpp:390
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:403
unsigned getNumResults() const
Definition: AffineMap.cpp:398
unsigned getNumInputs() const
Definition: AffineMap.cpp:399
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:407
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Definition: Block.cpp:74
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:27
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition: Block.h:308
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:244
OpListType & getOperations()
Definition: Block.h:137
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:31
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:401
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:407
AffineMap getDimIdentityMap()
Definition: Builders.cpp:383
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:387
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:368
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:372
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:364
MLIRContext * getContext() const
Definition: Builders.h:56
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:774
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:316
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:207
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:562
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:431
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:252
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:412
This class represents an operand of an operation.
Definition: Value.h:257
Set of flags used to control the behavior of the various IR print methods (e.g.
A wrapper class that allows for printing an operation with a set of flags, useful to act as a "stream...
Definition: Operation.h:1111
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:43
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Definition: Operation.cpp:386
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:719
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition: Operation.h:230
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:289
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
Definition: Region.h:222
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:208
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:18
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:39
static WalkResult advance()
Definition: WalkResult.h:47
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:550
Value getOperand(unsigned idx)
Definition: AffineOps.h:556
operand_range getOperands()
Definition: AffineOps.h:565
static AffineDmaStartOp create(OpBuilder &builder, Location location, Value srcMemRef, AffineMap srcMap, ValueRange srcIndices, Value destMemRef, AffineMap dstMap, ValueRange destIndices, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements, Value stride=nullptr, Value elementsPerStride=nullptr)
Definition: AffineOps.cpp:1888
static AffineDmaWaitOp create(OpBuilder &builder, Location location, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements)
Definition: AffineOps.cpp:2084
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1605
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:871
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:119
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2299
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1061
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2962
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:995
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition: Utils.cpp:865
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2544
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2556
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:856
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:230
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1336
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2500
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1759
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:886
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1383
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1084
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:814
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1400
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1764
bool isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1361
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:851
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1463
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2816
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:2063
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1718
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1584
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1453
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:772
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1279
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, llvm::function_ref< bool(Operation *)> userFilterFn=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1306
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2768
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2714
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2786
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:561
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:766
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:35
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:776
void generateUnrolledLoop(Block *loopBodyBlock, Value iv, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues, IRMapping *clonedToSrcOpsMap=nullptr)
Generate unrolled copies of an scf loop's 'loopBodyBlock', with 'iterArgs' and 'yieldedValues' as the...
Definition: Utils.cpp:294
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
int64_t stride
Definition: LoopUtils.cpp:1805
int64_t numEltPerStride
Definition: LoopUtils.cpp:1806
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:159
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:199
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:489
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:1160
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:535
bool isWrite() const
Definition: Utils.h:537
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:1216
void setWrite(bool flag)
Definition: Utils.h:538
Value memref
Memref that this region corresponds to.
Definition: Utils.h:570
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:577
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.