1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
22 #include "mlir/IR/IRMapping.h"
23 #include "mlir/IR/IntegerSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/DebugLog.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include <optional>
31 
32 #define DEBUG_TYPE "loop-utils"
33 
34 using namespace mlir;
35 using namespace affine;
36 using namespace presburger;
37 using llvm::SmallMapVector;
38 
39 /// Computes the cleanup loop lower bound of the loop being unrolled with
41 /// the specified unroll factor; this bound will also be the upper bound of the
42 /// main part of the unrolled loop. Computes the bound as an AffineMap with its
41 /// part of the unrolled loop. Computes the bound as an AffineMap with its
42 /// operands or a null map when the trip count can't be expressed as an affine
43 /// expression.
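/// As an illustrative example (not from this file): for a loop
///   affine.for %i = 0 to 50
/// unrolled by a factor of 4, the trip count map yields 50, so the cleanup
/// lower bound evaluates to lb + (50 - 50 mod 4) = 0 + 48; the main unrolled
/// loop then covers [0, 48) and the cleanup loop covers [48, 50).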
44 static void
45 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
46  AffineMap &cleanupLbMap,
47  SmallVectorImpl<Value> &cleanupLbOperands) {
48  AffineMap tripCountMap;
49  SmallVector<Value, 4> tripCountOperands;
50  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
51  // Trip count can't be computed.
52  if (!tripCountMap) {
53  cleanupLbMap = AffineMap();
54  return;
55  }
56 
57  OpBuilder b(forOp);
58  auto lbMap = forOp.getLowerBoundMap();
59  auto lb = AffineApplyOp::create(b, forOp.getLoc(), lbMap,
60  forOp.getLowerBoundOperands());
61 
62  // For each upper bound expr, get the range.
63  // Eg: affine.for %i = lb to min (ub1, ub2),
64  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
65  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
66  // these affine.apply's make up the cleanup loop lower bound.
67  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
68  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
69  int64_t step = forOp.getStepAsInt();
70  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
71  auto tripCountExpr = tripCountMap.getResult(i);
72  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
73  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
74  tripCountMap.getNumSymbols(), bumpExprs[i]);
75  bumpValues[i] =
76  AffineApplyOp::create(b, forOp.getLoc(), bumpMap, tripCountOperands);
77  }
78 
79  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
80  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
81  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
82 
83  cleanupLbOperands.clear();
84  cleanupLbOperands.push_back(lb);
85  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
86  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
87  b.getContext());
88  // Simplify the cleanupLbMap + cleanupLbOperands.
89  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
90  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
91  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
92  // Remove any affine.apply's that became dead from the simplification above.
93  for (auto v : bumpValues)
94  if (v.use_empty())
95  v.getDefiningOp()->erase();
96 
97  if (lb.use_empty())
98  lb.erase();
99 }
100 
101 /// Helper to replace uses of loop carried values (iter_args) and loop
102 /// yield values while promoting single iteration affine.for ops.
103 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
104  // Replace uses of iter arguments with iter operands (initial values).
105  auto iterOperands = forOp.getInits();
106  auto iterArgs = forOp.getRegionIterArgs();
107  for (auto e : llvm::zip(iterOperands, iterArgs))
108  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
109 
110  // Replace uses of loop results with the values yielded by the loop.
111  auto outerResults = forOp.getResults();
112  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
113  for (auto e : llvm::zip(outerResults, innerResults))
114  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
115 }
116 
117 /// Promotes the loop body of a forOp to its containing block if the forOp
118 /// was known to have a single iteration.
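/// An illustrative sketch (assuming a loop with a constant lower bound):
///   %r = affine.for %i = 0 to 1 iter_args(%acc = %init) -> (f32) {
///     %v = arith.addf %acc, %acc : f32
///     affine.yield %v : f32
///   }
/// is promoted by replacing %i with the constant 0, %acc with %init, %r with
/// %v, and splicing the body into the enclosing block.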
119 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
120  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
121  if (!tripCount || *tripCount != 1)
122  return failure();
123 
124  // TODO: extend this for arbitrary affine bounds.
125  if (forOp.getLowerBoundMap().getNumResults() != 1)
126  return failure();
127 
128  // Replace all uses of the IV with its single-iteration value.
129  auto iv = forOp.getInductionVar();
130  auto *parentBlock = forOp->getBlock();
131  if (!iv.use_empty()) {
132  if (forOp.hasConstantLowerBound()) {
133  auto func = forOp->getParentOfType<FunctionOpInterface>();
134  OpBuilder builder(forOp->getContext());
135  if (func)
136  builder.setInsertionPointToStart(&func.getFunctionBody().front());
137  else
138  builder.setInsertionPoint(forOp);
139  auto constOp = arith::ConstantIndexOp::create(
140  builder, forOp.getLoc(), forOp.getConstantLowerBound());
141  iv.replaceAllUsesWith(constOp);
142  } else {
143  auto lbOperands = forOp.getLowerBoundOperands();
144  auto lbMap = forOp.getLowerBoundMap();
145  OpBuilder builder(forOp);
146  if (lbMap == builder.getDimIdentityMap()) {
147  // No need of generating an affine.apply.
148  iv.replaceAllUsesWith(lbOperands[0]);
149  } else {
150  auto affineApplyOp =
151  AffineApplyOp::create(builder, forOp.getLoc(), lbMap, lbOperands);
152  iv.replaceAllUsesWith(affineApplyOp);
153  }
154  }
155  }
156 
157  replaceIterArgsAndYieldResults(forOp);
158 
159  // Move the loop body operations, except for its terminator, to the loop's
160  // containing block.
161  forOp.getBody()->back().erase();
162  parentBlock->getOperations().splice(Block::iterator(forOp),
163  forOp.getBody()->getOperations());
164  forOp.erase();
165  return success();
166 }
167 
168 /// Generates an affine.for op with the specified lower and upper bounds
169 /// while generating the right IV remappings to realize shifts for operations in
170 /// its body. The operations that go into the loop body are specified in
171 /// opGroupQueue starting from the specified offset, and in that order. The
172 /// first element of the pair specifies the shift applied to that group of
173 /// operations; the shift is multiplied by the loop step before being applied.
174 /// Returns nullptr if the generated loop simplifies to a single iteration one.
175 static AffineForOp generateShiftedLoop(
176  AffineMap lbMap, AffineMap ubMap,
177  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
178  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
179  auto lbOperands = srcForOp.getLowerBoundOperands();
180  auto ubOperands = srcForOp.getUpperBoundOperands();
181 
182  assert(lbMap.getNumInputs() == lbOperands.size());
183  assert(ubMap.getNumInputs() == ubOperands.size());
184 
185  auto loopChunk =
186  AffineForOp::create(b, srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
187  ubMap, srcForOp.getStepAsInt());
188  auto loopChunkIV = loopChunk.getInductionVar();
189  auto srcIV = srcForOp.getInductionVar();
190 
191  IRMapping operandMap;
192 
193  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
194  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
195  uint64_t shift = it.first;
196  auto ops = it.second;
197  // All 'same shift' operations get added with their operands being
198  // remapped to results of cloned operations, and their IV uses remapped.
199  // Generate the remapping if the shift is not zero: remappedIV = newIV -
200  // shift.
201  if (!srcIV.use_empty() && shift != 0) {
202  auto ivRemap = AffineApplyOp::create(
203  bodyBuilder, srcForOp.getLoc(),
204  bodyBuilder.getSingleDimShiftAffineMap(
205  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
206  loopChunkIV);
207  operandMap.map(srcIV, ivRemap);
208  } else {
209  operandMap.map(srcIV, loopChunkIV);
210  }
211  for (auto *op : ops)
212  bodyBuilder.clone(*op, operandMap);
213  };
214  if (succeeded(promoteIfSingleIteration(loopChunk)))
215  return AffineForOp();
216  return loopChunk;
217 }
218 
219 // The skewing of operations with respect to one another can be used for
220 // example to allow overlap of asynchronous operations (such as DMA
221 // communication) with computation, or just relative shifting of operations
222 // for better register reuse, locality or parallelism. As such, the shifts are
223 // typically expected to be at most of the order of the number of operations.
224 // This method should not be used as a substitute for loop distribution/fission.
225 // This method uses an algorithm that runs in time linear in the number of
226 // operations in the body of the for loop (using the 'sweep line' paradigm).
227 // This method asserts preservation of SSA dominance. A check for that, as
228 // well as for memory-based dependence preservation, rests with the users of
229 // this method.
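// As an illustrative sketch (ops are made up), with shifts = [0, 1] on
//   affine.for %i = 0 to 8 {
//     "produce"(%i) : (index) -> ()
//     "consume"(%i) : (index) -> ()
//   }
// iteration i of "produce" is overlapped with iteration i - 1 of "consume":
// the result is a prologue loop containing only "produce", a steady-state
// loop containing both (with IV uses in the shifted group remapped by -1 via
// an affine.apply), and an epilogue loop containing only "consume".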
230 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
231  ArrayRef<uint64_t> shifts,
232  bool unrollPrologueEpilogue) {
233  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234  "too few/many shifts");
235  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
236  return success();
237 
238  // If the trip counts aren't constant, we would need versioning and
239  // conditional guards (or context information to prevent such versioning). The
240  // better way to pipeline for such loops is to first tile them and extract
241  // constant trip count "full tiles" before applying this.
242  auto mayBeConstTripCount = getConstantTripCount(forOp);
243  if (!mayBeConstTripCount) {
244  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
245  return success();
246  }
247  uint64_t tripCount = *mayBeConstTripCount;
248 
249  assert(isOpwiseShiftValid(forOp, shifts) &&
250  "shifts will lead to an invalid transformation\n");
251 
252  int64_t step = forOp.getStepAsInt();
253 
254  unsigned numChildOps = shifts.size();
255 
256  // Do a linear time (counting) sort for the shifts.
257  uint64_t maxShift = *llvm::max_element(shifts);
258  if (maxShift >= numChildOps) {
259  // Large shifts are not the typical use case.
260  forOp.emitWarning("not shifting because shifts are unrealistically large");
261  return success();
262  }
263 
264  // An array of operation groups sorted by shift amount; each group has all
265  // operations with the same shift in the order in which they appear in the
266  // body of the 'affine.for' op.
267  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
268  unsigned pos = 0;
269  for (auto &op : forOp.getBody()->without_terminator()) {
270  auto shift = shifts[pos++];
271  sortedOpGroups[shift].push_back(&op);
272  }
273 
274  // Unless the shifts have a specific pattern (which actually would be the
275  // common use case), prologue and epilogue are not meaningfully defined.
276  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
277  // loop generated as the prologue and the last as epilogue and unroll these
278  // fully.
279  AffineForOp prologue, epilogue;
280 
281  // Do a sweep over the sorted shifts while storing open groups in a
282  // vector, and generating loop portions as necessary during the sweep. A block
283  // of operations is paired with its shift.
284  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
285 
286  auto origLbMap = forOp.getLowerBoundMap();
287  uint64_t lbShift = 0;
288  OpBuilder b(forOp);
289  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
290  // If nothing is shifted by d, continue.
291  if (sortedOpGroups[d].empty())
292  continue;
293  if (!opGroupQueue.empty()) {
294  assert(d > 0 &&
295  "Queue expected to be empty when the first block is found");
296  // The interval for which the loop needs to be generated here is:
297  // [lbShift, min(lbShift + tripCount, d)) and the body of the
298  // loop needs to have all operations in opQueue in that order.
299  AffineForOp res;
300  if (lbShift + tripCount * step < d * step) {
301  res = generateShiftedLoop(
302  b.getShiftedAffineMap(origLbMap, lbShift),
303  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
304  opGroupQueue, /*offset=*/0, forOp, b);
305  // Entire loop for the queued op groups generated, empty it.
306  opGroupQueue.clear();
307  lbShift += tripCount * step;
308  } else {
309  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
310  b.getShiftedAffineMap(origLbMap, d),
311  opGroupQueue, /*offset=*/0, forOp, b);
312  lbShift = d * step;
313  }
314 
315  if (res) {
316  // Simplify/canonicalize the affine.for.
317  RewritePatternSet patterns(res.getContext());
318  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
319  bool erased;
320  (void)applyOpPatternsGreedily(
321  res.getOperation(), std::move(patterns),
322  GreedyRewriteConfig().setStrictness(
323  GreedyRewriteStrictness::ExistingAndNewOps),
324  /*changed=*/nullptr, &erased);
325  if (!erased && !prologue)
326  prologue = res;
327  if (!erased)
328  epilogue = res;
329  }
330  } else {
331  // Start of first interval.
332  lbShift = d * step;
333  }
334  // Augment the list of operations that get into the current open interval.
335  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
336  }
337 
338  // Those operations groups left in the queue now need to be processed (FIFO)
339  // and their loops completed.
340  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
341  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
342  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
343  b.getShiftedAffineMap(origLbMap, ubShift),
344  opGroupQueue, /*offset=*/i, forOp, b);
345  lbShift = ubShift;
346  if (!prologue)
347  prologue = epilogue;
348  }
349 
350  // Erase the original for op.
351  forOp.erase();
352 
353  if (unrollPrologueEpilogue && prologue)
354  (void)loopUnrollFull(prologue);
355  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
356  (void)loopUnrollFull(epilogue);
357 
358  return success();
359 }
360 
361 /// Checks whether a loop nest is hyper-rectangular or not.
362 static LogicalResult
363 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
364  FlatAffineValueConstraints cst;
365  SmallVector<Operation *, 8> ops(input.begin(), input.end());
366  // 0-d or 1-d is trivially hyper-rectangular.
367  if (input.size() <= 1)
368  return success();
369  if (failed(getIndexSet(ops, &cst))) {
370  LDBG() << "Index set computation failed!";
371  return failure();
372  }
373  if (!cst.isHyperRectangular(0, input.size())) {
374  LDBG() << "Non-hyperrectangular nests not supported for tiling!";
375  return failure();
376  }
377  return success();
378 }
379 
380 /// Check if the input nest is supported for tiling and whether tiling would be
381 /// legal or not.
382 template <typename t>
383 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384  ArrayRef<t> tileSizes) {
385  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386 
387  if (llvm::any_of(input,
388  [](AffineForOp op) { return op.getNumResults() > 0; })) {
389  LDBG() << "Cannot tile nest where a loop has yield values";
390  return failure();
391  }
392 
393  // Check if the supplied `for` ops are all successively nested.
394  if (!isPerfectlyNested(input)) {
395  LDBG() << "input loops not perfectly nested";
396  return failure();
397  }
398 
399  // TODO: handle non hyper-rectangular spaces.
400  if (failed(checkIfHyperRectangular(input)))
401  return failure();
402 
403  return success();
404 }
405 
406 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
407 /// location in destination's body, ignoring the terminator.
408 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
409  Block::iterator loc) {
410  auto &ops = src.getBody()->getOperations();
411  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
412  std::prev(ops.end()));
413 }
414 
415 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
416 /// body.
417 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
418  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
419 }
420 
421 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
422 /// the body of the original loop nest to the tiled loop nest.
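/// For example (an illustrative sketch): for a 2-d band (%i, %j), this builds
/// a 4-deep nest of tile-space loops (%ii, %jj) enclosing intra-tile (point)
/// loops (%i, %j), with all bounds still unset (0 to 0 placeholders); the
/// original loop body is then moved into the innermost point loop.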
423 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
424  AffineForOp rootAffineForOp, unsigned width,
425  MutableArrayRef<AffineForOp> tiledLoops) {
426  Location loc = rootAffineForOp.getLoc();
427 
428  // The outermost among the loops as we add more of them.
429  Operation *topLoop = rootAffineForOp.getOperation();
430  AffineForOp innermostPointLoop;
431 
432  // Add intra-tile (or point) loops.
433  for (unsigned i = 0; i < width; i++) {
434  OpBuilder b(topLoop);
435  // Loop bounds will be set later.
436  AffineForOp pointLoop = AffineForOp::create(b, loc, 0, 0);
437  pointLoop.getBody()->getOperations().splice(
438  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
439  topLoop);
440  tiledLoops[2 * width - 1 - i] = pointLoop;
441  topLoop = pointLoop.getOperation();
442  if (i == 0)
443  innermostPointLoop = pointLoop;
444  }
445 
446  // Add tile space loops;
447  for (unsigned i = width; i < 2 * width; i++) {
448  OpBuilder b(topLoop);
449  // Loop bounds will be set later.
450  AffineForOp tileSpaceLoop = AffineForOp::create(b, loc, 0, 0);
451  tileSpaceLoop.getBody()->getOperations().splice(
452  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
453  topLoop);
454  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
455  topLoop = tileSpaceLoop.getOperation();
456  }
457 
458  // Move the loop body of the original nest to the new one.
459  moveLoopBody(origLoops.back(), innermostPointLoop);
460 }
461 
462 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
463 // TODO: Handle non-constant lower bounds.
464 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
465  AffineForOp newInterTileLoop,
466  AffineForOp newIntraTileLoop,
467  Value tileSize) {
468  // The lower bound for the intra-tile loop is represented by an affine map
469  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470  // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471  // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472  // of the corresponding inter-tile loop, %t0 is the corresponding tiling
473  // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
474  // corresponding inter-tile loop.
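  // For instance (illustrative): with %origlb = 0 and a unit step, these maps
  // reduce to lb: (%i, %t0) -> (%i * %t0) and ub: (%i, %t0) -> (%i * %t0 + %t0),
  // i.e. the intra-tile loop sweeps one parametric tile starting at %i * %t0.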
475 
476  assert(origLoop.hasConstantLowerBound() &&
477  "expected input loops to have constant lower bound.");
478 
479  // Get lower bound of original loop as an affine expression.
480  AffineExpr origLowerBoundExpr;
481  origLowerBoundExpr =
482  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
483 
484  // Add dim operands from original lower/upper bound.
485  SmallVector<Value, 4> lbOperands, ubOperands;
486  AffineBound lb = origLoop.getLowerBound();
487  AffineBound ub = origLoop.getUpperBound();
488  lbOperands.reserve(lb.getNumOperands() + 2);
489  ubOperands.reserve(ub.getNumOperands() + 2);
490  AffineMap origLbMap = lb.getMap();
491  AffineMap origUbMap = ub.getMap();
492  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
493  lbOperands.push_back(lb.getOperand(j));
494  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
495  ubOperands.push_back(ub.getOperand(j));
496 
497  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
498  // IV.
499  lbOperands.push_back(newInterTileLoop.getInductionVar());
500  ubOperands.push_back(newInterTileLoop.getInductionVar());
501 
502  // Get loop IV as an affine expression for lower/upper bound. Size of
503  // lb/ubOperands is guaranteed to be at least one.
504  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
506 
507  // Add symbol operands from original lower/upper bound.
508  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
509  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
510  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
511  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
512 
513  // Add a new symbol operand which is the tile size for this loop.
514  lbOperands.push_back(tileSize);
515  ubOperands.push_back(tileSize);
516 
517  SmallVector<AffineExpr, 4> lbBoundExprs;
518  SmallVector<AffineExpr, 4> ubBoundExprs;
519  lbBoundExprs.reserve(origLbMap.getNumResults());
520  ubBoundExprs.reserve(origUbMap.getNumResults());
521 
522  // Get tiling parameter as an affine expression for lb/ub.
523  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
524  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
525 
526  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
527  lbBoundExprs.push_back(
528  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
529  origLowerBoundExpr);
530 
531  // Get the origLoopStep as an affine expression.
532  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
533 
534  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
535  // (tilingParameter * origLoopStep) + origlb.
536  ubBoundExprs.push_back(
537  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
538  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
539 
540  ubBoundExprs.append(origUbMap.getResults().begin(),
541  origUbMap.getResults().end());
542 
543  AffineMap lbMap =
544  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
545  lbBoundExprs, b.getContext());
546  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
547 
548  AffineMap ubMap =
549  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
550  ubBoundExprs, b.getContext());
551  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
552 
553  // Original loop step must be preserved.
554  newIntraTileLoop.setStep(origLoop.getStepAsInt());
555 }
556 
557 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
558 // TODO: Handle non-constant lower bounds.
559 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
560  AffineForOp newLoop, Value tileSize) {
561  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
562 
563  // The lower bounds for inter-tile loops are same as the corresponding lower
564  // bounds of original loops.
565  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
566 
567  // The new upper bound map for inter-tile loops, assuming constant lower
568  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
569  // originalLowerBound), tilingParameter), where the tiling parameter is the
570  // respective tile size for that loop. E.g., if the original ubMap was
571  // ()->(1024) and lb is the constant lower bound, the new map will be
572  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
573  // Therefore a new symbol operand is inserted in the map and the result
574  // expression is overwritten.
575 
576  assert(origLoop.hasConstantLowerBound() &&
577  "expected input loops to have constant lower bound.");
578 
579  // Get lower bound of original loop as an affine expression.
580  AffineExpr origLowerBoundExpr;
581  origLowerBoundExpr =
582  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
583 
584  // Add dim operands from original upper bound.
585  SmallVector<Value, 4> ubOperands;
586  AffineBound ub = origLoop.getUpperBound();
587  ubOperands.reserve(ub.getNumOperands() + 1);
588  AffineMap origUbMap = ub.getMap();
589  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
590  ubOperands.push_back(ub.getOperand(j));
591 
592  // Add symbol operands from original upper bound.
593  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
594  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
595 
596  // Add a new symbol operand which is the tile size for this loop.
597  ubOperands.push_back(tileSize);
598 
599  // Get tiling parameter as an affine expression.
600  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
601 
602  SmallVector<AffineExpr, 4> boundExprs;
603  boundExprs.reserve(origUbMap.getNumResults());
604  int64_t origUpperBound;
605  AffineExpr origUpperBoundExpr;
606 
607  // If upper bound for the original loop is constant, then the constant can
608  // be obtained as an affine expression straight away.
609  if (origLoop.hasConstantUpperBound()) {
610  origUpperBound = origLoop.getConstantUpperBound();
611 
612  // Get original constant upper bound as an affine expression.
613  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
614 
615  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
616  // originalLowerBound), tilingParameter).
617  boundExprs.push_back(
618  origLowerBoundExpr +
619  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
620  } else {
621  // If the upper bound for the original loop is not constant, then two cases
622  // are possible, although their handling is the same. 1) The ubMap has only
623  // one result expression. E.g.
624  // affine.for %i = 5 to %ub
625  //
626  // A symbol operand is added which represents the tiling parameter. The
627  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
628  // where 's0' is the original upper bound and 's1' is the tiling
629  // parameter. 2.) When ubMap has more than one result expression. For e.g.
630  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
631  // affine.for %i = 5 to min #map0()[%s0, %s1]
632  //
633  // A symbol operand is added which represents the tiling parameter. The
634  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635  // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
636 
637  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
638  // originalLowerBound), tilingParameter).
639  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
640  boundExprs.push_back(
641  origLowerBoundExpr +
642  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
643  }
644 
645  AffineMap ubMap =
646  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
647  boundExprs, b.getContext());
648  newLoop.setUpperBound(ubOperands, ubMap);
649 
650  // Original loop step must be preserved.
651  newLoop.setStep(origLoop.getStepAsInt());
652 }
653 
654 /// Constructs and sets new loop bounds after tiling for the case of
655 /// hyper-rectangular index sets, where the bounds of one dimension do not
656 /// depend on other dimensions and tiling parameters are captured from SSA
657 /// values. Bounds of each dimension can thus be treated independently,
658 /// and deriving the new bounds is much simpler and faster than for the case of
659 /// tiling arbitrary polyhedral shapes.
660 static void constructParametricallyTiledIndexSetHyperRect(
661  MutableArrayRef<AffineForOp> origLoops,
662  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
663  assert(!origLoops.empty() && "expected at least one loop in band");
664  assert(origLoops.size() == tileSizes.size() &&
665  "expected tiling parameter for each loop in band.");
666 
667  OpBuilder b(origLoops[0].getOperation());
668  unsigned width = origLoops.size();
669 
670  // Set bounds for tile space loops.
671  for (unsigned i = 0; i < width; ++i) {
672  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
673  }
674 
675  // Set bounds for intra-tile loops.
676  for (unsigned i = 0; i < width; ++i) {
677  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
678  newLoops[i + width], tileSizes[i]);
679  }
680 }
681 
682 /// Constructs and sets new loop bounds after tiling for the case of
683 /// hyper-rectangular index sets, where the bounds of one dimension do not
684 /// depend on other dimensions. Bounds of each dimension can thus be treated
685 /// independently, and deriving the new bounds is much simpler and faster
686 /// than for the case of tiling arbitrary polyhedral shapes.
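/// For example (an illustrative sketch): tiling
///   affine.for %i = 0 to 256
/// with a tile size of 32 produces a tile-space loop
///   affine.for %ii = 0 to 256 step 32
/// and an intra-tile loop
///   affine.for %i = %ii to %ii + 32
/// (no min expression is needed since 32 divides the constant trip count 256).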
687 static void
688 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
689  MutableArrayRef<AffineForOp> newLoops,
690  ArrayRef<unsigned> tileSizes) {
691  assert(!origLoops.empty());
692  assert(origLoops.size() == tileSizes.size());
693 
694  OpBuilder b(origLoops[0].getOperation());
695  unsigned width = origLoops.size();
696 
697  // Bounds for tile space loops.
698  for (unsigned i = 0; i < width; i++) {
699  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
700  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
701  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
702  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
703  // If the step size of original loop is x and tileSize is y then after
704  // tiling the tile space loops' step size becomes x*y.
705  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
706  }
707  // Bounds for intra-tile loops.
708  for (unsigned i = 0; i < width; i++) {
709  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
710  std::optional<uint64_t> mayBeConstantCount =
711  getConstantTripCount(origLoops[i]);
712  // The lower bound is just the tile-space loop.
713  AffineMap lbMap = b.getDimIdentityMap();
714  newLoops[width + i].setLowerBound(
715  /*operands=*/newLoops[i].getInductionVar(), lbMap);
716  // The step size of intra-tile loops is just the original loops' step size.
717  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
718 
719  // Set the upper bound.
720  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
721  // Trip count is less than the tile size: upper bound is lower bound +
722  // trip count * stepSize.
723  AffineMap ubMap = b.getSingleDimShiftAffineMap(
724  *mayBeConstantCount * origLoops[i].getStepAsInt());
725  newLoops[width + i].setUpperBound(
726  /*operands=*/newLoops[i].getInductionVar(), ubMap);
727  } else if (largestDiv % tileSizes[i] != 0) {
728  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
729  // Construct the upper bound map; the operands are the original operands
730  // with 'i' (tile-space loop) appended to it. The new upper bound map is
731  // the original one with an additional expression i + tileSize * stepSize
732  // appended.
733 
734  // Add dim operands from original upper bound.
735  SmallVector<Value, 4> ubOperands;
736  AffineBound ub = origLoops[i].getUpperBound();
737  ubOperands.reserve(ub.getNumOperands() + 1);
738  AffineMap origUbMap = ub.getMap();
739  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
740  ubOperands.push_back(ub.getOperand(j));
741 
742  // Add dim operand for new loop upper bound.
743  ubOperands.push_back(newLoops[i].getInductionVar());
744 
745  // Add symbol operands from original upper bound.
746  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
747  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
748 
749  SmallVector<AffineExpr, 4> boundExprs;
750  boundExprs.reserve(1 + origUbMap.getNumResults());
751  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
752  // The new upper bound map is the original one with an additional
753  // expression i + tileSize * stepSize (of original loop) appended.
754  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
755  boundExprs.append(origUbMap.getResults().begin(),
756  origUbMap.getResults().end());
757  AffineMap ubMap =
758  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
759  boundExprs, b.getContext());
760  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
761  } else {
762  // No need of the min expression.
763  AffineExpr dim = b.getAffineDimExpr(0);
764  AffineMap ubMap = AffineMap::get(
765  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
766  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
767  }
768  }
769 }
770 
771 LogicalResult
772 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
773  ArrayRef<unsigned> tileSizes,
774  SmallVectorImpl<AffineForOp> *tiledNest) {
775  if (input.empty())
776  return success();
777 
778  if (failed(performPreTilingChecks(input, tileSizes)))
779  return failure();
780 
781  MutableArrayRef<AffineForOp> origLoops = input;
782  AffineForOp rootAffineForOp = origLoops[0];
783 
784  // Note that width is at least one since the band isn't empty.
785  unsigned width = input.size();
786  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
787 
788  // Construct a tiled loop nest without setting their bounds. Bounds are
789  // set later.
790  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
791 
792  SmallVector<Value, 8> origLoopIVs;
793  extractForInductionVars(input, &origLoopIVs);
794 
795  // Set loop bounds for the tiled loop nest.
796  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
797 
798  // Replace original IVs with intra-tile loop IVs.
799  for (unsigned i = 0; i < width; i++)
800  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
801 
802  // Erase the old loop nest.
803  rootAffineForOp.erase();
804 
805  if (tiledNest)
806  *tiledNest = std::move(tiledLoops);
807 
808  return success();
809 }
810 
811 /// Tiles the specified band of perfectly nested loops creating tile-space
812 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
813 /// is a contiguous set of loops.
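/// For example (an illustrative sketch): tiling
///   affine.for %i = 0 to 256
/// with an SSA tile size %ts produces an inter-tile loop
///   affine.for %ii = 0 to ceildiv(256, %ts)
/// and an intra-tile loop running from %ii * %ts to min(%ii * %ts + %ts, 256).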
814 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
815  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
816  SmallVectorImpl<AffineForOp> *tiledNest) {
817  if (input.empty())
818  return success();
819 
820  if (failed(performPreTilingChecks(input, tileSizes)))
821  return failure();
822 
823  MutableArrayRef<AffineForOp> origLoops = input;
824  AffineForOp rootAffineForOp = origLoops[0];
825  unsigned width = input.size();
826  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
827 
828  // Construct a tiled loop nest without setting their bounds. Bounds are
829  // set later.
830  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
831 
832  SmallVector<Value, 8> origLoopIVs;
833  extractForInductionVars(input, &origLoopIVs);
834 
835  // Set loop bounds for the tiled loop nest.
836  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
837  tileSizes);
838 
839  // Replace original IVs with intra-tile loop IVs.
840  for (unsigned i = 0; i < width; i++)
841  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
842 
843  // Erase the old loop nest.
844  rootAffineForOp.erase();
845 
846  if (tiledNest)
847  *tiledNest = std::move(tiledLoops);
848 
849  return success();
850 }
851 
852 /// Get the perfectly nested sequence of loops starting at the root of the
853 /// loop nest. A loop is perfectly nested iff the first op in the loop's body
854 /// is another AffineForOp and the second op in the loop's body is a
855 /// terminator.
856 void mlir::affine::getPerfectlyNestedLoops(
857  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
858  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
859  nestedLoops.push_back(root);
860  Block &body = root.getRegion().front();
861  if (body.begin() != std::prev(body.end(), 2))
862  return;
863 
864  root = dyn_cast<AffineForOp>(&body.front());
865  if (!root)
866  return;
867  }
868 }
869 
870 /// Unrolls this loop completely.
871 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
872  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
873  if (mayBeConstantTripCount.has_value()) {
874  uint64_t tripCount = *mayBeConstantTripCount;
875  if (tripCount == 0)
876  return success();
877  if (tripCount == 1)
878  return promoteIfSingleIteration(forOp);
879  return loopUnrollByFactor(forOp, tripCount);
880  }
881  return failure();
882 }
883 
884 /// Unrolls this loop by the specified factor or by the trip count (if
885 /// constant), whichever is lower.
886 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
887  uint64_t unrollFactor) {
888  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
889  if (mayBeConstantTripCount.has_value() &&
890  *mayBeConstantTripCount < unrollFactor)
891  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
892  return loopUnrollByFactor(forOp, unrollFactor);
893 }
894 
895 /// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated
896 /// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each
897 /// unrolled body. If specified, annotates the Ops in each unrolled iteration
898 /// using annotateFn.
899 static void generateUnrolledLoop(
900  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
901  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
902  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
903  ValueRange iterArgs, ValueRange yieldedValues) {
904  // Builder to insert unrolled bodies just before the terminator of the body of
905  // 'forOp'.
906  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
907 
908  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
909  if (!annotateFn)
910  annotateFn = defaultAnnotateFn;
911 
912  // Keep a pointer to the last non-terminator operation in the original block
913  // so that we know what to clone (since we are doing this in-place).
914  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
915 
916  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
917  SmallVector<Value, 4> lastYielded(yieldedValues);
918 
919  for (unsigned i = 1; i < unrollFactor; i++) {
920  IRMapping operandMap;
921 
922  // Prepare operand map.
923  operandMap.map(iterArgs, lastYielded);
924 
925  // If the induction variable is used, create a remapping to the value for
926  // this unrolled instance.
927  if (!forOpIV.use_empty()) {
928  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
929  operandMap.map(forOpIV, ivUnroll);
930  }
931 
932  // Clone the original body of 'forOp'.
933  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
934  Operation *clonedOp = builder.clone(*it, operandMap);
935  annotateFn(i, clonedOp, builder);
936  }
937 
938  // Update yielded values. If the yielded value is defined outside the
939  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
940  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
941  // value with the clone corresponding to the yielded value.
942  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
943  Operation *defOp = yieldedValues[i].getDefiningOp();
944  if (defOp && defOp->getBlock() == loopBodyBlock)
945  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
946  }
947  }
948 
949  // Make sure we annotate the Ops in the original body. We do this last so that
950  // any annotations are not copied into the cloned Ops above.
951  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
952  annotateFn(0, &*it, builder);
953 
954  // Update operands of the yield statement.
955  loopBodyBlock->getTerminator()->setOperands(lastYielded);
956 }
957 
958 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
959 /// count is not a multiple of `unrollFactor`.
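/// For example (illustrative): unrolling
///   affine.for %i = 0 to 50
/// by 4 clones the loop as a cleanup loop over [48, 50) and shrinks the
/// original loop's upper bound to 48; the original loop then becomes the
/// unrolled main loop.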
960 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
961  uint64_t unrollFactor) {
962  // Insert the cleanup loop right after 'forOp'.
963  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
964  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
965 
966  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
967  // and produce results for the original users of `forOp` results.
968  auto results = forOp.getResults();
969  auto cleanupResults = cleanupForOp.getResults();
970  auto cleanupIterOperands = cleanupForOp.getInits();
971 
972  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
973  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
974  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
975  }
976 
977  AffineMap cleanupMap;
978  SmallVector<Value, 4> cleanupOperands;
979  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
980  if (!cleanupMap)
981  return failure();
982 
983  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
984  // Promote the loop body up if this has turned into a single iteration loop.
985  (void)promoteIfSingleIteration(cleanupForOp);
986 
987  // Adjust upper bound of the original loop; this is the same as the lower
988  // bound of the cleanup loop.
989  forOp.setUpperBound(cleanupOperands, cleanupMap);
990  return success();
991 }
992 
993 /// Unrolls this loop by the specified factor. Returns success if the loop
994 /// is successfully unrolled.
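/// For example (an illustrative sketch): unrolling
///   affine.for %i = 0 to 48 {
///     "use"(%i) : (index) -> ()
///   }
/// by a factor of 4 scales the step to 4 and clones the body three more times
/// with the IV remapped to %i + 1, %i + 2 and %i + 3 via affine.apply ops.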
995 LogicalResult mlir::affine::loopUnrollByFactor(
996  AffineForOp forOp, uint64_t unrollFactor,
997  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
998  bool cleanUpUnroll) {
999  assert(unrollFactor > 0 && "unroll factor should be positive");
1000 
1001  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1002  if (unrollFactor == 1) {
1003  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1004  return failure();
1005  return success();
1006  }
1007 
1008  // Nothing in the loop body other than the terminator.
1009  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1010  return success();
1011 
1012  // If the trip count is lower than the unroll factor, no unrolled body.
1013  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1014  if (cleanUpUnroll) {
1015  // Unroll the cleanup loop if cleanUpUnroll is specified.
1016  return loopUnrollFull(forOp);
1017  }
1018 
1019  return failure();
1020  }
1021 
1022  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1023  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1024  // Loops where the lower bound is a max expression or the upper bound is
1025  // a min expression and the trip count isn't a multiple of the unroll factor
1026  // can't be unrolled since the lower bound of the cleanup loop in such cases
1027  // cannot be expressed as an affine function or a max over affine functions.
1028  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1029  forOp.getUpperBoundMap().getNumResults() != 1)
1030  return failure();
1031  if (cleanUpUnroll)
1032  // Force unroll including cleanup loop
1033  return loopUnrollFull(forOp);
1034  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1035  assert(false && "cleanup loop lower bound map for single result lower "
1036  "and upper bound maps can always be determined");
1037  }
1038 
1039  ValueRange iterArgs(forOp.getRegionIterArgs());
1040  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1041 
1042  // Scale the step of loop being unrolled by unroll factor.
1043  int64_t step = forOp.getStepAsInt();
1044  forOp.setStep(step * unrollFactor);
1045  generateUnrolledLoop(
1046  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1047  [&](unsigned i, Value iv, OpBuilder b) {
1048  // iv' = iv + i * step
1049  auto d0 = b.getAffineDimExpr(0);
1050  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1051  return AffineApplyOp::create(b, forOp.getLoc(), bumpMap, iv);
1052  },
1053  /*annotateFn=*/annotateFn,
1054  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1055 
1056  // Promote the loop body up if this has turned into a single iteration loop.
1057  (void)promoteIfSingleIteration(forOp);
1058  return success();
1059 }
1060 
1061 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1062  uint64_t unrollJamFactor) {
1063  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1064  if (mayBeConstantTripCount.has_value() &&
1065  *mayBeConstantTripCount < unrollJamFactor)
1066  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1067  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1068 }
1069 
1070 /// Check if all control operands of all loops are defined outside of `forOp`
1071 /// and return false if not.
1072 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1073  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1074  for (auto controlOperand : aForOp.getControlOperands()) {
1075  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1076  return WalkResult::interrupt();
1077  }
1078  return WalkResult::advance();
1079  });
1080  return !walkResult.wasInterrupted();
1081 }
1082 
1083 /// Unrolls and jams this loop by the specified factor.
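/// For example (an illustrative sketch): unroll-and-jam by 2 of
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 {
///       "S"(%i, %j) : (index, index) -> ()
///     }
///   }
/// scales the outer step to 2 and clones the jammed sub-block (the inner
/// loop) so that outer iterations %i and %i + 1 execute together:
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to 8 { "S"(%i, %j) : (index, index) -> () }
///     affine.for %j = 0 to 8 { "S"(%i + 1, %j) : (index, index) -> () }
///   }
/// where %i + 1 is materialized by an affine.apply.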
1084 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1085  uint64_t unrollJamFactor) {
1086  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1087 
1088  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1089  if (unrollJamFactor == 1) {
1090  if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
1091  return failure();
1092  return success();
1093  }
1094 
1095  // Nothing in the loop body other than the terminator.
1096  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1097  return success();
1098 
1099  // If the trip count is lower than the unroll jam factor, no unroll jam.
1100  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1101  LDBG() << "[failed] trip count < unroll-jam factor";
1102  return failure();
1103  }
1104 
1105  // If any control operand of any inner loop of `forOp` is defined within
1106  // `forOp`, no unroll jam.
1107  if (!areInnerBoundsInvariant(forOp))
1108  return failure();
1109 
1110  // Gather all sub-blocks to jam upon the loop being unrolled.
1111  JamBlockGatherer<AffineForOp> jbg;
1112  jbg.walk(forOp);
1113  auto &subBlocks = jbg.subBlocks;
1114 
1115  // Collect loops with iter_args.
1116  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1117  forOp.walk([&](AffineForOp aForOp) {
1118  if (aForOp.getNumIterOperands() > 0)
1119  loopsWithIterArgs.push_back(aForOp);
1120  });
1121 
1122  // Get supported reductions to be used for creating reduction ops at the end.
1123  SmallVector<LoopReduction> reductions;
1124  if (forOp.getNumIterOperands() > 0)
1125  getSupportedReductions(forOp, reductions);
1126 
1127  // Generate the cleanup loop if trip count isn't a multiple of
1128  // unrollJamFactor.
1129  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1130  // Loops where the lower bound is a max expression or the upper bound is
1131  // a min expression and the trip count isn't a multiple of the unroll factor
1132  // can't be unrolled since the lower bound of the cleanup loop in such cases
1133  // cannot be expressed as an affine function or a max over affine functions.
1134  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1135  forOp.getUpperBoundMap().getNumResults() != 1)
1136  return failure();
1137  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1138  assert(false && "cleanup loop lower bound map for single result lower "
1139  "and upper bound maps can always be determined");
1140  }
1141 
1142  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1143  // iteration. There are (`unrollJamFactor` - 1) iterations.
1144  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1145 
1146  // For any loop with iter_args, replace it with a new loop that has
1147  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1148  // operands.
1149  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1150  IRRewriter rewriter(forOp.getContext());
1151  for (AffineForOp oldForOp : loopsWithIterArgs) {
1152  SmallVector<Value> dupIterOperands, dupYieldOperands;
1153  ValueRange oldIterOperands = oldForOp.getInits();
1154  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1155  ValueRange oldYieldOperands =
1156  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1157  // Get additional iterOperands, iterArgs, and yield operands. We will
1158  // fix iterOperands and yield operands after cloning of sub-blocks.
1159  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1160  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1161  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1162  }
1163  // Create a new loop with additional iterOperands, iter_args and yield
1164  // operands. This new loop will take the loop body of the original loop.
1165  bool forOpReplaced = oldForOp == forOp;
1166  AffineForOp newForOp =
1167  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1168  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1169  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1170  return dupYieldOperands;
1171  }));
1172  newLoopsWithIterArgs.push_back(newForOp);
1173  // `forOp` has been replaced with a new loop.
1174  if (forOpReplaced)
1175  forOp = newForOp;
1176  // Update `operandMaps` for `newForOp` iterArgs and results.
1177  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1178  unsigned oldNumIterArgs = oldIterArgs.size();
1179  ValueRange newResults = newForOp.getResults();
1180  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1181  assert(oldNumIterArgs == oldNumResults &&
1182  "oldNumIterArgs must be the same as oldNumResults");
1183  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1184  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1185  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1186  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1187  // to those in the `i`th new set.
1188  operandMaps[i - 1].map(newIterArgs[j],
1189  newIterArgs[i * oldNumIterArgs + j]);
1190  operandMaps[i - 1].map(newResults[j],
1191  newResults[i * oldNumResults + j]);
1192  }
1193  }
1194  }
1195 
1196  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1197  int64_t step = forOp.getStepAsInt();
1198  forOp.setStep(step * unrollJamFactor);
1199 
1200  auto forOpIV = forOp.getInductionVar();
1201  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1202  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1203  for (auto &subBlock : subBlocks) {
1204  // Builder to insert unroll-jammed bodies. Insert right at the end of
1205  // sub-block.
1206  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1207 
1208  // If the induction variable is used, create a remapping to the value for
1209  // this unrolled instance.
1210  if (!forOpIV.use_empty()) {
1211  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1212  auto d0 = builder.getAffineDimExpr(0);
1213  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1214  auto ivUnroll =
1215  AffineApplyOp::create(builder, forOp.getLoc(), bumpMap, forOpIV);
1216  operandMaps[i - 1].map(forOpIV, ivUnroll);
1217  }
1218  // Clone the sub-block being unroll-jammed.
1219  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1220  builder.clone(*it, operandMaps[i - 1]);
1221  }
1222  // Fix iterOperands and yield op operands of newly created loops.
1223  for (auto newForOp : newLoopsWithIterArgs) {
1224  unsigned oldNumIterOperands =
1225  newForOp.getNumIterOperands() / unrollJamFactor;
1226  unsigned numControlOperands = newForOp.getNumControlOperands();
1227  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1228  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1229  assert(oldNumIterOperands == oldNumYieldOperands &&
1230  "oldNumIterOperands must be the same as oldNumYieldOperands");
1231  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1232  // The `i`th duplication of an old iterOperand or yield op operand
1233  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1234  // if such mapped value exists.
1235  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1236  operandMaps[i - 1].lookupOrDefault(
1237  newForOp.getOperand(numControlOperands + j)));
1238  yieldOp.setOperand(
1239  i * oldNumYieldOperands + j,
1240  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1241  }
1242  }
1243  }
1244  if (forOp.getNumResults() > 0) {
1245  // Create reduction ops to combine every `unrollJamFactor` related results
1246  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1247  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1248  // %1.
1249  rewriter.setInsertionPointAfter(forOp);
1250  auto loc = forOp.getLoc();
1251  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1252  for (LoopReduction &reduction : reductions) {
1253  unsigned pos = reduction.iterArgPosition;
1254  Value lhs = forOp.getResult(pos);
1255  Value rhs;
1256  SmallPtrSet<Operation *, 4> newOps;
1257  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1258  rhs = forOp.getResult(i * oldNumResults + pos);
1259  // Create ops based on reduction type.
1260  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1261  if (!lhs)
1262  return failure();
1263  Operation *op = lhs.getDefiningOp();
1264  assert(op && "Reduction op should have been created");
1265  newOps.insert(op);
1266  }
1267  // Replace all uses except those in newly created reduction ops.
1268  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1269  }
1270  }
1271 
1272  // Promote the loop body up if this has turned into a single iteration loop.
1273  (void)promoteIfSingleIteration(forOp);
1274  return success();
1275 }
1276 
1277 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1278 /// nested within 'forOpA' as the only non-terminator operation in its block.
1279 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1280  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1281  auto &forOpABody = forOpA.getBody()->getOperations();
1282  auto &forOpBBody = forOpB.getBody()->getOperations();
1283 
1284  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1285  // before forOpA (in ForOpA's parent's block) this should leave 'forOpA's
1286  // body containing only the terminator.
1287  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1288  forOpABody, forOpABody.begin(),
1289  std::prev(forOpABody.end()));
1290  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1291  // body (this leaves forOpB's body containing only the terminator).
1292  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1293  std::prev(forOpBBody.end()));
1294  // 3) Splice forOpA into the beginning of forOpB's body.
1295  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1296  Block::iterator(forOpA));
1297 }
1298 
1299 // Checks each dependence component against the permutation to see if the
1300 // desired loop interchange would violate dependences by making the
1301 // dependence component lexicographically negative.
1302 static bool checkLoopInterchangeDependences(
1303  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1304  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1305  // Invert permutation map.
1306  unsigned maxLoopDepth = loops.size();
1307  SmallVector<unsigned, 4> loopPermMapInv;
1308  loopPermMapInv.resize(maxLoopDepth);
1309  for (unsigned i = 0; i < maxLoopDepth; ++i)
1310  loopPermMapInv[loopPermMap[i]] = i;
1311 
1312  // Check each dependence component against the permutation to see if the
1313  // desired loop interchange permutation would make the dependence vectors
1314  // lexicographically negative.
1315  // Example 1: [-1, 1][0, 0]
1316  // Example 2: [0, 0][-1, 1]
1317  for (const auto &depComps : depCompsVec) {
1318  assert(depComps.size() >= maxLoopDepth);
1319  // Check if the first non-zero dependence component is positive.
1320  // This iterates through loops in the desired order.
1321  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1322  unsigned permIndex = loopPermMapInv[j];
1323  assert(depComps[permIndex].lb);
1324  int64_t depCompLb = *depComps[permIndex].lb;
1325  if (depCompLb > 0)
1326  break;
1327  if (depCompLb < 0)
1328  return false;
1329  }
1330  }
1331  return true;
1332 }
1333 
1334 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1335 /// nested sequence of loops in 'loops' would violate dependences.
1336 bool mlir::affine::isValidLoopInterchangePermutation(
1337  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1338  assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1339  unsigned maxLoopDepth = loops.size();
1340  if (maxLoopDepth == 1)
1341  return true;
1342 
1343  // We cannot guarantee the validity of the interchange if the loops have
1344  // iter_args, since the dependence analysis does not take them into account.
1345  // Conservatively return false in such cases.
1346  if (llvm::any_of(loops, [](AffineForOp loop) {
1347  return loop.getNumIterOperands() > 0;
1348  }))
1349  return false;
1350 
1351  // Gather dependence components for dependences between all ops in loop nest
1352  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1353  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1354  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1355  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1356 }
1357 
1358 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1359 /// in it from outermost to innermost.
1360 bool LLVM_ATTRIBUTE_UNUSED
1361 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1362  assert(!loops.empty() && "no loops provided");
1363 
1364  // We already know that the block can't be empty.
1365  auto hasTwoElements = [](Block *block) {
1366  auto secondOpIt = std::next(block->begin());
1367  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1368  };
1369 
1370  auto enclosingLoop = loops.front();
1371  for (auto loop : loops.drop_front()) {
1372  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1373  // parentForOp's body should be just this loop and the terminator.
1374  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1375  return false;
1376  enclosingLoop = loop;
1377  }
1378  return true;
1379 }
1380 
1381 // input[i] should move from position i -> permMap[i]. Returns the position in
1382 // `input` that becomes the new outermost loop.
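// For example (an illustrative sketch): with input = {%i, %j, %k} and
// permMap = {2, 0, 1}, %j becomes the outermost loop, %k the middle one and
// %i the innermost one; the return value is 1 since input[1] is the new
// outermost loop.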
1383 unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1384  ArrayRef<unsigned> permMap) {
1385  assert(input.size() == permMap.size() && "invalid permutation map size");
1386  // Check whether the permutation spec is valid. This is a small vector - we'll
1387  // just sort and check if it's iota.
1388  SmallVector<unsigned, 4> checkPermMap(permMap);
1389  llvm::sort(checkPermMap);
1390  if (llvm::any_of(llvm::enumerate(checkPermMap),
1391  [](const auto &en) { return en.value() != en.index(); }))
1392  assert(false && "invalid permutation map");
1393 
1394  // Nothing to do.
1395  if (input.size() < 2)
1396  return 0;
1397 
1398  assert(isPerfectlyNested(input) && "input not perfectly nested");
1399 
1400  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1401  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1402  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1403  for (unsigned i = 0, e = input.size(); i < e; ++i)
1404  invPermMap.push_back({permMap[i], i});
1405  llvm::sort(invPermMap);
1406 
1407  // Move the innermost loop body to the loop that would be the innermost in the
1408  // permuted nest (only if the innermost loop is going to change).
1409  if (permMap.back() != input.size() - 1) {
1410  Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1411  Block *srcBody = ((AffineForOp)input.back()).getBody();
1412  destBody->getOperations().splice(destBody->begin(),
1413  srcBody->getOperations(), srcBody->begin(),
1414  std::prev(srcBody->end()));
1415  }
1416 
1417  // We'll move each loop in `input` in the reverse order so that its body is
1418  // empty when we are moving it; this incurs zero copies and no erasing.
1419  for (int i = input.size() - 1; i >= 0; --i) {
1420  // If this has to become the outermost loop after permutation, add it to the
1421  // parent block of the original root.
1422  if (permMap[i] == 0) {
1423  // If the root remains the same, nothing to do.
1424  if (i == 0)
1425  continue;
1426  // Make input[i] the new outermost loop moving it into parentBlock.
1427  auto *parentBlock = input[0]->getBlock();
1428  parentBlock->getOperations().splice(Block::iterator(input[0]),
1429  input[i]->getBlock()->getOperations(),
1430  Block::iterator(input[i]));
1431  continue;
1432  }
1433 
1434  // If the parent in the permuted order is the same as in the original,
1435  // nothing to do.
1436  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1437  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1438  continue;
1439 
1440  // Move input[i] to its surrounding loop in the transformed nest.
1441  auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1442  destBody->getOperations().splice(destBody->begin(),
1443  input[i]->getBlock()->getOperations(),
1444  Block::iterator(input[i]));
1445  }
1446 
1447  return invPermMap[0].second;
1448 }
1449 
1450 // Sinks all sequential loops to the innermost levels (while preserving
1451 // relative order among them) and moves all parallel loops to the
1452 // outermost (while again preserving relative order among them).
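// For example (an illustrative sketch): if the dependence analysis marks the
// loops as [parallel, sequential, parallel], the permutation computed below is
// loopPermMap = [0, 2, 1], i.e., the sequential middle loop is sunk to the
// innermost position.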
1453 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1454  SmallVector<AffineForOp, 4> loops;
1455  getPerfectlyNestedLoops(loops, forOp);
1456  if (loops.size() < 2)
1457  return forOp;
1458 
1459  // Gather dependence components for dependences between all ops in loop nest
1460  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1461  unsigned maxLoopDepth = loops.size();
1462  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1463  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1464 
1465  // Mark loops as either parallel or sequential.
1466  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1467  for (auto &depComps : depCompsVec) {
1468  assert(depComps.size() >= maxLoopDepth);
1469  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1470  DependenceComponent &depComp = depComps[j];
1471  assert(depComp.lb.has_value() && depComp.ub.has_value());
1472  if (*depComp.lb != 0 || *depComp.ub != 0)
1473  isParallelLoop[j] = false;
1474  }
1475  }
1476 
1477  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1478 
1479  // Compute permutation of loops that sinks sequential loops (and thus raises
1480  // parallel loops) while preserving relative order.
1481  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1482  unsigned nextSequentialLoop = numParallelLoops;
1483  unsigned nextParallelLoop = 0;
1484  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1485  if (isParallelLoop[i]) {
1486  loopPermMap[i] = nextParallelLoop++;
1487  } else {
1488  loopPermMap[i] = nextSequentialLoop++;
1489  }
1490  }
1491 
1492  // Check if permutation 'loopPermMap' would violate dependences.
1493  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1494  return forOp;
1495  // Perform loop interchange according to permutation 'loopPermMap'.
1496  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1497  return loops[loopNestRootIndex];
1498 }
1499 
1500 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1501 // lower (resp. upper) loop bound. When called for both the lower and upper
1502 // bounds, the resulting IR resembles:
1503 //
1504 // ```mlir
1505 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1506 // ...
1507 // }
1508 // ```
1509 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1510  SmallVector<Value, 4> *operands,
1511  int64_t offset = 0) {
1512  auto bounds = llvm::to_vector<4>(map->getResults());
1513  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1514  operands->insert(operands->begin() + map->getNumDims(), iv);
1515  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1516  b.getContext());
1517  canonicalizeMapAndOperands(map, operands);
1518 }
1519 
1520 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1521 // Stripmine-sink is a primitive building block for generalized tiling of
1522 // imperfectly nested loops.
1523 // This transformation is purely mechanical and does not check legality,
1524 // profitability or even structural correctness. It is the user's
1525 // responsibility to specify `targets` that are dominated by `forOp`.
1526 // Returns the new AffineForOps, one per `targets`, nested immediately under
1527 // each of the `targets`.
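// For example (an illustrative sketch): strip-mining an `affine.for %i = 0 to
// 128 step 1` by a factor of 32 rescales it to step 32 and creates, inside
// each target, an inner `affine.for %ii = max(%i, ...) to min(%i + 32, ...)`
// with the original step that takes over the target's body.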
1528 static SmallVector<AffineForOp, 8>
1529 stripmineSink(AffineForOp forOp, uint64_t factor,
1530  ArrayRef<AffineForOp> targets) {
1531  auto originalStep = forOp.getStepAsInt();
1532  auto scaledStep = originalStep * factor;
1533  forOp.setStep(scaledStep);
1534 
1535  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1536 
1537  // Lower-bound map creation.
1538  auto lbMap = forOp.getLowerBoundMap();
1539  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1540  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1541 
1542  // Upper-bound map creation.
1543  auto ubMap = forOp.getUpperBoundMap();
1544  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1545  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1546  /*offset=*/scaledStep);
1547 
1548  auto iv = forOp.getInductionVar();
1549  SmallVector<AffineForOp, 8> innerLoops;
1550  for (auto t : targets) {
1551  // Insert newForOp before the terminator of `t`.
1552  auto b = OpBuilder::atBlockTerminator(t.getBody());
1553  auto newForOp = AffineForOp::create(b, t.getLoc(), lbOperands, lbMap,
1554  ubOperands, ubMap, originalStep);
1555  auto begin = t.getBody()->begin();
1556  // Skip terminator and `newForOp` which is just before the terminator.
1557  auto nOps = t.getBody()->getOperations().size() - 2;
1558  newForOp.getBody()->getOperations().splice(
1559  newForOp.getBody()->getOperations().begin(),
1560  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1561  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1562  newForOp.getRegion());
1563  innerLoops.push_back(newForOp);
1564  }
1565 
1566  return innerLoops;
1567 }
1568 
1569 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1570 // Returns the new AffineForOps, nested immediately under `target`.
1571 template <typename SizeType>
1572 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1573  AffineForOp target) {
1574  // TODO: Use cheap structural assertions that targets are nested under
1575  // forOp and that targets are not nested under each other when DominanceInfo
1576  // exposes the capability. It seems overkill to construct a whole function
1577  // dominance tree at this point.
1578  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1579  assert(res.size() == 1 && "Expected 1 inner forOp");
1580  return res[0];
1581 }
1582 
1583 SmallVector<SmallVector<AffineForOp, 8>, 8>
1584 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1585  ArrayRef<AffineForOp> targets) {
1586  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1587  SmallVector<AffineForOp, 8> currentTargets(targets);
1588  for (auto it : llvm::zip(forOps, sizes)) {
1589  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1590  res.push_back(step);
1591  currentTargets = step;
1592  }
1593  return res;
1594 }
1595 
1596 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1597  ArrayRef<uint64_t> sizes,
1598  AffineForOp target) {
1599  SmallVector<AffineForOp, 8> res;
1600  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1601  res.push_back(llvm::getSingleElement(loops));
1602  return res;
1603 }
1604 
1605 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1606  if (loops.size() < 2)
1607  return success();
1608 
1609  AffineForOp innermost = loops.back();
1610  AffineForOp outermost = loops.front();
1611  AffineBound ub = outermost.getUpperBound();
1612  AffineMap origUbMap = ub.getMap();
1613  Location loc = outermost.getLoc();
1614  OpBuilder builder(outermost);
1615  for (AffineForOp loop : loops) {
1616  // We only work on normalized loops.
1617  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1618  loop.getConstantLowerBound() != 0)
1619  return failure();
1620  }
1621  SmallVector<Value, 4> upperBoundSymbols;
1622  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1623  ub.getOperands().end());
1624 
1625  // 1. Store the upper bound of the outermost loop in a variable.
1626  Value prev;
1627  if (!llvm::hasSingleElement(origUbMap.getResults()))
1628  prev = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1629  else
1630  prev = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1631  upperBoundSymbols.push_back(prev);
1632 
1633  // 2. Emit code computing the upper bound of the coalesced loop as product of
1634  // the number of iterations of all loops.
1635  for (AffineForOp loop : loops.drop_front()) {
1636  ub = loop.getUpperBound();
1637  origUbMap = ub.getMap();
1638  ubOperands = ub.getOperands();
1639  Value upperBound;
1640  // If upper bound map has more than one result, take their minimum.
1641  if (!llvm::hasSingleElement(origUbMap.getResults()))
1642  upperBound = AffineMinOp::create(builder, loc, origUbMap, ubOperands);
1643  else
1644  upperBound = AffineApplyOp::create(builder, loc, origUbMap, ubOperands);
1645  upperBoundSymbols.push_back(upperBound);
1646  SmallVector<Value, 4> operands;
1647  operands.push_back(prev);
1648  operands.push_back(upperBound);
1649  // Maintain running product of loop upper bounds.
1650  prev = AffineApplyOp::create(
1651  builder, loc,
1652  AffineMap::get(/*dimCount=*/1,
1653  /*symbolCount=*/1,
1654  builder.getAffineDimExpr(0) *
1655  builder.getAffineSymbolExpr(0)),
1656  operands);
1657  }
1658  // Set upper bound of the coalesced loop.
1659  AffineMap newUbMap = AffineMap::get(
1660  /*dimCount=*/0,
1661  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1662  outermost.setUpperBound(prev, newUbMap);
1663 
1664  builder.setInsertionPointToStart(outermost.getBody());
1665 
1666  // 3. Remap induction variables. For each original loop, the value of the
1667  // induction variable can be obtained by dividing the induction variable of
1668  // the linearized loop by the total number of iterations of the loops nested
1669  // in it modulo the number of iterations in this loop (remove the values
1670  // related to the outer loops):
1671  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1672  // Compute these iteratively from the innermost loop by creating a "running
1673  // quotient" of division by the range.
1674  Value previous = outermost.getInductionVar();
1675  for (unsigned idx = loops.size(); idx > 0; --idx) {
1676  if (idx != loops.size()) {
1677  SmallVector<Value, 4> operands;
1678  operands.push_back(previous);
1679  operands.push_back(upperBoundSymbols[idx]);
1680  previous = AffineApplyOp::create(builder, loc,
1681  AffineMap::get(
1682  /*dimCount=*/1, /*symbolCount=*/1,
1683  builder.getAffineDimExpr(0).floorDiv(
1684  builder.getAffineSymbolExpr(0))),
1685  operands);
1686  }
1687  // Modified value of the induction variables of the nested loops after
1688  // coalescing.
1689  Value inductionVariable;
1690  if (idx == 1) {
1691  inductionVariable = previous;
1692  } else {
1693  SmallVector<Value, 4> applyOperands;
1694  applyOperands.push_back(previous);
1695  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1696  inductionVariable = AffineApplyOp::create(
1697  builder, loc,
1698  AffineMap::get(
1699  /*dimCount=*/1, /*symbolCount=*/1,
1700  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1701  applyOperands);
1702  }
1703  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1704  inductionVariable, loops.back().getRegion());
1705  }
1706 
1707  // 4. Move the operations from the innermost just above the second-outermost
1708  // loop, delete the extra terminator and the second-outermost loop.
1709  AffineForOp secondOutermostLoop = loops[1];
1710  innermost.getBody()->back().erase();
1711  outermost.getBody()->getOperations().splice(
1712  Block::iterator(secondOutermostLoop.getOperation()),
1713  innermost.getBody()->getOperations());
1714  secondOutermostLoop.erase();
1715  return success();
1716 }
1717 
1718 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1719  ArrayRef<Value> processorId,
1720  ArrayRef<Value> numProcessors) {
1721  assert(processorId.size() == numProcessors.size());
1722  if (processorId.empty())
1723  return;
1724 
1725  OpBuilder b(forOp);
1726  Location loc(forOp.getLoc());
1727  AffineExpr lhs, rhs;
1728  bindSymbols(forOp.getContext(), lhs, rhs);
1729  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1730  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1731 
1732  Value linearIndex = processorId.front();
1733  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1734  auto mulApplyOp = AffineApplyOp::create(
1735  b, loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1736  linearIndex = AffineApplyOp::create(b, loc, addMap,
1737  ValueRange{mulApplyOp, processorId[i]});
1738  }
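  // At this point, for example (illustrative note), with processorId =
  // (p0, p1, p2) and numProcessors = (n0, n1, n2):
  //   linearIndex = (p0 * n1 + p1) * n2 + p2.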
1739 
1740  auto mulApplyOp = AffineApplyOp::create(
1741  b, loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1742  Value lb = AffineApplyOp::create(
1743  b, loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1744  forOp.setLowerBound(lb);
1745 
1746  Value step = forOp.getStep();
1747  for (auto numProcs : numProcessors)
1748  step = AffineApplyOp::create(b, loc, mulMap, ValueRange{numProcs, step});
1749  forOp.setStep(step);
1750 }
1751 
1752 /// Given a memref region, determine the lowest depth at which transfers can be
1753 /// placed for it, and return the corresponding block, start and end positions
1754 /// in the block for placing incoming (read) and outgoing (write) copies
1755 /// respectively. The lowest depth depends on whether the region being accessed
1756 /// is hoistable with respect to one or more immediately surrounding loops.
1757 static void
1758 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1759  Block::iterator &begin, Block::iterator &end,
1760  Block **copyPlacementBlock,
1761  Block::iterator *copyInPlacementStart,
1762  Block::iterator *copyOutPlacementStart) {
1763  const auto *cst = region.getConstraints();
1764  SmallVector<Value, 4> symbols;
1765  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1766 
1767  SmallVector<Operation *, 4> enclosingAffineOps;
1768  getEnclosingAffineOps(*block.begin(), &enclosingAffineOps);
1769  // Walk up loop parents till we find an IV on which this region is
1770  // symbolic/variant or we hit `hoistGuard`.
1771  auto it = enclosingAffineOps.rbegin();
1772  AffineForOp lastInvariantFor;
1773  for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1774  Operation *enclosingOp = *it;
1775  // We can't hoist past the definition of the memref being copied.
1776  Value memref = region.memref;
1777  if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) {
1778  LDBG() << "memref definition will end up not dominating hoist location";
1779  break;
1780  }
1781 
1782  auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1783  if (!affineFor)
1784  break;
1785  // TODO: we also need to check, for region symbols that aren't loop IVs,
1786  // whether we are within their respective defs' dominance scope.
1787  if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1788  break;
1789  lastInvariantFor = affineFor;
1790  }
1791 
1792  if (it != enclosingAffineOps.rbegin()) {
1793  *copyInPlacementStart = Block::iterator(lastInvariantFor);
1794  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1795  *copyPlacementBlock = lastInvariantFor->getBlock();
1796  } else {
1797  *copyInPlacementStart = begin;
1798  *copyOutPlacementStart = end;
1799  *copyPlacementBlock = &block;
1800  }
1801 }
1802 
1803 // Info comprising stride and number of elements transferred every stride.
1804 struct StrideInfo {
1805  int64_t stride;
1806  int64_t numEltPerStride;
1807 };
1808 
1809 /// Returns striding information for a copy/transfer of this region with
1810 /// potentially multiple striding levels from outermost to innermost. For an
1811 /// n-dimensional region, there can be at most n-1 levels of striding
1812 /// successively nested.
1813 // TODO: make this work with non-identity layout maps.
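// For example (an illustrative sketch): for a region of shape [64, 32] on a
// memref<64x128xf32>, a single level {stride = 128, numEltPerStride = 32} is
// recorded: 32 contiguous elements are transferred every 128 elements.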
1814 static void getMultiLevelStrides(const MemRefRegion &region,
1815  ArrayRef<int64_t> bufferShape,
1816  SmallVectorImpl<StrideInfo> *strideInfos) {
1817  if (bufferShape.size() <= 1)
1818  return;
1819 
1820  int64_t numEltPerStride = 1;
1821  int64_t stride = 1;
1822  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1823  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1824  stride *= dimSize;
1825  numEltPerStride *= bufferShape[d];
1826  // A stride is needed only if the region has a shorter extent than the
1827  // memref along the dimension *and* has an extent greater than one along the
1828  // next major dimension.
1829  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1830  strideInfos->push_back({stride, numEltPerStride});
1831  }
1832  }
1833 }
1834 
1835 /// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1836 /// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1837 /// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1838 /// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1839 /// contain the expressions to be subtracted out from the respective copy loop
1840 /// iterators in order to index the fast buffer. If `copyOut' is true, generates
1841 /// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1842 /// copy nest is inserted.
1843 //
1844 /// The copy-in nest is generated as follows as an example for a 2-d region:
1845 /// for x = ...
1846 /// for y = ...
1847 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1848 ///
1849 static AffineForOp
1850 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1851  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1852  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1853  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1854  OpBuilder b) {
1855  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1856  return lbMap.getNumInputs() == lbOperands.size();
1857  }));
1858  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1859  return ubMap.getNumInputs() == ubOperands.size();
1860  }));
1861 
1862  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1863  // A copy nest can't be generated for 0-ranked memrefs.
1864  assert(rank != 0 && "non-zero rank memref expected");
1865  assert(lbMaps.size() == rank && "wrong number of lb maps");
1866  assert(ubMaps.size() == rank && "wrong number of ub maps");
1867 
1868  SmallVector<Value, 4> memIndices;
1869  SmallVector<AffineExpr, 4> fastBufExprs;
1870  SmallVector<Value, 4> fastBufMapOperands;
1871  AffineForOp copyNestRoot;
1872  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1873  for (unsigned d = 0; d < rank; ++d) {
1874  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1875  ubOperands, ubMaps[d]);
1876  if (d == 0)
1877  copyNestRoot = forOp;
1878 
1879  b = OpBuilder::atBlockTerminator(forOp.getBody());
1880 
1881  auto fastBufOffsetMap =
1882  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1883  auto offset = AffineApplyOp::create(b, loc, fastBufOffsetMap, lbOperands);
1884 
1885  // Construct the subscript for the fast memref being copied into/from:
1886  // x - offset_x.
1887  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1888  b.getAffineDimExpr(2 * d));
1889  fastBufMapOperands.push_back(offset);
1890  fastBufMapOperands.push_back(forOp.getInductionVar());
1891  mayBeDeadApplys.push_back(offset);
1892 
1893  // Subscript for the slow memref being copied.
1894  memIndices.push_back(forOp.getInductionVar());
1895  }
1896 
1897  auto fastBufMap =
1898  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1899  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1900  fastBufMap = simplifyAffineMap(fastBufMap);
1901  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1902 
1903  // Drop any dead affine.applys.
1904  for (auto applyOp : mayBeDeadApplys)
1905  if (applyOp.use_empty())
1906  applyOp.erase();
1907 
1908  if (!isCopyOut) {
1909  // Copy in.
1910  auto load = AffineLoadOp::create(b, loc, memref, memIndices);
1911  AffineStoreOp::create(b, loc, load, fastMemRef, fastBufMap,
1912  fastBufMapOperands);
1913  return copyNestRoot;
1914  }
1915 
1916  // Copy out.
1917  auto load =
1918  AffineLoadOp::create(b, loc, fastMemRef, fastBufMap, fastBufMapOperands);
1919  AffineStoreOp::create(b, loc, load, memref, memIndices);
1920  return copyNestRoot;
1921 }
1922 
1923 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1924 emitRemarkForBlock(Block &block) {
1925  return block.getParentOp()->emitRemark();
1926 }
1927 
1928 /// Creates a buffer in the faster memory space for the specified memref region
1929 /// (memref has to be non-zero ranked); generates a copy from the lower memory
1930 /// space to this one, and replaces all loads/stores in the block range
1931 /// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1932 /// if copies could not be generated due to yet unimplemented cases.
1933 /// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1934 /// specify the insertion points where the incoming copies and outgoing copies,
1935 /// respectively, should be inserted (the insertion happens right before the
1936 /// insertion point). Since `begin` can itself be invalidated due to the memref
1937 /// rewriting done from this method, the output argument `nBegin` is set to its
1938 /// replacement (set to `begin` if no invalidation happens). Since outgoing
1939 /// copies could have been inserted at `end`, the output argument `nEnd` is set
1940 /// to the new end. `sizeInBytes` is set to the size of the fast buffer
1941 /// allocated.
1942 static LogicalResult generateCopy(
1943  const MemRefRegion &region, Block *block, Block::iterator begin,
1944  Block::iterator end, Block *copyPlacementBlock,
1945  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1946  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1947  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1948  Block::iterator *nBegin, Block::iterator *nEnd) {
1949  *nBegin = begin;
1950  *nEnd = end;
1951 
1952  auto f = begin->getParentOfType<FunctionOpInterface>();
1953  OpBuilder topBuilder(f.getFunctionBody());
1954  Value zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);
1955 
1956  *sizeInBytes = 0;
1957 
1958  if (begin == end)
1959  return success();
1960 
1961  // Record the last op in the block for which we are performing copy
1962  // generation. We later do the memref replacement only in [begin, lastCopyOp]
1963  // so that the original memrefs used in the data movement code themselves
1964  // don't get replaced.
1965  Operation *lastCopyOp = end->getPrevNode();
1966 
1967  // Whether the copy-out point is at the end of the block in which we are
1968  // doing explicit copying.
1969  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1970 
1971  // Copies for read regions are going to be inserted at 'begin'.
1972  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1973  // Copies for write regions are going to be inserted at 'end'.
1974  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1975  OpBuilder &b = region.isWrite() ? epilogue : prologue;
1976 
1977  // Builder to create constants at the top level.
1978  auto func =
1979  copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1980  OpBuilder top(func.getFunctionBody());
1981 
1982  auto loc = region.loc;
1983  auto memref = region.memref;
1984  auto memRefType = cast<MemRefType>(memref.getType());
1985 
1986  if (!memRefType.getLayout().isIdentity()) {
1987  LDBG() << "Non-identity layout map not yet supported";
1988  return failure();
1989  }
1990 
1991  // Indices to use for the copying.
1992  // Indices for the original memref being copied from/to.
1993  SmallVector<Value, 4> memIndices;
1994  // Indices for the faster buffer being copied into/from.
1995  SmallVector<Value, 4> bufIndices;
1996 
1997  unsigned rank = memRefType.getRank();
1998  if (rank == 0) {
1999  LDBG() << "Non-zero ranked memrefs supported";
2000  return failure();
2001  }
2002 
2003  SmallVector<int64_t, 4> fastBufferShape;
2004 
2005  // Compute the extents of the buffer.
2006  SmallVector<AffineMap, 4> lbs;
2007  lbs.reserve(rank);
2008  std::optional<int64_t> numElements =
2009  region.getConstantBoundingSizeAndShape(&fastBufferShape, &lbs);
2010  if (!numElements) {
2011  LDBG() << "Non-constant region size not supported";
2012  return failure();
2013  }
2014 
2015  if (llvm::any_of(lbs, [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2016  // This can be supported in the future if needed.
2017  LDBG() << "Max lower bound for memref region start not supported";
2018  return failure();
2019  }
2020 
2021  if (*numElements == 0) {
2022  LDBG() << "Nothing to copy";
2023  return success();
2024  }
2025 
2026  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2027  for (unsigned i = 0; i < rank; ++i) {
2028  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2029  if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2030  LDBG() << "Missing lower or upper bound for region along dimension: "
2031  << i;
2032  return failure();
2033  }
2034  }
2035 
2036  const FlatAffineValueConstraints *cst = region.getConstraints();
2037  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2038  // on; these typically include loop IVs surrounding the level at which the
2039  // copy generation is being done or other valid symbols in MLIR.
2040  SmallVector<Value, 8> regionSymbols;
2041  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2042 
2043  // Construct the access expression for the fast memory buffer. The access
2044  // expression for a particular dimension of the fast buffer is obtained by
2045  // subtracting out the lower bound on the original memref's data region
2046  // along the corresponding dimension.
2047 
2048  // Index start offsets for faster memory buffer relative to the original.
2049  SmallVector<AffineExpr, 4> fastBufOffsets;
2050  fastBufOffsets.reserve(rank);
2051  for (unsigned d = 0; d < rank; d++) {
2052  assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2053  "incorrect bound size");
2054 
2055  // Set copy start location for this dimension in the lower memory space
2056  // memref.
2057  if (lbs[d].isSingleConstant()) {
2058  auto indexVal = lbs[d].getSingleConstantResult();
2059  if (indexVal == 0) {
2060  memIndices.push_back(zeroIndex);
2061  } else {
2062  memIndices.push_back(
2063  arith::ConstantIndexOp::create(top, loc, indexVal).getResult());
2064  }
2065  } else {
2066  // The coordinate for the start location is just the lower bound along the
2067  // corresponding dimension on the memory region (stored in 'offset').
2068  // Remap all inputs of the map to dimensions uniformly since in the
2069  // generated IR we need valid affine symbols as opposed to "symbols" for
2070  // the purpose of the memref region.
2071  SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2072  for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2073  symReplacements[i] = top.getAffineDimExpr(i);
2074  lbs[d] = lbs[d].replaceDimsAndSymbols(
2075  /*dimReplacements=*/{}, symReplacements, lbs[d].getNumSymbols(),
2076  /*numResultSyms=*/0);
2077  memIndices.push_back(
2078  AffineApplyOp::create(b, loc, lbs[d], regionSymbols));
2079  }
2080  // The fast buffer is copied into at location zero; addressing is relative.
2081  bufIndices.push_back(zeroIndex);
2082 
2083  // Record the offsets since they are needed to remap the memory accesses of
2084  // the original memref further below.
2085  fastBufOffsets.push_back(lbs[d].getResult(0));
2086  }
2087 
2088  // The faster memory space buffer.
2089  Value fastMemRef;
2090 
2091  // Check if a buffer was already created.
2092  bool existingBuf = fastBufferMap.count(memref) > 0;
2093  if (!existingBuf) {
2094  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2095  auto fastMemRefType =
2096  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2097  fastBufferLayout, copyOptions.fastMemorySpace);
2098 
2099  // Create the fast memory space buffer just before the 'affine.for'
2100  // operation.
2101  fastMemRef =
2102  memref::AllocOp::create(prologue, loc, fastMemRefType).getResult();
2103  // Record it.
2104  fastBufferMap[memref] = fastMemRef;
2105  // fastMemRefType is a constant shaped memref.
2106  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2107  // We don't account for things of unknown size.
2108  *sizeInBytes = maySizeInBytes.value_or(0);
2109 
2110  LLVM_DEBUG(emitRemarkForBlock(*block)
2111  << "Creating fast buffer of type " << fastMemRefType
2112  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2113  << " KiB\n");
2114  } else {
2115  // Reuse the one already created.
2116  fastMemRef = fastBufferMap[memref];
2117  }
2118 
2119  auto numElementsSSA = arith::ConstantIndexOp::create(top, loc, *numElements);
2120 
2121  Value dmaStride;
2122  Value numEltPerDmaStride;
2123  if (copyOptions.generateDma) {
2124  SmallVector<StrideInfo, 4> dmaStrideInfos;
2125  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2126 
2127  // TODO: use all stride levels once DmaStartOp is extended for
2128  // multi-level strides.
2129  if (dmaStrideInfos.size() > 1) {
2130  LDBG() << "Only up to one level of stride supported";
2131  return failure();
2132  }
2133 
2134  if (!dmaStrideInfos.empty()) {
2135  dmaStride =
2136  arith::ConstantIndexOp::create(top, loc, dmaStrideInfos[0].stride);
2137  numEltPerDmaStride = arith::ConstantIndexOp::create(
2138  top, loc, dmaStrideInfos[0].numEltPerStride);
2139  }
2140  }
2141 
2142  // Create fully composed affine maps for each memref.
2143  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2144  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2145  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2146  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2147 
2148  if (!copyOptions.generateDma) {
2149  // Point-wise copy generation.
2150  auto copyNest =
2151  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2152  /*lbOperands=*/regionSymbols, ubMaps,
2153  /*ubOperands=*/regionSymbols, fastBufOffsets,
2154  /*isCopyOut=*/region.isWrite(), b);
2155 
2156  // Record this so that we can skip it from yet another copy.
2157  copyNests.insert(copyNest);
2158 
2159  // Since new ops are being appended (for copy out's), adjust the end to
2160  // mark end of block range being processed if necessary.
2161  if (region.isWrite() && isCopyOutAtEndOfBlock)
2162  *nEnd = Block::iterator(copyNest.getOperation());
2163  } else {
2164  // DMA generation.
2165  // Create a tag (single element 1-d memref) for the DMA.
2166  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2167  copyOptions.tagMemorySpace);
2168  auto tagMemRef = memref::AllocOp::create(prologue, loc, tagMemRefType);
2169 
2170  SmallVector<Value, 4> tagIndices({zeroIndex});
2171  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2172  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2173  if (!region.isWrite()) {
2174  // DMA non-blocking read from original buffer to fast buffer.
2175  AffineDmaStartOp::create(b, loc, memref, memAffineMap, memIndices,
2176  fastMemRef, bufAffineMap, bufIndices, tagMemRef,
2177  tagAffineMap, tagIndices, numElementsSSA,
2178  dmaStride, numEltPerDmaStride);
2179  } else {
2180  // DMA non-blocking write from fast buffer to the original memref.
2181  auto op = AffineDmaStartOp::create(
2182  b, loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2183  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2184  dmaStride, numEltPerDmaStride);
2185  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2186  // end to mark end of block range being processed.
2187  if (isCopyOutAtEndOfBlock)
2188  *nEnd = Block::iterator(op.getOperation());
2189  }
2190 
2191  // Matching DMA wait to block on completion; tag always has a 0 index.
2192  AffineDmaWaitOp::create(b, loc, tagMemRef, tagAffineMap, zeroIndex,
2193  numElementsSSA);
2194 
2195  // Generate dealloc for the tag.
2196  auto tagDeallocOp = memref::DeallocOp::create(epilogue, loc, tagMemRef);
2197  if (*nEnd == end && isCopyOutAtEndOfBlock)
2198  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2199  // mark end of range of the original.
2200  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2201  }
2202 
2203  // Generate dealloc for the buffer.
2204  if (!existingBuf) {
2205  auto bufDeallocOp = memref::DeallocOp::create(epilogue, loc, fastMemRef);
2206  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2207  // the fast buffer (since it marks the new end insertion point).
2208  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2209  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2210  }
2211 
2212  // Replace all uses of the old memref with the faster one while remapping
2213  // access indices (subtracting out lower bound offsets for each dimension).
2214  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2215  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2216  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2217  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2218  // d2, d3 correspond to the original indices (%i, %j).
2219  SmallVector<AffineExpr, 4> remapExprs;
2220  remapExprs.reserve(rank);
2221  for (unsigned i = 0; i < rank; i++) {
2222  // The starting operands of indexRemap will be regionSymbols (the symbols on
2223  // which the memref region is parametric); then those corresponding to
2224  // the memref's original indices follow.
2225  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2226  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2227  }
2228  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2229  b.getContext());
2230 
2231  // Record the begin since it may be invalidated by memref replacement.
2232  Block::iterator prevOfBegin;
2233  bool isBeginAtStartOfBlock = (begin == block->begin());
2234  if (!isBeginAtStartOfBlock)
2235  prevOfBegin = std::prev(begin);
2236 
2237  auto userFilterFn = [&](Operation *user) {
2238  auto *ancestorUser = block->findAncestorOpInBlock(*user);
2239  return ancestorUser && !ancestorUser->isBeforeInBlock(&*begin) &&
2240  !lastCopyOp->isBeforeInBlock(ancestorUser);
2241  };
2242 
2243  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2244  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2245  /*extraIndices=*/{}, indexRemap,
2246  /*extraOperands=*/regionSymbols,
2247  /*symbolOperands=*/{}, userFilterFn);
2248 
2249  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2250 
2251  return success();
2252 }
2253 
2254 /// Construct the memref region to just include the entire memref. Returns
2255 /// false for dynamic-shaped memrefs for now. `numParamLoopIVs` is the number of
2256 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2257 /// is parametric on.
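/// For example (an illustrative sketch): for an affine.load from a
/// memref<128x256xf32>, the constructed region is bounded by 0 <= d0 <= 127
/// and 0 <= d1 <= 255, parametric on the first `numParamLoopIVs` enclosing
/// loop IVs.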
2258 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2259  MemRefRegion *region) {
2260  unsigned rank;
2261  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2262  rank = loadOp.getMemRefType().getRank();
2263  region->memref = loadOp.getMemRef();
2264  region->setWrite(false);
2265  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2266  rank = storeOp.getMemRefType().getRank();
2267  region->memref = storeOp.getMemRef();
2268  region->setWrite(true);
2269  } else {
2270  assert(false && "expected load or store op");
2271  return false;
2272  }
2273  auto memRefType = cast<MemRefType>(region->memref.getType());
2274  if (!memRefType.hasStaticShape())
2275  return false;
2276 
2277  auto *regionCst = region->getConstraints();
2278 
2279  // Just get the first numSymbols IVs, which the memref region is parametric
2280  // on.
2281  SmallVector<AffineForOp, 4> ivs;
2282  getAffineForIVs(*op, &ivs);
2283  ivs.resize(numParamLoopIVs);
2284  SmallVector<Value, 4> symbols;
2285  extractForInductionVars(ivs, &symbols);
2286  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2287  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2288 
2289  // Memref dim sizes provide the bounds.
2290  for (unsigned d = 0; d < rank; d++) {
2291  auto dimSize = memRefType.getDimSize(d);
2292  assert(dimSize > 0 && "filtered dynamic shapes above");
2293  regionCst->addBound(BoundType::LB, d, 0);
2294  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2295  }
2296  return true;
2297 }
2298 
2299 LogicalResult
2300 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2301  const AffineCopyOptions &copyOptions,
2302  std::optional<Value> filterMemRef,
2303  DenseSet<Operation *> &copyNests) {
2304  if (begin == end)
2305  return success();
2306 
2307  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2308  "Inconsistent block begin/end args");
2309  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2310 
2311  Block *block = begin->getBlock();
2312 
2313  // Copies will be generated for this depth, i.e., symbolic in all loops
2314  // surrounding this block range.
2315  unsigned copyDepth = getNestingDepth(&*begin);
2316 
2317  LDBG() << "Generating copies at depth " << copyDepth;
2318  LDBG() << "from begin: "
2319  << OpWithFlags(&*begin, OpPrintingFlags().skipRegions());
2320  LDBG() << "to inclusive end: "
2321  << OpWithFlags(&*std::prev(end), OpPrintingFlags().skipRegions());
2322 
2323  // List of memory regions to copy for. We need a map vector to have a
2324  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2325  // since the alloc's for example are identical except for the SSA id.
2326  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2327  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2328 
2329  // Map from original memref's to the fast buffers that their accesses are
2330  // replaced with.
2331  DenseMap<Value, Value> fastBufferMap;
2332 
2333  // To check for errors when walking the block.
2334  bool error = false;
2335 
2336  // Walk this range of operations to gather all memory regions.
2337  block->walk(begin, end, [&](Operation *opInst) {
2338  Value memref;
2339  MemRefType memrefType;
2340  // Gather regions to allocate to buffers in faster memory space.
2341  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2342  memref = loadOp.getMemRef();
2343  memrefType = loadOp.getMemRefType();
2344  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2345  memref = storeOp.getMemRef();
2346  memrefType = storeOp.getMemRefType();
2347  }
2348  // Not an affine.load/store op.
2349  if (!memref)
2350  return;
2351 
2352  if ((filterMemRef.has_value() && filterMemRef != memref) ||
2353  (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2354  memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2355  return;
2356 
2357  if (!memref.getParentRegion()->isAncestor(block->getParent())) {
2358  LDBG() << "memref definition is inside of the depth at "
2359  << "which copy-in/copy-out would happen";
2360  return;
2361  }
2362 
2363  // Compute the MemRefRegion accessed.
2364  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2365  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2366  /*addMemRefDimBounds=*/false))) {
2367  LDBG() << "Error obtaining memory region: semi-affine maps?";
2368  LDBG() << "over-approximating to the entire memref";
2369  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2370  LDBG() << "non-constant memref sizes not yet supported";
2371  error = true;
2372  return;
2373  }
2374  }
2375 
2376  // Each memref has a single buffer associated with it irrespective of how
2377  // many load's and store's happen on it.
2378  // TODO: in the future, when regions don't intersect and satisfy
2379  // other properties (based on load/store regions), we could consider
2380  // multiple buffers per memref.
2381 
2382  // Add to the appropriate region if it's not already in it, or take a
2383  // bounding box union with the existing one if it's already in there.
2384  // Note that a memref may have both read and write regions - so update the
2385  // region in the other list if one exists (write in case of read and vice
2386  // versa) since there is a single bounding box for a memref across all reads
2387  // and writes that happen on it.
2388 
2389  // Attempts to update; returns true if 'region' exists in targetRegions.
2390  auto updateRegion =
2391  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2392  &targetRegions) {
2393  const auto *const it = targetRegions.find(region->memref);
2394  if (it == targetRegions.end())
2395  return false;
2396 
2397  // Perform a union with the existing region.
2398  if (failed(it->second->unionBoundingBox(*region))) {
2399  LDBG() << "Memory region bounding box failed; "
2400  << "over-approximating to the entire memref";
2401  // If the union fails, we will overapproximate.
2402  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2403  LDBG() << "non-constant memref sizes not yet supported";
2404  error = true;
2405  return true;
2406  }
2407  it->second->getConstraints()->clearAndCopyFrom(
2408  *region->getConstraints());
2409  } else {
2410  // Union was computed and stored in 'it->second': copy to 'region'.
2411  region->getConstraints()->clearAndCopyFrom(
2412  *it->second->getConstraints());
2413  }
2414  return true;
2415  };
2416 
2417  bool existsInRead = updateRegion(readRegions);
2418  if (error)
2419  return;
2420  bool existsInWrite = updateRegion(writeRegions);
2421  if (error)
2422  return;
2423 
2424  // Finally add it to the region list.
2425  if (region->isWrite() && !existsInWrite) {
2426  writeRegions[region->memref] = std::move(region);
2427  } else if (!region->isWrite() && !existsInRead) {
2428  readRegions[region->memref] = std::move(region);
2429  }
2430  });
2431 
2432  if (error) {
2433  LDBG() << "copy generation failed for one or more memref's in this block";
2434  return failure();
2435  }
2436 
2437  uint64_t totalCopyBuffersSizeInBytes = 0;
2438  bool ret = true;
2439  auto processRegions =
2440  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2441  &regions) {
2442  for (const auto &regionEntry : regions) {
2443  // For each region, hoist copy in/out past all hoistable
2444  // 'affine.for's.
2445  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2446  Block *copyPlacementBlock;
2447  findHighestBlockForPlacement(
2448  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2449  &copyInPlacementStart, &copyOutPlacementStart);
2450 
2451  uint64_t sizeInBytes;
2452  Block::iterator nBegin, nEnd;
2453  LogicalResult iRet = generateCopy(
2454  *regionEntry.second, block, begin, end, copyPlacementBlock,
2455  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2456  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2457  if (succeeded(iRet)) {
2458  // begin/end could have been invalidated, and need update.
2459  begin = nBegin;
2460  end = nEnd;
2461  totalCopyBuffersSizeInBytes += sizeInBytes;
2462  }
2463  ret = ret & succeeded(iRet);
2464  }
2465  };
2466  processRegions(readRegions);
2467  processRegions(writeRegions);
2468 
2469  if (!ret) {
2470  LDBG() << "copy generation failed for one or more memref's in this block";
2471  return failure();
2472  }
2473 
2474  // For a range of operations, a note will be emitted at the caller.
2475  AffineForOp forOp;
2476  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2477  LLVM_DEBUG(forOp.emitRemark()
2478  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2479  << " KiB of copy buffers in fast memory space for this block");
2480  }
2481 
2482  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2483  block->getParentOp()->emitWarning(
2484  "total size of all copy buffers' for this block exceeds fast memory "
2485  "capacity");
2486  }
2487 
2488  return success();
2489 }
2490 
2491 // A convenience version of affineDataCopyGenerate for all ops in the body of
2492 // an AffineForOp.
2493 LogicalResult mlir::affine::affineDataCopyGenerate(
2494  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2495  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2496  return affineDataCopyGenerate(forOp.getBody()->begin(),
2497  std::prev(forOp.getBody()->end()), copyOptions,
2498  filterMemRef, copyNests);
2499 }
2500 
2501 LogicalResult mlir::affine::generateCopyForMemRegion(
2502  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2503  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2504  Block *block = analyzedOp->getBlock();
2505  auto begin = analyzedOp->getIterator();
2506  auto end = std::next(begin);
2507  DenseMap<Value, Value> fastBufferMap;
2508  DenseSet<Operation *> copyNests;
2509 
2510  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2511  copyOptions, fastBufferMap, copyNests,
2512  &result.sizeInBytes, &begin, &end);
2513  if (failed(err))
2514  return err;
2515 
2516  const auto &en = fastBufferMap.find(memrefRegion.memref);
2517  // In some cases (empty loops), no copy generation would have happened.
2518  if (en == fastBufferMap.end())
2519  return failure();
2520  result.alloc = en->second.getDefiningOp();
2521  assert(result.alloc && "fast buffer expected to be locally allocated");
2522  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2523  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2524  return success();
2525 }
2526 
2527 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2528 static void
2529 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2530  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2531  // Add a new empty level to output if it doesn't already exist.
2532  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2533  if (currLoopDepth == depthToLoops.size())
2534  depthToLoops.emplace_back();
2535 
2536  for (auto &op : *block) {
2537  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2538  depthToLoops[currLoopDepth].push_back(forOp);
2539  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2540  }
2541  }
2542 }
2543 
2544 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2545 void mlir::affine::gatherLoops(
2546  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2547  for (auto &block : func)
2548  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2549 
2550  // Remove last loop level from output since it's empty.
2551  if (!depthToLoops.empty()) {
2552  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2553  depthToLoops.pop_back();
2554  }
2555 }
2556 
2557 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2558  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2559  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2560  SmallVector<Value, 4> lowerOperands(lbOperands);
2561  SmallVector<Value, 4> upperOperands(ubOperands);
2562 
2563  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2564  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2565  lbMap = removeDuplicateExprs(lbMap);
2566  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2567  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2568  ubMap = removeDuplicateExprs(ubMap);
2569 
2570  return AffineForOp::create(b, loc, lowerOperands, lbMap, upperOperands, ubMap,
2571  step);
2572 }
2573 
2574 /// Creates an AffineIfOp that encodes the conditional to choose between
2575 /// the constant trip count version and an unknown trip count version of this
2576 /// nest of loops. This is used to separate partial and full tiles if `loops`
2577 /// has the intra-tile loops. The affine.if op is inserted at the builder
2578 /// insertion point of `b`.
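/// For example (an illustrative sketch): for intra-tile (point) loops of a
/// 32x32 tiling whose upper bounds are clipped with min(..., %N), the
/// generated affine.if condition holds exactly when a full 32x32 tile is
/// available, so the caller can place the constant-trip-count (full-tile) nest
/// in the "then" region and the original (partial-tile) nest in the "else"
/// region.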
2579 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2580  OpBuilder b) {
2581  if (loops.empty())
2582  return nullptr;
2583 
2584  auto *context = loops[0].getContext();
2585 
2586  FlatAffineValueConstraints cst;
2587  SmallVector<Operation *, 8> ops;
2588  llvm::append_range(ops, loops);
2589  (void)getIndexSet(ops, &cst);
2590 
2591  // Remove constraints that are independent of these loop IVs.
2592  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2593 
2594  // Construct the constraint set representing the guard for full tiles. The
2595  // lower bound (and upper bound) corresponding to the full tile should be
2596  // larger (and resp. smaller) than any other lower (or upper bound).
2597  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2598  for (auto loop : loops) {
2599  (void)loop;
2600  // TODO: generalizing to non-unit strides is straightforward.
2601  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2602  // Mark everything as symbols for the purpose of finding a constant diff pair.
2603  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2604  1);
2605  unsigned fullTileLbPos, fullTileUbPos;
2606  if (!((IntegerRelation)cst)
2607  .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2608  /*boundFloorDivisor=*/nullptr,
2609  /*ub=*/nullptr, &fullTileLbPos,
2610  &fullTileUbPos)) {
2611  LDBG() << "Can't get constant diff pair for a loop";
2612  return nullptr;
2613  }
2614 
2615  SmallVector<unsigned, 4> lbIndices, ubIndices;
2616  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2617 
2618  auto fLb = cst.getInequality(fullTileLbPos);
2619  auto fUb = cst.getInequality(fullTileUbPos);
2620  fullTileLb.assign(fLb.begin(), fLb.end());
2621  fullTileUb.assign(fUb.begin(), fUb.end());
2622 
2623  // Full tile lower bound should be >= any other lower bound.
2624  for (auto lbIndex : lbIndices)
2625  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2626  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2627 
2628  // Full tile upper bound should be <= any other upper bound.
2629  for (auto ubIndex : ubIndices)
2630  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2631  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2632 
2633  cst.removeVar(0);
2634  }
2635 
2636  // The previous step leads to all zeros for the full tile lb and ub position
2637  // itself; remove those and any other duplicates / trivial redundancies.
2638  cst.removeTrivialRedundancy();
2639 
2640  // Turn everything into dims conservatively since we earlier turned all
2641  // trailing ids past point loop IV into symbols. Some of these could be outer
2642  // loop IVs; we'll canonicalize anyway.
2643  cst.setDimSymbolSeparation(0);
2644 
2645  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2646  // ifCondSet can be null if cst was empty -- this can happen if all loops
2647  // in the nest have constant trip counts.
2648  if (!ifCondSet)
2649  return nullptr;
2650 
2651  SmallVector<Value, 4> setOperands;
2652  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2653  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2654  return AffineIfOp::create(b, loops[0].getLoc(), ifCondSet, setOperands,
2655  /*withElseRegion=*/true);
2656 }
2657 
2658 /// Create the full tile loop nest (along with its body).
2659 static LogicalResult
2660 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2661  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2662  fullTileLoops.reserve(inputNest.size());
2663 
2664  // For each loop in the original nest identify a lower/upper bound pair such
2665  // that their difference is a constant.
2666  FlatAffineValueConstraints cst;
2667  for (auto loop : inputNest) {
2668  // TODO: straightforward to generalize to a non-unit stride.
2669  if (loop.getStepAsInt() != 1) {
2670  LDBG() << "[tile separation] non-unit stride not implemented";
2671  return failure();
2672  }
2673  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2674  (void)getIndexSet(loopOp, &cst);
2675  // We will mark everything other than this loop IV as symbol for getting a
2676  // pair of <lb, ub> with a constant difference.
2677  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - 1);
2678  unsigned lbPos, ubPos;
2679  if (!((IntegerRelation)cst)
2680  .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2681  /*boundFloorDivisor=*/nullptr,
2682  /*ub=*/nullptr, &lbPos, &ubPos) ||
2683  lbPos == ubPos) {
2684  LDBG() << "[tile separation] Can't get constant diff / "
2685  << "equalities not yet handled";
2686  return failure();
2687  }
2688 
2689  // Set all variables as dimensions uniformly since some of those marked as
2690  // symbols above could be outer loop IVs (corresponding tile space IVs).
2691  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2692 
2693  AffineValueMap lbVmap, ubVmap;
2694  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2695  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2696  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2697  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2698  ubVmap.getOperands(), ubVmap.getAffineMap());
2699  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2700  fullTileLoops.push_back(fullTileLoop);
2701  }
2702 
2703  // Add the body for the full tile loop nest.
2704  IRMapping operandMap;
2705  for (const auto &loopEn : llvm::enumerate(inputNest))
2706  operandMap.map(loopEn.value().getInductionVar(),
2707  fullTileLoops[loopEn.index()].getInductionVar());
2708  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2709  for (auto &op : inputNest.back().getBody()->without_terminator())
2710  b.clone(op, operandMap);
2711  return success();
2712 }
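 A minimal usage sketch (not from LoopUtils.cpp): each full-tile loop above is built through the public helper affine::createCanonicalizedAffineForOp. The sketch below calls it directly, assuming the caller already holds AffineValueMaps for the two bounds (e.g. obtained via getIneqAsAffineValueMap as done above); the wrapper name and include paths are assumptions.
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 
 static mlir::affine::AffineForOp
 buildLoopFromValueMaps(mlir::OpBuilder &b, mlir::Location loc,
                        const mlir::affine::AffineValueMap &lbVmap,
                        const mlir::affine::AffineValueMap &ubVmap) {
   // createCanonicalizedAffineForOp canonicalizes and simplifies the bound
   // maps/operands before materializing the affine.for op.
   return mlir::affine::createCanonicalizedAffineForOp(
       b, loc, lbVmap.getOperands(), lbVmap.getAffineMap(),
       ubVmap.getOperands(), ubVmap.getAffineMap(), /*step=*/1);
 }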
2713 
2714 LogicalResult
2715 affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2716  SmallVectorImpl<AffineForOp> *fullTileNest) {
2717  if (inputNest.empty())
2718  return success();
2719 
2720  auto firstLoop = inputNest[0];
2721 
2722  // Each successive for op has to be nested in the previous one.
2723  auto prevLoop = firstLoop;
2724  for (auto loop : inputNest.drop_front(1)) {
2725  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2726  prevLoop = loop;
2727  }
2728 
2729  // Create the full tile loop nest.
2730  SmallVector<AffineForOp, 4> fullTileLoops;
2731  OpBuilder b(firstLoop);
2732  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2733  if (!fullTileLoops.empty())
2734  fullTileLoops.front().erase();
2735  return failure();
2736  }
2737 
2738  // Create and insert the version select right before the root of the nest.
2739  b = OpBuilder(firstLoop);
2740  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2741  if (!ifOp) {
2742  fullTileLoops.front().erase();
2743  LDBG() << "All tiles are full tiles, or failure creating "
2744  << "separation condition";
2745  return failure();
2746  }
2747 
2748  // Move the full tile into the then block.
2749  Block *thenBlock = ifOp.getThenBlock();
2750  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2751  thenBlock->getOperations().splice(
2752  std::prev(thenBlock->end()),
2753  outermostFullTileLoop->getBlock()->getOperations(),
2754  Block::iterator(outermostFullTileLoop));
2755 
2756  // Move the partial tile into the else block. The partial tile is the same as
2757  // the original loop nest.
2758  Block *elseBlock = ifOp.getElseBlock();
2759  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2760  firstLoop->getBlock()->getOperations(),
2761  Block::iterator(firstLoop));
2762 
2763  if (fullTileNest)
2764  *fullTileNest = std::move(fullTileLoops);
2765 
2766  return success();
2767 }
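 A minimal usage sketch (not from LoopUtils.cpp): one way a pass might drive separateFullTiles, assuming `root` is the outermost affine.for of a perfectly nested (e.g. freshly tiled) band; the function name and include paths are assumptions.
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "llvm/ADT/SmallVector.h"
 
 static void separateTilesInNest(mlir::affine::AffineForOp root) {
   using namespace mlir;
   using namespace mlir::affine;
   // Collect the perfect nest rooted at `root`, outermost first.
   SmallVector<AffineForOp, 6> band;
   getPerfectlyNestedLoops(band, root);
   // Version the nest: the full-tile nest goes into the then-block of the
   // generated affine.if, the original (partial-tile) nest into its else-block.
   SmallVector<AffineForOp, 6> fullTileNest;
   if (failed(separateFullTiles(band, &fullTileNest))) {
     // Nest not amenable (e.g. non-unit stride or no constant lb/ub
     // difference); per the implementation above, the original nest is kept.
     return;
   }
 }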
2768 
2769 LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
2770  LogicalResult result(failure());
2771  SmallVector<AffineForOp> loops;
2772  getPerfectlyNestedLoops(loops, op);
2773  if (loops.size() <= 1)
2774  return success();
2775 
2776  // Look for a band of loops that can be coalesced, i.e. perfectly nested
2777  // loops with bounds defined above some loop.
2778  // 1. For each loop, find above which parent loop its operands are
2779  // defined.
2780  SmallVector<unsigned> operandsDefinedAbove(loops.size());
2781  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2782  operandsDefinedAbove[i] = i;
2783  for (unsigned j = 0; j < i; ++j) {
2784  if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
2785  operandsDefinedAbove[i] = j;
2786  break;
2787  }
2788  }
2789  }
2790 
2791  // 2. Identify bands of loops such that the operands of all of them are
2792  // defined above the first loop in the band. Traverse the nest bottom-up
2793  // so that modifications don't invalidate the inner loops.
2794  for (unsigned end = loops.size(); end > 0; --end) {
2795  unsigned start = 0;
2796  for (; start < end - 1; ++start) {
2797  auto maxPos =
2798  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
2799  std::next(operandsDefinedAbove.begin(), end));
2800  if (maxPos > start)
2801  continue;
2802  assert(maxPos == start &&
2803  "expected loop bounds to be known at the start of the band");
2804  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2805  if (succeeded(coalesceLoops(band)))
2806  result = success();
2807  break;
2808  }
2809  // If a band was found and transformed, keep looking at the loops above
2810  // the outermost transformed loop.
2811  if (start != end - 1)
2812  end = start + 1;
2813  }
2814  return result;
2815 }
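 A minimal usage sketch (not from LoopUtils.cpp): invoking the coalescing entry point above from a pass, assuming `funcOp` is the func::FuncOp being processed; the wrapper name and include paths are assumptions.
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "llvm/ADT/SmallVector.h"
 
 static void coalesceAffineLoopsIn(mlir::func::FuncOp funcOp) {
   using namespace mlir;
   using namespace mlir::affine;
   // Gather outermost affine.for ops first so that the rewrites below do not
   // invalidate the walk.
   SmallVector<AffineForOp> outermost;
   funcOp.walk([&](AffineForOp forOp) {
     if (!forOp->getParentOfType<AffineForOp>())
       outermost.push_back(forOp);
   });
   for (AffineForOp forOp : outermost)
     // Coalesces a coalescable perfectly nested band under `forOp`, if any;
     // failure simply means no band was rewritten.
     (void)coalescePerfectlyNestedAffineLoops(forOp);
 }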
2816 
2817 int64_t affine::numEnclosingInvariantLoops(OpOperand &operand) {
2818  int64_t count = 0;
2819  Operation *currentOp = operand.getOwner();
2820  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
2821  if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
2822  break;
2823  currentOp = loopOp;
2824  count++;
2825  }
2826  return count;
2827 }
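 A minimal usage sketch (not from LoopUtils.cpp): querying numEnclosingInvariantLoops, e.g. to rank operands by how far their defining values could be hoisted; the wrapper shown and the include paths are assumptions.
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/IR/Operation.h"
 
 // Returns the number of enclosing loop-like ops above which the first operand
 // of `op` remains invariant (0 if `op` has no operands).
 static int64_t invariantDepthOfFirstOperand(mlir::Operation *op) {
   if (op->getNumOperands() == 0)
     return 0;
   return mlir::affine::numEnclosingInvariantLoops(op->getOpOperand(0));
 }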
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:383
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:660
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds, and moves the body of the original loop ne...
Definition: LoopUtils.cpp:423
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2258
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1529
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1924
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:363
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1758
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:175
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:408
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:559
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:464
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2529
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:899
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region (memref has to be non-zer...
Definition: LoopUtils.cpp:1942
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2660
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1814
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:688
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:960
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1072
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1302
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:417
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2579
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:45
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1509
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to a non-zero ranked ‘memref’ to/from ‘fastMemRef’ and returns the o...
Definition: LoopUtils.cpp:1850
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:103
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:959
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:394
unsigned getNumDims() const
Definition: AffineMap.cpp:390
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:403
unsigned getNumResults() const
Definition: AffineMap.cpp:398
unsigned getNumInputs() const
Definition: AffineMap.cpp:399
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:407
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
Definition: Block.cpp:74
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:27
RetT walk(FnT &&callback)
Walk all nested operations, blocks (including this block) or regions, depending on the type of callba...
Definition: Block.h:308
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:244
OpListType & getOperations()
Definition: Block.h:137
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:31
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:396
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:402
AffineMap getDimIdentityMap()
Definition: Builders.cpp:378
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:382
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:363
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:367
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:359
MLIRContext * getContext() const
Definition: Builders.h:55
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteConfig & setStrictness(GreedyRewriteStrictness mode)
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
IRValueT get() const
Return the current value being used by this operand.
Definition: UseDefLists.h:160
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:764
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:314
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:548
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:250
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410
This class represents an operand of an operation.
Definition: Value.h:257
Set of flags used to control the behavior of the various IR print methods (e.g.
A wrapper class that allows for printing an operation with a set of flags, useful to act as a "stream...
Definition: Operation.h:1111
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:43
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Definition: Operation.cpp:385
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:718
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:236
Region * getParentRegion()
Returns the region to which the instruction belongs.
Definition: Operation.h:230
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:288
bool isAncestor(Region *other)
Return true if this region is ancestor of the other region.
Definition: Region.h:222
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:208
Type getType() const
Return the type of this value.
Definition: Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:18
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:39
static WalkResult advance()
Definition: WalkResult.h:47
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:550
Value getOperand(unsigned idx)
Definition: AffineOps.h:556
operand_range getOperands()
Definition: AffineOps.h:565
static AffineDmaStartOp create(OpBuilder &builder, Location location, Value srcMemRef, AffineMap srcMap, ValueRange srcIndices, Value destMemRef, AffineMap dstMap, ValueRange destIndices, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements, Value stride=nullptr, Value elementsPerStride=nullptr)
Definition: AffineOps.cpp:1748
static AffineDmaWaitOp create(OpBuilder &builder, Location location, Value tagMemRef, AffineMap tagMap, ValueRange tagIndices, Value numElements)
Definition: AffineOps.cpp:1944
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
An IntegerRelation represents the set of points from a PresburgerSpace that satisfy a list of affine ...
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
ArrayRef< DynamicAPInt > getInequality(unsigned idx) const
DynamicAPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1605
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:871
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:119
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2300
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1061
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2825
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:995
void getEnclosingAffineOps(Operation &op, SmallVectorImpl< Operation * > *ops)
Populates 'ops' with affine operations enclosing op ordered from outermost to innermost while stoppin...
Definition: Utils.cpp:769
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2545
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1361
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2557
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:856
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:230
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1336
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2501
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1619
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:886
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1383
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1084
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:814
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands, bool composeAffineMin=false)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1260
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1624
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:755
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1367
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2817
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1967
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1718
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1584
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1453
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:772
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1279
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, llvm::function_ref< bool(Operation *)> userFilterFn=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1306
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2769
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2715
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2779
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition: Remarks.h:491
Include the generated interface declarations.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:766
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:35
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:776
LogicalResult applyOpPatternsGreedily(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist dr...
const FrozenRewritePatternSet & patterns
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:325
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:26
@ ExistingAndNewOps
Only pre-existing and newly created ops are processed.
int64_t stride
Definition: LoopUtils.cpp:1805
int64_t numEltPerStride
Definition: LoopUtils.cpp:1806
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:159
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:199
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:481
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:1064
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:527
bool isWrite() const
Definition: Utils.h:529
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:1120
void setWrite(bool flag)
Definition: Utils.h:530
Value memref
Memref that this region corresponds to.
Definition: Utils.h:562
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:569
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.