LoopUtils.cpp
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
24 #include "mlir/IR/IRMapping.h"
25 #include "mlir/IR/IntegerSet.h"
28 #include "llvm/ADT/MapVector.h"
29 #include "llvm/ADT/SmallPtrSet.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include <optional>
33 
34 #define DEBUG_TYPE "loop-utils"
35 
36 using namespace mlir;
37 using namespace affine;
38 using namespace presburger;
39 using llvm::SmallMapVector;
40 
41 namespace {
42 // This structure is used to pass and return sets of loop parameters without
43 // confusing their order.
44 struct LoopParams {
45  Value lowerBound;
46  Value upperBound;
47  Value step;
48 };
49 } // namespace
50 
51 /// Computes the cleanup loop lower bound of the loop being unrolled with
52 /// the specified unroll factor; this bound will also be the upper bound of the
53 /// main part of the unrolled loop. Computes the bound as an AffineMap with its
54 /// operands or a null map when the trip count can't be expressed as an affine
55 /// expression.
56 static void
57 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
58  AffineMap &cleanupLbMap,
59  SmallVectorImpl<Value> &cleanupLbOperands) {
60  AffineMap tripCountMap;
61  SmallVector<Value, 4> tripCountOperands;
62  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
63  // Trip count can't be computed.
64  if (!tripCountMap) {
65  cleanupLbMap = AffineMap();
66  return;
67  }
68 
69  OpBuilder b(forOp);
70  auto lbMap = forOp.getLowerBoundMap();
71  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
72  forOp.getLowerBoundOperands());
73 
74  // For each upper bound expr, get the range.
75  // Eg: affine.for %i = lb to min (ub1, ub2),
76  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
77  // lb + (tr1 - tr1 % ufactor) * step, lb + (tr2 - tr2 % ufactor) * step; the
78  // results of all these affine.apply's make up the cleanup loop lower bound.
79  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
80  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
81  int64_t step = forOp.getStepAsInt();
82  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
83  auto tripCountExpr = tripCountMap.getResult(i);
84  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
85  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
86  tripCountMap.getNumSymbols(), bumpExprs[i]);
87  bumpValues[i] =
88  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
89  }
90 
91  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
92  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
93  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
94 
95  cleanupLbOperands.clear();
96  cleanupLbOperands.push_back(lb);
97  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
98  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
99  b.getContext());
100  // Simplify the cleanupLbMap + cleanupLbOperands.
101  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
102  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
103  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
104  // Remove any affine.apply's that became dead from the simplification above.
105  for (auto v : bumpValues)
106  if (v.use_empty())
107  v.getDefiningOp()->erase();
108 
109  if (lb.use_empty())
110  lb.erase();
111 }
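// Example (an illustrative sketch, not taken from the source): for a loop
// `affine.for %i = 0 to 37 step 2` unrolled by a factor of 4, the trip count
// map yields 19 iterations, the bump value is (19 - 19 mod 4) * 2 = 32, and
// the cleanup lower bound becomes 0 + 32 = 32; the main unrolled loop then
// covers [0, 32) and the cleanup loop covers the 3 leftover iterations
// %i = 32, 34, 36.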
112 
113 /// Helper to replace uses of loop carried values (iter_args) and loop
114 /// yield values while promoting single iteration affine.for ops.
115 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
116  // Replace uses of iter arguments with iter operands (initial values).
117  auto iterOperands = forOp.getInits();
118  auto iterArgs = forOp.getRegionIterArgs();
119  for (auto e : llvm::zip(iterOperands, iterArgs))
120  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
121 
122  // Replace uses of loop results with the values yielded by the loop.
123  auto outerResults = forOp.getResults();
124  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
125  for (auto e : llvm::zip(outerResults, innerResults))
126  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
127 }
128 
129 /// Promotes the loop body of a forOp to its containing block if the forOp
130 /// is known to have a single iteration.
131 // TODO: extend this for arbitrary affine bounds.
132 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
133  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
134  if (!tripCount || *tripCount != 1)
135  return failure();
136 
137  if (forOp.getLowerBoundMap().getNumResults() != 1)
138  return failure();
139 
140  // Replace all uses of the IV with its single iteration value.
141  auto iv = forOp.getInductionVar();
142  auto *parentBlock = forOp->getBlock();
143  if (!iv.use_empty()) {
144  if (forOp.hasConstantLowerBound()) {
145  OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
146  auto constOp = topBuilder.create<arith::ConstantIndexOp>(
147  forOp.getLoc(), forOp.getConstantLowerBound());
148  iv.replaceAllUsesWith(constOp);
149  } else {
150  auto lbOperands = forOp.getLowerBoundOperands();
151  auto lbMap = forOp.getLowerBoundMap();
152  OpBuilder builder(forOp);
153  if (lbMap == builder.getDimIdentityMap()) {
154  // No need to generate an affine.apply.
155  iv.replaceAllUsesWith(lbOperands[0]);
156  } else {
157  auto affineApplyOp =
158  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
159  iv.replaceAllUsesWith(affineApplyOp);
160  }
161  }
162  }
163 
164  replaceIterArgsAndYieldResults(forOp);
165 
166  // Move the loop body operations, except for its terminator, to the loop's
167  // containing block.
168  forOp.getBody()->back().erase();
169  parentBlock->getOperations().splice(Block::iterator(forOp),
170  forOp.getBody()->getOperations());
171  forOp.erase();
172  return success();
173 }
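// Example (an illustrative sketch using a hypothetical "test.foo" op): with a
// unit trip count,
//   affine.for %i = 5 to 6 {
//     "test.foo"(%i) : (index) -> ()
//   }
// is promoted into its containing block as
//   %c5 = arith.constant 5 : index   // hoisted to the function entry block
//   "test.foo"(%c5) : (index) -> ()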
174 
175 /// Generates an affine.for op with the specified lower and upper bounds
176 /// while generating the right IV remappings to realize shifts for operations in
177 /// its body. The operations that go into the loop body are specified in
178 /// opGroupQueue starting from the specified offset, and in that order. The
179 /// first element of the pair specifies the shift applied to that group of
180 /// operations; the shift is multiplied by the loop step before being applied.
181 /// Returns nullptr if the generated loop simplifies to a single iteration one.
182 static AffineForOp generateShiftedLoop(
183  AffineMap lbMap, AffineMap ubMap,
184  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
185  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
186  auto lbOperands = srcForOp.getLowerBoundOperands();
187  auto ubOperands = srcForOp.getUpperBoundOperands();
188 
189  assert(lbMap.getNumInputs() == lbOperands.size());
190  assert(ubMap.getNumInputs() == ubOperands.size());
191 
192  auto loopChunk =
193  b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
194  ubMap, srcForOp.getStepAsInt());
195  auto loopChunkIV = loopChunk.getInductionVar();
196  auto srcIV = srcForOp.getInductionVar();
197 
198  IRMapping operandMap;
199 
200  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
201  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
202  uint64_t shift = it.first;
203  auto ops = it.second;
204  // All 'same shift' operations get added with their operands being
205  // remapped to results of cloned operations, and any use of their IV remapped.
206  // Generate the remapping if the shift is not zero: remappedIV = newIV -
207  // shift.
208  if (!srcIV.use_empty() && shift != 0) {
209  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
210  srcForOp.getLoc(),
211  bodyBuilder.getSingleDimShiftAffineMap(
212  -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
213  loopChunkIV);
214  operandMap.map(srcIV, ivRemap);
215  } else {
216  operandMap.map(srcIV, loopChunkIV);
217  }
218  for (auto *op : ops)
219  bodyBuilder.clone(*op, operandMap);
220  }
221  if (succeeded(promoteIfSingleIteration(loopChunk)))
222  return AffineForOp();
223  return loopChunk;
224 }
225 
226 // The skewing of operations with respect to one another can be used for
227 // example to allow overlap of asynchronous operations (such as DMA
228 // communication) with computation, or just relative shifting of operations
229 // for better register reuse, locality or parallelism. As such, the shifts are
230 // typically expected to be at most of the order of the number of operations.
231 // This method should not be used as a substitute for loop distribution/fission.
232 // This method uses an algorithm that runs in time linear in the number of
233 // operations in the body of the for loop (using the 'sweep line' paradigm).
234 // This method asserts preservation of SSA dominance. A check for that, as well
235 // as for memory-based dependence preservation, rests with the users of this
236 // method.
237 LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
238  ArrayRef<uint64_t> shifts,
239  bool unrollPrologueEpilogue) {
240  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
241  "too few/many shifts");
242  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
243  return success();
244 
245  // If the trip counts aren't constant, we would need versioning and
246  // conditional guards (or context information to prevent such versioning). The
247  // better way to pipeline for such loops is to first tile them and extract
248  // constant trip count "full tiles" before applying this.
249  auto mayBeConstTripCount = getConstantTripCount(forOp);
250  if (!mayBeConstTripCount) {
251  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
252  return success();
253  }
254  uint64_t tripCount = *mayBeConstTripCount;
255 
256  assert(isOpwiseShiftValid(forOp, shifts) &&
257  "shifts will lead to an invalid transformation\n");
258 
259  int64_t step = forOp.getStepAsInt();
260 
261  unsigned numChildOps = shifts.size();
262 
263  // Do a linear time (counting) sort for the shifts.
264  uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
265  if (maxShift >= numChildOps) {
266  // Large shifts are not the typical use case.
267  forOp.emitWarning("not shifting because shifts are unrealistically large");
268  return success();
269  }
270 
271  // An array of operation groups sorted by shift amount; each group has all
272  // operations with the same shift in the order in which they appear in the
273  // body of the 'affine.for' op.
274  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
275  unsigned pos = 0;
276  for (auto &op : forOp.getBody()->without_terminator()) {
277  auto shift = shifts[pos++];
278  sortedOpGroups[shift].push_back(&op);
279  }
280 
281  // Unless the shifts have a specific pattern (which actually would be the
282  // common use case), prologue and epilogue are not meaningfully defined.
283  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
284  // loop generated as the prologue and the last as epilogue and unroll these
285  // fully.
286  AffineForOp prologue, epilogue;
287 
288  // Do a sweep over the sorted shifts while storing open groups in a
289  // vector, and generating loop portions as necessary during the sweep. A block
290  // of operations is paired with its shift.
291  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
292 
293  auto origLbMap = forOp.getLowerBoundMap();
294  uint64_t lbShift = 0;
295  OpBuilder b(forOp);
296  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
297  // If nothing is shifted by d, continue.
298  if (sortedOpGroups[d].empty())
299  continue;
300  if (!opGroupQueue.empty()) {
301  assert(d > 0 &&
302  "Queue expected to be empty when the first block is found");
303  // The interval for which the loop needs to be generated here is:
304  // [lbShift, min(lbShift + tripCount, d)) and the body of the
305  // loop needs to have all operations in opQueue in that order.
306  AffineForOp res;
307  if (lbShift + tripCount * step < d * step) {
308  res = generateShiftedLoop(
309  b.getShiftedAffineMap(origLbMap, lbShift),
310  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
311  opGroupQueue, /*offset=*/0, forOp, b);
312  // Entire loop for the queued op groups generated, empty it.
313  opGroupQueue.clear();
314  lbShift += tripCount * step;
315  } else {
316  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
317  b.getShiftedAffineMap(origLbMap, d),
318  opGroupQueue, /*offset=*/0, forOp, b);
319  lbShift = d * step;
320  }
321 
322  if (res) {
323  // Simplify/canonicalize the affine.for.
324  RewritePatternSet patterns(res.getContext());
325  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
326  GreedyRewriteConfig config;
327  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
328  bool erased;
329  (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns),
330  config, /*changed=*/nullptr, &erased);
331  if (!erased && !prologue)
332  prologue = res;
333  if (!erased)
334  epilogue = res;
335  }
336  } else {
337  // Start of first interval.
338  lbShift = d * step;
339  }
340  // Augment the list of operations that get into the current open interval.
341  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
342  }
343 
344  // The operation groups left in the queue now need to be processed (FIFO)
345  // and their loops completed.
346  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
347  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
348  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
349  b.getShiftedAffineMap(origLbMap, ubShift),
350  opGroupQueue, /*offset=*/i, forOp, b);
351  lbShift = ubShift;
352  if (!prologue)
353  prologue = epilogue;
354  }
355 
356  // Erase the original for op.
357  forOp.erase();
358 
359  if (unrollPrologueEpilogue && prologue)
360  (void)loopUnrollFull(prologue);
361  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
362  (void)loopUnrollFull(epilogue);
363 
364  return success();
365 }
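// Example (an illustrative sketch of the skewing above): for a two-op body
// A(%i); B(%i) with shifts = [0, 1] on `affine.for %i = 0 to 8`, the result is
// roughly a prologue loop running only A for the first iteration, a
// steady-state loop running A(%i) together with B(%i - 1), and an epilogue
// loop running only B for the last iteration, so each B still sees the values
// produced by the A of the same original iteration.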
366 
367 /// Checks whether a loop nest is hyper-rectangular or not.
368 static LogicalResult
369 checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
370  FlatAffineValueConstraints cst;
371  SmallVector<Operation *, 8> ops(input.begin(), input.end());
372  // 0-d or 1-d is trivially hyper-rectangular.
373  if (input.size() <= 1)
374  return success();
375  if (failed(getIndexSet(ops, &cst))) {
376  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
377  return failure();
378  }
379  if (!cst.isHyperRectangular(0, input.size())) {
380  LLVM_DEBUG(llvm::dbgs()
381  << "Non-hyperrectangular nests not supported for tiling!\n");
382  return failure();
383  }
384  return success();
385 }
386 
387 /// Check if the input nest is supported for tiling and whether tiling would be
388 /// legal or not.
389 template <typename t>
390 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
391  ArrayRef<t> tileSizes) {
392  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
393 
394  if (llvm::any_of(input,
395  [](AffineForOp op) { return op.getNumResults() > 0; })) {
396  LLVM_DEBUG(llvm::dbgs()
397  << "Cannot tile nest where a loop has yield values\n");
398  return failure();
399  }
400 
401  // Check if the supplied `for` ops are all successively nested.
402  if (!isPerfectlyNested(input)) {
403  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
404  return failure();
405  }
406 
407  if (failed(checkIfHyperRectangular(input)))
408  return failure();
409 
410  return success();
411 }
412 
413 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
414 /// location in destination's body, ignoring the terminator.
415 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
416  Block::iterator loc) {
417  auto &ops = src.getBody()->getOperations();
418  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
419  std::prev(ops.end()));
420 }
421 
422 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
423 /// body.
424 static void moveLoopBody(AffineForOp src, AffineForOp dest) {
425  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
426 }
427 
428 /// Constructs a tiled loop nest, without setting the loop bounds, and moves the
429 /// body of the original loop nest to the tiled loop nest.
430 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
431  AffineForOp rootAffineForOp, unsigned width,
432  MutableArrayRef<AffineForOp> tiledLoops) {
433  Location loc = rootAffineForOp.getLoc();
434 
435  // The outermost among the loops added so far.
436  Operation *topLoop = rootAffineForOp.getOperation();
437  AffineForOp innermostPointLoop;
438 
439  // Add intra-tile (or point) loops.
440  for (unsigned i = 0; i < width; i++) {
441  OpBuilder b(topLoop);
442  // Loop bounds will be set later.
443  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
444  pointLoop.getBody()->getOperations().splice(
445  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
446  topLoop);
447  tiledLoops[2 * width - 1 - i] = pointLoop;
448  topLoop = pointLoop.getOperation();
449  if (i == 0)
450  innermostPointLoop = pointLoop;
451  }
452 
453  // Add tile space loops.
454  for (unsigned i = width; i < 2 * width; i++) {
455  OpBuilder b(topLoop);
456  // Loop bounds will be set later.
457  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
458  tileSpaceLoop.getBody()->getOperations().splice(
459  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
460  topLoop);
461  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
462  topLoop = tileSpaceLoop.getOperation();
463  }
464 
465  // Move the loop body of the original nest to the new one.
466  moveLoopBody(origLoops.back(), innermostPointLoop);
467 }
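// Sketch of the structure built above (illustrative, not from the source): for
// a band of width 2 over loops (%i, %j), the nest has the shape
//   affine.for %ii { affine.for %jj { affine.for %i { affine.for %j { body }}}}
// where tiledLoops[0..width-1] are the tile-space loops and
// tiledLoops[width..2*width-1] are the intra-tile (point) loops; all bounds
// remain the 0..0 placeholders until the callers below set them.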
468 
469 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
470 // TODO: Handle non-constant lower bounds.
471 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
472  AffineForOp newInterTileLoop,
473  AffineForOp newIntraTileLoop,
474  Value tileSize) {
475  // The lower bound for the intra-tile loop is represented by an affine map
476  // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
477  // bound for the intra-tile loop is represented by an affine map as
478  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where
479  // %i is the loop IV of the corresponding inter-tile loop, %t0 is the
480  // corresponding tiling parameter, %origlb is the lower bound and
481  // %origLoopStep is the step of the original loop.
482 
483  assert(origLoop.hasConstantLowerBound() &&
484  "expected input loops to have constant lower bound.");
485 
486  // Get lower bound of original loop as an affine expression.
487  AffineExpr origLowerBoundExpr;
488  origLowerBoundExpr =
489  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
490 
491  // Add dim operands from original lower/upper bound.
492  SmallVector<Value, 4> lbOperands, ubOperands;
493  AffineBound lb = origLoop.getLowerBound();
494  AffineBound ub = origLoop.getUpperBound();
495  lbOperands.reserve(lb.getNumOperands() + 2);
496  ubOperands.reserve(ub.getNumOperands() + 2);
497  AffineMap origLbMap = lb.getMap();
498  AffineMap origUbMap = ub.getMap();
499  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
500  lbOperands.push_back(lb.getOperand(j));
501  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
502  ubOperands.push_back(ub.getOperand(j));
503 
504  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
505  // IV.
506  lbOperands.push_back(newInterTileLoop.getInductionVar());
507  ubOperands.push_back(newInterTileLoop.getInductionVar());
508 
509  // Get loop IV as an affine expression for lower/upper bound. Size of
510  // lb/ubOperands is guaranteed to be at least one.
511  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
512  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
513 
514  // Add symbol operands from original lower/upper bound.
515  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
516  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
517  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
518  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
519 
520  // Add a new symbol operand which is the tile size for this loop.
521  lbOperands.push_back(tileSize);
522  ubOperands.push_back(tileSize);
523 
524  SmallVector<AffineExpr, 4> lbBoundExprs;
525  SmallVector<AffineExpr, 4> ubBoundExprs;
526  lbBoundExprs.reserve(origLbMap.getNumResults());
527  ubBoundExprs.reserve(origUbMap.getNumResults());
528 
529  // Get tiling parameter as an affine expression for lb/ub.
530  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
531  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
532 
533  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
534  lbBoundExprs.push_back(
535  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
536  origLowerBoundExpr);
537 
538  // Get the origLoopStep as an affine expression.
539  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
540 
541  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
542  // (tilingParameter * origLoopStep) + origlb.
543  ubBoundExprs.push_back(
544  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
545  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
546 
547  ubBoundExprs.append(origUbMap.getResults().begin(),
548  origUbMap.getResults().end());
549 
550  AffineMap lbMap =
551  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
552  lbBoundExprs, b.getContext());
553  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
554 
555  AffineMap ubMap =
556  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
557  ubBoundExprs, b.getContext());
558  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
559 
560  // Original loop step must be preserved.
561  newIntraTileLoop.setStep(origLoop.getStepAsInt());
562 }
563 
564 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
565 // TODO: Handle non-constant lower bounds.
566 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
567  AffineForOp newLoop, Value tileSize) {
568  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
569 
570  // The lower bounds for inter-tile loops are the same as the corresponding
571  // lower bounds of the original loops.
572  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
573 
574  // The new upper bound map for inter-tile loops, assuming constant lower
575  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
576  // originalLowerBound), tilingParameter), where the tiling parameter is the
577  // respective tile size for that loop. E.g., if the original ubMap was
578  // () -> (1024), the new map will be
579  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
580  // Therefore a new symbol operand is inserted in the map and the result
581  // expression is overwritten.
582 
583  assert(origLoop.hasConstantLowerBound() &&
584  "expected input loops to have constant lower bound.");
585 
586  // Get lower bound of original loop as an affine expression.
587  AffineExpr origLowerBoundExpr;
588  origLowerBoundExpr =
589  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
590 
591  // Add dim operands from original upper bound.
592  SmallVector<Value, 4> ubOperands;
593  AffineBound ub = origLoop.getUpperBound();
594  ubOperands.reserve(ub.getNumOperands() + 1);
595  AffineMap origUbMap = ub.getMap();
596  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
597  ubOperands.push_back(ub.getOperand(j));
598 
599  // Add symbol operands from original upper bound.
600  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
601  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
602 
603  // Add a new symbol operand which is the tile size for this loop.
604  ubOperands.push_back(tileSize);
605 
606  // Get tiling parameter as an affine expression.
607  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
608 
609  SmallVector<AffineExpr, 4> boundExprs;
610  boundExprs.reserve(origUbMap.getNumResults());
611  int64_t origUpperBound;
612  AffineExpr origUpperBoundExpr;
613 
614  // If upper bound for the original loop is constant, then the constant can
615  // be obtained as an affine expression straight away.
616  if (origLoop.hasConstantUpperBound()) {
617  origUpperBound = origLoop.getConstantUpperBound();
618 
619  // Get original constant upper bound as an affine expression.
620  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
621 
622  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
623  // originalLowerBound), tilingParameter).
624  boundExprs.push_back(
625  origLowerBoundExpr +
626  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
627  } else {
628  // If the upper bound of the original loop is not constant, two cases are
629  // possible, although their handling is the same. 1.) The ubMap has only one
630  // result expression, e.g.
631  // affine.for %i = 5 to %ub
632  //
633  // A symbol operand is added to represent the tiling parameter. The new
634  // loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5),
635  // where 's0' is the original upper bound and 's1' is the tiling
636  // parameter. 2.) The ubMap has more than one result expression, e.g.
637  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
638  // affine.for %i = 5 to min #map0()[%s0, %s1]
639  //
640  // A symbol operand is added to represent the tiling parameter. The new
641  // loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
642  // (s1 - 5) ceildiv s2 + 5), where 's2' is the tiling parameter.
643 
644  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
645  // originalLowerBound), tilingParameter).
646  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
647  boundExprs.push_back(
648  origLowerBoundExpr +
649  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
650  }
651 
652  AffineMap ubMap =
653  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
654  boundExprs, b.getContext());
655  newLoop.setUpperBound(ubOperands, ubMap);
656 
657  // Original loop step must be preserved.
658  newLoop.setStep(origLoop.getStepAsInt());
659 }
660 
661 /// Constructs and sets new loop bounds after tiling for the case of
662 /// hyper-rectangular index sets, where the bounds of one dimension do not
663 /// depend on other dimensions and tiling parameters are captured from SSA
664 /// values. Bounds of each dimension can thus be treated independently,
665 /// and deriving the new bounds is much simpler and faster than for the case of
666 /// tiling arbitrary polyhedral shapes.
667 static void constructParametricallyTiledIndexSetHyperRect(
668  MutableArrayRef<AffineForOp> origLoops,
669  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
670  assert(!origLoops.empty() && "expected at least one loop in band");
671  assert(origLoops.size() == tileSizes.size() &&
672  "expected tiling parameter for each loop in band.");
673 
674  OpBuilder b(origLoops[0].getOperation());
675  unsigned width = origLoops.size();
676 
677  // Set bounds for tile space loops.
678  for (unsigned i = 0; i < width; ++i) {
679  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
680  }
681 
682  // Set bounds for intra-tile loops.
683  for (unsigned i = 0; i < width; ++i) {
684  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
685  newLoops[i + width], tileSizes[i]);
686  }
687 }
688 
689 /// Constructs and sets new loop bounds after tiling for the case of
690 /// hyper-rectangular index sets, where the bounds of one dimension do not
691 /// depend on other dimensions. Bounds of each dimension can thus be treated
692 /// independently, and deriving the new bounds is much simpler and faster
693 /// than for the case of tiling arbitrary polyhedral shapes.
694 static void
695 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
696  MutableArrayRef<AffineForOp> newLoops,
697  ArrayRef<unsigned> tileSizes) {
698  assert(!origLoops.empty());
699  assert(origLoops.size() == tileSizes.size());
700 
701  OpBuilder b(origLoops[0].getOperation());
702  unsigned width = origLoops.size();
703 
704  // Bounds for tile space loops.
705  for (unsigned i = 0; i < width; i++) {
706  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
707  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
708  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
709  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
710  // If the step size of the original loop is x and the tile size is y, then
711  // after tiling, the tile space loop's step size becomes x*y.
712  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
713  }
714  // Bounds for intra-tile loops.
715  for (unsigned i = 0; i < width; i++) {
716  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
717  std::optional<uint64_t> mayBeConstantCount =
718  getConstantTripCount(origLoops[i]);
719  // The lower bound is just the tile-space loop.
720  AffineMap lbMap = b.getDimIdentityMap();
721  newLoops[width + i].setLowerBound(
722  /*operands=*/newLoops[i].getInductionVar(), lbMap);
723  // The step size of intra-tile loops is just the original loop's step size.
724  newLoops[width + i].setStep(origLoops[i].getStepAsInt());
725 
726  // Set the upper bound.
727  if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
728  // Trip count is less than the tile size: upper bound is lower bound +
729  // trip count * stepSize.
730  AffineMap ubMap = b.getSingleDimShiftAffineMap(
731  *mayBeConstantCount * origLoops[i].getStepAsInt());
732  newLoops[width + i].setUpperBound(
733  /*operands=*/newLoops[i].getInductionVar(), ubMap);
734  } else if (largestDiv % tileSizes[i] != 0) {
735  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
736  // Construct the upper bound map; the operands are the original operands
737  // with 'i' (tile-space loop) appended to it. The new upper bound map is
738  // the original one with an additional expression i + tileSize * stepSize
739  // appended.
740 
741  // Add dim operands from original upper bound.
742  SmallVector<Value, 4> ubOperands;
743  AffineBound ub = origLoops[i].getUpperBound();
744  ubOperands.reserve(ub.getNumOperands() + 1);
745  AffineMap origUbMap = ub.getMap();
746  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
747  ubOperands.push_back(ub.getOperand(j));
748 
749  // Add dim operand for new loop upper bound.
750  ubOperands.push_back(newLoops[i].getInductionVar());
751 
752  // Add symbol operands from original upper bound.
753  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
754  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
755 
756  SmallVector<AffineExpr, 4> boundExprs;
757  boundExprs.reserve(1 + origUbMap.getNumResults());
758  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
759  // The new upper bound map is the original one with an additional
760  // expression i + tileSize * stepSize (of original loop) appended.
761  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
762  boundExprs.append(origUbMap.getResults().begin(),
763  origUbMap.getResults().end());
764  AffineMap ubMap =
765  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
766  boundExprs, b.getContext());
767  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
768  } else {
769  // No need for the min expression.
770  AffineExpr dim = b.getAffineDimExpr(0);
771  AffineMap ubMap = AffineMap::get(
772  1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
773  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
774  }
775  }
776 }
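// Example (an illustrative sketch of the bounds set above): tiling
// `affine.for %i = 0 to 1024 step 1` with tileSize = 32 (which divides the
// trip count, so no min is needed) produces
//   affine.for %ii = 0 to 1024 step 32 {        // tile-space loop, step 1*32
//     affine.for %i = #lb(%ii) to #ub(%ii) {    // intra-tile loop, step 1
//       ...
//     }
//   }
// with #lb = affine_map<(d0) -> (d0)> and #ub = affine_map<(d0) -> (d0 + 32)>.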
777 
778 LogicalResult
779 mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
780  ArrayRef<unsigned> tileSizes,
781  SmallVectorImpl<AffineForOp> *tiledNest) {
782  if (input.empty())
783  return success();
784 
785  if (failed(performPreTilingChecks(input, tileSizes)))
786  return failure();
787 
788  MutableArrayRef<AffineForOp> origLoops = input;
789  AffineForOp rootAffineForOp = origLoops[0];
790 
791  // Note that width is at least one since the band isn't empty.
792  unsigned width = input.size();
793  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
794 
795  // Construct a tiled loop nest without setting their bounds. Bounds are
796  // set later.
797  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
798 
799  SmallVector<Value, 8> origLoopIVs;
800  extractForInductionVars(input, &origLoopIVs);
801 
802  // Set loop bounds for the tiled loop nest.
803  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
804 
805  // Replace original IVs with intra-tile loop IVs.
806  for (unsigned i = 0; i < width; i++)
807  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
808 
809  // Erase the old loop nest.
810  rootAffineForOp.erase();
811 
812  if (tiledNest)
813  *tiledNest = std::move(tiledLoops);
814 
815  return success();
816 }
817 
818 /// Tiles the specified band of perfectly nested loops creating tile-space
819 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
820 /// is a contiguous set of loops.
821 // TODO: handle non hyper-rectangular spaces.
822 LogicalResult mlir::affine::tilePerfectlyNestedParametric(
823  MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
824  SmallVectorImpl<AffineForOp> *tiledNest) {
825  if (input.empty())
826  return success();
827 
828  if (failed(performPreTilingChecks(input, tileSizes)))
829  return failure();
830 
831  MutableArrayRef<AffineForOp> origLoops = input;
832  AffineForOp rootAffineForOp = origLoops[0];
833  unsigned width = input.size();
834  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
835 
836  // Construct a tiled loop nest without setting their bounds. Bounds are
837  // set later.
838  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
839 
840  SmallVector<Value, 8> origLoopIVs;
841  extractForInductionVars(input, &origLoopIVs);
842 
843  // Set loop bounds for the tiled loop nest.
844  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
845  tileSizes);
846 
847  // Replace original IVs with intra-tile loop IVs.
848  for (unsigned i = 0; i < width; i++)
849  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
850 
851  // Erase the old loop nest.
852  rootAffineForOp.erase();
853 
854  if (tiledNest)
855  *tiledNest = std::move(tiledLoops);
856 
857  return success();
858 }
859 
860 /// Get the perfectly nested sequence of loops starting at the root of the loop
861 /// nest (inclusive).
862 /// A loop is perfectly nested iff the first op in the loop's body is another
863 /// AffineForOp and the second op is the terminator.
864 void mlir::affine::getPerfectlyNestedLoops(
865  SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
866  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
867  nestedLoops.push_back(root);
868  Block &body = root.getRegion().front();
869  if (body.begin() != std::prev(body.end(), 2))
870  return;
871 
872  root = dyn_cast<AffineForOp>(&body.front());
873  if (!root)
874  return;
875  }
876 }
877 
878 /// Identify valid and profitable bands of loops to tile. This is currently just
879 /// a temporary placeholder to test the mechanics of tiled code generation.
880 /// Returns all maximal outermost perfect loop nests to tile.
881 void mlir::affine::getTileableBands(
882  func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
883  // Get maximal perfect nest of 'affine.for' insts starting from root
884  // (inclusive).
885  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
886  SmallVector<AffineForOp, 6> band;
887  getPerfectlyNestedLoops(band, forOp);
888  bands->push_back(band);
889  }
890 }
891 
892 /// Unrolls this loop completely.
893 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
894  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
895  if (mayBeConstantTripCount.has_value()) {
896  uint64_t tripCount = *mayBeConstantTripCount;
897  if (tripCount == 0)
898  return success();
899  if (tripCount == 1)
900  return promoteIfSingleIteration(forOp);
901  return loopUnrollByFactor(forOp, tripCount);
902  }
903  return failure();
904 }
905 
906 /// Unrolls this loop by the specified factor or by the trip count (if
907 /// constant), whichever is lower.
908 LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
909  uint64_t unrollFactor) {
910  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
911  if (mayBeConstantTripCount.has_value() &&
912  *mayBeConstantTripCount < unrollFactor)
913  return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
914  return loopUnrollByFactor(forOp, unrollFactor);
915 }
916 
917 /// Generates 'unrollFactor' unrolled copies of the body of 'loopBodyBlock',
918 /// with the associated induction variable 'forOpIV', calling 'ivRemapFn' to
919 /// remap 'forOpIV' for each unrolled body. If specified, annotates the Ops in
920 /// each unrolled iteration using 'annotateFn'.
921 static void generateUnrolledLoop(
922  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
923  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
924  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
925  ValueRange iterArgs, ValueRange yieldedValues) {
926  // Builder to insert unrolled bodies just before the terminator of the body of
927  // 'forOp'.
928  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
929 
930  if (!annotateFn)
931  annotateFn = [](unsigned, Operation *, OpBuilder) {};
932 
933  // Keep a pointer to the last non-terminator operation in the original block
934  // so that we know what to clone (since we are doing this in-place).
935  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
936 
937  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
938  SmallVector<Value, 4> lastYielded(yieldedValues);
939 
940  for (unsigned i = 1; i < unrollFactor; i++) {
941  IRMapping operandMap;
942 
943  // Prepare operand map.
944  operandMap.map(iterArgs, lastYielded);
945 
946  // If the induction variable is used, create a remapping to the value for
947  // this unrolled instance.
948  if (!forOpIV.use_empty()) {
949  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
950  operandMap.map(forOpIV, ivUnroll);
951  }
952 
953  // Clone the original body of 'forOp'.
954  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
955  Operation *clonedOp = builder.clone(*it, operandMap);
956  annotateFn(i, clonedOp, builder);
957  }
958 
959  // Update yielded values. If the yielded value is defined outside the
960  // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
961  // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
962  // value with the clone corresponding to the yielded value.
963  for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
964  Operation *defOp = yieldedValues[i].getDefiningOp();
965  if (defOp && defOp->getBlock() == loopBodyBlock)
966  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
967  }
968  }
969 
970  // Make sure we annotate the Ops in the original body. We do this last so that
971  // any annotations are not copied into the cloned Ops above.
972  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
973  annotateFn(0, &*it, builder);
974 
975  // Update operands of the yield statement.
976  loopBodyBlock->getTerminator()->setOperands(lastYielded);
977 }
978 
979 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
980 /// count is not a multiple of `unrollFactor`.
981 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
982  uint64_t unrollFactor) {
983  // Insert the cleanup loop right after 'forOp'.
984  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
985  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
986 
987  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
988  // and produce results for the original users of `forOp` results.
989  auto results = forOp.getResults();
990  auto cleanupResults = cleanupForOp.getResults();
991  auto cleanupIterOperands = cleanupForOp.getInits();
992 
993  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
994  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
995  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
996  }
997 
998  AffineMap cleanupMap;
999  SmallVector<Value, 4> cleanupOperands;
1000  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
1001  if (!cleanupMap)
1002  return failure();
1003 
1004  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
1005  // Promote the loop body up if this has turned into a single iteration loop.
1006  (void)promoteIfSingleIteration(cleanupForOp);
1007 
1008  // Adjust upper bound of the original loop; this is the same as the lower
1009  // bound of the cleanup loop.
1010  forOp.setUpperBound(cleanupOperands, cleanupMap);
1011  return success();
1012 }
1013 
1014 /// Unrolls this loop by the specified factor. Returns success if the loop
1015 /// is successfully unrolled.
1016 LogicalResult mlir::affine::loopUnrollByFactor(
1017  AffineForOp forOp, uint64_t unrollFactor,
1018  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1019  bool cleanUpUnroll) {
1020  assert(unrollFactor > 0 && "unroll factor should be positive");
1021 
1022  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1023  if (unrollFactor == 1) {
1024  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1025  failed(promoteIfSingleIteration(forOp)))
1026  return failure();
1027  return success();
1028  }
1029 
1030  // Nothing in the loop body other than the terminator.
1031  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1032  return success();
1033 
1034  // If the trip count is lower than the unroll factor, no unrolled body.
1035  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1036  if (cleanUpUnroll) {
1037  // Unroll the cleanup loop if cleanUpUnroll is specified.
1038  return loopUnrollFull(forOp);
1039  }
1040 
1041  return failure();
1042  }
1043 
1044  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1045  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1046  // Loops where the lower bound is a max expression or the upper bound is
1047  // a min expression, and the trip count isn't a multiple of the unroll factor,
1048  // can't be unrolled since the lower bound of the cleanup loop in such cases
1049  // cannot be expressed as an affine function or a max over affine functions.
1050  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1051  forOp.getUpperBoundMap().getNumResults() != 1)
1052  return failure();
1053  if (cleanUpUnroll)
1054  // Force unroll including cleanup loop
1055  return loopUnrollFull(forOp);
1056  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1057  assert(false && "cleanup loop lower bound map for single result lower "
1058  "and upper bound maps can always be determined");
1059  }
1060 
1061  ValueRange iterArgs(forOp.getRegionIterArgs());
1062  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1063 
1064  // Scale the step of loop being unrolled by unroll factor.
1065  int64_t step = forOp.getStepAsInt();
1066  forOp.setStep(step * unrollFactor);
1067  generateUnrolledLoop(
1068  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1069  [&](unsigned i, Value iv, OpBuilder b) {
1070  // iv' = iv + i * step
1071  auto d0 = b.getAffineDimExpr(0);
1072  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1073  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1074  },
1075  /*annotateFn=*/annotateFn,
1076  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1077 
1078  // Promote the loop body up if this has turned into a single iteration loop.
1079  (void)promoteIfSingleIteration(forOp);
1080  return success();
1081 }
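// Example (an illustrative sketch using a hypothetical "test.foo" op):
// unrolling `affine.for %i = 0 to 16 { "test.foo"(%i) : (index) -> () }` by a
// factor of 4 scales the step to 4 and appends three remapped copies:
//   affine.for %i = 0 to 16 step 4 {
//     "test.foo"(%i) : (index) -> ()
//     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//     "test.foo"(%i1) : (index) -> ()
//     %i2 = affine.apply affine_map<(d0) -> (d0 + 2)>(%i)
//     "test.foo"(%i2) : (index) -> ()
//     %i3 = affine.apply affine_map<(d0) -> (d0 + 3)>(%i)
//     "test.foo"(%i3) : (index) -> ()
//   }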
1082 
1083 LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1084  uint64_t unrollJamFactor) {
1085  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1086  if (mayBeConstantTripCount.has_value() &&
1087  *mayBeConstantTripCount < unrollJamFactor)
1088  return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1089  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1090 }
1091 
1092 /// Check if all control operands of all loops are defined outside of `forOp`
1093 /// and return false if not.
1094 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1095  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1096  for (auto controlOperand : aForOp.getControlOperands()) {
1097  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1098  return WalkResult::interrupt();
1099  }
1100  return WalkResult::advance();
1101  });
1102  return !walkResult.wasInterrupted();
1103 }
1104 
1105 // Gathers all maximal sub-blocks of operations that do not themselves
1106 // include a for op (an operation could have a descendant for op though
1107 // in its tree). Ignore the block terminators.
1108 struct JamBlockGatherer {
1109  // Store iterators to the first and last op of each sub-block found.
1110  std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
1111 
1112  // This is a linear time walk.
1113  void walk(Operation *op) {
1114  for (auto &region : op->getRegions())
1115  for (auto &block : region)
1116  walk(block);
1117  }
1118 
1119  void walk(Block &block) {
1120  for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
1121  auto subBlockStart = it;
1122  while (it != e && !isa<AffineForOp>(&*it))
1123  ++it;
1124  if (it != subBlockStart)
1125  subBlocks.emplace_back(subBlockStart, std::prev(it));
1126  // Process all for ops that appear next.
1127  while (it != e && isa<AffineForOp>(&*it))
1128  walk(&*it++);
1129  }
1130  }
1131 };
1132 
1133 /// Unrolls and jams this loop by the specified factor.
1134 LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1135  uint64_t unrollJamFactor) {
1136  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1137 
1138  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1139  if (unrollJamFactor == 1) {
1140  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1141  failed(promoteIfSingleIteration(forOp)))
1142  return failure();
1143  return success();
1144  }
1145 
1146  // Nothing in the loop body other than the terminator.
1147  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1148  return success();
1149 
1150  // If the trip count is lower than the unroll jam factor, no unroll jam.
1151  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1152  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1153  return failure();
1154  }
1155 
1156  // If any control operand of any inner loop of `forOp` is defined within
1157  // `forOp`, no unroll jam.
1158  if (!areInnerBoundsInvariant(forOp))
1159  return failure();
1160 
1161  // Gather all sub-blocks to jam upon the loop being unrolled.
1162  JamBlockGatherer jbg;
1163  jbg.walk(forOp);
1164  auto &subBlocks = jbg.subBlocks;
1165 
1166  // Collect loops with iter_args.
1167  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1168  forOp.walk([&](AffineForOp aForOp) {
1169  if (aForOp.getNumIterOperands() > 0)
1170  loopsWithIterArgs.push_back(aForOp);
1171  });
1172 
1173  // Get supported reductions to be used for creating reduction ops at the end.
1174  SmallVector<LoopReduction> reductions;
1175  if (forOp.getNumIterOperands() > 0)
1176  getSupportedReductions(forOp, reductions);
1177 
1178  // Generate the cleanup loop if trip count isn't a multiple of
1179  // unrollJamFactor.
1180  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1181  // Loops where the lower bound is a max expression or the upper bound is
1182  // a min expression, and the trip count isn't a multiple of the unroll factor,
1183  // can't be unrolled since the lower bound of the cleanup loop in such cases
1184  // cannot be expressed as an affine function or a max over affine functions.
1185  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1186  forOp.getUpperBoundMap().getNumResults() != 1)
1187  return failure();
1188  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1189  assert(false && "cleanup loop lower bound map for single result lower "
1190  "and upper bound maps can always be determined");
1191  }
1192 
1193  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1194  // iteration. There are (`unrollJamFactor` - 1) iterations.
1195  SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1196 
1197  // For any loop with iter_args, replace it with a new loop that has
1198  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1199  // operands.
1200  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1201  IRRewriter rewriter(forOp.getContext());
1202  for (AffineForOp oldForOp : loopsWithIterArgs) {
1203  SmallVector<Value> dupIterOperands, dupYieldOperands;
1204  ValueRange oldIterOperands = oldForOp.getInits();
1205  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1206  ValueRange oldYieldOperands =
1207  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1208  // Get additional iterOperands, iterArgs, and yield operands. We will
1209  // fix iterOperands and yield operands after cloning of sub-blocks.
1210  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1211  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1212  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1213  }
1214  // Create a new loop with additional iterOperands, iter_args and yield
1215  // operands. This new loop will take the loop body of the original loop.
1216  bool forOpReplaced = oldForOp == forOp;
1217  AffineForOp newForOp =
1218  cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1219  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1220  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1221  return dupYieldOperands;
1222  }));
1223  newLoopsWithIterArgs.push_back(newForOp);
1224  // `forOp` has been replaced with a new loop.
1225  if (forOpReplaced)
1226  forOp = newForOp;
1227  // Update `operandMaps` for `newForOp` iterArgs and results.
1228  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1229  unsigned oldNumIterArgs = oldIterArgs.size();
1230  ValueRange newResults = newForOp.getResults();
1231  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1232  assert(oldNumIterArgs == oldNumResults &&
1233  "oldNumIterArgs must be the same as oldNumResults");
1234  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1235  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1236  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1237  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1238  // to those in the `i`th new set.
1239  operandMaps[i - 1].map(newIterArgs[j],
1240  newIterArgs[i * oldNumIterArgs + j]);
1241  operandMaps[i - 1].map(newResults[j],
1242  newResults[i * oldNumResults + j]);
1243  }
1244  }
1245  }
1246 
1247  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1248  int64_t step = forOp.getStepAsInt();
1249  forOp.setStep(step * unrollJamFactor);
1250 
1251  auto forOpIV = forOp.getInductionVar();
1252  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1253  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1254  for (auto &subBlock : subBlocks) {
1255  // Builder to insert unroll-jammed bodies. Insert right at the end of
1256  // sub-block.
1257  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1258 
1259  // If the induction variable is used, create a remapping to the value for
1260  // this unrolled instance.
1261  if (!forOpIV.use_empty()) {
1262  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1263  auto d0 = builder.getAffineDimExpr(0);
1264  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1265  auto ivUnroll =
1266  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1267  operandMaps[i - 1].map(forOpIV, ivUnroll);
1268  }
1269  // Clone the sub-block being unroll-jammed.
1270  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1271  builder.clone(*it, operandMaps[i - 1]);
1272  }
1273  // Fix iterOperands and yield op operands of newly created loops.
1274  for (auto newForOp : newLoopsWithIterArgs) {
1275  unsigned oldNumIterOperands =
1276  newForOp.getNumIterOperands() / unrollJamFactor;
1277  unsigned numControlOperands = newForOp.getNumControlOperands();
1278  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1279  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1280  assert(oldNumIterOperands == oldNumYieldOperands &&
1281  "oldNumIterOperands must be the same as oldNumYieldOperands");
1282  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1283  // The `i`th duplication of an old iterOperand or yield op operand
1284  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1285  // if such mapped value exists.
1286  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1287  operandMaps[i - 1].lookupOrDefault(
1288  newForOp.getOperand(numControlOperands + j)));
1289  yieldOp.setOperand(
1290  i * oldNumYieldOperands + j,
1291  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1292  }
1293  }
1294  }
1295  if (forOp.getNumResults() > 0) {
1296  // Create reduction ops to combine every `unrollJamFactor` related results
1297  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1298  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1299  // %1.
1300  rewriter.setInsertionPointAfter(forOp);
1301  auto loc = forOp.getLoc();
1302  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1303  for (LoopReduction &reduction : reductions) {
1304  unsigned pos = reduction.iterArgPosition;
1305  Value lhs = forOp.getResult(pos);
1306  Value rhs;
1307  SmallPtrSet<Operation *, 4> newOps;
1308  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1309  rhs = forOp.getResult(i * oldNumResults + pos);
1310  // Create ops based on reduction type.
1311  lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1312  if (!lhs)
1313  return failure();
1314  Operation *op = lhs.getDefiningOp();
1315  assert(op && "Reduction op should have been created");
1316  newOps.insert(op);
1317  }
1318  // Replace all uses except those in newly created reduction ops.
1319  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1320  }
1321  }
1322 
1323  // Promote the loop body up if this has turned into a single iteration loop.
1324  (void)promoteIfSingleIteration(forOp);
1325  return success();
1326 }
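// Example (an illustrative sketch using a hypothetical "test.use" op):
// unroll-and-jam by 2 of
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 8 { "test.use"(%i, %j) : (index, index) -> () }
//   }
// scales the outer step to 2 and jams the second copy into the inner loop:
//   affine.for %i = 0 to 8 step 2 {
//     affine.for %j = 0 to 8 {
//       "test.use"(%i, %j) : (index, index) -> ()
//       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//       "test.use"(%i1, %j) : (index, index) -> ()
//     }
//   }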
1327 
1328 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1329 /// nested within 'forOpA' as the only non-terminator operation in its block.
1330 void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1331  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1332  auto &forOpABody = forOpA.getBody()->getOperations();
1333  auto &forOpBBody = forOpB.getBody()->getOperations();
1334 
1335  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1336  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1337  // body containing only the terminator.
1338  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1339  forOpABody, forOpABody.begin(),
1340  std::prev(forOpABody.end()));
1341  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1342  // body (this leaves forOpB's body containing only the terminator).
1343  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1344  std::prev(forOpBBody.end()));
1345  // 3) Splice forOpA into the beginning of forOpB's body.
1346  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1347  Block::iterator(forOpA));
1348 }
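// Example (an illustrative sketch of the three splices above): interchanging
//   affine.for %i { affine.for %j { body(%i, %j) } }
// yields
//   affine.for %j { affine.for %i { body(%i, %j) } }
// with no IV remapping needed, since the induction variables stay attached to
// their own loops and the body simply ends up nested under them in the
// opposite order.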
1349 
1350 // Checks each dependence component against the permutation to see if the
1351 // desired loop interchange would violate dependences by making the
1352 // dependence component lexicographically negative.
1353 static bool checkLoopInterchangeDependences(
1354  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1355  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1356  // Invert permutation map.
1357  unsigned maxLoopDepth = loops.size();
1358  SmallVector<unsigned, 4> loopPermMapInv;
1359  loopPermMapInv.resize(maxLoopDepth);
1360  for (unsigned i = 0; i < maxLoopDepth; ++i)
1361  loopPermMapInv[loopPermMap[i]] = i;
1362 
1363  // Check each dependence component against the permutation to see if the
1364  // desired loop interchange permutation would make the dependence vectors
1365  // lexicographically negative.
1366  // Example 1: [-1, 1][0, 0]
1367  // Example 2: [0, 0][-1, 1]
1368  for (const auto &depComps : depCompsVec) {
1369  assert(depComps.size() >= maxLoopDepth);
1370  // Check if the first non-zero dependence component is positive.
1371  // This iterates through loops in the desired order.
1372  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1373  unsigned permIndex = loopPermMapInv[j];
1374  assert(depComps[permIndex].lb);
1375  int64_t depCompLb = *depComps[permIndex].lb;
1376  if (depCompLb > 0)
1377  break;
1378  if (depCompLb < 0)
1379  return false;
1380  }
1381  }
1382  return true;
1383 }
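// Worked example (hypothetical dependence, two-deep nest): a dependence with
// per-loop components (+1, -1) is legal in the original order since its first
// non-zero component, +1, is positive. The interchange loopPermMap = {1, 0}
// visits the -1 component first, which would make the dependence
// lexicographically negative, so the check returns false for it.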
1384 
1385 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1386 /// nested sequence of loops in 'loops' would violate dependences.
1387 bool mlir::affine::isValidLoopInterchangePermutation(
1388  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1389  // Gather dependence components for dependences between all ops in loop nest
1390  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1391  assert(loopPermMap.size() == loops.size());
1392  unsigned maxLoopDepth = loops.size();
1393  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1394  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1395  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1396 }
1397 
1398 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1399 /// in it from outermost to innermost.
1400 bool LLVM_ATTRIBUTE_UNUSED
1401 mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1402  assert(!loops.empty() && "no loops provided");
1403 
1404  // We already know that the block can't be empty.
1405  auto hasTwoElements = [](Block *block) {
1406  auto secondOpIt = std::next(block->begin());
1407  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1408  };
1409 
1410  auto enclosingLoop = loops.front();
1411  for (auto loop : loops.drop_front()) {
1412  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1413  // parentForOp's body should be just this loop and the terminator.
1414  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1415  return false;
1416  enclosingLoop = loop;
1417  }
1418  return true;
1419 }
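// Illustrative sketch (hypothetical IR): the nest {%i-loop, %j-loop} below is
// perfectly nested because the %j loop is the only op in the %i loop's body
// besides the terminator; any additional op in the %i loop's body would make
// this check return false.
//
//   affine.for %i = 0 to 8 {
//     affine.for %j = 0 to 8 {
//       ...
//     }
//   }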
1420 
1421 // input[i] should move from position i -> permMap[i]. Returns the position in
1422 // `input` that becomes the new outermost loop.
1423 unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1424  ArrayRef<unsigned> permMap) {
1425  assert(input.size() == permMap.size() && "invalid permutation map size");
1426  // Check whether the permutation spec is valid. This is a small vector - we'll
1427  // just sort and check if it's iota.
1428  SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1429  llvm::sort(checkPermMap);
1430  if (llvm::any_of(llvm::enumerate(checkPermMap),
1431  [](const auto &en) { return en.value() != en.index(); }))
1432  assert(false && "invalid permutation map");
1433 
1434  // Nothing to do.
1435  if (input.size() < 2)
1436  return 0;
1437 
1438  assert(isPerfectlyNested(input) && "input not perfectly nested");
1439 
1440  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1441  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1442  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1443  for (unsigned i = 0, e = input.size(); i < e; ++i)
1444  invPermMap.push_back({permMap[i], i});
1445  llvm::sort(invPermMap);
1446 
1447  // Move the innermost loop body to the loop that would be the innermost in the
1448  // permuted nest (only if the innermost loop is going to change).
1449  if (permMap.back() != input.size() - 1) {
1450  auto *destBody = input[invPermMap.back().second].getBody();
1451  auto *srcBody = input.back().getBody();
1452  destBody->getOperations().splice(destBody->begin(),
1453  srcBody->getOperations(), srcBody->begin(),
1454  std::prev(srcBody->end()));
1455  }
1456 
1457  // We'll move each loop in `input` in the reverse order so that its body is
1458  // empty when we are moving it; this incurs zero copies and no erasing.
1459  for (int i = input.size() - 1; i >= 0; --i) {
1460  // If this has to become the outermost loop after permutation, add it to the
1461  // parent block of the original root.
1462  if (permMap[i] == 0) {
1463  // If the root remains the same, nothing to do.
1464  if (i == 0)
1465  continue;
1466  // Make input[i] the new outermost loop moving it into parentBlock.
1467  auto *parentBlock = input[0]->getBlock();
1468  parentBlock->getOperations().splice(Block::iterator(input[0]),
1469  input[i]->getBlock()->getOperations(),
1470  Block::iterator(input[i]));
1471  continue;
1472  }
1473 
1474  // If the parent in the permuted order is the same as in the original,
1475  // nothing to do.
1476  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1477  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1478  continue;
1479 
1480  // Move input[i] to its surrounding loop in the transformed nest.
1481  auto *destBody = input[parentPosInInput].getBody();
1482  destBody->getOperations().splice(destBody->begin(),
1483  input[i]->getBlock()->getOperations(),
1484  Block::iterator(input[i]));
1485  }
1486 
1487  return invPermMap[0].second;
1488 }
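// Example of the permutation semantics (hypothetical three-deep nest): with
// input = {%i, %j, %k} listed outermost first and permMap = {2, 0, 1}, %i
// moves to position 2 (innermost), %j to position 0 and %k to position 1, so
// the permuted nest is %j / %k / %i and the function returns 1, the position
// of %j in `input`.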
1489 
1490 // Sinks all sequential loops to the innermost levels (while preserving
1491 // relative order among them) and moves all parallel loops to the
1492 // outermost levels (while again preserving relative order among them).
1493 AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1494  SmallVector<AffineForOp, 4> loops;
1495  getPerfectlyNestedLoops(loops, forOp);
1496  if (loops.size() < 2)
1497  return forOp;
1498 
1499  // Gather dependence components for dependences between all ops in loop nest
1500  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1501  unsigned maxLoopDepth = loops.size();
1502  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1503  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1504 
1505  // Mark loops as either parallel or sequential.
1506  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1507  for (auto &depComps : depCompsVec) {
1508  assert(depComps.size() >= maxLoopDepth);
1509  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1510  DependenceComponent &depComp = depComps[j];
1511  assert(depComp.lb.has_value() && depComp.ub.has_value());
1512  if (*depComp.lb != 0 || *depComp.ub != 0)
1513  isParallelLoop[j] = false;
1514  }
1515  }
1516 
1517  // Count the number of parallel loops.
1518  unsigned numParallelLoops = 0;
1519  for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
1520  if (isParallelLoop[i])
1521  ++numParallelLoops;
1522 
1523  // Compute permutation of loops that sinks sequential loops (and thus raises
1524  // parallel loops) while preserving relative order.
1525  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1526  unsigned nextSequentialLoop = numParallelLoops;
1527  unsigned nextParallelLoop = 0;
1528  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1529  if (isParallelLoop[i]) {
1530  loopPermMap[i] = nextParallelLoop++;
1531  } else {
1532  loopPermMap[i] = nextSequentialLoop++;
1533  }
1534  }
1535 
1536  // Check if permutation 'loopPermMap' would violate dependences.
1537  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1538  return forOp;
1539  // Perform loop interchange according to permutation 'loopPermMap'.
1540  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1541  return loops[loopNestRootIndex];
1542 }
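// Example of the permutation computed here (hypothetical classification): for
// a three-deep nest whose loops are classified {parallel, sequential,
// parallel}, numParallelLoops is 2 and loopPermMap becomes {0, 2, 1}: the
// sequential middle loop sinks to the innermost position while the two
// parallel loops keep their relative order. The permutation is applied only
// if checkLoopInterchangeDependences accepts it.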
1543 
1544 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1545 // lower (resp. upper) loop bound. When called for both the lower and upper
1546 // bounds, the resulting IR resembles:
1547 //
1548 // ```mlir
1549 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1550 // ...
1551 // }
1552 // ```
1553 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1554  SmallVector<Value, 4> *operands,
1555  int64_t offset = 0) {
1556  auto bounds = llvm::to_vector<4>(map->getResults());
1557  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1558  operands->insert(operands->begin() + map->getNumDims(), iv);
1559  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1560  b.getContext());
1561  canonicalizeMapAndOperands(map, operands);
1562 }
1563 
1564 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1565 // Stripmine-sink is a primitive building block for generalized tiling of
1566 // imperfectly nested loops.
1567 // This transformation is purely mechanical and does not check legality,
1568 // profitability or even structural correctness. It is the user's
1569 // responsibility to specify `targets` that are dominated by `forOp`.
1570 // Returns the new AffineForOps, one per `targets`, nested immediately under
1571 // each of the `targets`.
1572 static SmallVector<AffineForOp, 8>
1573 stripmineSink(AffineForOp forOp, uint64_t factor,
1574  ArrayRef<AffineForOp> targets) {
1575  auto originalStep = forOp.getStepAsInt();
1576  auto scaledStep = originalStep * factor;
1577  forOp.setStep(scaledStep);
1578 
1579  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1580 
1581  // Lower-bound map creation.
1582  auto lbMap = forOp.getLowerBoundMap();
1583  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1584  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1585 
1586  // Upper-bound map creation.
1587  auto ubMap = forOp.getUpperBoundMap();
1588  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1589  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1590  /*offset=*/scaledStep);
1591 
1592  auto iv = forOp.getInductionVar();
1593  SmallVector<AffineForOp, 8> innerLoops;
1594  for (auto t : targets) {
1595  // Insert newForOp before the terminator of `t`.
1596  auto b = OpBuilder::atBlockTerminator(t.getBody());
1597  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1598  ubOperands, ubMap, originalStep);
1599  auto begin = t.getBody()->begin();
1600  // Skip the terminator and `newForOp`, which is just before the terminator.
1601  auto nOps = t.getBody()->getOperations().size() - 2;
1602  newForOp.getBody()->getOperations().splice(
1603  newForOp.getBody()->getOperations().begin(),
1604  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1605  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1606  newForOp.getRegion());
1607  innerLoops.push_back(newForOp);
1608  }
1609 
1610  return innerLoops;
1611 }
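// Illustrative sketch (hypothetical IR, factor = 32, with the loop itself as
// the single target): the original step is scaled by the factor, and a new
// inner loop covering one strip of 32 iterations is created around the
// original body:
//
//   affine.for %i = 0 to 1024 {     affine.for %i = 0 to 1024 step 32 {
//     ... body uses %i ...      =>    affine.for %ii = max(0, %i)
//   }                                     to min(1024, %i + 32) {
//                                       ... body uses %ii ...
//                                     }
//                                   }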
1612 
1613 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1614 // Returns the new AffineForOps, nested immediately under `target`.
1615 template <typename SizeType>
1616 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1617  AffineForOp target) {
1618  // TODO: Use cheap structural assertions that targets are nested under
1619  // forOp and that targets are not nested under each other when DominanceInfo
1620  // exposes the capability. It seems overkill to construct a whole function
1621  // dominance tree at this point.
1622  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1623  assert(res.size() == 1 && "Expected 1 inner forOp");
1624  return res[0];
1625 }
1626 
1627 SmallVector<SmallVector<AffineForOp, 8>, 8>
1628 mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1629  ArrayRef<AffineForOp> targets) {
1630  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1631  SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
1632  for (auto it : llvm::zip(forOps, sizes)) {
1633  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1634  res.push_back(step);
1635  currentTargets = step;
1636  }
1637  return res;
1638 }
1639 
1640 SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1641  ArrayRef<uint64_t> sizes,
1642  AffineForOp target) {
1643  SmallVector<AffineForOp, 8> res;
1644  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1645  assert(loops.size() == 1);
1646  res.push_back(loops[0]);
1647  }
1648  return res;
1649 }
1650 
1651 LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1652  if (loops.size() < 2)
1653  return success();
1654 
1655  AffineForOp innermost = loops.back();
1656  AffineForOp outermost = loops.front();
1657  AffineBound ub = outermost.getUpperBound();
1658  AffineMap origUbMap = ub.getMap();
1659  Location loc = outermost.getLoc();
1660  OpBuilder builder(outermost);
1661  for (AffineForOp loop : loops) {
1662  // We only work on normalized loops.
1663  if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1664  loop.getConstantLowerBound() != 0)
1665  return failure();
1666  }
1667  SmallVector<Value, 4> upperBoundSymbols;
1668  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1669  ub.getOperands().end());
1670 
1671  // 1. Store the upper bound of the outermost loop in a variable.
1672  Value prev;
1673  if (!llvm::hasSingleElement(origUbMap.getResults()))
1674  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1675  else
1676  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1677  upperBoundSymbols.push_back(prev);
1678 
1679  // 2. Emit code computing the upper bound of the coalesced loop as the
1680  // product of the numbers of iterations of all loops.
1681  for (AffineForOp loop : loops.drop_front()) {
1682  ub = loop.getUpperBound();
1683  origUbMap = ub.getMap();
1684  ubOperands = ub.getOperands();
1685  Value upperBound;
1686  // If upper bound map has more than one result, take their minimum.
1687  if (!llvm::hasSingleElement(origUbMap.getResults()))
1688  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1689  else
1690  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1691  upperBoundSymbols.push_back(upperBound);
1692  SmallVector<Value, 4> operands;
1693  operands.push_back(prev);
1694  operands.push_back(upperBound);
1695  // Maintain running product of loop upper bounds.
1696  prev = builder.create<AffineApplyOp>(
1697  loc,
1698  AffineMap::get(/*dimCount=*/1,
1699  /*symbolCount=*/1,
1700  builder.getAffineDimExpr(0) *
1701  builder.getAffineSymbolExpr(0)),
1702  operands);
1703  }
1704  // Set upper bound of the coalesced loop.
1705  AffineMap newUbMap = AffineMap::get(
1706  /*dimCount=*/0,
1707  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1708  outermost.setUpperBound(prev, newUbMap);
1709 
1710  builder.setInsertionPointToStart(outermost.getBody());
1711 
1712  // 3. Remap induction variables. For each original loop, the value of its
1713  // induction variable is obtained by dividing the induction variable of the
1714  // linearized loop by the total number of iterations of the loops nested
1715  // inside it, and then taking the result modulo this loop's own number of
1716  // iterations (which strips off the contribution of the outer loops):
1717  // iv_i = floordiv(iv_linear, product-of-ranges-of-loops-nested-in-i) mod range_i
1718  // Compute these iteratively, starting from the innermost loop, by keeping a
1719  // "running quotient" of divisions by the ranges.
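  // For instance (hypothetical two-loop nest, normalized, with trip counts N
  // and M), the coalesced IV ranges over [0, N*M) and the original IVs are
  // recovered as iv_outer = iv_linear floordiv M and iv_inner = iv_linear mod M.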
1720  Value previous = outermost.getInductionVar();
1721  for (unsigned idx = loops.size(); idx > 0; --idx) {
1722  if (idx != loops.size()) {
1723  SmallVector<Value, 4> operands;
1724  operands.push_back(previous);
1725  operands.push_back(upperBoundSymbols[idx]);
1726  previous = builder.create<AffineApplyOp>(
1727  loc,
1728  AffineMap::get(
1729  /*dimCount=*/1, /*symbolCount=*/1,
1730  builder.getAffineDimExpr(0).floorDiv(
1731  builder.getAffineSymbolExpr(0))),
1732  operands);
1733  }
1734  // Modified value of the induction variables of the nested loops after
1735  // coalescing.
1736  Value inductionVariable;
1737  if (idx == 1) {
1738  inductionVariable = previous;
1739  } else {
1740  SmallVector<Value, 4> applyOperands;
1741  applyOperands.push_back(previous);
1742  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1743  inductionVariable = builder.create<AffineApplyOp>(
1744  loc,
1745  AffineMap::get(
1746  /*dimCount=*/1, /*symbolCount=*/1,
1747  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1748  applyOperands);
1749  }
1750  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1751  inductionVariable, loops.back().getRegion());
1752  }
1753 
1754  // 4. Move the operations from the innermost loop's body to just above the
1755  // second-outermost loop; delete the extra terminator and the second-outermost loop.
1756  AffineForOp secondOutermostLoop = loops[1];
1757  innermost.getBody()->back().erase();
1758  outermost.getBody()->getOperations().splice(
1759  Block::iterator(secondOutermostLoop.getOperation()),
1760  innermost.getBody()->getOperations());
1761  secondOutermostLoop.erase();
1762  return success();
1763 }
1764 
1765 void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1766  ArrayRef<Value> processorId,
1767  ArrayRef<Value> numProcessors) {
1768  assert(processorId.size() == numProcessors.size());
1769  if (processorId.empty())
1770  return;
1771 
1772  OpBuilder b(forOp);
1773  Location loc(forOp.getLoc());
1774  AffineExpr lhs, rhs;
1775  bindSymbols(forOp.getContext(), lhs, rhs);
1776  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1777  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1778 
1779  Value linearIndex = processorId.front();
1780  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1781  auto mulApplyOp = b.create<AffineApplyOp>(
1782  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1783  linearIndex = b.create<AffineApplyOp>(
1784  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1785  }
1786 
1787  auto mulApplyOp = b.create<AffineApplyOp>(
1788  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1789  Value lb = b.create<AffineApplyOp>(
1790  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1791  forOp.setLowerBound(lb);
1792 
1793  Value step = forOp.getStep();
1794  for (auto numProcs : numProcessors)
1795  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1796  forOp.setStep(step);
1797 }
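// Illustrative sketch (hypothetical 2-d processor grid): with processorId =
// {%bx, %tx} and numProcessors = {%nbx, %ntx}, the linear id computed above is
// %bx * %ntx + %tx, and a loop `for %i = %lb to %ub step %s` becomes
// `for %i = %lb + linearId * %s to %ub step %s * %nbx * %ntx`, i.e. a cyclic
// distribution of the iterations across the processors.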
1798 
1799 /// Given a memref region, determine the lowest depth at which transfers can be
1800 /// placed for it, and return the corresponding block, start and end positions
1801 /// in the block for placing incoming (read) and outgoing (write) copies
1802 /// respectively. The lowest depth depends on whether the region being accessed
1803 /// is hoistable with respect to one or more immediately surrounding loops.
1804 static void
1805 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1806  Block::iterator &begin, Block::iterator &end,
1807  Block **copyPlacementBlock,
1808  Block::iterator *copyInPlacementStart,
1809  Block::iterator *copyOutPlacementStart) {
1810  const auto *cst = region.getConstraints();
1811  SmallVector<Value, 4> symbols;
1812  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1813 
1814  SmallVector<AffineForOp, 4> enclosingFors;
1815  getAffineForIVs(*block.begin(), &enclosingFors);
1816  // Walk up loop parents till we find an IV on which this region is
1817  // symbolic/variant.
1818  auto it = enclosingFors.rbegin();
1819  for (auto e = enclosingFors.rend(); it != e; ++it) {
1820  // TODO: we also need to check, for region symbols that aren't loop IVs,
1821  // whether we are within their respective defs' dominance scope.
1822  if (llvm::is_contained(symbols, it->getInductionVar()))
1823  break;
1824  }
1825 
1826  if (it != enclosingFors.rbegin()) {
1827  auto lastInvariantIV = *std::prev(it);
1828  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1829  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1830  *copyPlacementBlock = lastInvariantIV->getBlock();
1831  } else {
1832  *copyInPlacementStart = begin;
1833  *copyOutPlacementStart = end;
1834  *copyPlacementBlock = &block;
1835  }
1836 }
1837 
1838 // Info comprising stride and number of elements transferred every stride.
1839 struct StrideInfo {
1840  int64_t stride;
1841  int64_t numEltPerStride;
1842 };
1843 
1844 /// Returns striding information for a copy/transfer of this region with
1845 /// potentially multiple striding levels from outermost to innermost. For an
1846 /// n-dimensional region, there can be at most n-1 levels of striding
1847 /// successively nested.
1848 // TODO: make this work with non-identity layout maps.
1849 static void getMultiLevelStrides(const MemRefRegion &region,
1850  ArrayRef<int64_t> bufferShape,
1851  SmallVectorImpl<StrideInfo> *strideInfos) {
1852  if (bufferShape.size() <= 1)
1853  return;
1854 
1855  int64_t numEltPerStride = 1;
1856  int64_t stride = 1;
1857  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1858  int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1859  stride *= dimSize;
1860  numEltPerStride *= bufferShape[d];
1861  // A stride is needed only if the region has a shorter extent than the
1862  // memref along the dimension *and* has an extent greater than one along the
1863  // next major dimension.
1864  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1865  strideInfos->push_back({stride, numEltPerStride});
1866  }
1867  }
1868 }
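// Worked example (hypothetical shapes): for a memref of shape 256x64 and a
// buffer (region) shape of 32x16, the innermost dimension d = 1 yields
// stride = 64 (the memref's minor extent) and numEltPerStride = 16; an entry
// {64, 16} is recorded because the region's extent 16 is smaller than 64 and
// its extent along d - 1 (i.e. 32) is greater than one.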
1869 
1870 /// Generates a point-wise copy from/to `memref` to/from `fastMemRef` and
1871 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1872 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1873 /// bound information for the copy loop nest. `fastBufOffsets` contains the
1874 /// expressions to be subtracted out from the respective copy loop iterators in
1875 /// order to index the fast buffer. If `isCopyOut` is true, generates a
1876 /// copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1877 /// copy nest is inserted.
1878 ///
1879 /// The copy-in nest is generated as follows as an example for a 2-d region:
1880 /// for x = ...
1881 /// for y = ...
1882 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1883 ///
1884 static AffineForOp
1885 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1886  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1887  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1888  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1889  OpBuilder b) {
1890  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1891  return lbMap.getNumInputs() == lbOperands.size();
1892  }));
1893  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1894  return ubMap.getNumInputs() == ubOperands.size();
1895  }));
1896 
1897  unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1898  assert(lbMaps.size() == rank && "wrong number of lb maps");
1899  assert(ubMaps.size() == rank && "wrong number of ub maps");
1900 
1901  SmallVector<Value, 4> memIndices;
1902  SmallVector<AffineExpr, 4> fastBufExprs;
1903  SmallVector<Value, 4> fastBufMapOperands;
1904  AffineForOp copyNestRoot;
1905  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1906  for (unsigned d = 0; d < rank; ++d) {
1907  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1908  ubOperands, ubMaps[d]);
1909  if (d == 0)
1910  copyNestRoot = forOp;
1911 
1912  b = OpBuilder::atBlockTerminator(forOp.getBody());
1913 
1914  auto fastBufOffsetMap =
1915  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1916  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1917 
1918  // Construct the subscript for the fast memref being copied into/from:
1919  // x - offset_x.
1920  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1921  b.getAffineDimExpr(2 * d));
1922  fastBufMapOperands.push_back(offset);
1923  fastBufMapOperands.push_back(forOp.getInductionVar());
1924  mayBeDeadApplys.push_back(offset);
1925 
1926  // Subscript for the slow memref being copied.
1927  memIndices.push_back(forOp.getInductionVar());
1928  }
1929 
1930  auto fastBufMap =
1931  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1932  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1933  fastBufMap = simplifyAffineMap(fastBufMap);
1934  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1935 
1936  // Drop any dead affine.applys.
1937  for (auto applyOp : mayBeDeadApplys)
1938  if (applyOp.use_empty())
1939  applyOp.erase();
1940 
1941  if (!isCopyOut) {
1942  // Copy in.
1943  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1944  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1945  fastBufMapOperands);
1946  return copyNestRoot;
1947  }
1948 
1949  // Copy out.
1950  auto load =
1951  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1952  b.create<AffineStoreOp>(loc, load, memref, memIndices);
1953  return copyNestRoot;
1954 }
1955 
1956 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1957 emitRemarkForBlock(Block &block) {
1958  return block.getParentOp()->emitRemark();
1959 }
1960 
1961 /// Creates a buffer in the faster memory space for the specified memref region;
1962 /// generates a copy from the lower memory space to this one, and replaces all
1963 /// loads/stores in the block range [`begin', `end') of `block' to load/store
1964 /// from that buffer. Returns failure if copies could not be generated due to
1965 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
1966 /// in copyPlacementBlock specify the insertion points where the incoming copies
1967 /// and outgoing copies, respectively, should be inserted (the insertion happens
1968 /// right before the insertion point). Since `begin` can itself be invalidated
1969 /// due to the memref rewriting done from this method, the output argument
1970 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
1971 /// happens). Since outgoing copies could have been inserted at `end`, the
1972 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
1973 /// size of the fast buffer allocated.
1974 static LogicalResult generateCopy(
1975  const MemRefRegion &region, Block *block, Block::iterator begin,
1976  Block::iterator end, Block *copyPlacementBlock,
1977  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1978  const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1979  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1980  Block::iterator *nBegin, Block::iterator *nEnd) {
1981  *nBegin = begin;
1982  *nEnd = end;
1983 
1984  func::FuncOp f = begin->getParentOfType<func::FuncOp>();
1985  OpBuilder topBuilder(f.getBody());
1986  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1987 
1988  *sizeInBytes = 0;
1989 
1990  if (begin == end)
1991  return success();
1992 
1993  // Is the copy out point at the end of the block where we are doing
1994  // explicit copying.
1995  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1996 
1997  // Copies for read regions are going to be inserted at 'begin'.
1998  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1999  // Copies for write regions are going to be inserted at 'end'.
2000  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
2001  OpBuilder &b = region.isWrite() ? epilogue : prologue;
2002 
2003  // Builder to create constants at the top level.
2004  auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
2005  OpBuilder top(func.getBody());
2006 
2007  auto loc = region.loc;
2008  auto memref = region.memref;
2009  auto memRefType = cast<MemRefType>(memref.getType());
2010 
2011  if (!memRefType.getLayout().isIdentity()) {
2012  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2013  return failure();
2014  }
2015 
2016  // Indices to use for the copying.
2017  // Indices for the original memref being copied from/to.
2018  SmallVector<Value, 4> memIndices;
2019  // Indices for the faster buffer being copied into/from.
2020  SmallVector<Value, 4> bufIndices;
2021 
2022  unsigned rank = memRefType.getRank();
2023  SmallVector<int64_t, 4> fastBufferShape;
2024 
2025  // Compute the extents of the buffer.
2026  std::vector<SmallVector<int64_t, 4>> lbs;
2027  SmallVector<int64_t, 8> lbDivisors;
2028  lbs.reserve(rank);
2029  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
2030  &fastBufferShape, &lbs, &lbDivisors);
2031  if (!numElements) {
2032  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2033  return failure();
2034  }
2035 
2036  if (*numElements == 0) {
2037  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2038  return success();
2039  }
2040 
2041  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2042  for (unsigned i = 0; i < rank; ++i)
2043  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2044 
2045  const FlatAffineValueConstraints *cst = region.getConstraints();
2046  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2047  // on; these typically include loop IVs surrounding the level at which the
2048  // copy generation is being done or other valid symbols in MLIR.
2049  SmallVector<Value, 8> regionSymbols;
2050  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2051 
2052  // Construct the index expressions for the fast memory buffer. The index
2053  // expression for a particular dimension of the fast buffer is obtained by
2054  // subtracting out the lower bound on the original memref's data region
2055  // along the corresponding dimension.
2056 
2057  // Index start offsets for faster memory buffer relative to the original.
2058  SmallVector<AffineExpr, 4> fastBufOffsets;
2059  fastBufOffsets.reserve(rank);
2060  for (unsigned d = 0; d < rank; d++) {
2061  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2062 
2063  AffineExpr offset = top.getAffineConstantExpr(0);
2064  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2065  offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2066  assert(lbDivisors[d] > 0);
2067  offset =
2068  (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2069 
2070  // Set copy start location for this dimension in the lower memory space
2071  // memref.
2072  if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
2073  auto indexVal = caf.getValue();
2074  if (indexVal == 0) {
2075  memIndices.push_back(zeroIndex);
2076  } else {
2077  memIndices.push_back(
2078  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2079  }
2080  } else {
2081  // The coordinate for the start location is just the lower bound along the
2082  // corresponding dimension on the memory region (stored in 'offset').
2083  auto map = AffineMap::get(
2084  cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2085  memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2086  }
2087  // The fast buffer is copied into at location zero; addressing is relative.
2088  bufIndices.push_back(zeroIndex);
2089 
2090  // Record the offsets since they are needed to remap the memory accesses of
2091  // the original memref further below.
2092  fastBufOffsets.push_back(offset);
2093  }
2094 
2095  // The faster memory space buffer.
2096  Value fastMemRef;
2097 
2098  // Check if a buffer was already created.
2099  bool existingBuf = fastBufferMap.count(memref) > 0;
2100  if (!existingBuf) {
2101  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2102  auto fastMemRefType =
2103  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2104  fastBufferLayout, copyOptions.fastMemorySpace);
2105 
2106  // Create the fast memory space buffer just before the 'affine.for'
2107  // operation.
2108  fastMemRef =
2109  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2110  // Record it.
2111  fastBufferMap[memref] = fastMemRef;
2112  // fastMemRefType is a constant shaped memref.
2113  auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2114  // We don't account for things of unknown size.
2115  *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0;
2116 
2117  LLVM_DEBUG(emitRemarkForBlock(*block)
2118  << "Creating fast buffer of type " << fastMemRefType
2119  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2120  << " KiB\n");
2121  } else {
2122  // Reuse the one already created.
2123  fastMemRef = fastBufferMap[memref];
2124  }
2125 
2126  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2127 
2128  Value dmaStride = nullptr;
2129  Value numEltPerDmaStride = nullptr;
2130  if (copyOptions.generateDma) {
2131  SmallVector<StrideInfo, 4> dmaStrideInfos;
2132  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2133 
2134  // TODO: use all stride levels once DmaStartOp is extended for
2135  // multi-level strides.
2136  if (dmaStrideInfos.size() > 1) {
2137  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2138  return failure();
2139  }
2140 
2141  if (!dmaStrideInfos.empty()) {
2142  dmaStride =
2143  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2144  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2145  loc, dmaStrideInfos[0].numEltPerStride);
2146  }
2147  }
2148 
2149  // Record the last operation where we want the memref replacement to end. We
2150  // later do the memref replacement only in [begin, postDomFilter] so
2151  // that the original memref's used in the data movement code themselves don't
2152  // get replaced.
2153  auto postDomFilter = std::prev(end);
2154 
2155  // Create fully composed affine maps for each memref.
2156  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2157  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2158  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2159  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2160 
2161  if (!copyOptions.generateDma) {
2162  // Point-wise copy generation.
2163  auto copyNest =
2164  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2165  /*lbOperands=*/regionSymbols, ubMaps,
2166  /*ubOperands=*/regionSymbols, fastBufOffsets,
2167  /*isCopyOut=*/region.isWrite(), b);
2168 
2169  // Record this so that we can skip it from yet another copy.
2170  copyNests.insert(copyNest);
2171 
2172  // Since new ops are being appended (for copy out's), adjust the end to
2173  // mark end of block range being processed if necessary.
2174  if (region.isWrite() && isCopyOutAtEndOfBlock)
2175  *nEnd = Block::iterator(copyNest.getOperation());
2176  } else {
2177  // DMA generation.
2178  // Create a tag (single element 1-d memref) for the DMA.
2179  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2180  copyOptions.tagMemorySpace);
2181  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2182 
2183  SmallVector<Value, 4> tagIndices({zeroIndex});
2184  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2185  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2186  if (!region.isWrite()) {
2187  // DMA non-blocking read from original buffer to fast buffer.
2188  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2189  fastMemRef, bufAffineMap, bufIndices,
2190  tagMemRef, tagAffineMap, tagIndices,
2191  numElementsSSA, dmaStride, numEltPerDmaStride);
2192  } else {
2193  // DMA non-blocking write from fast buffer to the original memref.
2194  auto op = b.create<AffineDmaStartOp>(
2195  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2196  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2197  dmaStride, numEltPerDmaStride);
2198  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2199  // end to mark end of block range being processed.
2200  if (isCopyOutAtEndOfBlock)
2201  *nEnd = Block::iterator(op.getOperation());
2202  }
2203 
2204  // Matching DMA wait to block on completion; tag always has a 0 index.
2205  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2206  numElementsSSA);
2207 
2208  // Generate dealloc for the tag.
2209  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2210  if (*nEnd == end && isCopyOutAtEndOfBlock)
2211  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2212  // mark end of range of the original.
2213  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2214  }
2215 
2216  // Generate dealloc for the buffer.
2217  if (!existingBuf) {
2218  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2219  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2220  // the fast buffer (since it marks the new end insertion point).
2221  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2222  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2223  }
2224 
2225  // Replace all uses of the old memref with the faster one while remapping
2226  // access indices (subtracting out lower bound offsets for each dimension).
2227  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2228  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2229  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2230  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2231  // d2, d3 correspond to the original indices (%i, %j).
2232  SmallVector<AffineExpr, 4> remapExprs;
2233  remapExprs.reserve(rank);
2234  for (unsigned i = 0; i < rank; i++) {
2235  // The starting operands of indexRemap will be regionSymbols (the symbols on
2236  // which the memref region is parametric); then those corresponding to
2237  // the memref's original indices follow.
2238  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2239  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2240  }
2241  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2242  b.getContext());
2243 
2244  // Record the begin since it may be invalidated by memref replacement.
2245  Block::iterator prevOfBegin;
2246  bool isBeginAtStartOfBlock = (begin == block->begin());
2247  if (!isBeginAtStartOfBlock)
2248  prevOfBegin = std::prev(begin);
2249 
2250  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2251  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2252  /*extraIndices=*/{}, indexRemap,
2253  /*extraOperands=*/regionSymbols,
2254  /*symbolOperands=*/{},
2255  /*domOpFilter=*/&*begin,
2256  /*postDomOpFilter=*/&*postDomFilter);
2257 
2258  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2259 
2260  return success();
2261 }
2262 
2263 /// Construct the memref region to just include the entire memref. Returns
2264 /// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
2265 /// number of enclosing loop IVs of `op` (starting from the outermost) that the
2266 /// region is parametric on.
2267 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2268  MemRefRegion *region) {
2269  unsigned rank;
2270  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2271  rank = loadOp.getMemRefType().getRank();
2272  region->memref = loadOp.getMemRef();
2273  region->setWrite(false);
2274  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2275  rank = storeOp.getMemRefType().getRank();
2276  region->memref = storeOp.getMemRef();
2277  region->setWrite(true);
2278  } else {
2279  assert(false && "expected load or store op");
2280  return false;
2281  }
2282  auto memRefType = cast<MemRefType>(region->memref.getType());
2283  if (!memRefType.hasStaticShape())
2284  return false;
2285 
2286  auto *regionCst = region->getConstraints();
2287 
2288  // Just get the first numSymbols IVs, which the memref region is parametric
2289  // on.
2290  SmallVector<AffineForOp, 4> ivs;
2291  getAffineForIVs(*op, &ivs);
2292  ivs.resize(numParamLoopIVs);
2293  SmallVector<Value, 4> symbols;
2294  extractForInductionVars(ivs, &symbols);
2295  *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2296  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2297 
2298  // Memref dim sizes provide the bounds.
2299  for (unsigned d = 0; d < rank; d++) {
2300  auto dimSize = memRefType.getDimSize(d);
2301  assert(dimSize > 0 && "filtered dynamic shapes above");
2302  regionCst->addBound(BoundType::LB, d, 0);
2303  regionCst->addBound(BoundType::UB, d, dimSize - 1);
2304  }
2305  return true;
2306 }
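// Illustrative sketch (hypothetical memref): for a memref<128x64xf32>, the
// constructed region constraints are 0 <= d0 <= 127 and 0 <= d1 <= 63, with
// one dimension variable per memref dimension, and the first `numParamLoopIVs`
// surrounding loop IVs recorded as the region's symbols.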
2307 
2308 LogicalResult
2309 mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2310  const AffineCopyOptions &copyOptions,
2311  std::optional<Value> filterMemRef,
2312  DenseSet<Operation *> &copyNests) {
2313  if (begin == end)
2314  return success();
2315 
2316  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2317  "Inconsistent block begin/end args");
2318  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2319 
2320  Block *block = begin->getBlock();
2321 
2322  // Copies will be generated for this depth, i.e., symbolic in all loops
2323  // surrounding this block range.
2324  unsigned copyDepth = getNestingDepth(&*begin);
2325 
2326  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2327  << "\n");
2328  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2329  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2330 
2331  // List of memory regions to copy for. We need a map vector to have a
2332  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2333  // since the allocs, for example, are identical except for the SSA id.
2334  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2335  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2336 
2337  // Map from original memref's to the fast buffers that their accesses are
2338  // replaced with.
2339  DenseMap<Value, Value> fastBufferMap;
2340 
2341  // To check for errors when walking the block.
2342  bool error = false;
2343 
2344  // Walk this range of operations to gather all memory regions.
2345  block->walk(begin, end, [&](Operation *opInst) {
2346  // Gather regions to allocate to buffers in faster memory space.
2347  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2348  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2349  (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2350  copyOptions.slowMemorySpace))
2351  return;
2352  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2353  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2354  storeOp.getMemRefType().getMemorySpaceAsInt() !=
2355  copyOptions.slowMemorySpace)
2356  return;
2357  } else {
2358  // Neither load nor a store op.
2359  return;
2360  }
2361 
2362  // Compute the MemRefRegion accessed.
2363  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2364  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2365  /*addMemRefDimBounds=*/false))) {
2366  LLVM_DEBUG(llvm::dbgs()
2367  << "Error obtaining memory region: semi-affine maps?\n");
2368  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2369  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2370  LLVM_DEBUG(
2371  opInst->emitError("non-constant memref sizes not yet supported"));
2372  error = true;
2373  return;
2374  }
2375  }
2376 
2377  // Each memref has a single buffer associated with it irrespective of how
2378  // many loads and stores happen on it.
2379  // TODO: in the future, when regions don't intersect and satisfy
2380  // other properties (based on load/store regions), we could consider
2381  // multiple buffers per memref.
2382 
2383  // Add to the appropriate region if it's not already in it, or take a
2384  // bounding box union with the existing one if it's already in there.
2385  // Note that a memref may have both read and write regions - so update the
2386  // region in the other list if one exists (write in case of read and vice
2387  // versa) since there is a single bounding box for a memref across all reads
2388  // and writes that happen on it.
2389 
2390  // Attempts to update; returns true if 'region' exists in targetRegions.
2391  auto updateRegion =
2392  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2393  &targetRegions) {
2394  const auto *const it = targetRegions.find(region->memref);
2395  if (it == targetRegions.end())
2396  return false;
2397 
2398  // Perform a union with the existing region.
2399  if (failed(it->second->unionBoundingBox(*region))) {
2400  LLVM_DEBUG(llvm::dbgs()
2401  << "Memory region bounding box failed; "
2402  "over-approximating to the entire memref\n");
2403  // If the union fails, we will overapproximate.
2404  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2405  LLVM_DEBUG(opInst->emitError(
2406  "non-constant memref sizes not yet supported"));
2407  error = true;
2408  return true;
2409  }
2410  it->second->getConstraints()->clearAndCopyFrom(
2411  *region->getConstraints());
2412  } else {
2413  // Union was computed and stored in 'it->second': copy to 'region'.
2414  region->getConstraints()->clearAndCopyFrom(
2415  *it->second->getConstraints());
2416  }
2417  return true;
2418  };
2419 
2420  bool existsInRead = updateRegion(readRegions);
2421  if (error)
2422  return;
2423  bool existsInWrite = updateRegion(writeRegions);
2424  if (error)
2425  return;
2426 
2427  // Finally add it to the region list.
2428  if (region->isWrite() && !existsInWrite) {
2429  writeRegions[region->memref] = std::move(region);
2430  } else if (!region->isWrite() && !existsInRead) {
2431  readRegions[region->memref] = std::move(region);
2432  }
2433  });
2434 
2435  if (error) {
2436  LLVM_DEBUG(begin->emitError(
2437  "copy generation failed for one or more memref's in this block\n"));
2438  return failure();
2439  }
2440 
2441  uint64_t totalCopyBuffersSizeInBytes = 0;
2442  bool ret = true;
2443  auto processRegions =
2444  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2445  &regions) {
2446  for (const auto &regionEntry : regions) {
2447  // For each region, hoist copy in/out past all hoistable
2448  // 'affine.for's.
2449  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2450  Block *copyPlacementBlock;
2451  findHighestBlockForPlacement(
2452  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2453  &copyInPlacementStart, &copyOutPlacementStart);
2454 
2455  uint64_t sizeInBytes;
2456  Block::iterator nBegin, nEnd;
2457  LogicalResult iRet = generateCopy(
2458  *regionEntry.second, block, begin, end, copyPlacementBlock,
2459  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2460  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2461  if (succeeded(iRet)) {
2462  // begin/end could have been invalidated, and need update.
2463  begin = nBegin;
2464  end = nEnd;
2465  totalCopyBuffersSizeInBytes += sizeInBytes;
2466  }
2467  ret = ret & succeeded(iRet);
2468  }
2469  };
2470  processRegions(readRegions);
2471  processRegions(writeRegions);
2472 
2473  if (!ret) {
2474  LLVM_DEBUG(begin->emitError(
2475  "copy generation failed for one or more memref's in this block\n"));
2476  return failure();
2477  }
2478 
2479  // For a range of operations, a note will be emitted at the caller.
2480  AffineForOp forOp;
2481  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2482  LLVM_DEBUG(forOp.emitRemark()
2483  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2484  << " KiB of copy buffers in fast memory space for this block");
2485  }
2486 
2487  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2488  block->getParentOp()->emitWarning(
2489  "total size of all copy buffers' for this block exceeds fast memory "
2490  "capacity");
2491  }
2492 
2493  return success();
2494 }
2495 
2496 // A convenience version of affineDataCopyGenerate for all ops in the body of
2497 // an AffineForOp.
2498 LogicalResult mlir::affine::affineDataCopyGenerate(
2499  AffineForOp forOp, const AffineCopyOptions &copyOptions,
2500  std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2501  return affineDataCopyGenerate(forOp.getBody()->begin(),
2502  std::prev(forOp.getBody()->end()), copyOptions,
2503  filterMemRef, copyNests);
2504 }
2505 
2506 LogicalResult mlir::affine::generateCopyForMemRegion(
2507  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2508  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2509  Block *block = analyzedOp->getBlock();
2510  auto begin = analyzedOp->getIterator();
2511  auto end = std::next(begin);
2512  DenseMap<Value, Value> fastBufferMap;
2513  DenseSet<Operation *> copyNests;
2514 
2515  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2516  copyOptions, fastBufferMap, copyNests,
2517  &result.sizeInBytes, &begin, &end);
2518  if (failed(err))
2519  return err;
2520 
2521  const auto &en = fastBufferMap.find(memrefRegion.memref);
2522  // In some cases (empty loops), no copy generation would have happened.
2523  if (en == fastBufferMap.end())
2524  return failure();
2525  result.alloc = en->second.getDefiningOp();
2526  assert(result.alloc && "fast buffer expected to be locally allocated");
2527  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2528  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2529  return success();
2530 }
2531 
2532 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2533 static void
2534 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2535  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2536  // Add a new empty level to output if it doesn't already exist.
2537  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2538  if (currLoopDepth == depthToLoops.size())
2539  depthToLoops.emplace_back();
2540 
2541  for (auto &op : *block) {
2542  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2543  depthToLoops[currLoopDepth].push_back(forOp);
2544  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2545  }
2546  }
2547 }
2548 
2549 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2550 void mlir::affine::gatherLoops(
2551  func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2552  for (auto &block : func)
2553  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2554 
2555  // Remove last loop level from output since it's empty.
2556  if (!depthToLoops.empty()) {
2557  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2558  depthToLoops.pop_back();
2559  }
2560 }
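// Example of the resulting grouping (hypothetical nest): for a function with
// a loop %i containing sibling loops %j1 and %j2, where %j2 contains %k,
// depthToLoops[0] = {%i}, depthToLoops[1] = {%j1, %j2} and
// depthToLoops[2] = {%k}.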
2561 
2562 // TODO: if necessary, this can be extended to also compose in any
2563 // affine.applys, fold to constant if all result dimensions of the map are
2564 // constant (canonicalizeMapAndOperands below already does this for single
2565 // result bound maps), and use simplifyMap to perform algebraic simplification.
2566 AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2567  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2568  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2569  SmallVector<Value, 4> lowerOperands(lbOperands);
2570  SmallVector<Value, 4> upperOperands(ubOperands);
2571 
2572  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2573  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2574  lbMap = removeDuplicateExprs(lbMap);
2575  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2576  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2577  ubMap = removeDuplicateExprs(ubMap);
2578 
2579  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2580  step);
2581 }
2582 
2583 /// Creates an AffineIfOp that encodes the conditional to choose between
2584 /// the constant trip count version and an unknown trip count version of this
2585 /// nest of loops. This is used to separate partial and full tiles if `loops`
2586 /// has the intra-tile loops. The affine.if op is inserted at the builder
2587 /// insertion point of `b`.
2588 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2589  OpBuilder b) {
2590  if (loops.empty())
2591  return nullptr;
2592 
2593  auto *context = loops[0].getContext();
2594 
2597  llvm::append_range(ops, loops);
2598  (void)getIndexSet(ops, &cst);
2599 
2600  // Remove constraints that are independent of these loop IVs.
2601  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2602 
2603  // Construct the constraint set representing the guard for full tiles. The
2604  // lower bound (and upper bound) corresponding to the full tile should be
2605  // larger (and resp. smaller) than any other lower (or upper bound).
2606  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2607  for (auto loop : loops) {
2608  (void)loop;
2609  // TODO: Non-unit stride is not an issue to generalize to.
2610  assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2611  // Mark everything symbols for the purpose of finding a constant diff pair.
2612  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2613  1);
2614  unsigned fullTileLbPos, fullTileUbPos;
2615  if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2616  /*boundFloorDivisor=*/nullptr,
2617  /*ub=*/nullptr, &fullTileLbPos,
2618  &fullTileUbPos)) {
2619  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2620  return nullptr;
2621  }
2622 
2623  SmallVector<unsigned, 4> lbIndices, ubIndices;
2624  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2625 
2626  auto fLb = cst.getInequality(fullTileLbPos);
2627  auto fUb = cst.getInequality(fullTileUbPos);
2628  fullTileLb.assign(fLb.begin(), fLb.end());
2629  fullTileUb.assign(fUb.begin(), fUb.end());
2630 
2631  // Full tile lower bound should be >= any other lower bound.
2632  for (auto lbIndex : lbIndices)
2633  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2634  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2635 
2636  // Full tile upper bound should be <= any other upper bound.
2637  for (auto ubIndex : ubIndices)
2638  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2639  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2640 
2641  cst.removeVar(0);
2642  }
2643 
2644  // The previous step leads to all zeros for the full tile lb and ub position
2645  // itself; remove those and any other duplicates / trivial redundancies.
2646  cst.removeTrivialRedundancy();
2647 
2648  // Turn everything into dims conservatively since we earlier turned all
2649  // trailing ids past the point loop IV into symbols. Some of these could be
2650  // outer loop IVs; we'll canonicalize anyway.
2651  cst.setDimSymbolSeparation(0);
2652 
2653  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2654  // ifCondSet can be null if cst was empty -- this can happen if all loops
2655  // in the nest have constant trip counts.
2656  if (!ifCondSet)
2657  return nullptr;
2658 
2659  SmallVector<Value, 4> setOperands;
2660  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2661  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2662  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2663  /*withElseRegion=*/true);
2664 }
2665 
2666 /// Create the full tile loop nest (along with its body).
2667 static LogicalResult
2668 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2669  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2670  fullTileLoops.reserve(inputNest.size());
2671 
2672  // For each loop in the original nest identify a lower/upper bound pair such
2673  // that their difference is a constant.
2674  FlatAffineValueConstraints cst;
2675  for (auto loop : inputNest) {
2676  // TODO: straightforward to generalize to a non-unit stride.
2677  if (loop.getStepAsInt() != 1) {
2678  LLVM_DEBUG(llvm::dbgs()
2679  << "[tile separation] non-unit stride not implemented\n");
2680  return failure();
2681  }
2682  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2683  (void)getIndexSet(loopOp, &cst);
2684  // We will mark everything other than this loop IV as symbol for getting a
2685  // pair of <lb, ub> with a constant difference.
2686  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2687  unsigned lbPos, ubPos;
2688  if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2689  /*boundFloorDivisor=*/nullptr,
2690  /*ub=*/nullptr, &lbPos, &ubPos) ||
2691  lbPos == ubPos) {
2692  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2693  "equalities not yet handled\n");
2694  return failure();
2695  }
2696 
2697  // Set all variables as dimensions uniformly since some of those marked as
2698  // symbols above could be outer loop IVs (corresponding tile space IVs).
2699  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2700 
2701  AffineValueMap lbVmap, ubVmap;
2702  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2703  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2704  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2705  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2706  ubVmap.getOperands(), ubVmap.getAffineMap());
2707  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2708  fullTileLoops.push_back(fullTileLoop);
2709  }
2710 
2711  // Add the body for the full tile loop nest.
2712  IRMapping operandMap;
2713  for (const auto &loopEn : llvm::enumerate(inputNest))
2714  operandMap.map(loopEn.value().getInductionVar(),
2715  fullTileLoops[loopEn.index()].getInductionVar());
2716  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2717  for (auto &op : inputNest.back().getBody()->without_terminator())
2718  b.clone(op, operandMap);
2719  return success();
2720 }
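// Illustrative sketch (not from the upstream source; the tile size 64 and the
// maps below are hypothetical): for an intra-tile loop whose constraint system
// yields the bound pair
//   #lb = affine_map<(d0) -> (d0 * 64)>
//   #ub = affine_map<(d0) -> (d0 * 64 + 64)>
// with constant difference 64, the full tile loop created above via
// createCanonicalizedAffineForOp has the form
//   affine.for %i = #lb(%t) to #ub(%t)
// i.e. a compile-time constant trip count of 64, which is what allows the
// "full tile" version to be separated from the partial-tile version below.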
2721 
2722 LogicalResult
2723 mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2724  SmallVectorImpl<AffineForOp> *fullTileNest) {
2725  if (inputNest.empty())
2726  return success();
2727 
2728  auto firstLoop = inputNest[0];
2729 
2730  // Each successive for op has to be nested inside the previous one.
2731  auto prevLoop = firstLoop;
2732  for (auto loop : inputNest.drop_front(1)) {
2733  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2734  prevLoop = loop;
2735  }
2736 
2737  // Create the full tile loop nest.
2738  SmallVector<AffineForOp, 4> fullTileLoops;
2739  OpBuilder b(firstLoop);
2740  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2741  if (!fullTileLoops.empty())
2742  fullTileLoops.front().erase();
2743  return failure();
2744  }
2745 
2746  // Create and insert the version select right before the root of the nest.
2747  b = OpBuilder(firstLoop);
2748  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2749  if (!ifOp) {
2750  fullTileLoops.front().erase();
2751  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2752  "separation condition\n");
2753  return failure();
2754  }
2755 
2756  // Move the full tile into the then block.
2757  Block *thenBlock = ifOp.getThenBlock();
2758  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2759  thenBlock->getOperations().splice(
2760  std::prev(thenBlock->end()),
2761  outermostFullTileLoop->getBlock()->getOperations(),
2762  Block::iterator(outermostFullTileLoop));
2763 
2764  // Move the partial tile into the else block. The partial tile is the same as
2765  // the original loop nest.
2766  Block *elseBlock = ifOp.getElseBlock();
2767  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2768  firstLoop->getBlock()->getOperations(),
2769  Block::iterator(firstLoop));
2770 
2771  if (fullTileNest)
2772  *fullTileNest = std::move(fullTileLoops);
2773 
2774  return success();
2775 }
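A minimal usage sketch, not part of this file: assuming `rootForOp` is a hypothetical outermost affine.for of a tiled, perfectly nested band, and the same `using namespace mlir; using namespace affine;` conventions as this file, a transformation could drive full/partial tile separation as follows.

  // Collect the perfectly nested band rooted at `rootForOp` (hypothetical).
  SmallVector<AffineForOp, 4> band;
  getPerfectlyNestedLoops(band, rootForOp);

  // Guard the band with an affine.if that separates full tiles from partial ones.
  SmallVector<AffineForOp, 4> fullTileNest;
  if (failed(separateFullTiles(band, &fullTileNest)))
    return; // e.g. non-unit stride, or no bound pair with a constant difference.

  // On success, the then-region of the generated affine.if holds the constant
  // trip count copy (also returned in `fullTileNest`); the else-region holds
  // the original nest, which handles the partial tiles.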
static LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:390
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:667
static void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
Definition: LoopUtils.cpp:430
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2267
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1573
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:1957
static LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:369
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1805
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:182
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:415
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:566
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:471
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2534
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:921
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, const AffineCopyOptions &copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region; generates a copy from th...
Definition: LoopUtils.cpp:1974
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2668
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1849
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:695
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:981
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1094
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1353
static void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest's body.
Definition: LoopUtils.cpp:424
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2588
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:57
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1553
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to ‘memref’ to/from ‘fastMemRef’ and returns the outermost AffineFor...
Definition: LoopUtils.cpp:1885
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:115
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
An integer constant appearing in an affine expression.
Definition: AffineExpr.h:232
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:786
U dyn_cast() const
Definition: AffineExpr.h:283
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:44
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:341
unsigned getNumDims() const
Definition: AffineMap.cpp:337
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:350
unsigned getNumResults() const
Definition: AffineMap.cpp:345
unsigned getNumInputs() const
Definition: AffineMap.cpp:346
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:354
Block represents an ordered list of Operations.
Definition: Block.h:30
OpListType::iterator iterator
Definition: Block.h:133
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:26
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:238
RetT walk(FnT &&callback)
Walk the operations in this block.
Definition: Block.h:280
OpListType & getOperations()
Definition: Block.h:130
Operation & front()
Definition: Block.h:146
iterator end()
Definition: Block.h:137
iterator begin()
Definition: Block.h:136
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:30
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:390
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:396
AffineMap getDimIdentityMap()
Definition: Builders.cpp:372
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:376
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:357
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:361
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:353
MLIRContext * getContext() const
Definition: Builders.h:55
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class allows control over how the GreedyPatternRewriteDriver works.
GreedyRewriteStrictness strictMode
Strict mode can restrict the ops that are added to the worklist during the rewrite.
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:710
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:308
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
This class helps build Operations.
Definition: Builders.h:206
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:528
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:416
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:250
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:446
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:397
Operation * getOperation()
Inherit getOperation from OpState.
Definition: OpDefinition.h:108
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:42
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:279
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:686
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:267
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:213
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:655
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:272
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:236
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:288
unsigned getNumResults()
Return the number of results held by this operation.
Definition: Operation.h:399
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
MLIRContext * getContext() const
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:372
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:93
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:212
Type getType() const
Return the type of this value.
Definition: Value.h:122
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
static WalkResult advance()
Definition: Visitors.h:52
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:497
Value getOperand(unsigned idx)
Definition: AffineOps.h:503
operand_range getOperands()
Definition: AffineOps.h:512
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
Definition: AffineOps.h:94
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
Definition: AffineOps.h:291
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, AffineValueMap &vmap, MLIRContext *context) const
Returns the bound for the variable at pos from the inequality at ineqPos as a 1-d affine value map (a...
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:90
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
ArrayRef< MPInt > getInequality(unsigned idx) const
MPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
std::optional< MPInt > getConstantBoundOnDimSize(unsigned pos, SmallVectorImpl< MPInt > *lb=nullptr, MPInt *boundFloorDivisor=nullptr, SmallVectorImpl< MPInt > *ub=nullptr, unsigned *minLbPos=nullptr, unsigned *minUbPos=nullptr) const
Returns the smallest known constant bound for the extent of the specified variable (pos^th),...
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
bool isParallelLoop(Operation &op)
std::optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, std::nullopt otherwise.
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1651
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:1187
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:893
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:132
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2309
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1083
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2656
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:1016
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2550
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1401
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2566
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:864
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:237
unsigned permuteLoops(MutableArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1423
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1387
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2506
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1514
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:908
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1134
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:822
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1519
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:505
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
std::optional< uint64_t > getIntOrFloatMemRefSizeInBytes(MemRefType memRefType)
Returns the size of a memref with element type int or float in bytes if it's statically shaped,...
Definition: Utils.cpp:1118
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1720
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1765
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks where SSA dominance would be violated if a for op's body operations are shifted by the specifi...
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1628
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1493
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:779
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1330
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
Definition: LoopUtils.cpp:881
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1267
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2723
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2511
void walk(Operation *op, function_ref< void(Region *)> callback, WalkOrder order)
Walk all of the regions, blocks, or operations nested under (and including) the given operation.
Definition: Visitors.h:137
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:285
This header declares functions that assist transformations in the MemRef dialect.
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:661
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
LogicalResult applyOpPatternsAndFold(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Applies the specified rewrite patterns on ops while also trying to fold these ops.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:28
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:671
int64_t floorDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's floordiv operation on constants.
Definition: MathExtras.h:33
int64_t ceilDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's ceildiv operation on constants.
Definition: MathExtras.h:23
bool succeeded(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
Definition: LogicalResult.h:68
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:345
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
@ ExistingOps
Only pre-existing ops are processed.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
std::vector< std::pair< Block::iterator, Block::iterator > > subBlocks
Definition: LoopUtils.cpp:1110
void walk(Operation *op)
Definition: LoopUtils.cpp:1113
void walk(Block &block)
Definition: LoopUtils.cpp:1119
int64_t stride
Definition: LoopUtils.cpp:1840
int64_t numEltPerStride
Definition: LoopUtils.cpp:1841
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:166
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:206
std::optional< int64_t > ub
std::optional< int64_t > lb
A description of a (parallelizable) reduction in an affine loop.
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:432
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:471
bool isWrite() const
Definition: Utils.h:473
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, std::vector< SmallVector< int64_t, 4 >> *lbs=nullptr, SmallVectorImpl< int64_t > *lbDivisors=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:814
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:875
void setWrite(bool flag)
Definition: Utils.h:474
Value memref
Memref that this region corresponds to.
Definition: Utils.h:519
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:526
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.