LoopUtils.cpp
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
25 #include "mlir/IR/IntegerSet.h"
29 #include "llvm/ADT/MapVector.h"
30 #include "llvm/ADT/SmallPtrSet.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 
34 #define DEBUG_TYPE "loop-utils"
35 
36 using namespace mlir;
37 using namespace presburger;
38 using llvm::SmallMapVector;
39 
40 namespace {
41 // This structure is used to pass and return sets of loop parameters without
42 // confusing their order.
43 struct LoopParams {
44  Value lowerBound;
45  Value upperBound;
46  Value step;
47 };
48 } // namespace
49 
50 /// Computes the cleanup loop lower bound of the loop being unrolled with
51 /// the specified unroll factor; this bound will also be the upper bound of the main
52 /// part of the unrolled loop. Computes the bound as an AffineMap with its
53 /// operands or a null map when the trip count can't be expressed as an affine
54 /// expression.
55 static void
56 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
57  AffineMap &cleanupLbMap,
58  SmallVectorImpl<Value> &cleanupLbOperands) {
59  AffineMap tripCountMap;
60  SmallVector<Value, 4> tripCountOperands;
61  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
62  // Trip count can't be computed.
63  if (!tripCountMap) {
64  cleanupLbMap = AffineMap();
65  return;
66  }
67 
68  OpBuilder b(forOp);
69  auto lbMap = forOp.getLowerBoundMap();
70  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
71  forOp.getLowerBoundOperands());
72 
73  // For each upper bound expr, get the range.
74  // Eg: affine.for %i = lb to min (ub1, ub2),
75  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
76  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
77  // these affine.apply's make up the cleanup loop lower bound.
78  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
79  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
80  int64_t step = forOp.getStep();
81  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
82  auto tripCountExpr = tripCountMap.getResult(i);
83  bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
84  auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
85  tripCountMap.getNumSymbols(), bumpExprs[i]);
86  bumpValues[i] =
87  b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
88  }
89 
90  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
91  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
92  newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
93 
94  cleanupLbOperands.clear();
95  cleanupLbOperands.push_back(lb);
96  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
97  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
98  b.getContext());
99  // Simplify the cleanupLbMap + cleanupLbOperands.
100  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
101  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
102  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
103  // Remove any affine.apply's that became dead from the simplification above.
104  for (auto v : bumpValues)
105  if (v.use_empty())
106  v.getDefiningOp()->erase();
107 
108  if (lb.use_empty())
109  lb.erase();
110 }
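// For illustration (a sketch, not taken from the original source or its
// tests): for
//   affine.for %i = 0 to 17
// unrolled by a factor of 4, the trip count is 17 and 17 - 17 mod 4 = 16, so
// the cleanup lower bound is lb + 16 * step = 16. The main unrolled loop then
// covers [0, 16) and the cleanup loop covers [16, 17).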
111 
112 /// Helper to replace uses of loop carried values (iter_args) and loop
113 /// yield values while promoting single iteration affine.for ops.
114 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
115  // Replace uses of iter arguments with iter operands (initial values).
116  auto iterOperands = forOp.getIterOperands();
117  auto iterArgs = forOp.getRegionIterArgs();
118  for (auto e : llvm::zip(iterOperands, iterArgs))
119  std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
120 
121  // Replace uses of loop results with the values yielded by the loop.
122  auto outerResults = forOp.getResults();
123  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
124  for (auto e : llvm::zip(outerResults, innerResults))
125  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
126 }
127 
128 /// Promotes the loop body of a forOp to its containing block if the forOp
129 /// is known to have a single iteration.
130 // TODO: extend this for arbitrary affine bounds.
131 LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
132  Optional<uint64_t> tripCount = getConstantTripCount(forOp);
133  if (!tripCount || *tripCount != 1)
134  return failure();
135 
136  if (forOp.getLowerBoundMap().getNumResults() != 1)
137  return failure();
138 
139  // Replace all uses of the IV with its single-iteration value.
140  auto iv = forOp.getInductionVar();
141  auto *parentBlock = forOp->getBlock();
142  if (!iv.use_empty()) {
143  if (forOp.hasConstantLowerBound()) {
144  OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
145  auto constOp = topBuilder.create<arith::ConstantIndexOp>(
146  forOp.getLoc(), forOp.getConstantLowerBound());
147  iv.replaceAllUsesWith(constOp);
148  } else {
149  auto lbOperands = forOp.getLowerBoundOperands();
150  auto lbMap = forOp.getLowerBoundMap();
151  OpBuilder builder(forOp);
152  if (lbMap == builder.getDimIdentityMap()) {
153  // No need of generating an affine.apply.
154  iv.replaceAllUsesWith(lbOperands[0]);
155  } else {
156  auto affineApplyOp =
157  builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
158  iv.replaceAllUsesWith(affineApplyOp);
159  }
160  }
161  }
162 
163  replaceIterArgsAndYieldResults(forOp);
164 
165  // Move the loop body operations, except for its terminator, to the loop's
166  // containing block.
167  forOp.getBody()->back().erase();
168  parentBlock->getOperations().splice(Block::iterator(forOp),
169  forOp.getBody()->getOperations());
170  forOp.erase();
171  return success();
172 }
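// For illustration (a hypothetical example, not from the original source), a
// single-iteration loop such as
//   affine.for %i = 0 to 1 {
//     "test.use"(%i) : (index) -> ()
//   }
// is promoted into its containing block as
//   %c0 = arith.constant 0 : index
//   "test.use"(%c0) : (index) -> ()
// and the affine.for op is erased. The op name "test.use" is made up here.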
173 
174 /// Generates an affine.for op with the specified lower and upper bounds
175 /// while generating the right IV remappings to realize shifts for operations in
176 /// its body. The operations that go into the loop body are specified in
177 /// opGroupQueue starting from the specified offset, and in that order. The
178 /// first element of the pair specifies the shift applied to that group of
179 /// operations; the shift is multiplied by the loop step before being applied.
180 /// Returns nullptr if the generated loop simplifies to a single iteration one.
181 static AffineForOp generateShiftedLoop(
182  AffineMap lbMap, AffineMap ubMap,
183  const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
184  unsigned offset, AffineForOp srcForOp, OpBuilder b) {
185  auto lbOperands = srcForOp.getLowerBoundOperands();
186  auto ubOperands = srcForOp.getUpperBoundOperands();
187 
188  assert(lbMap.getNumInputs() == lbOperands.size());
189  assert(ubMap.getNumInputs() == ubOperands.size());
190 
191  auto loopChunk = b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap,
192  ubOperands, ubMap, srcForOp.getStep());
193  auto loopChunkIV = loopChunk.getInductionVar();
194  auto srcIV = srcForOp.getInductionVar();
195 
196  BlockAndValueMapping operandMap;
197 
198  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
199  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
200  uint64_t shift = it.first;
201  auto ops = it.second;
202  // All 'same shift' operations get added with their operands being
203  // remapped to results of cloned operations, and their uses of the IV
204  // remapped. The remapping is generated only if the shift is nonzero:
205  // remappedIV = newIV - shift.
206  if (!srcIV.use_empty() && shift != 0) {
207  auto ivRemap = bodyBuilder.create<AffineApplyOp>(
208  srcForOp.getLoc(),
209  bodyBuilder.getSingleDimShiftAffineMap(
210  -static_cast<int64_t>(srcForOp.getStep() * shift)),
211  loopChunkIV);
212  operandMap.map(srcIV, ivRemap);
213  } else {
214  operandMap.map(srcIV, loopChunkIV);
215  }
216  for (auto *op : ops)
217  bodyBuilder.clone(*op, operandMap);
218  }
219  if (succeeded(promoteIfSingleIteration(loopChunk)))
220  return AffineForOp();
221  return loopChunk;
222 }
223 
224 // The skewing of operations with respect to one another can be used, for
225 // example, to allow overlap of asynchronous operations (such as DMA
226 // communication) with computation, or just relative shifting of operations
227 // for better register reuse, locality, or parallelism. As such, the shifts are
228 // typically expected to be at most of the order of the number of operations.
229 // This method should not be used as a substitute for loop distribution/fission.
230 // This method uses an algorithm that runs in time linear in the number of
231 // operations in the body of the for loop (using the 'sweep line' paradigm).
232 // This method asserts preservation of SSA dominance. Checking for that, as
233 // well as for memory-based dependence preservation, rests with the users of
234 // this method.
235 LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
236  ArrayRef<uint64_t> shifts,
237  bool unrollPrologueEpilogue) {
238  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
239  "too few/many shifts");
240  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
241  return success();
242 
243  // If the trip counts aren't constant, we would need versioning and
244  // conditional guards (or context information to prevent such versioning). The
245  // better way to pipeline for such loops is to first tile them and extract
246  // constant trip count "full tiles" before applying this.
247  auto mayBeConstTripCount = getConstantTripCount(forOp);
248  if (!mayBeConstTripCount) {
249  LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
250  return success();
251  }
252  uint64_t tripCount = *mayBeConstTripCount;
253 
254  assert(isOpwiseShiftValid(forOp, shifts) &&
255  "shifts will lead to an invalid transformation\n");
256 
257  int64_t step = forOp.getStep();
258 
259  unsigned numChildOps = shifts.size();
260 
261  // Do a linear time (counting) sort for the shifts.
262  uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
263  if (maxShift >= numChildOps) {
264  // Large shifts are not the typical use case.
265  forOp.emitWarning("not shifting because shifts are unrealistically large");
266  return success();
267  }
268 
269  // An array of operation groups sorted by shift amount; each group has all
270  // operations with the same shift in the order in which they appear in the
271  // body of the 'affine.for' op.
272  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
273  unsigned pos = 0;
274  for (auto &op : forOp.getBody()->without_terminator()) {
275  auto shift = shifts[pos++];
276  sortedOpGroups[shift].push_back(&op);
277  }
278 
279  // Unless the shifts have a specific pattern (which actually would be the
280  // common use case), prologue and epilogue are not meaningfully defined.
281  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
282  // loop generated as the prologue and the last as epilogue and unroll these
283  // fully.
284  AffineForOp prologue, epilogue;
285 
286  // Do a sweep over the sorted shifts while storing open groups in a
287  // vector, and generating loop portions as necessary during the sweep. A block
288  // of operations is paired with its shift.
289  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
290 
291  auto origLbMap = forOp.getLowerBoundMap();
292  uint64_t lbShift = 0;
293  OpBuilder b(forOp);
294  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
295  // If nothing is shifted by d, continue.
296  if (sortedOpGroups[d].empty())
297  continue;
298  if (!opGroupQueue.empty()) {
299  assert(d > 0 &&
300  "Queue expected to be empty when the first block is found");
301  // The interval for which the loop needs to be generated here is:
302  // [lbShift, min(lbShift + tripCount, d)) and the body of the
303  // loop needs to have all operations in opQueue in that order.
304  AffineForOp res;
305  if (lbShift + tripCount * step < d * step) {
306  res = generateShiftedLoop(
307  b.getShiftedAffineMap(origLbMap, lbShift),
308  b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
309  opGroupQueue, /*offset=*/0, forOp, b);
310  // Entire loop for the queued op groups generated, empty it.
311  opGroupQueue.clear();
312  lbShift += tripCount * step;
313  } else {
314  res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
315  b.getShiftedAffineMap(origLbMap, d),
316  opGroupQueue, /*offset=*/0, forOp, b);
317  lbShift = d * step;
318  }
319 
320  if (res) {
321  // Simplify/canonicalize the affine.for.
322  RewritePatternSet patterns(res.getContext());
323  AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
324  bool erased;
325  (void)applyOpPatternsAndFold(res, std::move(patterns), &erased);
326 
327  if (!erased && !prologue)
328  prologue = res;
329  if (!erased)
330  epilogue = res;
331  }
332  } else {
333  // Start of first interval.
334  lbShift = d * step;
335  }
336  // Augment the list of operations that get into the current open interval.
337  opGroupQueue.emplace_back(d, sortedOpGroups[d]);
338  }
339 
340  // The operation groups left in the queue now need to be processed (FIFO)
341  // and their loops completed.
342  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
343  uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
344  epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
345  b.getShiftedAffineMap(origLbMap, ubShift),
346  opGroupQueue, /*offset=*/i, forOp, b);
347  lbShift = ubShift;
348  if (!prologue)
349  prologue = epilogue;
350  }
351 
352  // Erase the original for op.
353  forOp.erase();
354 
355  if (unrollPrologueEpilogue && prologue)
356  (void)loopUnrollFull(prologue);
357  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
358  (void)loopUnrollFull(epilogue);
359 
360  return success();
361 }
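// As a rough illustration (hedged sketch, not from the original source), for
//   affine.for %i = 0 to 4 { op0(%i); op1(%i) }
// with shifts [0, 1], the generated code is conceptually
//   affine.for %i = 0 to 1 { op0(%i) }              // prologue
//   affine.for %i = 1 to 4 { op0(%i); op1(%i - 1) } // steady state
//   affine.for %i = 4 to 5 { op1(%i - 1) }          // epilogue
// i.e. op1 of each iteration is delayed by one iteration relative to op0.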
362 
363 /// Checks the legality of tiling of a hyper-rectangular loop nest by simply
364 /// checking if there is a 'negative' dependence in the memrefs present in
365 /// the loop nest. If so, tiling is invalid.
366 static bool
367 checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
368  assert(!origLoops.empty() && "no original loops provided");
369 
370  // We first find out all dependences we intend to check.
371  SmallVector<Operation *, 8> loadAndStoreOps;
372  origLoops[0]->walk([&](Operation *op) {
373  if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
374  loadAndStoreOps.push_back(op);
375  });
376 
377  unsigned numOps = loadAndStoreOps.size();
378  unsigned numLoops = origLoops.size();
379  FlatAffineValueConstraints dependenceConstraints;
380  for (unsigned d = 1; d <= numLoops + 1; ++d) {
381  for (unsigned i = 0; i < numOps; ++i) {
382  Operation *srcOp = loadAndStoreOps[i];
383  MemRefAccess srcAccess(srcOp);
384  for (unsigned j = 0; j < numOps; ++j) {
385  Operation *dstOp = loadAndStoreOps[j];
386  MemRefAccess dstAccess(dstOp);
387 
388  SmallVector<DependenceComponent, 2> depComps;
389  dependenceConstraints.reset();
390  DependenceResult result = checkMemrefAccessDependence(
391  srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
392 
393  // Skip if there is no dependence in this case.
394  if (!hasDependence(result))
395  continue;
396 
397  // Check whether there is any negative direction vector in the
398  // dependence components found above, which means that dependence is
399  // violated by the default hyper-rect tiling method.
400  LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
401  "for dependence at depth: "
402  << Twine(d) << " between:\n";);
403  LLVM_DEBUG(srcAccess.opInst->dump(););
404  LLVM_DEBUG(dstAccess.opInst->dump(););
405  for (unsigned k = 0, e = depComps.size(); k < e; k++) {
406  DependenceComponent depComp = depComps[k];
407  if (depComp.lb.has_value() && depComp.ub.has_value() &&
408  depComp.lb.value() < depComp.ub.value() &&
409  depComp.ub.value() < 0) {
410  LLVM_DEBUG(llvm::dbgs()
411  << "Dependence component lb = "
412  << Twine(depComp.lb.value())
413  << " ub = " << Twine(depComp.ub.value())
414  << " is negative at depth: " << Twine(d)
415  << " and thus violates the legality rule.\n");
416  return false;
417  }
418  }
419  }
420  }
421  }
422 
423  return true;
424 }
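// For illustration (a hypothetical example, not from the original source), in
//   affine.for %i = 1 to 100 {
//     affine.for %j = 0 to 99 {
//       %v = affine.load %A[%i - 1, %j + 1] : memref<100x100xf32>
//       affine.store %v, %A[%i, %j] : memref<100x100xf32>
//     }
//   }
// the store-to-load dependence has direction components (1, -1); the negative
// component along %j is exactly what makes rectangular tiling of both loops
// invalid, and what this check rejects.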
425 
426 /// Checks whether hyper-rectangular loop tiling of the nest
427 /// represented by `origLoops` is valid. The validity condition is from Irigoin
428 /// and Triolet, which states that two tiles cannot depend on each other. We
429 /// simplify this condition to just checking whether there is any negative
430 /// dependence direction, since we have the prior knowledge that the tiling
431 /// results will be hyper-rectangles, which are scheduled in the
432 /// lexicographically increasing order on the vector of loop indices. This
433 /// function will return failure when any dependence component is negative along
434 /// any of `origLoops`.
435 static LogicalResult
436 checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
437  return success(checkTilingLegalityImpl(origLoops));
438 }
439 
440 /// Checks whether a loop nest is hyper-rectangular or not.
441 static LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
442  FlatAffineValueConstraints cst;
443  SmallVector<Operation *, 8> ops(input.begin(), input.end());
444  // 0-d or 1-d is trivially hyper-rectangular.
445  if (input.size() <= 1)
446  return success();
447  if (failed(getIndexSet(ops, &cst))) {
448  LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
449  return failure();
450  }
451  if (!cst.isHyperRectangular(0, input.size())) {
452  LLVM_DEBUG(llvm::dbgs()
453  << "Non-hyperrectangular nests not supported for tiling!\n");
454  return failure();
455  }
456  return success();
457 }
458 
459 /// Check if the input nest is supported for tiling and whether tiling would be
460 /// legal or not.
461 template <typename t>
462 static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
463  ArrayRef<t> tileSizes) {
464  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
465 
466  if (llvm::any_of(input,
467  [](AffineForOp op) { return op.getNumResults() > 0; })) {
468  LLVM_DEBUG(llvm::dbgs()
469  << "Cannot tile nest where a loop has yield values\n");
470  return failure();
471  }
472 
473  // Check if the supplied `for` ops are all successively nested.
474  if (!isPerfectlyNested(input)) {
475  LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
476  return failure();
477  }
478 
479  if (failed(checkIfHyperRectangular(input)))
480  return failure();
481 
482  // Check if tiling is legal.
483  if (failed(checkTilingLegality(input))) {
484  input[0].emitRemark("tiling code is illegal due to dependences");
485  return failure();
486  }
487 
488  return success();
489 }
490 
491 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
492 /// location in destination's body, ignoring the terminator.
493 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
494  Block::iterator loc) {
495  auto &ops = src.getBody()->getOperations();
496  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
497  std::prev(ops.end()));
498 }
499 
500 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
501 /// body.
502 void moveLoopBody(AffineForOp src, AffineForOp dest) {
503  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
504 }
505 
506 /// Constructs a tiled loop nest, without setting the loop bounds, and moves
507 /// the body of the original loop nest to the tiled loop nest.
508 static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
509  AffineForOp rootAffineForOp, unsigned width,
510  MutableArrayRef<AffineForOp> tiledLoops) {
511  Location loc = rootAffineForOp.getLoc();
512 
513  // Tracks the outermost of the loops as more loops are added around it.
514  Operation *topLoop = rootAffineForOp.getOperation();
515  AffineForOp innermostPointLoop;
516 
517  // Add intra-tile (or point) loops.
518  for (unsigned i = 0; i < width; i++) {
519  OpBuilder b(topLoop);
520  // Loop bounds will be set later.
521  AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
522  pointLoop.getBody()->getOperations().splice(
523  pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
524  topLoop);
525  tiledLoops[2 * width - 1 - i] = pointLoop;
526  topLoop = pointLoop.getOperation();
527  if (i == 0)
528  innermostPointLoop = pointLoop;
529  }
530 
531  // Add tile space loops.
532  for (unsigned i = width; i < 2 * width; i++) {
533  OpBuilder b(topLoop);
534  // Loop bounds will be set later.
535  AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
536  tileSpaceLoop.getBody()->getOperations().splice(
537  tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
538  topLoop);
539  tiledLoops[2 * width - i - 1] = tileSpaceLoop;
540  topLoop = tileSpaceLoop.getOperation();
541  }
542 
543  // Move the loop body of the original nest to the new one.
544  moveLoopBody(origLoops.back(), innermostPointLoop);
545 }
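// For illustration (a sketch, not from the original source), for a 2-d band
// this produces the following structure, with all bounds still unset and the
// original band's body moved into the innermost point loop:
//   affine.for %i0   // tile-space loop for the outer original loop
//     affine.for %i1   // tile-space loop for the inner original loop
//       affine.for %i2   // point loop for the outer original loop
//         affine.for %i3   // point loop for the inner original loop
//           <original band body>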
546 
547 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
548 // TODO: Handle non-constant lower bounds.
549 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
550  AffineForOp newInterTileLoop,
551  AffineForOp newIntraTileLoop,
552  Value tileSize) {
553  // The lower bound for the intra-tile loop is represented by an affine map
554  // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
555  // bound for the intra-tile loop is represented by an affine map as
556  // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where
557  // %i is the loop IV of the corresponding inter-tile loop, %t0 is the
558  // corresponding tiling parameter, %origlb is the lower bound, and
559  // %origLoopStep is the step of the original loop.
560 
561  assert(origLoop.hasConstantLowerBound() &&
562  "expected input loops to have constant lower bound.");
563 
564  // Get lower bound of original loop as an affine expression.
565  AffineExpr origLowerBoundExpr;
566  origLowerBoundExpr =
567  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
568 
569  // Add dim operands from original lower/upper bound.
570  SmallVector<Value, 4> lbOperands, ubOperands;
571  AffineBound lb = origLoop.getLowerBound();
572  AffineBound ub = origLoop.getUpperBound();
573  lbOperands.reserve(lb.getNumOperands() + 2);
574  ubOperands.reserve(ub.getNumOperands() + 2);
575  AffineMap origLbMap = lb.getMap();
576  AffineMap origUbMap = ub.getMap();
577  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
578  lbOperands.push_back(lb.getOperand(j));
579  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
580  ubOperands.push_back(ub.getOperand(j));
581 
582  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
583  // IV.
584  lbOperands.push_back(newInterTileLoop.getInductionVar());
585  ubOperands.push_back(newInterTileLoop.getInductionVar());
586 
587  // Get loop IV as an affine expression for lower/upper bound. Size of
588  // lb/ubOperands is guaranteed to be at least one.
589  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
590  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
591 
592  // Add symbol operands from original lower/upper bound.
593  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
594  lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
595  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
596  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
597 
598  // Add a new symbol operand which is the tile size for this loop.
599  lbOperands.push_back(tileSize);
600  ubOperands.push_back(tileSize);
601 
602  SmallVector<AffineExpr, 4> lbBoundExprs;
603  SmallVector<AffineExpr, 4> ubBoundExprs;
604  lbBoundExprs.reserve(origLbMap.getNumResults());
605  ubBoundExprs.reserve(origUbMap.getNumResults());
606 
607  // Get tiling parameter as an affine expression for lb/ub.
608  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
609  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
610 
611  // Insert lb as ((inter-tile loop IV - origlb) * tilingParameter) + origlb.
612  lbBoundExprs.push_back(
613  ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
614  origLowerBoundExpr);
615 
616  // Get the origLoopStep as an affine expression.
617  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());
618 
619  // Insert ub as ((inter-tile loop IV - origlb) * tilingParameter) +
620  // (tilingParameter * origLoopStep) + origlb.
621  ubBoundExprs.push_back(
622  ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
623  (ubTileParameter * origLoopStep) + origLowerBoundExpr);
624 
625  ubBoundExprs.append(origUbMap.getResults().begin(),
626  origUbMap.getResults().end());
627 
628  AffineMap lbMap =
629  AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
630  lbBoundExprs, b.getContext());
631  newIntraTileLoop.setLowerBound(lbOperands, lbMap);
632 
633  AffineMap ubMap =
634  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
635  ubBoundExprs, b.getContext());
636  newIntraTileLoop.setUpperBound(ubOperands, ubMap);
637 
638  // Original loop step must be preserved.
639  newIntraTileLoop.setStep(origLoop.getStep());
640 }
641 
642 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
643 // TODO: Handle non-constant lower bounds.
644 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
645  AffineForOp newLoop, Value tileSize) {
646  OperandRange newLbOperands = origLoop.getLowerBoundOperands();
647 
648  // The lower bounds for inter-tile loops are the same as the corresponding
649  // lower bounds of the original loops.
650  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
651 
652  // The new upper bound map for inter-tile loops, assuming constant lower
653  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
654  // originalLowerBound), tilingParameter), where the tiling parameter is the
655  // respective tile size for that loop. E.g., if the original ubMap was
656  // () -> (1024), the new map will be
657  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
658  // Therefore a new symbol operand is inserted in the map and the result
659  // expression is overwritten.
660 
661  assert(origLoop.hasConstantLowerBound() &&
662  "expected input loops to have constant lower bound.");
663 
664  // Get lower bound of original loop as an affine expression.
665  AffineExpr origLowerBoundExpr;
666  origLowerBoundExpr =
667  b.getAffineConstantExpr(origLoop.getConstantLowerBound());
668 
669  // Add dim operands from original upper bound.
670  SmallVector<Value, 4> ubOperands;
671  AffineBound ub = origLoop.getUpperBound();
672  ubOperands.reserve(ub.getNumOperands() + 1);
673  AffineMap origUbMap = ub.getMap();
674  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
675  ubOperands.push_back(ub.getOperand(j));
676 
677  // Add symbol operands from original upper bound.
678  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
679  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
680 
681  // Add a new symbol operand which is the tile size for this loop.
682  ubOperands.push_back(tileSize);
683 
684  // Get tiling parameter as an affine expression.
685  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
686 
687  SmallVector<AffineExpr, 4> boundExprs;
688  boundExprs.reserve(origUbMap.getNumResults());
689  int64_t origUpperBound;
690  AffineExpr origUpperBoundExpr;
691 
692  // If upper bound for the original loop is constant, then the constant can
693  // be obtained as an affine expression straight away.
694  if (origLoop.hasConstantUpperBound()) {
695  origUpperBound = origLoop.getConstantUpperBound();
696 
697  // Get original constant upper bound as an affine expression.
698  origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
699 
700  // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
701  // originalLowerBound), tilingParameter).
702  boundExprs.push_back(
703  origLowerBoundExpr +
704  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
705  } else {
706  // If the upper bound of the original loop is not constant, two cases are
707  // possible, although their handling is the same. 1) The ubMap has only one
708  // result expression. E.g.
709  // affine.for %i = 5 to %ub
710  //
711  // A symbol operand is added which represents the tiling parameter. The
712  // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
713  // where 's0' is the original upper bound and 's1' is the tiling
714  // parameter. 2) When ubMap has more than one result expression. E.g.
715  // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
716  // affine.for %i = 5 to min #map0()[%s0, %s1]
717  //
718  // A symbol operand is added which represents the tiling parameter. The
719  // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
720  // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
721 
722  // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
723  // originalLowerBound), tilingParameter).
724  for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
725  boundExprs.push_back(
726  origLowerBoundExpr +
727  (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
728  }
729 
730  AffineMap ubMap =
731  AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
732  boundExprs, b.getContext());
733  newLoop.setUpperBound(ubOperands, ubMap);
734 
735  // Original loop step must be preserved.
736  newLoop.setStep(origLoop.getStep());
737 }
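// For illustration (a sketch, not from the original source), for an original
// loop
//   affine.for %i = 0 to 1024
// tiled with a tile size given by an SSA value %ts, the inter-tile loop
// becomes
//   affine.for %ii = 0 to #map()[%ts]
// with #map = ()[s0] -> (0 + (1024 - 0) ceildiv s0), i.e. it iterates over
// tile indices rather than over elements.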
738 
739 /// Constructs and sets new loop bounds after tiling for the case of
740 /// hyper-rectangular index sets, where the bounds of one dimension do not
741 /// depend on other dimensions and tiling parameters are captured from SSA
742 /// values. Bounds of each dimension can thus be treated independently,
743 /// and deriving the new bounds is much simpler and faster than for the case of
744 /// tiling arbitrary polyhedral shapes.
745 static void constructParametricallyTiledIndexSetHyperRect(
746  MutableArrayRef<AffineForOp> origLoops,
747  MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
748  assert(!origLoops.empty() && "expected at least one loop in band");
749  assert(origLoops.size() == tileSizes.size() &&
750  "expected tiling parameter for each loop in band.");
751 
752  OpBuilder b(origLoops[0].getOperation());
753  unsigned width = origLoops.size();
754 
755  // Set bounds for tile space loops.
756  for (unsigned i = 0; i < width; ++i) {
757  setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
758  }
759 
760  // Set bounds for intra-tile loops.
761  for (unsigned i = 0; i < width; ++i) {
762  setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
763  newLoops[i + width], tileSizes[i]);
764  }
765 }
766 
767 /// Constructs and sets new loop bounds after tiling for the case of
768 /// hyper-rectangular index sets, where the bounds of one dimension do not
769 /// depend on other dimensions. Bounds of each dimension can thus be treated
770 /// independently, and deriving the new bounds is much simpler and faster
771 /// than for the case of tiling arbitrary polyhedral shapes.
772 static void
773 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
774  MutableArrayRef<AffineForOp> newLoops,
775  ArrayRef<unsigned> tileSizes) {
776  assert(!origLoops.empty());
777  assert(origLoops.size() == tileSizes.size());
778 
779  OpBuilder b(origLoops[0].getOperation());
780  unsigned width = origLoops.size();
781 
782  // Bounds for tile space loops.
783  for (unsigned i = 0; i < width; i++) {
784  OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
785  OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
786  newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
787  newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
788  // If the step size of the original loop is x and the tile size is y, then
789  // after tiling, the tile space loop's step size becomes x*y.
790  newLoops[i].setStep(tileSizes[i] * origLoops[i].getStep());
791  }
792  // Bounds for intra-tile loops.
793  for (unsigned i = 0; i < width; i++) {
794  int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
795  Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
796  // The lower bound is just the tile-space loop.
797  AffineMap lbMap = b.getDimIdentityMap();
798  newLoops[width + i].setLowerBound(
799  /*operands=*/newLoops[i].getInductionVar(), lbMap);
800  // The step size of each intra-tile loop is just the original loop's step size.
801  newLoops[width + i].setStep(origLoops[i].getStep());
802 
803  // Set the upper bound.
804  if (mayBeConstantCount && mayBeConstantCount.value() < tileSizes[i]) {
805  // Trip count is less than the tile size: upper bound is lower bound +
806  // trip count * stepSize.
807  AffineMap ubMap = b.getSingleDimShiftAffineMap(
808  mayBeConstantCount.value() * origLoops[i].getStep());
809  newLoops[width + i].setUpperBound(
810  /*operands=*/newLoops[i].getInductionVar(), ubMap);
811  } else if (largestDiv % tileSizes[i] != 0) {
812  // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
813  // Construct the upper bound map; the operands are the original operands
814  // with 'i' (tile-space loop) appended to it. The new upper bound map is
815  // the original one with an additional expression i + tileSize * stepSize
816  // appended.
817 
818  // Add dim operands from original upper bound.
819  SmallVector<Value, 4> ubOperands;
820  AffineBound ub = origLoops[i].getUpperBound();
821  ubOperands.reserve(ub.getNumOperands() + 1);
822  AffineMap origUbMap = ub.getMap();
823  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
824  ubOperands.push_back(ub.getOperand(j));
825 
826  // Add dim operand for new loop upper bound.
827  ubOperands.push_back(newLoops[i].getInductionVar());
828 
829  // Add symbol operands from original upper bound.
830  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
831  ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
832 
833  SmallVector<AffineExpr, 4> boundExprs;
834  boundExprs.reserve(1 + origUbMap.getNumResults());
835  AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
836  // The new upper bound map is the original one with an additional
837  // expression i + tileSize * stepSize (of original loop) appended.
838  boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStep());
839  boundExprs.append(origUbMap.getResults().begin(),
840  origUbMap.getResults().end());
841  AffineMap ubMap =
842  AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
843  boundExprs, b.getContext());
844  newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
845  } else {
846  // No need of the min expression.
847  AffineExpr dim = b.getAffineDimExpr(0);
848  AffineMap ubMap =
849  AffineMap::get(1, 0, dim + tileSizes[i] * origLoops[i].getStep());
850  newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
851  }
852  }
853 }
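// For illustration (a sketch, not from the original source), tiling
//   affine.for %i = 0 to 100 { ... }
// with a tile size of 32 produces bounds equivalent to
//   affine.for %ii = 0 to 100 step 32 {
//     affine.for %i = #lb(%ii) to min #ub(%ii) { ... }
//   }
// where #lb = (d0) -> (d0) and #ub = (d0) -> (d0 + 32, 100); the min is
// needed because 32 does not divide the trip count of 100.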
854 
855 /// Tiles the specified band of perfectly nested loops creating tile-space loops
856 /// and intra-tile loops. A band is a contiguous set of loops.
857 // TODO: handle non hyper-rectangular spaces.
858 LogicalResult
859 mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
860  ArrayRef<unsigned> tileSizes,
861  SmallVectorImpl<AffineForOp> *tiledNest) {
862  if (input.empty())
863  return success();
864 
865  if (failed(performPreTilingChecks(input, tileSizes)))
866  return failure();
867 
868  MutableArrayRef<AffineForOp> origLoops = input;
869  AffineForOp rootAffineForOp = origLoops[0];
870 
871  // Note that width is at least one since the band isn't empty.
872  unsigned width = input.size();
873  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
874 
875  // Construct a tiled loop nest without setting their bounds. Bounds are
876  // set later.
877  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
878 
879  SmallVector<Value, 8> origLoopIVs;
880  extractForInductionVars(input, &origLoopIVs);
881 
882  // Set loop bounds for the tiled loop nest.
883  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
884 
885  // Replace original IVs with intra-tile loop IVs.
886  for (unsigned i = 0; i < width; i++)
887  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
888 
889  // Erase the old loop nest.
890  rootAffineForOp.erase();
891 
892  if (tiledNest)
893  *tiledNest = std::move(tiledLoops);
894 
895  return success();
896 }
897 
898 /// Tiles the specified band of perfectly nested loops creating tile-space
899 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
900 /// is a contiguous set of loops.
901 // TODO: handle non hyper-rectangular spaces.
902 LogicalResult
903 mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
904  ArrayRef<Value> tileSizes,
905  SmallVectorImpl<AffineForOp> *tiledNest) {
906  if (input.empty())
907  return success();
908 
909  if (failed(performPreTilingChecks(input, tileSizes)))
910  return failure();
911 
912  MutableArrayRef<AffineForOp> origLoops = input;
913  AffineForOp rootAffineForOp = origLoops[0];
914  unsigned width = input.size();
915  SmallVector<AffineForOp, 6> tiledLoops(2 * width);
916 
917  // Construct a tiled loop nest without setting their bounds. Bounds are
918  // set later.
919  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
920 
921  SmallVector<Value, 8> origLoopIVs;
922  extractForInductionVars(input, &origLoopIVs);
923 
924  // Set loop bounds for the tiled loop nest.
925  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
926  tileSizes);
927 
928  // Replace original IVs with intra-tile loop IVs.
929  for (unsigned i = 0; i < width; i++)
930  origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
931 
932  // Erase the old loop nest.
933  rootAffineForOp.erase();
934 
935  if (tiledNest)
936  *tiledNest = std::move(tiledLoops);
937 
938  return success();
939 }
940 
941 /// Get the perfectly nested sequence of loops starting at the root of the
942 /// loop nest. A loop is perfectly nested iff the first op in its body is
943 /// another AffineForOp and the second op is a terminator (i.e., the body has
944 /// exactly two operations).
945 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
946  AffineForOp root) {
947  for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
948  nestedLoops.push_back(root);
949  Block &body = root.getRegion().front();
950  if (body.begin() != std::prev(body.end(), 2))
951  return;
952 
953  root = dyn_cast<AffineForOp>(&body.front());
954  if (!root)
955  return;
956  }
957 }
958 
959 /// Identify valid and profitable bands of loops to tile. This is currently just
960 /// a temporary placeholder to test the mechanics of tiled code generation.
961 /// Returns all maximal outermost perfect loop nests to tile.
962 void mlir::getTileableBands(func::FuncOp f,
963  std::vector<SmallVector<AffineForOp, 6>> *bands) {
964  // Get maximal perfect nest of 'affine.for' insts starting from root
965  // (inclusive).
966  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
967  SmallVector<AffineForOp, 6> band;
968  getPerfectlyNestedLoops(band, forOp);
969  bands->push_back(band);
970  }
971 }
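// For illustration only (a hedged usage sketch, not part of the original
// file), a pass could tile each band it finds with a fixed tile size of 32,
// assuming `func` is the func::FuncOp being processed:
//   std::vector<SmallVector<AffineForOp, 6>> bands;
//   getTileableBands(func, &bands);
//   for (SmallVector<AffineForOp, 6> &band : bands) {
//     SmallVector<unsigned, 6> tileSizes(band.size(), 32);
//     (void)tilePerfectlyNested(band, tileSizes, /*tiledNest=*/nullptr);
//   }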
972 
973 /// Unrolls this loop completely.
974 LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
975  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
976  if (mayBeConstantTripCount.has_value()) {
977  uint64_t tripCount = mayBeConstantTripCount.value();
978  if (tripCount == 0)
979  return success();
980  if (tripCount == 1)
981  return promoteIfSingleIteration(forOp);
982  return loopUnrollByFactor(forOp, tripCount);
983  }
984  return failure();
985 }
986 
987 /// Unrolls this loop by the specified factor or by the trip count (if
988 /// constant), whichever is lower.
989 LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
990  uint64_t unrollFactor) {
991  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
992  if (mayBeConstantTripCount.has_value() &&
993  mayBeConstantTripCount.value() < unrollFactor)
994  return loopUnrollByFactor(forOp, mayBeConstantTripCount.value());
995  return loopUnrollByFactor(forOp, unrollFactor);
996 }
997 
998 /// Generates unrolled copies of the loop body block 'loopBodyBlock', whose
999 /// associated induction variable is 'forOpIV', by 'unrollFactor', calling
1000 /// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
1001 /// annotates the ops in each unrolled iteration using 'annotateFn'.
1002 static void generateUnrolledLoop(
1003  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
1004  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
1005  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1006  ValueRange iterArgs, ValueRange yieldedValues) {
1007  // Builder to insert unrolled bodies just before the terminator of the body of
1008  // 'forOp'.
1009  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
1010 
1011  if (!annotateFn)
1012  annotateFn = [](unsigned, Operation *, OpBuilder) {};
1013 
1014  // Keep a pointer to the last non-terminator operation in the original block
1015  // so that we know what to clone (since we are doing this in-place).
1016  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
1017 
1018  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
1019  SmallVector<Value, 4> lastYielded(yieldedValues);
1020 
1021  for (unsigned i = 1; i < unrollFactor; i++) {
1022  BlockAndValueMapping operandMap;
1023 
1024  // Prepare operand map.
1025  operandMap.map(iterArgs, lastYielded);
1026 
1027  // If the induction variable is used, create a remapping to the value for
1028  // this unrolled instance.
1029  if (!forOpIV.use_empty()) {
1030  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
1031  operandMap.map(forOpIV, ivUnroll);
1032  }
1033 
1034  // Clone the original body of 'forOp'.
1035  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
1036  Operation *clonedOp = builder.clone(*it, operandMap);
1037  annotateFn(i, clonedOp, builder);
1038  }
1039 
1040  // Update yielded values.
1041  for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
1042  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
1043  }
1044 
1045  // Make sure we annotate the Ops in the original body. We do this last so that
1046  // any annotations are not copied into the cloned Ops above.
1047  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
1048  annotateFn(0, &*it, builder);
1049 
1050  // Update operands of the yield statement.
1051  loopBodyBlock->getTerminator()->setOperands(lastYielded);
1052 }
1053 
1054 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
1055 /// count is not a multiple of `unrollFactor`.
1056 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
1057  uint64_t unrollFactor) {
1058  // Insert the cleanup loop right after 'forOp'.
1059  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
1060  auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
1061 
1062  // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
1063  // and produce results for the original users of `forOp` results.
1064  auto results = forOp.getResults();
1065  auto cleanupResults = cleanupForOp.getResults();
1066  auto cleanupIterOperands = cleanupForOp.getIterOperands();
1067 
1068  for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
1069  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
1070  cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
1071  }
1072 
1073  AffineMap cleanupMap;
1074  SmallVector<Value, 4> cleanupOperands;
1075  getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
1076  if (!cleanupMap)
1077  return failure();
1078 
1079  cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
1080  // Promote the loop body up if this has turned into a single iteration loop.
1081  (void)promoteIfSingleIteration(cleanupForOp);
1082 
1083  // Adjust upper bound of the original loop; this is the same as the lower
1084  // bound of the cleanup loop.
1085  forOp.setUpperBound(cleanupOperands, cleanupMap);
1086  return success();
1087 }
1088 
1089 /// Unrolls this loop by the specified factor. Returns success if the loop
1090 /// is successfully unrolled.
1091 LogicalResult mlir::loopUnrollByFactor(
1092  AffineForOp forOp, uint64_t unrollFactor,
1093  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1094  bool cleanUpUnroll) {
1095  assert(unrollFactor > 0 && "unroll factor should be positive");
1096 
1097  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1098  if (unrollFactor == 1) {
1099  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1100  failed(promoteIfSingleIteration(forOp)))
1101  return failure();
1102  return success();
1103  }
1104 
1105  // Nothing in the loop body other than the terminator.
1106  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1107  return success();
1108 
1109  // If the trip count is lower than the unroll factor, no unrolled body.
1110  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1111  if (cleanUpUnroll) {
1112  // Unroll the cleanup loop if cleanUpUnroll is specified.
1113  return loopUnrollFull(forOp);
1114  }
1115 
1116  return failure();
1117  }
1118 
1119  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1120  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1121  // Loops where the lower bound is a max expression or the upper bound is
1122  // a min expression and the trip count doesn't divide the unroll factor
1123  // can't be unrolled since the lower bound of the cleanup loop in such cases
1124  // cannot be expressed as an affine function or a max over affine functions.
1125  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1126  forOp.getUpperBoundMap().getNumResults() != 1)
1127  return failure();
1128  if (cleanUpUnroll)
1129  // Force unroll including cleanup loop
1130  return loopUnrollFull(forOp);
1131  if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1132  assert(false && "cleanup loop lower bound map for single result lower "
1133  "and upper bound maps can always be determined");
1134  }
1135 
1136  ValueRange iterArgs(forOp.getRegionIterArgs());
1137  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1138 
1139  // Scale the step of loop being unrolled by unroll factor.
1140  int64_t step = forOp.getStep();
1141  forOp.setStep(step * unrollFactor);
1142  generateUnrolledLoop(
1143  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1144  [&](unsigned i, Value iv, OpBuilder b) {
1145  // iv' = iv + i * step
1146  auto d0 = b.getAffineDimExpr(0);
1147  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1148  return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1149  },
1150  /*annotateFn=*/annotateFn,
1151  /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1152 
1153  // Promote the loop body up if this has turned into a single iteration loop.
1154  (void)promoteIfSingleIteration(forOp);
1155  return success();
1156 }
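// For illustration (a sketch, not from the original source), unrolling
//   affine.for %i = 0 to 10 { S(%i) }
// by a factor of 4 scales the step of the main loop and emits a cleanup loop
// for the remaining iterations:
//   affine.for %i = 0 to 8 step 4 { S(%i); S(%i + 1); S(%i + 2); S(%i + 3) }
//   affine.for %i = 8 to 10 { S(%i) }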
1157 
1158 LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
1159  uint64_t unrollJamFactor) {
1160  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1161  if (mayBeConstantTripCount.has_value() &&
1162  mayBeConstantTripCount.value() < unrollJamFactor)
1163  return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.value());
1164  return loopUnrollJamByFactor(forOp, unrollJamFactor);
1165 }
1166 
1167 /// Check if all control operands of all loops are defined outside of `forOp`
1168 /// and return false if not.
1169 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1170  auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1171  for (auto controlOperand : aForOp.getControlOperands()) {
1172  if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1173  return WalkResult::interrupt();
1174  }
1175  return WalkResult::advance();
1176  });
1177  return !walkResult.wasInterrupted();
1178 }
1179 
1180 // Gathers all maximal sub-blocks of operations that do not themselves
1181 // include a for op (an operation could still have a descendant for op
1182 // in its tree). Ignores the block terminators.
1183 struct JamBlockGatherer {
1184  // Store iterators to the first and last op of each sub-block found.
1185  std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
1186 
1187  // This is a linear time walk.
1188  void walk(Operation *op) {
1189  for (auto &region : op->getRegions())
1190  for (auto &block : region)
1191  walk(block);
1192  }
1193 
1194  void walk(Block &block) {
1195  for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
1196  auto subBlockStart = it;
1197  while (it != e && !isa<AffineForOp>(&*it))
1198  ++it;
1199  if (it != subBlockStart)
1200  subBlocks.emplace_back(subBlockStart, std::prev(it));
1201  // Process all for ops that appear next.
1202  while (it != e && isa<AffineForOp>(&*it))
1203  walk(&*it++);
1204  }
1205  }
1206 };
1207 
1208 /// Unrolls and jams this loop by the specified factor.
1209 LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
1210  uint64_t unrollJamFactor) {
1211  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1212 
1213  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1214  if (unrollJamFactor == 1) {
1215  if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1216  failed(promoteIfSingleIteration(forOp)))
1217  return failure();
1218  return success();
1219  }
1220 
1221  // Nothing in the loop body other than the terminator.
1222  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1223  return success();
1224 
1225  // If the trip count is lower than the unroll jam factor, no unroll jam.
1226  if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1227  LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1228  return failure();
1229  }
1230 
1231  // If any control operand of any inner loop of `forOp` is defined within
1232  // `forOp`, no unroll jam.
1233  if (!areInnerBoundsInvariant(forOp))
1234  return failure();
1235 
1236  // Gather all sub-blocks to jam upon the loop being unrolled.
1237  JamBlockGatherer jbg;
1238  jbg.walk(forOp);
1239  auto &subBlocks = jbg.subBlocks;
1240 
1241  // Collect loops with iter_args.
1242  SmallVector<AffineForOp, 4> loopsWithIterArgs;
1243  forOp.walk([&](AffineForOp aForOp) {
1244  if (aForOp.getNumIterOperands() > 0)
1245  loopsWithIterArgs.push_back(aForOp);
1246  });
1247 
1248  // Get supported reductions to be used for creating reduction ops at the end.
1249  SmallVector<LoopReduction> reductions;
1250  if (forOp.getNumIterOperands() > 0)
1251  getSupportedReductions(forOp, reductions);
1252 
1253  // Generate the cleanup loop if trip count isn't a multiple of
1254  // unrollJamFactor.
1255  if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1256  // Loops where the lower bound is a max expression or the upper bound is
1257  // a min expression and the trip count doesn't divide the unroll factor
1258  // can't be unrolled since the lower bound of the cleanup loop in such cases
1259  // cannot be expressed as an affine function or a max over affine functions.
1260  if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1261  forOp.getUpperBoundMap().getNumResults() != 1)
1262  return failure();
1263  if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1264  assert(false && "cleanup loop lower bound map for single result lower "
1265  "and upper bound maps can always be determined");
1266  }
1267 
1268  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1269  // iteration. There are (`unrollJamFactor` - 1) iterations.
1270  SmallVector<BlockAndValueMapping, 4> operandMaps(unrollJamFactor - 1);
1271 
1272  // For any loop with iter_args, replace it with a new loop that has
1273  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1274  // operands.
1275  SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1276  OpBuilder builder(forOp.getContext());
1277  for (AffineForOp oldForOp : loopsWithIterArgs) {
1278  SmallVector<Value, 4> dupIterOperands, dupIterArgs, dupYieldOperands;
1279  ValueRange oldIterOperands = oldForOp.getIterOperands();
1280  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1281  ValueRange oldYieldOperands =
1282  cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1283  // Get additional iterOperands, iterArgs, and yield operands. We will
1284  // fix iterOperands and yield operands after cloning of sub-blocks.
1285  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1286  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1287  dupIterArgs.append(oldIterArgs.begin(), oldIterArgs.end());
1288  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1289  }
1290  // Create a new loop with additional iterOperands, iter_args and yield
1291  // operands. This new loop will take the loop body of the original loop.
1292  AffineForOp newForOp = mlir::replaceForOpWithNewYields(
1293  builder, oldForOp, dupIterOperands, dupYieldOperands, dupIterArgs);
1294  newLoopsWithIterArgs.push_back(newForOp);
1295  // `forOp` has been replaced with a new loop.
1296  if (oldForOp == forOp)
1297  forOp = newForOp;
1298  assert(oldForOp.use_empty() && "old for op should not have any user");
1299  oldForOp.erase();
1300  // Update `operandMaps` for `newForOp` iterArgs and results.
1301  ValueRange newIterArgs = newForOp.getRegionIterArgs();
1302  unsigned oldNumIterArgs = oldIterArgs.size();
1303  ValueRange newResults = newForOp.getResults();
1304  unsigned oldNumResults = newResults.size() / unrollJamFactor;
1305  assert(oldNumIterArgs == oldNumResults &&
1306  "oldNumIterArgs must be the same as oldNumResults");
1307  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1308  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1309  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1310  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1311  // to those in the `i`th new set.
1312  operandMaps[i - 1].map(newIterArgs[j],
1313  newIterArgs[i * oldNumIterArgs + j]);
1314  operandMaps[i - 1].map(newResults[j],
1315  newResults[i * oldNumResults + j]);
1316  }
1317  }
1318  }
1319 
1320  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1321  int64_t step = forOp.getStep();
1322  forOp.setStep(step * unrollJamFactor);
1323 
1324  auto forOpIV = forOp.getInductionVar();
1325  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1326  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1327  for (auto &subBlock : subBlocks) {
1328  // Builder to insert unroll-jammed bodies. Insert right at the end of
1329  // sub-block.
1330  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1331 
1332  // If the induction variable is used, create a remapping to the value for
1333  // this unrolled instance.
1334  if (!forOpIV.use_empty()) {
1335  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1336  auto d0 = builder.getAffineDimExpr(0);
1337  auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1338  auto ivUnroll =
1339  builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1340  operandMaps[i - 1].map(forOpIV, ivUnroll);
1341  }
1342  // Clone the sub-block being unroll-jammed.
1343  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1344  builder.clone(*it, operandMaps[i - 1]);
1345  }
1346  // Fix iterOperands and yield op operands of newly created loops.
1347  for (auto newForOp : newLoopsWithIterArgs) {
1348  unsigned oldNumIterOperands =
1349  newForOp.getNumIterOperands() / unrollJamFactor;
1350  unsigned numControlOperands = newForOp.getNumControlOperands();
1351  auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1352  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1353  assert(oldNumIterOperands == oldNumYieldOperands &&
1354  "oldNumIterOperands must be the same as oldNumYieldOperands");
1355  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1356  // The `i`th duplication of an old iterOperand or yield op operand
1357  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1358  // if such mapped value exists.
1359  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1360  operandMaps[i - 1].lookupOrDefault(
1361  newForOp.getOperand(numControlOperands + j)));
1362  yieldOp.setOperand(
1363  i * oldNumYieldOperands + j,
1364  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1365  }
1366  }
1367  }
1368  if (forOp.getNumResults() > 0) {
1369  // Create reduction ops to combine every `unrollJamFactor` related results
1370  // into one value. For example, for %0:2 = affine.for ... and addf, we add
1371  // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1372  // %1.
1373  builder.setInsertionPointAfter(forOp);
1374  auto loc = forOp.getLoc();
1375  unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1376  for (LoopReduction &reduction : reductions) {
1377  unsigned pos = reduction.iterArgPosition;
1378  Value lhs = forOp.getResult(pos);
1379  Value rhs;
1380  SmallPtrSet<Operation *, 4> newOps;
1381  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1382  rhs = forOp.getResult(i * oldNumResults + pos);
1383  // Create ops based on reduction type.
1384  lhs = arith::getReductionOp(reduction.kind, builder, loc, lhs, rhs);
1385  if (!lhs)
1386  return failure();
1387  Operation *op = lhs.getDefiningOp();
1388  assert(op && "Reduction op should have been created");
1389  newOps.insert(op);
1390  }
1391  // Replace all uses except those in newly created reduction ops.
1392  forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1393  }
1394  }
1395 
1396  // Promote the loop body up if this has turned into a single iteration loop.
1397  (void)promoteIfSingleIteration(forOp);
1398  return success();
1399 }
1400 
1401 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1402 /// nested within 'forOpA' as the only non-terminator operation in its block.
1403 void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1404  assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1405  auto &forOpABody = forOpA.getBody()->getOperations();
1406  auto &forOpBBody = forOpB.getBody()->getOperations();
1407 
1408  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1409  // before forOpA (in forOpA's parent's block); this should leave forOpA's
1410  // body containing only the terminator.
1411  forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1412  forOpABody, forOpABody.begin(),
1413  std::prev(forOpABody.end()));
1414  // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1415  // body (this leaves forOpB's body containing only the terminator).
1416  forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1417  std::prev(forOpBBody.end()));
1418  // 3) Splice forOpA into the beginning of forOpB's body.
1419  forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1420  Block::iterator(forOpA));
1421 }
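
// As an illustration (bounds are hypothetical, not taken from the original
// source), interchanging
//
// ```mlir
// affine.for %i = 0 to 16 {
//   affine.for %j = 0 to 32 {
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```
//
// produces
//
// ```mlir
// affine.for %j = 0 to 32 {
//   affine.for %i = 0 to 16 {
//     "test.use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```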
1422 
1423 // Checks each dependence component against the permutation to see if the
1424 // desired loop interchange would violate dependences by making the
1425 // dependence component lexicographically negative.
1426 static bool checkLoopInterchangeDependences(
1427  const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1428  ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1429  // Invert permutation map.
1430  unsigned maxLoopDepth = loops.size();
1431  SmallVector<unsigned, 4> loopPermMapInv;
1432  loopPermMapInv.resize(maxLoopDepth);
1433  for (unsigned i = 0; i < maxLoopDepth; ++i)
1434  loopPermMapInv[loopPermMap[i]] = i;
1435 
1436  // Check each dependence component against the permutation to see if the
1437  // desired loop interchange permutation would make the dependence vectors
1438  // lexicographically negative.
1439  // Example 1: [-1, 1][0, 0]
1440  // Example 2: [0, 0][-1, 1]
1441  for (const auto &depComps : depCompsVec) {
1442  assert(depComps.size() >= maxLoopDepth);
1443  // Check if the first non-zero dependence component is positive.
1444  // This iterates through loops in the desired order.
1445  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1446  unsigned permIndex = loopPermMapInv[j];
1447  assert(depComps[permIndex].lb);
1448  int64_t depCompLb = *depComps[permIndex].lb;
1449  if (depCompLb > 0)
1450  break;
1451  if (depCompLb < 0)
1452  return false;
1453  }
1454  }
1455  return true;
1456 }
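
// Worked example (illustrative): for two loops with a single dependence whose
// components are [2, -1] (outer, inner) and the interchange permutation
// loopPermMap = {1, 0}, the components are scanned in the permuted loop order,
// so the first one examined is -1; since the first non-zero component is
// negative, the permutation is rejected.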
1457 
1458 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1459 /// nested sequence of loops in 'loops' would violate dependences.
1460 bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
1461  ArrayRef<unsigned> loopPermMap) {
1462  // Gather dependence components for dependences between all ops in loop nest
1463  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1464  assert(loopPermMap.size() == loops.size());
1465  unsigned maxLoopDepth = loops.size();
1466  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1467  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1468  return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1469 }
1470 
1471 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1472 /// in it from outermost to innermost.
1473 bool LLVM_ATTRIBUTE_UNUSED
1474 mlir::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1475  assert(!loops.empty() && "no loops provided");
1476 
1477  // We already know that the block can't be empty.
1478  auto hasTwoElements = [](Block *block) {
1479  auto secondOpIt = std::next(block->begin());
1480  return secondOpIt != block->end() && &*secondOpIt == &block->back();
1481  };
1482 
1483  auto enclosingLoop = loops.front();
1484  for (auto loop : loops.drop_front()) {
1485  auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1486  // parentForOp's body should be just this loop and the terminator.
1487  if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1488  return false;
1489  enclosingLoop = loop;
1490  }
1491  return true;
1492 }
1493 
1494 // input[i] should move from position i -> permMap[i]. Returns the position in
1495 // `input` that becomes the new outermost loop.
1496 unsigned mlir::permuteLoops(MutableArrayRef<AffineForOp> input,
1497  ArrayRef<unsigned> permMap) {
1498  assert(input.size() == permMap.size() && "invalid permutation map size");
1499  // Check whether the permutation spec is valid. This is a small vector - we'll
1500  // just sort and check if it's iota.
1501  SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1502  llvm::sort(checkPermMap);
1503  if (llvm::any_of(llvm::enumerate(checkPermMap),
1504  [](const auto &en) { return en.value() != en.index(); }))
1505  assert(false && "invalid permutation map");
1506 
1507  // Nothing to do.
1508  if (input.size() < 2)
1509  return 0;
1510 
1511  assert(isPerfectlyNested(input) && "input not perfectly nested");
1512 
1513  // Compute the inverse mapping, invPermMap: since input[i] goes to position
1514  // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1515  SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1516  for (unsigned i = 0, e = input.size(); i < e; ++i)
1517  invPermMap.push_back({permMap[i], i});
1518  llvm::sort(invPermMap);
1519 
1520  // Move the innermost loop body to the loop that would be the innermost in the
1521  // permuted nest (only if the innermost loop is going to change).
1522  if (permMap.back() != input.size() - 1) {
1523  auto *destBody = input[invPermMap.back().second].getBody();
1524  auto *srcBody = input.back().getBody();
1525  destBody->getOperations().splice(destBody->begin(),
1526  srcBody->getOperations(), srcBody->begin(),
1527  std::prev(srcBody->end()));
1528  }
1529 
1530  // We'll move each loop in `input` in the reverse order so that its body is
1531  // empty when we are moving it; this incurs zero copies and no erasing.
1532  for (int i = input.size() - 1; i >= 0; --i) {
1533  // If this has to become the outermost loop after permutation, add it to the
1534  // parent block of the original root.
1535  if (permMap[i] == 0) {
1536  // If the root remains the same, nothing to do.
1537  if (i == 0)
1538  continue;
1539  // Make input[i] the new outermost loop moving it into parentBlock.
1540  auto *parentBlock = input[0]->getBlock();
1541  parentBlock->getOperations().splice(Block::iterator(input[0]),
1542  input[i]->getBlock()->getOperations(),
1543  Block::iterator(input[i]));
1544  continue;
1545  }
1546 
1547  // If the parent in the permuted order is the same as in the original,
1548  // nothing to do.
1549  unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1550  if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1551  continue;
1552 
1553  // Move input[i] to its surrounding loop in the transformed nest.
1554  auto *destBody = input[parentPosInInput].getBody();
1555  destBody->getOperations().splice(destBody->begin(),
1556  input[i]->getBlock()->getOperations(),
1557  Block::iterator(input[i]));
1558  }
1559 
1560  return invPermMap[0].second;
1561 }
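
// Usage sketch (hypothetical nest and permutation): for a perfectly nested
// 3-d nest {%i, %j, %k} listed outermost to innermost, permMap = {2, 0, 1}
// sends %i to the innermost position, %j to the outermost and %k to the
// middle; the call returns 1, the index of %j in `input`, which becomes the
// new outermost loop.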
1562 
1563 // Sinks all sequential loops to the innermost levels (while preserving
1564 // relative order among them) and moves all parallel loops to the
1565 // outermost (while again preserving relative order among them).
1566 AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) {
1567  SmallVector<AffineForOp, 4> loops;
1568  getPerfectlyNestedLoops(loops, forOp);
1569  if (loops.size() < 2)
1570  return forOp;
1571 
1572  // Gather dependence components for dependences between all ops in loop nest
1573  // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1574  unsigned maxLoopDepth = loops.size();
1575  std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1576  getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1577 
1578  // Mark loops as either parallel or sequential.
1579  SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1580  for (auto &depComps : depCompsVec) {
1581  assert(depComps.size() >= maxLoopDepth);
1582  for (unsigned j = 0; j < maxLoopDepth; ++j) {
1583  DependenceComponent &depComp = depComps[j];
1584  assert(depComp.lb.has_value() && depComp.ub.has_value());
1585  if (depComp.lb.value() != 0 || depComp.ub.value() != 0)
1586  isParallelLoop[j] = false;
1587  }
1588  }
1589 
1590  // Count the number of parallel loops.
1591  unsigned numParallelLoops = 0;
1592  for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
1593  if (isParallelLoop[i])
1594  ++numParallelLoops;
1595 
1596  // Compute permutation of loops that sinks sequential loops (and thus raises
1597  // parallel loops) while preserving relative order.
1598  SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1599  unsigned nextSequentialLoop = numParallelLoops;
1600  unsigned nextParallelLoop = 0;
1601  for (unsigned i = 0; i < maxLoopDepth; ++i) {
1602  if (isParallelLoop[i]) {
1603  loopPermMap[i] = nextParallelLoop++;
1604  } else {
1605  loopPermMap[i] = nextSequentialLoop++;
1606  }
1607  }
1608 
1609  // Check if permutation 'loopPermMap' would violate dependences.
1610  if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1611  return forOp;
1612  // Perform loop interchange according to permutation 'loopPermMap'.
1613  unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1614  return loops[loopNestRootIndex];
1615 }
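
// Illustrative example (hypothetical): in a 2-d nest where only the outer
// loop carries a dependence (components [1, 0]) and the inner loop is
// parallel, the computed permutation is {1, 0}: the parallel loop is moved
// outermost and the sequential one is sunk innermost, provided the
// interchange keeps every dependence vector lexicographically non-negative.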
1616 
1617 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1618 // lower (resp. upper) loop bound. When called for both the lower and upper
1619 // bounds, the resulting IR resembles:
1620 //
1621 // ```mlir
1622 // affine.for %i = max (`iv, ...) to min (`iv` + `offset`) {
1623 // ...
1624 // }
1625 // ```
1626 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1627  SmallVector<Value, 4> *operands,
1628  int64_t offset = 0) {
1629  auto bounds = llvm::to_vector<4>(map->getResults());
1630  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1631  operands->insert(operands->begin() + map->getNumDims(), iv);
1632  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1633  b.getContext());
1634  canonicalizeMapAndOperands(map, operands);
1635 }
1636 
1637 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1638 // Stripmine-sink is a primitive building block for generalized tiling of
1639 // imperfectly nested loops.
1640 // This transformation is purely mechanical and does not check legality,
1641 // profitability or even structural correctness. It is the user's
1642 // responsibility to specify `targets` that are dominated by `forOp`.
1643 // Returns the new AffineForOps, one per `targets`, nested immediately under
1644 // each of the `targets`.
1645 static SmallVector<AffineForOp, 8>
1646 stripmineSink(AffineForOp forOp, uint64_t factor,
1647  ArrayRef<AffineForOp> targets) {
1648  auto originalStep = forOp.getStep();
1649  auto scaledStep = originalStep * factor;
1650  forOp.setStep(scaledStep);
1651 
1652  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1653 
1654  // Lower-bound map creation.
1655  auto lbMap = forOp.getLowerBoundMap();
1656  SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1657  augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1658 
1659  // Upper-bound map creation.
1660  auto ubMap = forOp.getUpperBoundMap();
1661  SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1662  augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1663  /*offset=*/scaledStep);
1664 
1665  auto iv = forOp.getInductionVar();
1666  SmallVector<AffineForOp, 8> innerLoops;
1667  for (auto t : targets) {
1668  // Insert newForOp before the terminator of `t`.
1669  auto b = OpBuilder::atBlockTerminator(t.getBody());
1670  auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1671  ubOperands, ubMap, originalStep);
1672  auto begin = t.getBody()->begin();
1673  // Skip terminator and `newForOp` which is just before the terminator.
1674  auto nOps = t.getBody()->getOperations().size() - 2;
1675  newForOp.getBody()->getOperations().splice(
1676  newForOp.getBody()->getOperations().begin(),
1677  t.getBody()->getOperations(), begin, std::next(begin, nOps));
1678  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1679  newForOp.getRegion());
1680  innerLoops.push_back(newForOp);
1681  }
1682 
1683  return innerLoops;
1684 }
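
// Illustrative sketch (hypothetical bounds and factor, not from the original
// source): stripmining %i by a factor of 4 and sinking it under a target loop
// %j yields roughly:
//
// ```mlir
// affine.for %i = 0 to 128 step 4 {   // step scaled by the factor
//   affine.for %j = ... {             // the target
//     affine.for %ii = max affine_map<(d0) -> (0, d0)>(%i)
//         to min affine_map<(d0) -> (128, d0 + 4)>(%i) {
//       ...
//     }
//   }
// }
// ```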
1685 
1686 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1687 // Returns the new AffineForOps, nested immediately under `target`.
1688 template <typename SizeType>
1689 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1690  AffineForOp target) {
1691  // TODO: Use cheap structural assertions that targets are nested under
1692  // forOp and that targets are not nested under each other when DominanceInfo
1693  // exposes the capability. It seems overkill to construct a whole function
1694  // dominance tree at this point.
1695  auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1696  assert(res.size() == 1 && "Expected 1 inner forOp");
1697  return res[0];
1698 }
1699 
1700 SmallVector<SmallVector<AffineForOp, 8>, 8>
1701 mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1702  ArrayRef<AffineForOp> targets) {
1703  SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1704  SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
1705  for (auto it : llvm::zip(forOps, sizes)) {
1706  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1707  res.push_back(step);
1708  currentTargets = step;
1709  }
1710  return res;
1711 }
1712 
1713 SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
1714  ArrayRef<uint64_t> sizes,
1715  AffineForOp target) {
1716  SmallVector<AffineForOp, 8> res;
1717  for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1718  assert(loops.size() == 1);
1719  res.push_back(loops[0]);
1720  }
1721  return res;
1722 }
1723 
1724 LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1725  if (loops.size() < 2)
1726  return success();
1727 
1728  AffineForOp innermost = loops.back();
1729  AffineForOp outermost = loops.front();
1730  AffineBound ub = outermost.getUpperBound();
1731  AffineMap origUbMap = ub.getMap();
1732  Location loc = outermost.getLoc();
1733  OpBuilder builder(outermost);
1734  for (AffineForOp loop : loops) {
1735  // We only work on normalized loops.
1736  if (loop.getStep() != 1 || !loop.hasConstantLowerBound() ||
1737  loop.getConstantLowerBound() != 0)
1738  return failure();
1739  }
1740  SmallVector<Value, 4> upperBoundSymbols;
1741  SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1742  ub.getOperands().end());
1743 
1744  // 1. Store the upper bound of the outermost loop in a variable.
1745  Value prev;
1746  if (!llvm::hasSingleElement(origUbMap.getResults()))
1747  prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1748  else
1749  prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1750  upperBoundSymbols.push_back(prev);
1751 
1752  // 2. Emit code computing the upper bound of the coalesced loop as product of
1753  // the number of iterations of all loops.
1754  for (AffineForOp loop : loops.drop_front()) {
1755  ub = loop.getUpperBound();
1756  origUbMap = ub.getMap();
1757  ubOperands = ub.getOperands();
1758  Value upperBound;
1759  // If upper bound map has more than one result, take their minimum.
1760  if (!llvm::hasSingleElement(origUbMap.getResults()))
1761  upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1762  else
1763  upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1764  upperBoundSymbols.push_back(upperBound);
1765  SmallVector<Value, 4> operands;
1766  operands.push_back(prev);
1767  operands.push_back(upperBound);
1768  // Maintain running product of loop upper bounds.
1769  prev = builder.create<AffineApplyOp>(
1770  loc,
1771  AffineMap::get(/*dimCount=*/1,
1772  /*symbolCount=*/1,
1773  builder.getAffineDimExpr(0) *
1774  builder.getAffineSymbolExpr(0)),
1775  operands);
1776  }
1777  // Set upper bound of the coalesced loop.
1778  AffineMap newUbMap = AffineMap::get(
1779  /*dimCount=*/0,
1780  /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1781  outermost.setUpperBound(prev, newUbMap);
1782 
1783  builder.setInsertionPointToStart(outermost.getBody());
1784 
1785  // 3. Remap induction variables. For each original loop, the value of the
1786  // induction variable can be obtained by dividing the induction variable of
1787  // the linearized loop by the total number of iterations of the loops nested
1788  // in it modulo the number of iterations in this loop (remove the values
1789  // related to the outer loops):
1790  // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1791  // Compute these iteratively from the innermost loop by creating a "running
1792  // quotient" of division by the range.
1793  Value previous = outermost.getInductionVar();
1794  for (unsigned idx = loops.size(); idx > 0; --idx) {
1795  if (idx != loops.size()) {
1796  SmallVector<Value, 4> operands;
1797  operands.push_back(previous);
1798  operands.push_back(upperBoundSymbols[idx]);
1799  previous = builder.create<AffineApplyOp>(
1800  loc,
1801  AffineMap::get(
1802  /*dimCount=*/1, /*symbolCount=*/1,
1803  builder.getAffineDimExpr(0).floorDiv(
1804  builder.getAffineSymbolExpr(0))),
1805  operands);
1806  }
1807  // Modified value of the induction variables of the nested loops after
1808  // coalescing.
1809  Value inductionVariable;
1810  if (idx == 1) {
1811  inductionVariable = previous;
1812  } else {
1813  SmallVector<Value, 4> applyOperands;
1814  applyOperands.push_back(previous);
1815  applyOperands.push_back(upperBoundSymbols[idx - 1]);
1816  inductionVariable = builder.create<AffineApplyOp>(
1817  loc,
1818  AffineMap::get(
1819  /*dimCount=*/1, /*symbolCount=*/1,
1820  builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1821  applyOperands);
1822  }
1823  replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1824  inductionVariable, loops.back().getRegion());
1825  }
1826 
1827  // 4. Move the operations from the innermost just above the second-outermost
1828  // loop, delete the extra terminator and the second-outermost loop.
1829  AffineForOp secondOutermostLoop = loops[1];
1830  innermost.getBody()->back().erase();
1831  outermost.getBody()->getOperations().splice(
1832  Block::iterator(secondOutermostLoop.getOperation()),
1833  innermost.getBody()->getOperations());
1834  secondOutermostLoop.erase();
1835  return success();
1836 }
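
// Sketch of the induction variable remapping for a 2-d nest (names are
// hypothetical; %ub0 and %ub1 are the per-loop upper bounds computed above
// and %iv is the coalesced loop's induction variable):
//
// ```mlir
// %iv1 = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)>(%iv)[%ub1]
// %iv0 = affine.apply affine_map<(d0)[s0] -> (d0 floordiv s0)>(%iv)[%ub1]
// ```
//
// %iv1 replaces the inner loop's IV and %iv0 the outer loop's IV.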
1837 
1838 void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
1839  ArrayRef<Value> numProcessors) {
1840  assert(processorId.size() == numProcessors.size());
1841  if (processorId.empty())
1842  return;
1843 
1844  OpBuilder b(forOp);
1845  Location loc(forOp.getLoc());
1846  AffineExpr lhs, rhs;
1847  bindSymbols(forOp.getContext(), lhs, rhs);
1848  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1849  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1850 
1851  Value linearIndex = processorId.front();
1852  for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1853  auto mulApplyOp = b.create<AffineApplyOp>(
1854  loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1855  linearIndex = b.create<AffineApplyOp>(
1856  loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1857  }
1858 
1859  auto mulApplyOp = b.create<AffineApplyOp>(
1860  loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1861  Value lb = b.create<AffineApplyOp>(
1862  loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1863  forOp.setLowerBound(lb);
1864 
1865  Value step = forOp.getStep();
1866  for (auto numProcs : numProcessors)
1867  step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1868  forOp.setStep(step);
1869 }
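
// Sketch of the rewrite for two processor dimensions (hypothetical values:
// IDs %bx, %tx with counts %nbx, %ntx; original lower bound %lb and step %s):
//
//   linearIndex = %bx * %ntx + %tx
//   new lb      = %lb + linearIndex * %s
//   new step    = %s * %nbx * %ntx
//
// so each processor starts at its own offset and strides over the iteration
// space by the total number of processors.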
1870 
1871 /// Given a memref region, determine the lowest depth at which transfers can be
1872 /// placed for it, and return the corresponding block, start and end positions
1873 /// in the block for placing incoming (read) and outgoing (write) copies
1874 /// respectively. The lowest depth depends on whether the region being accessed
1875 /// is hoistable with respect to one or more immediately surrounding loops.
1876 static void
1877 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1878  Block::iterator &begin, Block::iterator &end,
1879  Block **copyPlacementBlock,
1880  Block::iterator *copyInPlacementStart,
1881  Block::iterator *copyOutPlacementStart) {
1882  const auto *cst = region.getConstraints();
1883  SmallVector<Value, 4> symbols;
1884  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1885 
1886  SmallVector<AffineForOp, 4> enclosingFors;
1887  getLoopIVs(*block.begin(), &enclosingFors);
1888  // Walk up loop parents till we find an IV on which this region is
1889  // symbolic/variant.
1890  auto it = enclosingFors.rbegin();
1891  for (auto e = enclosingFors.rend(); it != e; ++it) {
1892  // TODO: also need to be checking this for regions symbols that
1893  // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1894  if (llvm::is_contained(symbols, it->getInductionVar()))
1895  break;
1896  }
1897 
1898  if (it != enclosingFors.rbegin()) {
1899  auto lastInvariantIV = *std::prev(it);
1900  *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1901  *copyOutPlacementStart = std::next(*copyInPlacementStart);
1902  *copyPlacementBlock = lastInvariantIV->getBlock();
1903  } else {
1904  *copyInPlacementStart = begin;
1905  *copyOutPlacementStart = end;
1906  *copyPlacementBlock = &block;
1907  }
1908 }
1909 
1910 // Info comprising stride and number of elements transferred every stride.
1911 struct StrideInfo {
1912  int64_t stride;
1913  int64_t numEltPerStride;
1914 };
1915 
1916 /// Returns striding information for a copy/transfer of this region with
1917 /// potentially multiple striding levels from outermost to innermost. For an
1918 /// n-dimensional region, there can be at most n-1 levels of striding
1919 /// successively nested.
1920 // TODO: make this work with non-identity layout maps.
1921 static void getMultiLevelStrides(const MemRefRegion &region,
1922  ArrayRef<int64_t> bufferShape,
1923  SmallVectorImpl<StrideInfo> *strideInfos) {
1924  if (bufferShape.size() <= 1)
1925  return;
1926 
1927  int64_t numEltPerStride = 1;
1928  int64_t stride = 1;
1929  for (int d = bufferShape.size() - 1; d >= 1; d--) {
1930  int64_t dimSize = region.memref.getType().cast<MemRefType>().getDimSize(d);
1931  stride *= dimSize;
1932  numEltPerStride *= bufferShape[d];
1933  // A stride is needed only if the region has a shorter extent than the
1934  // memref along the dimension *and* has an extent greater than one along the
1935  // next major dimension.
1936  if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1937  strideInfos->push_back({stride, numEltPerStride});
1938  }
1939  }
1940 }
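
// Numeric illustration (hypothetical shapes): copying a 64x32 region out of a
// memref<64x128xf32> records a single stride level {stride = 128 elements,
// numEltPerStride = 32}, i.e. 32 contiguous elements are transferred and the
// source then advances by one full row of the original memref.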
1941 
1942 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1943 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1944 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1945 /// bound information for the copy loop nest. `fastBufOffsets` contain the
1946 /// expressions to be subtracted out from the respective copy loop iterators in
1947 /// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
1948 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1949 /// inserted.
1950 //
1951 /// The copy-in nest is generated as follows as an example for a 2-d region:
1952 /// for x = ...
1953 /// for y = ...
1954 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1955 ///
1956 static AffineForOp
1957 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1958  ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1959  ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1960  ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1961  OpBuilder b) {
1962  assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1963  return lbMap.getNumInputs() == lbOperands.size();
1964  }));
1965  assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1966  return ubMap.getNumInputs() == ubOperands.size();
1967  }));
1968 
1969  unsigned rank = memref.getType().cast<MemRefType>().getRank();
1970  assert(lbMaps.size() == rank && "wrong number of lb maps");
1971  assert(ubMaps.size() == rank && "wrong number of ub maps");
1972 
1973  SmallVector<Value, 4> memIndices;
1974  SmallVector<AffineExpr, 4> fastBufExprs;
1975  SmallVector<Value, 4> fastBufMapOperands;
1976  AffineForOp copyNestRoot;
1977  SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1978  for (unsigned d = 0; d < rank; ++d) {
1979  auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1980  ubOperands, ubMaps[d]);
1981  if (d == 0)
1982  copyNestRoot = forOp;
1983 
1984  b = OpBuilder::atBlockTerminator(forOp.getBody());
1985 
1986  auto fastBufOffsetMap =
1987  AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1988  auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1989 
1990  // Construct the subscript for the fast memref being copied into/from:
1991  // x - offset_x.
1992  fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1993  b.getAffineDimExpr(2 * d));
1994  fastBufMapOperands.push_back(offset);
1995  fastBufMapOperands.push_back(forOp.getInductionVar());
1996  mayBeDeadApplys.push_back(offset);
1997 
1998  // Subscript for the slow memref being copied.
1999  memIndices.push_back(forOp.getInductionVar());
2000  }
2001 
2002  auto fastBufMap =
2003  AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
2004  fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
2005  fastBufMap = simplifyAffineMap(fastBufMap);
2006  canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
2007 
2008  // Drop any dead affine.applys.
2009  for (auto applyOp : mayBeDeadApplys)
2010  if (applyOp.use_empty())
2011  applyOp.erase();
2012 
2013  if (!isCopyOut) {
2014  // Copy in.
2015  auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
2016  b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
2017  fastBufMapOperands);
2018  return copyNestRoot;
2019  }
2020 
2021  // Copy out.
2022  auto load =
2023  b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
2024  b.create<AffineStoreOp>(loc, load, memref, memIndices);
2025  return copyNestRoot;
2026 }
2027 
2028 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
2029 emitRemarkForBlock(Block &block) {
2030  return block.getParentOp()->emitRemark();
2031 }
2032 
2033 /// Creates a buffer in the faster memory space for the specified memref region;
2034 /// generates a copy from the lower memory space to this one, and replaces all
2035 /// loads/stores in the block range [`begin', `end') of `block' to load/store
2036 /// from that buffer. Returns failure if copies could not be generated due to
2037 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
2038 /// in copyPlacementBlock specify the insertion points where the incoming copies
2039 /// and outgoing copies, respectively, should be inserted (the insertion happens
2040 /// right before the insertion point). Since `begin` can itself be invalidated
2041 /// due to the memref rewriting done from this method, the output argument
2042 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
2043 /// happens). Since outgoing copies could have been inserted at `end`, the
2044 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
2045 /// size of the fast buffer allocated.
2046 static LogicalResult generateCopy(
2047  const MemRefRegion &region, Block *block, Block::iterator begin,
2048  Block::iterator end, Block *copyPlacementBlock,
2049  Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
2050  AffineCopyOptions copyOptions, DenseMap<Value, Value> &fastBufferMap,
2051  DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
2052  Block::iterator *nBegin, Block::iterator *nEnd) {
2053  *nBegin = begin;
2054  *nEnd = end;
2055 
2056  func::FuncOp f = begin->getParentOfType<func::FuncOp>();
2057  OpBuilder topBuilder(f.getBody());
2058  Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
2059 
2060  if (begin == end)
2061  return success();
2062 
2063  // Is the copy out point at the end of the block where we are doing
2064  // explicit copying.
2065  bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
2066 
2067  // Copies for read regions are going to be inserted at 'begin'.
2068  OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
2069  // Copies for write regions are going to be inserted at 'end'.
2070  OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
2071  OpBuilder &b = region.isWrite() ? epilogue : prologue;
2072 
2073  // Builder to create constants at the top level.
2074  auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
2075  OpBuilder top(func.getBody());
2076 
2077  auto loc = region.loc;
2078  auto memref = region.memref;
2079  auto memRefType = memref.getType().cast<MemRefType>();
2080 
2081  if (!memRefType.getLayout().isIdentity()) {
2082  LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2083  return failure();
2084  }
2085 
2086  // Indices to use for the copying.
2087  // Indices for the original memref being copied from/to.
2088  SmallVector<Value, 4> memIndices;
2089  // Indices for the faster buffer being copied into/from.
2090  SmallVector<Value, 4> bufIndices;
2091 
2092  unsigned rank = memRefType.getRank();
2093  SmallVector<int64_t, 4> fastBufferShape;
2094 
2095  // Compute the extents of the buffer.
2096  std::vector<SmallVector<int64_t, 4>> lbs;
2097  SmallVector<int64_t, 8> lbDivisors;
2098  lbs.reserve(rank);
2099  Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
2100  &fastBufferShape, &lbs, &lbDivisors);
2101  if (!numElements) {
2102  LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2103  return failure();
2104  }
2105 
2106  if (*numElements == 0) {
2107  LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2108  *sizeInBytes = 0;
2109  return success();
2110  }
2111 
2112  SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2113  for (unsigned i = 0; i < rank; ++i)
2114  region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2115 
2116  const FlatAffineValueConstraints *cst = region.getConstraints();
2117  // 'regionSymbols' hold values that this memory region is symbolic/parametric
2118  // on; these typically include loop IVs surrounding the level at which the
2119  // copy generation is being done or other valid symbols in MLIR.
2120  SmallVector<Value, 8> regionSymbols;
2121  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2122 
2123  // Construct the index expressions for the fast memory buffer. The index
2124  // expression for a particular dimension of the fast buffer is obtained by
2125  // subtracting out the lower bound on the original memref's data region
2126  // along the corresponding dimension.
2127 
2128  // Index start offsets for faster memory buffer relative to the original.
2129  SmallVector<AffineExpr, 4> fastBufOffsets;
2130  fastBufOffsets.reserve(rank);
2131  for (unsigned d = 0; d < rank; d++) {
2132  assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2133 
2134  AffineExpr offset = top.getAffineConstantExpr(0);
2135  for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2136  offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2137  assert(lbDivisors[d] > 0);
2138  offset =
2139  (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2140 
2141  // Set copy start location for this dimension in the lower memory space
2142  // memref.
2143  if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
2144  auto indexVal = caf.getValue();
2145  if (indexVal == 0) {
2146  memIndices.push_back(zeroIndex);
2147  } else {
2148  memIndices.push_back(
2149  top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2150  }
2151  } else {
2152  // The coordinate for the start location is just the lower bound along the
2153  // corresponding dimension on the memory region (stored in 'offset').
2154  auto map = AffineMap::get(
2155  cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2156  memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2157  }
2158  // The fast buffer is copied into at location zero; addressing is relative.
2159  bufIndices.push_back(zeroIndex);
2160 
2161  // Record the offsets since they are needed to remap the memory accesses of
2162  // the original memref further below.
2163  fastBufOffsets.push_back(offset);
2164  }
2165 
2166  // The faster memory space buffer.
2167  Value fastMemRef;
2168 
2169  // Check if a buffer was already created.
2170  bool existingBuf = fastBufferMap.count(memref) > 0;
2171  if (!existingBuf) {
2172  AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2173  auto fastMemRefType =
2174  MemRefType::get(fastBufferShape, memRefType.getElementType(),
2175  fastBufferLayout, copyOptions.fastMemorySpace);
2176 
2177  // Create the fast memory space buffer just before the 'affine.for'
2178  // operation.
2179  fastMemRef =
2180  prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2181  // Record it.
2182  fastBufferMap[memref] = fastMemRef;
2183  // fastMemRefType is a constant shaped memref.
2184  *sizeInBytes = *getMemRefSizeInBytes(fastMemRefType);
2185  LLVM_DEBUG(emitRemarkForBlock(*block)
2186  << "Creating fast buffer of type " << fastMemRefType
2187  << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2188  << " KiB\n");
2189  } else {
2190  // Reuse the one already created.
2191  fastMemRef = fastBufferMap[memref];
2192  *sizeInBytes = 0;
2193  }
2194 
2195  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2196 
2197  Value dmaStride = nullptr;
2198  Value numEltPerDmaStride = nullptr;
2199  if (copyOptions.generateDma) {
2200  SmallVector<StrideInfo, 4> dmaStrideInfos;
2201  getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2202 
2203  // TODO: use all stride levels once DmaStartOp is extended for
2204  // multi-level strides.
2205  if (dmaStrideInfos.size() > 1) {
2206  LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2207  return failure();
2208  }
2209 
2210  if (!dmaStrideInfos.empty()) {
2211  dmaStride =
2212  top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2213  numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2214  loc, dmaStrideInfos[0].numEltPerStride);
2215  }
2216  }
2217 
2218  // Record the last operation where we want the memref replacement to end. We
2219  // later do the memref replacement only in [begin, postDomFilter] so
2220  // that the original memrefs used in the data movement code themselves don't
2221  // get replaced.
2222  auto postDomFilter = std::prev(end);
2223 
2224  // Create fully composed affine maps for each memref.
2225  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2226  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2227  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2228  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2229 
2230  if (!copyOptions.generateDma) {
2231  // Point-wise copy generation.
2232  auto copyNest =
2233  generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2234  /*lbOperands=*/regionSymbols, ubMaps,
2235  /*ubOperands=*/regionSymbols, fastBufOffsets,
2236  /*isCopyOut=*/region.isWrite(), b);
2237 
2238  // Record this so that we can skip it from yet another copy.
2239  copyNests.insert(copyNest);
2240 
2241  // Since new ops are being appended (for copy out's), adjust the end to
2242  // mark end of block range being processed if necessary.
2243  if (region.isWrite() && isCopyOutAtEndOfBlock)
2244  *nEnd = Block::iterator(copyNest.getOperation());
2245  } else {
2246  // DMA generation.
2247  // Create a tag (single element 1-d memref) for the DMA.
2248  auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2249  copyOptions.tagMemorySpace);
2250  auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2251 
2252  SmallVector<Value, 4> tagIndices({zeroIndex});
2253  auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2254  fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2255  if (!region.isWrite()) {
2256  // DMA non-blocking read from original buffer to fast buffer.
2257  b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2258  fastMemRef, bufAffineMap, bufIndices,
2259  tagMemRef, tagAffineMap, tagIndices,
2260  numElementsSSA, dmaStride, numEltPerDmaStride);
2261  } else {
2262  // DMA non-blocking write from fast buffer to the original memref.
2263  auto op = b.create<AffineDmaStartOp>(
2264  loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2265  memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2266  dmaStride, numEltPerDmaStride);
2267  // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2268  // end to mark end of block range being processed.
2269  if (isCopyOutAtEndOfBlock)
2270  *nEnd = Block::iterator(op.getOperation());
2271  }
2272 
2273  // Matching DMA wait to block on completion; tag always has a 0 index.
2274  b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2275  numElementsSSA);
2276 
2277  // Generate dealloc for the tag.
2278  auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2279  if (*nEnd == end && isCopyOutAtEndOfBlock)
2280  // Since new ops are being appended (for outgoing DMAs), adjust the end to
2281  // mark end of range of the original.
2282  *nEnd = Block::iterator(tagDeallocOp.getOperation());
2283  }
2284 
2285  // Generate dealloc for the buffer.
2286  if (!existingBuf) {
2287  auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2288  // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2289  // the fast buffer (since it marks the new end insertion point).
2290  if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2291  *nEnd = Block::iterator(bufDeallocOp.getOperation());
2292  }
2293 
2294  // Replace all uses of the old memref with the faster one while remapping
2295  // access indices (subtracting out lower bound offsets for each dimension).
2296  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2297  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2298  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2299  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2300  // d2, d3 correspond to the original indices (%i, %j).
2301  SmallVector<AffineExpr, 4> remapExprs;
2302  remapExprs.reserve(rank);
2303  for (unsigned i = 0; i < rank; i++) {
2304  // The starting operands of indexRemap will be regionSymbols (the symbols on
2305  // which the memref region is parametric); then those corresponding to
2306  // the memref's original indices follow.
2307  auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2308  remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2309  }
2310  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2311  b.getContext());
2312 
2313  // Record the begin since it may be invalidated by memref replacement.
2314  Block::iterator prevOfBegin;
2315  bool isBeginAtStartOfBlock = (begin == block->begin());
2316  if (!isBeginAtStartOfBlock)
2317  prevOfBegin = std::prev(begin);
2318 
2319  // *Only* those uses within the range [begin, end) of 'block' are replaced.
2320  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2321  /*extraIndices=*/{}, indexRemap,
2322  /*extraOperands=*/regionSymbols,
2323  /*symbolOperands=*/{},
2324  /*domOpFilter=*/&*begin,
2325  /*postDomOpFilter=*/&*postDomFilter);
2326 
2327  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2328 
2329  return success();
2330 }
2331 
2332 /// Constructs the memref region to include the entire memref. Returns false
2333 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2334 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2335 /// is parametric on.
2336 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2337  MemRefRegion *region) {
2338  unsigned rank;
2339  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2340  rank = loadOp.getMemRefType().getRank();
2341  region->memref = loadOp.getMemRef();
2342  region->setWrite(false);
2343  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2344  rank = storeOp.getMemRefType().getRank();
2345  region->memref = storeOp.getMemRef();
2346  region->setWrite(true);
2347  } else {
2348  assert(false && "expected load or store op");
2349  return false;
2350  }
2351  auto memRefType = region->memref.getType().cast<MemRefType>();
2352  if (!memRefType.hasStaticShape())
2353  return false;
2354 
2355  auto *regionCst = region->getConstraints();
2356 
2357  // Just get the first numSymbols IVs, which the memref region is parametric
2358  // on.
2359  SmallVector<AffineForOp, 4> ivs;
2360  getLoopIVs(*op, &ivs);
2361  ivs.resize(numParamLoopIVs);
2362  SmallVector<Value, 4> symbols;
2363  extractForInductionVars(ivs, &symbols);
2364  regionCst->reset(rank, numParamLoopIVs, 0);
2365  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2366 
2367  // Memref dim sizes provide the bounds.
2368  for (unsigned d = 0; d < rank; d++) {
2369  auto dimSize = memRefType.getDimSize(d);
2370  assert(dimSize > 0 && "filtered dynamic shapes above");
2371  regionCst->addBound(IntegerPolyhedron::LB, d, 0);
2372  regionCst->addBound(IntegerPolyhedron::UB, d, dimSize - 1);
2373  }
2374  return true;
2375 }
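
// For instance (illustrative), for a memref<64x128xf32> this produces the
// constraints 0 <= d0 <= 63 and 0 <= d1 <= 127 on the two region dimensions,
// parametric on the first `numParamLoopIVs` enclosing loop IVs.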
2376 
2377 LogicalResult mlir::affineDataCopyGenerate(Block::iterator begin,
2378  Block::iterator end,
2379  const AffineCopyOptions &copyOptions,
2380  Optional<Value> filterMemRef,
2381  DenseSet<Operation *> &copyNests) {
2382  if (begin == end)
2383  return success();
2384 
2385  assert(begin->getBlock() == std::prev(end)->getBlock() &&
2386  "Inconsistent block begin/end args");
2387  assert(end != end->getBlock()->end() && "end can't be the block terminator");
2388 
2389  Block *block = begin->getBlock();
2390 
2391  // Copies will be generated for this depth, i.e., symbolic in all loops
2392  // surrounding this block range.
2393  unsigned copyDepth = getNestingDepth(&*begin);
2394 
2395  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2396  << "\n");
2397  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2398  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2399 
2400  // List of memory regions to copy for. We need a map vector to have a
2401  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2402  // since the alloc's for example are identical except for the SSA id.
2403  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2404  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2405 
2406  // Map from original memref's to the fast buffers that their accesses are
2407  // replaced with.
2408  DenseMap<Value, Value> fastBufferMap;
2409 
2410  // To check for errors when walking the block.
2411  bool error = false;
2412 
2413  // Walk this range of operations to gather all memory regions.
2414  block->walk(begin, end, [&](Operation *opInst) {
2415  // Gather regions to allocate to buffers in faster memory space.
2416  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2417  if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2418  (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2419  copyOptions.slowMemorySpace))
2420  return;
2421  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2422  if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2423  storeOp.getMemRefType().getMemorySpaceAsInt() !=
2424  copyOptions.slowMemorySpace)
2425  return;
2426  } else {
2427  // Neither load nor a store op.
2428  return;
2429  }
2430 
2431  // Compute the MemRefRegion accessed.
2432  auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2433  if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2434  /*addMemRefDimBounds=*/false))) {
2435  LLVM_DEBUG(llvm::dbgs()
2436  << "Error obtaining memory region: semi-affine maps?\n");
2437  LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2438  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2439  LLVM_DEBUG(
2440  opInst->emitError("non-constant memref sizes not yet supported"));
2441  error = true;
2442  return;
2443  }
2444  }
2445 
2446  // Each memref has a single buffer associated with it irrespective of how
2447  // many loads and stores happen on it.
2448  // TODO: in the future, when regions don't intersect and satisfy
2449  // other properties (based on load/store regions), we could consider
2450  // multiple buffers per memref.
2451 
2452  // Add to the appropriate region if it's not already in it, or take a
2453  // bounding box union with the existing one if it's already in there.
2454  // Note that a memref may have both read and write regions - so update the
2455  // region in the other list if one exists (write in case of read and vice
2456  // versa) since there is a single bounding box for a memref across all reads
2457  // and writes that happen on it.
2458 
2459  // Attempts to update; returns true if 'region' exists in targetRegions.
2460  auto updateRegion =
2461  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2462  &targetRegions) {
2463  const auto *const it = targetRegions.find(region->memref);
2464  if (it == targetRegions.end())
2465  return false;
2466 
2467  // Perform a union with the existing region.
2468  if (failed(it->second->unionBoundingBox(*region))) {
2469  LLVM_DEBUG(llvm::dbgs()
2470  << "Memory region bounding box failed; "
2471  "over-approximating to the entire memref\n");
2472  // If the union fails, we will overapproximate.
2473  if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2474  LLVM_DEBUG(opInst->emitError(
2475  "non-constant memref sizes not yet supported"));
2476  error = true;
2477  return true;
2478  }
2479  it->second->getConstraints()->clearAndCopyFrom(
2480  *region->getConstraints());
2481  } else {
2482  // Union was computed and stored in 'it->second': copy to 'region'.
2483  region->getConstraints()->clearAndCopyFrom(
2484  *it->second->getConstraints());
2485  }
2486  return true;
2487  };
2488 
2489  bool existsInRead = updateRegion(readRegions);
2490  if (error)
2491  return;
2492  bool existsInWrite = updateRegion(writeRegions);
2493  if (error)
2494  return;
2495 
2496  // Finally add it to the region list.
2497  if (region->isWrite() && !existsInWrite) {
2498  writeRegions[region->memref] = std::move(region);
2499  } else if (!region->isWrite() && !existsInRead) {
2500  readRegions[region->memref] = std::move(region);
2501  }
2502  });
2503 
2504  if (error) {
2505  LLVM_DEBUG(begin->emitError(
2506  "copy generation failed for one or more memref's in this block\n"));
2507  return failure();
2508  }
2509 
2510  uint64_t totalCopyBuffersSizeInBytes = 0;
2511  bool ret = true;
2512  auto processRegions =
2513  [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2514  &regions) {
2515  for (const auto &regionEntry : regions) {
2516  // For each region, hoist copy in/out past all hoistable
2517  // 'affine.for's.
2518  Block::iterator copyInPlacementStart, copyOutPlacementStart;
2519  Block *copyPlacementBlock;
2520  findHighestBlockForPlacement(
2521  *regionEntry.second, *block, begin, end, &copyPlacementBlock,
2522  &copyInPlacementStart, &copyOutPlacementStart);
2523 
2524  uint64_t sizeInBytes;
2525  Block::iterator nBegin, nEnd;
2526  LogicalResult iRet = generateCopy(
2527  *regionEntry.second, block, begin, end, copyPlacementBlock,
2528  copyInPlacementStart, copyOutPlacementStart, copyOptions,
2529  fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2530  if (succeeded(iRet)) {
2531  // begin/end could have been invalidated, and need update.
2532  begin = nBegin;
2533  end = nEnd;
2534  totalCopyBuffersSizeInBytes += sizeInBytes;
2535  }
2536  ret = ret & succeeded(iRet);
2537  }
2538  };
2539  processRegions(readRegions);
2540  processRegions(writeRegions);
2541 
2542  if (!ret) {
2543  LLVM_DEBUG(begin->emitError(
2544  "copy generation failed for one or more memref's in this block\n"));
2545  return failure();
2546  }
2547 
2548  // For a range of operations, a note will be emitted at the caller.
2549  AffineForOp forOp;
2550  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2551  LLVM_DEBUG(forOp.emitRemark()
2552  << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2553  << " KiB of copy buffers in fast memory space for this block\n");
2554  }
2555 
2556  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2557  StringRef str = "Total size of all copy buffers for this block "
2558  "exceeds fast memory capacity\n";
2559  block->getParentOp()->emitWarning(str);
2560  }
2561 
2562  return success();
2563 }
2564 
2565 // A convenience version of affineDataCopyGenerate for all ops in the body of
2566 // an AffineForOp.
2567 LogicalResult mlir::affineDataCopyGenerate(AffineForOp forOp,
2568  const AffineCopyOptions &copyOptions,
2569  Optional<Value> filterMemRef,
2570  DenseSet<Operation *> &copyNests) {
2571  return affineDataCopyGenerate(forOp.getBody()->begin(),
2572  std::prev(forOp.getBody()->end()), copyOptions,
2573  filterMemRef, copyNests);
2574 }
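
// A minimal usage sketch (illustrative only; `forOp` is assumed to be an
// affine.for op visited from a pass, the option values are arbitrary, and the
// fields of AffineCopyOptions are assumed to be initialized in declaration
// order):
//
// ```c++
// AffineCopyOptions options = {/*generateDma=*/false, /*slowMemorySpace=*/0,
//                              /*fastMemorySpace=*/2, /*tagMemorySpace=*/0,
//                              /*fastMemCapacityBytes=*/32 * 1024};
// DenseSet<Operation *> copyNests;
// if (failed(affineDataCopyGenerate(forOp, options, /*filterMemRef=*/llvm::None,
//                                   copyNests)))
//   forOp.emitRemark("copy generation failed");
// ```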
2575 
2576 LogicalResult mlir::generateCopyForMemRegion(
2577  const MemRefRegion &memrefRegion, Operation *analyzedOp,
2578  const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2579  Block *block = analyzedOp->getBlock();
2580  auto begin = analyzedOp->getIterator();
2581  auto end = std::next(begin);
2582  DenseMap<Value, Value> fastBufferMap;
2583  DenseSet<Operation *> copyNests;
2584 
2585  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2586  copyOptions, fastBufferMap, copyNests,
2587  &result.sizeInBytes, &begin, &end);
2588  if (failed(err))
2589  return err;
2590 
2591  const auto &en = fastBufferMap.find(memrefRegion.memref);
2592  // In some cases (empty loops), no copy generation would have happened.
2593  if (en == fastBufferMap.end())
2594  return failure();
2595  result.alloc = en->second.getDefiningOp();
2596  assert(result.alloc && "fast buffer expected to be locally allocated");
2597  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2598  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2599  return success();
2600 }
2601 
2602 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2603 static void
2604 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2605  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2606  // Add a new empty level to the output if it doesn't already exist.
2607  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2608  if (currLoopDepth == depthToLoops.size())
2609  depthToLoops.emplace_back();
2610 
2611  for (auto &op : *block) {
2612  if (auto forOp = dyn_cast<AffineForOp>(op)) {
2613  depthToLoops[currLoopDepth].push_back(forOp);
2614  gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2615  }
2616  }
2617 }
2618 
2619 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2620 void mlir::gatherLoops(func::FuncOp func,
2621  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2622  for (auto &block : func)
2623  gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2624 
2625  // Remove last loop level from output since it's empty.
2626  if (!depthToLoops.empty()) {
2627  assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2628  depthToLoops.pop_back();
2629  }
2630 }
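
// Usage sketch (illustrative; `funcOp` is a hypothetical func::FuncOp handle):
//
// ```c++
// std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
// gatherLoops(funcOp, depthToLoops);
// // depthToLoops[0] holds the top-level loops, depthToLoops[1] those nested
// // one level deeper, and so on.
// ```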
2631 
2632 // TODO: if necessary, this can be extended to also compose in any
2633 // affine.applys, fold to constant if all result dimensions of the map are
2634 // constant (canonicalizeMapAndOperands below already does this for single
2635 // result bound maps), and use simplifyMap to perform algebraic simplification.
2636 AffineForOp mlir::createCanonicalizedAffineForOp(
2637  OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2638  ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2639  SmallVector<Value, 4> lowerOperands(lbOperands);
2640  SmallVector<Value, 4> upperOperands(ubOperands);
2641 
2642  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2643  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2644  lbMap = removeDuplicateExprs(lbMap);
2645  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2646  canonicalizeMapAndOperands(&ubMap, &upperOperands);
2647  ubMap = removeDuplicateExprs(ubMap);
2648 
2649  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2650  step);
2651 }
2652 
2653 /// Creates an AffineIfOp that encodes the conditional to choose between
2654 /// the constant trip count version and an unknown trip count version of this
2655 /// nest of loops. This is used to separate partial and full tiles if `loops`
2656 /// has the intra-tile loops. The affine.if op is inserted at the builder
2657 /// insertion point of `b`.
2658 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2659  OpBuilder b) {
2660  if (loops.empty())
2661  return nullptr;
2662 
2663  auto *context = loops[0].getContext();
2664 
2665  FlatAffineValueConstraints cst;
2666  SmallVector<Operation *, 8> ops;
2667  llvm::append_range(ops, loops);
2668  (void)getIndexSet(ops, &cst);
2669 
2670  // Remove constraints that are independent of these loop IVs.
2671  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2672 
2673  // Construct the constraint set representing the guard for full tiles. The
2674  // lower bound (and upper bound) corresponding to the full tile should be
2675  // larger (and resp. smaller) than any other lower (or upper bound).
2676  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2677  for (auto loop : loops) {
2678  (void)loop;
2679  // TODO: Non-unit stride is not an issue to generalize to.
2680  assert(loop.getStep() == 1 && "point loop step expected to be one");
2681  // Mark everything symbols for the purpose of finding a constant diff pair.
2682  cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2683  1);
2684  unsigned fullTileLbPos, fullTileUbPos;
2685  if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2686  /*boundFloorDivisor=*/nullptr,
2687  /*ub=*/nullptr, &fullTileLbPos,
2688  &fullTileUbPos)) {
2689  LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2690  return nullptr;
2691  }
2692 
2693  SmallVector<unsigned, 4> lbIndices, ubIndices;
2694  cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2695 
2696  auto fLb = cst.getInequality(fullTileLbPos);
2697  auto fUb = cst.getInequality(fullTileUbPos);
2698  fullTileLb.assign(fLb.begin(), fLb.end());
2699  fullTileUb.assign(fUb.begin(), fUb.end());
2700 
2701  // Full tile lower bound should be >= than any other lower bound.
2702  for (auto lbIndex : lbIndices)
2703  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2704  cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2705 
2706  // Full tile upper bound should be <= any other upper bound.
2707  for (auto ubIndex : ubIndices)
2708  for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2709  cst.atIneq(ubIndex, i) -= fullTileUb[i];
2710 
2711  cst.removeVar(0);
2712  }
2713 
2714  // The previous step leads to all zeros for the full tile lb and ub position
2715  // itself; remove those and any other duplicates / trivial redundancies.
2716  cst.removeTrivialRedundancy();
2717 
2718  // Turn everything into dims conservatively since we earlier turned all
2719  // trailing ids past point loop IV into symbols. Some of these could be outer
2720  // loop IVs; we'll canonicalize anyway.
2721  cst.setDimSymbolSeparation(0);
2722 
2723  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2724  // ifCondSet can be null if cst was empty -- this can happen if all loops
2725  // in the nest have constant trip counts.
2726  if (!ifCondSet)
2727  return nullptr;
2728 
2729  SmallVector<Value, 4> setOperands;
2730  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2731  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2732  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2733  /*withElseRegion=*/true);
2734 }
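// For intuition, a worked instance of the upper-bound rewriting above, with
// assumed bounds that do not come from this file: suppose the point loop %i
// runs from 32 * %it to min(32 * %it + 32, %N), so the constant-difference
// pair is <32 * %it, 32 * %it + 31> in closed (inclusive) form.
//
//   full-tile lb:  %i - 32 * %it           >= 0
//   full-tile ub:  32 * %it + 31 - %i      >= 0
//   other ub:      %N - 1 - %i             >= 0
//
// Rewriting "other ub" as (other ub) - (full-tile ub) yields
//   (%N - 1 - %i) - (32 * %it + 31 - %i) = %N - 32 * %it - 32 >= 0,
// i.e. the tile fits entirely: 32 * %it + 32 <= %N. After the all-zero rows
// and trivial redundancies are removed, the guard is roughly
//   affine_set<(d0)[s0] : (d0 * -32 + s0 - 32 >= 0)> applied to (%it)[%N].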
2735 
2736 /// Create the full tile loop nest (along with its body).
2737 static LogicalResult
2738 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2739  SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2740  fullTileLoops.reserve(inputNest.size());
2741 
2742  // For each loop in the original nest identify a lower/upper bound pair such
2743  // that their difference is a constant.
2744  FlatAffineValueConstraints cst;
2745  for (auto loop : inputNest) {
2746  // TODO: straightforward to generalize to a non-unit stride.
2747  if (loop.getStep() != 1) {
2748  LLVM_DEBUG(llvm::dbgs()
2749  << "[tile separation] non-unit stride not implemented\n");
2750  return failure();
2751  }
2752  SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2753  (void)getIndexSet(loopOp, &cst);
2754  // We will mark everything other than this loop IV as a symbol for getting a
2755  // pair of <lb, ub> with a constant difference.
2756  cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2757  unsigned lbPos, ubPos;
2758  if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2759  /*boundFloorDivisor=*/nullptr,
2760  /*ub=*/nullptr, &lbPos, &ubPos) ||
2761  lbPos == ubPos) {
2762  LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2763  "equalities not yet handled\n");
2764  return failure();
2765  }
2766 
2767  // Set all variables as dimensions uniformly since some of those marked as
2768  // symbols above could be outer loop IVs (corresponding tile space IVs).
2769  cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2770 
2771  AffineValueMap lbVmap, ubVmap;
2772  cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2773  cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2774  AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2775  b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2776  ubVmap.getOperands(), ubVmap.getAffineMap());
2777  b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2778  fullTileLoops.push_back(fullTileLoop);
2779  }
2780 
2781  // Add the body for the full tile loop nest.
2782  BlockAndValueMapping operandMap;
2783  for (const auto &loopEn : llvm::enumerate(inputNest))
2784  operandMap.map(loopEn.value().getInductionVar(),
2785  fullTileLoops[loopEn.index()].getInductionVar());
2786  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2787  for (auto &op : inputNest.back().getBody()->without_terminator())
2788  b.clone(op, operandMap);
2789  return success();
2790 }
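// An illustrative before/after shape for one loop handled above, under assumed
// bounds (not taken from this file): the full-tile version keeps only the
// constant-difference <lb, ub> pair, so its trip count is a compile-time
// constant.
//
//   #lb  = affine_map<(d0) -> (d0 * 32)>
//   #ub  = affine_map<(d0)[s0] -> (d0 * 32 + 32, s0)>
//   #fub = affine_map<(d0) -> (d0 * 32 + 32)>
//
//   // Assumed intra-tile (point) loop, clipped at the iteration-space end:
//   affine.for %i = #lb(%it) to min #ub(%it)[%N] { ... }
//   // Full-tile loop built via createCanonicalizedAffineForOp:
//   affine.for %i = #lb(%it) to #fub(%it) { ... }
//
// The original body is then cloned into the innermost full-tile loop with the
// induction variables remapped through the BlockAndValueMapping above.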
2791 
2792 LogicalResult
2793 mlir::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2794  SmallVectorImpl<AffineForOp> *fullTileNest) {
2795  if (inputNest.empty())
2796  return success();
2797 
2798  auto firstLoop = inputNest[0];
2799 
2800  // Each successive for op has to be nested inside the previous one.
2801  auto prevLoop = firstLoop;
2802  for (auto loop : inputNest.drop_front(1)) {
2803  assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2804  prevLoop = loop;
2805  }
2806 
2807  // Create the full tile loop nest.
2808  SmallVector<AffineForOp, 4> fullTileLoops;
2809  OpBuilder b(firstLoop);
2810  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2811  if (!fullTileLoops.empty())
2812  fullTileLoops.front().erase();
2813  return failure();
2814  }
2815 
2816  // Create and insert the version select right before the root of the nest.
2817  b = OpBuilder(firstLoop);
2818  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2819  if (!ifOp) {
2820  fullTileLoops.front().erase();
2821  LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2822  "separation condition\n");
2823  return failure();
2824  }
2825 
2826  // Move the full tile into the then block.
2827  Block *thenBlock = ifOp.getThenBlock();
2828  AffineForOp outermostFullTileLoop = fullTileLoops[0];
2829  thenBlock->getOperations().splice(
2830  std::prev(thenBlock->end()),
2831  outermostFullTileLoop->getBlock()->getOperations(),
2832  Block::iterator(outermostFullTileLoop));
2833 
2834  // Move the partial tile into the else block. The partial tile is the same as
2835  // the original loop nest.
2836  Block *elseBlock = ifOp.getElseBlock();
2837  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2838  firstLoop->getBlock()->getOperations(),
2839  Block::iterator(firstLoop));
2840 
2841  if (fullTileNest)
2842  *fullTileNest = std::move(fullTileLoops);
2843 
2844  return success();
2845 }
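A minimal caller-side sketch of the entry point above (illustrative only; `rootForOp` is an assumed outermost intra-tile loop, and the band is gathered with getPerfectlyNestedLoops, which is documented further below):

SmallVector<AffineForOp, 4> band;
getPerfectlyNestedLoops(band, rootForOp);
SmallVector<AffineForOp, 4> fullTileNest;
if (failed(separateFullTiles(band, &fullTileNest))) {
  // The nest is left in its original form on failure, e.g. for non-unit
  // strides or when no constant-extent <lb, ub> pair can be found.
}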
static void constructParametricallyTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< Value > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:745
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, MemRefRegion *region)
Construct the memref region to just include the entire memref.
Definition: LoopUtils.cpp:2336
static SmallVector< AffineForOp, 8 > stripmineSink(AffineForOp forOp, uint64_t factor, ArrayRef< AffineForOp > targets)
Definition: LoopUtils.cpp:1646
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED emitRemarkForBlock(Block &block)
Definition: LoopUtils.cpp:2029
static void findHighestBlockForPlacement(const MemRefRegion &region, Block &block, Block::iterator &begin, Block::iterator &end, Block **copyPlacementBlock, Block::iterator *copyInPlacementStart, Block::iterator *copyOutPlacementStart)
Given a memref region, determine the lowest depth at which transfers can be placed for it,...
Definition: LoopUtils.cpp:1877
static AffineForOp generateShiftedLoop(AffineMap lbMap, AffineMap ubMap, const std::vector< std::pair< uint64_t, ArrayRef< Operation * >>> &opGroupQueue, unsigned offset, AffineForOp srcForOp, OpBuilder b)
Generates an affine.for op with the specified lower and upper bounds while generating the right IV re...
Definition: LoopUtils.cpp:181
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, Block::iterator loc)
Move the loop body of AffineForOp 'src' from 'src' into the specified location in destination's body,...
Definition: LoopUtils.cpp:493
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newLoop, Value tileSize)
Set lower and upper bounds of inter-tile loops for parametric tiling.
Definition: LoopUtils.cpp:644
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, AffineForOp newInterTileLoop, AffineForOp newIntraTileLoop, Value tileSize)
Set lower and upper bounds of intra-tile loops for parametric tiling.
Definition: LoopUtils.cpp:549
static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
Definition: LoopUtils.cpp:2604
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor'...
Definition: LoopUtils.cpp:1002
static LogicalResult createFullTiles(MutableArrayRef< AffineForOp > inputNest, SmallVectorImpl< AffineForOp > &fullTileLoops, OpBuilder b)
Create the full tile loop nest (along with its body).
Definition: LoopUtils.cpp:2738
void moveLoopBody(AffineForOp src, AffineForOp dest)
Move the loop body of AffineForOp 'src' from 'src' to the start of dest body.
Definition: LoopUtils.cpp:502
static void getMultiLevelStrides(const MemRefRegion &region, ArrayRef< int64_t > bufferShape, SmallVectorImpl< StrideInfo > *strideInfos)
Returns striding information for a copy/transfer of this region with potentially multiple striding le...
Definition: LoopUtils.cpp:1921
void constructTiledLoopNest(MutableArrayRef< AffineForOp > origLoops, AffineForOp rootAffineForOp, unsigned width, MutableArrayRef< AffineForOp > tiledLoops)
Constructs tiled loop nest, without setting the loop bounds and move the body of the original loop ne...
Definition: LoopUtils.cpp:508
static void constructTiledIndexSetHyperRect(MutableArrayRef< AffineForOp > origLoops, MutableArrayRef< AffineForOp > newLoops, ArrayRef< unsigned > tileSizes)
Constructs and sets new loop bounds after tiling for the case of hyper-rectangular index sets,...
Definition: LoopUtils.cpp:773
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, uint64_t unrollFactor)
Helper to generate cleanup loop for unroll or unroll-and-jam when the trip count is not a multiple of...
Definition: LoopUtils.cpp:1056
static bool areInnerBoundsInvariant(AffineForOp forOp)
Check if all control operands of all loops are defined outside of forOp and return false if not.
Definition: LoopUtils.cpp:1169
static bool checkLoopInterchangeDependences(const std::vector< SmallVector< DependenceComponent, 2 >> &depCompsVec, ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Definition: LoopUtils.cpp:1426
static AffineIfOp createSeparationCondition(MutableArrayRef< AffineForOp > loops, OpBuilder b)
Creates an AffineIfOp that encodes the conditional to choose between the constant trip count version ...
Definition: LoopUtils.cpp:2658
LogicalResult performPreTilingChecks(MutableArrayRef< AffineForOp > input, ArrayRef< t > tileSizes)
Check if the input nest is supported for tiling and whether tiling would be legal or not.
Definition: LoopUtils.cpp:462
LogicalResult checkIfHyperRectangular(MutableArrayRef< AffineForOp > input)
Checks whether a loop nest is hyper-rectangular or not.
Definition: LoopUtils.cpp:441
static bool checkTilingLegalityImpl(MutableArrayRef< mlir::AffineForOp > origLoops)
Checks the legality of tiling of a hyper-rectangular loop nest by simply checking if there is a 'nega...
Definition: LoopUtils.cpp:367
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, AffineMap &cleanupLbMap, SmallVectorImpl< Value > &cleanupLbOperands)
Computes the cleanup loop lower bound of the loop being unrolled with the specified unroll factor; th...
Definition: LoopUtils.cpp:56
LogicalResult checkTilingLegality(MutableArrayRef< mlir::AffineForOp > origLoops)
Checks whether hyper-rectangular loop tiling of the nest represented by origLoops is valid.
Definition: LoopUtils.cpp:436
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, SmallVector< Value, 4 > *operands, int64_t offset=0)
Definition: LoopUtils.cpp:1626
static LogicalResult generateCopy(const MemRefRegion &region, Block *block, Block::iterator begin, Block::iterator end, Block *copyPlacementBlock, Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, AffineCopyOptions copyOptions, DenseMap< Value, Value > &fastBufferMap, DenseSet< Operation * > &copyNests, uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd)
Creates a buffer in the faster memory space for the specified memref region; generates a copy from th...
Definition: LoopUtils.cpp:2046
static AffineForOp generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, ArrayRef< AffineMap > lbMaps, ArrayRef< Value > lbOperands, ArrayRef< AffineMap > ubMaps, ArrayRef< Value > ubOperands, ArrayRef< AffineExpr > fastBufOffsets, bool isCopyOut, OpBuilder b)
Generates a point-wise copy from/to ‘memref’ to/from ‘fastMemRef’ and returns the outermost AffineFor...
Definition: LoopUtils.cpp:1957
static void replaceIterArgsAndYieldResults(AffineForOp forOp)
Helper to replace uses of loop carried values (iter_args) and loop yield values while promoting singl...
Definition: LoopUtils.cpp:114
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
AffineBound represents a lower or upper bound in the for operation.
Definition: AffineOps.h:486
operand_range getOperands()
Definition: AffineOps.h:499
Value getOperand(unsigned idx)
Definition: AffineOps.h:492
AffineMap getMap()
Definition: AffineOps.h:489
unsigned getNumOperands()
Definition: AffineOps.h:491
An integer constant appearing in an affine expression.
Definition: AffineExpr.h:232
AffineDmaStartOp starts a non-blocking DMA operation that transfers data from a source memref to a de...
Definition: AffineOps.h:92
AffineDmaWaitOp blocks until the completion of a DMA operation associated with the tag element 'tag[i...
Definition: AffineOps.h:285
Base type for affine expression.
Definition: AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:764
U dyn_cast() const
Definition: AffineExpr.h:281
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:42
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
Definition: AffineMap.cpp:310
unsigned getNumDims() const
Definition: AffineMap.cpp:306
ArrayRef< AffineExpr > getResults() const
Definition: AffineMap.cpp:319
unsigned getNumResults() const
Definition: AffineMap.cpp:314
unsigned getNumInputs() const
Definition: AffineMap.cpp:315
AffineExpr getResult(unsigned idx) const
Definition: AffineMap.cpp:323
An AffineValueMap is an affine map plus its ML value operands and results for analysis purposes.
ArrayRef< Value > getOperands() const
AffineMap getAffineMap() const
Block * lookup(Block *from) const
Lookup a mapped value within the map.
void map(Block *from, Block *to)
Inserts a new mapping for 'from' to 'to'.
Block represents an ordered list of Operations.
Definition: Block.h:30
OpListType::iterator iterator
Definition: Block.h:129
RetT walk(FnT &&callback)
Walk the operations in this block.
Definition: Block.h:271
Region * getParent() const
Provide a 'getParent' method for ilist_node_with_parent methods.
Definition: Block.cpp:26
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:232
OpListType & getOperations()
Definition: Block.h:126
Operation & front()
Definition: Block.h:142
iterator end()
Definition: Block.h:133
iterator begin()
Definition: Block.h:132
Operation * getParentOp()
Returns the closest surrounding operation that contains this block.
Definition: Block.cpp:30
AffineMap getSingleDimShiftAffineMap(int64_t shift)
Returns a map that shifts its (single) input dimension by 'shift'.
Definition: Builders.cpp:364
AffineMap getShiftedAffineMap(AffineMap map, int64_t shift)
Returns an affine map that is a translation (shift) of all result expressions in 'map' by 'shift'.
Definition: Builders.cpp:370
AffineMap getDimIdentityMap()
Definition: Builders.cpp:346
AffineMap getMultiDimIdentityMap(unsigned rank)
Definition: Builders.cpp:350
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:331
AffineExpr getAffineConstantExpr(int64_t constant)
Definition: Builders.cpp:335
AffineExpr getAffineDimExpr(unsigned position)
Definition: Builders.cpp:327
MLIRContext * getContext() const
Definition: Builders.h:54
FlatAffineValueConstraints represents an extension of IntegerPolyhedron where each non-local variable...
IntegerSet getAsIntegerSet(MLIRContext *context) const
Returns the constraint system as an integer set.
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned numReservedCols, unsigned numDims, unsigned numSymbols, unsigned numLocals=0)
Clears any existing data and reserves memory for the specified constraints.
void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, AffineValueMap &vmap, MLIRContext *context) const
Returns the bound for the variable at pos from the inequality at ineqPos as a 1-d affine value map (a...
This class represents a diagnostic that is inflight and set to be reported.
Definition: Diagnostics.h:307
An integer set representing a conjunction of one or more affine equalities and inequalities.
Definition: IntegerSet.h:44
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:64
This class helps build Operations.
Definition: Builders.h:198
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:383
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:242
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:422
Operation * clone(Operation &op, BlockAndValueMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:510
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:364
Operation * getOperation()
Inherit getOperation from OpState.
Definition: OpDefinition.h:91
This class implements the operand iterators for the Operation class.
Definition: ValueRange.h:41
Operation is a basic unit of execution within MLIR.
Definition: Operation.h:31
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:237
Operation * clone(BlockAndValueMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:558
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:154
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:225
Block * getBlock()
Returns the operation block that contains this operation.
Definition: Operation.h:144
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:480
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:203
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:194
InFlightDiagnostic emitRemark(const Twine &message={})
Emit a remark about this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:246
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
U cast() const
Definition: Types.h:280
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:349
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:85
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:199
MLIRContext * getContext() const
Utility to get the associated MLIRContext that this value is defined in.
Definition: Value.h:117
Type getType() const
Return the type of this value.
Definition: Value.h:114
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
static WalkResult advance()
Definition: Visitors.h:51
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:89
void removeIndependentConstraints(unsigned pos, unsigned num)
Removes constraints that are independent of (i.e., do not have a coefficient) variables in the range ...
void removeTrivialRedundancy()
Removes duplicate constraints, trivially true constraints, and constraints that can be detected as re...
Optional< MPInt > getConstantBoundOnDimSize(unsigned pos, SmallVectorImpl< MPInt > *lb=nullptr, MPInt *boundFloorDivisor=nullptr, SmallVectorImpl< MPInt > *ub=nullptr, unsigned *minLbPos=nullptr, unsigned *minUbPos=nullptr) const
Returns the smallest known constant bound for the extent of the specified variable (pos^th),...
bool isHyperRectangular(unsigned pos, unsigned num) const
Returns true if the set can be trivially detected as being hyper-rectangular on the specified contigu...
ArrayRef< MPInt > getInequality(unsigned idx) const
MPInt atIneq(unsigned i, unsigned j) const
Returns the value at the specified inequality row and column.
void setDimSymbolSeparation(unsigned newSymbolCount)
Changes the partition between dimensions and symbols.
unsigned getNumCols() const
Returns the number of columns in the constraint system.
void getLowerAndUpperBoundIndices(unsigned pos, SmallVectorImpl< unsigned > *lbIndices, SmallVectorImpl< unsigned > *ubIndices, SmallVectorImpl< unsigned > *eqIndices=nullptr, unsigned offset=0, unsigned num=0) const
Gather positions of all lower and upper bounds of the variable at pos, and optionally any equalities ...
void removeVar(VarKind kind, unsigned pos)
Removes variables of the specified kind with the specified pos (or within the specified range) from t...
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Definition: ArithOps.cpp:2216
void walk(Operation *op, function_ref< void(Region *)> callback, WalkOrder order)
Walk all of the regions, blocks, or operations nested under (and including) the given operation.
Definition: Visitors.cpp:24
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:230
bool isParallelLoop(Operation &op)
LLVM_ATTRIBUTE_ALWAYS_INLINE MPInt ceilDiv(const MPInt &lhs, const MPInt &rhs)
Definition: MPInt.h:374
LLVM_ATTRIBUTE_ALWAYS_INLINE MPInt floorDiv(const MPInt &lhs, const MPInt &rhs)
Definition: MPInt.h:382
Include the generated interface declarations.
bool hasDependence(DependenceResult result)
Utility function that returns true if the provided DependenceResult corresponds to a dependence resul...
AffineMap simplifyAffineMap(AffineMap map)
Simplifies an affine map by simplifying its underlying AffineExpr results.
Definition: AffineMap.cpp:650
void canonicalizeSetAndOperands(IntegerSet *set, SmallVectorImpl< Value > *operands)
Canonicalizes an integer set the same way canonicalizeMapAndOperands does for affine maps.
Definition: AffineOps.cpp:1237
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:1091
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
void extractForInductionVars(ArrayRef< AffineForOp > forInsts, SmallVectorImpl< Value > *ivs)
Extracts the induction variables from a list of AffineForOps and places them in the output argument i...
Definition: AffineOps.cpp:2314
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:24
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
Definition: Utils.cpp:1254
AffineMap removeDuplicateExprs(AffineMap map)
Returns a map with the same dimension and symbol count as map, but whose results are the unique affin...
Definition: AffineMap.cpp:660
LogicalResult getIndexSet(MutableArrayRef< Operation * > ops, FlatAffineValueConstraints *domain)
Builds a system of constraints with dimensional variables corresponding to the loop IVs of the forOps...
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2793
unsigned permuteLoops(MutableArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1496
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:989
void getTileableBands(func::FuncOp f, std::vector< SmallVector< AffineForOp, 6 >> *bands)
Identify valid and profitable bands of loops to tile.
Definition: LoopUtils.cpp:962
bool succeeded(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
Definition: LogicalResult.h:68
Optional< uint64_t > getMemRefSizeInBytes(MemRefType memRefType)
Returns the size of memref data in bytes if it's statically shaped, None otherwise.
Definition: Utils.cpp:641
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2620
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
AffineForOp replaceForOpWithNewYields(OpBuilder &b, AffineForOp loop, ValueRange newIterOperands, ValueRange newYieldedValues, ValueRange newIterArgs, bool replaceLoopResults=true)
Replace loop with a new loop where newIterOperands are appended with new initialization values and ne...
Definition: AffineOps.cpp:2403
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1701
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:903
void getDependenceComponents(AffineForOp forOp, unsigned maxLoopDepth, std::vector< SmallVector< DependenceComponent, 2 >> *depCompsVec)
Returns in 'depCompsVec', dependence components for dependences between all load and store ops in loo...
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:945
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2636
uint64_t getLargestDivisorOfTripCount(AffineForOp forOp)
Returns the greatest known integral divisor of the trip count.
Optional< uint64_t > getConstantTripCount(AffineForOp forOp)
Returns the trip count of the loop if it's a constant, None otherwise.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:343
void getLoopIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the loops surrounding 'op' ordered from the outermost 'affine....
Definition: Utils.cpp:35
void getSupportedReductions(AffineForOp forOp, SmallVectorImpl< LoopReduction > &supportedReductions)
Populate supportedReductions with descriptors of the supported reductions.
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:235
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of an AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:131
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1403
LogicalResult applyOpPatternsAndFold(Operation *op, const FrozenRewritePatternSet &patterns, bool *erased=nullptr)
Applies the specified patterns on op alone while also trying to fold it, by selecting the highest ben...
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2576
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1460
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, Optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2377
void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Given an affine map map and its input operands, this method composes into map, maps of AffineApplyOps...
Definition: AffineOps.cpp:867
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:974
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1474
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1838
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1724
unsigned getNestingDepth(Operation *op)
Returns the nesting depth of this operation, i.e., the number of loops surrounding this operation.
Definition: Utils.cpp:1238
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef< uint64_t > shifts)
Checks whether SSA dominance would be violated if a for op's body operations are shifted by the specified shifts.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1158
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1566
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1209
void canonicalizeMapAndOperands(AffineMap *map, SmallVectorImpl< Value > *operands)
Modifies both map and operands in-place so as to:
Definition: AffineOps.cpp:1232
DependenceResult checkMemrefAccessDependence(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints, SmallVector< DependenceComponent, 2 > *dependenceComponents, bool allowRAR=false)
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, SmallVectorImpl< Value > *operands)
Returns the trip count of the loop as an affine map with its corresponding operands if the latter is ...
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:859
std::vector< std::pair< Block::iterator, Block::iterator > > subBlocks
Definition: LoopUtils.cpp:1185
void walk(Operation *op)
Definition: LoopUtils.cpp:1188
void walk(Block &block)
Definition: LoopUtils.cpp:1194
int64_t stride
Definition: LoopUtils.cpp:1912
int64_t numEltPerStride
Definition: LoopUtils.cpp:1913
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:161
uint64_t fastMemCapacityBytes
Definition: LoopUtils.h:171
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:199
Optional< int64_t > ub
Optional< int64_t > lb
Checks whether two accesses to the same memref access the same element.
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
A description of a (parallelizable) reduction in an affine loop.
Encapsulates a memref load or store access information.
Operation * opInst
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:250
FlatAffineValueConstraints * getConstraints()
Definition: Utils.h:289
void setWrite(bool flag)
Definition: Utils.h:292
Optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, std::vector< SmallVector< int64_t, 4 >> *lbs=nullptr, SmallVectorImpl< int64_t > *lbDivisors=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
Definition: Utils.cpp:342
Location loc
If there is more than one load/store op associated with the region, the location information would co...
Definition: Utils.h:343
void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const
Gets the lower and upper bound map for the dimensional variable at pos.
Definition: Utils.cpp:403
bool isWrite() const
Definition: Utils.h:291
Value memref
Memref that this region corresponds to.
Definition: Utils.h:336
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.