MLIR  20.0.0git
Utils.cpp
Go to the documentation of this file.
1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
19 #include "mlir/IR/BuiltinOps.h"
20 #include "mlir/IR/IRMapping.h"
21 #include "mlir/IR/OpDefinition.h"
22 #include "mlir/IR/PatternMatch.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallPtrSet.h"
28 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <cstdint>
32 
33 using namespace mlir;
34 
35 #define DEBUG_TYPE "scf-utils"
36 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
37 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
38 
// Replaces a perfectly nested loop nest so that `newIterOperands` are
// threaded through every loop level as additional iter_args / yields.
// NOTE(review): the declaration line (return type and name) is not visible
// in this chunk; the recursive call below shows the function is
// `replaceLoopNestWithNewYields` -- confirm against the header.
    RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loopNest,
    ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn,
    bool replaceIterOperandsUsesInLoop) {
  // Nothing to do for an empty nest.
  if (loopNest.empty())
    return {};
  // This method is recursive (to make it more readable). Adding an
  // assertion here to limit the recursion. (See
  // https://discourse.llvm.org/t/rfc-update-to-mlir-developer-policy-on-recursion/62235)
  assert(loopNest.size() <= 10 &&
         "exceeded recursion limit when yielding value from loop nest");

  // To yield a value from a perfectly nested loop nest, the following
  // pattern needs to be created, i.e. starting with
  //
  // ```mlir
  // scf.for .. {
  //   scf.for .. {
  //     scf.for .. {
  //       %value = ...
  //     }
  //   }
  // }
  // ```
  //
  // needs to be modified to
  //
  // ```mlir
  // %0 = scf.for .. iter_args(%arg0 = %init) {
  //   %1 = scf.for .. iter_args(%arg1 = %arg0) {
  //     %2 = scf.for .. iter_args(%arg2 = %arg1) {
  //       %value = ...
  //       scf.yield %value
  //     }
  //     scf.yield %2
  //   }
  //   scf.yield %1
  // }
  // ```
  //
  // The inner most loop is handled using the `replaceWithAdditionalYields`
  // that works on a single loop.
  if (loopNest.size() == 1) {
    auto innerMostLoop =
        cast<scf::ForOp>(*loopNest.back().replaceWithAdditionalYields(
            rewriter, newIterOperands, replaceIterOperandsUsesInLoop,
            newYieldValuesFn));
    return {innerMostLoop};
  }
  // The outer loops are modified by calling this method recursively
  // - The return value of the inner loop is the value yielded by this loop.
  // - The region iter args of this loop are the init_args for the inner loop.
  SmallVector<scf::ForOp> newLoopNest;
  // Yield-values callback for the outermost loop: first rewrite the inner
  // nest (seeded with this loop's new region iter_args), then yield the new
  // trailing results of the rewritten inner loop.
  NewYieldValuesFn fn =
      [&](OpBuilder &innerBuilder, Location loc,
          ArrayRef<BlockArgument> innerNewBBArgs) -> SmallVector<Value> {
    newLoopNest = replaceLoopNestWithNewYields(rewriter, loopNest.drop_front(),
                                               innerNewBBArgs, newYieldValuesFn,
                                               replaceIterOperandsUsesInLoop);
    return llvm::to_vector(llvm::map_range(
        newLoopNest.front().getResults().take_back(innerNewBBArgs.size()),
        [](OpResult r) -> Value { return r; }));
  };
  scf::ForOp outerMostLoop =
      cast<scf::ForOp>(*loopNest.front().replaceWithAdditionalYields(
          rewriter, newIterOperands, replaceIterOperandsUsesInLoop, fn));
  // Prepend the (new) outermost loop so the returned nest is outer-to-inner.
  newLoopNest.insert(newLoopNest.begin(), outerMostLoop);
  return newLoopNest;
}
108 
/// Outline a region with a single block into a new FuncOp.
/// Assumes the FuncOp result types is the type of the yielded operands of the
/// single block. This constraint makes it easy to determine the result.
/// This method also clones the `arith::ConstantIndexOp` at the start of
/// `outlinedFuncBody` to allow simple canonicalizations. If `callOp` is
/// provided, it will be set to point to the operation that calls the outlined
/// function.
// TODO: support more than single-block regions.
// TODO: more flexible constant handling.
FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                       Location loc,
                                                       Region &region,
                                                       StringRef funcName,
                                                       func::CallOp *callOp) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported for now.
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<func::FuncOp>());

  // Values defined above the region become extra arguments of the outlined
  // function.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The outlined function returns whatever the region's terminator yields.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc =
      rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
                                    originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    // Call operands: the region's own block arguments followed by captures.
    SmallVector<Value> callValues;
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);
    if (callOp)
      *callOp = call;

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    IRMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Index constants are cheap to rematerialize inside the outlined
      // function, which enables simple canonicalizations; prefer the clone
      // over the corresponding function argument.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        IRMapping bvm;
        repl = rewriter.clone(*cst, bvm)->getResult(0);
      }
    }
    // Only rewrite uses inside the outlined function; outside uses keep the
    // original value.
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
221 
222 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
223  func::FuncOp *thenFn, StringRef thenFnName,
224  func::FuncOp *elseFn, StringRef elseFnName) {
225  IRRewriter rewriter(b);
226  Location loc = ifOp.getLoc();
227  FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
228  if (thenFn && !ifOp.getThenRegion().empty()) {
229  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
230  rewriter, loc, ifOp.getThenRegion(), thenFnName);
231  if (failed(outlinedFuncOpOrFailure))
232  return failure();
233  *thenFn = *outlinedFuncOpOrFailure;
234  }
235  if (elseFn && !ifOp.getElseRegion().empty()) {
236  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
237  rewriter, loc, ifOp.getElseRegion(), elseFnName);
238  if (failed(outlinedFuncOpOrFailure))
239  return failure();
240  *elseFn = *outlinedFuncOpOrFailure;
241  }
242  return success();
243 }
244 
// Recursively collects innermost scf.parallel loops under `rootOp` into
// `result`; returns whether `rootOp` encloses any parallel loop.
// NOTE(review): the declaration line (return type, name and parameter list)
// is not visible in this chunk; the recursive call below shows the function
// is `getInnermostParallelLoops` -- confirm against the header.
  assert(rootOp != nullptr && "Root operation must not be a nullptr.");
  bool rootEnclosesPloops = false;
  for (Region &region : rootOp->getRegions()) {
    for (Block &block : region.getBlocks()) {
      for (Operation &op : block) {
        // Recurse first so we know whether `op` itself encloses any
        // parallel loops.
        bool enclosesPloops = getInnermostParallelLoops(&op, result);
        rootEnclosesPloops |= enclosesPloops;
        if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
          rootEnclosesPloops = true;

          // Collect parallel loop if it is an innermost one.
          if (!enclosesPloops)
            result.push_back(ploop);
        }
      }
    }
  }
  return rootEnclosesPloops;
}
266 
267 // Build the IR that performs ceil division of a positive value by a constant:
268 // ceildiv(a, B) = divis(a + (B-1), B)
269 // where divis is rounding-to-zero division.
270 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
271  int64_t divisor) {
272  assert(divisor > 0 && "expected positive divisor");
273  assert(dividend.getType().isIndex() && "expected index-typed value");
274 
275  Value divisorMinusOneCst =
276  builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
277  Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
278  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
279  return builder.create<arith::DivUIOp>(loc, sum, divisorCst);
280 }
281 
282 // Build the IR that performs ceil division of a positive value by another
283 // positive value:
284 // ceildiv(a, b) = divis(a + (b - 1), b)
285 // where divis is rounding-to-zero division.
286 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
287  Value divisor) {
288  assert(dividend.getType().isIndex() && "expected index-typed value");
289 
290  Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
291  Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
292  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
293  return builder.create<arith::DivUIOp>(loc, sum, divisor);
294 }
295 
296 /// Returns the trip count of `forOp` if its' low bound, high bound and step are
297 /// constants, or optional otherwise. Trip count is computed as ceilDiv(highBound
298 /// - lowBound, step).
299 static std::optional<int64_t> getConstantTripCount(scf::ForOp forOp) {
300  std::optional<int64_t> lbCstOp = getConstantIntValue(forOp.getLowerBound());
301  std::optional<int64_t> ubCstOp = getConstantIntValue(forOp.getUpperBound());
302  std::optional<int64_t> stepCstOp = getConstantIntValue(forOp.getStep());
303  if (!lbCstOp.has_value() || !ubCstOp.has_value() || !stepCstOp.has_value())
304  return {};
305 
306  // Constant loop bounds computation.
307  int64_t lbCst = lbCstOp.value();
308  int64_t ubCst = ubCstOp.value();
309  int64_t stepCst = stepCstOp.value();
310  assert(lbCst >= 0 && ubCst >= 0 && stepCst > 0 &&
311  "expected positive loop bounds and step");
312  return llvm::divideCeilSigned(ubCst - lbCst, stepCst);
313 }
314 
/// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
/// unrolled iteration using annotateFn.
// NOTE(review): the declaration line (return type and function name,
// presumably `static void generateUnrolledLoop(`) is not visible in this
// chunk -- confirm against the original file.
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default to a no-op annotation callback.
  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    IRMapping operandMap;

    // Prepare operand map: iter_args of copy `i` read the values yielded by
    // copy `i - 1`.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
370 
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
// NOTE(review): the declaration line (presumably
// `LogicalResult mlir::loopUnrollByFactor(`) is not visible in this chunk --
// confirm against the original file.
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  IRRewriter rewriter(forOp.getContext());
  auto loc = forOp.getLoc();
  Value step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  std::optional<int64_t> constTripCount = getConstantTripCount(forOp);
  if (constTripCount) {
    // Constant loop bounds computation.
    int64_t lbCst = getConstantIntValue(forOp.getLowerBound()).value();
    int64_t ubCst = getConstantIntValue(forOp.getUpperBound()).value();
    int64_t stepCst = getConstantIntValue(forOp.getStep()).value();
    if (unrollFactor == 1) {
      // Factor 1 is a no-op, except that a single-iteration loop is promoted
      // (its body inlined into the parent).
      if (*constTripCount == 1 &&
          failed(forOp.promoteIfSingleIteration(rewriter)))
        return failure();
      return success();
    }

    // Largest multiple of `unrollFactor` iterations handled by the unrolled
    // loop; the remainder (if any) goes to the epilogue loop.
    int64_t tripCountEvenMultiple =
        *constTripCount - (*constTripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
          loc, upperBoundUnrolledCst);
    else
      upperBoundUnrolled = forOp.getUpperBound();

    // Create constant for 'stepUnrolled'.
    stepUnrolled = stepCst == stepUnrolledCst
                       ? step
                       : boundsBuilder.create<arith::ConstantIndexOp>(
                             loc, stepUnrolledCst);
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst =
        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
    Value tripCountRem =
        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();

    for (auto e : llvm::zip(results, epilogueResults)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    }
    // The epilogue's init args are the results of the unrolled loop.
    epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
                               epilogueForOp.getInitArgs().size(), results);
    (void)epilogueForOp.promoteIfSingleIteration(rewriter);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // NOTE(review): the call line naming the body-unrolling helper (presumably
  // `generateUnrolledLoop(`) is not visible in this chunk.
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = b.create<arith::MulIOp>(
            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
        return b.create<arith::AddIOp>(loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)forOp.promoteIfSingleIteration(rewriter);
  return success();
}
487 
488 /// Check if bounds of all inner loops are defined outside of `forOp`
489 /// and return false if not.
490 static bool areInnerBoundsInvariant(scf::ForOp forOp) {
491  auto walkResult = forOp.walk([&](scf::ForOp innerForOp) {
492  if (!forOp.isDefinedOutsideOfLoop(innerForOp.getLowerBound()) ||
493  !forOp.isDefinedOutsideOfLoop(innerForOp.getUpperBound()) ||
494  !forOp.isDefinedOutsideOfLoop(innerForOp.getStep()))
495  return WalkResult::interrupt();
496 
497  return WalkResult::advance();
498  });
499  return !walkResult.wasInterrupted();
500 }
501 
/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp,
                                          uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  // Factor 1 is a no-op.
  if (unrollJamFactor == 1)
    return success();

  // If any control operand of any inner loop of `forOp` is defined within
  // `forOp`, no unroll jam.
  if (!areInnerBoundsInvariant(forOp)) {
    LDBG("failed to unroll and jam: inner bounds are not invariant");
    return failure();
  }

  // Currently, for operations with results are not supported.
  if (forOp->getNumResults() > 0) {
    LDBG("failed to unroll and jam: unsupported loop with results");
    return failure();
  }

  // Currently, only constant trip count that divided by the unroll factor is
  // supported.
  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount.has_value()) {
    // If the trip count is dynamic, do not unroll & jam.
    LDBG("failed to unroll and jam: trip count could not be determined");
    return failure();
  }
  if (unrollJamFactor > *tripCount) {
    LDBG("unroll and jam factor is greater than trip count, set factor to trip "
         "count");
    unrollJamFactor = *tripCount;
  } else if (*tripCount % unrollJamFactor != 0) {
    LDBG("failed to unroll and jam: unsupported trip count that is not a "
         "multiple of unroll jam factor");
    return failure();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Gather all sub-blocks to jam upon the loop being unrolled.
  // NOTE(review): the declaration/construction of `jbg` (a block-gatherer
  // object, presumably a `JamBlockGatherer`) is not visible in this chunk --
  // confirm against the original file.
  jbg.walk(forOp);
  auto &subBlocks = jbg.subBlocks;

  // Collect inner loops.
  SmallVector<scf::ForOp> innerLoops;
  forOp.walk([&](scf::ForOp innerForOp) { innerLoops.push_back(innerForOp); });

  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
  // iteration. There are (`unrollJamFactor` - 1) iterations.
  SmallVector<IRMapping> operandMaps(unrollJamFactor - 1);

  // For any loop with iter_args, replace it with a new loop that has
  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
  // operands.
  SmallVector<scf::ForOp> newInnerLoops;
  IRRewriter rewriter(forOp.getContext());
  for (scf::ForOp oldForOp : innerLoops) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    ValueRange oldYieldOperands =
        cast<scf::YieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Get additional iterOperands, iterArgs, and yield operands. We will
    // fix iterOperands and yield operands after cloning of sub-blocks.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with additional iterOperands, iter_args and yield
    // operands. This new loop will take the loop body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    scf::ForOp newForOp =
        cast<scf::ForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newInnerLoops.push_back(newForOp);
    // `forOp` has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;
    // Update `operandMaps` for `newForOp` iterArgs and results.
    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
        // results. Update `operandMaps[i - 1]` to map old iterArgs and results
        // to those in the `i`th new set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }

  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
  rewriter.setInsertionPoint(forOp);
  int64_t step = forOp.getConstantStep()->getSExtValue();
  auto newStep = rewriter.createOrFold<arith::MulIOp>(
      forOp.getLoc(), forOp.getStep(),
      rewriter.createOrFold<arith::ConstantOp>(
          forOp.getLoc(), rewriter.getIndexAttr(unrollJamFactor)));
  forOp.setStep(newStep);
  auto forOpIV = forOp.getInductionVar();

  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies. Insert right at the end of
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, create a remapping to the value for
      // this unrolled instance.
      if (!forOpIV.use_empty()) {
        // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
        auto ivTag = builder.createOrFold<arith::ConstantOp>(
            forOp.getLoc(), builder.getIndexAttr(step * i));
        auto ivUnroll =
            builder.createOrFold<arith::AddIOp>(forOp.getLoc(), forOpIV, ivTag);
        operandMaps[i - 1].map(forOpIV, ivUnroll);
      }
      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iterOperands and yield op operands of newly created loops.
    for (auto newForOp : newInnerLoops) {
      unsigned oldNumIterOperands =
          newForOp.getNumRegionIterArgs() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The `i`th duplication of an old iterOperand or yield op operand
        // needs to be replaced with a mapped value from `operandMaps[i - 1]`
        // if such mapped value exists.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)forOp.promoteIfSingleIteration(rewriter);
  return success();
}
666 
// Emit IR normalizing a loop range (lb, ub, step) to run from 0 to the
// iteration count with step 1, returning the new (lb, ub, step) triple.
// NOTE(review): the declaration line (return type -- a struct whose fields
// callers below access as `.offset`/`.size`/`.stride` -- and the function
// name `emitNormalizedLoopBounds`) is not visible in this chunk; confirm
// against the header.
                                       OpFoldResult lb, OpFoldResult ub,
                                       OpFoldResult step) {
  // For non-index types, generate `arith` instructions
  // Check if the loop is already known to have a constant zero lower bound or
  // a constant one step.
  bool isZeroBased = false;
  if (auto lbCst = getConstantIntValue(lb))
    isZeroBased = lbCst.value() == 0;

  bool isStepOne = false;
  if (auto stepCst = getConstantIntValue(step))
    isStepOne = stepCst.value() == 1;

  // All three bounds must share a single type.
  Type rangeType = getType(lb);
  assert(rangeType == getType(ub) && rangeType == getType(step) &&
         "expected matching types");

  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
  // assuming the step is strictly positive. Update the bounds and the step
  // of the loop to go from 0 to the number of iterations, if necessary.
  if (isZeroBased && isStepOne)
    return {lb, ub, step};

  // diff = ub - lb; skipped when lb is statically 0.
  OpFoldResult diff = ub;
  if (!isZeroBased) {
    diff = rewriter.createOrFold<arith::SubIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, ub),
        getValueOrCreateConstantIntOp(rewriter, loc, lb));
  }
  // newUpperBound = ceildiv(diff, step); skipped when step is statically 1.
  OpFoldResult newUpperBound = diff;
  if (!isStepOne) {
    newUpperBound = rewriter.createOrFold<arith::CeilDivSIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, diff),
        getValueOrCreateConstantIntOp(rewriter, loc, step));
  }

  // Normalized loop runs from 0 with unit step.
  OpFoldResult newLowerBound = rewriter.getZeroAttr(rangeType);
  OpFoldResult newStep = rewriter.getOneAttr(rangeType);

  return {newLowerBound, newUpperBound, newStep};
}
709 
// Rewrite uses of a normalized induction variable `normalizedIv` back in
// terms of the original loop: denormalizedIv = normalizedIv * origStep +
// origLb, skipping multiplications/additions that are statically no-ops.
// NOTE(review): the declaration line (presumably
// `void mlir::denormalizeInductionVariable(RewriterBase &rewriter,
// Location loc,`) is not visible in this chunk -- confirm against the header.
                                          Value normalizedIv,
                                          OpFoldResult origLb,
                                          OpFoldResult origStep) {
  Value denormalizedIv;
  // NOTE(review): the declaration of `preserve` (presumably a
  // `SmallPtrSet<Operation *, N>` of ops whose uses of `normalizedIv` must
  // not be rewritten) is not visible in this chunk.
  bool isStepOne = isConstantIntValue(origStep, 1);
  bool isZeroBased = isConstantIntValue(origLb, 0);

  // scaled = normalizedIv * origStep (elided when origStep == 1).
  Value scaled = normalizedIv;
  if (!isStepOne) {
    Value origStepValue =
        getValueOrCreateConstantIntOp(rewriter, loc, origStep);
    scaled = rewriter.create<arith::MulIOp>(loc, normalizedIv, origStepValue);
    preserve.insert(scaled.getDefiningOp());
  }
  // denormalizedIv = scaled + origLb (elided when origLb == 0).
  denormalizedIv = scaled;
  if (!isZeroBased) {
    Value origLbValue = getValueOrCreateConstantIntOp(rewriter, loc, origLb);
    denormalizedIv = rewriter.create<arith::AddIOp>(loc, scaled, origLbValue);
    preserve.insert(denormalizedIv.getDefiningOp());
  }

  // Replace all uses of the normalized IV except the ops computing the
  // denormalization itself (otherwise they would form a cycle).
  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
}
734 
/// Helper function to multiply a sequence of values.
// NOTE(review): the declaration line (presumably
// `static Value getProductOfIntsOrIndexes(RewriterBase &rewriter,
// Location loc,`) is not visible in this chunk -- confirm against the
// original file.
                                  ArrayRef<Value> values) {
  assert(!values.empty() && "unexpected empty list");
  // Running product; unset until the first non-unit factor is seen.
  std::optional<Value> productOf;
  for (auto v : values) {
    auto vOne = getConstantIntValue(v);
    // Skip factors statically known to be 1.
    if (vOne && vOne.value() == 1)
      continue;
    if (productOf)
      productOf =
          rewriter.create<arith::MulIOp>(loc, productOf.value(), v).getResult();
    else
      productOf = v;
  }
  if (!productOf) {
    // All factors were 1: materialize the constant 1 of the element type.
    productOf = rewriter
                    .create<arith::ConstantOp>(
                        loc, rewriter.getOneAttr(values.front().getType()))
                    .getResult();
  }
  return productOf.value();
}
758 
/// For each original loop, the value of the
/// induction variable can be obtained by dividing the induction variable of
/// the linearized loop by the total number of iterations of the loops nested
/// in it modulo the number of iterations in this loop (remove the values
/// related to the outer loops):
/// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
/// Compute these iteratively from the innermost loop by creating a "running
/// quotient" of division by the range.
static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
// NOTE(review): the line carrying the function name (presumably
// `delinearizeInductionVariable(RewriterBase &rewriter, Location loc,`) is
// not visible in this chunk -- confirm against the original file.
                             Value linearizedIv, ArrayRef<Value> ubs) {
  SmallVector<Value> delinearizedIvs(ubs.size());
  // Ops created here whose uses of `linearizedIv` the caller must keep when
  // it later rewrites uses of the IV.
  SmallPtrSet<Operation *, 2> preservedUsers;

  // Record which extents are statically known to be 1.
  llvm::BitVector isUbOne(ubs.size());
  for (auto [index, ub] : llvm::enumerate(ubs)) {
    auto ubCst = getConstantIntValue(ub);
    if (ubCst && ubCst.value() == 1)
      isUbOne.set(index);
  }

  // Prune the lead ubs that are all ones.
  unsigned numLeadingOneUbs = 0;
  for (auto [index, ub] : llvm::enumerate(ubs)) {
    if (!isUbOne.test(index)) {
      break;
    }
    // A unit-extent dimension always has IV zero.
    delinearizedIvs[index] = rewriter.create<arith::ConstantOp>(
        loc, rewriter.getZeroAttr(ub.getType()));
    numLeadingOneUbs++;
  }

  // Running quotient, processed from the innermost (last) dimension outward.
  Value previous = linearizedIv;
  for (unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
    unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
    // Divide out the extent of the dimension handled in the previous step
    // (skipped when that extent is statically 1).
    if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
      previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
      preservedUsers.insert(previous.getDefiningOp());
    }
    Value iv = previous;
    if (i != e - 1) {
      if (!isUbOne.test(idx)) {
        iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
        preservedUsers.insert(iv.getDefiningOp());
      } else {
        // Unit-extent dimension: IV is the constant zero.
        iv = rewriter.create<arith::ConstantOp>(
            loc, rewriter.getZeroAttr(ubs[idx].getType()));
      }
    }
    delinearizedIvs[idx] = iv;
  }
  return {delinearizedIvs, preservedUsers};
}
812 
813 LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
815  if (loops.size() < 2)
816  return failure();
817 
818  scf::ForOp innermost = loops.back();
819  scf::ForOp outermost = loops.front();
820 
821  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
822  // allows the following code to assume upperBound is the number of iterations.
823  for (auto loop : loops) {
824  OpBuilder::InsertionGuard g(rewriter);
825  rewriter.setInsertionPoint(outermost);
826  Value lb = loop.getLowerBound();
827  Value ub = loop.getUpperBound();
828  Value step = loop.getStep();
829  auto newLoopRange =
830  emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
831 
832  rewriter.modifyOpInPlace(loop, [&]() {
833  loop.setLowerBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
834  newLoopRange.offset));
835  loop.setUpperBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
836  newLoopRange.size));
837  loop.setStep(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
838  newLoopRange.stride));
839  });
840  rewriter.setInsertionPointToStart(innermost.getBody());
841  denormalizeInductionVariable(rewriter, loop.getLoc(),
842  loop.getInductionVar(), lb, step);
843  }
844 
845  // 2. Emit code computing the upper bound of the coalesced loop as product
846  // of the number of iterations of all loops.
847  OpBuilder::InsertionGuard g(rewriter);
848  rewriter.setInsertionPoint(outermost);
849  Location loc = outermost.getLoc();
850  SmallVector<Value> upperBounds = llvm::map_to_vector(
851  loops, [](auto loop) { return loop.getUpperBound(); });
852  Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
853  outermost.setUpperBound(upperBound);
854 
855  rewriter.setInsertionPointToStart(innermost.getBody());
856  auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
857  rewriter, loc, outermost.getInductionVar(), upperBounds);
858  rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
859  preservedUsers);
860 
861  for (int i = loops.size() - 1; i > 0; --i) {
862  auto outerLoop = loops[i - 1];
863  auto innerLoop = loops[i];
864 
865  Operation *innerTerminator = innerLoop.getBody()->getTerminator();
866  auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
867  rewriter.eraseOp(innerTerminator);
868 
869  SmallVector<Value> innerBlockArgs;
870  innerBlockArgs.push_back(delinearizeIvs[i]);
871  llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
872  rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
873  Block::iterator(innerLoop), innerBlockArgs);
874  rewriter.replaceOp(innerLoop, yieldedVals);
875  }
876  return success();
877 }
878 
880  if (loops.empty()) {
881  return failure();
882  }
883  IRRewriter rewriter(loops.front().getContext());
884  return coalesceLoops(rewriter, loops);
885 }
886 
887 LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
888  LogicalResult result(failure());
890  getPerfectlyNestedLoops(loops, op);
891 
892  // Look for a band of loops that can be coalesced, i.e. perfectly nested
893  // loops with bounds defined above some loop.
894 
895  // 1. For each loop, find above which parent loop its bounds operands are
896  // defined.
897  SmallVector<unsigned> operandsDefinedAbove(loops.size());
898  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
899  operandsDefinedAbove[i] = i;
900  for (unsigned j = 0; j < i; ++j) {
901  SmallVector<Value> boundsOperands = {loops[i].getLowerBound(),
902  loops[i].getUpperBound(),
903  loops[i].getStep()};
904  if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) {
905  operandsDefinedAbove[i] = j;
906  break;
907  }
908  }
909  }
910 
911  // 2. For each inner loop check that the iter_args for the immediately outer
912  // loop are the init for the immediately inner loop and that the yields of the
913  // return of the inner loop is the yield for the immediately outer loop. Keep
914  // track of where the chain starts from for each loop.
915  SmallVector<unsigned> iterArgChainStart(loops.size());
916  iterArgChainStart[0] = 0;
917  for (unsigned i = 1, e = loops.size(); i < e; ++i) {
918  // By default set the start of the chain to itself.
919  iterArgChainStart[i] = i;
920  auto outerloop = loops[i - 1];
921  auto innerLoop = loops[i];
922  if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
923  continue;
924  }
925  if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
926  continue;
927  }
928  auto outerloopTerminator = outerloop.getBody()->getTerminator();
929  if (!llvm::equal(outerloopTerminator->getOperands(),
930  innerLoop.getResults())) {
931  continue;
932  }
933  iterArgChainStart[i] = iterArgChainStart[i - 1];
934  }
935 
936  // 3. Identify bands of loops such that the operands of all of them are
937  // defined above the first loop in the band. Traverse the nest bottom-up
938  // so that modifications don't invalidate the inner loops.
939  for (unsigned end = loops.size(); end > 0; --end) {
940  unsigned start = 0;
941  for (; start < end - 1; ++start) {
942  auto maxPos =
943  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
944  std::next(operandsDefinedAbove.begin(), end));
945  if (maxPos > start)
946  continue;
947  if (iterArgChainStart[end - 1] > start)
948  continue;
949  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
950  if (succeeded(coalesceLoops(band)))
951  result = success();
952  break;
953  }
954  // If a band was found and transformed, keep looking at the loops above
955  // the outermost transformed loop.
956  if (start != end - 1)
957  end = start + 1;
958  }
959  return result;
960 }
961 
963  RewriterBase &rewriter, scf::ParallelOp loops,
964  ArrayRef<std::vector<unsigned>> combinedDimensions) {
965  OpBuilder::InsertionGuard g(rewriter);
966  rewriter.setInsertionPoint(loops);
967  Location loc = loops.getLoc();
968 
969  // Presort combined dimensions.
970  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
971  for (auto &dims : sortedDimensions)
972  llvm::sort(dims);
973 
974  // Normalize ParallelOp's iteration pattern.
975  SmallVector<Value, 3> normalizedUpperBounds;
976  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
977  OpBuilder::InsertionGuard g2(rewriter);
978  rewriter.setInsertionPoint(loops);
979  Value lb = loops.getLowerBound()[i];
980  Value ub = loops.getUpperBound()[i];
981  Value step = loops.getStep()[i];
982  auto newLoopRange = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
983  normalizedUpperBounds.push_back(getValueOrCreateConstantIntOp(
984  rewriter, loops.getLoc(), newLoopRange.size));
985 
986  rewriter.setInsertionPointToStart(loops.getBody());
987  denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
988  step);
989  }
990 
991  // Combine iteration spaces.
992  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
993  auto cst0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
994  auto cst1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
995  for (auto &sortedDimension : sortedDimensions) {
996  Value newUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, 1);
997  for (auto idx : sortedDimension) {
998  newUpperBound = rewriter.create<arith::MulIOp>(
999  loc, newUpperBound, normalizedUpperBounds[idx]);
1000  }
1001  lowerBounds.push_back(cst0);
1002  steps.push_back(cst1);
1003  upperBounds.push_back(newUpperBound);
1004  }
1005 
1006  // Create new ParallelLoop with conversions to the original induction values.
1007  // The loop below uses divisions to get the relevant range of values in the
1008  // new induction value that represent each range of the original induction
1009  // value. The remainders then determine based on that range, which iteration
1010  // of the original induction value this represents. This is a normalized value
1011  // that is un-normalized already by the previous logic.
1012  auto newPloop = rewriter.create<scf::ParallelOp>(
1013  loc, lowerBounds, upperBounds, steps,
1014  [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
1015  for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
1016  Value previous = ploopIVs[i];
1017  unsigned numberCombinedDimensions = combinedDimensions[i].size();
1018  // Iterate over all except the last induction value.
1019  for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
1020  unsigned idx = combinedDimensions[i][j];
1021 
1022  // Determine the current induction value's current loop iteration
1023  Value iv = insideBuilder.create<arith::RemSIOp>(
1024  loc, previous, normalizedUpperBounds[idx]);
1025  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
1026  loops.getRegion());
1027 
1028  // Remove the effect of the current induction value to prepare for
1029  // the next value.
1030  previous = insideBuilder.create<arith::DivSIOp>(
1031  loc, previous, normalizedUpperBounds[idx]);
1032  }
1033 
1034  // The final induction value is just the remaining value.
1035  unsigned idx = combinedDimensions[i][0];
1036  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
1037  previous, loops.getRegion());
1038  }
1039  });
1040 
1041  // Replace the old loop with the new loop.
1042  loops.getBody()->back().erase();
1043  newPloop.getBody()->getOperations().splice(
1044  Block::iterator(newPloop.getBody()->back()),
1045  loops.getBody()->getOperations());
1046  loops.erase();
1047 }
1048 
1049 // Hoist the ops within `outer` that appear before `inner`.
1050 // Such ops include the ops that have been introduced by parametric tiling.
1051 // Ops that come from triangular loops (i.e. that belong to the program slice
1052 // rooted at `outer`) and ops that have side effects cannot be hoisted.
1053 // Return failure when any op fails to hoist.
1054 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
1055  SetVector<Operation *> forwardSlice;
1057  options.filter = [&inner](Operation *op) {
1058  return op != inner.getOperation();
1059  };
1060  getForwardSlice(outer.getInductionVar(), &forwardSlice, options);
1061  LogicalResult status = success();
1063  for (auto &op : outer.getBody()->without_terminator()) {
1064  // Stop when encountering the inner loop.
1065  if (&op == inner.getOperation())
1066  break;
1067  // Skip over non-hoistable ops.
1068  if (forwardSlice.count(&op) > 0) {
1069  status = failure();
1070  continue;
1071  }
1072  // Skip intermediate scf::ForOp, these are not considered a failure.
1073  if (isa<scf::ForOp>(op))
1074  continue;
1075  // Skip other ops with regions.
1076  if (op.getNumRegions() > 0) {
1077  status = failure();
1078  continue;
1079  }
1080  // Skip if op has side effects.
1081  // TODO: loads to immutable memory regions are ok.
1082  if (!isMemoryEffectFree(&op)) {
1083  status = failure();
1084  continue;
1085  }
1086  toHoist.push_back(&op);
1087  }
1088  auto *outerForOp = outer.getOperation();
1089  for (auto *op : toHoist)
1090  op->moveBefore(outerForOp);
1091  return status;
1092 }
1093 
1094 // Traverse the interTile and intraTile loops and try to hoist ops such that
1095 // bands of perfectly nested loops are isolated.
1096 // Return failure if either perfect interTile or perfect intraTile bands cannot
1097 // be formed.
1098 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
1099  LogicalResult status = success();
1100  const Loops &interTile = tileLoops.first;
1101  const Loops &intraTile = tileLoops.second;
1102  auto size = interTile.size();
1103  assert(size == intraTile.size());
1104  if (size <= 1)
1105  return success();
1106  for (unsigned s = 1; s < size; ++s)
1107  status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
1108  : failure();
1109  for (unsigned s = 1; s < size; ++s)
1110  status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
1111  : failure();
1112  return status;
1113 }
1114 
1115 /// Collect perfectly nested loops starting from `rootForOps`. Loops are
1116 /// perfectly nested if each loop is the first and only non-terminator operation
1117 /// in the parent loop. Collect at most `maxLoops` loops and append them to
1118 /// `forOps`.
1119 template <typename T>
1121  SmallVectorImpl<T> &forOps, T rootForOp,
1122  unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
1123  for (unsigned i = 0; i < maxLoops; ++i) {
1124  forOps.push_back(rootForOp);
1125  Block &body = rootForOp.getRegion().front();
1126  if (body.begin() != std::prev(body.end(), 2))
1127  return;
1128 
1129  rootForOp = dyn_cast<T>(&body.front());
1130  if (!rootForOp)
1131  return;
1132  }
1133 }
1134 
/// Strip-mine `forOp` by `factor`: multiply its step by `factor` so each
/// iteration of `forOp` covers one tile, and inside every loop in `targets`
/// create a new inner loop that walks the tile with the original step,
/// clamped to `forOp`'s upper bound. The existing body of each target is
/// moved into its new inner loop. Returns the newly created inner loops, one
/// per target.
static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  OpBuilder b(forOp);
  // After this, forOp's step is originalStep * factor (the tile stride).
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done
    // (captured before the ops created below are inserted into t's body).
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    // Tile-local upper bound: min(forOp.ub, iv + tileStride).
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    Value ub =
        b.create<arith::MinSIOp>(t.getLoc(), forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    // The half-open end excludes t's original terminator (and the ops just
    // created above, which sit after it... before it at the terminator).
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    // Rewire uses of the outer IV inside the moved body to the new inner IV.
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
1168 
1169 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1170 // Returns the new for operation, nested immediately under `target`.
1171 template <typename SizeType>
1172 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
1173  scf::ForOp target) {
1174  // TODO: Use cheap structural assertions that targets are nested under
1175  // forOp and that targets are not nested under each other when DominanceInfo
1176  // exposes the capability. It seems overkill to construct a whole function
1177  // dominance tree at this point.
1178  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
1179  assert(res.size() == 1 && "Expected 1 inner forOp");
1180  return res[0];
1181 }
1182 
1184  ArrayRef<Value> sizes,
1185  ArrayRef<scf::ForOp> targets) {
1187  SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
1188  for (auto it : llvm::zip(forOps, sizes)) {
1189  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1190  res.push_back(step);
1191  currentTargets = step;
1192  }
1193  return res;
1194 }
1195 
1197  scf::ForOp target) {
1199  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
1200  assert(loops.size() == 1);
1201  res.push_back(loops[0]);
1202  }
1203  return res;
1204 }
1205 
1206 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
1207  // Collect perfectly nested loops. If more size values provided than nested
1208  // loops available, truncate `sizes`.
1210  forOps.reserve(sizes.size());
1211  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1212  if (forOps.size() < sizes.size())
1213  sizes = sizes.take_front(forOps.size());
1214 
1215  return ::tile(forOps, sizes, forOps.back());
1216 }
1217 
1219  scf::ForOp root) {
1220  getPerfectlyNestedLoopsImpl(nestedLoops, root);
1221 }
1222 
1224  ArrayRef<int64_t> sizes) {
1225  // Collect perfectly nested loops. If more size values provided than nested
1226  // loops available, truncate `sizes`.
1228  forOps.reserve(sizes.size());
1229  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1230  if (forOps.size() < sizes.size())
1231  sizes = sizes.take_front(forOps.size());
1232 
1233  // Compute the tile sizes such that i-th outer loop executes size[i]
1234  // iterations. Given that the loop current executes
1235  // numIterations = ceildiv((upperBound - lowerBound), step)
1236  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
1237  SmallVector<Value, 4> tileSizes;
1238  tileSizes.reserve(sizes.size());
1239  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
1240  assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
1241 
1242  auto forOp = forOps[i];
1243  OpBuilder builder(forOp);
1244  auto loc = forOp.getLoc();
1245  Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
1246  forOp.getLowerBound());
1247  Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
1248  Value iterationsPerBlock =
1249  ceilDivPositive(builder, loc, numIterations, sizes[i]);
1250  tileSizes.push_back(iterationsPerBlock);
1251  }
1252 
1253  // Call parametric tiling with the given sizes.
1254  auto intraTile = tile(forOps, tileSizes, forOps.back());
1255  TileLoops tileLoops = std::make_pair(forOps, intraTile);
1256 
1257  // TODO: for now we just ignore the result of band isolation.
1258  // In the future, mapping decisions may be impacted by the ability to
1259  // isolate perfectly nested bands.
1260  (void)tryIsolateBands(tileLoops);
1261 
1262  return tileLoops;
1263 }
1264 
1265 scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
1266  scf::ForallOp source,
1267  RewriterBase &rewriter) {
1268  unsigned numTargetOuts = target.getNumResults();
1269  unsigned numSourceOuts = source.getNumResults();
1270 
1271  // Create fused shared_outs.
1272  SmallVector<Value> fusedOuts;
1273  llvm::append_range(fusedOuts, target.getOutputs());
1274  llvm::append_range(fusedOuts, source.getOutputs());
1275 
1276  // Create a new scf.forall op after the source loop.
1277  rewriter.setInsertionPointAfter(source);
1278  scf::ForallOp fusedLoop = rewriter.create<scf::ForallOp>(
1279  source.getLoc(), source.getMixedLowerBound(), source.getMixedUpperBound(),
1280  source.getMixedStep(), fusedOuts, source.getMapping());
1281 
1282  // Map control operands.
1283  IRMapping mapping;
1284  mapping.map(target.getInductionVars(), fusedLoop.getInductionVars());
1285  mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
1286 
1287  // Map shared outs.
1288  mapping.map(target.getRegionIterArgs(),
1289  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1290  mapping.map(source.getRegionIterArgs(),
1291  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1292 
1293  // Append everything except the terminator into the fused operation.
1294  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1295  for (Operation &op : target.getBody()->without_terminator())
1296  rewriter.clone(op, mapping);
1297  for (Operation &op : source.getBody()->without_terminator())
1298  rewriter.clone(op, mapping);
1299 
1300  // Fuse the old terminator in_parallel ops into the new one.
1301  scf::InParallelOp targetTerm = target.getTerminator();
1302  scf::InParallelOp sourceTerm = source.getTerminator();
1303  scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
1304  rewriter.setInsertionPointToStart(fusedTerm.getBody());
1305  for (Operation &op : targetTerm.getYieldingOps())
1306  rewriter.clone(op, mapping);
1307  for (Operation &op : sourceTerm.getYieldingOps())
1308  rewriter.clone(op, mapping);
1309 
1310  // Replace old loops by substituting their uses by results of the fused loop.
1311  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1312  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1313 
1314  return fusedLoop;
1315 }
1316 
1317 scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
1318  scf::ForOp source,
1319  RewriterBase &rewriter) {
1320  unsigned numTargetOuts = target.getNumResults();
1321  unsigned numSourceOuts = source.getNumResults();
1322 
1323  // Create fused init_args, with target's init_args before source's init_args.
1324  SmallVector<Value> fusedInitArgs;
1325  llvm::append_range(fusedInitArgs, target.getInitArgs());
1326  llvm::append_range(fusedInitArgs, source.getInitArgs());
1327 
1328  // Create a new scf.for op after the source loop (with scf.yield terminator
1329  // (without arguments) only in case its init_args is empty).
1330  rewriter.setInsertionPointAfter(source);
1331  scf::ForOp fusedLoop = rewriter.create<scf::ForOp>(
1332  source.getLoc(), source.getLowerBound(), source.getUpperBound(),
1333  source.getStep(), fusedInitArgs);
1334 
1335  // Map original induction variables and operands to those of the fused loop.
1336  IRMapping mapping;
1337  mapping.map(target.getInductionVar(), fusedLoop.getInductionVar());
1338  mapping.map(target.getRegionIterArgs(),
1339  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1340  mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
1341  mapping.map(source.getRegionIterArgs(),
1342  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1343 
1344  // Merge target's body into the new (fused) for loop and then source's body.
1345  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1346  for (Operation &op : target.getBody()->without_terminator())
1347  rewriter.clone(op, mapping);
1348  for (Operation &op : source.getBody()->without_terminator())
1349  rewriter.clone(op, mapping);
1350 
1351  // Build fused yield results by appropriately mapping original yield operands.
1352  SmallVector<Value> yieldResults;
1353  for (Value operand : target.getBody()->getTerminator()->getOperands())
1354  yieldResults.push_back(mapping.lookupOrDefault(operand));
1355  for (Value operand : source.getBody()->getTerminator()->getOperands())
1356  yieldResults.push_back(mapping.lookupOrDefault(operand));
1357  if (!yieldResults.empty())
1358  rewriter.create<scf::YieldOp>(source.getLoc(), yieldResults);
1359 
1360  // Replace old loops by substituting their uses by results of the fused loop.
1361  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1362  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1363 
1364  return fusedLoop;
1365 }
static std::optional< int64_t > getConstantTripCount(scf::ForOp forOp)
Returns the trip count of forOp if its' low bound, high bound and step are constants,...
Definition: Utils.cpp:299
static LogicalResult tryIsolateBands(const TileLoops &tileLoops)
Definition: Utils.cpp:1098
static void getPerfectlyNestedLoopsImpl(SmallVectorImpl< T > &forOps, T rootForOp, unsigned maxLoops=std::numeric_limits< unsigned >::max())
Collect perfectly nested loops starting from rootForOps.
Definition: Utils.cpp:1120
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner)
Definition: Utils.cpp:1054
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of scf::ForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor',...
Definition: Utils.cpp:319
static Loops stripmineSink(scf::ForOp forOp, Value factor, ArrayRef< scf::ForOp > targets)
Definition: Utils.cpp:1135
static std::pair< SmallVector< Value >, SmallPtrSet< Operation *, 2 > > delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef< Value > ubs)
For each original loop, the value of the induction variable can be obtained by dividing the induction...
Definition: Utils.cpp:768
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor)
Definition: Utils.cpp:270
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef< Value > values)
Helper function to multiply a sequence of values.
Definition: Utils.cpp:736
#define LDBG(X)
Definition: Utils.cpp:37
static bool areInnerBoundsInvariant(scf::ForOp forOp)
Check if bounds of all inner loops are defined outside of forOp and return false if not.
Definition: Utils.cpp:490
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
This class represents an argument of a Block.
Definition: Value.h:319
Block represents an ordered list of Operations.
Definition: Block.h:31
OpListType::iterator iterator
Definition: Block.h:138
unsigned getNumArguments()
Definition: Block.h:126
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:243
BlockArgListType getArguments()
Definition: Block.h:85
Operation & front()
Definition: Block.h:151
iterator end()
Definition: Block.h:142
iterator begin()
Definition: Block.h:141
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:128
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:335
MLIRContext * getContext() const
Definition: Builders.h:55
TypedAttr getOneAttr(Type type)
Definition: Builders.cpp:353
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:766
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:351
This class helps build Operations.
Definition: Builders.h:210
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:559
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:434
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:401
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:255
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:439
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes=std::nullopt, ArrayRef< Location > locs=std::nullopt)
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:441
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:523
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:468
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:415
This class represents a single result from folding an operation.
Definition: OpDefinition.h:268
This class represents an operand of an operation.
Definition: Value.h:267
This is a value defined by a result of an operation.
Definition: Value.h:457
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:717
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
unsigned getNumRegions()
Returns the number of regions held by this operation.
Definition: Operation.h:669
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:672
operand_type_range getOperandTypes()
Definition: Operation.h:392
result_type_range getResultTypes()
Definition: Operation.h:423
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:373
void moveBefore(Operation *existingOp)
Unlink this operation from its current block and insert it right before existingOp which may be in th...
Definition: Operation.cpp:555
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:539
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
BlockArgListType getArguments()
Definition: Region.h:81
iterator begin()
Definition: Region.h:55
Block & front()
Definition: Region.h:65
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
bool hasOneBlock()
Return true if this region has exactly one block.
Definition: Region.h:68
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
void mergeBlocks(Block *source, Block *dest, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into the end of block 'dest'.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void replaceAllUsesExcept(Value from, Value to, Operation *exceptedUser)
Find uses of from and replace them with to except if the user is exceptedUser.
Definition: PatternMatch.h:702
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:630
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into block 'dest' before the given position.
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
bool isIndex() const
Definition: Types.cpp:57
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:218
void replaceUsesWithIf(Value newValue, function_ref< bool(OpOperand &)> shouldReplace)
Replace all uses of 'this' value with 'newValue' if the given callback returns true.
Definition: Value.cpp:81
Type getType() const
Return the type of this value.
Definition: Value.h:129
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
static WalkResult advance()
Definition: Visitors.h:51
static WalkResult interrupt()
Definition: Visitors.h:50
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:92
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1583
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:285
Include the generated interface declarations.
void getPerfectlyNestedLoops(SmallVectorImpl< scf::ForOp > &nestedLoops, scf::ForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: Utils.cpp:1218
LogicalResult loopUnrollByFactor(scf::ForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr)
Unrolls this for operation by the specified unroll factor.
Definition: Utils.cpp:372
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
LogicalResult outlineIfOp(RewriterBase &b, scf::IfOp ifOp, func::FuncOp *thenFn, StringRef thenFnName, func::FuncOp *elseFn, StringRef elseFnName)
Outline the then and/or else regions of ifOp as follows:
Definition: Utils.cpp:222
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:27
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(RewriterBase &rewriter, MutableArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
Definition: Utils.cpp:39
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition: Utils.cpp:305
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op)
Walk an affine.for to find a band to coalesce.
Definition: Utils.cpp:887
std::pair< Loops, Loops > TileLoops
Definition: Utils.h:145
Value getValueOrCreateConstantIntOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:103
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops, ArrayRef< std::vector< unsigned >> combinedDimensions)
Take the ParallelLoop and for each set of dimension indices, combine them into a single dimension.
Definition: Utils.cpp:962
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBbArgs)> NewYieldValuesFn
A function that returns the additional yielded values during replaceWithAdditionalYields.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef< Value > sizes)
Tile a nest of scf::ForOp loops rooted at rootForOp with the given (parametric) sizes.
Definition: Utils.cpp:1206
LogicalResult loopUnrollJamByFactor(scf::ForOp forOp, uint64_t unrollFactor)
Unrolls and jams this scf.for operation by the specified unroll factor.
Definition: Utils.cpp:503
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
Definition: Utils.cpp:245
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition: RegionUtils.cpp:62
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: Utils.cpp:1183
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
FailureOr< func::FuncOp > outlineSingleBlockRegion(RewriterBase &rewriter, Location loc, Region &region, StringRef funcName, func::CallOp *callOp=nullptr)
Outline a region with a single block into a new FuncOp.
Definition: Utils.cpp:118
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:24
void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Get back the original induction variable values after loop normalization.
Definition: Utils.cpp:710
scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForallOp source, RewriterBase &rewriter)
Given two scf.forall loops, target and source, fuses target into source.
Definition: Utils.cpp:1265
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: Utils.cpp:879
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter)
Given two scf.for loops, target and source, fuses target into source.
Definition: Utils.cpp:1317
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef< int64_t > sizes)
Definition: Utils.cpp:1223
Range emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Materialize bounds and step of a zero-based and unit-step loop derived by normalizing the specified b...
Definition: Utils.cpp:667
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.