MLIR  20.0.0git
Utils.cpp
Go to the documentation of this file.
1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
20 #include "mlir/IR/BuiltinOps.h"
21 #include "mlir/IR/IRMapping.h"
22 #include "mlir/IR/OpDefinition.h"
23 #include "mlir/IR/PatternMatch.h"
26 #include "llvm/ADT/STLExtras.h"
27 #include "llvm/ADT/SetVector.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/MathExtras.h"
32 #include <cstdint>
33 
34 using namespace mlir;
35 
36 #define DEBUG_TYPE "scf-utils"
37 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
38 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
39 
41  RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loopNest,
42  ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn,
43  bool replaceIterOperandsUsesInLoop) {
44  if (loopNest.empty())
45  return {};
46  // This method is recursive (to make it more readable). Adding an
47  // assertion here to limit the recursion. (See
48  // https://discourse.llvm.org/t/rfc-update-to-mlir-developer-policy-on-recursion/62235)
49  assert(loopNest.size() <= 10 &&
50  "exceeded recursion limit when yielding value from loop nest");
51 
52  // To yield a value from a perfectly nested loop nest, the following
53  // pattern needs to be created, i.e. starting with
54  //
55  // ```mlir
56  // scf.for .. {
57  // scf.for .. {
58  // scf.for .. {
59  // %value = ...
60  // }
61  // }
62  // }
63  // ```
64  //
65  // needs to be modified to
66  //
67  // ```mlir
68  // %0 = scf.for .. iter_args(%arg0 = %init) {
69  // %1 = scf.for .. iter_args(%arg1 = %arg0) {
70  // %2 = scf.for .. iter_args(%arg2 = %arg1) {
71  // %value = ...
72  // scf.yield %value
73  // }
74  // scf.yield %2
75  // }
76  // scf.yield %1
77  // }
78  // ```
79  //
80  // The inner most loop is handled using the `replaceWithAdditionalYields`
81  // that works on a single loop.
82  if (loopNest.size() == 1) {
83  auto innerMostLoop =
84  cast<scf::ForOp>(*loopNest.back().replaceWithAdditionalYields(
85  rewriter, newIterOperands, replaceIterOperandsUsesInLoop,
86  newYieldValuesFn));
87  return {innerMostLoop};
88  }
89  // The outer loops are modified by calling this method recursively
90  // - The return value of the inner loop is the value yielded by this loop.
91  // - The region iter args of this loop are the init_args for the inner loop.
92  SmallVector<scf::ForOp> newLoopNest;
93  NewYieldValuesFn fn =
94  [&](OpBuilder &innerBuilder, Location loc,
95  ArrayRef<BlockArgument> innerNewBBArgs) -> SmallVector<Value> {
96  newLoopNest = replaceLoopNestWithNewYields(rewriter, loopNest.drop_front(),
97  innerNewBBArgs, newYieldValuesFn,
98  replaceIterOperandsUsesInLoop);
99  return llvm::to_vector(llvm::map_range(
100  newLoopNest.front().getResults().take_back(innerNewBBArgs.size()),
101  [](OpResult r) -> Value { return r; }));
102  };
103  scf::ForOp outerMostLoop =
104  cast<scf::ForOp>(*loopNest.front().replaceWithAdditionalYields(
105  rewriter, newIterOperands, replaceIterOperandsUsesInLoop, fn));
106  newLoopNest.insert(newLoopNest.begin(), outerMostLoop);
107  return newLoopNest;
108 }
109 
110 /// Outline a region with a single block into a new FuncOp.
111 /// Assumes the FuncOp result types is the type of the yielded operands of the
112 /// single block. This constraint makes it easy to determine the result.
113 /// This method also clones the `arith::ConstantIndexOp` at the start of
114 /// `outlinedFuncBody` to alloc simple canonicalizations. If `callOp` is
115 /// provided, it will be set to point to the operation that calls the outlined
116 /// function.
117 // TODO: support more than single-block regions.
118 // TODO: more flexible constant handling.
119 FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
120  Location loc,
121  Region &region,
122  StringRef funcName,
123  func::CallOp *callOp) {
124  assert(!funcName.empty() && "funcName cannot be empty");
125  if (!region.hasOneBlock())
126  return failure();
127 
128  Block *originalBlock = &region.front();
129  Operation *originalTerminator = originalBlock->getTerminator();
130 
131  // Outline before current function.
132  OpBuilder::InsertionGuard g(rewriter);
133  rewriter.setInsertionPoint(region.getParentOfType<FunctionOpInterface>());
134 
135  SetVector<Value> captures;
136  getUsedValuesDefinedAbove(region, captures);
137 
138  ValueRange outlinedValues(captures.getArrayRef());
139  SmallVector<Type> outlinedFuncArgTypes;
140  SmallVector<Location> outlinedFuncArgLocs;
141  // Region's arguments are exactly the first block's arguments as per
142  // Region::getArguments().
143  // Func's arguments are cat(regions's arguments, captures arguments).
144  for (BlockArgument arg : region.getArguments()) {
145  outlinedFuncArgTypes.push_back(arg.getType());
146  outlinedFuncArgLocs.push_back(arg.getLoc());
147  }
148  for (Value value : outlinedValues) {
149  outlinedFuncArgTypes.push_back(value.getType());
150  outlinedFuncArgLocs.push_back(value.getLoc());
151  }
152  FunctionType outlinedFuncType =
153  FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
154  originalTerminator->getOperandTypes());
155  auto outlinedFunc =
156  rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType);
157  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();
158 
159  // Merge blocks while replacing the original block operands.
160  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
161  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
162  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
163  {
164  OpBuilder::InsertionGuard g(rewriter);
165  rewriter.setInsertionPointToEnd(outlinedFuncBody);
166  rewriter.mergeBlocks(
167  originalBlock, outlinedFuncBody,
168  outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
169  // Explicitly set up a new ReturnOp terminator.
170  rewriter.setInsertionPointToEnd(outlinedFuncBody);
171  rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
172  originalTerminator->getOperands());
173  }
174 
175  // Reconstruct the block that was deleted and add a
176  // terminator(call_results).
177  Block *newBlock = rewriter.createBlock(
178  &region, region.begin(),
179  TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
180  ArrayRef<Location>(outlinedFuncArgLocs)
181  .take_front(numOriginalBlockArguments));
182  {
183  OpBuilder::InsertionGuard g(rewriter);
184  rewriter.setInsertionPointToEnd(newBlock);
185  SmallVector<Value> callValues;
186  llvm::append_range(callValues, newBlock->getArguments());
187  llvm::append_range(callValues, outlinedValues);
188  auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);
189  if (callOp)
190  *callOp = call;
191 
192  // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
193  // Clone `originalTerminator` to take the callOp results then erase it from
194  // `outlinedFuncBody`.
195  IRMapping bvm;
196  bvm.map(originalTerminator->getOperands(), call->getResults());
197  rewriter.clone(*originalTerminator, bvm);
198  rewriter.eraseOp(originalTerminator);
199  }
200 
201  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
202  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
203  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
204  outlinedValues.size()))) {
205  Value orig = std::get<0>(it);
206  Value repl = std::get<1>(it);
207  {
208  OpBuilder::InsertionGuard g(rewriter);
209  rewriter.setInsertionPointToStart(outlinedFuncBody);
210  if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
211  IRMapping bvm;
212  repl = rewriter.clone(*cst, bvm)->getResult(0);
213  }
214  }
215  orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
216  return outlinedFunc->isProperAncestor(opOperand.getOwner());
217  });
218  }
219 
220  return outlinedFunc;
221 }
222 
223 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
224  func::FuncOp *thenFn, StringRef thenFnName,
225  func::FuncOp *elseFn, StringRef elseFnName) {
226  IRRewriter rewriter(b);
227  Location loc = ifOp.getLoc();
228  FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
229  if (thenFn && !ifOp.getThenRegion().empty()) {
230  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
231  rewriter, loc, ifOp.getThenRegion(), thenFnName);
232  if (failed(outlinedFuncOpOrFailure))
233  return failure();
234  *thenFn = *outlinedFuncOpOrFailure;
235  }
236  if (elseFn && !ifOp.getElseRegion().empty()) {
237  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
238  rewriter, loc, ifOp.getElseRegion(), elseFnName);
239  if (failed(outlinedFuncOpOrFailure))
240  return failure();
241  *elseFn = *outlinedFuncOpOrFailure;
242  }
243  return success();
244 }
245 
248  assert(rootOp != nullptr && "Root operation must not be a nullptr.");
249  bool rootEnclosesPloops = false;
250  for (Region &region : rootOp->getRegions()) {
251  for (Block &block : region.getBlocks()) {
252  for (Operation &op : block) {
253  bool enclosesPloops = getInnermostParallelLoops(&op, result);
254  rootEnclosesPloops |= enclosesPloops;
255  if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
256  rootEnclosesPloops = true;
257 
258  // Collect parallel loop if it is an innermost one.
259  if (!enclosesPloops)
260  result.push_back(ploop);
261  }
262  }
263  }
264  }
265  return rootEnclosesPloops;
266 }
267 
268 // Build the IR that performs ceil division of a positive value by a constant:
269 // ceildiv(a, B) = divis(a + (B-1), B)
270 // where divis is rounding-to-zero division.
271 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
272  int64_t divisor) {
273  assert(divisor > 0 && "expected positive divisor");
274  assert(dividend.getType().isIntOrIndex() &&
275  "expected integer or index-typed value");
276 
277  Value divisorMinusOneCst = builder.create<arith::ConstantOp>(
278  loc, builder.getIntegerAttr(dividend.getType(), divisor - 1));
279  Value divisorCst = builder.create<arith::ConstantOp>(
280  loc, builder.getIntegerAttr(dividend.getType(), divisor));
281  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
282  return builder.create<arith::DivUIOp>(loc, sum, divisorCst);
283 }
284 
285 // Build the IR that performs ceil division of a positive value by another
286 // positive value:
287 // ceildiv(a, b) = divis(a + (b - 1), b)
288 // where divis is rounding-to-zero division.
289 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
290  Value divisor) {
291  assert(dividend.getType().isIntOrIndex() &&
292  "expected integer or index-typed value");
293  Value cstOne = builder.create<arith::ConstantOp>(
294  loc, builder.getOneAttr(dividend.getType()));
295  Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
296  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
297  return builder.create<arith::DivUIOp>(loc, sum, divisor);
298 }
299 
300 /// Returns the trip count of `forOp` if its' low bound, high bound and step are
301 /// constants, or optional otherwise. Trip count is computed as
302 /// ceilDiv(highBound - lowBound, step).
303 static std::optional<int64_t> getConstantTripCount(scf::ForOp forOp) {
304  std::optional<int64_t> lbCstOp = getConstantIntValue(forOp.getLowerBound());
305  std::optional<int64_t> ubCstOp = getConstantIntValue(forOp.getUpperBound());
306  std::optional<int64_t> stepCstOp = getConstantIntValue(forOp.getStep());
307  if (!lbCstOp.has_value() || !ubCstOp.has_value() || !stepCstOp.has_value())
308  return {};
309 
310  // Constant loop bounds computation.
311  int64_t lbCst = lbCstOp.value();
312  int64_t ubCst = ubCstOp.value();
313  int64_t stepCst = stepCstOp.value();
314  assert(lbCst >= 0 && ubCst >= 0 && stepCst > 0 &&
315  "expected positive loop bounds and step");
316  return llvm::divideCeilSigned(ubCst - lbCst, stepCst);
317 }
318 
319 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
320 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
321 /// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
322 /// unrolled iteration using annotateFn.
324  Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
325  function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
326  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
327  ValueRange iterArgs, ValueRange yieldedValues) {
328  // Builder to insert unrolled bodies just before the terminator of the body of
329  // 'forOp'.
330  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
331 
332  if (!annotateFn)
333  annotateFn = [](unsigned, Operation *, OpBuilder) {};
334 
335  // Keep a pointer to the last non-terminator operation in the original block
336  // so that we know what to clone (since we are doing this in-place).
337  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
338 
339  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
340  SmallVector<Value, 4> lastYielded(yieldedValues);
341 
342  for (unsigned i = 1; i < unrollFactor; i++) {
343  IRMapping operandMap;
344 
345  // Prepare operand map.
346  operandMap.map(iterArgs, lastYielded);
347 
348  // If the induction variable is used, create a remapping to the value for
349  // this unrolled instance.
350  if (!forOpIV.use_empty()) {
351  Value ivUnroll = ivRemapFn(i, forOpIV, builder);
352  operandMap.map(forOpIV, ivUnroll);
353  }
354 
355  // Clone the original body of 'forOp'.
356  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
357  Operation *clonedOp = builder.clone(*it, operandMap);
358  annotateFn(i, clonedOp, builder);
359  }
360 
361  // Update yielded values.
362  for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
363  lastYielded[i] = operandMap.lookup(yieldedValues[i]);
364  }
365 
366  // Make sure we annotate the Ops in the original body. We do this last so that
367  // any annotations are not copied into the cloned Ops above.
368  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
369  annotateFn(0, &*it, builder);
370 
371  // Update operands of the yield statement.
372  loopBodyBlock->getTerminator()->setOperands(lastYielded);
373 }
374 
375 /// Unrolls 'forOp' by 'unrollFactor', returns the unrolled main loop and the
376 /// eplilog loop, if the loop is unrolled.
377 FailureOr<UnrolledLoopInfo> mlir::loopUnrollByFactor(
378  scf::ForOp forOp, uint64_t unrollFactor,
379  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
380  assert(unrollFactor > 0 && "expected positive unroll factor");
381 
382  // Return if the loop body is empty.
383  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
384  return UnrolledLoopInfo{forOp, std::nullopt};
385 
386  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
387  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
388  OpBuilder boundsBuilder(forOp);
389  IRRewriter rewriter(forOp.getContext());
390  auto loc = forOp.getLoc();
391  Value step = forOp.getStep();
392  Value upperBoundUnrolled;
393  Value stepUnrolled;
394  bool generateEpilogueLoop = true;
395 
396  std::optional<int64_t> constTripCount = getConstantTripCount(forOp);
397  if (constTripCount) {
398  // Constant loop bounds computation.
399  int64_t lbCst = getConstantIntValue(forOp.getLowerBound()).value();
400  int64_t ubCst = getConstantIntValue(forOp.getUpperBound()).value();
401  int64_t stepCst = getConstantIntValue(forOp.getStep()).value();
402  if (unrollFactor == 1) {
403  if (*constTripCount == 1 &&
404  failed(forOp.promoteIfSingleIteration(rewriter)))
405  return failure();
406  return UnrolledLoopInfo{forOp, std::nullopt};
407  }
408 
409  int64_t tripCountEvenMultiple =
410  *constTripCount - (*constTripCount % unrollFactor);
411  int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
412  int64_t stepUnrolledCst = stepCst * unrollFactor;
413 
414  // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
415  generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
416  if (generateEpilogueLoop)
417  upperBoundUnrolled = boundsBuilder.create<arith::ConstantOp>(
418  loc, boundsBuilder.getIntegerAttr(forOp.getUpperBound().getType(),
419  upperBoundUnrolledCst));
420  else
421  upperBoundUnrolled = forOp.getUpperBound();
422 
423  // Create constant for 'stepUnrolled'.
424  stepUnrolled = stepCst == stepUnrolledCst
425  ? step
426  : boundsBuilder.create<arith::ConstantOp>(
427  loc, boundsBuilder.getIntegerAttr(
428  step.getType(), stepUnrolledCst));
429  } else {
430  // Dynamic loop bounds computation.
431  // TODO: Add dynamic asserts for negative lb/ub/step, or
432  // consider using ceilDiv from AffineApplyExpander.
433  auto lowerBound = forOp.getLowerBound();
434  auto upperBound = forOp.getUpperBound();
435  Value diff =
436  boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
437  Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
438  Value unrollFactorCst = boundsBuilder.create<arith::ConstantOp>(
439  loc, boundsBuilder.getIntegerAttr(tripCount.getType(), unrollFactor));
440  Value tripCountRem =
441  boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
442  // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
443  Value tripCountEvenMultiple =
444  boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
445  // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
446  upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
447  loc, lowerBound,
448  boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
449  // Scale 'step' by 'unrollFactor'.
450  stepUnrolled =
451  boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
452  }
453 
454  UnrolledLoopInfo resultLoops;
455 
456  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
457  if (generateEpilogueLoop) {
458  OpBuilder epilogueBuilder(forOp->getContext());
459  epilogueBuilder.setInsertionPointAfter(forOp);
460  auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
461  epilogueForOp.setLowerBound(upperBoundUnrolled);
462 
463  // Update uses of loop results.
464  auto results = forOp.getResults();
465  auto epilogueResults = epilogueForOp.getResults();
466 
467  for (auto e : llvm::zip(results, epilogueResults)) {
468  std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
469  }
470  epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
471  epilogueForOp.getInitArgs().size(), results);
472  if (epilogueForOp.promoteIfSingleIteration(rewriter).failed())
473  resultLoops.epilogueLoopOp = epilogueForOp;
474  }
475 
476  // Create unrolled loop.
477  forOp.setUpperBound(upperBoundUnrolled);
478  forOp.setStep(stepUnrolled);
479 
480  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
481  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
482 
484  forOp.getBody(), forOp.getInductionVar(), unrollFactor,
485  [&](unsigned i, Value iv, OpBuilder b) {
486  // iv' = iv + step * i;
487  auto stride = b.create<arith::MulIOp>(
488  loc, step,
489  b.create<arith::ConstantOp>(loc,
490  b.getIntegerAttr(iv.getType(), i)));
491  return b.create<arith::AddIOp>(loc, iv, stride);
492  },
493  annotateFn, iterArgs, yieldedValues);
494  // Promote the loop body up if this has turned into a single iteration loop.
495  if (forOp.promoteIfSingleIteration(rewriter).failed())
496  resultLoops.mainLoopOp = forOp;
497  return resultLoops;
498 }
499 
500 /// Check if bounds of all inner loops are defined outside of `forOp`
501 /// and return false if not.
502 static bool areInnerBoundsInvariant(scf::ForOp forOp) {
503  auto walkResult = forOp.walk([&](scf::ForOp innerForOp) {
504  if (!forOp.isDefinedOutsideOfLoop(innerForOp.getLowerBound()) ||
505  !forOp.isDefinedOutsideOfLoop(innerForOp.getUpperBound()) ||
506  !forOp.isDefinedOutsideOfLoop(innerForOp.getStep()))
507  return WalkResult::interrupt();
508 
509  return WalkResult::advance();
510  });
511  return !walkResult.wasInterrupted();
512 }
513 
514 /// Unrolls and jams this loop by the specified factor.
515 LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp,
516  uint64_t unrollJamFactor) {
517  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
518 
519  if (unrollJamFactor == 1)
520  return success();
521 
522  // If any control operand of any inner loop of `forOp` is defined within
523  // `forOp`, no unroll jam.
524  if (!areInnerBoundsInvariant(forOp)) {
525  LDBG("failed to unroll and jam: inner bounds are not invariant");
526  return failure();
527  }
528 
529  // Currently, for operations with results are not supported.
530  if (forOp->getNumResults() > 0) {
531  LDBG("failed to unroll and jam: unsupported loop with results");
532  return failure();
533  }
534 
535  // Currently, only constant trip count that divided by the unroll factor is
536  // supported.
537  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
538  if (!tripCount.has_value()) {
539  // If the trip count is dynamic, do not unroll & jam.
540  LDBG("failed to unroll and jam: trip count could not be determined");
541  return failure();
542  }
543  if (unrollJamFactor > *tripCount) {
544  LDBG("unroll and jam factor is greater than trip count, set factor to trip "
545  "count");
546  unrollJamFactor = *tripCount;
547  } else if (*tripCount % unrollJamFactor != 0) {
548  LDBG("failed to unroll and jam: unsupported trip count that is not a "
549  "multiple of unroll jam factor");
550  return failure();
551  }
552 
553  // Nothing in the loop body other than the terminator.
554  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
555  return success();
556 
557  // Gather all sub-blocks to jam upon the loop being unrolled.
559  jbg.walk(forOp);
560  auto &subBlocks = jbg.subBlocks;
561 
562  // Collect inner loops.
563  SmallVector<scf::ForOp> innerLoops;
564  forOp.walk([&](scf::ForOp innerForOp) { innerLoops.push_back(innerForOp); });
565 
566  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
567  // iteration. There are (`unrollJamFactor` - 1) iterations.
568  SmallVector<IRMapping> operandMaps(unrollJamFactor - 1);
569 
570  // For any loop with iter_args, replace it with a new loop that has
571  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
572  // operands.
573  SmallVector<scf::ForOp> newInnerLoops;
574  IRRewriter rewriter(forOp.getContext());
575  for (scf::ForOp oldForOp : innerLoops) {
576  SmallVector<Value> dupIterOperands, dupYieldOperands;
577  ValueRange oldIterOperands = oldForOp.getInits();
578  ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
579  ValueRange oldYieldOperands =
580  cast<scf::YieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
581  // Get additional iterOperands, iterArgs, and yield operands. We will
582  // fix iterOperands and yield operands after cloning of sub-blocks.
583  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
584  dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
585  dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
586  }
587  // Create a new loop with additional iterOperands, iter_args and yield
588  // operands. This new loop will take the loop body of the original loop.
589  bool forOpReplaced = oldForOp == forOp;
590  scf::ForOp newForOp =
591  cast<scf::ForOp>(*oldForOp.replaceWithAdditionalYields(
592  rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
593  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
594  return dupYieldOperands;
595  }));
596  newInnerLoops.push_back(newForOp);
597  // `forOp` has been replaced with a new loop.
598  if (forOpReplaced)
599  forOp = newForOp;
600  // Update `operandMaps` for `newForOp` iterArgs and results.
601  ValueRange newIterArgs = newForOp.getRegionIterArgs();
602  unsigned oldNumIterArgs = oldIterArgs.size();
603  ValueRange newResults = newForOp.getResults();
604  unsigned oldNumResults = newResults.size() / unrollJamFactor;
605  assert(oldNumIterArgs == oldNumResults &&
606  "oldNumIterArgs must be the same as oldNumResults");
607  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
608  for (unsigned j = 0; j < oldNumIterArgs; ++j) {
609  // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
610  // results. Update `operandMaps[i - 1]` to map old iterArgs and results
611  // to those in the `i`th new set.
612  operandMaps[i - 1].map(newIterArgs[j],
613  newIterArgs[i * oldNumIterArgs + j]);
614  operandMaps[i - 1].map(newResults[j],
615  newResults[i * oldNumResults + j]);
616  }
617  }
618  }
619 
620  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
621  rewriter.setInsertionPoint(forOp);
622  int64_t step = forOp.getConstantStep()->getSExtValue();
623  auto newStep = rewriter.createOrFold<arith::MulIOp>(
624  forOp.getLoc(), forOp.getStep(),
625  rewriter.createOrFold<arith::ConstantOp>(
626  forOp.getLoc(), rewriter.getIndexAttr(unrollJamFactor)));
627  forOp.setStep(newStep);
628  auto forOpIV = forOp.getInductionVar();
629 
630  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
631  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
632  for (auto &subBlock : subBlocks) {
633  // Builder to insert unroll-jammed bodies. Insert right at the end of
634  // sub-block.
635  OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
636 
637  // If the induction variable is used, create a remapping to the value for
638  // this unrolled instance.
639  if (!forOpIV.use_empty()) {
640  // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
641  auto ivTag = builder.createOrFold<arith::ConstantOp>(
642  forOp.getLoc(), builder.getIndexAttr(step * i));
643  auto ivUnroll =
644  builder.createOrFold<arith::AddIOp>(forOp.getLoc(), forOpIV, ivTag);
645  operandMaps[i - 1].map(forOpIV, ivUnroll);
646  }
647  // Clone the sub-block being unroll-jammed.
648  for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
649  builder.clone(*it, operandMaps[i - 1]);
650  }
651  // Fix iterOperands and yield op operands of newly created loops.
652  for (auto newForOp : newInnerLoops) {
653  unsigned oldNumIterOperands =
654  newForOp.getNumRegionIterArgs() / unrollJamFactor;
655  unsigned numControlOperands = newForOp.getNumControlOperands();
656  auto yieldOp = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
657  unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
658  assert(oldNumIterOperands == oldNumYieldOperands &&
659  "oldNumIterOperands must be the same as oldNumYieldOperands");
660  for (unsigned j = 0; j < oldNumIterOperands; ++j) {
661  // The `i`th duplication of an old iterOperand or yield op operand
662  // needs to be replaced with a mapped value from `operandMaps[i - 1]`
663  // if such mapped value exists.
664  newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
665  operandMaps[i - 1].lookupOrDefault(
666  newForOp.getOperand(numControlOperands + j)));
667  yieldOp.setOperand(
668  i * oldNumYieldOperands + j,
669  operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
670  }
671  }
672  }
673 
674  // Promote the loop body up if this has turned into a single iteration loop.
675  (void)forOp.promoteIfSingleIteration(rewriter);
676  return success();
677 }
678 
680  OpFoldResult lb, OpFoldResult ub,
681  OpFoldResult step) {
682  Range normalizedLoopBounds;
683  normalizedLoopBounds.offset = rewriter.getIndexAttr(0);
684  normalizedLoopBounds.stride = rewriter.getIndexAttr(1);
685  AffineExpr s0, s1, s2;
686  bindSymbols(rewriter.getContext(), s0, s1, s2);
687  AffineExpr e = (s1 - s0).ceilDiv(s2);
688  normalizedLoopBounds.size =
689  affine::makeComposedFoldedAffineApply(rewriter, loc, e, {lb, ub, step});
690  return normalizedLoopBounds;
691 }
692 
694  OpFoldResult lb, OpFoldResult ub,
695  OpFoldResult step) {
696  if (getType(lb).isIndex()) {
697  return emitNormalizedLoopBoundsForIndexType(rewriter, loc, lb, ub, step);
698  }
699  // For non-index types, generate `arith` instructions
700  // Check if the loop is already known to have a constant zero lower bound or
701  // a constant one step.
702  bool isZeroBased = false;
703  if (auto lbCst = getConstantIntValue(lb))
704  isZeroBased = lbCst.value() == 0;
705 
706  bool isStepOne = false;
707  if (auto stepCst = getConstantIntValue(step))
708  isStepOne = stepCst.value() == 1;
709 
710  Type rangeType = getType(lb);
711  assert(rangeType == getType(ub) && rangeType == getType(step) &&
712  "expected matching types");
713 
714  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
715  // assuming the step is strictly positive. Update the bounds and the step
716  // of the loop to go from 0 to the number of iterations, if necessary.
717  if (isZeroBased && isStepOne)
718  return {lb, ub, step};
719 
720  OpFoldResult diff = ub;
721  if (!isZeroBased) {
722  diff = rewriter.createOrFold<arith::SubIOp>(
723  loc, getValueOrCreateConstantIntOp(rewriter, loc, ub),
724  getValueOrCreateConstantIntOp(rewriter, loc, lb));
725  }
726  OpFoldResult newUpperBound = diff;
727  if (!isStepOne) {
728  newUpperBound = rewriter.createOrFold<arith::CeilDivSIOp>(
729  loc, getValueOrCreateConstantIntOp(rewriter, loc, diff),
730  getValueOrCreateConstantIntOp(rewriter, loc, step));
731  }
732 
733  OpFoldResult newLowerBound = rewriter.getZeroAttr(rangeType);
734  OpFoldResult newStep = rewriter.getOneAttr(rangeType);
735 
736  return {newLowerBound, newUpperBound, newStep};
737 }
738 
740  Location loc,
741  Value normalizedIv,
742  OpFoldResult origLb,
743  OpFoldResult origStep) {
744  AffineExpr d0, s0, s1;
745  bindSymbols(rewriter.getContext(), s0, s1);
746  bindDims(rewriter.getContext(), d0);
747  AffineExpr e = d0 * s1 + s0;
749  rewriter, loc, e, ArrayRef<OpFoldResult>{normalizedIv, origLb, origStep});
750  Value denormalizedIvVal =
751  getValueOrCreateConstantIndexOp(rewriter, loc, denormalizedIv);
752  SmallPtrSet<Operation *, 1> preservedUses;
753  // If an `affine.apply` operation is generated for denormalization, the use
754  // of `origLb` in those ops must not be replaced. These arent not generated
755  // when `origLb == 0` and `origStep == 1`.
756  if (!isConstantIntValue(origLb, 0) || !isConstantIntValue(origStep, 1)) {
757  if (Operation *preservedUse = denormalizedIvVal.getDefiningOp()) {
758  preservedUses.insert(preservedUse);
759  }
760  }
761  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIvVal, preservedUses);
762 }
763 
765  Value normalizedIv, OpFoldResult origLb,
766  OpFoldResult origStep) {
767  if (getType(origLb).isIndex()) {
768  return denormalizeInductionVariableForIndexType(rewriter, loc, normalizedIv,
769  origLb, origStep);
770  }
771  Value denormalizedIv;
773  bool isStepOne = isConstantIntValue(origStep, 1);
774  bool isZeroBased = isConstantIntValue(origLb, 0);
775 
776  Value scaled = normalizedIv;
777  if (!isStepOne) {
778  Value origStepValue =
779  getValueOrCreateConstantIntOp(rewriter, loc, origStep);
780  scaled = rewriter.create<arith::MulIOp>(loc, normalizedIv, origStepValue);
781  preserve.insert(scaled.getDefiningOp());
782  }
783  denormalizedIv = scaled;
784  if (!isZeroBased) {
785  Value origLbValue = getValueOrCreateConstantIntOp(rewriter, loc, origLb);
786  denormalizedIv = rewriter.create<arith::AddIOp>(loc, scaled, origLbValue);
787  preserve.insert(denormalizedIv.getDefiningOp());
788  }
789 
790  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
791 }
792 
794  ArrayRef<OpFoldResult> values) {
795  assert(!values.empty() && "unexecpted empty array");
796  AffineExpr s0, s1;
797  bindSymbols(rewriter.getContext(), s0, s1);
798  AffineExpr mul = s0 * s1;
799  OpFoldResult products = rewriter.getIndexAttr(1);
800  for (auto v : values) {
802  rewriter, loc, mul, ArrayRef<OpFoldResult>{products, v});
803  }
804  return products;
805 }
806 
807 /// Helper function to multiply a sequence of values.
809  ArrayRef<Value> values) {
810  assert(!values.empty() && "unexpected empty list");
811  if (getType(values.front()).isIndex()) {
813  OpFoldResult product = getProductOfIndexes(rewriter, loc, ofrs);
814  return getValueOrCreateConstantIndexOp(rewriter, loc, product);
815  }
816  std::optional<Value> productOf;
817  for (auto v : values) {
818  auto vOne = getConstantIntValue(v);
819  if (vOne && vOne.value() == 1)
820  continue;
821  if (productOf)
822  productOf =
823  rewriter.create<arith::MulIOp>(loc, productOf.value(), v).getResult();
824  else
825  productOf = v;
826  }
827  if (!productOf) {
828  productOf = rewriter
829  .create<arith::ConstantOp>(
830  loc, rewriter.getOneAttr(getType(values.front())))
831  .getResult();
832  }
833  return productOf.value();
834 }
835 
836 /// For each original loop, the value of the
837 /// induction variable can be obtained by dividing the induction variable of
838 /// the linearized loop by the total number of iterations of the loops nested
839 /// in it modulo the number of iterations in this loop (remove the values
840 /// related to the outer loops):
841 /// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
842 /// Compute these iteratively from the innermost loop by creating a "running
843 /// quotient" of division by the range.
844 static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
846  Value linearizedIv, ArrayRef<Value> ubs) {
847 
848  if (linearizedIv.getType().isIndex()) {
849  Operation *delinearizedOp =
850  rewriter.create<affine::AffineDelinearizeIndexOp>(loc, linearizedIv,
851  ubs);
852  auto resultVals = llvm::map_to_vector(
853  delinearizedOp->getResults(), [](OpResult r) -> Value { return r; });
854  return {resultVals, SmallPtrSet<Operation *, 2>{delinearizedOp}};
855  }
856 
857  SmallVector<Value> delinearizedIvs(ubs.size());
858  SmallPtrSet<Operation *, 2> preservedUsers;
859 
860  llvm::BitVector isUbOne(ubs.size());
861  for (auto [index, ub] : llvm::enumerate(ubs)) {
862  auto ubCst = getConstantIntValue(ub);
863  if (ubCst && ubCst.value() == 1)
864  isUbOne.set(index);
865  }
866 
867  // Prune the lead ubs that are all ones.
868  unsigned numLeadingOneUbs = 0;
869  for (auto [index, ub] : llvm::enumerate(ubs)) {
870  if (!isUbOne.test(index)) {
871  break;
872  }
873  delinearizedIvs[index] = rewriter.create<arith::ConstantOp>(
874  loc, rewriter.getZeroAttr(ub.getType()));
875  numLeadingOneUbs++;
876  }
877 
878  Value previous = linearizedIv;
879  for (unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
880  unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
881  if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
882  previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
883  preservedUsers.insert(previous.getDefiningOp());
884  }
885  Value iv = previous;
886  if (i != e - 1) {
887  if (!isUbOne.test(idx)) {
888  iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
889  preservedUsers.insert(iv.getDefiningOp());
890  } else {
891  iv = rewriter.create<arith::ConstantOp>(
892  loc, rewriter.getZeroAttr(ubs[idx].getType()));
893  }
894  }
895  delinearizedIvs[idx] = iv;
896  }
897  return {delinearizedIvs, preservedUsers};
898 }
899 
900 LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
902  if (loops.size() < 2)
903  return failure();
904 
905  scf::ForOp innermost = loops.back();
906  scf::ForOp outermost = loops.front();
907 
908  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
909  // allows the following code to assume upperBound is the number of iterations.
910  for (auto loop : loops) {
911  OpBuilder::InsertionGuard g(rewriter);
912  rewriter.setInsertionPoint(outermost);
913  Value lb = loop.getLowerBound();
914  Value ub = loop.getUpperBound();
915  Value step = loop.getStep();
916  auto newLoopRange =
917  emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
918 
919  rewriter.modifyOpInPlace(loop, [&]() {
920  loop.setLowerBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
921  newLoopRange.offset));
922  loop.setUpperBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
923  newLoopRange.size));
924  loop.setStep(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
925  newLoopRange.stride));
926  });
927  rewriter.setInsertionPointToStart(innermost.getBody());
928  denormalizeInductionVariable(rewriter, loop.getLoc(),
929  loop.getInductionVar(), lb, step);
930  }
931 
932  // 2. Emit code computing the upper bound of the coalesced loop as product
933  // of the number of iterations of all loops.
934  OpBuilder::InsertionGuard g(rewriter);
935  rewriter.setInsertionPoint(outermost);
936  Location loc = outermost.getLoc();
937  SmallVector<Value> upperBounds = llvm::map_to_vector(
938  loops, [](auto loop) { return loop.getUpperBound(); });
939  Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
940  outermost.setUpperBound(upperBound);
941 
942  rewriter.setInsertionPointToStart(innermost.getBody());
943  auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
944  rewriter, loc, outermost.getInductionVar(), upperBounds);
945  rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
946  preservedUsers);
947 
948  for (int i = loops.size() - 1; i > 0; --i) {
949  auto outerLoop = loops[i - 1];
950  auto innerLoop = loops[i];
951 
952  Operation *innerTerminator = innerLoop.getBody()->getTerminator();
953  auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
954  assert(llvm::equal(outerLoop.getRegionIterArgs(), innerLoop.getInitArgs()));
955  for (Value &yieldedVal : yieldedVals) {
956  // The yielded value may be an iteration argument of the inner loop
957  // which is about to be inlined.
958  auto iter = llvm::find(innerLoop.getRegionIterArgs(), yieldedVal);
959  if (iter != innerLoop.getRegionIterArgs().end()) {
960  unsigned iterArgIndex = iter - innerLoop.getRegionIterArgs().begin();
961  // `outerLoop` iter args identical to the `innerLoop` init args.
962  assert(iterArgIndex < innerLoop.getInitArgs().size());
963  yieldedVal = innerLoop.getInitArgs()[iterArgIndex];
964  }
965  }
966  rewriter.eraseOp(innerTerminator);
967 
968  SmallVector<Value> innerBlockArgs;
969  innerBlockArgs.push_back(delinearizeIvs[i]);
970  llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
971  rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
972  Block::iterator(innerLoop), innerBlockArgs);
973  rewriter.replaceOp(innerLoop, yieldedVals);
974  }
975  return success();
976 }
977 
979  if (loops.empty()) {
980  return failure();
981  }
982  IRRewriter rewriter(loops.front().getContext());
983  return coalesceLoops(rewriter, loops);
984 }
985 
986 LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
987  LogicalResult result(failure());
989  getPerfectlyNestedLoops(loops, op);
990 
991  // Look for a band of loops that can be coalesced, i.e. perfectly nested
992  // loops with bounds defined above some loop.
993 
994  // 1. For each loop, find above which parent loop its bounds operands are
995  // defined.
996  SmallVector<unsigned> operandsDefinedAbove(loops.size());
997  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
998  operandsDefinedAbove[i] = i;
999  for (unsigned j = 0; j < i; ++j) {
1000  SmallVector<Value> boundsOperands = {loops[i].getLowerBound(),
1001  loops[i].getUpperBound(),
1002  loops[i].getStep()};
1003  if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) {
1004  operandsDefinedAbove[i] = j;
1005  break;
1006  }
1007  }
1008  }
1009 
1010  // 2. For each inner loop check that the iter_args for the immediately outer
1011  // loop are the init for the immediately inner loop and that the yields of the
1012  // return of the inner loop is the yield for the immediately outer loop. Keep
1013  // track of where the chain starts from for each loop.
1014  SmallVector<unsigned> iterArgChainStart(loops.size());
1015  iterArgChainStart[0] = 0;
1016  for (unsigned i = 1, e = loops.size(); i < e; ++i) {
1017  // By default set the start of the chain to itself.
1018  iterArgChainStart[i] = i;
1019  auto outerloop = loops[i - 1];
1020  auto innerLoop = loops[i];
1021  if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
1022  continue;
1023  }
1024  if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
1025  continue;
1026  }
1027  auto outerloopTerminator = outerloop.getBody()->getTerminator();
1028  if (!llvm::equal(outerloopTerminator->getOperands(),
1029  innerLoop.getResults())) {
1030  continue;
1031  }
1032  iterArgChainStart[i] = iterArgChainStart[i - 1];
1033  }
1034 
1035  // 3. Identify bands of loops such that the operands of all of them are
1036  // defined above the first loop in the band. Traverse the nest bottom-up
1037  // so that modifications don't invalidate the inner loops.
1038  for (unsigned end = loops.size(); end > 0; --end) {
1039  unsigned start = 0;
1040  for (; start < end - 1; ++start) {
1041  auto maxPos =
1042  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
1043  std::next(operandsDefinedAbove.begin(), end));
1044  if (maxPos > start)
1045  continue;
1046  if (iterArgChainStart[end - 1] > start)
1047  continue;
1048  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
1049  if (succeeded(coalesceLoops(band)))
1050  result = success();
1051  break;
1052  }
1053  // If a band was found and transformed, keep looking at the loops above
1054  // the outermost transformed loop.
1055  if (start != end - 1)
1056  end = start + 1;
1057  }
1058  return result;
1059 }
1060 
1062  RewriterBase &rewriter, scf::ParallelOp loops,
1063  ArrayRef<std::vector<unsigned>> combinedDimensions) {
1064  OpBuilder::InsertionGuard g(rewriter);
1065  rewriter.setInsertionPoint(loops);
1066  Location loc = loops.getLoc();
1067 
1068  // Presort combined dimensions.
1069  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
1070  for (auto &dims : sortedDimensions)
1071  llvm::sort(dims);
1072 
1073  // Normalize ParallelOp's iteration pattern.
1074  SmallVector<Value, 3> normalizedUpperBounds;
1075  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
1076  OpBuilder::InsertionGuard g2(rewriter);
1077  rewriter.setInsertionPoint(loops);
1078  Value lb = loops.getLowerBound()[i];
1079  Value ub = loops.getUpperBound()[i];
1080  Value step = loops.getStep()[i];
1081  auto newLoopRange = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
1082  normalizedUpperBounds.push_back(getValueOrCreateConstantIntOp(
1083  rewriter, loops.getLoc(), newLoopRange.size));
1084 
1085  rewriter.setInsertionPointToStart(loops.getBody());
1086  denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
1087  step);
1088  }
1089 
1090  // Combine iteration spaces.
1091  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
1092  auto cst0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
1093  auto cst1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
1094  for (auto &sortedDimension : sortedDimensions) {
1095  Value newUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, 1);
1096  for (auto idx : sortedDimension) {
1097  newUpperBound = rewriter.create<arith::MulIOp>(
1098  loc, newUpperBound, normalizedUpperBounds[idx]);
1099  }
1100  lowerBounds.push_back(cst0);
1101  steps.push_back(cst1);
1102  upperBounds.push_back(newUpperBound);
1103  }
1104 
1105  // Create new ParallelLoop with conversions to the original induction values.
1106  // The loop below uses divisions to get the relevant range of values in the
1107  // new induction value that represent each range of the original induction
1108  // value. The remainders then determine based on that range, which iteration
1109  // of the original induction value this represents. This is a normalized value
1110  // that is un-normalized already by the previous logic.
1111  auto newPloop = rewriter.create<scf::ParallelOp>(
1112  loc, lowerBounds, upperBounds, steps,
1113  [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
1114  for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
1115  Value previous = ploopIVs[i];
1116  unsigned numberCombinedDimensions = combinedDimensions[i].size();
1117  // Iterate over all except the last induction value.
1118  for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
1119  unsigned idx = combinedDimensions[i][j];
1120 
1121  // Determine the current induction value's current loop iteration
1122  Value iv = insideBuilder.create<arith::RemSIOp>(
1123  loc, previous, normalizedUpperBounds[idx]);
1124  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
1125  loops.getRegion());
1126 
1127  // Remove the effect of the current induction value to prepare for
1128  // the next value.
1129  previous = insideBuilder.create<arith::DivSIOp>(
1130  loc, previous, normalizedUpperBounds[idx]);
1131  }
1132 
1133  // The final induction value is just the remaining value.
1134  unsigned idx = combinedDimensions[i][0];
1135  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
1136  previous, loops.getRegion());
1137  }
1138  });
1139 
1140  // Replace the old loop with the new loop.
1141  loops.getBody()->back().erase();
1142  newPloop.getBody()->getOperations().splice(
1143  Block::iterator(newPloop.getBody()->back()),
1144  loops.getBody()->getOperations());
1145  loops.erase();
1146 }
1147 
1148 // Hoist the ops within `outer` that appear before `inner`.
1149 // Such ops include the ops that have been introduced by parametric tiling.
1150 // Ops that come from triangular loops (i.e. that belong to the program slice
1151 // rooted at `outer`) and ops that have side effects cannot be hoisted.
1152 // Return failure when any op fails to hoist.
1153 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
1154  SetVector<Operation *> forwardSlice;
1156  options.filter = [&inner](Operation *op) {
1157  return op != inner.getOperation();
1158  };
1159  getForwardSlice(outer.getInductionVar(), &forwardSlice, options);
1160  LogicalResult status = success();
1162  for (auto &op : outer.getBody()->without_terminator()) {
1163  // Stop when encountering the inner loop.
1164  if (&op == inner.getOperation())
1165  break;
1166  // Skip over non-hoistable ops.
1167  if (forwardSlice.count(&op) > 0) {
1168  status = failure();
1169  continue;
1170  }
1171  // Skip intermediate scf::ForOp, these are not considered a failure.
1172  if (isa<scf::ForOp>(op))
1173  continue;
1174  // Skip other ops with regions.
1175  if (op.getNumRegions() > 0) {
1176  status = failure();
1177  continue;
1178  }
1179  // Skip if op has side effects.
1180  // TODO: loads to immutable memory regions are ok.
1181  if (!isMemoryEffectFree(&op)) {
1182  status = failure();
1183  continue;
1184  }
1185  toHoist.push_back(&op);
1186  }
1187  auto *outerForOp = outer.getOperation();
1188  for (auto *op : toHoist)
1189  op->moveBefore(outerForOp);
1190  return status;
1191 }
1192 
1193 // Traverse the interTile and intraTile loops and try to hoist ops such that
1194 // bands of perfectly nested loops are isolated.
1195 // Return failure if either perfect interTile or perfect intraTile bands cannot
1196 // be formed.
1197 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
1198  LogicalResult status = success();
1199  const Loops &interTile = tileLoops.first;
1200  const Loops &intraTile = tileLoops.second;
1201  auto size = interTile.size();
1202  assert(size == intraTile.size());
1203  if (size <= 1)
1204  return success();
1205  for (unsigned s = 1; s < size; ++s)
1206  status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
1207  : failure();
1208  for (unsigned s = 1; s < size; ++s)
1209  status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
1210  : failure();
1211  return status;
1212 }
1213 
1214 /// Collect perfectly nested loops starting from `rootForOps`. Loops are
1215 /// perfectly nested if each loop is the first and only non-terminator operation
1216 /// in the parent loop. Collect at most `maxLoops` loops and append them to
1217 /// `forOps`.
1218 template <typename T>
1220  SmallVectorImpl<T> &forOps, T rootForOp,
1221  unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
1222  for (unsigned i = 0; i < maxLoops; ++i) {
1223  forOps.push_back(rootForOp);
1224  Block &body = rootForOp.getRegion().front();
1225  if (body.begin() != std::prev(body.end(), 2))
1226  return;
1227 
1228  rootForOp = dyn_cast<T>(&body.front());
1229  if (!rootForOp)
1230  return;
1231  }
1232 }
1233 
1234 static Loops stripmineSink(scf::ForOp forOp, Value factor,
1235  ArrayRef<scf::ForOp> targets) {
1236  auto originalStep = forOp.getStep();
1237  auto iv = forOp.getInductionVar();
1238 
1239  OpBuilder b(forOp);
1240  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
1241 
1242  Loops innerLoops;
1243  for (auto t : targets) {
1244  // Save information for splicing ops out of t when done
1245  auto begin = t.getBody()->begin();
1246  auto nOps = t.getBody()->getOperations().size();
1247 
1248  // Insert newForOp before the terminator of `t`.
1249  auto b = OpBuilder::atBlockTerminator((t.getBody()));
1250  Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
1251  Value ub =
1252  b.create<arith::MinSIOp>(t.getLoc(), forOp.getUpperBound(), stepped);
1253 
1254  // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
1255  auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
1256  newForOp.getBody()->getOperations().splice(
1257  newForOp.getBody()->getOperations().begin(),
1258  t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
1259  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1260  newForOp.getRegion());
1261 
1262  innerLoops.push_back(newForOp);
1263  }
1264 
1265  return innerLoops;
1266 }
1267 
1268 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1269 // Returns the new for operation, nested immediately under `target`.
1270 template <typename SizeType>
1271 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
1272  scf::ForOp target) {
1273  // TODO: Use cheap structural assertions that targets are nested under
1274  // forOp and that targets are not nested under each other when DominanceInfo
1275  // exposes the capability. It seems overkill to construct a whole function
1276  // dominance tree at this point.
1277  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
1278  assert(res.size() == 1 && "Expected 1 inner forOp");
1279  return res[0];
1280 }
1281 
1283  ArrayRef<Value> sizes,
1284  ArrayRef<scf::ForOp> targets) {
1286  SmallVector<scf::ForOp, 8> currentTargets(targets);
1287  for (auto it : llvm::zip(forOps, sizes)) {
1288  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1289  res.push_back(step);
1290  currentTargets = step;
1291  }
1292  return res;
1293 }
1294 
1296  scf::ForOp target) {
1298  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
1299  assert(loops.size() == 1);
1300  res.push_back(loops[0]);
1301  }
1302  return res;
1303 }
1304 
1305 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
1306  // Collect perfectly nested loops. If more size values provided than nested
1307  // loops available, truncate `sizes`.
1309  forOps.reserve(sizes.size());
1310  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1311  if (forOps.size() < sizes.size())
1312  sizes = sizes.take_front(forOps.size());
1313 
1314  return ::tile(forOps, sizes, forOps.back());
1315 }
1316 
1318  scf::ForOp root) {
1319  getPerfectlyNestedLoopsImpl(nestedLoops, root);
1320 }
1321 
1323  ArrayRef<int64_t> sizes) {
1324  // Collect perfectly nested loops. If more size values provided than nested
1325  // loops available, truncate `sizes`.
1327  forOps.reserve(sizes.size());
1328  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1329  if (forOps.size() < sizes.size())
1330  sizes = sizes.take_front(forOps.size());
1331 
1332  // Compute the tile sizes such that i-th outer loop executes size[i]
1333  // iterations. Given that the loop current executes
1334  // numIterations = ceildiv((upperBound - lowerBound), step)
1335  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
1336  SmallVector<Value, 4> tileSizes;
1337  tileSizes.reserve(sizes.size());
1338  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
1339  assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
1340 
1341  auto forOp = forOps[i];
1342  OpBuilder builder(forOp);
1343  auto loc = forOp.getLoc();
1344  Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
1345  forOp.getLowerBound());
1346  Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
1347  Value iterationsPerBlock =
1348  ceilDivPositive(builder, loc, numIterations, sizes[i]);
1349  tileSizes.push_back(iterationsPerBlock);
1350  }
1351 
1352  // Call parametric tiling with the given sizes.
1353  auto intraTile = tile(forOps, tileSizes, forOps.back());
1354  TileLoops tileLoops = std::make_pair(forOps, intraTile);
1355 
1356  // TODO: for now we just ignore the result of band isolation.
1357  // In the future, mapping decisions may be impacted by the ability to
1358  // isolate perfectly nested bands.
1359  (void)tryIsolateBands(tileLoops);
1360 
1361  return tileLoops;
1362 }
1363 
1364 scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
1365  scf::ForallOp source,
1366  RewriterBase &rewriter) {
1367  unsigned numTargetOuts = target.getNumResults();
1368  unsigned numSourceOuts = source.getNumResults();
1369 
1370  // Create fused shared_outs.
1371  SmallVector<Value> fusedOuts;
1372  llvm::append_range(fusedOuts, target.getOutputs());
1373  llvm::append_range(fusedOuts, source.getOutputs());
1374 
1375  // Create a new scf.forall op after the source loop.
1376  rewriter.setInsertionPointAfter(source);
1377  scf::ForallOp fusedLoop = rewriter.create<scf::ForallOp>(
1378  source.getLoc(), source.getMixedLowerBound(), source.getMixedUpperBound(),
1379  source.getMixedStep(), fusedOuts, source.getMapping());
1380 
1381  // Map control operands.
1382  IRMapping mapping;
1383  mapping.map(target.getInductionVars(), fusedLoop.getInductionVars());
1384  mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
1385 
1386  // Map shared outs.
1387  mapping.map(target.getRegionIterArgs(),
1388  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1389  mapping.map(source.getRegionIterArgs(),
1390  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1391 
1392  // Append everything except the terminator into the fused operation.
1393  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1394  for (Operation &op : target.getBody()->without_terminator())
1395  rewriter.clone(op, mapping);
1396  for (Operation &op : source.getBody()->without_terminator())
1397  rewriter.clone(op, mapping);
1398 
1399  // Fuse the old terminator in_parallel ops into the new one.
1400  scf::InParallelOp targetTerm = target.getTerminator();
1401  scf::InParallelOp sourceTerm = source.getTerminator();
1402  scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
1403  rewriter.setInsertionPointToStart(fusedTerm.getBody());
1404  for (Operation &op : targetTerm.getYieldingOps())
1405  rewriter.clone(op, mapping);
1406  for (Operation &op : sourceTerm.getYieldingOps())
1407  rewriter.clone(op, mapping);
1408 
1409  // Replace old loops by substituting their uses by results of the fused loop.
1410  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1411  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1412 
1413  return fusedLoop;
1414 }
1415 
1416 scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
1417  scf::ForOp source,
1418  RewriterBase &rewriter) {
1419  unsigned numTargetOuts = target.getNumResults();
1420  unsigned numSourceOuts = source.getNumResults();
1421 
1422  // Create fused init_args, with target's init_args before source's init_args.
1423  SmallVector<Value> fusedInitArgs;
1424  llvm::append_range(fusedInitArgs, target.getInitArgs());
1425  llvm::append_range(fusedInitArgs, source.getInitArgs());
1426 
1427  // Create a new scf.for op after the source loop (with scf.yield terminator
1428  // (without arguments) only in case its init_args is empty).
1429  rewriter.setInsertionPointAfter(source);
1430  scf::ForOp fusedLoop = rewriter.create<scf::ForOp>(
1431  source.getLoc(), source.getLowerBound(), source.getUpperBound(),
1432  source.getStep(), fusedInitArgs);
1433 
1434  // Map original induction variables and operands to those of the fused loop.
1435  IRMapping mapping;
1436  mapping.map(target.getInductionVar(), fusedLoop.getInductionVar());
1437  mapping.map(target.getRegionIterArgs(),
1438  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1439  mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
1440  mapping.map(source.getRegionIterArgs(),
1441  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1442 
1443  // Merge target's body into the new (fused) for loop and then source's body.
1444  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1445  for (Operation &op : target.getBody()->without_terminator())
1446  rewriter.clone(op, mapping);
1447  for (Operation &op : source.getBody()->without_terminator())
1448  rewriter.clone(op, mapping);
1449 
1450  // Build fused yield results by appropriately mapping original yield operands.
1451  SmallVector<Value> yieldResults;
1452  for (Value operand : target.getBody()->getTerminator()->getOperands())
1453  yieldResults.push_back(mapping.lookupOrDefault(operand));
1454  for (Value operand : source.getBody()->getTerminator()->getOperands())
1455  yieldResults.push_back(mapping.lookupOrDefault(operand));
1456  if (!yieldResults.empty())
1457  rewriter.create<scf::YieldOp>(source.getLoc(), yieldResults);
1458 
1459  // Replace old loops by substituting their uses by results of the fused loop.
1460  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1461  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1462 
1463  return fusedLoop;
1464 }
1465 
1466 FailureOr<scf::ForallOp> mlir::normalizeForallOp(RewriterBase &rewriter,
1467  scf::ForallOp forallOp) {
1468  SmallVector<OpFoldResult> lbs = forallOp.getMixedLowerBound();
1469  SmallVector<OpFoldResult> ubs = forallOp.getMixedUpperBound();
1470  SmallVector<OpFoldResult> steps = forallOp.getMixedStep();
1471 
1472  if (llvm::all_of(
1473  lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
1474  llvm::all_of(
1475  steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
1476  return forallOp;
1477  }
1478 
1479  SmallVector<OpFoldResult> newLbs, newUbs, newSteps;
1480  for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
1481  Range normalizedLoopParams =
1482  emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step);
1483  newLbs.push_back(normalizedLoopParams.offset);
1484  newUbs.push_back(normalizedLoopParams.size);
1485  newSteps.push_back(normalizedLoopParams.stride);
1486  }
1487 
1488  auto normalizedForallOp = rewriter.create<scf::ForallOp>(
1489  forallOp.getLoc(), newLbs, newUbs, newSteps, forallOp.getOutputs(),
1490  forallOp.getMapping(), [](OpBuilder &, Location, ValueRange) {});
1491 
1492  rewriter.inlineRegionBefore(forallOp.getBodyRegion(),
1493  normalizedForallOp.getBodyRegion(),
1494  normalizedForallOp.getBodyRegion().begin());
1495 
1496  rewriter.replaceAllOpUsesWith(forallOp, normalizedForallOp);
1497  return success();
1498 }
static std::optional< int64_t > getConstantTripCount(scf::ForOp forOp)
Returns the trip count of forOp if its' low bound, high bound and step are constants,...
Definition: Utils.cpp:303
static OpFoldResult getProductOfIndexes(RewriterBase &rewriter, Location loc, ArrayRef< OpFoldResult > values)
Definition: Utils.cpp:793
static LogicalResult tryIsolateBands(const TileLoops &tileLoops)
Definition: Utils.cpp:1197
static void getPerfectlyNestedLoopsImpl(SmallVectorImpl< T > &forOps, T rootForOp, unsigned maxLoops=std::numeric_limits< unsigned >::max())
Collect perfectly nested loops starting from rootForOps.
Definition: Utils.cpp:1219
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner)
Definition: Utils.cpp:1153
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of scf::ForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor',...
Definition: Utils.cpp:323
static Loops stripmineSink(scf::ForOp forOp, Value factor, ArrayRef< scf::ForOp > targets)
Definition: Utils.cpp:1234
static std::pair< SmallVector< Value >, SmallPtrSet< Operation *, 2 > > delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef< Value > ubs)
For each original loop, the value of the induction variable can be obtained by dividing the induction...
Definition: Utils.cpp:845
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor)
Definition: Utils.cpp:271
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef< Value > values)
Helper function to multiply a sequence of values.
Definition: Utils.cpp:808
static void denormalizeInductionVariableForIndexType(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Definition: Utils.cpp:739
Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Definition: Utils.cpp:679
#define LDBG(X)
Definition: Utils.cpp:38
static bool areInnerBoundsInvariant(scf::ForOp forOp)
Check if bounds of all inner loops are defined outside of forOp and return false if not.
Definition: Utils.cpp:502
static int64_t product(ArrayRef< int64_t > vals)
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
This class represents an argument of a Block.
Definition: Value.h:319
Block represents an ordered list of Operations.
Definition: Block.h:33
OpListType::iterator iterator
Definition: Block.h:140
unsigned getNumArguments()
Definition: Block.h:128
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:246
BlockArgListType getArguments()
Definition: Block.h:87
Operation & front()
Definition: Block.h:153
iterator end()
Definition: Block.h:144
iterator begin()
Definition: Block.h:143
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:148
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition: Builders.cpp:268
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:364
MLIRContext * getContext() const
Definition: Builders.h:56
TypedAttr getOneAttr(Type type)
Definition: Builders.cpp:382
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:772
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:357
This class helps build Operations.
Definition: Builders.h:216
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:588
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:440
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:407
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:261
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:445
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes=std::nullopt, ArrayRef< Location > locs=std::nullopt)
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:470
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:529
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:497
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:421
This class represents a single result from folding an operation.
Definition: OpDefinition.h:268
This class represents an operand of an operation.
Definition: Value.h:267
This is a value defined by a result of an operation.
Definition: Value.h:457
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:717
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:407
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:677
operand_type_range getOperandTypes()
Definition: Operation.h:397
result_type_range getResultTypes()
Definition: Operation.h:428
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:378
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
result_range getResults()
Definition: Operation.h:415
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:539
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
BlockArgListType getArguments()
Definition: Region.h:81
iterator begin()
Definition: Region.h:55
Block & front()
Definition: Region.h:65
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
bool hasOneBlock()
Return true if this region has exactly one block.
Definition: Region.h:68
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
void mergeBlocks(Block *source, Block *dest, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into the end of block 'dest'.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void replaceAllUsesExcept(Value from, Value to, Operation *exceptedUser)
Find uses of from and replace them with to except if the user is exceptedUser.
Definition: PatternMatch.h:708
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:636
void inlineRegionBefore(Region &region, Region &parent, Region::iterator before)
Move the blocks that belong to "region" before the given position in another region "parent".
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into block 'dest' before the given position.
void replaceAllOpUsesWith(Operation *from, ValueRange to)
Find uses of from and replace them with to.
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
bool isIndex() const
Definition: Types.cpp:64
bool isIntOrIndex() const
Return true if this is an integer (of any signedness) or an index type.
Definition: Types.cpp:123
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:218
void replaceUsesWithIf(Value newValue, function_ref< bool(OpOperand &)> shouldReplace)
Replace all uses of 'this' value with 'newValue' if the given callback returns true.
Definition: Value.cpp:81
Type getType() const
Return the type of this value.
Definition: Value.h:129
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
static WalkResult advance()
Definition: Visitors.h:51
static WalkResult interrupt()
Definition: Visitors.h:50
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:93
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Definition: AffineOps.cpp:1194
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1588
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
Include the generated interface declarations.
void getPerfectlyNestedLoops(SmallVectorImpl< scf::ForOp > &nestedLoops, scf::ForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: Utils.cpp:1317
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
LogicalResult outlineIfOp(RewriterBase &b, scf::IfOp ifOp, func::FuncOp *thenFn, StringRef thenFnName, func::FuncOp *elseFn, StringRef elseFnName)
Outline the then and/or else regions of ifOp as follows:
Definition: Utils.cpp:223
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:32
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(RewriterBase &rewriter, MutableArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
Definition: Utils.cpp:40
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition: Utils.cpp:305
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op)
Walk an affine.for to find a band to coalesce.
Definition: Utils.cpp:986
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
Definition: AffineExpr.h:348
std::pair< Loops, Loops > TileLoops
Definition: Utils.h:152
Value getValueOrCreateConstantIntOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:103
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops, ArrayRef< std::vector< unsigned >> combinedDimensions)
Take the ParallelLoop and for each set of dimension indices, combine them into a single dimension.
Definition: Utils.cpp:1061
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBbArgs)> NewYieldValuesFn
A function that returns the additional yielded values during replaceWithAdditionalYields.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef< Value > sizes)
Tile a nest of scf::ForOp loops rooted at rootForOp with the given (parametric) sizes.
Definition: Utils.cpp:1305
FailureOr< UnrolledLoopInfo > loopUnrollByFactor(scf::ForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr)
Unrolls this for operation by the specified unroll factor.
Definition: Utils.cpp:377
LogicalResult loopUnrollJamByFactor(scf::ForOp forOp, uint64_t unrollFactor)
Unrolls and jams this scf.for operation by the specified unroll factor.
Definition: Utils.cpp:515
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
Definition: Utils.cpp:246
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition: AffineExpr.h:362
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition: RegionUtils.cpp:67
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:112
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: Utils.cpp:1282
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
FailureOr< func::FuncOp > outlineSingleBlockRegion(RewriterBase &rewriter, Location loc, Region &region, StringRef funcName, func::CallOp *callOp=nullptr)
Outline a region with a single block into a new FuncOp.
Definition: Utils.cpp:119
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:24
void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Get back the original induction variable values after loop normalization.
Definition: Utils.cpp:764
scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForallOp source, RewriterBase &rewriter)
Given two scf.forall loops, target and source, fuses target into source.
Definition: Utils.cpp:1364
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: Utils.cpp:978
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter)
Given two scf.for loops, target and source, fuses target into source.
Definition: Utils.cpp:1416
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef< int64_t > sizes)
Definition: Utils.cpp:1322
Range emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Materialize bounds and step of a zero-based and unit-step loop derived by normalizing the specified b...
Definition: Utils.cpp:693
FailureOr< scf::ForallOp > normalizeForallOp(RewriterBase &rewriter, scf::ForallOp forallOp)
Normalize an scf.forall operation.
Definition: Utils.cpp:1466
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
OpFoldResult stride
OpFoldResult size
OpFoldResult offset
std::optional< scf::ForOp > epilogueLoopOp
Definition: Utils.h:116
std::optional< scf::ForOp > mainLoopOp
Definition: Utils.h:115
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.