MLIR  20.0.0git
Utils.cpp
Go to the documentation of this file.
1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
19 #include "mlir/IR/BuiltinOps.h"
20 #include "mlir/IR/IRMapping.h"
21 #include "mlir/IR/OpDefinition.h"
22 #include "mlir/IR/PatternMatch.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallPtrSet.h"
28 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <cstdint>
32 
33 using namespace mlir;
34 
35 #define DEBUG_TYPE "scf-utils"
36 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
37 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
38 
    RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loopNest,
    ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn,
    bool replaceIterOperandsUsesInLoop) {
  // Nothing to rewrite for an empty nest.
  if (loopNest.empty())
    return {};
  // This method is recursive (to make it more readable). Adding an
  // assertion here to limit the recursion. (See
  // https://discourse.llvm.org/t/rfc-update-to-mlir-developer-policy-on-recursion/62235)
  assert(loopNest.size() <= 10 &&
         "exceeded recursion limit when yielding value from loop nest");

  // To yield a value from a perfectly nested loop nest, the following
  // pattern needs to be created, i.e. starting with
  //
  // ```mlir
  // scf.for .. {
  //   scf.for .. {
  //     scf.for .. {
  //       %value = ...
  //     }
  //   }
  // }
  // ```
  //
  // needs to be modified to
  //
  // ```mlir
  // %0 = scf.for .. iter_args(%arg0 = %init) {
  //   %1 = scf.for .. iter_args(%arg1 = %arg0) {
  //     %2 = scf.for .. iter_args(%arg2 = %arg1) {
  //       %value = ...
  //       scf.yield %value
  //     }
  //     scf.yield %2
  //   }
  //   scf.yield %1
  // }
  // ```
  //
  // The inner most loop is handled using the `replaceWithAdditionalYields`
  // that works on a single loop.
  if (loopNest.size() == 1) {
    // Base case: rewrite the single remaining (inner-most) loop directly.
    auto innerMostLoop =
        cast<scf::ForOp>(*loopNest.back().replaceWithAdditionalYields(
            rewriter, newIterOperands, replaceIterOperandsUsesInLoop,
            newYieldValuesFn));
    return {innerMostLoop};
  }
  // The outer loops are modified by calling this method recursively
  // - The return value of the inner loop is the value yielded by this loop.
  // - The region iter args of this loop are the init_args for the inner loop.
  SmallVector<scf::ForOp> newLoopNest;
  // The recursion happens from inside the outer loop's yield generator: the
  // outer loop's new region iter_args seed the inner nest's iter operands,
  // and the new inner loop's trailing results become the values the outer
  // loop yields.
  NewYieldValuesFn fn =
      [&](OpBuilder &innerBuilder, Location loc,
          ArrayRef<BlockArgument> innerNewBBArgs) -> SmallVector<Value> {
    newLoopNest = replaceLoopNestWithNewYields(rewriter, loopNest.drop_front(),
                                               innerNewBBArgs, newYieldValuesFn,
                                               replaceIterOperandsUsesInLoop);
    return llvm::to_vector(llvm::map_range(
        newLoopNest.front().getResults().take_back(innerNewBBArgs.size()),
        [](OpResult r) -> Value { return r; }));
  };
  scf::ForOp outerMostLoop =
      cast<scf::ForOp>(*loopNest.front().replaceWithAdditionalYields(
          rewriter, newIterOperands, replaceIterOperandsUsesInLoop, fn));
  // Return the rewritten nest ordered outer-most to inner-most.
  newLoopNest.insert(newLoopNest.begin(), outerMostLoop);
  return newLoopNest;
}
108 
/// Outline a region with a single block into a new FuncOp.
/// Assumes the FuncOp result types is the type of the yielded operands of the
/// single block. This constraint makes it easy to determine the result.
/// This method also clones the `arith::ConstantIndexOp` at the start of
/// `outlinedFuncBody` to allow simple canonicalizations. If `callOp` is
/// provided, it will be set to point to the operation that calls the outlined
/// function.
// TODO: support more than single-block regions.
// TODO: more flexible constant handling.
FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                       Location loc,
                                                       Region &region,
                                                       StringRef funcName,
                                                       func::CallOp *callOp) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported for now.
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<func::FuncOp>());

  // Values defined above the region become explicit trailing arguments of
  // the outlined function.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The outlined function returns whatever the region's terminator yielded.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc =
      rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
                                    originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    SmallVector<Value> callValues;
    // Call operands: the reconstructed block's arguments first, then the
    // captured values, mirroring the outlined function's argument order.
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);
    if (callOp)
      *callOp = call;

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    IRMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Index constants are cheap to duplicate: clone them into the outlined
      // function rather than threading them through the block argument.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        IRMapping bvm;
        repl = rewriter.clone(*cst, bvm)->getResult(0);
      }
    }
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
221 
222 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
223  func::FuncOp *thenFn, StringRef thenFnName,
224  func::FuncOp *elseFn, StringRef elseFnName) {
225  IRRewriter rewriter(b);
226  Location loc = ifOp.getLoc();
227  FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
228  if (thenFn && !ifOp.getThenRegion().empty()) {
229  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
230  rewriter, loc, ifOp.getThenRegion(), thenFnName);
231  if (failed(outlinedFuncOpOrFailure))
232  return failure();
233  *thenFn = *outlinedFuncOpOrFailure;
234  }
235  if (elseFn && !ifOp.getElseRegion().empty()) {
236  outlinedFuncOpOrFailure = outlineSingleBlockRegion(
237  rewriter, loc, ifOp.getElseRegion(), elseFnName);
238  if (failed(outlinedFuncOpOrFailure))
239  return failure();
240  *elseFn = *outlinedFuncOpOrFailure;
241  }
242  return success();
243 }
244 
  assert(rootOp != nullptr && "Root operation must not be a nullptr.");
  bool rootEnclosesPloops = false;
  // Depth-first recursion over all nested regions: `result` accumulates only
  // the *innermost* scf.parallel ops found below `rootOp`; the return value
  // tells the caller whether any parallel loop exists under `rootOp` at all.
  for (Region &region : rootOp->getRegions()) {
    for (Block &block : region.getBlocks()) {
      for (Operation &op : block) {
        // Visit children first: does `op` itself enclose parallel loops?
        bool enclosesPloops = getInnermostParallelLoops(&op, result);
        rootEnclosesPloops |= enclosesPloops;
        if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
          rootEnclosesPloops = true;

          // Collect parallel loop if it is an innermost one.
          if (!enclosesPloops)
            result.push_back(ploop);
        }
      }
    }
  }
  return rootEnclosesPloops;
}
266 
267 // Build the IR that performs ceil division of a positive value by a constant:
268 // ceildiv(a, B) = divis(a + (B-1), B)
269 // where divis is rounding-to-zero division.
270 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
271  int64_t divisor) {
272  assert(divisor > 0 && "expected positive divisor");
273  assert(dividend.getType().isIntOrIndex() &&
274  "expected integer or index-typed value");
275 
276  Value divisorMinusOneCst = builder.create<arith::ConstantOp>(
277  loc, builder.getIntegerAttr(dividend.getType(), divisor - 1));
278  Value divisorCst = builder.create<arith::ConstantOp>(
279  loc, builder.getIntegerAttr(dividend.getType(), divisor));
280  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
281  return builder.create<arith::DivUIOp>(loc, sum, divisorCst);
282 }
283 
284 // Build the IR that performs ceil division of a positive value by another
285 // positive value:
286 // ceildiv(a, b) = divis(a + (b - 1), b)
287 // where divis is rounding-to-zero division.
288 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
289  Value divisor) {
290  assert(dividend.getType().isIntOrIndex() &&
291  "expected integer or index-typed value");
292  Value cstOne = builder.create<arith::ConstantOp>(
293  loc, builder.getOneAttr(dividend.getType()));
294  Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
295  Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
296  return builder.create<arith::DivUIOp>(loc, sum, divisor);
297 }
298 
299 /// Returns the trip count of `forOp` if its' low bound, high bound and step are
300 /// constants, or optional otherwise. Trip count is computed as
301 /// ceilDiv(highBound - lowBound, step).
302 static std::optional<int64_t> getConstantTripCount(scf::ForOp forOp) {
303  std::optional<int64_t> lbCstOp = getConstantIntValue(forOp.getLowerBound());
304  std::optional<int64_t> ubCstOp = getConstantIntValue(forOp.getUpperBound());
305  std::optional<int64_t> stepCstOp = getConstantIntValue(forOp.getStep());
306  if (!lbCstOp.has_value() || !ubCstOp.has_value() || !stepCstOp.has_value())
307  return {};
308 
309  // Constant loop bounds computation.
310  int64_t lbCst = lbCstOp.value();
311  int64_t ubCst = ubCstOp.value();
312  int64_t stepCst = stepCstOp.value();
313  assert(lbCst >= 0 && ubCst >= 0 && stepCst > 0 &&
314  "expected positive loop bounds and step");
315  return llvm::divideCeilSigned(ubCst - lbCst, stepCst);
316 }
317 
/// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
/// unrolled iteration using annotateFn. 'iterArgs'/'yieldedValues' are the
/// loop-carried values threaded from one unrolled copy to the next.
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default to a no-op annotation so calls below can be unconditional.
  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    IRMapping operandMap;

    // Prepare operand map: each copy consumes the values yielded by the
    // previous copy in place of the loop's iter_args.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
373 
/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
/// When the trip count is not a multiple of the factor, an epilogue loop is
/// emitted after the unrolled loop to execute the remaining iterations.
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  IRRewriter rewriter(forOp.getContext());
  auto loc = forOp.getLoc();
  Value step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  std::optional<int64_t> constTripCount = getConstantTripCount(forOp);
  if (constTripCount) {
    // Constant loop bounds computation.
    int64_t lbCst = getConstantIntValue(forOp.getLowerBound()).value();
    int64_t ubCst = getConstantIntValue(forOp.getUpperBound()).value();
    int64_t stepCst = getConstantIntValue(forOp.getStep()).value();
    if (unrollFactor == 1) {
      // Unrolling by 1 is a no-op, except that a single-iteration loop can
      // be promoted into its parent.
      if (*constTripCount == 1 &&
          failed(forOp.promoteIfSingleIteration(rewriter)))
        return failure();
      return success();
    }

    int64_t tripCountEvenMultiple =
        *constTripCount - (*constTripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    // An epilogue is needed only when some iterations remain beyond the
    // largest even multiple of the factor.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = boundsBuilder.create<arith::ConstantOp>(
          loc, boundsBuilder.getIntegerAttr(forOp.getUpperBound().getType(),
                                            upperBoundUnrolledCst));
    else
      upperBoundUnrolled = forOp.getUpperBound();

    // Create constant for 'stepUnrolled'.
    stepUnrolled = stepCst == stepUnrolledCst
                       ? step
                       : boundsBuilder.create<arith::ConstantOp>(
                             loc, boundsBuilder.getIntegerAttr(
                                      step.getType(), stepUnrolledCst));
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst = boundsBuilder.create<arith::ConstantOp>(
        loc, boundsBuilder.getIntegerAttr(tripCount.getType(), unrollFactor));
    Value tripCountRem =
        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    // The epilogue is a clone of the original loop that runs the remaining
    // (tripCount % unrollFactor) iterations with the original step.
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();

    for (auto e : llvm::zip(results, epilogueResults)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    }
    // Seed the epilogue's iter_args with the unrolled loop's results.
    epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
                               epilogueForOp.getInitArgs().size(), results);
    (void)epilogueForOp.promoteIfSingleIteration(rewriter);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Clone the body (unrollFactor - 1) extra times, remapping copy `i`'s
  // induction variable to iv + step * i.
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = b.create<arith::MulIOp>(
            loc, step,
            b.create<arith::ConstantOp>(loc,
                                        b.getIntegerAttr(iv.getType(), i)));
        return b.create<arith::AddIOp>(loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)forOp.promoteIfSingleIteration(rewriter);
  return success();
}
494 
495 /// Check if bounds of all inner loops are defined outside of `forOp`
496 /// and return false if not.
497 static bool areInnerBoundsInvariant(scf::ForOp forOp) {
498  auto walkResult = forOp.walk([&](scf::ForOp innerForOp) {
499  if (!forOp.isDefinedOutsideOfLoop(innerForOp.getLowerBound()) ||
500  !forOp.isDefinedOutsideOfLoop(innerForOp.getUpperBound()) ||
501  !forOp.isDefinedOutsideOfLoop(innerForOp.getStep()))
502  return WalkResult::interrupt();
503 
504  return WalkResult::advance();
505  });
506  return !walkResult.wasInterrupted();
507 }
508 
/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp,
                                          uint64_t unrollJamFactor) {
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");

  // A factor of 1 is a no-op.
  if (unrollJamFactor == 1)
    return success();

  // If any control operand of any inner loop of `forOp` is defined within
  // `forOp`, no unroll jam.
  if (!areInnerBoundsInvariant(forOp)) {
    LDBG("failed to unroll and jam: inner bounds are not invariant");
    return failure();
  }

  // Currently, for operations with results are not supported.
  if (forOp->getNumResults() > 0) {
    LDBG("failed to unroll and jam: unsupported loop with results");
    return failure();
  }

  // Currently, only constant trip count that divided by the unroll factor is
  // supported.
  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount.has_value()) {
    // If the trip count is dynamic, do not unroll & jam.
    LDBG("failed to unroll and jam: trip count could not be determined");
    return failure();
  }
  if (unrollJamFactor > *tripCount) {
    LDBG("unroll and jam factor is greater than trip count, set factor to trip "
         "count");
    unrollJamFactor = *tripCount;
  } else if (*tripCount % unrollJamFactor != 0) {
    LDBG("failed to unroll and jam: unsupported trip count that is not a "
         "multiple of unroll jam factor");
    return failure();
  }

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Gather all sub-blocks to jam upon the loop being unrolled.
  jbg.walk(forOp);
  auto &subBlocks = jbg.subBlocks;

  // Collect inner loops.
  SmallVector<scf::ForOp> innerLoops;
  forOp.walk([&](scf::ForOp innerForOp) { innerLoops.push_back(innerForOp); });

  // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
  // iteration. There are (`unrollJamFactor` - 1) iterations.
  SmallVector<IRMapping> operandMaps(unrollJamFactor - 1);

  // For any loop with iter_args, replace it with a new loop that has
  // `unrollJamFactor` copies of its iterOperands, iter_args and yield
  // operands.
  SmallVector<scf::ForOp> newInnerLoops;
  IRRewriter rewriter(forOp.getContext());
  for (scf::ForOp oldForOp : innerLoops) {
    SmallVector<Value> dupIterOperands, dupYieldOperands;
    ValueRange oldIterOperands = oldForOp.getInits();
    ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
    ValueRange oldYieldOperands =
        cast<scf::YieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
    // Get additional iterOperands, iterArgs, and yield operands. We will
    // fix iterOperands and yield operands after cloning of sub-blocks.
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
      dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
    }
    // Create a new loop with additional iterOperands, iter_args and yield
    // operands. This new loop will take the loop body of the original loop.
    bool forOpReplaced = oldForOp == forOp;
    scf::ForOp newForOp =
        cast<scf::ForOp>(*oldForOp.replaceWithAdditionalYields(
            rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
            [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
              return dupYieldOperands;
            }));
    newInnerLoops.push_back(newForOp);
    // `forOp` has been replaced with a new loop.
    if (forOpReplaced)
      forOp = newForOp;
    // Update `operandMaps` for `newForOp` iterArgs and results.
    ValueRange newIterArgs = newForOp.getRegionIterArgs();
    unsigned oldNumIterArgs = oldIterArgs.size();
    ValueRange newResults = newForOp.getResults();
    unsigned oldNumResults = newResults.size() / unrollJamFactor;
    assert(oldNumIterArgs == oldNumResults &&
           "oldNumIterArgs must be the same as oldNumResults");
    for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
      for (unsigned j = 0; j < oldNumIterArgs; ++j) {
        // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
        // results. Update `operandMaps[i - 1]` to map old iterArgs and results
        // to those in the `i`th new set.
        operandMaps[i - 1].map(newIterArgs[j],
                               newIterArgs[i * oldNumIterArgs + j]);
        operandMaps[i - 1].map(newResults[j],
                               newResults[i * oldNumResults + j]);
      }
    }
  }

  // Scale the step of loop being unroll-jammed by the unroll-jam factor.
  rewriter.setInsertionPoint(forOp);
  int64_t step = forOp.getConstantStep()->getSExtValue();
  auto newStep = rewriter.createOrFold<arith::MulIOp>(
      forOp.getLoc(), forOp.getStep(),
      rewriter.createOrFold<arith::ConstantOp>(
          forOp.getLoc(), rewriter.getIndexAttr(unrollJamFactor)));
  forOp.setStep(newStep);
  auto forOpIV = forOp.getInductionVar();

  // Unroll and jam (appends unrollJamFactor - 1 additional copies).
  for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
    for (auto &subBlock : subBlocks) {
      // Builder to insert unroll-jammed bodies. Insert right at the end of
      // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));

      // If the induction variable is used, create a remapping to the value for
      // this unrolled instance.
      if (!forOpIV.use_empty()) {
        // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
        auto ivTag = builder.createOrFold<arith::ConstantOp>(
            forOp.getLoc(), builder.getIndexAttr(step * i));
        auto ivUnroll =
            builder.createOrFold<arith::AddIOp>(forOp.getLoc(), forOpIV, ivTag);
        operandMaps[i - 1].map(forOpIV, ivUnroll);
      }
      // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
    }
    // Fix iterOperands and yield op operands of newly created loops.
    for (auto newForOp : newInnerLoops) {
      unsigned oldNumIterOperands =
          newForOp.getNumRegionIterArgs() / unrollJamFactor;
      unsigned numControlOperands = newForOp.getNumControlOperands();
      auto yieldOp = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
      unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
      assert(oldNumIterOperands == oldNumYieldOperands &&
             "oldNumIterOperands must be the same as oldNumYieldOperands");
      for (unsigned j = 0; j < oldNumIterOperands; ++j) {
        // The `i`th duplication of an old iterOperand or yield op operand
        // needs to be replaced with a mapped value from `operandMaps[i - 1]`
        // if such mapped value exists.
        newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
                            operandMaps[i - 1].lookupOrDefault(
                                newForOp.getOperand(numControlOperands + j)));
        yieldOp.setOperand(
            i * oldNumYieldOperands + j,
            operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
      }
    }
  }

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)forOp.promoteIfSingleIteration(rewriter);
  return success();
}
673 
    OpFoldResult lb, OpFoldResult ub,
    OpFoldResult step) {
  // For non-index types, generate `arith` instructions
  // Check if the loop is already known to have a constant zero lower bound or
  // a constant one step.
  bool isZeroBased = false;
  if (auto lbCst = getConstantIntValue(lb))
    isZeroBased = lbCst.value() == 0;

  bool isStepOne = false;
  if (auto stepCst = getConstantIntValue(step))
    isStepOne = stepCst.value() == 1;

  Type rangeType = getType(lb);
  assert(rangeType == getType(ub) && rangeType == getType(step) &&
         "expected matching types");

  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
  // assuming the step is strictly positive. Update the bounds and the step
  // of the loop to go from 0 to the number of iterations, if necessary.
  if (isZeroBased && isStepOne)
    return {lb, ub, step};

  // diff = ub - lb; skipped when lb is statically zero.
  OpFoldResult diff = ub;
  if (!isZeroBased) {
    diff = rewriter.createOrFold<arith::SubIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, ub),
        getValueOrCreateConstantIntOp(rewriter, loc, lb));
  }
  // newUpperBound = ceildiv(diff, step); skipped when step is statically one.
  OpFoldResult newUpperBound = diff;
  if (!isStepOne) {
    newUpperBound = rewriter.createOrFold<arith::CeilDivSIOp>(
        loc, getValueOrCreateConstantIntOp(rewriter, loc, diff),
        getValueOrCreateConstantIntOp(rewriter, loc, step));
  }

  // The normalized loop runs from 0 to newUpperBound with step 1.
  OpFoldResult newLowerBound = rewriter.getZeroAttr(rangeType);
  OpFoldResult newStep = rewriter.getOneAttr(rangeType);

  return {newLowerBound, newUpperBound, newStep};
}
716 
    Value normalizedIv, OpFoldResult origLb,
    OpFoldResult origStep) {
  Value denormalizedIv;
  bool isStepOne = isConstantIntValue(origStep, 1);
  bool isZeroBased = isConstantIntValue(origLb, 0);

  // Reconstruct iv_orig = normalizedIv * origStep + origLb, skipping factors
  // that are statically no-ops. Ops created here are added to `preserve` so
  // the final replacement below does not rewrite their own operands.
  Value scaled = normalizedIv;
  if (!isStepOne) {
    Value origStepValue =
        getValueOrCreateConstantIntOp(rewriter, loc, origStep);
    scaled = rewriter.create<arith::MulIOp>(loc, normalizedIv, origStepValue);
    preserve.insert(scaled.getDefiningOp());
  }
  denormalizedIv = scaled;
  if (!isZeroBased) {
    Value origLbValue = getValueOrCreateConstantIntOp(rewriter, loc, origLb);
    denormalizedIv = rewriter.create<arith::AddIOp>(loc, scaled, origLbValue);
    preserve.insert(denormalizedIv.getDefiningOp());
  }

  // Replace all remaining uses of the normalized IV with its denormalized
  // equivalent.
  rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
}
741 
/// Helper function to multiply a sequence of values.
    ArrayRef<Value> values) {
  assert(!values.empty() && "unexpected empty list");
  std::optional<Value> productOf;
  // Accumulate the product left-to-right, skipping statically-known unit
  // factors to avoid emitting trivial multiplications.
  for (auto v : values) {
    auto vOne = getConstantIntValue(v);
    if (vOne && vOne.value() == 1)
      continue;
    if (productOf)
      productOf =
          rewriter.create<arith::MulIOp>(loc, productOf.value(), v).getResult();
    else
      productOf = v;
  }
  // Every factor was a known 1: materialize the constant one of the element
  // type as the product.
  if (!productOf) {
    productOf = rewriter
                    .create<arith::ConstantOp>(
                        loc, rewriter.getOneAttr(values.front().getType()))
                    .getResult();
  }
  return productOf.value();
}
765 
/// For each original loop, the value of the
/// induction variable can be obtained by dividing the induction variable of
/// the linearized loop by the total number of iterations of the loops nested
/// in it modulo the number of iterations in this loop (remove the values
/// related to the outer loops):
/// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
/// Compute these iteratively from the innermost loop by creating a "running
/// quotient" of division by the range.
/// Returns the per-loop IVs together with the set of newly created ops that
/// legitimately keep using `linearizedIv`.
static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
    Value linearizedIv, ArrayRef<Value> ubs) {
  SmallVector<Value> delinearizedIvs(ubs.size());
  SmallPtrSet<Operation *, 2> preservedUsers;

  // Record which upper bounds are statically 1: those dimensions contribute a
  // constant-zero IV and need no div/rem arithmetic.
  llvm::BitVector isUbOne(ubs.size());
  for (auto [index, ub] : llvm::enumerate(ubs)) {
    auto ubCst = getConstantIntValue(ub);
    if (ubCst && ubCst.value() == 1)
      isUbOne.set(index);
  }

  // Prune the lead ubs that are all ones.
  unsigned numLeadingOneUbs = 0;
  for (auto [index, ub] : llvm::enumerate(ubs)) {
    if (!isUbOne.test(index)) {
      break;
    }
    delinearizedIvs[index] = rewriter.create<arith::ConstantOp>(
        loc, rewriter.getZeroAttr(ub.getType()));
    numLeadingOneUbs++;
  }

  // Walk the dimensions innermost-first, maintaining the running quotient in
  // `previous`.
  Value previous = linearizedIv;
  for (unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
    unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
    if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
      previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
      preservedUsers.insert(previous.getDefiningOp());
    }
    Value iv = previous;
    if (i != e - 1) {
      if (!isUbOne.test(idx)) {
        iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
        preservedUsers.insert(iv.getDefiningOp());
      } else {
        // Unit-extent dimension: the IV is always zero.
        iv = rewriter.create<arith::ConstantOp>(
            loc, rewriter.getZeroAttr(ubs[idx].getType()));
      }
    }
    delinearizedIvs[idx] = iv;
  }
  return {delinearizedIvs, preservedUsers};
}
819 
820 LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
822  if (loops.size() < 2)
823  return failure();
824 
825  scf::ForOp innermost = loops.back();
826  scf::ForOp outermost = loops.front();
827 
828  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
829  // allows the following code to assume upperBound is the number of iterations.
830  for (auto loop : loops) {
831  OpBuilder::InsertionGuard g(rewriter);
832  rewriter.setInsertionPoint(outermost);
833  Value lb = loop.getLowerBound();
834  Value ub = loop.getUpperBound();
835  Value step = loop.getStep();
836  auto newLoopRange =
837  emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
838 
839  rewriter.modifyOpInPlace(loop, [&]() {
840  loop.setLowerBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
841  newLoopRange.offset));
842  loop.setUpperBound(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
843  newLoopRange.size));
844  loop.setStep(getValueOrCreateConstantIntOp(rewriter, loop.getLoc(),
845  newLoopRange.stride));
846  });
847  rewriter.setInsertionPointToStart(innermost.getBody());
848  denormalizeInductionVariable(rewriter, loop.getLoc(),
849  loop.getInductionVar(), lb, step);
850  }
851 
852  // 2. Emit code computing the upper bound of the coalesced loop as product
853  // of the number of iterations of all loops.
854  OpBuilder::InsertionGuard g(rewriter);
855  rewriter.setInsertionPoint(outermost);
856  Location loc = outermost.getLoc();
857  SmallVector<Value> upperBounds = llvm::map_to_vector(
858  loops, [](auto loop) { return loop.getUpperBound(); });
859  Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
860  outermost.setUpperBound(upperBound);
861 
862  rewriter.setInsertionPointToStart(innermost.getBody());
863  auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
864  rewriter, loc, outermost.getInductionVar(), upperBounds);
865  rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
866  preservedUsers);
867 
868  for (int i = loops.size() - 1; i > 0; --i) {
869  auto outerLoop = loops[i - 1];
870  auto innerLoop = loops[i];
871 
872  Operation *innerTerminator = innerLoop.getBody()->getTerminator();
873  auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
874  assert(llvm::equal(outerLoop.getRegionIterArgs(), innerLoop.getInitArgs()));
875  for (Value &yieldedVal : yieldedVals) {
876  // The yielded value may be an iteration argument of the inner loop
877  // which is about to be inlined.
878  auto iter = llvm::find(innerLoop.getRegionIterArgs(), yieldedVal);
879  if (iter != innerLoop.getRegionIterArgs().end()) {
880  unsigned iterArgIndex = iter - innerLoop.getRegionIterArgs().begin();
881  // `outerLoop` iter args identical to the `innerLoop` init args.
882  assert(iterArgIndex < innerLoop.getInitArgs().size());
883  yieldedVal = innerLoop.getInitArgs()[iterArgIndex];
884  }
885  }
886  rewriter.eraseOp(innerTerminator);
887 
888  SmallVector<Value> innerBlockArgs;
889  innerBlockArgs.push_back(delinearizeIvs[i]);
890  llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
891  rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
892  Block::iterator(innerLoop), innerBlockArgs);
893  rewriter.replaceOp(innerLoop, yieldedVals);
894  }
895  return success();
896 }
897 
899  if (loops.empty()) {
900  return failure();
901  }
902  IRRewriter rewriter(loops.front().getContext());
903  return coalesceLoops(rewriter, loops);
904 }
905 
906 LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
907  LogicalResult result(failure());
909  getPerfectlyNestedLoops(loops, op);
910 
911  // Look for a band of loops that can be coalesced, i.e. perfectly nested
912  // loops with bounds defined above some loop.
913 
914  // 1. For each loop, find above which parent loop its bounds operands are
915  // defined.
916  SmallVector<unsigned> operandsDefinedAbove(loops.size());
917  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
918  operandsDefinedAbove[i] = i;
919  for (unsigned j = 0; j < i; ++j) {
920  SmallVector<Value> boundsOperands = {loops[i].getLowerBound(),
921  loops[i].getUpperBound(),
922  loops[i].getStep()};
923  if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) {
924  operandsDefinedAbove[i] = j;
925  break;
926  }
927  }
928  }
929 
930  // 2. For each inner loop check that the iter_args for the immediately outer
931  // loop are the init for the immediately inner loop and that the yields of the
932  // return of the inner loop is the yield for the immediately outer loop. Keep
933  // track of where the chain starts from for each loop.
934  SmallVector<unsigned> iterArgChainStart(loops.size());
935  iterArgChainStart[0] = 0;
936  for (unsigned i = 1, e = loops.size(); i < e; ++i) {
937  // By default set the start of the chain to itself.
938  iterArgChainStart[i] = i;
939  auto outerloop = loops[i - 1];
940  auto innerLoop = loops[i];
941  if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
942  continue;
943  }
944  if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
945  continue;
946  }
947  auto outerloopTerminator = outerloop.getBody()->getTerminator();
948  if (!llvm::equal(outerloopTerminator->getOperands(),
949  innerLoop.getResults())) {
950  continue;
951  }
952  iterArgChainStart[i] = iterArgChainStart[i - 1];
953  }
954 
955  // 3. Identify bands of loops such that the operands of all of them are
956  // defined above the first loop in the band. Traverse the nest bottom-up
957  // so that modifications don't invalidate the inner loops.
958  for (unsigned end = loops.size(); end > 0; --end) {
959  unsigned start = 0;
960  for (; start < end - 1; ++start) {
961  auto maxPos =
962  *std::max_element(std::next(operandsDefinedAbove.begin(), start),
963  std::next(operandsDefinedAbove.begin(), end));
964  if (maxPos > start)
965  continue;
966  if (iterArgChainStart[end - 1] > start)
967  continue;
968  auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
969  if (succeeded(coalesceLoops(band)))
970  result = success();
971  break;
972  }
973  // If a band was found and transformed, keep looking at the loops above
974  // the outermost transformed loop.
975  if (start != end - 1)
976  end = start + 1;
977  }
978  return result;
979 }
980 
982  RewriterBase &rewriter, scf::ParallelOp loops,
983  ArrayRef<std::vector<unsigned>> combinedDimensions) {
984  OpBuilder::InsertionGuard g(rewriter);
985  rewriter.setInsertionPoint(loops);
986  Location loc = loops.getLoc();
987 
988  // Presort combined dimensions.
989  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
990  for (auto &dims : sortedDimensions)
991  llvm::sort(dims);
992 
993  // Normalize ParallelOp's iteration pattern.
994  SmallVector<Value, 3> normalizedUpperBounds;
995  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
996  OpBuilder::InsertionGuard g2(rewriter);
997  rewriter.setInsertionPoint(loops);
998  Value lb = loops.getLowerBound()[i];
999  Value ub = loops.getUpperBound()[i];
1000  Value step = loops.getStep()[i];
1001  auto newLoopRange = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
1002  normalizedUpperBounds.push_back(getValueOrCreateConstantIntOp(
1003  rewriter, loops.getLoc(), newLoopRange.size));
1004 
1005  rewriter.setInsertionPointToStart(loops.getBody());
1006  denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
1007  step);
1008  }
1009 
1010  // Combine iteration spaces.
1011  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
1012  auto cst0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
1013  auto cst1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
1014  for (auto &sortedDimension : sortedDimensions) {
1015  Value newUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, 1);
1016  for (auto idx : sortedDimension) {
1017  newUpperBound = rewriter.create<arith::MulIOp>(
1018  loc, newUpperBound, normalizedUpperBounds[idx]);
1019  }
1020  lowerBounds.push_back(cst0);
1021  steps.push_back(cst1);
1022  upperBounds.push_back(newUpperBound);
1023  }
1024 
1025  // Create new ParallelLoop with conversions to the original induction values.
1026  // The loop below uses divisions to get the relevant range of values in the
1027  // new induction value that represent each range of the original induction
1028  // value. The remainders then determine based on that range, which iteration
1029  // of the original induction value this represents. This is a normalized value
1030  // that is un-normalized already by the previous logic.
1031  auto newPloop = rewriter.create<scf::ParallelOp>(
1032  loc, lowerBounds, upperBounds, steps,
1033  [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
1034  for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
1035  Value previous = ploopIVs[i];
1036  unsigned numberCombinedDimensions = combinedDimensions[i].size();
1037  // Iterate over all except the last induction value.
1038  for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
1039  unsigned idx = combinedDimensions[i][j];
1040 
1041  // Determine the current induction value's current loop iteration
1042  Value iv = insideBuilder.create<arith::RemSIOp>(
1043  loc, previous, normalizedUpperBounds[idx]);
1044  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
1045  loops.getRegion());
1046 
1047  // Remove the effect of the current induction value to prepare for
1048  // the next value.
1049  previous = insideBuilder.create<arith::DivSIOp>(
1050  loc, previous, normalizedUpperBounds[idx]);
1051  }
1052 
1053  // The final induction value is just the remaining value.
1054  unsigned idx = combinedDimensions[i][0];
1055  replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
1056  previous, loops.getRegion());
1057  }
1058  });
1059 
1060  // Replace the old loop with the new loop.
1061  loops.getBody()->back().erase();
1062  newPloop.getBody()->getOperations().splice(
1063  Block::iterator(newPloop.getBody()->back()),
1064  loops.getBody()->getOperations());
1065  loops.erase();
1066 }
1067 
1068 // Hoist the ops within `outer` that appear before `inner`.
1069 // Such ops include the ops that have been introduced by parametric tiling.
1070 // Ops that come from triangular loops (i.e. that belong to the program slice
1071 // rooted at `outer`) and ops that have side effects cannot be hoisted.
1072 // Return failure when any op fails to hoist.
1073 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
1074  SetVector<Operation *> forwardSlice;
1076  options.filter = [&inner](Operation *op) {
1077  return op != inner.getOperation();
1078  };
1079  getForwardSlice(outer.getInductionVar(), &forwardSlice, options);
1080  LogicalResult status = success();
1082  for (auto &op : outer.getBody()->without_terminator()) {
1083  // Stop when encountering the inner loop.
1084  if (&op == inner.getOperation())
1085  break;
1086  // Skip over non-hoistable ops.
1087  if (forwardSlice.count(&op) > 0) {
1088  status = failure();
1089  continue;
1090  }
1091  // Skip intermediate scf::ForOp, these are not considered a failure.
1092  if (isa<scf::ForOp>(op))
1093  continue;
1094  // Skip other ops with regions.
1095  if (op.getNumRegions() > 0) {
1096  status = failure();
1097  continue;
1098  }
1099  // Skip if op has side effects.
1100  // TODO: loads to immutable memory regions are ok.
1101  if (!isMemoryEffectFree(&op)) {
1102  status = failure();
1103  continue;
1104  }
1105  toHoist.push_back(&op);
1106  }
1107  auto *outerForOp = outer.getOperation();
1108  for (auto *op : toHoist)
1109  op->moveBefore(outerForOp);
1110  return status;
1111 }
1112 
1113 // Traverse the interTile and intraTile loops and try to hoist ops such that
1114 // bands of perfectly nested loops are isolated.
1115 // Return failure if either perfect interTile or perfect intraTile bands cannot
1116 // be formed.
1117 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
1118  LogicalResult status = success();
1119  const Loops &interTile = tileLoops.first;
1120  const Loops &intraTile = tileLoops.second;
1121  auto size = interTile.size();
1122  assert(size == intraTile.size());
1123  if (size <= 1)
1124  return success();
1125  for (unsigned s = 1; s < size; ++s)
1126  status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
1127  : failure();
1128  for (unsigned s = 1; s < size; ++s)
1129  status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
1130  : failure();
1131  return status;
1132 }
1133 
1134 /// Collect perfectly nested loops starting from `rootForOps`. Loops are
1135 /// perfectly nested if each loop is the first and only non-terminator operation
1136 /// in the parent loop. Collect at most `maxLoops` loops and append them to
1137 /// `forOps`.
1138 template <typename T>
1140  SmallVectorImpl<T> &forOps, T rootForOp,
1141  unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
1142  for (unsigned i = 0; i < maxLoops; ++i) {
1143  forOps.push_back(rootForOp);
1144  Block &body = rootForOp.getRegion().front();
1145  if (body.begin() != std::prev(body.end(), 2))
1146  return;
1147 
1148  rootForOp = dyn_cast<T>(&body.front());
1149  if (!rootForOp)
1150  return;
1151  }
1152 }
1153 
1154 static Loops stripmineSink(scf::ForOp forOp, Value factor,
1155  ArrayRef<scf::ForOp> targets) {
1156  auto originalStep = forOp.getStep();
1157  auto iv = forOp.getInductionVar();
1158 
1159  OpBuilder b(forOp);
1160  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));
1161 
1162  Loops innerLoops;
1163  for (auto t : targets) {
1164  // Save information for splicing ops out of t when done
1165  auto begin = t.getBody()->begin();
1166  auto nOps = t.getBody()->getOperations().size();
1167 
1168  // Insert newForOp before the terminator of `t`.
1169  auto b = OpBuilder::atBlockTerminator((t.getBody()));
1170  Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
1171  Value ub =
1172  b.create<arith::MinSIOp>(t.getLoc(), forOp.getUpperBound(), stepped);
1173 
1174  // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
1175  auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
1176  newForOp.getBody()->getOperations().splice(
1177  newForOp.getBody()->getOperations().begin(),
1178  t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
1179  replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1180  newForOp.getRegion());
1181 
1182  innerLoops.push_back(newForOp);
1183  }
1184 
1185  return innerLoops;
1186 }
1187 
1188 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1189 // Returns the new for operation, nested immediately under `target`.
1190 template <typename SizeType>
1191 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
1192  scf::ForOp target) {
1193  // TODO: Use cheap structural assertions that targets are nested under
1194  // forOp and that targets are not nested under each other when DominanceInfo
1195  // exposes the capability. It seems overkill to construct a whole function
1196  // dominance tree at this point.
1197  auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
1198  assert(res.size() == 1 && "Expected 1 inner forOp");
1199  return res[0];
1200 }
1201 
1203  ArrayRef<Value> sizes,
1204  ArrayRef<scf::ForOp> targets) {
1206  SmallVector<scf::ForOp, 8> currentTargets(targets);
1207  for (auto it : llvm::zip(forOps, sizes)) {
1208  auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1209  res.push_back(step);
1210  currentTargets = step;
1211  }
1212  return res;
1213 }
1214 
1216  scf::ForOp target) {
1218  for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
1219  assert(loops.size() == 1);
1220  res.push_back(loops[0]);
1221  }
1222  return res;
1223 }
1224 
1225 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
1226  // Collect perfectly nested loops. If more size values provided than nested
1227  // loops available, truncate `sizes`.
1229  forOps.reserve(sizes.size());
1230  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1231  if (forOps.size() < sizes.size())
1232  sizes = sizes.take_front(forOps.size());
1233 
1234  return ::tile(forOps, sizes, forOps.back());
1235 }
1236 
1238  scf::ForOp root) {
1239  getPerfectlyNestedLoopsImpl(nestedLoops, root);
1240 }
1241 
1243  ArrayRef<int64_t> sizes) {
1244  // Collect perfectly nested loops. If more size values provided than nested
1245  // loops available, truncate `sizes`.
1247  forOps.reserve(sizes.size());
1248  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1249  if (forOps.size() < sizes.size())
1250  sizes = sizes.take_front(forOps.size());
1251 
1252  // Compute the tile sizes such that i-th outer loop executes size[i]
1253  // iterations. Given that the loop current executes
1254  // numIterations = ceildiv((upperBound - lowerBound), step)
1255  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
1256  SmallVector<Value, 4> tileSizes;
1257  tileSizes.reserve(sizes.size());
1258  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
1259  assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
1260 
1261  auto forOp = forOps[i];
1262  OpBuilder builder(forOp);
1263  auto loc = forOp.getLoc();
1264  Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
1265  forOp.getLowerBound());
1266  Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
1267  Value iterationsPerBlock =
1268  ceilDivPositive(builder, loc, numIterations, sizes[i]);
1269  tileSizes.push_back(iterationsPerBlock);
1270  }
1271 
1272  // Call parametric tiling with the given sizes.
1273  auto intraTile = tile(forOps, tileSizes, forOps.back());
1274  TileLoops tileLoops = std::make_pair(forOps, intraTile);
1275 
1276  // TODO: for now we just ignore the result of band isolation.
1277  // In the future, mapping decisions may be impacted by the ability to
1278  // isolate perfectly nested bands.
1279  (void)tryIsolateBands(tileLoops);
1280 
1281  return tileLoops;
1282 }
1283 
1284 scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
1285  scf::ForallOp source,
1286  RewriterBase &rewriter) {
1287  unsigned numTargetOuts = target.getNumResults();
1288  unsigned numSourceOuts = source.getNumResults();
1289 
1290  // Create fused shared_outs.
1291  SmallVector<Value> fusedOuts;
1292  llvm::append_range(fusedOuts, target.getOutputs());
1293  llvm::append_range(fusedOuts, source.getOutputs());
1294 
1295  // Create a new scf.forall op after the source loop.
1296  rewriter.setInsertionPointAfter(source);
1297  scf::ForallOp fusedLoop = rewriter.create<scf::ForallOp>(
1298  source.getLoc(), source.getMixedLowerBound(), source.getMixedUpperBound(),
1299  source.getMixedStep(), fusedOuts, source.getMapping());
1300 
1301  // Map control operands.
1302  IRMapping mapping;
1303  mapping.map(target.getInductionVars(), fusedLoop.getInductionVars());
1304  mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
1305 
1306  // Map shared outs.
1307  mapping.map(target.getRegionIterArgs(),
1308  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1309  mapping.map(source.getRegionIterArgs(),
1310  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1311 
1312  // Append everything except the terminator into the fused operation.
1313  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1314  for (Operation &op : target.getBody()->without_terminator())
1315  rewriter.clone(op, mapping);
1316  for (Operation &op : source.getBody()->without_terminator())
1317  rewriter.clone(op, mapping);
1318 
1319  // Fuse the old terminator in_parallel ops into the new one.
1320  scf::InParallelOp targetTerm = target.getTerminator();
1321  scf::InParallelOp sourceTerm = source.getTerminator();
1322  scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
1323  rewriter.setInsertionPointToStart(fusedTerm.getBody());
1324  for (Operation &op : targetTerm.getYieldingOps())
1325  rewriter.clone(op, mapping);
1326  for (Operation &op : sourceTerm.getYieldingOps())
1327  rewriter.clone(op, mapping);
1328 
1329  // Replace old loops by substituting their uses by results of the fused loop.
1330  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1331  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1332 
1333  return fusedLoop;
1334 }
1335 
1336 scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
1337  scf::ForOp source,
1338  RewriterBase &rewriter) {
1339  unsigned numTargetOuts = target.getNumResults();
1340  unsigned numSourceOuts = source.getNumResults();
1341 
1342  // Create fused init_args, with target's init_args before source's init_args.
1343  SmallVector<Value> fusedInitArgs;
1344  llvm::append_range(fusedInitArgs, target.getInitArgs());
1345  llvm::append_range(fusedInitArgs, source.getInitArgs());
1346 
1347  // Create a new scf.for op after the source loop (with scf.yield terminator
1348  // (without arguments) only in case its init_args is empty).
1349  rewriter.setInsertionPointAfter(source);
1350  scf::ForOp fusedLoop = rewriter.create<scf::ForOp>(
1351  source.getLoc(), source.getLowerBound(), source.getUpperBound(),
1352  source.getStep(), fusedInitArgs);
1353 
1354  // Map original induction variables and operands to those of the fused loop.
1355  IRMapping mapping;
1356  mapping.map(target.getInductionVar(), fusedLoop.getInductionVar());
1357  mapping.map(target.getRegionIterArgs(),
1358  fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
1359  mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
1360  mapping.map(source.getRegionIterArgs(),
1361  fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
1362 
1363  // Merge target's body into the new (fused) for loop and then source's body.
1364  rewriter.setInsertionPointToStart(fusedLoop.getBody());
1365  for (Operation &op : target.getBody()->without_terminator())
1366  rewriter.clone(op, mapping);
1367  for (Operation &op : source.getBody()->without_terminator())
1368  rewriter.clone(op, mapping);
1369 
1370  // Build fused yield results by appropriately mapping original yield operands.
1371  SmallVector<Value> yieldResults;
1372  for (Value operand : target.getBody()->getTerminator()->getOperands())
1373  yieldResults.push_back(mapping.lookupOrDefault(operand));
1374  for (Value operand : source.getBody()->getTerminator()->getOperands())
1375  yieldResults.push_back(mapping.lookupOrDefault(operand));
1376  if (!yieldResults.empty())
1377  rewriter.create<scf::YieldOp>(source.getLoc(), yieldResults);
1378 
1379  // Replace old loops by substituting their uses by results of the fused loop.
1380  rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
1381  rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
1382 
1383  return fusedLoop;
1384 }
1385 
1386 FailureOr<scf::ForallOp> mlir::normalizeForallOp(RewriterBase &rewriter,
1387  scf::ForallOp forallOp) {
1388  SmallVector<OpFoldResult> lbs = forallOp.getMixedLowerBound();
1389  SmallVector<OpFoldResult> ubs = forallOp.getMixedUpperBound();
1390  SmallVector<OpFoldResult> steps = forallOp.getMixedStep();
1391 
1392  if (llvm::all_of(
1393  lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
1394  llvm::all_of(
1395  steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
1396  return forallOp;
1397  }
1398 
1399  SmallVector<OpFoldResult> newLbs, newUbs, newSteps;
1400  for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
1401  Range normalizedLoopParams =
1402  emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step);
1403  newLbs.push_back(normalizedLoopParams.offset);
1404  newUbs.push_back(normalizedLoopParams.size);
1405  newSteps.push_back(normalizedLoopParams.stride);
1406  }
1407 
1408  auto normalizedForallOp = rewriter.create<scf::ForallOp>(
1409  forallOp.getLoc(), newLbs, newUbs, newSteps, forallOp.getOutputs(),
1410  forallOp.getMapping(), [](OpBuilder &, Location, ValueRange) {});
1411 
1412  rewriter.inlineRegionBefore(forallOp.getBodyRegion(),
1413  normalizedForallOp.getBodyRegion(),
1414  normalizedForallOp.getBodyRegion().begin());
1415 
1416  rewriter.replaceAllOpUsesWith(forallOp, normalizedForallOp);
1417  return success();
1418 }
static std::optional< int64_t > getConstantTripCount(scf::ForOp forOp)
Returns the trip count of forOp if its lower bound, upper bound, and step are constants,...
Definition: Utils.cpp:302
static LogicalResult tryIsolateBands(const TileLoops &tileLoops)
Definition: Utils.cpp:1117
static void getPerfectlyNestedLoopsImpl(SmallVectorImpl< T > &forOps, T rootForOp, unsigned maxLoops=std::numeric_limits< unsigned >::max())
Collect perfectly nested loops starting from rootForOps.
Definition: Utils.cpp:1139
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner)
Definition: Utils.cpp:1073
static void generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, function_ref< Value(unsigned, Value, OpBuilder)> ivRemapFn, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn, ValueRange iterArgs, ValueRange yieldedValues)
Generates unrolled copies of scf::ForOp 'loopBodyBlock', with associated 'forOpIV' by 'unrollFactor',...
Definition: Utils.cpp:322
static Loops stripmineSink(scf::ForOp forOp, Value factor, ArrayRef< scf::ForOp > targets)
Definition: Utils.cpp:1154
static std::pair< SmallVector< Value >, SmallPtrSet< Operation *, 2 > > delinearizeInductionVariable(RewriterBase &rewriter, Location loc, Value linearizedIv, ArrayRef< Value > ubs)
For each original loop, the value of the induction variable can be obtained by dividing the induction...
Definition: Utils.cpp:775
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, int64_t divisor)
Definition: Utils.cpp:270
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, ArrayRef< Value > values)
Helper function to multiply a sequence of values.
Definition: Utils.cpp:743
#define LDBG(X)
Definition: Utils.cpp:37
static bool areInnerBoundsInvariant(scf::ForOp forOp)
Check if bounds of all inner loops are defined outside of forOp and return false if not.
Definition: Utils.cpp:497
static llvm::ManagedStatic< PassManagerOptions > options
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
This class represents an argument of a Block.
Definition: Value.h:319
Block represents an ordered list of Operations.
Definition: Block.h:31
OpListType::iterator iterator
Definition: Block.h:138
unsigned getNumArguments()
Definition: Block.h:126
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:243
BlockArgListType getArguments()
Definition: Block.h:85
Operation & front()
Definition: Block.h:151
iterator end()
Definition: Block.h:142
iterator begin()
Definition: Block.h:141
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:136
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition: Builders.cpp:250
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:343
MLIRContext * getContext() const
Definition: Builders.h:55
TypedAttr getOneAttr(Type type)
Definition: Builders.cpp:361
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
auto lookup(T from) const
Lookup a mapped value within the map.
Definition: IRMapping.h:72
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
Definition: IRMapping.h:30
This class coordinates rewriting a piece of IR outside of a pattern rewrite, providing a way to keep ...
Definition: PatternMatch.h:766
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:353
This class helps build Operations.
Definition: Builders.h:212
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:567
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:436
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:403
static OpBuilder atBlockTerminator(Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the block terminator.
Definition: Builders.h:257
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:441
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes=std::nullopt, ArrayRef< Location > locs=std::nullopt)
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:449
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:525
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:476
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:417
This class represents a single result from folding an operation.
Definition: OpDefinition.h:268
This class represents an operand of an operation.
Definition: Value.h:267
This is a value defined by a result of an operation.
Definition: Value.h:457
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:717
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
unsigned getNumRegions()
Returns the number of regions held by this operation.
Definition: Operation.h:669
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
Definition: Operation.h:672
operand_type_range getOperandTypes()
Definition: Operation.h:392
result_type_range getResultTypes()
Definition: Operation.h:423
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:373
void moveBefore(Operation *existingOp)
Unlink this operation from its current block and insert it right before existingOp which may be in th...
Definition: Operation.cpp:555
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:539
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
BlockArgListType getArguments()
Definition: Region.h:81
iterator begin()
Definition: Region.h:55
Block & front()
Definition: Region.h:65
ParentT getParentOfType()
Find the first parent operation of the given type, or nullptr if there is no ancestor operation.
Definition: Region.h:205
bool hasOneBlock()
Return true if this region has exactly one block.
Definition: Region.h:68
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
void mergeBlocks(Block *source, Block *dest, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into the end of block 'dest'.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void replaceAllUsesExcept(Value from, Value to, Operation *exceptedUser)
Find uses of from and replace them with to except if the user is exceptedUser.
Definition: PatternMatch.h:702
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:630
void inlineRegionBefore(Region &region, Region &parent, Region::iterator before)
Move the blocks that belong to "region" before the given position in another region "parent".
virtual void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues=std::nullopt)
Inline the operations of block 'source' into block 'dest' before the given position.
void replaceAllOpUsesWith(Operation *from, ValueRange to)
Find uses of from and replace them with to.
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
bool isIntOrIndex() const
Return true if this is an integer (of any signedness) or an index type.
Definition: Types.cpp:118
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
bool use_empty() const
Returns true if this value has no uses.
Definition: Value.h:218
void replaceUsesWithIf(Value newValue, function_ref< bool(OpOperand &)> shouldReplace)
Replace all uses of 'this' value with 'newValue' if the given callback returns true.
Definition: Value.cpp:81
Type getType() const
Return the type of this value.
Definition: Value.h:129
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
static WalkResult advance()
Definition: Visitors.h:51
static WalkResult interrupt()
Definition: Visitors.h:50
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:93
Operation * getOwner() const
Return the owner of this operand.
Definition: UseDefLists.h:38
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1583
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
Include the generated interface declarations.
void getPerfectlyNestedLoops(SmallVectorImpl< scf::ForOp > &nestedLoops, scf::ForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: Utils.cpp:1237
LogicalResult loopUnrollByFactor(scf::ForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr)
Unrolls this for operation by the specified unroll factor.
Definition: Utils.cpp:375
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
LogicalResult outlineIfOp(RewriterBase &b, scf::IfOp ifOp, func::FuncOp *thenFn, StringRef thenFnName, func::FuncOp *elseFn, StringRef elseFnName)
Outline the then and/or else regions of ifOp as follows:
Definition: Utils.cpp:222
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:32
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(RewriterBase &rewriter, MutableArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
Definition: Utils.cpp:39
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition: Utils.cpp:305
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op)
Walk an scf.for to find a band to coalesce.
Definition: Utils.cpp:906
std::pair< Loops, Loops > TileLoops
Definition: Utils.h:145
Value getValueOrCreateConstantIntOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:103
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops, ArrayRef< std::vector< unsigned >> combinedDimensions)
Take the ParallelLoop and for each set of dimension indices, combine them into a single dimension.
Definition: Utils.cpp:981
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBbArgs)> NewYieldValuesFn
A function that returns the additional yielded values during replaceWithAdditionalYields.
Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef< Value > sizes)
Tile a nest of scf::ForOp loops rooted at rootForOp with the given (parametric) sizes.
Definition: Utils.cpp:1225
LogicalResult loopUnrollJamByFactor(scf::ForOp forOp, uint64_t unrollFactor)
Unrolls and jams this scf.for operation by the specified unroll factor.
Definition: Utils.cpp:510
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
Definition: Utils.cpp:245
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition: RegionUtils.cpp:67
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: Utils.cpp:1202
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
FailureOr< func::FuncOp > outlineSingleBlockRegion(RewriterBase &rewriter, Location loc, Region &region, StringRef funcName, func::CallOp *callOp=nullptr)
Outline a region with a single block into a new FuncOp.
Definition: Utils.cpp:118
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
Definition: RegionUtils.h:24
void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, Value normalizedIv, OpFoldResult origLb, OpFoldResult origStep)
Get back the original induction variable values after loop normalization.
Definition: Utils.cpp:717
scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForallOp source, RewriterBase &rewriter)
Given two scf.forall loops, target and source, fuses target into source.
Definition: Utils.cpp:1284
LogicalResult coalesceLoops(MutableArrayRef< scf::ForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: Utils.cpp:898
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter)
Given two scf.for loops, target and source, fuses target into source.
Definition: Utils.cpp:1336
TileLoops extractFixedOuterLoops(scf::ForOp rootFOrOp, ArrayRef< int64_t > sizes)
Definition: Utils.cpp:1242
Range emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, OpFoldResult lb, OpFoldResult ub, OpFoldResult step)
Materialize bounds and step of a zero-based and unit-step loop derived by normalizing the specified b...
Definition: Utils.cpp:674
FailureOr< scf::ForallOp > normalizeForallOp(RewriterBase &rewriter, scf::ForallOp forallOp)
Normalize an scf.forall operation.
Definition: Utils.cpp:1386
void getForwardSlice(Operation *op, SetVector< Operation * > *forwardSlice, const ForwardSliceOptions &options={})
Fills forwardSlice with the computed forward slice (i.e.
void walk(Operation *op)
SmallVector< std::pair< Block::iterator, Block::iterator > > subBlocks
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
OpFoldResult stride
OpFoldResult size
OpFoldResult offset
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.