//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"

#include "mlir/Analysis/Presburger/IntegerRelation.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Transforms/Transforms.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;
using namespace mlir::linalg::detail;

#ifndef NDEBUG
static bool debugPrintLoopInShortForm(Operation *op) {
  AsmState state(op->getParentOfType<func::FuncOp>());
  (void)state;
  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
    forOp.getInductionVar().printAsOperand(dbgs(), state);
    dbgs() << " @ " << forOp.getOperation();
    return true;
  }
  return false;
}
#endif

static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
                                   [](Operation *op) {
                                     dbgs() << "\n";
                                     DBGS() << "----";
                                     if (debugPrintLoopInShortForm(op)) {
                                       dbgs() << "\n";
                                       return;
                                     }
                                     dbgs() << *op << "\n";
                                   });
             DBGS() << "\n";);
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(DBGS() << "loops: ";
               debugPrintLoopInShortForm(outermostEnclosingForOp);
               dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}
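
// For instance (illustrative, hypothetical IR): given
//   scf.for %i {
//     scf.parallel ... {
//       scf.for %j { ... tensor.pad ... }
//     }
//   }
// and nLevels >= 2, only the loop on %j is collected: the walk stops at the
// first parent that is not an scf.for (here the scf.parallel).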

/// Return the immediately enclosing scf::ForOp loops from `padOp` up to and
/// including `untilLoop`.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
                       SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (outermostEnclosingForOp != untilLoop &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(DBGS() << "loops: ";
               debugPrintLoopInShortForm(outermostEnclosingForOp);
               dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

// Get all the ops in the backward slice starting from `padOp` that are
// dominated by the outermost enclosing loop.
// This also requires tracking ops defining values used in the region but
// defined above.
static void computeBackwardSlice(tensor::PadOp padOp,
                                 scf::ForOp outermostEnclosingForOp,
                                 SetVector<Operation *> &backwardSlice) {
  DominanceInfo domInfo(outermostEnclosingForOp);
  BackwardSliceOptions sliceOptions;
  sliceOptions.filter = [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op) &&
           !padOp->isProperAncestor(op);
  };
  sliceOptions.inclusive = true;

  // First, add the ops required to compute the region to the backwardSlice.
  SetVector<Value> valuesDefinedAbove;
  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
                            valuesDefinedAbove);
  for (Value v : valuesDefinedAbove) {
    getBackwardSlice(v, &backwardSlice, sliceOptions);
  }
  // Then, add the backward slice from padOp itself.
  getBackwardSlice(padOp.getOperation(), &backwardSlice, sliceOptions);
}
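
// Illustrative sketch (hypothetical IR): given
//   scf.for %i {
//     %sz = affine.min #map(%i)
//     %slice = tensor.extract_slice %t[%i] [%sz] [1]
//     %padded = tensor.pad %slice ... { tensor.yield %cst }
//   }
// the backward slice rooted at %padded contains the affine.min, the
// extract_slice, and the enclosing scf.for, plus any ops defining values used
// inside the pad region that are themselves dominated by the outermost loop.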

//===----------------------------------------------------------------------===//
// HoistPaddingAnalysis Implementation.
//===----------------------------------------------------------------------===//

namespace {
/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non-index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of the pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistPaddingAnalysis {
  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);

  bool isValid() { return valid.has_value() && valid.value(); }
  bool isInvalid() { return valid.has_value() && !valid.value(); }

  /// Footprint of the hoistedPackedTensor, computed from the packingLoops.
  SmallVector<Value> getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                 Location loc) const;

  /// Performs optional hoisting to enable hoist padding to occur. This may be
  /// necessary when `sliceOp` is not defined outside of the outermost
  /// enclosing loop we want to hoist above.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// // %source is available for packing here!
  /// scf.for %i
  ///   scf.for %j
  ///     scf.for %k
  ///       %slice = tensor.extract_slice %source [%i, %j]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  void enableHoistPadding(RewriterBase &rewriter);

  /// Common analysis builder to finalize the construction of the analysis
  /// once optional `enableHoistPadding` has run.
  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
  void finalizeHoistPaddingAnalysis();

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  std::optional<bool> valid;

  /// The padOp to hoist.
  tensor::PadOp opToHoist;

  /// Immediately enclosing loops considered for hoisting padding.
  SmallVector<scf::ForOp> reverseEnclosingLoops;

  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is
  /// used, directly or indirectly, to index the padded tensor. The method
  /// returns failure if the filtered backward slice contains an unexpected
  /// operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                             // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies();

public:
  /// The outermost loop, determined by `nLevels` above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variables are used, directly or indirectly, in the
  ///      computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
  tensor::ExtractSliceOp sliceOp;

  /// If non-null, this is the unique scf::ForOp that consumes the `sliceOp`.
  scf::ForOp padConsumingForOp;
};

} // namespace

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
    : valid(std::nullopt), opToHoist(padOp) {
  // Get at most `numLoops` of immediately enclosing loops.
  getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  outermostEnclosingForOp = reverseEnclosingLoops.back();
  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
    valid = false;
    return;
  }
}

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
                                           scf::ForOp outermostEnclosingForOp)
    : valid(std::nullopt), opToHoist(padOp) {
  // Get enclosing loops until outermostEnclosingForOp.
  getEnclosingLoopsUntil(opToHoist, outermostEnclosingForOp,
                         reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  this->outermostEnclosingForOp = reverseEnclosingLoops.back();
  if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
    LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
    valid = false;
    return;
  }
}

void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
  if (isInvalid())
    return;
  // If the padded data is not yet available before entering the outermost
  // enclosing loop, try to apply hoisting on this outermost loop.
  // TODO: we may want finer-grained hoisting of only that particular
  // `sliceOp`.
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
    outermostEnclosingForOp =
        hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
  }
}

void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
  if (isInvalid())
    return;

  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
    LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
                      << outermostEnclosingForOp << "\n"
                      << "--sliceOp: " << sliceOp << "\n"
                      << "--sliceOp.getSource(): " << sliceOp.getSource()
                      << "\n");
    LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
    valid = false;
    return;
  }
  if (sliceOp->hasOneUse()) {
    padConsumingForOp = dyn_cast<scf::ForOp>(*(sliceOp->getUsers().begin()));
  }

  // Check that the region of `padOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = opToHoist.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
    valid = false;
    return;
  }

  computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
  if (backwardSlice.size() <= 1) {
    valid = false;
    return;
  }

  debugPrintBackwardSlice(backwardSlice);
  // Remove all ops in the backward slice that are not used to index
  // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
  // the loop and affine operations used for the index computation.
  if (failed(dropNonIndexDependencies())) {
    LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
    valid = false;
    return;
  }
  debugPrintBackwardSlice(backwardSlice);

  // Add only the loops part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded
  // data and consequently access the same data in every loop
  // iteration. Adding them to the packing loops would increase the
  // cache footprint of the packed data by storing the same data
  // multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);

  // TODO: for multiple loops we need to track the use to the innermost loop.
  if (packingLoops.size() > 1 && padConsumingForOp) {
    LLVM_DEBUG(DBGS() << "--Cannot hoist multiple loops through iter_args -> "
                         "Downgrade to 1 loop\n");
    packingLoops.resize(1);
  }

  // Note: at this point, packing loops may be empty but we would still like
  // to hoist the padding if so specified.

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand
  // is an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                             // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `opToHoist` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == opToHoist || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result
    // is used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check that all operands of the remaining operations have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non-index type operands: "
                          << op << " -> Skip\n");
        return failure();
      }
      // Check that the remaining operations have no regions or memory
      // effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << op << " -> Skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception are constant operations that may be used by `opToHoist`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                  Location loc) const {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
        rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(),
        /*stopCondition=*/
        [&](Value v, std::optional<int64_t> d) {
          if (v == forOp.getUpperBound())
            return false;
          // Compute a bound that is independent of any affine op results.
          Operation *op = v.getDefiningOp();
          if (!op)
            return true;
          return !isa<affine::AffineMinOp, affine::AffineMaxOp,
                      affine::AffineApplyOp>(op);
        },
        /*closedUB=*/true);
    assert(succeeded(loopUb) && "could not get upper bound");
    Value ubVal = getValueOrCreateConstantIndexOp(rewriter, loc, *loopUb);

    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(rewriter.getContext(), lb, ub);
    bindSymbols(rewriter.getContext(), step);
    Value res = rewriter.createOrFold<affine::AffineApplyOp>(
        loc, (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal, forOp.getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}
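
// For instance (illustrative), for a packing loop
//   scf.for %i = %lb to %ub step %s
// this emits the equivalent of
//   affine.apply affine_map<(d0, d1)[s0] -> ((d1 - d0) ceildiv s0)>
//     (%lb, %ubVal)[%s]
// where %ubVal is the reified, loop-independent upper bound of %ub.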

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

//===----------------------------------------------------------------------===//
// buildPackingLoopNest Implementation.
//===----------------------------------------------------------------------===//

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return rewriter.createOrFold<affine::AffineApplyOp>(
      loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
}
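
// For instance (illustrative), for `scf.for %iv = %c3 to %n step %c2` with
// %c3 and %c2 defined above `outer`, this materializes the equivalent of
//   affine.apply affine_map<(d0, d1)[s0] -> ((d0 - d1) ceildiv s0)>
//     (%iv, %c3)[%c2]
// which evaluates to 0 when %iv == 3, to 1 when %iv == 5, and so on.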

// Build a packing loop nest by iteratively traversing the backward slice and
// cloning the operations, stepping into the loops that we encounter along the
// way.
// The implementation proceeds in a stack-like fashion:
//   1. Iteratively clone and step into the loops, pushing the
//      `hoistedPackedTensor` deeper in the stack.
//   2. At the innermost loop level, create a GenericOp if `transposeVector`
//      is non-empty.
//   3. At the innermost loop level, create an InsertSliceOp.
//   4. Iteratively pop and yield the result of the InsertSliceOp across the
//      cloned loops.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
    tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
  SmallVector<OpFoldResult> offsets, sizes, strides;
  SmallVector<Value> clonedLoopIvs, leadingHoistedPackedTensorIndexings;

  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Step 0. Populate bvm with opToHoist.getSource if relevant.
  BlockArgument bbArg = dyn_cast<BlockArgument>(opToHoist.getSource());
  while (bbArg) {
    auto forOp = dyn_cast<scf::ForOp>(bbArg.getOwner()->getParentOp());
    if (!forOp)
      break;
    if (forOp != outerLoop && !outerLoop->isAncestor(forOp))
      break;
    OpOperand &operand = forOp.getOpOperandForRegionIterArg(bbArg);
    bvm.map(bbArg, operand.get());
    bbArg = dyn_cast<BlockArgument>(operand.get());
  }

  // Step 1. Iteratively clone loops and push `hoistedPackedTensor`.
  Value hoistedPackedTensor = emptyOp.getResult();
  OpBuilder::InsertionGuard g(rewriter);
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(hoistedPackedTensor) case:
    // this is the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
      if (bvm.lookupOrDefault(sliceOp.getSource()) == hoistedPackedTensor) {
        LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
        continue;
      }
    }

    // Clone all operations except loops which require special handling.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      // We are at the right insertion point within the loop nest.
      rewriter.clone(*op, bvm);
      continue;
    }

    // Create a packing loop that takes `hoistedPackedTensor` as iteration
    // argument.
    auto clonedForOp = rewriter.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), hoistedPackedTensor);

    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    // Do not insert guard here, we get deeper into the loop nest.
    rewriter.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(rewriter, outerLoop, clonedForOp);

    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingHoistedPackedTensorIndexings.push_back(
        loopIndependentIterationCount);
    hoistedPackedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Step 2. Construct offsets, sizes and strides for the innermost level of
  // the packing loop.
  int64_t nPackedLoops = clonedLoopIvs.size();
  // offsets = [clonedLoopIvs, 0 .. 0].
  offsets =
      SmallVector<OpFoldResult>{leadingHoistedPackedTensorIndexings.begin(),
                                leadingHoistedPackedTensorIndexings.end()};
  offsets.append(paddedRank, rewriter.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(1));
  for (int64_t sz : transposedTensorType.getShape()) {
    // TODO: go grab dims when needed; atm tensor::PadOp yields a static
    // tensor.
    if (ShapedType::isDynamic(sz))
      return failure();
    sizes.push_back(rewriter.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
                                      rewriter.getIndexAttr(1));

  // Step 3. Optionally transpose the padded tensor.
  GenericOp maybeTransposeOp;
  Value paddedTensor = bvm.lookup(opToHoist.getResult());
  if (!transposeVector.empty()) {
    Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
        loc, transposedTensorType, hoistedPackedTensor, offsets, sizes,
        strides);
    maybeTransposeOp = makeTransposeOp(rewriter, loc, paddedTensor,
                                       outputTensor, transposeVector);
    paddedTensor = maybeTransposeOp.getResult(0);
  }

  // Innermost tensor.insert_slice and yields are optional / need loops.
  if (nPackedLoops > 0) {
    // Step 4. Create InsertSliceOp at the innermost loop level, inserting an
    // optionally transposed padded slice into the packed tensor.
    Value inserted = rewriter.create<tensor::InsertSliceOp>(
        loc, paddedTensor, hoistedPackedTensor, offsets, sizes, strides);

    // Step 5. Iteratively pop the stack and propagate the yield.
    Value valueToYield = inserted;
    for (Value iv : llvm::reverse(clonedLoopIvs)) {
      auto forOp = scf::getForInductionVarOwner(iv);
      rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
      rewriter.create<scf::YieldOp>(loc, valueToYield);
      valueToYield = forOp.getResult(0);
    }
  }

  return PackingResult{
      offsets,
      sizes,
      strides,
      clonedLoopIvs,
      leadingHoistedPackedTensorIndexings,
      maybeTransposeOp,
      cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp())};
}

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();
  LLVM_DEBUG(DBGS() << "\n";
             DBGS() << "Func:\n"
                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      tensor::computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType)) {
    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
    return failure();
  }

  // Create the packed tensor<?x?x..? x transposedShape>.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
  // TODO: go grab dims when needed; atm tensor::PadOp yields a static tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto hoistedPackedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());

  // Set the insertion point right before the outer loop and start packing.
  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(outerLoop);
  SmallVector<Value> dynamicTensorSizes =
      analysis.getHoistedPackedTensorSizes(rewriter, loc);
  auto emptyOp = rewriter.create<tensor::EmptyOp>(
      loc, hoistedPackedTensorType.getShape(),
      hoistedPackedTensorType.getElementType(), dynamicTensorSizes);

  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
                                  *transposedTensorType, emptyOp, analysis);
}

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
FailureOr<PackingResult> mlir::linalg::buildPackingLoopNest(
    RewriterBase &rewriter, tensor::PadOp opToHoist,
    scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
  HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
  analysis.enableHoistPadding(rewriter);
  analysis.finalizeHoistPaddingAnalysis();
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
    return failure();
  }
  IRMapping bvm;
  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
                                  analysis);
}

//===----------------------------------------------------------------------===//
// hoistPaddingOnTensors Implementation.
//===----------------------------------------------------------------------===//

/// Return true if we can walk back the use-def chain from `extractSliceOp` to
/// `expectedSource`, going through DestinationStyleOpInterface inits only.
/// This is a poor man's analysis that is sufficient to check that the
/// extractSliceOp matches the tensor.pad we want to hoist.
/// In the future, it will be easier to ensure this with a matching symmetric
/// tensor.unpad op.
static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
                                      Value expectedSource) {
  LLVM_DEBUG(DBGS() << "Start tracesBackToExpectedValue on: " << extractSliceOp
                    << "\n");
  LLVM_DEBUG(DBGS() << "--with extractSlice: " << extractSliceOp << "\n");
  Value source = extractSliceOp.getSource();
  LLVM_DEBUG(DBGS() << "--with starting source: " << source << "\n");
  while (source && source != expectedSource) {
    auto destOp =
        dyn_cast_or_null<DestinationStyleOpInterface>(source.getDefiningOp());
    if (!destOp)
      break;
    LLVM_DEBUG(DBGS() << "--step dest op: " << destOp << "\n");
    source = destOp.getDpsInitOperand(cast<OpResult>(source).getResultNumber())
                 ->get();
  }
  LLVM_DEBUG(DBGS() << "--final source: " << source << "\n");
  LLVM_DEBUG(DBGS() << "--expected source: " << expectedSource << "\n");
  return source == expectedSource;
}
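
// For instance (illustrative, hypothetical IR): with
//   %r = linalg.generic ... outs(%expected : tensor<...>)
//   %s = tensor.extract_slice %r
// the walk steps from %r to %expected through the generic's tied DPS init,
// so tracesBackToExpectedValue(%s, %expected) returns true.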

/// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
/// iter arg), propagate the `hoistedPackedTensor` value through the same iter
/// arg.
/// TODO: for multiple loops we need to track the use to the innermost loop.
///
/// Match:
/// ```
///   %outerSliceOp = tensor.extract_slice ..
///   %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
///     %hoistedPackedTensor = tensor.pad %arg0
///     %1 = compute %hoistedPackedTensor
///     %2 = tensor.extract_slice %1
///     scf.yield %2
///   }
/// ```
///
/// and rewrite as:
/// ```
///   %outerSliceOp = tensor.extract_slice ..
///   %hoistedPackedTensor = tensor.pad %outerSliceOp
///   %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
///     %1 = compute %arg0
///     scf.yield %1
///   }
///   %2 = tensor.extract_slice %f
/// ```
///
/// Return null when no rewrite happened.
static tensor::ExtractSliceOp
padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
                      Value hoistedPackedTensor,
                      tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) {
  LLVM_DEBUG(DBGS() << "Start padThroughLoopIterArg on: " << forOp << "\n");
  LLVM_DEBUG(DBGS() << "--paddedValueBeforeHoisting: "
                    << paddedValueBeforeHoisting << "\n");
  OpOperand *pUse = nullptr;
  for (OpOperand &use : outerSliceOp->getUses()) {
    if (use.getOwner() == forOp) {
      assert(!pUse && "Multiple slice uses in the for loop");
      pUse = &use;
    }
  }
  assert(pUse && "No slice use in the for loop");
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp());

  unsigned iterArgNumber = forOp.getResultForOpOperand(*pUse).getResultNumber();
  auto yieldOp = cast<scf::YieldOp>(forOp.getBody(0)->getTerminator());
  auto yieldingExtractSliceOp = yieldOp->getOperand(iterArgNumber)
                                    .getDefiningOp<tensor::ExtractSliceOp>();
  if (!yieldingExtractSliceOp)
    return tensor::ExtractSliceOp();

  // Poor man's analysis sufficient to ensure extractSlice matches tensor.pad.
  // In the future, it will be easier to ensure this with a matching symmetric
  // tensor.unpad op.
  if (!tracesBackToExpectedValue(yieldingExtractSliceOp,
                                 paddedValueBeforeHoisting))
    return tensor::ExtractSliceOp();

  SmallVector<Value> initArgs = forOp.getInitArgs();
  initArgs[iterArgNumber] = hoistedPackedTensor;
  SmallVector<Value> yieldOperands = yieldOp.getOperands();
  yieldOperands[iterArgNumber] = yieldingExtractSliceOp.getSource();

  int64_t numOriginalForOpResults = initArgs.size();
  LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults
                    << "\n");
  tensor::ExtractSliceOp extracted;
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointAfter(forOp);
    extracted = rewriter.create<tensor::ExtractSliceOp>(
        hoistedPackedTensor.getLoc(), hoistedPackedTensor,
        outerSliceOp.getMixedOffsets(), outerSliceOp.getMixedSizes(),
        outerSliceOp.getMixedStrides());
    rewriter.replaceAllUsesWith(forOp.getResult(iterArgNumber), extracted);
  }
  scf::ForOp newForOp =
      replaceLoopWithNewYields(rewriter, forOp, initArgs, yieldOperands);

  LLVM_DEBUG(DBGS() << "newForOp results: " << newForOp.getNumResults()
                    << "\n");
  LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n");
  LLVM_DEBUG(DBGS() << "with result #"
                    << numOriginalForOpResults + iterArgNumber
                    << " of forOp, giving us: " << extracted << "\n");
  rewriter.startRootUpdate(extracted);
  extracted.getSourceMutable().assign(
      newForOp.getResult(numOriginalForOpResults + iterArgNumber));
  rewriter.finalizeRootUpdate(extracted);

  LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting
                    << "\n");
  LLVM_DEBUG(DBGS() << "with region iter arg #"
                    << numOriginalForOpResults + iterArgNumber << "\n");
  rewriter.replaceAllUsesWith(
      paddedValueBeforeHoisting,
      newForOp.getRegionIterArg(numOriginalForOpResults + iterArgNumber));

  return extracted;
}

/// Produce a tensor extracted from the packingResult. This can be used as a
/// replacement for `opToHoist` in callers.
static Value replaceByPackingResult(RewriterBase &rewriter,
                                    const IRMapping &bvm,
                                    tensor::PadOp opToHoist,
                                    RankedTensorType transposedTensorType,
                                    const HoistPaddingAnalysis &analysis,
                                    const PackingResult &packingResult) {
  // The replacement occurs under a single insertion point within the original
  // loop, just before opToHoist.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(opToHoist);

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
  LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");

  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
  ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;

  Value hoistedPackedTensor;
  SmallVector<Value> loopIterationCounts;
  SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
                                    rewriter.getIndexAttr(0));
  if (nPackedLoops > 0) {
    loopIterationCounts =
        llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
          return buildLoopIterationCount(rewriter, outerLoop,
                                         cast<scf::ForOp>(loop));
        }));
    // Assert all loop iteration counts can be computed.
    if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
      llvm_unreachable("loop independence prerequisite not met");

    // offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
    std::copy(loopIterationCounts.begin(), loopIterationCounts.end(),
              offsets.begin());
    hoistedPackedTensor =
        scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
            ->getResult(0);
  } else {
    // If no loops were created, this is just hoisting without packing.
    hoistedPackedTensor = bvm.lookup(opToHoist.getResult());
  }

  LLVM_DEBUG(DBGS() << "hoistedPackedTensor: " << hoistedPackedTensor << "\n");

  // If the consumer of `padOp` was a `forOp`, propagate through iter args.
  scf::ForOp forOp = analysis.padConsumingForOp;
  if (forOp) {
    return padThroughLoopIterArg(rewriter, opToHoist, hoistedPackedTensor,
                                 analysis.sliceOp, forOp);
  }

  // offsets = [maybe_leading_ivs, 0 .. 0].
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  return rewriter.create<tensor::ExtractSliceOp>(
      loc, transposedTensorType, hoistedPackedTensor, offsets,
      packingResult.sizes, packingResult.strides);
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
    SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
             DBGS() << " by " << numLoops << " loops\n");

  HoistPaddingAnalysis analysis(opToHoist, numLoops);
  analysis.enableHoistPadding(rewriter);
  analysis.finalizeHoistPaddingAnalysis();
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
    return failure();
  }

  // Construct the packing loop nest.
  IRMapping bvm;
  FailureOr<PackingResult> packingResult = buildPackingLoopNestImpl(
      rewriter, bvm, opToHoist, transposeVector, analysis);
  if (failed(packingResult)) {
    LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
    return failure();
  }

  if (!transposeVector.empty())
    transposeOps.push_back(packingResult->maybeTransposeOp);

  FailureOr<RankedTensorType> transposedTensorType =
      tensor::computeTransposedType(opToHoist.getResultType(), transposeVector);
  assert(succeeded(transposedTensorType) && "unexpected failure in type");

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  Value newResult =
      replaceByPackingResult(rewriter, bvm, opToHoist, *transposedTensorType,
                             analysis, *packingResult);

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  if (!transposeVector.empty()) {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointAfter(newResult.getDefiningOp());
    // Transpose the packed tensor back to the original storage order.
    Value emptyTensor = rewriter.create<tensor::EmptyOp>(
        loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
    GenericOp unTransposeOp =
        makeTransposeOp(rewriter, loc, newResult, emptyTensor, transposeVector);
    newResult = unTransposeOp.getResult(0);
    transposeOps.push_back(unTransposeOp);
  }

  LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
  LLVM_DEBUG(
      DBGS() << "After hoisting: "
             << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
             << "\n");

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = packingResult->hoistedPadOp;

  LLVM_DEBUG(DBGS() << "--SUCCESS\n");
  return newResult;
}

FailureOr<Value>
mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
                                    ArrayRef<int64_t> transposeVector,
                                    tensor::PadOp &hoistedOp,
                                    SmallVectorImpl<GenericOp> &transposeOps) {
  IRRewriter rewriter(opToHoist.getContext());
  return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
                               hoistedOp, transposeOps);
}
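
// A minimal caller-side sketch (hypothetical, not part of this file): hoist a
// pad above up to two enclosing loops, without transposition. The caller is
// expected to replace the uses of `padOp` with the returned value.
//
//   IRRewriter rewriter(padOp.getContext());
//   tensor::PadOp hoistedOp;
//   SmallVector<GenericOp> transposeOps;
//   FailureOr<Value> replacement = linalg::hoistPaddingOnTensors(
//       rewriter, padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp,
//       transposeOps);
//   if (succeeded(replacement))
//     rewriter.replaceOp(padOp, *replacement);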