HoistPadding.cpp
1 //===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions concerned with hoisting padding operations.
10 //
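// A rough before/after sketch of the transformation (illustrative IR only;
// ops, names and shapes below are hypothetical, not from a real test case).
// A tensor.pad that is re-created on every iteration of a loop nest:
//
//   scf.for %i {
//     %s = tensor.extract_slice %t [%i] [4] [1]
//     %p = tensor.pad %s ...        // padded anew on every iteration
//     ... use %p ...
//   }
//
// is replaced by a packing loop nest hoisted above the loops, which stores
// all padded slices into one larger tensor, plus an extract_slice inside the
// original loop:
//
//   %packed = scf.for %i iter_args(%arg = %empty) {
//     %s = tensor.extract_slice %t [%i] [4] [1]
//     %p = tensor.pad %s ...
//     %ins = tensor.insert_slice %p into %arg ...
//     scf.yield %ins
//   }
//   scf.for %i {
//     %p = tensor.extract_slice %packed ...  // no tensor.pad in the hot loop
//     ... use %p ...
//   }
//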
11 //===----------------------------------------------------------------------===//
12 
23 #include "mlir/IR/AsmState.h"
24 #include "mlir/IR/Dominance.h"
25 #include "mlir/IR/Matchers.h"
29 #include "llvm/Support/Debug.h"
30 
31 using llvm::dbgs;
32 
33 #define DEBUG_TYPE "hoist-padding"
34 
35 #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
36 
37 using namespace mlir;
38 using namespace mlir::linalg;
39 using namespace mlir::linalg::detail;
40 
41 #ifndef NDEBUG
42 static bool debugPrintLoopInShortForm(Operation *op) {
43  AsmState state(op->getParentOfType<func::FuncOp>());
44  (void)state;
45  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
46  forOp.getInductionVar().printAsOperand(dbgs(), state);
47  dbgs() << " @ " << forOp.getOperation();
48  return true;
49  }
50  return false;
51 }
52 #endif
53 
54 static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
55  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
56  [](Operation *op) {
57  dbgs() << "\n";
58  DBGS() << "----";
59  if (debugPrintLoopInShortForm(op)) {
60  dbgs() << "\n";
61  return;
62  }
63  dbgs() << *op << "\n";
64  });
65  DBGS() << "\n";);
66 }
67 
68 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
69 /// Stops at the first parent that is not an scf::ForOp.
70 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
71 /// Control-flow and other containing ops with regions are not modeled atm.
72 static void
73 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
74  SmallVector<scf::ForOp> &reverseEnclosingLoops) {
75  scf::ForOp outermostEnclosingForOp = nullptr;
76  Operation *nextEnclosingOp = padOp->getParentOp();
77  while (nLevels-- > 0 &&
78  (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
79  LLVM_DEBUG(DBGS() << "loops: ";
80  debugPrintLoopInShortForm(outermostEnclosingForOp);
81  dbgs() << "\n");
82  reverseEnclosingLoops.push_back(outermostEnclosingForOp);
83  nextEnclosingOp = outermostEnclosingForOp->getParentOp();
84  }
85 }
86 
87 /// Return the immediately enclosing scf::ForOp loops, up to and including
88 /// `untilLoop`. Stops at the first parent that is not an scf::ForOp.
89 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
90 /// Control-flow and other containing ops with regions are not modeled atm.
91 static void
92 getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
93  SmallVector<scf::ForOp> &reverseEnclosingLoops) {
94  scf::ForOp outermostEnclosingForOp = nullptr;
95  Operation *nextEnclosingOp = padOp->getParentOp();
96  while (outermostEnclosingForOp != untilLoop &&
97  (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
98  LLVM_DEBUG(DBGS() << "loops: ";
99  debugPrintLoopInShortForm(outermostEnclosingForOp);
100  dbgs() << "\n");
101  reverseEnclosingLoops.push_back(outermostEnclosingForOp);
102  nextEnclosingOp = outermostEnclosingForOp->getParentOp();
103  }
104 }
105 
106 // Get all the ops in the backward slice starting from `padOp` that are
107 // dominated by the outermost enclosing loop.
108 // This also requires tracking ops defining values used in the region but
109 // defined above.
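// For example, the pad's body region typically just yields its padding value
// (e.g. `tensor.yield %cst`); %cst is defined above the region, so its
// defining op must be part of the slice in order to be cloned with the rest.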
110 static void computeBackwardSlice(tensor::PadOp padOp,
111  scf::ForOp outermostEnclosingForOp,
112  SetVector<Operation *> &backwardSlice) {
113  DominanceInfo domInfo(outermostEnclosingForOp);
114  BackwardSliceOptions sliceOptions;
115  sliceOptions.filter = [&](Operation *op) {
116  return domInfo.dominates(outermostEnclosingForOp, op) &&
117  !padOp->isProperAncestor(op);
118  };
119  sliceOptions.inclusive = true;
120 
121  // First, add the ops required to compute the region to the backwardSlice.
122  SetVector<Value> valuesDefinedAbove;
123  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
124  valuesDefinedAbove);
125  for (Value v : valuesDefinedAbove) {
126  LogicalResult result = getBackwardSlice(v, &backwardSlice, sliceOptions);
127  assert(result.succeeded() && "expected a backward slice");
128  (void)result;
129  }
130  // Then, add the backward slice from padOp itself.
131  LogicalResult result =
132  getBackwardSlice(padOp.getOperation(), &backwardSlice, sliceOptions);
133  assert(result.succeeded() && "expected a backward slice");
134  (void)result;
135 }
136 
137 //===----------------------------------------------------------------------===//
138 // HoistPaddingAnalysis Implementation.
139 //===----------------------------------------------------------------------===//
140 
141 namespace {
142 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
143 /// loops. The failure conditions are:
144 /// 1. Pad op has a use that is not an input of a LinalgOp.
145 /// 2. Pad op does not have a constant padding value.
146 /// 3. There is no immediately enclosing scf::ForOp.
147 /// 4. The backward slice from the pad op to the scf::ForOp to hoist above
148 /// contains an unknown op with non-index type operands, a region, or a
149 /// memory effect.
150 /// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
151 /// empty.
152 /// 6. The source tensor of pad op is not defined by an extract slice op.
153 /// 7. The source tensor of the extract slice op is not defined outside of
154 /// the outermost enclosing scf::ForOp.
155 /// 8. There is no enclosing scf::ForOp that indexes the padded data.
156 /// Other cases succeed and will trigger hoisting of the pad op.
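///
/// For instance (illustrative IR, hypothetical names), condition 2 rejects a
/// pad whose padding value is not produced by an arith.constant:
///
///   scf.for %i {
///     %v = memref.load %m[%i] : memref<?xf32>  // non-constant padding value
///     %s = tensor.extract_slice %t [%i] [4] [1]
///     %p = tensor.pad %s ... { tensor.yield %v : f32 }
///     ...
///   }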
157 struct HoistPaddingAnalysis {
158  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
159  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);
160 
161  bool isValid() { return valid.has_value() && valid.value(); }
162  bool isInvalid() { return valid.has_value() && !valid.value(); }
163 
164  /// Footprint of the hoistedPackedTensor, computed from the packingLoops.
165  SmallVector<Value> getHoistedPackedTensorSizes(RewriterBase &rewriter,
166  Location loc) const;
167 
168  /// Performs optional hoisting to enable hoist padding to occur. This may be
169  /// necessary when `sliceOp` is not defined outside of the outermost enclosing
170  /// loop we want to hoist above.
171  ///
172  /// Example:
173  /// ```
 174  ///   %source = linalg.fill(%cst, %arg0)
 175  ///   // %source is available for packing here!
 176  ///   scf.for %i
 177  ///     scf.for %j
 178  ///       scf.for %k
 179  ///         %slice = tensor.extract_slice %source [%i, %j]
 180  ///         %padded_slice = tensor.pad %slice
181  /// ```
182  void enableHoistPadding(RewriterBase &rewriter);
183 
184  /// Common analysis builder to finalize the construction of the analysis once
185  /// optional `enableHoistPadding` has run.
186  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
187  void finalizeHoistPaddingAnalysis();
188 
189 private:
190  /// Encodes whether the analysis is valid and hoisting can proceed.
191  std::optional<bool> valid;
192 
193  /// The padOp to hoist.
194  tensor::PadOp opToHoist;
195 
196  /// Immediately enclosing loops considered for hoisting padding.
197  SmallVector<scf::ForOp> reverseEnclosingLoops;
198 
199  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
200  /// `backwardSlice`. The method follows the use-def chains of the index
201  /// operands consumed by `padOp` and `sliceOp` and drops the operations
202  /// not part of this index computation. Afterwards, the filtered
203  /// `backwardSlice` contains only the loops whose induction variable is
204  /// used, directly or indirectly, to index the padded tensor. The method
205  /// returns failure if the filtered backward slice contains an unexpected
206  /// operation.
207  ///
208  /// Example:
209  /// ```
 210  ///   %source = linalg.fill(%cst, %arg0)
 211  ///   scf.for %i
 212  ///     %unrelated = linalg.fill(%cst, %arg1)  // not used to index %source!
 213  ///     scf.for %j (%arg2 = %unrelated)
 214  ///       scf.for %k                           // not used to index %source!
 215  ///         %ubi = affine.min #map(%i)
 216  ///         %ubj = affine.min #map(%j)
 217  ///         %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
 218  ///         %padded_slice = tensor.pad %slice
220  /// ```
221  /// dropNonIndexDependencies(%padded_slice, %slice)
222  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
223  LogicalResult dropNonIndexDependencies();
224 
225 public:
226  /// The outermost loop, determined by `nLevels` above which `padOp` will
227  /// be hoisted.
228  scf::ForOp outermostEnclosingForOp;
229 
230  /// Backward slice rooted at `padOp` and nested under
231  /// `outermostEnclosingForOp`.
232  SetVector<Operation *> backwardSlice;
233 
 234  /// The scf::ForOp loops enclosing `padOp` such that:
 235  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
 236  ///  2. their induction variables are used, directly or indirectly, in the
 237  ///     computation of `padOp`.
238  /// The span of these loops determines the footprint of the packed tensor.
239  SmallVector<scf::ForOp> packingLoops;
240 
241  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
242  tensor::ExtractSliceOp sliceOp;
243 
 244  /// If non-null, this is the unique scf::ForOp that consumes the `sliceOp`.
245  scf::ForOp padConsumingForOp;
246 };
247 
248 } // namespace
249 
250 HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
251  : valid(std::nullopt), opToHoist(padOp) {
252  // Get at most `numLoops` of immediately enclosing loops.
253  getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
254  if (reverseEnclosingLoops.empty()) {
255  LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
256  valid = false;
257  return;
258  }
259  outermostEnclosingForOp = reverseEnclosingLoops.back();
260  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
261  if (!sliceOp) {
262  LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
263  valid = false;
264  return;
265  }
266 }
267 
268 HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
269  scf::ForOp outermostEnclosingForOp)
270  : valid(std::nullopt), opToHoist(padOp) {
271  // Get enclosing loops until outermostEnclosingForOp.
272  getEnclosingLoopsUntil(opToHoist, outermostEnclosingForOp,
273  reverseEnclosingLoops);
274  if (reverseEnclosingLoops.empty()) {
275  LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
276  valid = false;
277  return;
278  }
279  this->outermostEnclosingForOp = reverseEnclosingLoops.back();
280  if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
281  LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
282  valid = false;
283  return;
284  }
285  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
286  if (!sliceOp) {
287  LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
288  valid = false;
289  return;
290  }
291 }
292 
293 void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
294  if (isInvalid())
295  return;
296  // If the padded data is not yet available before entering the outermost
297  // enclosing loop, try to apply hoisting on this outermost loop.
298  // TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
299  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
300  outermostEnclosingForOp = cast<scf::ForOp>(
301  hoistLoopInvariantSubsets(rewriter, outermostEnclosingForOp));
302  }
303 }
304 
305 void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
306  if (isInvalid())
307  return;
308 
309  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
310  LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
311  << outermostEnclosingForOp << "\n"
312  << "--sliceOp: " << sliceOp << "\n"
313  << "--sliceOp.getSource(): " << sliceOp.getSource()
314  << "\n");
315  LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
316  valid = false;
317  return;
318  }
319  if (sliceOp->hasOneUse()) {
320  padConsumingForOp = dyn_cast<scf::ForOp>(*(sliceOp->getUsers().begin()));
321  }
322 
 323  // Check that the region of `padOp` depends only on a constant. Adding
 324  // hoisting support for arbitrary padding regions would require cloning all
325  // dependencies captured by the padding region.
326  Value paddingValue = opToHoist.getConstantPaddingValue();
327  if (!paddingValue ||
328  !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
329  LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
330  valid = false;
331  return;
332  }
333 
334  computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
335  if (backwardSlice.size() <= 1) {
336  valid = false;
337  return;
338  }
339 
340  debugPrintBackwardSlice(backwardSlice);
341  // Remove all ops in the backward slice that are not used to index
342  // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
343  // the loop and affine operations used for the index computation.
344  if (failed(dropNonIndexDependencies())) {
345  LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
346  valid = false;
347  return;
348  }
349  debugPrintBackwardSlice(backwardSlice);
350 
351  // Add only the loops part of the filtered `backwardSlice` to the
352  // packing loops. All other loops are not used to index the padded
353  // data and consequently access the same data in every loop
354  // iteration. Adding them to the packing loops would increase the
355  // cache footprint of the packed data by storing the same data
356  // multiple times.
357  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
358  if (backwardSlice.contains(forOp))
359  packingLoops.push_back(forOp);
360 
361  // TODO: for multiple loops we need to track the use to the innermost loop.
362  if (packingLoops.size() > 1 && padConsumingForOp) {
363  LLVM_DEBUG(DBGS() << "--Cannot hoist multiple loops through iter_args -> "
364  "Downgrade to 1 loop\n");
365  packingLoops.resize(1);
366  }
367 
368  // Note: at this point, packing loops may be empty but we would still like
369  // to hoist the padding if so specified.
370 
371  // The analysis is valid and hoisting can occur.
372  valid = true;
373 }
374 
375 LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() {
376  // Set of all values used for index computation.
377  SetVector<Value> indexEdges;
378 
379  // Add all index operands of `operation` to `indexEdges`. An index operand
380  // is an operand of type index.
381  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
382  for (Value operand : operation->getOperands())
383  if (operand.getType().isIndex())
384  indexEdges.insert(operand);
385  };
386 
387  // Check if any operation result is contained in `indexEdges`.
388  auto hasIndexResult = [&](Operation *operation) {
389  return llvm::any_of(operation->getResults(), [&](Value result) {
390  return indexEdges.contains(result);
391  });
392  };
393 
394  // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
395  // type in `backwardSlice`. Add the index operands of an operation to
396  // `indexEdges` and remove all operations from `backwardSlice` that are not
397  // part of the index computation.
398  //
399  // Example:
400  // ```
 401  //   %source = linalg.fill(%cst, %arg0)
 402  //   scf.for %i
 403  //     %unrelated = linalg.fill(%cst, %arg1) // not used to index %source!
 404  //     scf.for %j (%arg2 = %unrelated)
 405  //       scf.for %k // not used to index %source!
 406  //         %ubi = affine.min #map(%i)
 407  //         %ubj = affine.min #map(%j)
 408  //         %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
 409  //         %padded_slice = tensor.pad %slice
410  // ```
411  // After iterating `backwardSlice` we obtain:
412  // indexEdges = [%i, %j, %ubi, %ubj]
413  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
414  SetVector<Operation *> operationsToRemove;
415  for (Operation *op : llvm::reverse(backwardSlice)) {
416  // Add the index operands of `opToHoist` and `sliceOp` to start the
417  // exploration of the index computation.
418  if (op == opToHoist || op == sliceOp) {
419  addIndexOperandsToIndexEdges(op);
420  continue;
421  }
422  // Add the index operands of the loop if its induction variable is
423  // used for index computation.
424  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
425  if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
426  addIndexOperandsToIndexEdges(op);
427  continue;
428  }
429  }
430  // Add the index operands of all other operations if at least one result
431  // is used for index computation.
432  if (hasIndexResult(op)) {
433  addIndexOperandsToIndexEdges(op);
434  // Check the operands of the remaining operations all have index type.
435  if (llvm::any_of(op->getOperandTypes(),
436  [](Type type) { return !type.isIndex(); })) {
437  LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
438  << op << " -> Skip\n");
439  return failure();
440  }
441  // Check the remaining operations do not have regions or memory effects.
442  auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
443  bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
444  if (hasMemoryEffect || op->getNumRegions() != 0) {
445  LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
446  << op << " -> Skip\n");
447  return failure();
448  }
449  continue;
450  }
451  // Remove all other operations not used by the index computation. An
452  // exception are constant operations that may be used by `opToHoist`.
453  if (!isa<arith::ConstantOp>(op))
454  operationsToRemove.insert(op);
455  }
456  backwardSlice.set_subtract(operationsToRemove);
457  return success();
458 }
459 
460 SmallVector<Value>
461 HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
462  Location loc) const {
463  SmallVector<Value> dynamicTensorSizes;
464 
465  // Upper bound the packing loop lengths to size the packed tensor. Taking
466  // upper bounds can make the sizes of the packed tensor independent of the
467  // enclosing loops. This independence is a prerequisite for reusing the same
468  // buffer for all enclosing loop iterations and hoisting its allocation out
469  // of the enclosing loops.
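 // Illustrative example (hypothetical bounds): for a packing loop
 //   scf.for %i = %c0 to %ub step %c32, with %ub = affine.min(%c512, ...),
 // the reified upper bound is 512, so the packed size along this loop becomes
 // ceildiv(512 - 0, 32) = 16, a value that no longer depends on the enclosing
 // iterations and can be computed before the loop nest.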
470  for (auto forOp : packingLoops) {
471  // Compute an upper bound `ubVal` for the upper bound of `forOp`.
472  FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
473  rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(),
474  /*stopCondition=*/
475  [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
476  if (v == forOp.getUpperBound())
477  return false;
478  // Compute a bound that is independent of any affine op results.
479  Operation *op = v.getDefiningOp();
480  if (!op)
481  return true;
482  return !isa<affine::AffineMinOp, affine::AffineMaxOp,
483  affine::AffineApplyOp>(op);
484  },
485  /*closedUB=*/true);
486  assert(succeeded(loopUb) && "could not get upper bound");
487  Value ubVal = getValueOrCreateConstantIndexOp(rewriter, loc, *loopUb);
488 
489  // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
490  // store the result to `dynamicTensorSizes`.
491  // TODO: instead of using the lower bound of `forOp` directly, implement a
492  // lower bound computation similar to the upper bound computation.
493  AffineExpr lb, ub, step;
494  bindDims(rewriter.getContext(), lb, ub);
495  bindSymbols(rewriter.getContext(), step);
496  Value res = rewriter.createOrFold<affine::AffineApplyOp>(
497  loc, (ub - lb).ceilDiv(step),
498  ValueRange{forOp.getLowerBound(), ubVal,
499  cast<scf::ForOp>(forOp).getStep()});
500  dynamicTensorSizes.push_back(res);
501  }
502 
503  return dynamicTensorSizes;
504 }
505 
506 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
507  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
508 }
509 
510 //===----------------------------------------------------------------------===//
511 // buildPackingLoopNest Implementation.
512 //===----------------------------------------------------------------------===//
513 
514 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
515 /// The returned Value is guaranteed not to depend on any loop contained in
516 /// [`outer`, `forOp`].
517 /// Return null if such a loop-independent quantity cannot be computed.
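/// For example (hypothetical values): with lb = 0, step = 32 and iv = 96, the
/// current iteration number is ceildiv(96 - 0, 32) = 3. Such values are used
/// to index the leading dimensions of the hoisted packed tensor.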
518 static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
519  scf::ForOp forOp) {
520  MLIRContext *ctx = forOp->getContext();
521  AffineExpr iv, lb, step;
522  bindDims(ctx, iv, lb);
523  bindSymbols(ctx, step);
524  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
525  !isDefinedOutsideOrConstant(outer, forOp.getStep()))
526  return Value();
527  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
528  stepVal = forOp.getStep();
529  auto loc = forOp->getLoc();
530  return rewriter.createOrFold<affine::AffineApplyOp>(
531  loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
532 }
533 
534 // Build a packing loop nest by iteratively traversing the backward slice,
535 // cloning the operations and stepping into the loops that we encounter.
536 // The implementation proceeds in a stack-like fashion:
537 //   1. Iteratively clone and step into the loops, pushing the
538 //      `hoistedPackedTensor` deeper in the stack.
539 //   2. At the innermost loop level, create a TransposeOp if `transposeVector`
540 //      is non-empty.
541 //   3. At the innermost loop level, create an InsertSliceOp.
542 //   4. Iteratively pop and yield the result of the InsertSliceOp across the
543 //      cloned loops.
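//
// For a pad hoisted above two packing loops %i and %j, the generated nest
// roughly looks like (illustrative IR only, names are hypothetical):
//
//   %packed = scf.for %i iter_args(%t0 = %empty) {
//     %it_i = <iteration count of %i>
//     %inner = scf.for %j iter_args(%t1 = %t0) {
//       %it_j = <iteration count of %j>
//       ... cloned index computations and the cloned tensor.pad -> %pad ...
//       %ins = tensor.insert_slice %pad into %t1 [%it_i, %it_j, 0, 0] ...
//       scf.yield %ins
//     }
//     scf.yield %inner
//   }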
545 static FailureOr<PackingResult> buildPackingLoopNestImpl(
546  RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
547  ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
548  tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
549  SmallVector<OpFoldResult> offsets, sizes, strides;
550  SmallVector<Value> clonedLoopIvs, leadingHoistedPackedTensorIndexings;
551 
552  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
553 
554  Location loc = opToHoist->getLoc();
555  RankedTensorType paddedTensorType = opToHoist.getResultType();
556  int paddedRank = paddedTensorType.getRank();
557 
558  // Step 0. Populate bvm with opToHoist.getSource if relevant.
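 // For example (hypothetical IR): if the source is an iter_arg %arg0 of an
 // enclosing `scf.for ... iter_args(%arg0 = %init)`, map %arg0 -> %init so
 // that cloning above the loop nest uses the value that is available there;
 // the walk repeats in case %init is itself an iter_arg of an outer loop.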
559  BlockArgument bbArg = dyn_cast<BlockArgument>(opToHoist.getSource());
560  while (bbArg) {
561  auto forOp = dyn_cast<scf::ForOp>(bbArg.getOwner()->getParentOp());
562  if (!forOp)
563  break;
564  if (forOp != outerLoop && !outerLoop->isAncestor(forOp))
565  break;
566  OpOperand &operand = *forOp.getTiedLoopInit(bbArg);
567  bvm.map(bbArg, operand.get());
568  bbArg = dyn_cast<BlockArgument>(operand.get());
569  }
570 
571  // Step 1. iteratively clone loops and push `hoistedPackedTensor`.
572  Value hoistedPackedTensor = emptyOp.getResult();
573  OpBuilder::InsertionGuard g(rewriter);
574  for (Operation *op : analysis.backwardSlice) {
 575  // Specifically skip the extract_slice(hoistedPackedTensor) case: this is
 576  // the piece we seek to replace.
577  if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
578  if (bvm.lookupOrDefault(sliceOp.getSource()) == hoistedPackedTensor) {
579  LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
580  continue;
581  }
582  }
583 
584  // Clone all operations except loops which require special handling.
585  auto forOp = dyn_cast<scf::ForOp>(op);
586  if (!forOp) {
587  // We are at the right insertion point within the loop nest.
588  rewriter.clone(*op, bvm);
589  continue;
590  }
591 
592  // Create a packing loop that takes `hoistedPackedTensor` as iteration
593  // argument.
594  auto clonedForOp = scf::ForOp::create(
595  rewriter, loc, bvm.lookupOrDefault(forOp.getLowerBound()),
596  bvm.lookupOrDefault(forOp.getUpperBound()),
597  bvm.lookupOrDefault(forOp.getStep()), hoistedPackedTensor,
598  /*bodyBuilder=*/nullptr, forOp.getUnsignedCmp());
599 
600  // Map the induction var, region args and results to the `clonedForOp`.
601  bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
602  bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
603  bvm.map(forOp.getResults(), clonedForOp.getResults());
604  assert(clonedForOp->getNumRegions() == 1);
605  clonedLoopIvs.push_back(clonedForOp.getInductionVar());
606 
607  // Do not insert guard here, we get deeper into the loop nest.
608  rewriter.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
609  Value loopIndependentIterationCount =
610  buildLoopIterationCount(rewriter, outerLoop, clonedForOp);
611 
612  // Assert the loop-independent iteration count can be computed.
613  if (!loopIndependentIterationCount)
614  llvm_unreachable("loop independence prerequisite not met");
615  leadingHoistedPackedTensorIndexings.push_back(
616  loopIndependentIterationCount);
617  hoistedPackedTensor = clonedForOp.getRegionIterArgs().front();
618  }
619 
620  // Step 2. Construct offsets, sizes and strides for the innermost level of the
621  // packing loop.
622  int64_t nPackedLoops = clonedLoopIvs.size();
623  // offsets = [clonedLoopIvs, 0 .. 0].
624  offsets =
625  SmallVector<OpFoldResult>{leadingHoistedPackedTensorIndexings.begin(),
626  leadingHoistedPackedTensorIndexings.end()};
627  offsets.append(paddedRank, rewriter.getIndexAttr(0));
628  // sizes = [1 .. 1, transposedShape].
629  sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(1));
630  for (int64_t sz : transposedTensorType.getShape()) {
631  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
632  if (ShapedType::isDynamic(sz))
633  return failure();
634  sizes.push_back(rewriter.getIndexAttr(sz));
635  }
636  // strides = [1 .. 1].
637  strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
638  rewriter.getIndexAttr(1));
639 
640  // Step 3. Optionally transpose the padded tensor.
641  TransposeOp maybeTransposeOp;
642  Value paddedTensor = bvm.lookup(opToHoist.getResult());
643  if (!transposeVector.empty()) {
644  Value outputTensor = tensor::ExtractSliceOp::create(
645  rewriter, loc, transposedTensorType, hoistedPackedTensor, offsets,
646  sizes, strides);
647  maybeTransposeOp = linalg::TransposeOp::create(
648  rewriter, loc, paddedTensor, outputTensor, transposeVector);
649  paddedTensor = maybeTransposeOp.getResult()[0];
650  }
651 
652  // Innermost tensor.insert_slice and yields are optional / need loops.
653  if (nPackedLoops > 0) {
654  // Step 4. Create InsertSliceOp at the innermost loop level, inserting an
655  // optionally transposed padded slice into the packed tensor.
656  Value inserted = tensor::InsertSliceOp::create(rewriter, loc, paddedTensor,
657  hoistedPackedTensor, offsets,
658  sizes, strides);
659 
660  // Step 5. Iteratively pop the stack and propagate the yield.
661  Value valueToYield = inserted;
662  for (Value iv : llvm::reverse(clonedLoopIvs)) {
663  auto forOp = scf::getForInductionVarOwner(iv);
664  rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
665  scf::YieldOp::create(rewriter, loc, valueToYield);
666  valueToYield = forOp.getResult(0);
667  }
668  }
669 
670  return PackingResult{
671  offsets,
672  sizes,
673  strides,
674  clonedLoopIvs,
675  leadingHoistedPackedTensorIndexings,
676  maybeTransposeOp,
677  cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp())};
678 }
679 
680 /// Build the packing loop nest required to hoist `opToHoist` above
681 /// `outermostEnclosingForOp`.
682 /// The loop nest is built just before `outermostEnclosingForOp`.
683 static FailureOr<PackingResult> buildPackingLoopNestImpl(
684  RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
685  ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
686  // Update actual number of loops, which may be smaller.
687  int nPackedLoops = analysis.packingLoops.size();
688  LLVM_DEBUG(DBGS() << "\n";
689  DBGS() << "Func:\n"
690  << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
691  DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
692 
693  Location loc = opToHoist->getLoc();
694  RankedTensorType paddedTensorType = opToHoist.getResultType();
695 
696  // Compute the type of the transposed padded tensor.
697  FailureOr<RankedTensorType> transposedTensorType =
698  tensor::computeTransposedType(paddedTensorType, transposeVector);
699  if (failed(transposedTensorType)) {
700  LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
701  return failure();
702  }
703 
704  // Create the packed tensor<?x?x..? x transposedShape>.
705  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
706  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
707  llvm::append_range(packedShape, transposedTensorType->getShape());
708  auto hoistedPackedTensorType = RankedTensorType::get(
709  packedShape, transposedTensorType->getElementType());
710 
711  // Set the insertion point right before the outer loop and start packing.
712  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
713  OpBuilder::InsertionGuard g(rewriter);
714  rewriter.setInsertionPoint(outerLoop);
715  SmallVector<Value> dynamicTensorSizes =
716  analysis.getHoistedPackedTensorSizes(rewriter, loc);
717  auto emptyOp = tensor::EmptyOp::create(
718  rewriter, loc, hoistedPackedTensorType.getShape(),
719  hoistedPackedTensorType.getElementType(), dynamicTensorSizes);
720 
721  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
722  *transposedTensorType, emptyOp, analysis);
723 }
724 
725 /// Build the packing loop nest required to hoist `opToHoist` above
726 /// `outermostEnclosingForOp`.
727 /// The loop nest is built just before `outermostEnclosingForOp`.
728 FailureOr<PackingResult> mlir::linalg::detail::buildPackingLoopNest(
729  RewriterBase &rewriter, tensor::PadOp opToHoist,
730  scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
731  HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
732  analysis.enableHoistPadding(rewriter);
733  analysis.finalizeHoistPaddingAnalysis();
734  if (!analysis.isValid()) {
735  LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
736  return failure();
737  }
738  IRMapping bvm;
739  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
740  analysis);
741 }
742 
743 //===----------------------------------------------------------------------===//
744 // hoistPaddingOnTensors Implementation.
745 //===----------------------------------------------------------------------===//
746 
747 /// Return true if we can walk back the use-def chain from `extractSliceOp` to
748 /// expectedSource going through DestinationStyleOpInterface inits only.
749 /// This is a poor man's analysis that is sufficient to check that the
750 /// extractSliceOp matches the tensor.pad we want to hoist.
751 /// In the future, it will be easier to ensure this with a matching symmetric
752 /// tensor.unpad op.
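///
/// For example (illustrative IR): with
///   %m = linalg.matmul ins(...) outs(%expectedSource)
///   %e = tensor.extract_slice %m ...
/// the walk steps from %m to its DPS init %expectedSource, so
/// tracesBackToExpectedValue(%e, %expectedSource) returns true.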
753 static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
754  Value expectedSource) {
755  LLVM_DEBUG(DBGS() << "Start tracesBackToExpectedValue on: " << extractSliceOp
756  << "\n");
757  LLVM_DEBUG(DBGS() << "--with extractSlice: " << extractSliceOp << "\n");
758  Value source = extractSliceOp.getSource();
759  LLVM_DEBUG(DBGS() << "--with starting source: " << source << "\n");
760  while (source && source != expectedSource) {
761  auto destOp = source.getDefiningOp<DestinationStyleOpInterface>();
762  if (!destOp)
763  break;
764  LLVM_DEBUG(DBGS() << "--step dest op: " << destOp << "\n");
765  source = destOp.getDpsInitOperand(cast<OpResult>(source).getResultNumber())
766  ->get();
767  }
768  LLVM_DEBUG(DBGS() << "--final source: " << source << "\n");
769  LLVM_DEBUG(DBGS() << "--expected source: " << expectedSource << "\n");
770  return source == expectedSource;
771 }
772 
773 /// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
774 /// iter arg), propagate the `hoistedPackedTensor` value through the same iter
775 /// arg.
776 /// TODO: for multiple loops we need to track the use to the innermost loop.
777 ///
778 /// Match:
779 /// ```
780 /// %outerSliceOp = tensor.extract_slice ..
781 /// %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
782 ///   %hoistedPackedTensor = tensor.pad %arg0
783 ///   %1 = compute %hoistedPackedTensor
784 ///   %2 = tensor.extract_slice %1
785 ///   scf.yield %2
786 /// }
787 /// ```
788 ///
789 /// and rewrite as:
790 /// ```
791 /// %outerSliceOp = tensor.extract_slice ..
792 /// %hoistedPackedTensor = tensor.pad %outerSliceOp
793 /// %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
794 ///   %1 = compute %arg0
795 ///   scf.yield %1
796 /// }
797 /// %2 = tensor.extract_slice %forOp
798 /// ```
799 ///
800 /// Return null when no rewrite happened.
801 static tensor::ExtractSliceOp
802 padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
803  Value hoistedPackedTensor,
804  tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) {
805  LLVM_DEBUG(DBGS() << "Start padThroughLoopIterArg on: " << forOp << "\n");
806  LLVM_DEBUG(DBGS() << "--paddedValueBeforeHoisting: "
807  << paddedValueBeforeHoisting << "\n");
808  OpOperand *pUse = nullptr;
809  for (OpOperand &use : outerSliceOp->getUses()) {
810  if (use.getOwner() == forOp) {
811  assert(!pUse && "Multiple slice uses in the for loop");
812  pUse = &use;
813  }
814  }
815  assert(pUse && "No slice use in the for loop");
816  OpBuilder::InsertionGuard g(rewriter);
817  rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp());
818 
819  unsigned iterArgNumber = forOp.getTiedLoopResult(pUse).getResultNumber();
820  auto yieldingExtractSliceOp = forOp.getYieldedValues()[iterArgNumber]
821  .getDefiningOp<tensor::ExtractSliceOp>();
822  if (!yieldingExtractSliceOp)
823  return tensor::ExtractSliceOp();
824 
825  // Poor man's analysis sufficient to ensure extractSlice matches tensor.pad.
826  // In the future, it will be easier to ensure this with a matching symmetric
827  // tensor.unpad op.
828  if (!tracesBackToExpectedValue(yieldingExtractSliceOp,
829  paddedValueBeforeHoisting))
830  return tensor::ExtractSliceOp();
831 
832  SmallVector<Value> initArgs = forOp.getInitArgs();
833  initArgs[iterArgNumber] = hoistedPackedTensor;
834  SmallVector<Value> yieldOperands = llvm::to_vector(forOp.getYieldedValues());
835  yieldOperands[iterArgNumber] = yieldingExtractSliceOp.getSource();
836 
837  int64_t numOriginalForOpResults = initArgs.size();
838  LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults
839  << "\n");
840  tensor::ExtractSliceOp extracted;
841  {
842  OpBuilder::InsertionGuard g(rewriter);
843  rewriter.setInsertionPointAfter(forOp);
844  extracted = tensor::ExtractSliceOp::create(
845  rewriter, hoistedPackedTensor.getLoc(), hoistedPackedTensor,
846  outerSliceOp.getMixedOffsets(), outerSliceOp.getMixedSizes(),
847  outerSliceOp.getMixedStrides());
848  rewriter.replaceAllUsesWith(forOp.getResult(iterArgNumber), extracted);
849  }
850  scf::ForOp newForOp = cast<scf::ForOp>(*forOp.replaceWithAdditionalYields(
851  rewriter, initArgs, /*replaceInitOperandUsesInLoop=*/true,
852  [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
853  return yieldOperands;
854  }));
855 
856  LLVM_DEBUG(DBGS() << "newForOp results: " << newForOp.getNumResults()
857  << "\n");
858  LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n");
859  LLVM_DEBUG(DBGS() << "with result #"
860  << numOriginalForOpResults + iterArgNumber
861  << " of forOp, giving us: " << extracted << "\n");
862  rewriter.startOpModification(extracted);
863  extracted.getSourceMutable().assign(
864  newForOp.getResult(numOriginalForOpResults + iterArgNumber));
865  rewriter.finalizeOpModification(extracted);
866 
867  LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting
868  << "\n");
869  LLVM_DEBUG(DBGS() << "with region iter arg #"
870  << numOriginalForOpResults + iterArgNumber << "\n");
871  rewriter.replaceAllUsesWith(
872  paddedValueBeforeHoisting,
873  newForOp.getRegionIterArg(numOriginalForOpResults + iterArgNumber));
874 
875  return extracted;
876 }
877 
878 /// Produce a tensor extracted from the packingResult. This can be used as a
879 /// replacement for `opToHoist` in callers.
880 static Value replaceByPackingResult(RewriterBase &rewriter,
881  const IRMapping &bvm,
882  tensor::PadOp opToHoist,
883  RankedTensorType transposedTensorType,
884  const HoistPaddingAnalysis &analysis,
885  const PackingResult &packingResult) {
886  // The replacement occurs under a single insertion point within the original
887  // loop, just before opToHoist.
888  OpBuilder::InsertionGuard g(rewriter);
889  rewriter.setInsertionPoint(opToHoist);
890 
891  Location loc = opToHoist->getLoc();
892  RankedTensorType paddedTensorType = opToHoist.getResultType();
893  int paddedRank = paddedTensorType.getRank();
894 
895  int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
896  LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");
897 
898  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
899  ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;
900 
901  Value hoistedPackedTensor;
902  SmallVector<Value> loopIterationCounts;
903  SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
904  rewriter.getIndexAttr(0));
905  if (nPackedLoops > 0) {
906  loopIterationCounts =
907  llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
908  return buildLoopIterationCount(rewriter, outerLoop,
909  cast<scf::ForOp>(loop));
910  }));
911  // Assert all loop iteration counts can be computed.
 912  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
913  llvm_unreachable("loop independence prerequisite not met");
914 
915  // offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
916  std::copy(loopIterationCounts.begin(), loopIterationCounts.end(),
917  offsets.begin());
918  hoistedPackedTensor =
919  scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
920  ->getResult(0);
921  } else {
922  // If no loops were created, this is just hoisting without packing.
923  hoistedPackedTensor = bvm.lookup(opToHoist.getResult());
924  }
925 
926  LLVM_DEBUG(DBGS() << "hoistedPackedTensor: " << hoistedPackedTensor << "\n");
927 
928  // If the consumer of `padOp` was a `forOp`, propagate through iter args.
929  scf::ForOp forOp = analysis.padConsumingForOp;
930  if (forOp) {
931  return padThroughLoopIterArg(rewriter, opToHoist, hoistedPackedTensor,
932  analysis.sliceOp, forOp);
933  }
934 
935  // offsets = [maybe_leading_ivs, 0 .. 0].
936  // sizes = [1 .. 1, transposedShape] (defined above).
937  // strides = [1 .. 1] (defined above)
938  return tensor::ExtractSliceOp::create(
939  rewriter, loc, transposedTensorType, hoistedPackedTensor, offsets,
940  packingResult.sizes, packingResult.strides);
941 }
942 
943 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
944  RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
945  ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
946  SmallVectorImpl<TransposeOp> &transposeOps) {
947  LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
948  DBGS() << " by " << numLoops << " loops\n");
949 
950  HoistPaddingAnalysis analysis(opToHoist, numLoops);
951  analysis.enableHoistPadding(rewriter);
952  analysis.finalizeHoistPaddingAnalysis();
953  if (!analysis.isValid()) {
954  LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
955  return failure();
956  }
957 
958  /// Construct the packing loop nest.
959  IRMapping bvm;
960  FailureOr<PackingResult> packingResult = buildPackingLoopNestImpl(
961  rewriter, bvm, opToHoist, transposeVector, analysis);
962  if (failed(packingResult)) {
963  LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
964  return failure();
965  }
966 
967  if (!transposeVector.empty())
968  transposeOps.push_back(packingResult->maybeTransposeOp);
969 
970  FailureOr<RankedTensorType> transposedTensorType =
971  tensor::computeTransposedType(opToHoist.getResultType(), transposeVector);
972  assert(succeeded(transposedTensorType) && "unexpected failure in type");
973 
974  // Now the packed tensor is ready, replace the original padding op by a
975  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
976  Value newResult =
977  replaceByPackingResult(rewriter, bvm, opToHoist, *transposedTensorType,
978  analysis, *packingResult);
979 
980  Location loc = opToHoist->getLoc();
981  RankedTensorType paddedTensorType = opToHoist.getResultType();
982  if (!transposeVector.empty()) {
983  OpBuilder::InsertionGuard g(rewriter);
984  rewriter.setInsertionPointAfter(newResult.getDefiningOp());
985  // Transpose the packed tensor back to the original storage order.
986  Value emptyTensor =
987  tensor::EmptyOp::create(rewriter, loc, paddedTensorType.getShape(),
988  paddedTensorType.getElementType());
989  TransposeOp unTransposeOp = linalg::TransposeOp::create(
990  rewriter, loc, newResult, emptyTensor, transposeVector);
991  newResult = unTransposeOp.getResult()[0];
992  transposeOps.push_back(unTransposeOp);
993  }
994 
995  LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
996  LLVM_DEBUG(
997  DBGS() << "After hoisting: "
998  << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
999  << "\n");
1000 
1001  // Make the newly cloned `opToHoist` available to the caller.
1002  hoistedOp = packingResult->hoistedPadOp;
1003 
1004  LLVM_DEBUG(DBGS() << "--SUCCESS\n");
1005  return newResult;
1006 }
1007 
1008 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
1009  tensor::PadOp opToHoist, int64_t numLoops,
1010  ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
1011  SmallVectorImpl<TransposeOp> &transposeOps) {
1012  IRRewriter rewriter(opToHoist.getContext());
1013  return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
1014  hoistedOp, transposeOps);
1015 }