HoistPadding.cpp
1 //===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions concerned with hoisting padding operations.
10 //
11 //===----------------------------------------------------------------------===//
12 
25 #include "mlir/IR/AsmState.h"
26 #include "mlir/IR/BuiltinOps.h"
27 #include "mlir/IR/Dominance.h"
28 #include "mlir/IR/Matchers.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/Support/Debug.h"
31 
32 using llvm::dbgs;
33 
34 #define DEBUG_TYPE "hoist-padding"
35 
36 #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
37 
38 using namespace mlir;
39 using namespace mlir::linalg;
40 
41 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
42 /// loops. The failure conditions are:
43 /// 1. Pad op has a use that is not an input of a LinalgOp.
44 /// 2. Pad op does not have a constant padding value.
45 /// 3. There is no immediately enclosing scf::ForOp.
46 /// 4. The backward slice from the pad op to the scf::ForOp to hoist above
47 /// contains an unknown op with non-index type operands, a region, or a
48 /// memory effect.
49 /// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
50 /// empty.
51 /// 6. The source tensor of the pad op is not defined by an extract slice op.
52 /// 7. The source tensor of the extract slice op is not defined outside of
53 /// the outermost enclosing scf::ForOp.
54 /// 8. There is no enclosing scf::ForOp that indexes the padded data.
55 /// Other cases succeed and will trigger hoisting of the pad op.
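///
/// Illustrative sketch (not from this file) of a configuration that passes
/// the analysis, using the same loose IR notation as the examples below:
/// ```
///   %source = linalg.fill(%cst, %arg0)
///   scf.for %i
///     %slice = tensor.extract_slice %source [%i] [%ub] [1]
///     %padded = tensor.pad %slice   // constant padding value
///     linalg.matmul ins(%padded, ...) outs(...)
/// ```
/// The pad is only used as a LinalgOp input, its padding value is constant,
/// and %source is defined outside the loop, so hoisting by one loop succeeds.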
56 struct HoistingAnalysis {
57  HoistingAnalysis(tensor::PadOp padOp, int numLoops);
58 
59  bool isValid() { return valid; }
60 
61  /// Footprint of the packedTensor, computed from the packingLoops.
62  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
63 
64  /// The outermost loop, determined by `nLevels`, above which `padOp` will
65  /// be hoisted.
66  scf::ForOp outermostEnclosingForOp;
67 
68  /// Backward slice rooted at `padOp` and nested under
69  /// `outermostEnclosingForOp`.
70  SetVector<Operation *> backwardSlice;
71 
72  /// The scf::ForOps immediately enclosing `padOp` such that:
73  /// 1. they are nested under `outermostEnclosingForOp` (inclusive), and
74  /// 2. their induction variables are used, directly or indirectly, in the
75  /// computation of `padOp`.
76  /// The span of these loops determines the footprint of the packed tensor.
77  SmallVector<scf::ForOp> packingLoops;
78 
79 private:
80  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
81  /// `backwardSlice`. The method follows the use-def chains of the index
82  /// operands consumed by `padOp` and `sliceOp` and drops the operations
83  /// not part of this index computation. Afterwards, the filtered
84  /// `backwardSlice` contains only the loops whose induction variable is used,
85  /// directly or indirectly, to index the padded tensor. The method returns
86  /// failure if the filtered backward slice contains an unexpected operation.
87  ///
88  /// Example:
89  /// ```
90  /// %source = linalg.fill(%cst, %arg0)
91  /// scf.for %i
92  /// %unrelated = linalg.fill(%cst, %arg1) // not used to index %source!
93  /// scf.for %j (%arg2 = %unrelated)
94  /// scf.for %k // not used to index %source!
95  /// %ubi = affine.min #map(%i)
96  /// %ubj = affine.min #map(%j)
97  /// %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
98  /// %padded_slice = tensor.pad %slice
99  /// ```
100  /// dropNonIndexDependencies(%padded_slice, %slice)
101  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
102  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
103  tensor::ExtractSliceOp sliceOp);
104 
105  /// Encodes whether the analysis is valid and hoisting can proceed.
106  bool valid;
107 };
108 
109 /// Return true if every use of `padOp` is an input tensor operand of some
110 /// LinalgOp.
111 static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
112  for (OpOperand &use : padOp.getResult().getUses()) {
113  auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
114  if (!linalgUser || !linalgUser.isDpsInput(&use)) {
115  LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
116  << "\nthat is not an input tensor of a LinalgOp, "
117  << "cannot hoist\n"
118  << *(use.getOwner()) << "\n");
119  return false;
120  }
121  }
122  return true;
123 }
124 
125 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
126 /// Stops at the first parent that is not an scf::ForOp.
127 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
128 /// Control-flow and other containing ops with regions are not modeled atm.
129 static void
130 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
131  SmallVector<scf::ForOp> &reverseEnclosingLoops) {
132  AsmState state(padOp->getParentOfType<func::FuncOp>());
133  (void)state;
134  scf::ForOp outermostEnclosingForOp = nullptr;
135  Operation *nextEnclosingOp = padOp->getParentOp();
136  while (nLevels-- > 0 &&
137  (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
138  LLVM_DEBUG(
139  DBGS() << "loops: ";
140  outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
141  dbgs() << "\n");
142  reverseEnclosingLoops.push_back(outermostEnclosingForOp);
143  nextEnclosingOp = outermostEnclosingForOp->getParentOp();
144  }
145 }
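// getAtMostNEnclosingLoops example (illustrative): for a pad nested as
// `scf.for %i { scf.for %j { tensor.pad } }` and nLevels = 2, the result is
// [loop %j, loop %i], i.e. innermost first, so back() is the outermost loop.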
146 
147 /// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
148 /// Fail if `transposeVector` is not a permutation matching the tensor rank.
149 static FailureOr<RankedTensorType>
150 computeTransposedType(RankedTensorType rankedTensorType,
151  ArrayRef<int64_t> transposeVector) {
152  if (transposeVector.empty())
153  return rankedTensorType;
154  if (!isPermutationVector(transposeVector) ||
155  transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
156  return failure();
157 
158  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
159  rankedTensorType.getShape().end());
160  applyPermutationToVector(transposedShape, transposeVector);
161 
162  using RTTBuilder = RankedTensorType::Builder;
163  RankedTensorType transposedTensorType =
164  RTTBuilder(rankedTensorType).setShape(transposedShape);
165  return transposedTensorType;
166 }
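// computeTransposedType example (illustrative): transposeVector = [1, 0]
// maps tensor<4x8xf32> to tensor<8x4xf32>; an empty transposeVector returns
// the type unchanged; a non-permutation or a rank mismatch returns failure().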
167 
168 HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
169  valid = false;
170 
171  // Bail on any use that isn't an input of a LinalgOp.
172  // Hoisting of inplace updates happens after vectorization.
173  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
174  return;
175 
176  // Get at most `numLoops` of immediately enclosing loops.
177  SmallVector<scf::ForOp> reverseEnclosingLoops;
178  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
179  if (reverseEnclosingLoops.empty()) {
180  LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
181  return;
182  }
183 
184  outermostEnclosingForOp = reverseEnclosingLoops.back();
185 
186  // Get the `sliceOp` that defines the source tensor of `padOp` and
187  // check its source is defined outside of the outermost loop. This check
188  // ensures the padded data is available for packing before entering the
189  // outermost enclosing loop.
190  //
191  // Example:
192  // ```
193  // %source = linalg.fill(%cst, %arg0)
194  // // %source is available for packing here!
195  // scf.for %i
196  // scf.for %j
197  // scf.for %k
198  // %slice = tensor.extract_slice %source [%i, %j]
199  // %padded_slice = tensor.pad %slice
200  // ```
201  auto sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
202  if (!sliceOp) {
203  LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
204  return;
205  }
206  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
207  LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
208  return;
209  }
210 
211  // Check the region of `padOp` depends on a constant only. Adding
212  // hoisting support for arbitrary padding regions would require cloning all
213  // dependencies captured by the padding region.
214  Value paddingValue = padOp.getConstantPaddingValue();
215  if (!paddingValue ||
216  !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
217  LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
218  return;
219  }
220 
221  // Get all the ops in the backwards slice starting from `padOp` and that
222  // are dominated by the outermost enclosing loop.
223  DominanceInfo domInfo(outermostEnclosingForOp);
224  getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
225  return domInfo.dominates(outermostEnclosingForOp, op);
226  });
227  if (backwardSlice.empty())
228  return;
229  // Add `padOp` itself to the backward slice.
230  backwardSlice.insert(padOp.getOperation());
231 
232  // Remove all ops in the backward slice that are not used to index the padded
233  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
234  // affine operations used for the index computation.
235  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
236  return;
237 
238  // Add only the loops part of the filtered `backwardSlice` to the packing
239  // loops. All other loops are not used to index the padded data and
240  // consequently access the same data in every loop iteration. Adding them to
241  // the packing loops would increase the cache footprint of the packed data
242  // by storing the same data multiple times.
243  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
244  if (backwardSlice.contains(forOp))
245  packingLoops.push_back(forOp);
246  if (packingLoops.empty()) {
247  LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
248  return;
249  }
250 
251  // The analysis is valid and hoisting can occur.
252  valid = true;
253 }
254 
255 LogicalResult
256 HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
257  tensor::ExtractSliceOp sliceOp) {
258  // Set of all values used for index computation.
259  SetVector<Value> indexEdges;
260 
261  // Add all index operands of `operation` to `indexEdges`. An index operand is
262  // an operand of type index.
263  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
264  for (Value operand : operation->getOperands())
265  if (operand.getType().isIndex())
266  indexEdges.insert(operand);
267  };
268 
269  // Check if any operation result is contained in `indexEdges`.
270  auto hasIndexResult = [&](Operation *operation) {
271  return llvm::any_of(operation->getResults(), [&](Value result) {
272  return indexEdges.contains(result);
273  });
274  };
275 
276  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
277  // type in `backwardSlice`. Add the index operands of an operation to
278  // `indexEdges` and remove all operations from `backwardSlice` that are not
279  // part of the index computation.
280  //
281  // Example:
282  // ```
283  // %source = linalg.fill(%cst, %arg0)
284  // scf.for %i
285  // %unrelated = linalg.fill(%cst, %arg1) // not used to index %source!
286  // scf.for %j (%arg2 = %unrelated)
287  // scf.for %k // not used to index %source!
288  // %ubi = affine.min #map(%i)
289  // %ubj = affine.min #map(%j)
290  // %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
291  // %padded_slice = tensor.pad %slice
292  // ```
293  // After iterating `backwardSlice` we obtain:
294  // indexEdges = [%i, %j, %ubi, %ubj]
295  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
296  SetVector<Operation *> operationsToRemove;
297  for (Operation *op : llvm::reverse(backwardSlice)) {
298  // Add the index operands of `padOp` and `sliceOp` to start the
299  // exploration of the index computation.
300  if (op == padOp || op == sliceOp) {
301  addIndexOperandsToIndexEdges(op);
302  continue;
303  }
304  // Add the index operands of the loop if its induction variable is
305  // used for index computation.
306  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
307  if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
308  addIndexOperandsToIndexEdges(op);
309  continue;
310  }
311  }
312  // Add the index operands of all other operations if at least one result is
313  // used for index computation.
314  if (hasIndexResult(op)) {
315  addIndexOperandsToIndexEdges(op);
316  // Check the operands of the remaining operations all have index type.
317  if (llvm::any_of(op->getOperandTypes(),
318  [](Type type) { return !type.isIndex(); })) {
319  LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
320  << op << " -> skip\n");
321  return failure();
322  }
323  // Check the remaining operations do not have regions or memory effects.
324  auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
325  bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
326  if (hasMemoryEffect || op->getNumRegions() != 0) {
327  LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
328  << op << " -> skip\n");
329  return failure();
330  }
331  continue;
332  }
333  // Remove all other operations not used by the index computation,
334  // except for constant operations that may be used by `padOp`.
335  if (!isa<arith::ConstantOp>(op))
336  operationsToRemove.insert(op);
337  }
338  backwardSlice.set_subtract(operationsToRemove);
339  return success();
340 }
341 
342 SmallVector<Value>
343 HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
344  SmallVector<Value> dynamicTensorSizes;
345 
346  // Upper bound the packing loop lengths to size the packed tensor. Taking
347  // upper bounds can make the sizes of the packed tensor independent of the
348  // enclosing loops. This independence is a prerequisite for reusing the same
349  // buffer for all enclosing loop iterations and hoisting its allocation out of
350  // the enclosing loops.
351  for (auto forOp : packingLoops) {
352  // Compute an upper bound `ubVal` for the upper bound of `forOp`.
353  AffineMap boundMap;
354  SmallVector<Value> boundOperands;
355  getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
356  Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
357  // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
358  // store the result to `dynamicTensorSizes`.
359  // TODO: instead of using the lower bound of `forOp` directly, implement a
360  // lower bound computation similar to the upper bound computation.
361  AffineExpr lb, ub, step;
362  bindDims(b.getContext(), lb, ub);
363  bindSymbols(b.getContext(), step);
364  Value res = b.createOrFold<AffineApplyOp>(
365  (ub - lb).ceilDiv(step), ValueRange{forOp.getLowerBound(), ubVal,
366  cast<scf::ForOp>(forOp).getStep()});
367  dynamicTensorSizes.push_back(res);
368  }
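  // For example (illustrative): a packing loop `scf.for %i = %lb to %ub step %c4`
  // with loop-independent upper bound %ubMax contributes the dynamic size
  // (%ubMax - %lb).ceilDiv(4) to `dynamicTensorSizes`.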
369 
370  return dynamicTensorSizes;
371 }
372 
373 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
374  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
375 }
376 
377 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
378 /// The returned Value is guaranteed not to depend on any loop contained in
379 /// [`outer`, `forOp`].
380 /// Return null if such a loop-independent quantity cannot be computed.
381 static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
382  scf::ForOp forOp) {
383  MLIRContext *ctx = forOp->getContext();
384  AffineExpr iv, lb, step;
385  bindDims(ctx, iv, lb);
386  bindSymbols(ctx, step);
387  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
388  !isDefinedOutsideOrConstant(outer, forOp.getStep()))
389  return Value();
390  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
391  stepVal = forOp.getStep();
392  auto loc = forOp->getLoc();
393  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
394  ValueRange{ivVal, lbVal, stepVal});
395 }
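// buildLoopIterationCount example (illustrative): for `scf.for %i = %c3 to %n
// step %c2`, the value built at %i == 7 folds to (7 - 3).ceilDiv(2) = 2, i.e.
// the third iteration counting from zero.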
396 
397 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
398  tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
399  tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
400  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
401  << " loops\n");
402  HoistingAnalysis analysis(opToHoist, numLoops);
403  if (!analysis.isValid()) {
404  LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
405  return failure();
406  }
407 
408  scf::ForOp outer = analysis.outermostEnclosingForOp;
409  ImplicitLocOpBuilder b(outer->getLoc(), outer);
410 
411  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
412 
413  // Update actual number of loops, which may be smaller.
414  int nPackedLoops = analysis.packingLoops.size();
415 
416  Location loc = opToHoist->getLoc();
417  RankedTensorType paddedTensorType = opToHoist.getResultType();
418  int paddedRank = paddedTensorType.getRank();
419 
420  // Compute the type of the transposed padded tensor.
421  FailureOr<RankedTensorType> transposedTensorType =
422  computeTransposedType(paddedTensorType, transposeVector);
423  if (failed(transposedTensorType))
424  return failure();
425 
426  // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
427  // padding.
428  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
429  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
430  // tensor.
431  llvm::append_range(packedShape, transposedTensorType->getShape());
432  auto packedTensorType = RankedTensorType::get(
433  packedShape, transposedTensorType->getElementType());
434  Value packedTensor = b.create<tensor::EmptyOp>(
435  loc, packedTensorType.getShape(), packedTensorType.getElementType(),
436  dynamicTensorSizes);
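  // For example (illustrative): with 2 packing loops and a padded type
  // tensor<4x8xf32>, `packedTensorType` is tensor<?x?x4x8xf32> and the two
  // leading dynamic extents are taken from `dynamicTensorSizes`.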
437 
438  // Clone the operations involved in the backward slice, iteratively stepping
439  // into the loops that we encounter.
440  // The implementation proceeds in a stack-like fashion:
441  // 1. Iteratively clone and step into the loops, pushing the `packedTensor`
442  // deeper in the stack.
443  // 2. Create a GenericOp if `transposeVector` is non-empty.
444  // 3. Create an InsertSliceOp at the top of the stack.
445  // 4. Iteratively pop and yield the result of the InsertSliceOp across
446  // the cloned loops.
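  // Illustrative sketch of the result (assuming a single packing loop %i and
  // an empty `transposeVector`); names are for exposition only:
  //   %packed = scf.for %i = %lb to %ub step %s iter_args(%p = %empty) {
  //     %slice = tensor.extract_slice %source [...]
  //     %padded = tensor.pad %slice ...
  //     %iter = affine.apply (iv - lb).ceilDiv(step) (%i, %lb, %s)
  //     %ins = tensor.insert_slice %padded into %p[%iter, 0, ..., 0]
  //     scf.yield %ins
  //   }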
447  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
448  clonedLoopIvs.reserve(nPackedLoops);
449  leadingPackedTensorIndexings.reserve(nPackedLoops);
450  IRMapping bvm;
451  // Stack step 1. iteratively clone loops and push `packedTensor`.
452  for (Operation *op : analysis.backwardSlice) {
453  // Specifically skip the extract_slice(packedTensor) case: this is the
454  // piece we seek to replace.
455  if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
456  if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor)
457  continue;
458  // Clone all operations except loops.
459  auto forOp = dyn_cast<scf::ForOp>(op);
460  if (!forOp) {
461  b.clone(*op, bvm);
462  continue;
463  }
464  // Create a packing loop that takes `packedTensor` as iteration argument.
465  auto clonedForOp = b.create<scf::ForOp>(
466  loc, bvm.lookupOrDefault(forOp.getLowerBound()),
467  bvm.lookupOrDefault(forOp.getUpperBound()),
468  bvm.lookupOrDefault(forOp.getStep()), packedTensor);
469  // Map the induction var, region args and results to the `clonedForOp`.
470  bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
471  bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
472  bvm.map(forOp.getResults(), clonedForOp.getResults());
473  assert(clonedForOp->getNumRegions() == 1);
474  clonedLoopIvs.push_back(clonedForOp.getInductionVar());
475 
476  b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
477  Value loopIndependentIterationCount =
478  buildLoopIterationCount(b, outer, clonedForOp);
479  // Assert the loop-independent iteration count can be computed.
480  if (!loopIndependentIterationCount)
481  llvm_unreachable("loop independence prerequisite not met");
482  leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
483  packedTensor = clonedForOp.getRegionIterArgs().front();
484  }
485 
486  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
487  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
488  leadingPackedTensorIndexings.end());
489  offsets.append(paddedRank, b.getIndexAttr(0));
490  // sizes = [1 .. 1, transposedShape].
491  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
492  for (int64_t sz : transposedTensorType->getShape()) {
493  // TODO: go grab dims when necessary; for now tensor::PadOp returns a static tensor.
494  assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
495  sizes.push_back(b.getIndexAttr(sz));
496  }
497  // strides = [1 .. 1].
498  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
499  b.getIndexAttr(1));
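  // For example (illustrative): with 2 packing loops and padded type
  // tensor<4x8xf32>, offsets = [%it0, %it1, 0, 0], sizes = [1, 1, 4, 8],
  // strides = [1, 1, 1, 1].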
500 
501  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
502  Value paddedTensor = bvm.lookup(opToHoist.getResult());
503  if (!transposeVector.empty()) {
504  Value outputTensor = b.create<tensor::ExtractSliceOp>(
505  loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
506  transposeOps.push_back(
507  makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
508  paddedTensor = transposeOps.back()->getResult(0);
509  }
510 
511  // Stack step 3. create InsertSliceOp at the top of the stack.
512  Value inserted = b.create<tensor::InsertSliceOp>(
513  loc, paddedTensor, packedTensor, offsets, sizes, strides);
514 
515  // Stack step 4. iteratively pop the stack and propagate the yield.
516  Value valueToYield = inserted;
517  for (Value iv : llvm::reverse(clonedLoopIvs)) {
518  auto forOp = scf::getForInductionVarOwner(iv);
519  b.setInsertionPointToEnd(&forOp.getRegion().front());
520  b.create<scf::YieldOp>(loc, valueToYield);
521  valueToYield = forOp.getResult(0);
522  }
523 
524  // Now the packed tensor is ready; replace the original padding op by a
525  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
526  b.setInsertionPoint(opToHoist);
527  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
528  llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
529  return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
530  }));
531  // Assert all loop iteration counts can be computed.
532  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
533  llvm_unreachable("loop independence prerequisite not met");
534  // offsets = [originalLoopIvs, 0 .. 0].
535  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
536  offsets.append(paddedRank, b.getIndexAttr(0));
537  // sizes = [1 .. 1, transposedShape] (defined above).
538  // strides = [1 .. 1] (defined above)
539  packedTensor =
540  scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
541  Value newResult = b.create<tensor::ExtractSliceOp>(
542  loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
543 
544  // Transpose the packed tensor back to the original storage order.
545  if (!transposeVector.empty()) {
546  Value emptyTensor = b.create<tensor::EmptyOp>(
547  loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
548  transposeOps.push_back(
549  makeTransposeOp(b, loc, newResult, emptyTensor, transposeVector));
550  newResult = transposeOps.back()->getResult(0);
551  }
552 
553  // Make the newly cloned `opToHoist` available to the caller.
554  hoistedOp =
555  cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
556  return newResult;
557 }
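// Illustrative usage sketch (not part of this file): hoist a pad op above two
// enclosing loops without transposition.
//   tensor::PadOp hoistedOp;
//   SmallVector<GenericOp> transposeOps;
//   FailureOr<Value> replacement = hoistPaddingOnTensors(
//       padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp, transposeOps);
//   if (succeeded(replacement))
//     padOp.getResult().replaceAllUsesWith(*replacement);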