HoistPadding.cpp
//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of the pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
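///
/// A minimal sketch of the intended effect (illustrative IR; names, types, and
/// bounds are invented for illustration and are not taken from this file): a
/// pad of an extract_slice with a constant padding value, nested under scf.for
/// loops and only used as a LinalgOp input, satisfies the conditions above:
/// ```
///   %cst = arith.constant 0.0 : f32
///   scf.for %i
///     scf.for %j
///       %slice = tensor.extract_slice %source [%i, %j] ...
///       %padded = tensor.pad %slice low[0, 0] high[...] {
///         tensor.yield %cst : f32
///       } : tensor<?x?xf32> to tensor<4x8xf32>
///       linalg.matmul ins(%padded, ...) outs(...)
/// ```
/// Hoisting then packs all padded tiles into a larger tensor above the
/// outermost enclosing loop and replaces the pad inside the loops by an
/// extract_slice of that packed tensor.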
struct HoistingAnalysis {
  HoistingAnalysis(tensor::PadOp padOp, int numLoops);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops.
  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The outermost loop, determined by `nLevels` above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOp immediately enclosing `padOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
  ///   2. whose induction variable is used, directly or indirectly, in the
  ///      computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

private:
  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is used,
  /// directly or indirectly, to index the padded tensor. The method returns
  /// failure if the filtered backward slice contains an unexpected operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)  // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                           // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
                                         tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padOp` are an input tensor of some
/// LinalgOp.
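/// For instance (illustrative IR, not from this file): a pad whose result only
/// feeds `ins` operands is accepted, while any other use rejects the analysis:
/// ```
///   %p = tensor.pad %slice ... : tensor<?x?xf32> to tensor<4x8xf32>
///   linalg.matmul ins(%p, %b : ...) outs(%c : ...)   // ok: input use
///   %q = tensor.insert_slice %p into %t ...          // not ok: non-input use
/// ```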
static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
  for (OpOperand &use : padOp.getResult().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padOp->getParentOfType<func::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

/// Returns the transposed `rankedTensorType` if `transposeVector` is
/// non-empty. Fails if `transposeVector` is not a permutation matching the
/// tensor rank.
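/// For instance (illustrative): `transposeVector = [1, 0]` maps
/// `tensor<4x8xf32>` to `tensor<8x4xf32>`, while `[0, 2, 1]` fails for a
/// rank-2 tensor because its size does not match the rank.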
static FailureOr<RankedTensorType>
computeTransposedType(RankedTensorType rankedTensorType,
                      ArrayRef<int64_t> transposeVector) {
  if (transposeVector.empty())
    return rankedTensorType;
  if (!isPermutation(transposeVector) ||
      transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
    return failure();

  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
                                       rankedTensorType.getShape().end());
  applyPermutationToVector(transposedShape, transposeVector);

  using RTTBuilder = RankedTensorType::Builder;
  RankedTensorType transposedTensorType =
      RTTBuilder(rankedTensorType).setShape(transposedShape);
  return transposedTensorType;
}

HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
  valid = false;

  // Bail on any use that isn't an input of a LinalgOp.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
    return;

  // Get at most `numLoops` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get the `sliceOp` that defines the source tensor of `padOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = tensor.pad %slice
  // ```
  auto sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Check the region of `padOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = padOp.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
    return;
  }

  // Get all the ops in the backwards slice starting from `padOp` and that
  // are dominated by the outermost enclosing loop.
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op);
  });
  if (backwardSlice.empty())
    return;
  // Add `padOp` itself to the backward slice.
  backwardSlice.insert(padOp.getOperation());

  // Remove all ops in the backward slice that are not used to index the padded
  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
  // affine operations used for the index computation.
  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
    return;

  // Add only the loops part of the filtered `backwardSlice` to the packing
  // loops. All other loops are not used to index the padded data and
  // consequently access the same data in every loop iteration. Adding them to
  // the packing loops would increase the cache footprint of the packed data
  // by storing the same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult
HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
                                           tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand is
  // an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)  // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                           // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check that the operands of the remaining operations all have index
      // type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << op << " -> skip\n");
        return failure();
      }
      // Check that the remaining operations do not have regions or memory
      // effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << op << " -> skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation.
    // Exceptions are constant operations that may be used by `padOp`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
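  // For instance (illustrative values, not from this file): a packing loop
  // `scf.for %i = %c0 to %tileBound step %c4`, where `%tileBound` is provably
  // bounded above by 64, contributes a loop-independent packed size of
  // ceildiv(64 - 0, 4) = 16.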
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    AffineMap boundMap;
    SmallVector<Value> boundOperands;
    getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step), ValueRange{forOp.getLowerBound(), ubVal,
                                            cast<scf::ForOp>(forOp).getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
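/// For instance (illustrative, not from this file): for a loop
/// `scf.for %i = %c4 to %ub step %c8`, the iteration number is
/// (%i - 4) ceildiv 8, which is computable as long as the lower bound and the
/// step are defined outside `outer` or are constants.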
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

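// Illustrative usage sketch of hoistPaddingOnTensors (not part of the original
// file; `padOp` and the surrounding pattern are assumed to exist in the
// caller):
//
//   tensor::PadOp hoistedOp;
//   SmallVector<GenericOp> transposeOps;
//   FailureOr<Value> newResult = hoistPaddingOnTensors(
//       padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp,
//       transposeOps);
//   if (succeeded(newResult))
//     padOp.getResult().replaceAllUsesWith(*newResult);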
FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);

  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType))
    return failure();

  // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
  // tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto packedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion (see the sketch after
  // this comment):
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create a InsertSliceOp at the top of the stack.
  //   4. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
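  //
  // An illustrative sketch of the packing structure built below (simplified
  // IR; names, types, and bounds are invented for illustration and are not
  // taken from this file). For two packing loops %i and %j and a padded type
  // tensor<4x8xf32>, the cloned loops carry the packed tensor as an iteration
  // argument and insert one padded tile per iteration:
  // ```
  //   %init = linalg.init_tensor [%sz_i, %sz_j, 4, 8] : tensor<?x?x4x8xf32>
  //   %packed = scf.for %i ... iter_args(%p0 = %init) {
  //     %inner = scf.for %j ... iter_args(%p1 = %p0) {
  //       %slice = tensor.extract_slice %source [%i, %j] ...
  //       %padded = tensor.pad %slice ... : tensor<?x?xf32> to tensor<4x8xf32>
  //       %ins = tensor.insert_slice %padded into %p1
  //                [%it_i, %it_j, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1]
  //       scf.yield %ins : tensor<?x?x4x8xf32>
  //     }
  //     scf.yield %inner : tensor<?x?x4x8xf32>
  //   }
  // ```
  // The original pad inside the loops is later replaced by a 1x..x1
  // tensor.extract_slice of %packed (see the end of this function).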
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.getSource()) == packedTensor)
        continue;
    // Clone all operations except loops.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      b.clone(*op, bvm);
      continue;
    }
    // Create a packing loop that takes `packedTensor` as iteration argument.
    auto clonedForOp = b.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // offsets = [clonedLoopIvs, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary, for now tensor::PadOp returns a
    // static tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
  Value paddedTensor = bvm.lookup(opToHoist.getResult());
  if (!transposeVector.empty()) {
    Value outputTensor = b.create<tensor::ExtractSliceOp>(
        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
    transposeOps.push_back(
        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
    paddedTensor = transposeOps.back()->getResult(0);
  }

  // Stack step 3. create InsertSliceOp at the top of the stack.
  Value inserted = b.create<tensor::InsertSliceOp>(
      loc, paddedTensor, packedTensor, offsets, sizes, strides);

  // Stack step 4. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [originalLoopIvs, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);

  // Transpose the packed tensor back to the original storage order.
  if (!transposeVector.empty()) {
    Value initTensor =
        b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
                               paddedTensorType.getElementType());
    transposeOps.push_back(
        makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
    newResult = transposeOps.back()->getResult(0);
  }

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp =
      cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
  return newResult;
}