//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//
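
// Illustrative sketch (names, shapes and the padding value are assumed, not
// taken from a test) of what hoisting a tensor.pad by one loop achieves:
// ```
//   scf.for %i = %c0 to %ub step %c4 {
//     %slice = tensor.extract_slice %t[%i] [4] [1]
//     %padded = tensor.pad %slice low[0] high[1] { tensor.yield %cst }
//     ... use %padded ...
//   }
// ```
// is rewritten so that all padded slices are packed into one larger tensor
// created before the loop, and the use inside the loop reads back a slice of
// that packed tensor:
// ```
//   %packed = scf.for %i = ... iter_args(%arg = %empty) {
//     %slice = tensor.extract_slice %t[%i] [4] [1]
//     %padded = tensor.pad %slice ...
//     %inserted = tensor.insert_slice %padded into %arg ...
//     scf.yield %inserted
//   }
//   scf.for %i = %c0 to %ub step %c4 {
//     %padded = tensor.extract_slice %packed[...] [1, 5] [1, 1]
//     ... use %padded ...
//   }
// ```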

#include "mlir/IR/AsmState.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;
using namespace mlir::linalg::detail;

#ifndef NDEBUG
static bool debugPrintLoopInShortForm(Operation *op) {
  AsmState state(op->getParentOfType<func::FuncOp>());
  (void)state;
  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
    forOp.getInductionVar().printAsOperand(dbgs(), state);
    dbgs() << " @ " << forOp.getOperation();
    return true;
  }
  return false;
}
#endif

static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
                                   [](Operation *op) {
                                     dbgs() << "\n";
                                     DBGS() << "----";
                                     if (debugPrintLoopInShortForm(op)) {
                                       dbgs() << "\n";
                                       return;
                                     }
                                     dbgs() << *op << "\n";
                                   });
             DBGS() << "\n";);
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(DBGS() << "loops: ";
               debugPrintLoopInShortForm(outermostEnclosingForOp);
               dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}
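
// For example (names assumed), given a pad nested as
//   scf.for %i { scf.for %j { ... tensor.pad ... } }
// and nLevels = 2, `reverseEnclosingLoops` is filled innermost-first:
// [loop %j, loop %i], so `.back()` is the outermost loop to hoist above.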

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
                       SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (outermostEnclosingForOp != untilLoop &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(DBGS() << "loops: ";
               debugPrintLoopInShortForm(outermostEnclosingForOp);
               dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

// Get all the ops in the backwards slice starting from `padOp` and that
// are dominated by the outermost enclosing loop.
// This also requires tracking ops defining values used in the region but
// defined above.
static void computeBackwardSlice(tensor::PadOp padOp,
                                 scf::ForOp outermostEnclosingForOp,
                                 SetVector<Operation *> &backwardSlice) {
  DominanceInfo domInfo(outermostEnclosingForOp);
  BackwardSliceOptions sliceOptions;
  sliceOptions.filter = [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op) &&
           !padOp->isProperAncestor(op);
  };
  sliceOptions.inclusive = true;

  // First, add the ops required to compute the region to the backwardSlice.
  SetVector<Value> valuesDefinedAbove;
  getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
                            valuesDefinedAbove);
  for (Value v : valuesDefinedAbove) {
    getBackwardSlice(v, &backwardSlice, sliceOptions);
  }
  // Then, add the backward slice from padOp itself.
  getBackwardSlice(padOp.getOperation(), &backwardSlice, sliceOptions);
}

//===----------------------------------------------------------------------===//
// HoistPaddingAnalysis Implementation.
//===----------------------------------------------------------------------===//

namespace {
/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
/// 1. Pad op has a use that is not an input of a LinalgOp.
/// 2. Pad op does not have a constant padding value.
/// 3. There is no immediately enclosing scf::ForOp.
/// 4. The backward slice from the pad op to the scf::ForOp to hoist above
///    contains an unknown op with non index type operands, a region, or a
///    memory effect.
/// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
///    empty.
/// 6. The source tensor of pad op is not defined by an extract slice op.
/// 7. The source tensor of the extract slice op is not defined outside of
///    the outermost enclosing scf::ForOp.
/// 8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistPaddingAnalysis {
  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);

  bool isValid() { return valid.has_value() && valid.value(); }
  bool isInvalid() { return valid.has_value() && !valid.value(); }

  /// Footprint of the hoistedPackedTensor, computed from the packingLoops.
  SmallVector<Value> getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                 Location loc) const;

  /// Performs optional hoisting to enable hoist padding to occur. This may be
  /// necessary when `sliceOp` is not defined outside of the outermost
  /// enclosing loop we want to hoist above.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// // %source is available for packing here!
  /// scf.for %i
  ///   scf.for %j
  ///     scf.for %k
  ///       %slice = tensor.extract_slice %source [%i, %j]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  void enableHoistPadding(RewriterBase &rewriter);

  /// Common analysis builder to finalize the construction of the analysis once
  /// optional `enableHoistPadding` has run.
  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
  void finalizeHoistPaddingAnalysis();

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  std::optional<bool> valid;

  /// The padOp to hoist.
  tensor::PadOp opToHoist;

  /// Immediately enclosing loops considered for hoisting padding.
  SmallVector<scf::ForOp> reverseEnclosingLoops;

  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is
  /// used, directly or indirectly, to index the padded tensor. The method
  /// returns failure if the filtered backward slice contains an unexpected
  /// operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)   // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                            // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies();

public:
  /// The outermost loop, determined by `nLevels` above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
  tensor::ExtractSliceOp sliceOp;

  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
  scf::ForOp padConsumingForOp;
};

} // namespace

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
    : valid(std::nullopt), opToHoist(padOp) {
  // Get at most `numLoops` of immediately enclosing loops.
  getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  outermostEnclosingForOp = reverseEnclosingLoops.back();
  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
    valid = false;
    return;
  }
}

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
                                           scf::ForOp outermostEnclosingForOp)
    : valid(std::nullopt), opToHoist(padOp) {
  // Get enclosing loops until outermostEnclosingForOp.
  getEnclosingLoopsUntil(opToHoist, outermostEnclosingForOp,
                         reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  this->outermostEnclosingForOp = reverseEnclosingLoops.back();
  if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
    LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
    valid = false;
    return;
  }
  sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
    valid = false;
    return;
  }
}

void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
  if (isInvalid())
    return;
  // If the padded data is not yet available before entering the outermost
  // enclosing loop, try to apply hoisting on this outermost loop.
  // TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
    outermostEnclosingForOp = cast<scf::ForOp>(
        hoistLoopInvariantSubsets(rewriter, outermostEnclosingForOp));
  }
}

void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
  if (isInvalid())
    return;

  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
    LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
                      << outermostEnclosingForOp << "\n"
                      << "--sliceOp: " << sliceOp << "\n"
                      << "--sliceOp.getSource(): " << sliceOp.getSource()
                      << "\n");
    LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
    valid = false;
    return;
  }
  if (sliceOp->hasOneUse()) {
    padConsumingForOp = dyn_cast<scf::ForOp>(*(sliceOp->getUsers().begin()));
  }

  // Check that the region of `padOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = opToHoist.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
    valid = false;
    return;
  }

  computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
  if (backwardSlice.size() <= 1) {
    valid = false;
    return;
  }

  debugPrintBackwardSlice(backwardSlice);
  // Remove all ops in the backward slice that are not used to index
  // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
  // the loop and affine operations used for the index computation.
  if (failed(dropNonIndexDependencies())) {
    LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
    valid = false;
    return;
  }
  debugPrintBackwardSlice(backwardSlice);

  // Add only the loops part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded
  // data and consequently access the same data in every loop
  // iteration. Adding them to the packing loops would increase the
  // cache footprint of the packed data by storing the same data
  // multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);

  // TODO: for multiple loops we need to track the use to the innermost loop.
  if (packingLoops.size() > 1 && padConsumingForOp) {
    LLVM_DEBUG(DBGS() << "--Cannot hoist multiple loops through iter_args -> "
                         "Downgrade to 1 loop\n");
    packingLoops.resize(1);
  }

  // Note: at this point, packing loops may be empty but we would still like
  // to hoist the padding if so specified.

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand
  // is an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)   // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                            // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `opToHoist` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == opToHoist || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result
    // is used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check the operands of the remaining operations all have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << op << " -> Skip\n");
        return failure();
      }
      // Check the remaining operations do not have regions or memory effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << op << " -> Skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception are constant operations that may be used by `opToHoist`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                  Location loc) const {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
        rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(),
        /*stopCondition=*/
        [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
          if (v == forOp.getUpperBound())
            return false;
          // Compute a bound that is independent of any affine op results.
          Operation *op = v.getDefiningOp();
          if (!op)
            return true;
          return !isa<affine::AffineMinOp, affine::AffineMaxOp,
                      affine::AffineApplyOp>(op);
        },
        /*closedUB=*/true);
    assert(succeeded(loopUb) && "could not get upper bound");
    Value ubVal = getValueOrCreateConstantIndexOp(rewriter, loc, *loopUb);

    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(rewriter.getContext(), lb, ub);
    bindSymbols(rewriter.getContext(), step);
    Value res = rewriter.createOrFold<affine::AffineApplyOp>(
        loc, (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal,
                   cast<scf::ForOp>(forOp).getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}
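
// Worked example (values assumed for illustration): for a packing loop
// `scf.for %i = %c0 to %n step %c32` where %n can be bounded above by 512,
// the packed size contributed by this loop is ceilDiv(512 - 0, 32) = 16.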

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

//===----------------------------------------------------------------------===//
// buildPackingLoopNest Implementation.
//===----------------------------------------------------------------------===//

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return rewriter.createOrFold<affine::AffineApplyOp>(
      loc, (iv - lb).ceilDiv(step), ValueRange{ivVal, lbVal, stepVal});
}
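
// For example (values assumed for illustration): in a loop with lower bound
// %c3 and step %c4, an induction variable value of 11 maps to iteration
// number ceilDiv(11 - 3, 4) = 2, i.e. the third iteration (0-based).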

// Build a packing loop nest by iteratively traversing the backward slice and
// cloning the operations, iteratively stepping into the loops that we
// encounter. The implementation proceeds in a stack-like fashion:
//   1. Iteratively clone and step into the loops, pushing the
//      `hoistedPackedTensor` deeper in the stack.
//   2. At the innermost loop level, create a GenericOp if `transposeVector` is
//      non-empty.
//   3. At the innermost loop level, create an InsertSliceOp.
//   4. Iteratively pop and yield the result of the InsertSliceOp across the
//      cloned loops.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
    tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
  SmallVector<OpFoldResult> offsets, sizes, strides;
  SmallVector<Value> clonedLoopIvs, leadingHoistedPackedTensorIndexings;

  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Step 0. Populate bvm with opToHoist.getSource if relevant.
  BlockArgument bbArg = dyn_cast<BlockArgument>(opToHoist.getSource());
  while (bbArg) {
    auto forOp = dyn_cast<scf::ForOp>(bbArg.getOwner()->getParentOp());
    if (!forOp)
      break;
    if (forOp != outerLoop && !outerLoop->isAncestor(forOp))
      break;
    OpOperand &operand = *forOp.getTiedLoopInit(bbArg);
    bvm.map(bbArg, operand.get());
    bbArg = dyn_cast<BlockArgument>(operand.get());
  }

  // Step 1. iteratively clone loops and push `hoistedPackedTensor`.
  Value hoistedPackedTensor = emptyOp.getResult();
  OpBuilder::InsertionGuard g(rewriter);
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(hoistedPackedTensor) case:
    // this is the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
      if (bvm.lookupOrDefault(sliceOp.getSource()) == hoistedPackedTensor) {
        LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
        continue;
      }
    }

    // Clone all operations except loops which require special handling.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      // We are at the right insertion point within the loop nest.
      rewriter.clone(*op, bvm);
      continue;
    }

    // Create a packing loop that takes `hoistedPackedTensor` as iteration
    // argument.
    auto clonedForOp = rewriter.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), hoistedPackedTensor);

    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    // Do not insert guard here, we get deeper into the loop nest.
    rewriter.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(rewriter, outerLoop, clonedForOp);

    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingHoistedPackedTensorIndexings.push_back(
        loopIndependentIterationCount);
    hoistedPackedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Step 2. Construct offsets, sizes and strides for the innermost level of
  // the packing loop.
  int64_t nPackedLoops = clonedLoopIvs.size();
  // offsets = [clonedLoopIvs, 0 .. 0].
  offsets =
      SmallVector<OpFoldResult>{leadingHoistedPackedTensorIndexings.begin(),
                                leadingHoistedPackedTensorIndexings.end()};
  offsets.append(paddedRank, rewriter.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(1));
  for (int64_t sz : transposedTensorType.getShape()) {
    // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
    if (ShapedType::isDynamic(sz))
      return failure();
    sizes.push_back(rewriter.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
                                      rewriter.getIndexAttr(1));

  // Step 3. Optionally transpose the padded tensor.
  GenericOp maybeTransposeOp;
  Value paddedTensor = bvm.lookup(opToHoist.getResult());
  if (!transposeVector.empty()) {
    Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
        loc, transposedTensorType, hoistedPackedTensor, offsets, sizes,
        strides);
    maybeTransposeOp = makeTransposeOp(rewriter, loc, paddedTensor,
                                       outputTensor, transposeVector);
    paddedTensor = maybeTransposeOp.getResult(0);
  }

  // Innermost tensor.insert_slice and yields are optional / need loops.
  if (nPackedLoops > 0) {
    // Step 4. Create InsertSliceOp at the innermost loop level, inserting an
    // optionally transposed padded slice into the packed tensor.
    Value inserted = rewriter.create<tensor::InsertSliceOp>(
        loc, paddedTensor, hoistedPackedTensor, offsets, sizes, strides);

    // Step 5. Iteratively pop the stack and propagate the yield.
    Value valueToYield = inserted;
    for (Value iv : llvm::reverse(clonedLoopIvs)) {
      auto forOp = scf::getForInductionVarOwner(iv);
      rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
      rewriter.create<scf::YieldOp>(loc, valueToYield);
      valueToYield = forOp.getResult(0);
    }
  }

  return PackingResult{
      offsets,
      sizes,
      strides,
      clonedLoopIvs,
      leadingHoistedPackedTensorIndexings,
      maybeTransposeOp,
      cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp())};
}
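
// Illustrative sketch of the IR produced above for a pad packed under two
// loops (names and static sizes are assumed, no transpose):
// ```
//   %packed = scf.for %i ... iter_args(%arg_i = %empty) {
//     %it_i = affine.apply ...   // loop-independent iteration count for %i
//     %inner = scf.for %j ... iter_args(%arg_j = %arg_i) {
//       %it_j = affine.apply ... // loop-independent iteration count for %j
//       %slice = tensor.extract_slice ...
//       %padded = tensor.pad %slice ...
//       %ins = tensor.insert_slice %padded into
//              %arg_j[%it_i, %it_j, 0] [1, 1, 5] [1, 1, 1]
//       scf.yield %ins
//     }
//     scf.yield %inner
//   }
// ```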

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();
  LLVM_DEBUG(DBGS() << "\n";
             DBGS() << "Func:\n"
                    << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
             DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      tensor::computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType)) {
    LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
    return failure();
  }

  // Create the packed tensor<?x?x..? x transposedShape>.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
  // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto hoistedPackedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());

  // Set the insertion point right before the outer loop and start packing.
  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(outerLoop);
  SmallVector<Value> dynamicTensorSizes =
      analysis.getHoistedPackedTensorSizes(rewriter, loc);
  auto emptyOp = rewriter.create<tensor::EmptyOp>(
      loc, hoistedPackedTensorType.getShape(),
      hoistedPackedTensorType.getElementType(), dynamicTensorSizes);

  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
                                  *transposedTensorType, emptyOp, analysis);
}

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
FailureOr<PackingResult> mlir::linalg::detail::buildPackingLoopNest(
    RewriterBase &rewriter, tensor::PadOp opToHoist,
    scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
  HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
  analysis.enableHoistPadding(rewriter);
  analysis.finalizeHoistPaddingAnalysis();
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
    return failure();
  }
  IRMapping bvm;
  return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
                                  analysis);
}

//===----------------------------------------------------------------------===//
// hoistPaddingOnTensors Implementation.
//===----------------------------------------------------------------------===//

/// Return true if we can walk back the use-def chain from `extractSliceOp` to
/// expectedSource going through DestinationStyleOpInterface inits only.
/// This is a poor man's analysis that is sufficient to check that the
/// extractSliceOp matches the tensor.pad we want to hoist.
/// In the future, it will be easier to ensure this with a matching symmetric
/// tensor.unpad op.
static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
                                      Value expectedSource) {
  LLVM_DEBUG(DBGS() << "Start tracesBackToExpectedValue on: " << extractSliceOp
                    << "\n");
  LLVM_DEBUG(DBGS() << "--with extractSlice: " << extractSliceOp << "\n");
  Value source = extractSliceOp.getSource();
  LLVM_DEBUG(DBGS() << "--with starting source: " << source << "\n");
  while (source && source != expectedSource) {
    auto destOp =
        dyn_cast_or_null<DestinationStyleOpInterface>(source.getDefiningOp());
    if (!destOp)
      break;
    LLVM_DEBUG(DBGS() << "--step dest op: " << destOp << "\n");
    source = destOp.getDpsInitOperand(cast<OpResult>(source).getResultNumber())
                 ->get();
  }
  LLVM_DEBUG(DBGS() << "--final source: " << source << "\n");
  LLVM_DEBUG(DBGS() << "--expected source: " << expectedSource << "\n");
  return source == expectedSource;
}
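
// For example (names assumed), with
//   %padded = tensor.pad %arg0 ...
//   %computed = linalg.generic ... outs(%padded) -> tensor<...>
//   %extracted = tensor.extract_slice %computed ...
// tracesBackToExpectedValue(%extracted, %padded) walks from %computed to its
// DPS init %padded and returns true.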

/// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
/// iter arg), propagate the `hoistedPackedTensor` value through the same iter
/// arg.
/// TODO: for multiple loops we need to track the use to the innermost loop.
///
/// Match:
/// ```
/// %outerSliceOp = tensor.extract_slice ..
/// %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
///   %hoistedPackedTensor = tensor.pad %arg0
///   %1 = compute %hoistedPackedTensor
///   %2 = tensor.extract_slice %1
///   scf.yield %2
/// }
/// ```
///
/// and rewrite as:
/// ```
/// %outerSliceOp = tensor.extract_slice ..
/// %hoistedPackedTensor = tensor.pad %outerSliceOp
/// %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
///   %1 = compute %arg0
///   scf.yield %1
/// }
/// %2 = tensor.extract_slice %forOp
/// ```
///
/// Return null when no rewrite happened.
static tensor::ExtractSliceOp
padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
                      Value hoistedPackedTensor,
                      tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) {
  LLVM_DEBUG(DBGS() << "Start padThroughLoopIterArg on: " << forOp << "\n");
  LLVM_DEBUG(DBGS() << "--paddedValueBeforeHoisting: "
                    << paddedValueBeforeHoisting << "\n");
  OpOperand *pUse = nullptr;
  for (OpOperand &use : outerSliceOp->getUses()) {
    if (use.getOwner() == forOp) {
      assert(!pUse && "Multiple slice uses in the for loop");
      pUse = &use;
    }
  }
  assert(pUse && "No slice use in the for loop");
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp());

  unsigned iterArgNumber = forOp.getTiedLoopResult(pUse).getResultNumber();
  auto yieldingExtractSliceOp = forOp.getYieldedValues()[iterArgNumber]
                                    .getDefiningOp<tensor::ExtractSliceOp>();
  if (!yieldingExtractSliceOp)
    return tensor::ExtractSliceOp();

  // Poor man's analysis sufficient to ensure extractSlice matches tensor.pad.
  // In the future, it will be easier to ensure this with a matching symmetric
  // tensor.unpad op.
  if (!tracesBackToExpectedValue(yieldingExtractSliceOp,
                                 paddedValueBeforeHoisting))
    return tensor::ExtractSliceOp();

  SmallVector<Value> initArgs = forOp.getInitArgs();
  initArgs[iterArgNumber] = hoistedPackedTensor;
  SmallVector<Value> yieldOperands = llvm::to_vector(forOp.getYieldedValues());
  yieldOperands[iterArgNumber] = yieldingExtractSliceOp.getSource();

  int64_t numOriginalForOpResults = initArgs.size();
  LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults
                    << "\n");
  tensor::ExtractSliceOp extracted;
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointAfter(forOp);
    extracted = rewriter.create<tensor::ExtractSliceOp>(
        hoistedPackedTensor.getLoc(), hoistedPackedTensor,
        outerSliceOp.getMixedOffsets(), outerSliceOp.getMixedSizes(),
        outerSliceOp.getMixedStrides());
    rewriter.replaceAllUsesWith(forOp.getResult(iterArgNumber), extracted);
  }
  scf::ForOp newForOp = cast<scf::ForOp>(*forOp.replaceWithAdditionalYields(
      rewriter, initArgs, /*replaceInitOperandUsesInLoop=*/true,
      [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
        return yieldOperands;
      }));

  LLVM_DEBUG(DBGS() << "newForOp results: " << newForOp.getNumResults()
                    << "\n");
  LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n");
  LLVM_DEBUG(DBGS() << "with result #"
                    << numOriginalForOpResults + iterArgNumber
                    << " of forOp, giving us: " << extracted << "\n");
  rewriter.startOpModification(extracted);
  extracted.getSourceMutable().assign(
      newForOp.getResult(numOriginalForOpResults + iterArgNumber));
  rewriter.finalizeOpModification(extracted);

  LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting
                    << "\n");
  LLVM_DEBUG(DBGS() << "with region iter arg #"
                    << numOriginalForOpResults + iterArgNumber << "\n");
  rewriter.replaceAllUsesWith(
      paddedValueBeforeHoisting,
      newForOp.getRegionIterArg(numOriginalForOpResults + iterArgNumber));

  return extracted;
}

/// Produce a tensor extracted from the packingResult. This can be used as a
/// replacement for `opToHoist` in callers.
static Value replaceByPackingResult(RewriterBase &rewriter,
                                    const IRMapping &bvm,
                                    tensor::PadOp opToHoist,
                                    RankedTensorType transposedTensorType,
                                    const HoistPaddingAnalysis &analysis,
                                    const PackingResult &packingResult) {
  // The replacement occurs under a single insertion point within the original
  // loop, just before opToHoist.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(opToHoist);

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
  LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");

  scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
  ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;

  Value hoistedPackedTensor;
  SmallVector<Value> loopIterationCounts;
  SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
                                    rewriter.getIndexAttr(0));
  if (nPackedLoops > 0) {
    loopIterationCounts =
        llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
          return buildLoopIterationCount(rewriter, outerLoop,
                                         cast<scf::ForOp>(loop));
        }));
    // Assert all loop iteration counts can be computed.
    if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
      llvm_unreachable("loop independence prerequisite not met");

    // offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
    std::copy(loopIterationCounts.begin(), loopIterationCounts.end(),
              offsets.begin());
    hoistedPackedTensor =
        scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
            ->getResult(0);
  } else {
    // If no loops were created, this is just hoisting without packing.
    hoistedPackedTensor = bvm.lookup(opToHoist.getResult());
  }

  LLVM_DEBUG(DBGS() << "hoistedPackedTensor: " << hoistedPackedTensor << "\n");

  // If the consumer of `padOp` was a `forOp`, propagate through iter args.
  scf::ForOp forOp = analysis.padConsumingForOp;
  if (forOp) {
    return padThroughLoopIterArg(rewriter, opToHoist, hoistedPackedTensor,
                                 analysis.sliceOp, forOp);
  }

  // offsets = [maybe_leading_ivs, 0 .. 0].
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above)
  return rewriter.create<tensor::ExtractSliceOp>(
      loc, transposedTensorType, hoistedPackedTensor, offsets,
      packingResult.sizes, packingResult.strides);
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
    SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
             DBGS() << " by " << numLoops << " loops\n");

  HoistPaddingAnalysis analysis(opToHoist, numLoops);
  analysis.enableHoistPadding(rewriter);
  analysis.finalizeHoistPaddingAnalysis();
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
    return failure();
  }

  /// Construct the packing loop nest.
  IRMapping bvm;
  FailureOr<PackingResult> packingResult = buildPackingLoopNestImpl(
      rewriter, bvm, opToHoist, transposeVector, analysis);
  if (failed(packingResult)) {
    LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
    return failure();
  }

  if (!transposeVector.empty())
    transposeOps.push_back(packingResult->maybeTransposeOp);

  FailureOr<RankedTensorType> transposedTensorType =
      tensor::computeTransposedType(opToHoist.getResultType(), transposeVector);
  assert(succeeded(transposedTensorType) && "unexpected failure in type");

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  Value newResult =
      replaceByPackingResult(rewriter, bvm, opToHoist, *transposedTensorType,
                             analysis, *packingResult);

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  if (!transposeVector.empty()) {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointAfter(newResult.getDefiningOp());
    // Transpose the packed tensor back to the original storage order.
    Value emptyTensor = rewriter.create<tensor::EmptyOp>(
        loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
    GenericOp unTransposeOp =
        makeTransposeOp(rewriter, loc, newResult, emptyTensor, transposeVector);
    newResult = unTransposeOp.getResult(0);
    transposeOps.push_back(unTransposeOp);
  }

  LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
  LLVM_DEBUG(
      DBGS() << "After hoisting: "
             << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
             << "\n");

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = packingResult->hoistedPadOp;

  LLVM_DEBUG(DBGS() << "--SUCCESS\n");
  return newResult;
}

FailureOr<Value>
mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
                                    ArrayRef<int64_t> transposeVector,
                                    tensor::PadOp &hoistedOp,
                                    SmallVectorImpl<GenericOp> &transposeOps) {
  IRRewriter rewriter(opToHoist.getContext());
  return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
                               hoistedOp, transposeOps);
}
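
// Illustrative caller-side sketch (not part of this file; the surrounding
// `rewriter` and `padOp` are assumed to exist). The returned value is
// typically used to replace the original pad:
//
//   tensor::PadOp hoistedOp;
//   SmallVector<GenericOp> transposeOps;
//   FailureOr<Value> replacement = linalg::hoistPaddingOnTensors(
//       rewriter, padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp,
//       transposeOps);
//   if (succeeded(replacement))
//     rewriter.replaceOp(padOp, *replacement);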