#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
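These are the standard LLVM debugging hooks used by the routines below; the bracketed prefix makes this file's traces easy to filter. As a usage note (not part of the excerpts), such traces only print in builds with assertions enabled and when the debug type is selected, e.g. via `-debug-only=hoist-padding` on mlir-opt:

// Illustrative snippet only: prints "[hoist-padding] packing 3 loops" to
// llvm::dbgs() when the "hoist-padding" debug type is active.
LLVM_DEBUG(DBGS() << "packing " << numLoops << " loops\n");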
LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
                                       tensor::ExtractSliceOp sliceOp);
for (OpOperand &use : padOp.getResult().getUses()) {
  auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
  if (!linalgUser || !linalgUser.isDpsInput(&use)) {
    LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
                      << "\nthat is not an input tensor of a LinalgOp, "
                      << *(use.getOwner()) << "\n");
AsmState state(padOp->getParentOfType<func::FuncOp>());

scf::ForOp outermostEnclosingForOp = nullptr;

while (nLevels-- > 0 &&
       (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {

  outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);

  reverseEnclosingLoops.push_back(outermostEnclosingForOp);
  nextEnclosingOp = outermostEnclosingForOp->getParentOp();
if (transposeVector.empty())
  return rankedTensorType;
if (!isPermutationVector(transposeVector) ||
    transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
  return failure();

SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
                                     rankedTensorType.getShape().end());
applyPermutationToVector(transposedShape, transposeVector);

RankedTensorType transposedTensorType =
    RTTBuilder(rankedTensorType).setShape(transposedShape);
return transposedTensorType;
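As a small illustration of what applyPermutationToVector (see the index at the end of this page) does to the shape before the RTTBuilder call, with made-up values:

// Illustration with invented values: output position i receives the input
// element at position transposeVector[i].
SmallVector<int64_t> transposedShape = {2, 3, 4};
SmallVector<int64_t> transposeVector = {1, 2, 0};
applyPermutationToVector(transposedShape, transposeVector);
// transposedShape is now {3, 4, 2}; setShape(transposedShape) then yields the
// transposed RankedTensorType.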
if (reverseEnclosingLoops.empty()) {
  LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");

outermostEnclosingForOp = reverseEnclosingLoops.back();
auto sliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();

LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");

if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
  LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
Value paddingValue = padOp.getConstantPaddingValue();
if (!paddingValue ||
    !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
  LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
return domInfo.dominates(outermostEnclosingForOp, op);

if (backwardSlice.empty())
  return;
backwardSlice.insert(padOp.getOperation());

if (failed(dropNonIndexDependencies(padOp, sliceOp)))
  return;

for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
  if (backwardSlice.contains(forOp))
    packingLoops.push_back(forOp);
if (packingLoops.empty()) {
  LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
                                           tensor::ExtractSliceOp sliceOp) {

  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);

  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);

  for (Operation *op : llvm::reverse(backwardSlice)) {

    if (op == padOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);

    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);

    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);

      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << op << " -> skip\n");

      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << op << " -> skip\n");

    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);

  backwardSlice.set_subtract(operationsToRemove);
for (auto forOp : packingLoops) {

                     cast<scf::ForOp>(forOp).getStep()});
  dynamicTensorSizes.push_back(res);
}

return dynamicTensorSizes;
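The middle of the loop above is elided; it sizes one packed dimension per packing loop. A sketch of the upper-bound part follows, reconstructed from the getUpperBoundForIndex signature in the index below (the AffineMinOp materialization is an assumption); the size itself is the maximal trip count (ub - lb).ceilDiv(step), built the same way as the iteration count sketched after the next fragment:

// Sketch: derive a loop-independent upper bound for this packing loop so the
// packed tensor can be allocated outside all enclosing loops. `b` is the
// ImplicitLocOpBuilder passed to getPackedTensorSizes.
AffineMap boundMap;
SmallVector<Value> boundOperands;
getUpperBoundForIndex(cast<scf::ForOp>(forOp).getUpperBound(), boundMap,
                      boundOperands);
// Materialize the bound (AffineMinOp construction is an assumption).
Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);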
Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
      stepVal = forOp.getStep();
auto loc = forOp->getLoc();
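The rest of buildLoopIterationCount is elided. Below is a sketch of how the documented expression (iv - lb).ceilDiv(step) can be materialized with the bindDims/bindSymbols helpers listed in the index at the end of this page; the exact op and builder overload are reconstructed, so treat them as assumptions rather than the verbatim implementation:

// Sketch: current iteration number of `forOp`, i.e. (iv - lb).ceilDiv(step),
// emitted as a (folded) affine.apply over the iv, lower bound and step.
static Value buildLoopIterationCountSketch(OpBuilder &b, scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);  // iv -> d0, lb -> d1
  bindSymbols(ctx, step); // step -> s0
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  // Dimension operands come first, then symbol operands.
  return b.createOrFold<AffineApplyOp>(
      forOp->getLoc(),
      AffineMap::get(/*dimCount=*/2, /*symbolCount=*/1,
                     (iv - lb).ceilDiv(step)),
      ValueRange{ivVal, lbVal, stepVal});
}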
LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops

LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");

RankedTensorType paddedTensorType = opToHoist.getResultType();
int paddedRank = paddedTensorType.getRank();

if (failed(transposedTensorType))

llvm::append_range(packedShape, transposedTensorType->getShape());
auto packedTensorType = RankedTensorType::get(
    packedShape, transposedTensorType->getElementType());

    loc, packedTensorType.getShape(), packedTensorType.getElementType(),

clonedLoopIvs.reserve(nPackedLoops);
leadingPackedTensorIndexings.reserve(nPackedLoops);

if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))

auto forOp = dyn_cast<scf::ForOp>(op);

auto clonedForOp = b.create<scf::ForOp>(

bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
bvm.map(forOp.getResults(), clonedForOp.getResults());
assert(clonedForOp->getNumRegions() == 1);
clonedLoopIvs.push_back(clonedForOp.getInductionVar());

Value loopIndependentIterationCount =

if (!loopIndependentIterationCount)
  llvm_unreachable("loop independence prerequisite not met");
leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
packedTensor = clonedForOp.getRegionIterArgs().front();

                                       leadingPackedTensorIndexings.end());

for (int64_t sz : transposedTensorType->getShape()) {

  assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");

Value paddedTensor = bvm.lookup(opToHoist.getResult());
if (!transposeVector.empty()) {
  Value outputTensor = b.create<tensor::ExtractSliceOp>(
      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
  transposeOps.push_back(

  paddedTensor = transposeOps.back()->getResult(0);

    loc, paddedTensor, packedTensor, offsets, sizes, strides);

Value valueToYield = inserted;
for (Value iv : llvm::reverse(clonedLoopIvs)) {

  b.create<scf::YieldOp>(loc, valueToYield);
  valueToYield = forOp.getResult(0);

return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));

if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
  llvm_unreachable("loop independence prerequisite not met");

offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());

Value newResult = b.create<tensor::ExtractSliceOp>(
    loc, *transposedTensorType, packedTensor, offsets, sizes, strides);

if (!transposeVector.empty()) {

      loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
  transposeOps.push_back(

  newResult = transposeOps.back()->getResult(0);

cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp());
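For orientation, here is a hedged sketch of how a caller might drive the entry point whose body is excerpted above, using the hoistPaddingOnTensors signature listed in the index below. The padOp handle is a placeholder, and the replacement step reflects typical caller behavior rather than something this file does itself:

// Hypothetical driver: try to hoist `padOp` above two enclosing loops,
// without transposing the packed tensor.
tensor::PadOp hoistedOp;
SmallVector<linalg::GenericOp> transposeOps;
FailureOr<Value> replacement = linalg::hoistPaddingOnTensors(
    padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp, transposeOps);
if (failed(replacement))
  return; // Hoisting not possible; leave the IR unchanged.
// A caller would typically redirect users of the original pad to the new
// extract_slice result and let DCE clean up the now-dead pad.
padOp.getResult().replaceAllUsesWith(*replacement);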
static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp)
Return true if all uses of padOp are input-tensor operands of some LinalgOp.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer, scf::ForOp forOp)
Return the current iteration number in the loop (iv - lb).ceilDiv(step).
static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v)
static FailureOr< RankedTensorType > computeTransposedType(RankedTensorType rankedTensorType, ArrayRef< int64_t > transposeVector)
Returns the transposed rankedTensorType if transposeVector is non-empty.
static void getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels, SmallVector< scf::ForOp > &reverseEnclosingLoops)
Return at most nLevels of immediately enclosing scf::ForOp loops.
Base type for affine expression.
A multi-dimensional affine map. Affine maps are immutable, like Types, and they are uniqued.
This class provides management for the lifetime of the state used when printing the IR.
IntegerAttr getIndexAttr(int64_t value)
MLIRContext * getContext() const
A class for computing basic dominance information.
This class provides support for representing a failure result, or a valid value of type T.
This is a utility class for mapping one set of IR entities to another.
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without specifying the location.
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
void createOrFold(llvm::SmallVectorImpl< Value > &results, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold it.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the operation using the map that is provided (leaving them alone if no entry is present).
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold it.
This class represents an operand of an operation.
Operation is the basic unit of execution within MLIR.
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation, or nullptr if this is a top-level operation.
This is a builder type that keeps local references to arguments.
Builder & setShape(ArrayRef< int64_t > newShape)
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
Location getLoc() const
Return the location of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
void getUpperBoundForIndex(Value value, AffineMap &boundMap, SmallVectorImpl< Value > &boundOperands, bool constantRequired=false)
Computes an upper bound for the result value of an index computation.
GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor, Value outputTensor, ArrayRef< int64_t > transposeVector)
Returns a GenericOp that transposes inputTensor into outputTensor, using transposeVector to permute the tensor dimensions.
FailureOr< Value > hoistPaddingOnTensors(tensor::PadOp opToHoist, int numLoops, ArrayRef< int64_t > transposeVector, tensor::PadOp &hoistedOp, SmallVectorImpl< GenericOp > &transposeOps)
Mechanically hoist padding operations on tensors by numLoops into a new, generally larger tensor.
ForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
int64_t ceilDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's ceildiv operation on constants.
void getBackwardSlice(Operation *op, SetVector< Operation * > *backwardSlice, TransitiveFilter filter=nullptr)
Fills backwardSlice with the computed backward slice (i.e. all the transitive defs of op), without including that operation itself.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
bool isPermutationVector(ArrayRef< int64_t > interchange)
Method to check if an interchange vector is a permutation.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Analysis class to support tensor::PadOp hoisting across multiple enclosing loops.
SmallVector< Value > getPackedTensorSizes(ImplicitLocOpBuilder &b)
Footprint of the packedTensor, computed from the packingLoops.
HoistingAnalysis(tensor::PadOp padOp, int numLoops)
SmallVector< scf::ForOp > packingLoops
The scf::ForOp loops immediately enclosing padOp such that: (1) they are nested under outermostEnclosingForOp, and (2) their induction variables are used, directly or indirectly, in the computation of padOp.
scf::ForOp outermostEnclosingForOp
The outermost loop, determined by nLevels, above which padOp will be hoisted.
SetVector< Operation * > backwardSlice
Backward slice rooted at padOp and nested under outermostEnclosingForOp.
This class represents an efficient way to signal success or failure.