26 #include "llvm/Support/Debug.h"
29 #define DEBUG_TYPE "tile-using-interface"
36 auto tileSizes = llvm::to_vector(ts);
47 size_t iterationDomainSize) {
49 if (filledVector.size() < iterationDomainSize) {
50 auto range = llvm::seq<int64_t>(filledVector.size(), iterationDomainSize);
51 filledVector.append(range.begin(), range.end());
53 if (filledVector.size() > iterationDomainSize)
54 filledVector.resize(iterationDomainSize);
73 return ((sizeAsInt.value() - offsetAsInt.value()) % strideAsInt.value() == 0);
82 if (ts && ts.value() == 1)
111 assert(!loopRanges.empty() &&
"expected at least one loop range");
112 assert(loopRanges.size() == tileSizes.size() &&
113 "expected as many tile sizes as loop ranges");
116 offsets.resize(loopRanges.size());
117 sizes.resize(loopRanges.size());
125 builder, loc, tileSizes[loopRange.index()]);
129 offsets[loopRange.index()] = offset;
130 sizes[loopRange.index()] = size;
134 auto loop = builder.
create<scf::ForOp>(
139 bodyBuilder, bodyLoc, loopRange.value(), iv, tileSize);
140 builder.
create<scf::YieldOp>(loc);
142 offsets[loopRange.index()] = loop.getInductionVar();
143 loops.push_back(loop);
183 tileOffsetsList[yieldedValue.index()];
188 loc, yieldedValue.value(), newBBArgs[yieldedValue.index()],
189 tileOffsets, tileSizes, tileStrides);
190 inserts.push_back(insert);
199 rewriter.
eraseOp(loop.value());
200 loops[loop.index()] = newLoops[loop.index()];
202 return llvm::to_vector(llvm::map_range(
203 loops.front().getResults().take_back(yieldedValues.size()),
235 for (
const auto &destValue :
llvm::enumerate(tiledOpDestinationValues)) {
236 auto sliceOp = destValue.value().getDefiningOp<tensor::ExtractSliceOp>();
239 sliceOp.setOperand(0, bbArgsList[destValue.index()]);
254 tileOffsetsList, tileSizesList, loops);
255 for (
auto tiledOp : tilingResult.
tiledOps) {
256 if (
auto dstOp = dyn_cast<DestinationStyleOpInterface>(tiledOp)) {
257 auto innerMostLoop = loops.back();
259 llvm::to_vector(dstOp.getDpsInits());
261 innerMostLoop.getRegionIterArgs());
275 if (!
options.tileSizeComputationFunction) {
277 op,
"missing tile size computation function");
282 size_t numLoops = iterationDomain.size();
285 op,
"unable to tile op with no iteration domain");
293 options.tileSizeComputationFunction(rewriter, op);
294 if (tileSizeVector.size() < iterationDomain.size()) {
296 tileSizeVector.append(numLoops - tileSizeVector.size(), zero);
305 if (!
options.interchangeVector.empty()) {
307 iterationDomain.size());
309 if (!interchangeVector.empty()) {
312 op,
"invalid intechange vector, not a permutation of the entire "
324 rewriter, op.
getLoc(), iterationDomain, tileSizeVector, offsets, sizes);
326 if (!interchangeVector.empty()) {
334 if (!tilingResult.
loops.empty()) {
335 llvm::dbgs() <<
"LoopNest shell :\n";
336 tilingResult.loops.front().dump();
337 llvm::dbgs() <<
"\n";
342 if (!tilingResult.
loops.empty())
344 tilingResult.
loops.back().getBody()->getTerminator());
346 op.getTiledImplementation(rewriter, offsets, sizes);
347 tilingResult.
tiledOps.append(tiledImplementation->tiledOps);
355 if (tilingResult.
loops.empty()) {
356 tilingResult.
replacements = tiledImplementation->tiledValues;
365 resultSizesList(numResults);
367 if (
failed(op.getResultTilePosition(rewriter, result.index(), offsets,
369 resultOffsetsList[result.index()],
370 resultSizesList[result.index()]))) {
372 op,
"failed to get slice of result produced");
378 destinationTensors)))
382 rewriter, destinationTensors, tiledImplementation.value(),
383 resultOffsetsList, resultSizesList, tilingResult.
loops);
386 if (!tilingResult.
loops.empty()) {
387 llvm::dbgs() <<
"After tiled implementation :\n";
388 tilingResult.loops.front().dump();
389 llvm::dbgs() <<
"\n";
397 PartialReductionOpInterface op,
402 auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());
404 auto tileSizesVector = llvm::to_vector(tileSizes);
405 if (tileSizesVector.size() < iterationDomain.size()) {
407 tileSizesVector.append(iterationDomain.size() - tileSizesVector.size(),
412 op,
"don't support ops with multiple results for now");
414 tilingInterfaceOp.getLoopIteratorTypes();
417 for (
auto [idx, iteratorType] :
419 if (iteratorType == utils::IteratorType::reduction)
420 reductionDims.push_back(idx);
425 op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector,
427 if (
failed(identityTensor))
429 "cannot create a tensor of identity value.");
433 b, loc, iterationDomain, tileSizesVector, offsets, sizes);
437 Operation *parallelOp = op.tileToPartialReduction(
438 b, loc, (*identityTensor)->getResults(), offsets, sizes, reductionDims);
441 for (
size_t i = 0; i < offsets.size(); i++)
442 resultSizesList.push_back(
446 b, (*identityTensor)->getResults(), parallelOp->
getResults(), outOffsets,
447 resultSizesList, loops);
449 auto dstOp = cast<DestinationStyleOpInterface>(parallelOp);
450 auto innerMostLoop = loops.back();
452 assert(destinationTensors.size() ==
453 innerMostLoop.getRegionIterArgs().size() &&
454 "unexpected number of outputs");
456 innerMostLoop.getRegionIterArgs());
460 Operation *mergeOp = op.mergeReductions(b, loc, replacements, reductionDims);
465 results.
loops = std::move(loops);
479 static std::tuple<OpResult, std::optional<OpOperand *>>
482 std::optional<OpOperand *> destinationIterArg;
483 auto loopIt = loops.rbegin();
484 while (
auto iterArg = dyn_cast<BlockArgument>(source->
get())) {
485 scf::ForOp loop = *loopIt;
486 if (iterArg.getOwner()->getParentOp() != loop)
488 source = &loop.getOpOperandForRegionIterArg(iterArg);
491 if (loopIt == loops.rend())
492 destinationIterArg = source;
493 return {dyn_cast<OpResult>(source->
get()), destinationIterArg};
498 std::optional<scf::SCFFuseProducerOfSliceResult>
500 tensor::ExtractSliceOp candidateSliceOp,
504 auto [fusableProducer, destinationInitArg] =
507 if (!fusableProducer)
516 if (
failed(tileAndFuseResult))
519 tileAndFuseResult->tiledValues[0]);
570 scf::ForOp outerMostLoop = loops.front();
571 if (destinationInitArg &&
572 (*destinationInitArg)->getOwner() == outerMostLoop) {
573 unsigned iterArgNumber =
574 outerMostLoop.getResultForOpOperand(**destinationInitArg)
576 int64_t resultNumber = fusableProducer.getResultNumber();
578 dyn_cast<DestinationStyleOpInterface>(fusableProducer.getOwner())) {
579 (*destinationInitArg)
580 ->set(dstOp.getTiedOpOperand(fusableProducer)->get());
582 for (
auto tileAndFusedOp : tileAndFuseResult->tiledOps) {
583 auto dstOp = dyn_cast<DestinationStyleOpInterface>(tileAndFusedOp);
586 scf::ForOp innerMostLoop = loops.back();
588 rewriter, dstOp.getDpsInitOperand(resultNumber)->get(),
589 innerMostLoop.getRegionIterArgs()[iterArgNumber]);
593 tileAndFuseResult->tiledValues[0],
594 tileAndFuseResult->tiledOps};
602 auto [fusableProducer, fusedProducerValue, tileAndFusedOps] =
606 rewriter, fusableProducer.getOwner()->getLoc(), fusableProducer);
612 resultOffsets, resultSizes, loops);
614 for (
auto tileAndFusedOp : tileAndFusedOps) {
615 auto dstStyleProducer =
616 dyn_cast<DestinationStyleOpInterface>(tileAndFusedOp);
617 if (!dstStyleProducer)
620 dstStyleProducer.getDpsInitOperand(fusableProducer.getResultNumber())
623 rewriter, dstValue, loops.back().getRegionIterArgs().back());
634 if (!consumer->getNumResults()) {
636 consumer,
"invalid pattern for op with no results");
641 llvm::SmallDenseMap<Value, int64_t> yieldedValueToResultNumber;
647 for (
auto *tiledOp : tilingResult->tiledOps)
649 tileAndFuseResult.
loops = std::move(tilingResult->loops);
651 llvm::zip(consumer->getResults(), tilingResult->replacements))) {
652 tileAndFuseResult.
replacements[std::get<0>(result.value())] =
653 std::get<1>(result.value());
654 yieldedValueToResultNumber[tilingResult->tiledOps.back()->getResult(
655 result.index())] = result.index();
660 if (tileAndFuseResult.
loops.empty())
661 return tileAndFuseResult;
670 auto addCandidateSlices = [](
Operation *fusedOp,
671 std::deque<tensor::ExtractSliceOp> &candidates) {
673 if (
auto sliceOp = operand.getDefiningOp<tensor::ExtractSliceOp>())
674 candidates.push_back(sliceOp);
677 std::deque<tensor::ExtractSliceOp> candidates;
680 while (!candidates.empty()) {
682 tensor::ExtractSliceOp candidateSliceOp = candidates.front();
683 candidates.pop_front();
688 std::optional<scf::SCFFuseProducerOfSliceResult> fusedProducer =
690 tileAndFuseResult.
loops);
695 fusedProducer->tiledAndFusedProducer.getDefiningOp()) {
697 addCandidateSlices(tiledAndFusedOp, candidates);
700 return tileAndFuseResult;
709 TilingInterface op) {
713 op,
"unable to lower to loops operations with return values");
720 for (
auto loopRange : domain) {
727 auto loop = rewriter.
create<scf::ForOp>(op.
getLoc(), offsetVal, sizeVal,
729 loops.push_back(loop);
730 ivs.push_back(loop.getInductionVar());
733 if (
failed(op.generateScalarImplementation(rewriter, op.
getLoc(), ivs))) {
static llvm::ManagedStatic< PassManagerOptions > options
static void updateDestinationOperandsForTiledOp(OpBuilder &builder, ValueRange tiledOpDestinationValues, ValueRange bbArgsList)
If the tiled operation is destination passing style, update the slice of the destination used (which ...
static SmallVector< int64_t > fillInterchangeVector(ArrayRef< int64_t > interchangeVector, size_t iterationDomainSize)
Helper method to adjust the interchange vector to match the iteration domain.
static SmallVector< scf::ForOp > generateTileLoopNest(OpBuilder &builder, Location loc, ArrayRef< Range > loopRanges, ArrayRef< OpFoldResult > tileSizes, SmallVector< OpFoldResult > &offsets, SmallVector< OpFoldResult > &sizes)
Generate an empty loop nest that represents the tiled loop nest shell.
static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, Range loopRange, Value iv, Value tileSize)
Returns the bounded tile size given the current iv, loopRange and tileSize, i.e., min(tileSize,...
static SmallVector< Value > yieldTiledValues(RewriterBase &rewriter, ValueRange initValues, ValueRange yieldedValues, ArrayRef< SmallVector< OpFoldResult >> tileOffsetsList, ArrayRef< SmallVector< OpFoldResult >> tileSizesList, MutableArrayRef< scf::ForOp > loops)
For a value to be yielded (yieldedValue) from within a loop nest loops, construct the destructive upd...
static std::tuple< OpResult, std::optional< OpOperand * > > getUntiledProducerFromSliceSource(OpOperand *source, ArrayRef< scf::ForOp > loops)
Return the untiled producer whose slice is used in a tiled consumer.
static bool tileDividesIterationDomain(Range loopRange)
Base type for affine expression.
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
IntegerAttr getIndexAttr(int64_t value)
MLIRContext * getContext() const
This class provides support for representing a failure result, or a valid value of type T.
IRValueT get() const
Return the current value being used by this operand.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents a single result from folding an operation.
This class represents an operand of an operation.
This is a value defined by a result of an operation.
Operation is the basic unit of execution within MLIR.
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Location getLoc()
The source location the operation was defined or derived from.
operand_range getOperands()
Returns an iterator on the underlying Value's.
result_range getResults()
unsigned getNumResults()
Return the number of results held by this operation.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the rewriter that the IR failed to be rewritten because of a match failure,...
virtual void replaceOp(Operation *op, ValueRange newValues)
This method replaces the results of the operation with the specified list of values.
void replaceAllUsesWith(Value from, Value to)
Find uses of from and replace them with to.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
FailureOr< scf::SCFReductionTilingResult > tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > tileSize)
Method to tile a reduction and generate a parallel op within a serial loop.
FailureOr< SmallVector< scf::ForOp > > lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op)
Method to lower an op that implements the TilingInterface to loops/scalars.
std::optional< SCFFuseProducerOfSliceResult > tileAndFuseProducerOfSlice(RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp, MutableArrayRef< scf::ForOp > loops)
Implementation of fusing producer of a single slice by computing the slice of the producer in-place.
void yieldReplacementForFusedProducer(RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp, scf::SCFFuseProducerOfSliceResult fusedProducerInfo, MutableArrayRef< scf::ForOp > loops)
Reconstruct the fused producer from within the tiled-and-fused code.
FailureOr< SCFTilingResult > tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op, const SCFTilingOptions &options)
Method to tile an op that implements the TilingInterface using scf.for for iterating over the tiles.
FailureOr< SCFTileAndFuseResult > tileConsumerAndFuseProducerGreedilyUsingSCFForOp(RewriterBase &rewriter, TilingInterface consumer, const SCFTileAndFuseOptions &options)
Method to tile and fuse a sequence of operations, by tiling the consumer and fusing its producers.
FailureOr< TilingResult > replaceExtractSliceWithTiledProducer(OpBuilder &builder, tensor::ExtractSliceOp sliceOp, OpResult producerOp)
Pattern to swap an tensor.extract_slice with its producer when the producer implements the TilingInte...
OpFoldResult getMixedSize(OpBuilder &builder, Location loc, Value value, int64_t dim)
Return the dimension of the given tensor value.
FailureOr< Value > getOrCreateDestination(OpBuilder &b, Location loc, OpResult opResult)
This is a helper function for DestinationStyleOpInterface.
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
This header declares functions that assist transformations in the MemRef dialect.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particula...
bool succeeded(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
detail::constant_int_predicate_matcher m_Zero()
Matches a constant scalar / vector splat / tensor splat integer zero.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
SmallVector< scf::ForOp > replaceLoopNestWithNewYields(OpBuilder &builder, ArrayRef< scf::ForOp > loopNest, ValueRange newIterOperands, const NewYieldValueFn &newYieldValueFn, bool replaceIterOperandsUsesInLoop=true)
Update a perfectly nested loop nest to yield new values from the innermost loop and propagating it up...
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
std::function< SmallVector< Value >(OpBuilder &b, Location loc, ArrayRef< BlockArgument > newBBArgs)> NewYieldValueFn
Replace the loop with newIterOperands added as new initialization values.
bool isPermutationVector(ArrayRef< int64_t > interchange)
Method to check if an interchange vector is a permutation.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
SmallVector< int64_t > invertPermutationVector(ArrayRef< int64_t > permutation)
Helper method to apply to inverse a permutation.
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Container for result values of tiling.
SmallVector< Value > tiledValues
SmallVector< Operation * > tiledOps
Fuse the producer of the source of candidateSliceOp by computing the required slice of the producer i...
Transformation information returned after reduction tiling.
Operation * parallelTiledOp
The partial reduction tiled op generated.
Operation * mergeOp
The final reduction operation merging all the partial reductions.
SmallVector< scf::ForOp > loops
The scf.for operations that iterate over the tiles.
Operation * initialOp
Initial op.
Options used to control tile + fuse.
Transformation information returned after tile and fuse.
SmallVector< scf::ForOp > loops
The scf.for operations that iterate over the tiles.
llvm::DenseMap< Value, Value > replacements
The replacement values to use for the tiled and fused operations.
llvm::SetVector< Operation * > tiledAndFusedOps
List of tiled and fused operations generated.
Options to use to control tiling.
SCFTileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes for each operation.
SCFTilingOptions & setTileSizes(ArrayRef< OpFoldResult > ts)
Convenience function to set the tileSizeComputationFunction to a function that computes tile sizes at...
Transformation information returned after tiling.
SmallVector< Operation * > tiledOps
Tiled operations that are generated during tiling.
SmallVector< scf::ForOp > loops
The scf.for operations that iterate over the tiles.
SmallVector< Value > replacements
Values to use as replacements for the untiled op.