32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/Support/CommandLine.h"
37 #define GEN_PASS_DEF_LINALGTILINGPASS
38 #include "mlir/Dialect/Linalg/Passes.h.inc"
46 #define DEBUG_TYPE "linalg-tiling"
60 for (
int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) {
62 static_cast<int64_t
>(0)) {
63 shapeSizes.erase(shapeSizes.begin() + idx - zerosCount);
64 tileSizes.erase(tileSizes.begin() + idx - zerosCount);
68 loopIndexToRangeIndex[idx] = idx - zerosCount;
73 for (
unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx)
75 return std::make_tuple(res, loopIndexToRangeIndex);
83 auto rangeIndex = loopIndexToRangeIndex.find(en.index());
84 if (rangeIndex == loopIndexToRangeIndex.end())
86 en.value() = ivs[rangeIndex->second];
96 if (
auto attr = llvm::dyn_cast_if_present<Attribute>(value)) {
97 assert(cast<IntegerAttr>(attr).getValue().isStrictlyPositive() &&
98 "expected strictly positive tile size and divisor");
103 Value condition = b.
create<arith::CmpIOp>(arith::CmpIPredicate::sgt,
104 value.get<
Value>(), zero);
107 b.
getStringAttr(
"expected strictly positive tile size and divisor"));
112 int64_t targetSize, int64_t divisor) {
113 assert(!op.hasDynamicShape() &&
114 "cannot compute static multi-tile sizes for an op with dynamic shape");
115 assert(targetSize > 0 &&
"target size must be non-negative");
116 assert(divisor > 0 &&
"divisor must be non-negative");
117 assert(dimension < op.getNumLoops() &&
"dimension overflow");
120 int64_t tripCount = op.getStaticLoopRanges()[dimension];
121 int64_t a = tripCount / divisor;
122 int64_t t = (targetSize + divisor - 1) / divisor;
123 int64_t totalTripCount = (a + t - 1) / t;
141 if (dimension >= op.getNumLoops())
147 if (emitAssertions) {
151 Value targetSizeValue =
158 op.createFlatListOfOperandDims(b, b.
getLoc());
159 AffineMap shapesToLoops = op.getShapesToLoopsMap();
174 Value t = apply((s0 + s1 - 1).
floorDiv(s1), {targetSizeValue, divisorValue});
177 Value v = apply(s0 % s1, {a, d});
178 Value u = apply(s0 - s1, {d, v});
182 spec.highTileSize = apply(s0 + s1, {s, divisorValue});
183 spec.lowTripCount = u;
184 spec.highTripCount = v;
190 if (emitAssertions) {
193 apply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
194 spec.highTileSize, spec.highTripCount});
195 Value equals = b.
create<arith::CmpIOp>(arith::CmpIPredicate::eq,
196 coveredSize, tripCount);
199 "could not compute dynamic multi-size tile shapes"));
213 if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
215 return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
239 bool omitTileOffsetBoundsCheck,
246 ValueRange threadIds = forallOp.getInductionVars();
248 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
251 int64_t nLoops = loopRanges.size();
252 tiledOffsets.reserve(nLoops);
253 tiledSizes.reserve(nLoops);
254 for (
unsigned loopIdx = 0, threadIdIdx = 0; loopIdx < nLoops; ++loopIdx) {
255 bool overflow = loopIdx >= numThreads.size();
258 if (overflow || isZero) {
259 tiledOffsets.push_back(loopRanges[loopIdx].offset);
260 tiledSizes.push_back(loopRanges[loopIdx].size);
274 nominalTileSizes.has_value()
275 ? (*nominalTileSizes)[loopIdx]
282 b, loc, i +
j * m, {offset, threadId, tileSizePerThread});
285 b, loc, i +
j * m - n,
286 {offset, nonZeroNumThreads[threadIdIdx], tileSizePerThread, size});
289 b, loc, -i + m, {offsetPerThread, size});
291 buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
294 tiledOffsets.push_back(offsetPerThread);
296 if (!omitTileOffsetBoundsCheck &&
298 nonZeroNumThreads[threadIdIdx], size))
302 tiledSizes.push_back(tileSizePerThread);
314 auto iterators = linalgOp.getIteratorTypesArray();
317 for (
unsigned i = 0, e = numThreads.size(); i != e; i++) {
318 if (
auto attr = llvm::dyn_cast_if_present<Attribute>(numThreads[i])) {
319 if (cast<IntegerAttr>(attr).getValue().getSExtValue() > 1) {
320 safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
323 safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
346 std::optional<ArrayAttr> mapping,
bool omitTileOffsetBoundsCheck) {
351 if (loopRanges.empty())
352 return op->
emitOpError(
"expected non-empty loop ranges");
354 if (llvm::any_of(loopRanges, hasStrideOne))
355 return op->
emitOpError(
"only stride-1 supported atm");
360 return op->
emitOpError(
"failed to get destination tensors");
363 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
367 llvm::to_vector(llvm::map_range(nonZeroNumThreads, [&](
OpFoldResult ofr) {
371 LinalgOp linalgOp = dyn_cast<LinalgOp>(op.getOperation());
376 for (
size_t i = 0; i < tilingSafety.size(); i++)
377 if (!tilingSafety[i])
378 op.
emitWarning() <<
"tiling is not thread safe at axis #" << i;
384 scf::ForallOp forallOp = b.
create<scf::ForallOp>(
390 omitTileOffsetBoundsCheck, nominalTileSizes,
391 tiledOffsets, tiledSizes);
403 auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
404 if (destinationStyleOp) {
405 for (
OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) {
408 if (isa<TensorType>(outOperand.get().getType())) {
409 auto *it = llvm::find(dest, outOperand.get());
410 assert(it != dest.end() &&
"could not find destination tensor");
411 unsigned destNum = std::distance(dest.begin(), it);
412 outOperand.set(destBbArgs[destNum]);
419 cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
422 return clonedOp->
emitError(
"Failed to tile op: ");
423 if (tilingResult->tiledOps.size() != 1) {
424 return clonedOp->
emitError(
"expected a single produced tiled op, got ")
425 << tilingResult->tiledOps.size();
429 tiledOp = tilingResult->tiledOps.front();
430 tiledValues = tilingResult->tiledValues;
434 for (
auto it : llvm::zip(llvm::seq(
unsigned(0),
unsigned(dest.size())),
435 tiledValues, destBbArgs)) {
441 if (
failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets,
442 tiledSizes, resultOffsets,
444 return op->
emitOpError(
"output offsets couldn't be calculated");
450 b.
create<tensor::ParallelInsertSliceOp>(loc, std::get<1>(it),
451 std::get<2>(it), resultOffsets,
452 resultSizes, strides);
460 std::optional<ArrayAttr> mapping) {
462 std::nullopt, mapping,
469 std::optional<ArrayAttr> mapping) {
471 unsigned nLoops = loopRanges.size();
473 numThreads.reserve(nLoops);
477 for (
const auto &it : llvm::zip(tileSizes, loopRanges)) {
481 b, op.
getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
482 numThreads.push_back(numTiles);
489 template <
typename LoopTy>
495 auto nLoops = op.getNumLoops();
497 tileSizes = tileSizes.take_front(nLoops);
503 tiledOp.
op = cast<LinalgOp>(b.
clone(*op.getOperation()));
505 tiledOp.
op->result_end());
511 op.createFlatListOfOperandDims(b, op.
getLoc());
512 AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
513 if (!shapeSizesToLoopsMap)
517 b, op.
getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);
520 for (
const auto &attr :
enumerate(op.getIteratorTypesArray())) {
521 if (loopIndexToRangeIndex.count(attr.index()))
522 iteratorTypes.push_back(attr.value());
526 auto invPermutationMap =
528 if (!
options.interchangeVector.empty()) {
532 interchangeVector.reserve(
options.interchangeVector.size());
533 for (
auto pos :
options.interchangeVector) {
534 auto it = loopIndexToRangeIndex.find(pos);
535 if (it == loopIndexToRangeIndex.end())
537 interchangeVector.push_back(it->second);
543 assert(invPermutationMap);
545 interchangeVector.end());
555 iteratorTypes.size(),
562 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
564 auto returnedProcInfo =
565 options.distribution->procInfo(b, op.
getLoc(), parallelLoopRanges);
566 unsigned procIdIdx = 0;
571 procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
578 auto tiledLoopBodyBuilder =
581 ivs.assign(localIvs.begin(), localIvs.end());
587 if (!
options.interchangeVector.empty()) {
588 for (
AffineExpr result : invPermutationMap.getResults())
589 interchangedIvs.push_back(
590 ivs[cast<AffineDimExpr>(result).getPosition()]);
592 interchangedIvs.assign(ivs.begin(), ivs.end());
597 assert(operandValuesToUse.size() ==
599 "expect the number of operands and inputs and outputs to match");
611 res =
clone(b, op, resultTensorTypes, tiledOperands);
617 tiledLoopBodyBuilder, procInfo);
624 loops.reserve(ivs.size());
625 for (
auto iv : ivs) {
626 if (isa<BlockArgument>(iv)) {
627 loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
628 assert(loops.back() &&
"no owner found for induction variable!");
632 loops.push_back(
nullptr);
640 if ((outermostLoop = loop))
644 res, loops, outermostLoop ? outermostLoop->
getResults() : tensorResults};
650 std::optional<ArrayAttr> mapping) {
657 auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());
663 auto destinationStyleOp =
664 dyn_cast<DestinationStyleOpInterface>(op.getOperation());
665 if (!destinationStyleOp)
669 auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
676 op,
"don't support ops with multiple results for now");
679 tilingInterfaceOp.getLoopIteratorTypes();
681 linalgOp.getReductionDims(redDims);
682 if (redDims.size() != 1)
684 op,
"only support ops with one reduction dimension.");
685 if (!tileSizes.empty() && tileSizes.size() != numThreads.size())
687 "many elements as number of threads");
688 int reductionDim =
static_cast<int>(redDims.front());
690 if (redDims.front() >= numThreads.size())
692 op,
"reduction dimension must be mapped to threads");
696 op.generateInitialTensorForPartialReduction(b, loc, numThreads,
698 if (
failed(identityTensor))
700 "cannot create a tensor of identity value.");
710 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
717 scf::ForallOp forallOp = b.
create<scf::ForallOp>(
719 (*identityTensor)->getResults(), mapping);
726 std::nullopt, tiledOffsets,
739 for (
Value initOperand : destinationStyleOp.getDpsInits()) {
740 auto *it = llvm::find(dest, initOperand);
741 assert(it != dest.end() &&
"dest operand not found in dest");
742 unsigned destNum = std::distance(dest.begin(), it);
748 outOffsets[reductionDim] = forallOp.getInductionVars().front();
750 tiledDpsInitOperands.push_back(b.
create<tensor::ExtractSliceOp>(
751 loc, cast<RankedTensorType>(initOperand.getType()),
752 destBbArgs[destNum], outOffsets, sizes, strides));
760 for (
auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
761 cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),
762 tiledDpsInitOperands)) {
763 initOperandPtr.set(tiledInitValue);
768 if (tileSizes.empty()) {
770 cast<TilingInterface>(clonedOp).getTiledImplementation(
771 b, tiledOffsets, tiledSizes);
773 return clonedOp->
emitError(
"Failed to tile op: ");
774 if (tilingResult->tiledOps.size() != 1) {
775 return clonedOp->
emitError(
"expected a single produced tiled op, got ")
776 << tilingResult->tiledOps.size();
778 tiledOp = tilingResult->tiledOps.front();
779 tilingResults = tilingResult->tiledValues;
783 b, cast<LinalgOp>(clonedOp), tileSizes,
options);
789 materializedNonZeroNumThreads);
790 if (maybeTiled->loops.size() != 1) {
791 return clonedOp->
emitError(
"expected a single produced loop");
793 tiledOp = maybeTiled->op;
794 tilingResults = maybeTiled->loops.front()->getResults();
801 for (
auto [index, result, bbArg] : llvm::zip(
802 llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
808 if (
failed(tilingInterfaceOp.getResultTilePosition(
809 b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))
810 return op->
emitOpError(
"output offsets couldn't be calculated");
814 for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
815 if (i == reductionDim) {
816 resultOffsetsRank.push_back(forallOp.getInductionVars().front());
820 resultOffsetsRank.push_back(resultOffsets[offIdx++]);
821 resultSizesRank.push_back(resultSizes[sizeIdx++]);
829 b.
create<tensor::ParallelInsertSliceOp>(
830 loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
836 op.mergeReductions(b, loc, forallOp->getResults(), reductionDim);
842 results.
loops = forallOp;
848 template <
typename LoopTy>
854 if (!
options.tileSizeComputationFunction)
860 auto nLoops = op.getNumLoops();
863 if (tileSizeVector.size() < nLoops) {
864 tileSizeVector.append(nLoops - tileSizeVector.size(), b.
getIndexAttr(0));
867 return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector,
options);
875 return tileLinalgOpImpl<scf::ForOp>(b, op,
options);
876 case LinalgTilingLoopType::ParallelLoops:
877 return tileLinalgOpImpl<scf::ParallelOp>(b, op,
options);
885 template <
typename... OpTypes>
886 class CanonicalizationPatternList;
889 class CanonicalizationPatternList<> {
894 template <
typename OpTy,
typename... OpTypes>
895 class CanonicalizationPatternList<OpTy, OpTypes...> {
898 OpTy::getCanonicalizationPatterns(patterns, patterns.
getContext());
899 CanonicalizationPatternList<OpTypes...>::insert(patterns);
914 affine::AffineApplyOp::getCanonicalizationPatterns(patterns, ctx);
915 affine::AffineForOp::getCanonicalizationPatterns(patterns, ctx);
916 affine::AffineMinOp::getCanonicalizationPatterns(patterns, ctx);
917 affine::AffineMaxOp::getCanonicalizationPatterns(patterns, ctx);
918 arith::ConstantIndexOp::getCanonicalizationPatterns(patterns, ctx);
920 memref::SubViewOp::getCanonicalizationPatterns(patterns, ctx);
921 memref::ViewOp::getCanonicalizationPatterns(patterns, ctx);
923 scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
924 scf::ParallelOp::getCanonicalizationPatterns(patterns, ctx);
926 tensor::CastOp::getCanonicalizationPatterns(patterns, ctx);
927 tensor::EmptyOp::getCanonicalizationPatterns(patterns, ctx);
928 tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, ctx);
929 tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, ctx);
930 tensor::PadOp::getCanonicalizationPatterns(patterns, ctx);
931 ctx->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(patterns);
933 CanonicalizationPatternList<
935 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
static llvm::ManagedStatic< PassManagerOptions > options
SmallVector< bool > safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp, ArrayRef< OpFoldResult > numThreads)
Returns a vector of bools representing whether, for each axis, op can be tiled without incurring a race...
static FailureOr< ForallTilingResult > tileToForallOpImpl(RewriterBase &b, TilingInterface op, ArrayRef< OpFoldResult > numThreads, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, std::optional< ArrayAttr > mapping, bool omitTileOffsetBoundsCheck)
Rewrite a TilingInterface op to a tiled scf.forall.
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize)
Returns true if the maximum tile offset tileSize * (numThreads - 1) is less than iterationSize.
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, OpFoldResult value)
Asserts that the given index-typed value is strictly positive.
static OpFoldResult buildMax(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_max of all the vals.
static void calculateTileOffsetsAndSizes(RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef< OpFoldResult > numThreads, SmallVector< Range > loopRanges, bool omitTileOffsetBoundsCheck, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, SmallVector< OpFoldResult > &tiledOffsets, SmallVector< OpFoldResult > &tiledSizes)
Fill out the tiledOffsets and tiledSizes to be used to tile to a given number of threads.
static FailureOr< TiledLinalgOp > tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef< OpFoldResult > tileSizes, const LinalgTilingOptions &options)
static OpFoldResult buildMin(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_min of all the vals.
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
AffineExpr ceilDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
unsigned getNumResults() const
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
IntegerAttr getIndexAttr(int64_t value)
AffineExpr getAffineSymbolExpr(unsigned position)
StringAttr getStringAttr(const Twine &bytes)
MLIRContext * getContext() const
This class provides support for representing a failure result, or a valid value of type T.
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Location getLoc() const
Accessors for the implied location.
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext * getContext() const
Return the context this location is uniqued in.
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents a single result from folding an operation.
This class represents an operand of an operation.
Operation is the basic unit of execution within MLIR.
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Location getLoc()
The source location the operation was defined or derived from.
unsigned getNumOperands()
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
result_range getResults()
InFlightDiagnostic emitOpError(const Twine &message={})
Emit an error with the op name prefixed, like "'dim' op " which is convenient for verifiers.
unsigned getNumResults()
Return the number of results held by this operation.
MLIRContext * getContext() const
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
SmallVector< OpFoldResult > makeComposedFoldedMultiResultAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Variant of makeComposedFoldedAffineApply suitable for multi-result maps.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands,...
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
FailureOr< ForallTilingResult > tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, ArrayRef< OpFoldResult > tileSizes, std::optional< ArrayAttr > mapping)
Same as tileToForallOp, but calculate the number of threads required using the given tileSizes.
SmallVector< Value > makeTiledShapes(OpBuilder &builder, Location loc, LinalgOp linalgOp, ValueRange valuesToTile, ArrayRef< OpFoldResult > ivs, ArrayRef< OpFoldResult > tileSizes, ArrayRef< OpFoldResult > sizeBounds, bool omitPartialTileCheck)
Creates extract_slice/subview ops for all valuesToTile of the given linalgOp with builder,...
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
FailureOr< ForallTilingResult > tileToForallOp(RewriterBase &builder, TilingInterface op, ArrayRef< OpFoldResult > numThreads, std::optional< ArrayAttr > mapping)
SmallVector< Value > insertSlicesBack(OpBuilder &builder, Location loc, LinalgOp op, ValueRange operands, ValueRange results)
Creates insert_slice ops that insert results back into larger tensors they were originally extracted ...
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef< OpFoldResult > offests)
Add the specified offsets to any linalg.index ops contained in the given linalgOp.
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
SmallVector< Type > getTensorOutputTypes(LinalgOp op, ValueRange operands)
Returns the list of tensor output types produced when the given structured operation op is applied to...
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
SmallVector< Value > ValueVector
An owning vector of values, handy to return from functions.
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
Include the generated interface declarations.
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
int64_t floorDiv(int64_t lhs, int64_t rhs)
Returns the result of MLIR's floordiv operation on constants.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particula...
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
SmallVector< scf::ForOp, 8 > Loops
Tile a nest of standard for loops rooted at rootForOp by finding such parametric tile sizes that the ...
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Transformation information returned after reduction tiling.
Operation * parallelTiledOp
The partial reduction tiled op generated.
Operation * initialOp
The op initializing the tensor used for partial reductions.
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
Operation * mergeOp
The final reduction operation merging all the partial reductions.
Rewrite a TilingInterface op to a tiled scf.forall, applying tiling by numThreads.
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Callback function type used to get processor ID, and number of processors used for distribution for a...
Perform standalone tiling of a single LinalgOp by tileSizes.
SmallVector< Value, 4 > tensorResults
T lowTripCount
Number of tiles associated with each size.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.