//===- Tiling.cpp - Implementation of linalg Tiling -----------------------===//
#include "llvm/ADT/STLExtras.h"
// ...

namespace mlir {
#define GEN_PASS_DEF_LINALGTILINGPASS
#include "mlir/Dialect/Linalg/Passes.h.inc"
} // namespace mlir

// ...
#define DEBUG_TYPE "linalg-tiling"
  // In makeTiledLoopRanges(...): traverse the tile sizes, which are in loop
  // order, and erase zeros so that only tiled loops produce ranges.
  for (int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) {
    if (getConstantIntValue(tileSizes[idx - zerosCount]) ==
        static_cast<int64_t>(0)) {
      shapeSizes.erase(shapeSizes.begin() + idx - zerosCount);
      tileSizes.erase(tileSizes.begin() + idx - zerosCount);
      ++zerosCount;
      continue;
    }
    loopIndexToRangeIndex[idx] = idx - zerosCount;
  }

  // Create a new range with the applied tile sizes.
  SmallVector<Range, 4> res;
  for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx)
    res.push_back(Range{b.getIndexAttr(0), shapeSizes[idx], tileSizes[idx]});
  return std::make_tuple(res, loopIndexToRangeIndex);
}
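// A standalone sketch of the zero-tile-size pruning above, with plain
// integers standing in for OpFoldResult values (names hypothetical):
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<long> shapeSizes = {128, 64, 32};
  std::vector<long> tileSizes = {8, 0, 4}; // 0 means "leave loop 1 untiled"
  std::map<int, int> loopIndexToRangeIndex;

  for (int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) {
    if (tileSizes[idx - zerosCount] == 0) {
      shapeSizes.erase(shapeSizes.begin() + idx - zerosCount);
      tileSizes.erase(tileSizes.begin() + idx - zerosCount);
      ++zerosCount;
      continue;
    }
    loopIndexToRangeIndex[idx] = idx - zerosCount;
  }

  // Prints "loop 0 -> range 0" and "loop 2 -> range 1"; loop 1 is dropped.
  for (auto [loopIdx, rangeIdx] : loopIndexToRangeIndex)
    std::printf("loop %d -> range %d\n", loopIdx, rangeIdx);
}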
  // In transformIndexOps(...): skip loops pruned by a zero tile size.
  for (auto en : enumerate(allIvs)) {
    auto rangeIndex = loopIndexToRangeIndex.find(en.index());
    if (rangeIndex == loopIndexToRangeIndex.end())
      continue;
    en.value() = ivs[rangeIndex->second];
  }
  // In emitIsPositiveIndexAssertion(...): assert at compile time for constant
  // values, otherwise emit a runtime check.
  if (auto attr = llvm::dyn_cast_if_present<Attribute>(value)) {
    assert(cast<IntegerAttr>(attr).getValue().isStrictlyPositive() &&
           "expected strictly positive tile size and divisor");
    return;
  }

  Value zero = arith::ConstantIndexOp::create(b, 0);
  Value condition = arith::CmpIOp::create(b, arith::CmpIPredicate::sgt,
                                          cast<Value>(value), zero);
  cf::AssertOp::create(
      b, condition,
      b.getStringAttr("expected strictly positive tile size and divisor"));
}
FailureOr<StaticContinuousTileSizeSpecification>
mlir::linalg::computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension,
                                               unsigned targetSize) {
  assert(!op.hasDynamicShape() &&
         "cannot compute static multi-tile sizes for an op with dynamic shape");
  assert(targetSize > 0 && "target size must be positive");
  assert(dimension < op.getNumLoops() && "dimension overflow");

  StaticContinuousTileSizeSpecification spec;
  int64_t loopRange = op.getStaticLoopRanges()[dimension];
  int64_t tripCount = loopRange / targetSize;

  unsigned tileSize = targetSize;
  spec.tileSizes.push_back(tileSize);
  spec.tripCounts.push_back(tripCount);

  int64_t remainderChunk = loopRange % targetSize;

  // Halve the tile size (snapping to the nearest power of two) until the
  // remainder is fully covered.
  while (tileSize > 1 && remainderChunk != 0) {
    uint64_t maxPower = llvm::bit_floor(tileSize);
    tileSize = maxPower == tileSize ? maxPower >> 1 : maxPower;

    tripCount = remainderChunk / tileSize;
    if (tripCount > 0) {
      spec.tileSizes.push_back(tileSize);
      spec.tripCounts.push_back(tripCount);
    }

    remainderChunk = remainderChunk % tileSize;
  }
  // Check that the tile sizes and trip counts cover the loop range exactly.
  auto tripCountCheck = [&](SmallVector<int64_t> tileSizes,
                            SmallVector<int64_t> tripCounts,
                            int64_t range) -> bool {
    int64_t computedRange = 0;
    for (auto [tileSize, tripCount] : llvm::zip(tileSizes, tripCounts))
      computedRange += tileSize * tripCount;
    return range == computedRange;
  };
  if (!tripCountCheck(spec.tileSizes, spec.tripCounts, loopRange))
    return failure();
  return spec;
}
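// A standalone sketch of the continuous tile-size computation for
// loopRange = 270 and targetSize = 32 (bitFloor is a local stand-in for
// llvm::bit_floor):
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t bitFloor(uint64_t x) {
  uint64_t p = 1;
  while (p * 2 <= x)
    p *= 2;
  return p;
}

int main() {
  int64_t loopRange = 270;
  unsigned tileSize = 32;
  std::vector<int64_t> tileSizes = {tileSize};
  std::vector<int64_t> tripCounts = {loopRange / tileSize}; // 8 tiles of 32
  int64_t remainderChunk = loopRange % tileSize;            // 14 left over

  // Halve the tile size (snapping to powers of two) until the remainder is
  // consumed: 14 = 8 * 1 + 4 * 1 + 2 * 1.
  while (tileSize > 1 && remainderChunk != 0) {
    uint64_t maxPower = bitFloor(tileSize);
    tileSize = maxPower == tileSize ? maxPower >> 1 : maxPower;
    int64_t tripCount = remainderChunk / tileSize;
    if (tripCount > 0) {
      tileSizes.push_back(tileSize);
      tripCounts.push_back(tripCount);
    }
    remainderChunk %= tileSize;
  }

  // Prints 32 x 8, 8 x 1, 4 x 1, 2 x 1 -- which sums to 270.
  for (size_t i = 0; i < tileSizes.size(); ++i)
    std::printf("%lld x %lld\n", (long long)tileSizes[i],
                (long long)tripCounts[i]);
}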
FailureOr<ContinuousTileSizeSpecification>
mlir::linalg::computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
                                         unsigned dimension,
                                         OpFoldResult targetSize,
                                         bool emitAssertions) {
  SmallVector<Range> loopRanges = op.getIterationDomain(builder);
  unsigned numLoops = loopRanges.size();
  // Bail out on dimension overflow.
  if (dimension >= numLoops)
    return failure();

  // The code below works only on values.
  Location loc = op->getLoc();
  ImplicitLocOpBuilder b(loc, builder);
  if (emitAssertions)
    emitIsPositiveIndexAssertion(b, targetSize);
  Value targetSizeValue =
      getValueOrCreateConstantIndexOp(builder, loc, targetSize);

  // ... materialize `loopRange` from loopRanges[dimension].size, bind the
  // symbols s0/s1, and set up the `apply` helper ...
  Value tripCountValue = apply(s0.floorDiv(s1), {loopRange, targetSizeValue});
  Value remainderChunkValue = apply(s0 % s1, {loopRange, targetSizeValue});

  ContinuousTileSizeSpecification spec;
  uint64_t tileSizeInt = *getConstantIntValue(targetSizeValue);
  assert(tileSizeInt > 0 && "target size must be positive");
  spec.tileSizes.push_back(targetSizeValue);
  spec.tripCounts.push_back(tripCountValue);
  while (tileSizeInt > 1) {
    uint64_t maxPower = llvm::bit_floor(tileSizeInt);
    tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;

    auto constStepOp =
        builder.createOrFold<arith::ConstantIndexOp>(b.getLoc(), tileSizeInt);
    tripCountValue = apply(s0.floorDiv(s1), {remainderChunkValue, constStepOp});
    OpFoldResult tripCountSize = affine::makeComposedFoldedAffineApply(
        b, b.getLoc(), s0.floorDiv(s1), {remainderChunkValue, constStepOp});

    // Optimization: skip this tile size if its trip count folds to zero.
    if (Attribute attr = llvm::dyn_cast_if_present<Attribute>(tripCountSize)) {
      auto intAttr = cast<IntegerAttr>(attr);
      bool isTripCountZero = intAttr.getValue().isZero();
      if (!isTripCountZero) {
        spec.tileSizes.push_back(constStepOp);
        spec.tripCounts.push_back(tripCountValue);
      }
    } else {
      spec.tileSizes.push_back(constStepOp);
      spec.tripCounts.push_back(tripCountValue);
    }

    remainderChunkValue = apply(s0 % s1, {remainderChunkValue, constStepOp});
  }

  return spec;
}
FailureOr<StaticMultiSizeSpecification>
mlir::linalg::computeStaticMultiTileSizes(LinalgOp op, unsigned dimension,
                                          int64_t targetSize, int64_t divisor) {
  assert(!op.hasDynamicShape() &&
         "cannot compute static multi-tile sizes for an op with dynamic shape");
  assert(targetSize > 0 && "target size must be positive");
  assert(divisor > 0 && "divisor must be positive");
  assert(dimension < op.getNumLoops() && "dimension overflow");

  StaticMultiSizeSpecification spec;
  int64_t tripCount = op.getStaticLoopRanges()[dimension];
  int64_t a = tripCount / divisor;
  int64_t t = (targetSize + divisor - 1) / divisor;
  int64_t totalTripCount = (a + t - 1) / t;
  spec.lowTileSize = (a / totalTripCount) * divisor;
  spec.highTileSize = spec.lowTileSize + divisor;
  spec.highTripCount = a % totalTripCount;
  spec.lowTripCount = totalTripCount - spec.highTripCount;
  if (spec.lowTileSize * spec.lowTripCount +
          spec.highTileSize * spec.highTripCount !=
      tripCount)
    return failure();
  return spec;
}
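// A standalone worked example of the static multi-size arithmetic above:
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Dimension size 208, target tile size 32, divisor 8.
  int64_t tripCount = 208, targetSize = 32, divisor = 8;

  int64_t a = tripCount / divisor;                  // 26 chunks of `divisor`
  int64_t t = (targetSize + divisor - 1) / divisor; // <= 4 chunks per tile
  int64_t totalTripCount = (a + t - 1) / t;         // 7 tiles overall

  int64_t lowTileSize = (a / totalTripCount) * divisor;   // 24
  int64_t highTileSize = lowTileSize + divisor;           // 32
  int64_t highTripCount = a % totalTripCount;             // 5
  int64_t lowTripCount = totalTripCount - highTripCount;  // 2

  // 24 * 2 + 32 * 5 == 208: the two tile sizes cover the range exactly.
  assert(lowTileSize * lowTripCount + highTileSize * highTripCount ==
         tripCount);
  std::printf("%lld x %lld + %lld x %lld = %lld\n", (long long)lowTileSize,
              (long long)lowTripCount, (long long)highTileSize,
              (long long)highTripCount, (long long)tripCount);
}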
FailureOr<MultiSizeSpecification>
mlir::linalg::computeMultiTileSizes(OpBuilder &builder, LinalgOp op,
                                    unsigned dimension, OpFoldResult targetSize,
                                    OpFoldResult divisor, bool emitAssertions) {
  // Bail out on dimension overflow.
  if (dimension >= op.getNumLoops())
    return failure();
  Location loc = op.getLoc();
  ImplicitLocOpBuilder b(loc, builder);
  if (emitAssertions) {
    emitIsPositiveIndexAssertion(b, targetSize);
    emitIsPositiveIndexAssertion(b, divisor);
  }
  Value targetSizeValue = getValueOrCreateConstantIndexOp(b, loc, targetSize);
  Value divisorValue = getValueOrCreateConstantIndexOp(b, loc, divisor);

  // Find the trip count of the tiled dimension from the operand shapes.
  SmallVector<OpFoldResult> allShapes =
      op.createFlatListOfOperandDims(b, b.getLoc());
  AffineMap shapesToLoops = op.getShapesToLoopsMap();
  // ... compute `tripCount` from the loop ranges, bind the symbols s0..s2,
  // and set up the `apply` helper ...
  Value a = apply(s0.floorDiv(s1), {tripCount, divisorValue});
  Value t = apply((s0 + s1 - 1).floorDiv(s1), {targetSizeValue, divisorValue});
  Value d = apply((s0 + s1 - 1).floorDiv(s1), {a, t});
  Value s = apply(s0.floorDiv(s1) * s2, {a, d, divisorValue});
  Value v = apply(s0 % s1, {a, d});
  Value u = apply(s0 - s1, {d, v});

  MultiSizeSpecification spec;
  spec.lowTileSize = s;
  spec.highTileSize = apply(s0 + s1, {s, divisorValue});
  spec.lowTripCount = u;
  spec.highTripCount = v;
  // If requested, emit IR checking that the tiles cover the whole range.
  if (emitAssertions) {
    AffineExpr s3 = builder.getAffineSymbolExpr(3);
    Value coveredSize =
        apply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
                                  spec.highTileSize, spec.highTripCount});
    Value equals = arith::CmpIOp::create(b, arith::CmpIPredicate::eq,
                                         coveredSize, tripCount);
    cf::AssertOp::create(
        b, equals,
        builder.getStringAttr(
            "could not compute dynamic multi-size tile shapes"));
  }
  return spec;
}
  // In canOmitTileOffsetInBoundsCheck(...): all three sizes must be static.
  if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
    return false;
  return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
}
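// For instance, tileSize = 4 with numThreads = 8 places the last tile at
// offset 4 * (8 - 1) = 28. A standalone check of the same arithmetic:
#include <cstdio>

static bool canOmitCheck(long tileSize, long numThreads, long iterSize) {
  return tileSize * (numThreads - 1) < iterSize;
}

int main() {
  std::printf("%d\n", canOmitCheck(4, 8, 32)); // 1: last offset 28 < 32
  std::printf("%d\n", canOmitCheck(4, 8, 28)); // 0: last tile starts at 28
}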
  // In calculateTileOffsetsAndSizes(...): `threadIds` are the forall
  // induction variables; `nonZeroNumThreads` filters out zero entries.
  int64_t nLoops = loopRanges.size();
  tiledOffsets.reserve(nLoops);
  tiledSizes.reserve(nLoops);
  for (unsigned loopIdx = 0, threadIdIdx = 0; loopIdx < nLoops; ++loopIdx) {
    bool overflow = loopIdx >= numThreads.size();
    bool isZero = !overflow && isZeroInteger(numThreads[loopIdx]);
    // Degenerate case: take the whole domain.
    if (overflow || isZero) {
      tiledOffsets.push_back(loopRanges[loopIdx].offset);
      tiledSizes.push_back(loopRanges[loopIdx].size);
      continue;
    }

    // Tiled case.
    AffineExpr i, j, m, n, o;
    bindDims(b.getContext(), i, j);
    bindSymbols(b.getContext(), m, n, o);
    OpFoldResult size = loopRanges[loopIdx].size;
    OpFoldResult offset = loopRanges[loopIdx].offset;
    OpFoldResult threadId = threadIds[threadIdIdx];
    // Symbolic fixed max size per thread.
    OpFoldResult tileSizePerThread =
        nominalTileSizes.has_value()
            ? (*nominalTileSizes)[loopIdx]
            : affine::makeComposedFoldedAffineApply(
                  b, loc, m.ceilDiv(n),
                  ArrayRef<OpFoldResult>{size, nonZeroNumThreads[threadIdIdx]});

    // Dynamic offset shifted by threadId * maxSizePerThread.
    OpFoldResult offsetPerThread = affine::makeComposedFoldedAffineApply(
        b, loc, i + j * m, {offset, threadId, tileSizePerThread});
    // Dynamic upper bound depending on the offset.
    OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
        b, loc, i + j * m - n,
        {offset, nonZeroNumThreads[threadIdIdx], tileSizePerThread, size});
    if (!isZeroInteger(residualTileSize)) {
      OpFoldResult sizeMinusOffsetPerThread =
          affine::makeComposedFoldedAffineApply(b, loc, -i + m,
                                                {offsetPerThread, size});
      tileSizePerThread =
          buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
    }

    tiledOffsets.push_back(offsetPerThread);
    if (!omitTileOffsetBoundsCheck &&
        !canOmitTileOffsetInBoundsCheck(tileSizePerThread,
                                        nonZeroNumThreads[threadIdIdx], size))
      tileSizePerThread =
          buildMax(b, loc, {b.getIndexAttr(0), tileSizePerThread});

    tiledSizes.push_back(tileSizePerThread);
    ++threadIdIdx;
  }
}
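// A standalone sketch of the per-thread offset/size computation above for
// size = 10 split across numThreads = 4 (plain longs stand in for the
// affine.apply / min / max combinators):
#include <algorithm>
#include <cstdio>

int main() {
  long size = 10, numThreads = 4;
  long tileSizePerThread = (size + numThreads - 1) / numThreads; // ceildiv: 3

  for (long threadId = 0; threadId < numThreads; ++threadId) {
    long offset = threadId * tileSizePerThread;
    // buildMin clamps the last tile; buildMax floors a negative size at 0.
    long tileSize = std::max(0L, std::min(tileSizePerThread, size - offset));
    std::printf("thread %ld: offset %ld, size %ld\n", threadId, offset,
                tileSize);
  }
  // Threads get sizes 3, 3, 3, 1: the last thread takes the leftover chunk.
}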
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
                 const LinalgTilingOptions &options) {
  auto nLoops = op.getNumLoops();
  // Initial tile sizes may be too big, only take the first nLoops.
  tileSizes = tileSizes.take_front(nLoops);

  // If all tile sizes are zero, there is nothing to tile: clone the op.
  if (llvm::all_of(tileSizes, isZeroInteger)) {
    TiledLinalgOp tiledOp;
    tiledOp.op = cast<LinalgOp>(b.clone(*op.getOperation()));
    tiledOp.tensorResults.assign(tiledOp.op->result_begin(),
                                 tiledOp.op->result_end());
    return tiledOp;
  }

  // 1. Build the tiled loop ranges.
  SmallVector<OpFoldResult> allShapeSizes =
      op.createFlatListOfOperandDims(b, op.getLoc());
  AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
  if (!shapeSizesToLoopsMap)
    return failure();

  auto [loopRanges, loopIndexToRangeIndex] = makeTiledLoopRanges(
      b, op.getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);
  // 2. Prune iterator types for loops dropped due to a zero tile size.
  SmallVector<utils::IteratorType, 4> iteratorTypes;
  for (const auto &attr : enumerate(op.getIteratorTypesArray())) {
    if (loopIndexToRangeIndex.count(attr.index()))
      iteratorTypes.push_back(attr.value());
  }

  // If interchangeVector is empty, use the identity; otherwise build the
  // permutation map, recomputed against the pruned loops.
  auto invPermutationMap =
      AffineMap::getMultiDimIdentityMap(tileSizes.size(), b.getContext());
  if (!options.interchangeVector.empty()) {
    SmallVector<unsigned, 4> interchangeVector;
    interchangeVector.reserve(options.interchangeVector.size());
    for (auto pos : options.interchangeVector) {
      auto it = loopIndexToRangeIndex.find(pos);
      if (it == loopIndexToRangeIndex.end())
        continue;
      interchangeVector.push_back(it->second);
    }
    // The interchange vector is guaranteed to be a permutation.
    invPermutationMap = inversePermutation(
        AffineMap::getPermutationMap(interchangeVector, b.getContext()));
    assert(invPermutationMap);
    SmallVector<int64_t> permutation(interchangeVector.begin(),
                                     interchangeVector.end());
    applyPermutationToVector(loopRanges, permutation);
    applyPermutationToVector(iteratorTypes, permutation);
  }

  // 3. Handle distribution: collect ProcInfo for the tiled parallel loops.
  SmallVector<linalg::ProcInfo> procInfo;
  if (options.distribution) {
    procInfo.resize(
        iteratorTypes.size(),
        linalg::ProcInfo{nullptr, nullptr, linalg::DistributionMethod::None});
    SmallVector<Range> parallelLoopRanges;
    for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
      if (!isParallelIterator(iteratorType.value()))
        break;
      parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
    }
    auto returnedProcInfo =
        options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges);
    unsigned procIdIdx = 0;
    for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
      if (!isParallelIterator(iteratorType.value()))
        break;
      procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
    }
  }
  // 4. Build the tiled loop nest, cloning `op` on tiled operands in the body.
  LinalgOp res = op;
  SmallVector<Value, 4> ivs, tensorResults;
  auto tiledLoopBodyBuilder =
      [&](OpBuilder &builder, Location loc, ValueRange localIvs,
          ValueRange operandValuesToUse) -> scf::ValueVector {
    ivs.assign(localIvs.begin(), localIvs.end());

    // Apply the inverse of the interchange permutation to the loop ivs so
    // they match the op definition.
    SmallVector<Value, 4> interchangedIvs;
    if (!options.interchangeVector.empty()) {
      for (AffineExpr result : invPermutationMap.getResults())
        interchangedIvs.push_back(
            ivs[cast<AffineDimExpr>(result).getPosition()]);
    } else {
      interchangedIvs.assign(ivs.begin(), ivs.end());
    }

    assert(operandValuesToUse.size() ==
               static_cast<size_t>(op->getNumOperands()) &&
           "expect the number of operands and inputs and outputs to match");
    // ... compute `tiledOperands` and `resultTensorTypes` ...
    res = clone(b, op, resultTensorTypes, tiledOperands);
    tensorResults =
        insertSlicesBack(builder, loc, op, tiledOperands, res->getResults());
    return scf::ValueVector(tensorResults.begin(), tensorResults.end());
  };
  GenerateLoopNest<LoopTy>::doit(b, op.getLoc(), loopRanges, op, iteratorTypes,
                                 tiledLoopBodyBuilder, procInfo);
  // 5. Gather the newly created loops and return them with the new op.
  SmallVector<Operation *, 8> loops;
  loops.reserve(ivs.size());
  for (auto iv : ivs) {
    if (isa<BlockArgument>(iv)) {
      loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
      assert(loops.back() && "no owner found for induction variable!");
    } else {
      loops.push_back(nullptr);
    }
  }

  // 6. Get the tensor results from the outermost loop if available, otherwise
  // use the previously captured `tensorResults`.
  Operation *outermostLoop = nullptr;
  for (Operation *loop : loops)
    if ((outermostLoop = loop))
      break;

  return TiledLinalgOp{
      res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults};
}
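// A hedged driver sketch (function name hypothetical): tile a linalg op with
// fixed tile sizes {8, 16, 0} -- the 0 leaves the third loop untiled -- and
// generate scf.for loops.
static LogicalResult tileWithFixedSizes(RewriterBase &rewriter, LinalgOp op) {
  LinalgTilingOptions options;
  options.setTileSizes({8, 16, 0}).setLoopType(LinalgTilingLoopType::Loops);
  FailureOr<TiledLinalgOp> tiled = tileLinalgOp(rewriter, op, options);
  if (failed(tiled))
    return failure();
  // For ops with tensor semantics, the original results are replaced by the
  // loop-nest results.
  rewriter.replaceOp(op, tiled->tensorResults);
  return success();
}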
FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
    RewriterBase &b, PartialReductionOpInterface op,
    ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,
    std::optional<ArrayAttr> mapping) {
  Location loc = op.getLoc();
  OpBuilder::InsertionGuard g(b);

  // Ops implementing PartialReductionOpInterface are expected to implement
  // TilingInterface.
  auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());

  auto destinationStyleOp =
      dyn_cast<DestinationStyleOpInterface>(op.getOperation());
  if (!destinationStyleOp)
    return b.notifyMatchFailure(op, "not a destination style op");

  // Currently this only works for linalg ops.
  auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
  if (!linalgOp)
    return b.notifyMatchFailure(op, "not a linalg op");

  if (op->getNumResults() != 1)
    return b.notifyMatchFailure(
        op, "don't support ops with multiple results for now");

  SmallVector<utils::IteratorType> iterators =
      tilingInterfaceOp.getLoopIteratorTypes();
  SmallVector<unsigned> redDims;
  linalgOp.getReductionDims(redDims);
  if (redDims.size() != 1)
    return b.notifyMatchFailure(
        op, "only support ops with one reduction dimension");
  if (!tileSizes.empty() && tileSizes.size() != numThreads.size())
    return b.notifyMatchFailure(op, "if tile sizes are present they must have "
                                    "as many elements as number of threads");
  if (redDims.front() >= numThreads.size())
    return b.notifyMatchFailure(
        op, "reduction dimension must be mapped to threads");

  // 1. Create the initial tensor value.
  unsigned reductionDim = redDims.front();
  SetVector<unsigned> reductionDims;
  reductionDims.insert(reductionDim);
  FailureOr<SmallVector<Value>> maybeInitTensors =
      op.generateInitialTensorForPartialReduction(b, loc, numThreads,
                                                  reductionDims);
  if (failed(maybeInitTensors))
    return b.notifyMatchFailure(
        op, "failed to create initial tensors for partial reduction");
  SmallVector<Value> initTensors = maybeInitTensors.value();
  // ... `dest` holds the destination tensors (getOrCreateDestinations) and
  // `materializedNonZeroNumThreads` the non-zero thread counts ...

  // 2. Create the ForallOp with an empty region.
  scf::ForallOp forallOp = scf::ForallOp::create(
      b, loc, getAsOpFoldResult(materializedNonZeroNumThreads), initTensors,
      mapping);

  // 3. Calculate the tile offsets and sizes.
  SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain,
                               /*omitTileOffsetBoundsCheck=*/false,
                               /*nominalTileSizes=*/std::nullopt, tiledOffsets,
                               tiledSizes);

  // 4.a. Extract a slice of each destination operand for the current thread;
  // `destBbArgs` are the shared output block arguments of `forallOp`.
  SmallVector<Value> tiledDpsInitOperands;
  for (Value initOperand : destinationStyleOp.getDpsInits()) {
    auto *it = llvm::find(dest, initOperand);
    assert(it != dest.end() && "dest operand not found in dest");
    unsigned destNum = std::distance(dest.begin(), it);
    // ... unit `strides`, zero `outOffsets`, and `sizes` = tiledSizes with
    // size 1 along the reduction dimension ...
    outOffsets[reductionDim] = forallOp.getInductionVars()[0];
    tiledDpsInitOperands.push_back(tensor::ExtractSliceOp::create(
        b, loc, cast<RankedTensorType>(initOperand.getType()),
        destBbArgs[destNum], outOffsets, sizes, strides));
  }

  // 4.b. Clone the op and rewire its init operands to the tiled slices.
  Operation *clonedOp = b.clone(*op.getOperation());
  b.modifyOpInPlace(clonedOp, [&]() {
    for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
             cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),
             tiledDpsInitOperands)) {
      initOperandPtr.set(tiledInitValue);
    }
  });
  // 5. Tile the cloned op.
  Operation *tiledOp = nullptr;
  SmallVector<Value> tilingResults;
  if (tileSizes.empty()) {
    FailureOr<TilingResult> tilingResult =
        cast<TilingInterface>(clonedOp).getTiledImplementation(
            b, tiledOffsets, tiledSizes);
    if (failed(tilingResult))
      return clonedOp->emitError("Failed to tile op: ");
    if (tilingResult->tiledOps.size() != 1) {
      return clonedOp->emitError("expected a single produced tiled op, got ")
             << tilingResult->tiledOps.size();
    }
    tiledOp = tilingResult->tiledOps.front();
    tilingResults = tilingResult->tiledValues;
  } else {
    LinalgTilingOptions options;
    FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
        b, cast<LinalgOp>(clonedOp), tileSizes, options);
    if (failed(maybeTiled))
      return b.notifyMatchFailure(op, "failed tileLinalgOpImpl");

    SmallVector<Value> ids = forallOp.getInductionVars();
    mapLoopToProcessorIds(cast<scf::ForOp>(maybeTiled->loops.back()), ids,
                          materializedNonZeroNumThreads);
    if (maybeTiled->loops.size() != 1) {
      return clonedOp->emitError("expected a single produced loop");
    }
    tiledOp = maybeTiled->op;
    tilingResults = maybeTiled->loops.front()->getResults();
  }
  // 6. Insert the partial reductions back into a new tensor, just before the
  // scf.forall terminator.
  for (auto [index, result, bbArg] : llvm::zip(
           llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
    SmallVector<OpFoldResult> resultOffsets, resultSizes;
    if (failed(tilingInterfaceOp.getResultTilePosition(
            b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))
      return op->emitOpError("output offsets couldn't be calculated");

    // Insert the induction variable (and a unit size) at the reduction
    // dimension; all other dimensions keep their computed offset/size.
    SmallVector<OpFoldResult> resultOffsetsRank, resultSizesRank;
    int64_t offIdx = 0;
    int64_t sizeIdx = 0;
    for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
      if (i == reductionDim) {
        resultOffsetsRank.push_back(forallOp.getInductionVars()[0]);
        resultSizesRank.push_back(b.getIndexAttr(1));
        continue;
      }
      resultOffsetsRank.push_back(resultOffsets[offIdx++]);
      resultSizesRank.push_back(resultSizes[sizeIdx++]);
    }
    SmallVector<OpFoldResult> strides(resultSizesRank.size(),
                                      b.getIndexAttr(1));
    tensor::ParallelInsertSliceOp::create(
        b, loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
  }
  // 7. Merge the partial reductions.
  b.setInsertionPointAfter(forallOp);
  FailureOr<MergeResult> mergeResult =
      op.mergeReductions(b, loc, forallOp->getResults(), reductionDims);
  if (failed(mergeResult))
    return failure();
  b.replaceOp(op, mergeResult->replacements);

  // 8. Return.
  ForallReductionTilingResult results;
  results.initialValues = initTensors;
  results.loops = forallOp;
  results.parallelTiledOps.push_back(tiledOp);
  results.mergeOps.append(mergeResult->mergeOps);
  return results;
}
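// A hedged call-site sketch (names hypothetical; assumes the reduction
// dimension of `redOp` is loop 0): split the reduction across 8 threads.
static LogicalResult tileReductionExample(RewriterBase &rewriter,
                                          PartialReductionOpInterface redOp) {
  SmallVector<OpFoldResult> numThreads = {rewriter.getIndexAttr(8)};
  FailureOr<linalg::ForallReductionTilingResult> result =
      linalg::tileReductionUsingForall(rewriter, redOp, numThreads);
  if (failed(result))
    return failure();
  // result->parallelTiledOps compute the partial reductions inside the
  // scf.forall; result->mergeOps combine them afterwards.
  return success();
}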
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op,
                 const LinalgTilingOptions &options) {
  if (!options.tileSizeComputationFunction)
    return failure();

  // "Tiling by zero" skips a dimension, so pad the computed tile sizes with
  // zeros up to the number of loops.
  auto nLoops = op.getNumLoops();
  SmallVector<OpFoldResult> tileSizeVector =
      getAsOpFoldResult(options.tileSizeComputationFunction(b, op));
  if (tileSizeVector.size() < nLoops) {
    tileSizeVector.append(nLoops - tileSizeVector.size(), b.getIndexAttr(0));
  }

  return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
}
FailureOr<TiledLinalgOp>
mlir::linalg::tileLinalgOp(RewriterBase &b, LinalgOp op,
                           const LinalgTilingOptions &options) {
  switch (options.loopType) {
  case LinalgTilingLoopType::Loops:
    return tileLinalgOpImpl<scf::ForOp>(b, op, options);
  case LinalgTilingLoopType::ParallelLoops:
    return tileLinalgOpImpl<scf::ParallelOp>(b, op, options);
  default:;
  }
  return failure();
}
namespace {
/// Helper classes for type list expansion.
template <typename... OpTypes>
class CanonicalizationPatternList;

template <>
class CanonicalizationPatternList<> {
public:
  static void insert(RewritePatternSet &patterns) {}
};

template <typename OpTy, typename... OpTypes>
class CanonicalizationPatternList<OpTy, OpTypes...> {
public:
  static void insert(RewritePatternSet &patterns) {
    OpTy::getCanonicalizationPatterns(patterns, patterns.getContext());
    CanonicalizationPatternList<OpTypes...>::insert(patterns);
  }
};
} // namespace
void mlir::linalg::populateLinalgTilingCanonicalizationPatterns(
    RewritePatternSet &patterns) {
  auto *ctx = patterns.getContext();
  affine::AffineApplyOp::getCanonicalizationPatterns(patterns, ctx);
  affine::AffineForOp::getCanonicalizationPatterns(patterns, ctx);
  affine::AffineMinOp::getCanonicalizationPatterns(patterns, ctx);
  affine::AffineMaxOp::getCanonicalizationPatterns(patterns, ctx);
  arith::ConstantIndexOp::getCanonicalizationPatterns(patterns, ctx);

  memref::SubViewOp::getCanonicalizationPatterns(patterns, ctx);
  memref::ViewOp::getCanonicalizationPatterns(patterns, ctx);

  scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
  scf::ParallelOp::getCanonicalizationPatterns(patterns, ctx);

  tensor::CastOp::getCanonicalizationPatterns(patterns, ctx);
  tensor::EmptyOp::getCanonicalizationPatterns(patterns, ctx);
  tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, ctx);
  tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, ctx);
  tensor::PadOp::getCanonicalizationPatterns(patterns, ctx);
  ctx->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(patterns);

  CanonicalizationPatternList<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
      >::insert(patterns);
}
Declarations referenced above:
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize)
Returns true if the maximum tile offset tileSize * (numThreads - 1) is less than iterationSize.
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, OpFoldResult value)
Asserts that the given index-typed value is strictly positive.
static OpFoldResult buildMax(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_max of all the vals.
static void calculateTileOffsetsAndSizes(RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef< OpFoldResult > numThreads, SmallVector< Range > loopRanges, bool omitTileOffsetBoundsCheck, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, SmallVector< OpFoldResult > &tiledOffsets, SmallVector< OpFoldResult > &tiledSizes)
Fill out the tiledOffsets and tiledSizes to be used to tile to a given number of threads.
static FailureOr< TiledLinalgOp > tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef< OpFoldResult > tileSizes, const LinalgTilingOptions &options)
static OpFoldResult buildMin(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_min of all the vals.
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
AffineExpr ceilDiv(uint64_t v) const
A multi-dimensional affine map. Affine maps are immutable, like types, and are uniqued.
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
unsigned getNumResults() const
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
Attributes are known-constant values of operations.
IntegerAttr getIndexAttr(int64_t value)
AffineExpr getAffineSymbolExpr(unsigned position)
StringAttr getStringAttr(const Twine &bytes)
MLIRContext * getContext() const
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without specifying the location.
Location getLoc() const
Accessors for the implied location.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
MLIRContext * getContext() const
Return the context this location is uniqued in.
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the operation via the provided map.
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold it.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent insertions to go right after it.
This class represents a single result from folding an operation.
Operation is the basic unit of execution within MLIR.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers that may be listening.
result_range getResults()
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to track mutations and create new operations.
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure, with a callback that populates the diagnostic with the reason.
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements).
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
Specialization of arith.constant op that returns an integer of index type.
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying those operands.
SmallVector< OpFoldResult > makeComposedFoldedMultiResultAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Variant of makeComposedFoldedAffineApply suitable for multi-result maps.
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands, then immediately attempts to fold it.
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any other AffineApplyOps supplying the operands, then immediately attempts to fold it.
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands, then immediately attempts to fold it.
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
SmallVector< Value > makeTiledShapes(OpBuilder &builder, Location loc, LinalgOp linalgOp, ValueRange valuesToTile, ArrayRef< OpFoldResult > ivs, ArrayRef< OpFoldResult > tileSizes, ArrayRef< OpFoldResult > sizeBounds, bool omitPartialTileCheck)
Creates extract_slice/subview ops for all valuesToTile of the given linalgOp with builder, assuming linalgOp is being fused into a loop nest for tiling with the given induction variables ivs and tile sizes tileSizes.
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
SmallVector< Value > insertSlicesBack(OpBuilder &builder, Location loc, LinalgOp op, ValueRange operands, ValueRange results)
Creates insert_slice ops that insert results back into the larger tensors they were originally extracted from.
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef< OpFoldResult > offsets)
Add the specified offsets to any linalg.index ops contained in the given linalgOp.
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
SmallVector< Type > getTensorOutputTypes(LinalgOp op, ValueRange operands)
Returns the list of tensor output types produced when the given structured operation op is applied to the given operands.
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetSize, each divisible by divisor, that together fully cover the given iteration space dimension.
SmallVector< Value > ValueVector
An owning vector of values, handy to return from functions.
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions [0 .. sizeof...(exprs)].
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particular domain dimension is selected.
bool isZeroInteger(OpFoldResult v)
Return true if v is an IntegerAttr with value 0.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions [0 .. sizeof...(exprs)].
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
SmallVector< scf::ForOp, 8 > Loops
Tile a nest of standard for loops rooted at rootForOp by finding such parametric tile sizes that the outer loops have a fixed number of iterations as defined in sizes.
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or static.
Transformation information returned after reduction tiling.
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
SmallVector< Value > initialValues
Initial values used for partial reductions.
scf::ForallOp loops
The scf.forall operation that iterates over the tiles.
SmallVector< Operation * > parallelTiledOps
The partial-reduction tiled ops generated.
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values which may or may not be constant.
Callback function type used to get processor ID, and number of processors used for distribution, for all parallel loops generated.
Perform standalone tiling of a single LinalgOp by tileSizes.
SmallVector< Value, 4 > tensorResults
SmallVector< T > tileSizes
Tile sizes.
SmallVector< T > tripCounts
Number of tiles associated with each size.
T lowTripCount
Number of tiles associated with each size.