28 #include "llvm/ADT/STLExtras.h"
32 #define GEN_PASS_DEF_LINALGTILINGPASS
33 #include "mlir/Dialect/Linalg/Passes.h.inc"
41 #define DEBUG_TYPE "linalg-tiling"
55 for (
int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) {
57 static_cast<int64_t
>(0)) {
58 shapeSizes.erase(shapeSizes.begin() + idx - zerosCount);
59 tileSizes.erase(tileSizes.begin() + idx - zerosCount);
63 loopIndexToRangeIndex[idx] = idx - zerosCount;
68 for (
unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx)
70 return std::make_tuple(res, loopIndexToRangeIndex);
78 auto rangeIndex = loopIndexToRangeIndex.find(en.index());
79 if (rangeIndex == loopIndexToRangeIndex.end())
81 en.value() = ivs[rangeIndex->second];
91 if (
auto attr = llvm::dyn_cast_if_present<Attribute>(value)) {
92 assert(cast<IntegerAttr>(attr).getValue().isStrictlyPositive() &&
93 "expected strictly positive tile size and divisor");
98 Value condition = b.
create<arith::CmpIOp>(arith::CmpIPredicate::sgt,
99 cast<Value>(value), zero);
102 b.
getStringAttr(
"expected strictly positive tile size and divisor"));
105 FailureOr<StaticContinuousTileSizeSpecification>
107 unsigned targetSize) {
109 assert(!op.hasDynamicShape() &&
110 "cannot compute static multi-tile sizes for an op with dynamic shape");
111 assert(targetSize > 0 &&
"target size must be non-negative");
112 assert(dimension < op.getNumLoops() &&
"dimension overflow");
115 int64_t loopRange = op.getStaticLoopRanges()[dimension];
116 int64_t tripCount = loopRange / targetSize;
118 unsigned tileSize = targetSize;
123 int64_t remainderChunk = loopRange % targetSize;
125 while (tileSize > 1 && remainderChunk != 0) {
127 uint64_t maxPower = llvm::bit_floor(tileSize);
128 tileSize = maxPower == tileSize ? maxPower >> 1 : maxPower;
130 tripCount = remainderChunk / tileSize;
137 remainderChunk = remainderChunk % tileSize;
142 int64_t range) ->
bool {
143 int64_t computedRange = 0;
144 for (
auto [tileSize, tripCount] : llvm::zip(tileSizes, tripCounts))
145 computedRange += tileSize * tripCount;
146 return range == computedRange;
155 FailureOr<ContinuousTileSizeSpecification>
159 bool emitAssertions) {
162 unsigned numLoops = loopRanges.size();
165 if (dimension >= numLoops)
171 if (emitAssertions) {
174 Value targetSizeValue =
190 Value tripCountValue = apply(s0.
floorDiv(s1), {loopRange, targetSizeValue});
191 Value remainderChunkValue = apply(s0 % s1, {loopRange, targetSizeValue});
200 assert(tileSizeInt > 0 &&
"target size must be non-negative");
202 spec.
tileSizes.push_back(targetSizeValue);
205 while (tileSizeInt > 1) {
206 uint64_t maxPower = llvm::bit_floor(tileSizeInt);
207 tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
210 tripCountValue = apply(s0.
floorDiv(s1), {remainderChunkValue, constStepOp});
213 b, b.
getLoc(), s0.
floorDiv(s1), {remainderChunkValue, constStepOp});
216 if (
Attribute attr = llvm::dyn_cast_if_present<Attribute>(tripCountSize)) {
217 auto intAttr = cast<IntegerAttr>(attr);
218 bool isTripCountZero = intAttr.getValue().isZero();
220 if (!isTripCountZero) {
229 remainderChunkValue = apply(s0 % s1, {remainderChunkValue, constStepOp});
235 FailureOr<StaticMultiSizeSpecification>
237 int64_t targetSize, int64_t divisor) {
238 assert(!op.hasDynamicShape() &&
239 "cannot compute static multi-tile sizes for an op with dynamic shape");
240 assert(targetSize > 0 &&
"target size must be non-negative");
241 assert(divisor > 0 &&
"divisor must be non-negative");
242 assert(dimension < op.getNumLoops() &&
"dimension overflow");
245 int64_t tripCount = op.getStaticLoopRanges()[dimension];
246 int64_t a = tripCount / divisor;
247 int64_t t = (targetSize + divisor - 1) / divisor;
248 int64_t totalTripCount = (a + t - 1) / t;
261 FailureOr<MultiSizeSpecification>
266 if (dimension >= op.getNumLoops())
272 if (emitAssertions) {
276 Value targetSizeValue =
283 op.createFlatListOfOperandDims(b, b.
getLoc());
284 AffineMap shapesToLoops = op.getShapesToLoopsMap();
299 Value t = apply((s0 + s1 - 1).floorDiv(s1), {targetSizeValue, divisorValue});
300 Value d = apply((s0 + s1 - 1).floorDiv(s1), {a, t});
302 Value v = apply(s0 % s1, {a, d});
303 Value u = apply(s0 - s1, {d, v});
307 spec.highTileSize = apply(s0 + s1, {s, divisorValue});
308 spec.lowTripCount = u;
309 spec.highTripCount = v;
315 if (emitAssertions) {
318 apply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
319 spec.highTileSize, spec.highTripCount});
320 Value equals = b.
create<arith::CmpIOp>(arith::CmpIPredicate::eq,
321 coveredSize, tripCount);
324 "could not compute dynamic multi-size tile shapes"));
338 if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
340 return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
364 bool omitTileOffsetBoundsCheck,
374 int64_t nLoops = loopRanges.size();
375 tiledOffsets.reserve(nLoops);
376 tiledSizes.reserve(nLoops);
377 for (
unsigned loopIdx = 0, threadIdIdx = 0; loopIdx < nLoops; ++loopIdx) {
378 bool overflow = loopIdx >= numThreads.size();
379 bool isZero = !overflow &&
isZeroInteger(numThreads[loopIdx]);
381 if (overflow || isZero) {
382 tiledOffsets.push_back(loopRanges[loopIdx].offset);
383 tiledSizes.push_back(loopRanges[loopIdx].size);
397 nominalTileSizes.has_value()
398 ? (*nominalTileSizes)[loopIdx]
405 b, loc, i +
j * m, {offset, threadId, tileSizePerThread});
408 b, loc, i +
j * m - n,
409 {offset, nonZeroNumThreads[threadIdIdx], tileSizePerThread, size});
412 b, loc, -i + m, {offsetPerThread, size});
414 buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
417 tiledOffsets.push_back(offsetPerThread);
419 if (!omitTileOffsetBoundsCheck &&
421 nonZeroNumThreads[threadIdIdx], size))
425 tiledSizes.push_back(tileSizePerThread);
430 template <
typename LoopTy>
431 static FailureOr<TiledLinalgOp>
436 auto nLoops = op.getNumLoops();
438 tileSizes = tileSizes.take_front(nLoops);
444 tiledOp.
op = cast<LinalgOp>(b.
clone(*op.getOperation()));
446 tiledOp.
op->result_end());
452 op.createFlatListOfOperandDims(b, op.getLoc());
453 AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
454 if (!shapeSizesToLoopsMap)
458 b, op.getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);
461 for (
const auto &attr :
enumerate(op.getIteratorTypesArray())) {
462 if (loopIndexToRangeIndex.count(attr.index()))
463 iteratorTypes.push_back(attr.value());
467 auto invPermutationMap =
469 if (!
options.interchangeVector.empty()) {
473 interchangeVector.reserve(
options.interchangeVector.size());
474 for (
auto pos :
options.interchangeVector) {
475 auto it = loopIndexToRangeIndex.find(pos);
476 if (it == loopIndexToRangeIndex.end())
478 interchangeVector.push_back(it->second);
484 assert(invPermutationMap);
486 interchangeVector.end());
496 iteratorTypes.size(),
503 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
505 auto returnedProcInfo =
506 options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges);
507 unsigned procIdIdx = 0;
512 procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
519 auto tiledLoopBodyBuilder =
522 ivs.assign(localIvs.begin(), localIvs.end());
528 if (!
options.interchangeVector.empty()) {
529 for (
AffineExpr result : invPermutationMap.getResults())
530 interchangedIvs.push_back(
531 ivs[cast<AffineDimExpr>(result).getPosition()]);
533 interchangedIvs.assign(ivs.begin(), ivs.end());
538 assert(operandValuesToUse.size() ==
539 static_cast<size_t>(op->getNumOperands()) &&
540 "expect the number of operands and inputs and outputs to match");
552 res =
clone(b, op, resultTensorTypes, tiledOperands);
558 tiledLoopBodyBuilder, procInfo);
565 loops.reserve(ivs.size());
566 for (
auto iv : ivs) {
567 if (isa<BlockArgument>(iv)) {
568 loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
569 assert(loops.back() &&
"no owner found for induction variable!");
573 loops.push_back(
nullptr);
581 if ((outermostLoop = loop))
585 res, loops, outermostLoop ? outermostLoop->
getResults() : tensorResults};
591 std::optional<ArrayAttr> mapping) {
598 auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());
604 auto destinationStyleOp =
605 dyn_cast<DestinationStyleOpInterface>(op.getOperation());
606 if (!destinationStyleOp)
610 auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
615 if (op->getNumResults() != 1)
617 op,
"don't support ops with multiple results for now");
620 tilingInterfaceOp.getLoopIteratorTypes();
622 linalgOp.getReductionDims(redDims);
623 if (redDims.size() != 1)
625 op,
"only support ops with one reduction dimension.");
626 if (!tileSizes.empty() && tileSizes.size() != numThreads.size())
628 "many elements as number of threads");
630 if (redDims.front() >= numThreads.size())
632 op,
"reduction dimension must be mapped to threads");
635 unsigned reductionDim = redDims.front();
637 reductionDims.insert(reductionDim);
638 FailureOr<SmallVector<Value>> maybeInitTensors =
639 op.generateInitialTensorForPartialReduction(b, loc, numThreads,
641 if (failed(maybeInitTensors))
643 op,
"Failed to create inital tensors for partial reduction");
659 scf::ForallOp forallOp = b.
create<scf::ForallOp>(
668 std::nullopt, tiledOffsets,
681 for (
Value initOperand : destinationStyleOp.getDpsInits()) {
682 auto *it = llvm::find(dest, initOperand);
683 assert(it != dest.end() &&
"dest operand not found in dest");
684 unsigned destNum = std::distance(dest.begin(), it);
690 outOffsets[reductionDim] = forallOp.getInductionVars()[0];
692 tiledDpsInitOperands.push_back(b.
create<tensor::ExtractSliceOp>(
693 loc, cast<RankedTensorType>(initOperand.getType()),
694 destBbArgs[destNum], outOffsets, sizes, strides));
702 for (
auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
703 cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),
704 tiledDpsInitOperands)) {
705 initOperandPtr.set(tiledInitValue);
710 if (tileSizes.empty()) {
711 FailureOr<TilingResult> tilingResult =
712 cast<TilingInterface>(clonedOp).getTiledImplementation(
713 b, tiledOffsets, tiledSizes);
714 if (failed(tilingResult))
715 return clonedOp->
emitError(
"Failed to tile op: ");
716 if (tilingResult->tiledOps.size() != 1) {
717 return clonedOp->
emitError(
"expected a single produced tiled op, got ")
718 << tilingResult->tiledOps.size();
720 tiledOp = tilingResult->tiledOps.front();
721 tilingResults = tilingResult->tiledValues;
724 FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
725 b, cast<LinalgOp>(clonedOp), tileSizes,
options);
726 if (failed(maybeTiled))
731 materializedNonZeroNumThreads);
732 if (maybeTiled->loops.size() != 1) {
733 return clonedOp->
emitError(
"expected a single produced loop");
735 tiledOp = maybeTiled->op;
736 tilingResults = maybeTiled->loops.front()->getResults();
743 for (
auto [index, result, bbArg] : llvm::zip(
744 llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
750 if (failed(tilingInterfaceOp.getResultTilePosition(
751 b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))
752 return op->emitOpError(
"output offsets couldn't be calculated");
756 for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
757 if (i == reductionDim) {
758 resultOffsetsRank.push_back(forallOp.getInductionVars()[0]);
762 resultOffsetsRank.push_back(resultOffsets[offIdx++]);
763 resultSizesRank.push_back(resultSizes[sizeIdx++]);
771 b.
create<tensor::ParallelInsertSliceOp>(
772 loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
777 FailureOr<MergeResult> mergeResult =
778 op.mergeReductions(b, loc, forallOp->getResults(), reductionDims);
779 if (failed(mergeResult)) {
782 b.
replaceOp(op, mergeResult->replacements);
787 results.
loops = forallOp;
789 results.
mergeOps.append(mergeResult->mergeOps);
793 template <
typename LoopTy>
799 if (!
options.tileSizeComputationFunction)
805 auto nLoops = op.getNumLoops();
808 if (tileSizeVector.size() < nLoops) {
809 tileSizeVector.append(nLoops - tileSizeVector.size(), b.
getIndexAttr(0));
812 return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector,
options);
815 FailureOr<TiledLinalgOp>
820 return tileLinalgOpImpl<scf::ForOp>(b, op,
options);
821 case LinalgTilingLoopType::ParallelLoops:
822 return tileLinalgOpImpl<scf::ParallelOp>(b, op,
options);
830 template <
typename... OpTypes>
831 class CanonicalizationPatternList;
834 class CanonicalizationPatternList<> {
839 template <
typename OpTy,
typename... OpTypes>
840 class CanonicalizationPatternList<OpTy, OpTypes...> {
844 CanonicalizationPatternList<OpTypes...>::insert(
patterns);
859 affine::AffineApplyOp::getCanonicalizationPatterns(
patterns, ctx);
860 affine::AffineForOp::getCanonicalizationPatterns(
patterns, ctx);
861 affine::AffineMinOp::getCanonicalizationPatterns(
patterns, ctx);
862 affine::AffineMaxOp::getCanonicalizationPatterns(
patterns, ctx);
863 arith::ConstantIndexOp::getCanonicalizationPatterns(
patterns, ctx);
865 memref::SubViewOp::getCanonicalizationPatterns(
patterns, ctx);
866 memref::ViewOp::getCanonicalizationPatterns(
patterns, ctx);
868 scf::ForOp::getCanonicalizationPatterns(
patterns, ctx);
869 scf::ParallelOp::getCanonicalizationPatterns(
patterns, ctx);
871 tensor::CastOp::getCanonicalizationPatterns(
patterns, ctx);
872 tensor::EmptyOp::getCanonicalizationPatterns(
patterns, ctx);
873 tensor::ExtractSliceOp::getCanonicalizationPatterns(
patterns, ctx);
874 tensor::InsertSliceOp::getCanonicalizationPatterns(
patterns, ctx);
875 tensor::PadOp::getCanonicalizationPatterns(
patterns, ctx);
876 ctx->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(
patterns);
878 CanonicalizationPatternList<
880 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
static llvm::ManagedStatic< PassManagerOptions > options
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize)
Returns true if the maximum tile offset tileSize * (numThreads - 1) is less than iterationSize.
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, OpFoldResult value)
Asserts that the given index-typed value is strictly positive.
static OpFoldResult buildMax(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_max of all the vals.
static void calculateTileOffsetsAndSizes(RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef< OpFoldResult > numThreads, SmallVector< Range > loopRanges, bool omitTileOffsetBoundsCheck, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, SmallVector< OpFoldResult > &tiledOffsets, SmallVector< OpFoldResult > &tiledSizes)
Fill out the tiledOffsets and tiledSizes to be used to tile to a given number of threads.
static FailureOr< TiledLinalgOp > tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef< OpFoldResult > tileSizes, const LinalgTilingOptions &options)
static OpFoldResult buildMin(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_min of all the vals.
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
AffineExpr ceilDiv(uint64_t v) const
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
unsigned getNumResults() const
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
Attributes are known-constant values of operations.
IntegerAttr getIndexAttr(int64_t value)
AffineExpr getAffineSymbolExpr(unsigned position)
StringAttr getStringAttr(const Twine &bytes)
MLIRContext * getContext() const
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Location getLoc() const
Accessors for the implied location.
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext * getContext() const
Return the context this location is uniqued in.
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents a single result from folding an operation.
Operation is the basic unit of execution within MLIR.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
result_range getResults()
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Specialization of arith.constant op that returns an integer of index type.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
SmallVector< OpFoldResult > makeComposedFoldedMultiResultAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Variant of makeComposedFoldedAffineApply suitable for multi-result maps.
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands,...
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
SmallVector< Value > makeTiledShapes(OpBuilder &builder, Location loc, LinalgOp linalgOp, ValueRange valuesToTile, ArrayRef< OpFoldResult > ivs, ArrayRef< OpFoldResult > tileSizes, ArrayRef< OpFoldResult > sizeBounds, bool omitPartialTileCheck)
Creates extract_slice/subview ops for all valuesToTile of the given linalgOp with builder,...
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
SmallVector< Value > insertSlicesBack(OpBuilder &builder, Location loc, LinalgOp op, ValueRange operands, ValueRange results)
Creates insert_slice ops that insert results back into larger tensors they were originally extracted ...
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef< OpFoldResult > offests)
Add the specified offsets to any linalg.index ops contained in the given linalgOp.
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
SmallVector< Type > getTensorOutputTypes(LinalgOp op, ValueRange operands)
Returns the list of tensor output types produced when the given structured operation op is applied to...
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
SmallVector< Value > ValueVector
An owning vector of values, handy to return from functions.
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particula...
const FrozenRewritePatternSet & patterns
bool isZeroInteger(OpFoldResult v)
Return true if v is an IntegerAttr with value 0.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
SmallVector< scf::ForOp, 8 > Loops
Tile a nest of standard for loops rooted at rootForOp by finding such parametric tile sizes that the ...
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Transformation information returned after reduction tiling.
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
SmallVector< Value > initialValues
Initial values used for partial reductions.
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Callback function type used to get processor ID, and number of processors used for distribution for a...
Perform standalone tiling of a single LinalgOp by tileSizes.
SmallVector< Value, 4 > tensorResults
SmallVector< T > tileSizes
Tile sizes.
SmallVector< T > tripCounts
Number of tiles associated with each size.
T lowTripCount
Number of tiles associated with each size.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.