32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/Support/CommandLine.h"
37 #define GEN_PASS_DEF_LINALGTILINGPASS
38 #include "mlir/Dialect/Linalg/Passes.h.inc"
46 #define DEBUG_TYPE "linalg-tiling"
60 for (
int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) {
62 static_cast<int64_t
>(0)) {
63 shapeSizes.erase(shapeSizes.begin() + idx - zerosCount);
64 tileSizes.erase(tileSizes.begin() + idx - zerosCount);
68 loopIndexToRangeIndex[idx] = idx - zerosCount;
73 for (
unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx)
75 return std::make_tuple(res, loopIndexToRangeIndex);
83 auto rangeIndex = loopIndexToRangeIndex.find(en.index());
84 if (rangeIndex == loopIndexToRangeIndex.end())
86 en.value() = ivs[rangeIndex->second];
96 if (
auto attr = llvm::dyn_cast_if_present<Attribute>(value)) {
97 assert(cast<IntegerAttr>(attr).getValue().isStrictlyPositive() &&
98 "expected strictly positive tile size and divisor");
103 Value condition = b.
create<arith::CmpIOp>(arith::CmpIPredicate::sgt,
104 value.get<
Value>(), zero);
107 b.
getStringAttr(
"expected strictly positive tile size and divisor"));
110 FailureOr<StaticContinuousTileSizeSpecification>
113 unsigned targetSize) {
115 assert(!op.hasDynamicShape() &&
116 "cannot compute static multi-tile sizes for an op with dynamic shape");
117 assert(targetSize > 0 &&
"target size must be non-negative");
118 assert(dimension < op.getNumLoops() &&
"dimension overflow");
121 int64_t loopRange = op.getStaticLoopRanges()[dimension];
122 int64_t tripCount = loopRange / targetSize;
124 unsigned tileSize = targetSize;
129 int64_t remainderChunk = loopRange % targetSize;
131 while (tileSize > 1 && remainderChunk != 0) {
133 uint64_t maxPower = llvm::bit_floor(tileSize);
134 tileSize = maxPower == tileSize ? maxPower >> 1 : maxPower;
136 tripCount = remainderChunk / tileSize;
143 remainderChunk = remainderChunk % tileSize;
148 int64_t range) ->
bool {
149 int64_t computedRange = 0;
150 for (
auto [tileSize, tripCount] : llvm::zip(tileSizes, tripCounts))
151 computedRange += tileSize * tripCount;
152 return range == computedRange;
161 FailureOr<ContinuousTileSizeSpecification>
165 bool emitAssertions) {
168 unsigned numLoops = loopRanges.size();
171 if (dimension >= numLoops)
177 if (emitAssertions) {
180 Value targetSizeValue =
186 loopRanges[dimension].size);
196 Value tripCountValue = apply(s0.
floorDiv(s1), {loopRange, targetSizeValue});
197 Value remainderChunkValue = apply(s0 % s1, {loopRange, targetSizeValue});
206 assert(tileSizeInt > 0 &&
"target size must be non-negative");
208 spec.
tileSizes.push_back(targetSizeValue);
211 while (tileSizeInt > 1) {
212 uint64_t maxPower = llvm::bit_floor(tileSizeInt);
213 tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
216 tripCountValue = apply(s0.
floorDiv(s1), {remainderChunkValue, constStepOp});
219 b, b.
getLoc(), s0.
floorDiv(s1), {remainderChunkValue, constStepOp});
222 if (
Attribute attr = llvm::dyn_cast_if_present<Attribute>(tripCountSize)) {
223 auto intAttr = cast<IntegerAttr>(attr);
224 bool isTripCountZero = intAttr.getValue().isZero();
226 if (!isTripCountZero) {
235 remainderChunkValue = apply(s0 % s1, {remainderChunkValue, constStepOp});
241 FailureOr<StaticMultiSizeSpecification>
243 int64_t targetSize, int64_t divisor) {
244 assert(!op.hasDynamicShape() &&
245 "cannot compute static multi-tile sizes for an op with dynamic shape");
246 assert(targetSize > 0 &&
"target size must be non-negative");
247 assert(divisor > 0 &&
"divisor must be non-negative");
248 assert(dimension < op.getNumLoops() &&
"dimension overflow");
251 int64_t tripCount = op.getStaticLoopRanges()[dimension];
252 int64_t a = tripCount / divisor;
253 int64_t t = (targetSize + divisor - 1) / divisor;
254 int64_t totalTripCount = (a + t - 1) / t;
267 FailureOr<MultiSizeSpecification>
272 if (dimension >= op.getNumLoops())
278 if (emitAssertions) {
282 Value targetSizeValue =
289 op.createFlatListOfOperandDims(b, b.
getLoc());
290 AffineMap shapesToLoops = op.getShapesToLoopsMap();
305 Value t = apply((s0 + s1 - 1).floorDiv(s1), {targetSizeValue, divisorValue});
306 Value d = apply((s0 + s1 - 1).floorDiv(s1), {a, t});
308 Value v = apply(s0 % s1, {a, d});
309 Value u = apply(s0 - s1, {d, v});
313 spec.highTileSize = apply(s0 + s1, {s, divisorValue});
314 spec.lowTripCount = u;
315 spec.highTripCount = v;
321 if (emitAssertions) {
324 apply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
325 spec.highTileSize, spec.highTripCount});
326 Value equals = b.
create<arith::CmpIOp>(arith::CmpIPredicate::eq,
327 coveredSize, tripCount);
330 "could not compute dynamic multi-size tile shapes"));
344 if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
346 return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
370 bool omitTileOffsetBoundsCheck,
379 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
382 int64_t nLoops = loopRanges.size();
383 tiledOffsets.reserve(nLoops);
384 tiledSizes.reserve(nLoops);
385 for (
unsigned loopIdx = 0, threadIdIdx = 0; loopIdx < nLoops; ++loopIdx) {
386 bool overflow = loopIdx >= numThreads.size();
389 if (overflow || isZero) {
390 tiledOffsets.push_back(loopRanges[loopIdx].offset);
391 tiledSizes.push_back(loopRanges[loopIdx].size);
405 nominalTileSizes.has_value()
406 ? (*nominalTileSizes)[loopIdx]
408 b, loc, m.ceilDiv(n),
413 b, loc, i +
j * m, {offset, threadId, tileSizePerThread});
416 b, loc, i +
j * m - n,
417 {offset, nonZeroNumThreads[threadIdIdx], tileSizePerThread, size});
420 b, loc, -i + m, {offsetPerThread, size});
422 buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
425 tiledOffsets.push_back(offsetPerThread);
427 if (!omitTileOffsetBoundsCheck &&
429 nonZeroNumThreads[threadIdIdx], size))
433 tiledSizes.push_back(tileSizePerThread);
445 auto iterators = linalgOp.getIteratorTypesArray();
448 for (
unsigned i = 0, e = numThreads.size(); i != e; i++) {
449 if (
auto attr = llvm::dyn_cast_if_present<Attribute>(numThreads[i])) {
450 if (cast<IntegerAttr>(attr).getValue().getSExtValue() > 1) {
451 safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
454 safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
477 std::optional<ArrayAttr> mapping,
bool omitTileOffsetBoundsCheck) {
482 if (loopRanges.empty())
483 return op->
emitOpError(
"expected non-empty loop ranges");
485 if (llvm::any_of(loopRanges, hasStrideOne))
486 return op->
emitOpError(
"only stride-1 supported atm");
491 return op->
emitOpError(
"failed to get destination tensors");
494 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
498 llvm::to_vector(llvm::map_range(nonZeroNumThreads, [&](
OpFoldResult ofr) {
502 LinalgOp linalgOp = dyn_cast<LinalgOp>(op.getOperation());
507 for (
size_t i = 0; i < tilingSafety.size(); i++)
508 if (!tilingSafety[i])
509 op.
emitWarning() <<
"tiling is not thread safe at axis #" << i;
515 scf::ForallOp forallOp = b.
create<scf::ForallOp>(
521 omitTileOffsetBoundsCheck, nominalTileSizes,
522 tiledOffsets, tiledSizes);
534 auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
535 if (destinationStyleOp) {
536 for (
OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) {
539 if (isa<TensorType>(outOperand.get().getType())) {
540 auto *it = llvm::find(dest, outOperand.get());
541 assert(it != dest.end() &&
"could not find destination tensor");
542 unsigned destNum = std::distance(dest.begin(), it);
543 outOperand.set(destBbArgs[destNum]);
549 FailureOr<TilingResult> tilingResult =
550 cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
552 if (failed(tilingResult))
553 return clonedOp->
emitError(
"Failed to tile op: ");
554 if (tilingResult->tiledOps.size() != 1) {
555 return clonedOp->
emitError(
"expected a single produced tiled op, got ")
556 << tilingResult->tiledOps.size();
560 tiledOp = tilingResult->tiledOps.front();
561 tiledValues = tilingResult->tiledValues;
565 for (
auto it : llvm::zip(llvm::seq(
unsigned(0),
unsigned(dest.size())),
566 tiledValues, destBbArgs)) {
572 if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets,
573 tiledSizes, resultOffsets,
575 return op->
emitOpError(
"output offsets couldn't be calculated");
581 b.
create<tensor::ParallelInsertSliceOp>(loc, std::get<1>(it),
582 std::get<2>(it), resultOffsets,
583 resultSizes, strides);
588 FailureOr<ForallTilingResult>
591 std::optional<ArrayAttr> mapping) {
593 std::nullopt, mapping,
597 FailureOr<ForallTilingResult>
600 std::optional<ArrayAttr> mapping) {
602 unsigned nLoops = loopRanges.size();
604 numThreads.reserve(nLoops);
608 for (
const auto &it : llvm::zip(tileSizes, loopRanges)) {
612 b, op.
getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
613 numThreads.push_back(numTiles);
620 template <
typename LoopTy>
621 static FailureOr<TiledLinalgOp>
626 auto nLoops = op.getNumLoops();
628 tileSizes = tileSizes.take_front(nLoops);
634 tiledOp.
op = cast<LinalgOp>(b.
clone(*op.getOperation()));
636 tiledOp.
op->result_end());
642 op.createFlatListOfOperandDims(b, op.
getLoc());
643 AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
644 if (!shapeSizesToLoopsMap)
648 b, op.
getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);
651 for (
const auto &attr :
enumerate(op.getIteratorTypesArray())) {
652 if (loopIndexToRangeIndex.count(attr.index()))
653 iteratorTypes.push_back(attr.value());
657 auto invPermutationMap =
659 if (!
options.interchangeVector.empty()) {
663 interchangeVector.reserve(
options.interchangeVector.size());
664 for (
auto pos :
options.interchangeVector) {
665 auto it = loopIndexToRangeIndex.find(pos);
666 if (it == loopIndexToRangeIndex.end())
668 interchangeVector.push_back(it->second);
674 assert(invPermutationMap);
676 interchangeVector.end());
686 iteratorTypes.size(),
693 parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
695 auto returnedProcInfo =
696 options.distribution->procInfo(b, op.
getLoc(), parallelLoopRanges);
697 unsigned procIdIdx = 0;
702 procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
709 auto tiledLoopBodyBuilder =
712 ivs.assign(localIvs.begin(), localIvs.end());
718 if (!
options.interchangeVector.empty()) {
719 for (
AffineExpr result : invPermutationMap.getResults())
720 interchangedIvs.push_back(
721 ivs[cast<AffineDimExpr>(result).getPosition()]);
723 interchangedIvs.assign(ivs.begin(), ivs.end());
728 assert(operandValuesToUse.size() ==
730 "expect the number of operands and inputs and outputs to match");
742 res =
clone(b, op, resultTensorTypes, tiledOperands);
748 tiledLoopBodyBuilder, procInfo);
755 loops.reserve(ivs.size());
756 for (
auto iv : ivs) {
757 if (isa<BlockArgument>(iv)) {
758 loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
759 assert(loops.back() &&
"no owner found for induction variable!");
763 loops.push_back(
nullptr);
771 if ((outermostLoop = loop))
775 res, loops, outermostLoop ? outermostLoop->
getResults() : tensorResults};
781 std::optional<ArrayAttr> mapping) {
788 auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());
794 auto destinationStyleOp =
795 dyn_cast<DestinationStyleOpInterface>(op.getOperation());
796 if (!destinationStyleOp)
800 auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
807 op,
"don't support ops with multiple results for now");
810 tilingInterfaceOp.getLoopIteratorTypes();
812 linalgOp.getReductionDims(redDims);
813 if (redDims.size() != 1)
815 op,
"only support ops with one reduction dimension.");
816 if (!tileSizes.empty() && tileSizes.size() != numThreads.size())
818 "many elements as number of threads");
819 int reductionDim =
static_cast<int>(redDims.front());
821 if (redDims.front() >= numThreads.size())
823 op,
"reduction dimension must be mapped to threads");
826 FailureOr<SmallVector<Value>> maybeInitTensors =
827 op.generateInitialTensorForPartialReduction(b, loc, numThreads,
829 if (failed(maybeInitTensors))
831 op,
"Failed to create inital tensors for partial reduction");
842 llvm::to_vector(llvm::make_filter_range(numThreads, [](
OpFoldResult ofr) {
849 scf::ForallOp forallOp = b.
create<scf::ForallOp>(
858 std::nullopt, tiledOffsets,
871 for (
Value initOperand : destinationStyleOp.getDpsInits()) {
872 auto *it = llvm::find(dest, initOperand);
873 assert(it != dest.end() &&
"dest operand not found in dest");
874 unsigned destNum = std::distance(dest.begin(), it);
880 outOffsets[reductionDim] = forallOp.getInductionVars()[0];
882 tiledDpsInitOperands.push_back(b.
create<tensor::ExtractSliceOp>(
883 loc, cast<RankedTensorType>(initOperand.getType()),
884 destBbArgs[destNum], outOffsets, sizes, strides));
892 for (
auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
893 cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),
894 tiledDpsInitOperands)) {
895 initOperandPtr.set(tiledInitValue);
900 if (tileSizes.empty()) {
901 FailureOr<TilingResult> tilingResult =
902 cast<TilingInterface>(clonedOp).getTiledImplementation(
903 b, tiledOffsets, tiledSizes);
904 if (failed(tilingResult))
905 return clonedOp->
emitError(
"Failed to tile op: ");
906 if (tilingResult->tiledOps.size() != 1) {
907 return clonedOp->
emitError(
"expected a single produced tiled op, got ")
908 << tilingResult->tiledOps.size();
910 tiledOp = tilingResult->tiledOps.front();
911 tilingResults = tilingResult->tiledValues;
914 FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
915 b, cast<LinalgOp>(clonedOp), tileSizes,
options);
916 if (failed(maybeTiled))
921 materializedNonZeroNumThreads);
922 if (maybeTiled->loops.size() != 1) {
923 return clonedOp->
emitError(
"expected a single produced loop");
925 tiledOp = maybeTiled->op;
926 tilingResults = maybeTiled->loops.front()->getResults();
933 for (
auto [index, result, bbArg] : llvm::zip(
934 llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
940 if (failed(tilingInterfaceOp.getResultTilePosition(
941 b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))
942 return op->
emitOpError(
"output offsets couldn't be calculated");
946 for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
947 if (i == reductionDim) {
948 resultOffsetsRank.push_back(forallOp.getInductionVars()[0]);
952 resultOffsetsRank.push_back(resultOffsets[offIdx++]);
953 resultSizesRank.push_back(resultSizes[sizeIdx++]);
961 b.
create<tensor::ParallelInsertSliceOp>(
962 loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
967 FailureOr<MergeResult> mergeResult =
968 op.mergeReductions(b, loc, forallOp->getResults(), reductionDim);
969 if (failed(mergeResult)) {
972 b.
replaceOp(op, mergeResult->replacements);
977 results.
loops = forallOp;
979 results.
mergeOps.append(mergeResult->mergeOps);
983 template <
typename LoopTy>
989 if (!
options.tileSizeComputationFunction)
995 auto nLoops = op.getNumLoops();
998 if (tileSizeVector.size() < nLoops) {
999 tileSizeVector.append(nLoops - tileSizeVector.size(), b.
getIndexAttr(0));
1002 return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector,
options);
1005 FailureOr<TiledLinalgOp>
1010 return tileLinalgOpImpl<scf::ForOp>(b, op,
options);
1011 case LinalgTilingLoopType::ParallelLoops:
1012 return tileLinalgOpImpl<scf::ParallelOp>(b, op,
options);
1020 template <
typename... OpTypes>
1021 class CanonicalizationPatternList;
1024 class CanonicalizationPatternList<> {
1029 template <
typename OpTy,
typename... OpTypes>
1030 class CanonicalizationPatternList<OpTy, OpTypes...> {
1033 OpTy::getCanonicalizationPatterns(patterns, patterns.
getContext());
1034 CanonicalizationPatternList<OpTypes...>::insert(patterns);
1049 affine::AffineApplyOp::getCanonicalizationPatterns(patterns, ctx);
1050 affine::AffineForOp::getCanonicalizationPatterns(patterns, ctx);
1051 affine::AffineMinOp::getCanonicalizationPatterns(patterns, ctx);
1052 affine::AffineMaxOp::getCanonicalizationPatterns(patterns, ctx);
1053 arith::ConstantIndexOp::getCanonicalizationPatterns(patterns, ctx);
1055 memref::SubViewOp::getCanonicalizationPatterns(patterns, ctx);
1056 memref::ViewOp::getCanonicalizationPatterns(patterns, ctx);
1058 scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
1059 scf::ParallelOp::getCanonicalizationPatterns(patterns, ctx);
1061 tensor::CastOp::getCanonicalizationPatterns(patterns, ctx);
1062 tensor::EmptyOp::getCanonicalizationPatterns(patterns, ctx);
1063 tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, ctx);
1064 tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, ctx);
1065 tensor::PadOp::getCanonicalizationPatterns(patterns, ctx);
1066 ctx->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(patterns);
1068 CanonicalizationPatternList<
1070 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
1071 >::insert(patterns);
static llvm::ManagedStatic< PassManagerOptions > options
SmallVector< bool > safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp, ArrayRef< OpFoldResult > numThreads)
Returns a vector of bools representing if, for each axis, op can be tiled without incurring in a race...
static FailureOr< ForallTilingResult > tileToForallOpImpl(RewriterBase &b, TilingInterface op, ArrayRef< OpFoldResult > numThreads, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, std::optional< ArrayAttr > mapping, bool omitTileOffsetBoundsCheck)
Rewrite a TilingInterface op to a tiled scf.forall.
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize)
Returns true if the maximum tile offset, tileSize * (numThreads - 1), is statically known to be less than iterationSize.
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, OpFoldResult value)
Asserts that the given index-typed value is strictly positive.
static OpFoldResult buildMax(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_max of all the vals.
static void calculateTileOffsetsAndSizes(RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef< OpFoldResult > numThreads, SmallVector< Range > loopRanges, bool omitTileOffsetBoundsCheck, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, SmallVector< OpFoldResult > &tiledOffsets, SmallVector< OpFoldResult > &tiledSizes)
Fill out the tiledOffsets and tiledSizes to be used to tile to a given number of threads.
static FailureOr< TiledLinalgOp > tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef< OpFoldResult > tileSizes, const LinalgTilingOptions &options)
static OpFoldResult buildMin(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_min of all the vals.
Base type for affine expression.
AffineExpr floorDiv(uint64_t v) const
AffineExpr ceilDiv(uint64_t v) const
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
unsigned getNumResults() const
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
Attributes are known-constant values of operations.
IntegerAttr getIndexAttr(int64_t value)
AffineExpr getAffineSymbolExpr(unsigned position)
StringAttr getStringAttr(const Twine &bytes)
MLIRContext * getContext() const
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Location getLoc() const
Accessors for the implied location.
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext * getContext() const
Return the context this location is uniqued in.
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class represents a single result from folding an operation.
This class represents an operand of an operation.
Operation is the basic unit of execution within MLIR.
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
Location getLoc()
The source location the operation was defined or derived from.
unsigned getNumOperands()
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
result_range getResults()
InFlightDiagnostic emitOpError(const Twine &message={})
Emit an error with the op name prefixed, like "'dim' op " which is convenient for verifiers.
unsigned getNumResults()
Return the number of results held by this operation.
MLIRContext * getContext() const
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Specialization of arith.constant op that returns an integer of index type.
SmallVector< OpFoldResult > makeComposedFoldedMultiResultAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Variant of makeComposedFoldedAffineApply suitable for multi-result maps.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands,...
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
FailureOr< ForallTilingResult > tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, ArrayRef< OpFoldResult > tileSizes, std::optional< ArrayAttr > mapping)
Same as tileToForallOp, but calculate the number of threads required using the given tileSizes.
SmallVector< Value > makeTiledShapes(OpBuilder &builder, Location loc, LinalgOp linalgOp, ValueRange valuesToTile, ArrayRef< OpFoldResult > ivs, ArrayRef< OpFoldResult > tileSizes, ArrayRef< OpFoldResult > sizeBounds, bool omitPartialTileCheck)
Creates extract_slice/subview ops for all valuesToTile of the given linalgOp with builder,...
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
FailureOr< ForallTilingResult > tileToForallOp(RewriterBase &builder, TilingInterface op, ArrayRef< OpFoldResult > numThreads, std::optional< ArrayAttr > mapping)
SmallVector< Value > insertSlicesBack(OpBuilder &builder, Location loc, LinalgOp op, ValueRange operands, ValueRange results)
Creates insert_slice ops that insert results back into larger tensors they were originally extracted ...
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef< OpFoldResult > offests)
Add the specified offsets to any linalg.index ops contained in the given linalgOp.
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
SmallVector< Type > getTensorOutputTypes(LinalgOp op, ValueRange operands)
Returns the list of tensor output types produced when the given structured operation op is applied to...
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
SmallVector< Value > ValueVector
An owning vector of values, handy to return from functions.
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
Include the generated interface declarations.
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particula...
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
SmallVector< scf::ForOp, 8 > Loops
Tile a nest of standard for loops rooted at rootForOp by finding such parametric tile sizes that the ...
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Transformation information returned after reduction tiling.
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
SmallVector< Value > initialValues
Initial values used for partial reductions.
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
Rewrite a TilingInterface op to a tiled scf.forall, applying tiling by numThreads.
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Callback function type used to get processor ID, and number of processors used for distribution for a...
Perform standalone tiling of a single LinalgOp by tileSizes.
SmallVector< Value, 4 > tensorResults
SmallVector< T > tileSizes
Tile sizes.
SmallVector< T > tripCounts
Number of tiles associated with each size.
T lowTripCount
Number of tiles associated with each size.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.