doxygen/Tiling%5F8cpp%5Fsource.html

 //===- Tiling.cpp - Implementation of linalg Tiling -----------------------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements the linalg dialect Tiling pass.

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/Linalg/Passes.h"


 #include "mlir/Dialect/Affine/IR/AffineOps.h"

 #include "mlir/Dialect/Affine/LoopUtils.h"

 #include "mlir/Dialect/Arith/Utils/Utils.h"

 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"

 #include "mlir/Dialect/Func/IR/FuncOps.h"

 #include "mlir/Dialect/Linalg/IR/Linalg.h"

 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"

 #include "mlir/Dialect/MemRef/IR/MemRef.h"

 #include "mlir/Dialect/SCF/Transforms/Transforms.h"

 #include "mlir/Dialect/Tensor/IR/Tensor.h"

 #include "mlir/Dialect/Utils/IndexingUtils.h"

 #include "mlir/Dialect/Utils/StaticValueUtils.h"

 #include "mlir/IR/AffineExpr.h"

 #include "mlir/IR/AffineMap.h"

 #include "mlir/IR/BuiltinOps.h"

 #include "mlir/IR/ValueRange.h"

 #include "mlir/Transforms/FoldUtils.h"

 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 #include "llvm/ADT/STLExtras.h"

 #include "llvm/Support/CommandLine.h"

 #include <utility>


 namespace mlir {

 #define GEN_PASS_DEF_LINALGTILINGPASS

 #include "mlir/Dialect/Linalg/Passes.h.inc"

 } // namespace mlir


 using namespace mlir;

 using namespace mlir::affine;

 using namespace mlir::linalg;

 using namespace mlir::scf;


 #define DEBUG_TYPE "linalg-tiling"


 /// Builds the ranges of the loops that are actually tiled.
 ///
 /// `map` is applied to `allShapeSizes` to obtain one size per loop, and
 /// `allTileSizes` supplies one tile size per loop (same order). Loops whose
 /// tile size is the constant zero are dropped. For every remaining loop a
 /// `Range` is created from the constant 0, the loop size and the tile size.
 /// The second tuple element maps each remaining loop index to the position
 /// of its range in the returned vector.
 std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
 mlir::linalg::makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,
                                   ArrayRef<OpFoldResult> allShapeSizes,
                                   ArrayRef<OpFoldResult> allTileSizes) {
   assert(allTileSizes.size() == map.getNumResults());
   // Shape sizes expressed in loop order.
   SmallVector<OpFoldResult> sizesInLoopOrder =
       makeComposedFoldedMultiResultAffineApply(b, loc, map, allShapeSizes);

   // Walk the loops once, skipping the ones with a zero tile size and
   // recording where each surviving loop lands in the result.
   LoopIndexToRangeIndexMap loopIndexToRangeIndex;
   SmallVector<Range, 4> ranges;
   int numRanges = 0;
   for (int loopIdx = 0, numLoops = allTileSizes.size(); loopIdx < numLoops;
        ++loopIdx) {
     OpFoldResult tileSize = allTileSizes[loopIdx];
     if (getConstantIntValue(tileSize) == static_cast<int64_t>(0))
       continue;
     loopIndexToRangeIndex[loopIdx] = numRanges;
     ++numRanges;
     ranges.push_back(
         Range{b.getIndexAttr(0), sizesInLoopOrder[loopIdx], tileSize});
   }
   return std::make_tuple(ranges, loopIndexToRangeIndex);
 }


 /// Offsets the indices of `op` by the induction variables of the tiled loops.
 ///
 /// `ivs` holds one value per tiled loop; `loopIndexToRangeIndex` tells, for
 /// each loop of `op` that was tiled, which entry of `ivs` belongs to it.
 /// Loops absent from the map keep a null value in the vector handed to
 /// `offsetIndices`.
 void mlir::linalg::transformIndexOps(
     RewriterBase &b, LinalgOp op, SmallVectorImpl<Value> &ivs,
     const LoopIndexToRangeIndexMap &loopIndexToRangeIndex) {
   SmallVector<Value> allIvs(op.getNumLoops(), nullptr);
   for (int loopIdx = 0, numLoops = allIvs.size(); loopIdx < numLoops;
        ++loopIdx) {
     auto it = loopIndexToRangeIndex.find(loopIdx);
     if (it != loopIndexToRangeIndex.end())
       allIvs[loopIdx] = ivs[it->second];
   }
   offsetIndices(b, op, getAsOpFoldResult(allIvs));
 }


 /// Checks that the index-typed `value` is strictly positive.
 ///
 /// When `value` holds an attribute, the check is a C++ `assert` evaluated
 /// while this function runs. Otherwise a comparison against zero and a
 /// `cf::AssertOp` are created through `b`, so the check is part of the
 /// generated IR.
 static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b,
                                          OpFoldResult value) {
   auto attr = llvm::dyn_cast_if_present<Attribute>(value);
   if (!attr) {
     // Not an attribute: emit `value > 0` and assert on it in the IR.
     Value zero = b.create<arith::ConstantIndexOp>(0);
     Value isPositive = b.create<arith::CmpIOp>(arith::CmpIPredicate::sgt,
                                                cast<Value>(value), zero);
     b.create<cf::AssertOp>(
         isPositive,
         b.getStringAttr("expected strictly positive tile size and divisor"));
     return;
   }

   assert(cast<IntegerAttr>(attr).getValue().isStrictlyPositive() &&
          "expected strictly positive tile size and divisor");
 }


 /// Computes a static sequence of tile sizes and trip counts that covers loop
 /// `dimension` of `op`.
 ///
 /// The first tile size is `targetSize`, used `loopRange / targetSize` times.
 /// While a remainder is left, the next tile size is the largest power of two
 /// strictly smaller than the previous tile size; it is recorded only when it
 /// fits into the remainder at least once. Fails if the recorded tiles do not
 /// add up to the loop range.
 ///
 /// Preconditions (asserted): `op` has a static shape, `targetSize` is
 /// strictly positive and `dimension` is a valid loop index.
 FailureOr<StaticContinuousTileSizeSpecification>
 mlir::linalg::computeStaticContinuousTileSizes(LinalgOp op,
                                                unsigned dimension,
                                                unsigned targetSize) {
   assert(!op.hasDynamicShape() &&
          "cannot compute static continuous tile sizes for an op with dynamic "
          "shape");
   assert(targetSize > 0 && "target size must be strictly positive");
   assert(dimension < op.getNumLoops() && "dimension overflow");

   StaticContinuousTileSizeSpecification spec;
   int64_t loopRange = op.getStaticLoopRanges()[dimension];
   int64_t tripCount = loopRange / targetSize;

   unsigned tileSize = targetSize;
   spec.tileSizes.push_back(tileSize);
   spec.tripCounts.push_back(tripCount);

   int64_t remainderChunk = loopRange % targetSize;
   while (tileSize > 1 && remainderChunk != 0) {
     // Step down to the largest power of two strictly below `tileSize`.
     uint64_t maxPower = llvm::bit_floor(tileSize);
     tileSize = maxPower == tileSize ? maxPower >> 1 : maxPower;

     tripCount = remainderChunk / tileSize;
     // Tile sizes that do not fit into the remainder are not recorded.
     if (tripCount > 0) {
       spec.tileSizes.push_back(tileSize);
       spec.tripCounts.push_back(tripCount);
     }

     remainderChunk = remainderChunk % tileSize;
   }

   // Sanity check: the recorded tiles must cover the loop range exactly.
   int64_t coveredRange = 0;
   for (auto [size, count] : llvm::zip(spec.tileSizes, spec.tripCounts))
     coveredRange += size * count;
   if (coveredRange != loopRange)
     return failure();

   return spec;
 }


 /// Emits IR computing a sequence of tile sizes and trip counts for loop
 /// `dimension` of `op`.
 ///
 /// The first tile size is `targetSize`; each following tile size is the
 /// largest power of two strictly smaller than the previous one, down to 1.
 /// Trip counts are emitted as affine computations on the loop range and the
 /// running remainder. Tile sizes whose trip count folds to the constant zero
 /// are omitted from the result.
 ///
 /// Fails if `dimension` is not a loop of `op` or if `targetSize` is not a
 /// constant: the sequence of tile sizes is derived here from its integer
 /// value. If `emitAssertions` is set, a check that `targetSize` is strictly
 /// positive is emitted as well.
 FailureOr<ContinuousTileSizeSpecification>
 mlir::linalg::computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
                                          unsigned dimension,
                                          OpFoldResult targetSize,
                                          bool emitAssertions) {
   SmallVector<Range> loopRanges = op.getIterationDomain(builder);
   unsigned numLoops = loopRanges.size();

   // Bail out on dimension overflow.
   if (dimension >= numLoops)
     return failure();

   // The decreasing tile sizes are computed from the integer value of the
   // target size, so a non-constant target size cannot be handled.
   std::optional<int64_t> targetSizeInt = getConstantIntValue(targetSize);
   if (!targetSizeInt)
     return failure();
   // Check the signed value: an unsigned comparison would accept negatives.
   assert(*targetSizeInt > 0 && "target size must be strictly positive");

   // The code below works only on values.
   Location loc = op->getLoc();
   ImplicitLocOpBuilder b(loc, builder);
   if (emitAssertions) {
     emitIsPositiveIndexAssertion(b, targetSize);
   }
   Value targetSizeValue =
       getValueOrCreateConstantIndexOp(builder, loc, targetSize);

   // Find the trip count of the iteration space dimension for which the tile
   // sizes are computed.
   Value loopRange = getValueOrCreateConstantIndexOp(b, loc,
                                                     loopRanges[dimension].size);
   ContinuousTileSizeSpecification spec;

   // Compute the tile sizes and the respective numbers of tiles.
   AffineExpr s0 = b.getAffineSymbolExpr(0);
   AffineExpr s1 = b.getAffineSymbolExpr(1);
   auto apply = [&](AffineExpr expr, ArrayRef<OpFoldResult> ofrs) -> Value {
     return affine::makeComposedAffineApply(b, b.getLoc(), expr, ofrs);
   };

   Value tripCountValue = apply(s0.floorDiv(s1), {loopRange, targetSizeValue});
   Value remainderChunkValue = apply(s0 % s1, {loopRange, targetSizeValue});

   OpFoldResult tripCountSize = affine::makeComposedFoldedAffineApply(
       b, b.getLoc(), s0.floorDiv(s1), {loopRange, targetSizeValue});

   uint64_t tileSizeInt = *targetSizeInt;

   spec.tileSizes.push_back(targetSizeValue);
   spec.tripCounts.push_back(tripCountValue);

   while (tileSizeInt > 1) {
     // Step down to the largest power of two strictly below `tileSizeInt`.
     uint64_t maxPower = llvm::bit_floor(tileSizeInt);
     tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
     auto constStepOp =
         builder.createOrFold<arith::ConstantIndexOp>(b.getLoc(), tileSizeInt);
     tripCountValue = apply(s0.floorDiv(s1), {remainderChunkValue, constStepOp});

     tripCountSize = affine::makeComposedFoldedAffineApply(
         b, b.getLoc(), s0.floorDiv(s1), {remainderChunkValue, constStepOp});

     // Optimization: skip the tile size if its trip count folds to zero.
     auto tripCountAttr = llvm::dyn_cast_if_present<Attribute>(tripCountSize);
     bool isTripCountZero =
         tripCountAttr && cast<IntegerAttr>(tripCountAttr).getValue().isZero();
     if (!isTripCountZero) {
       spec.tileSizes.push_back(constStepOp);
       spec.tripCounts.push_back(tripCountValue);
     }

     remainderChunkValue = apply(s0 % s1, {remainderChunkValue, constStepOp});
   }

   return spec;
 }


 /// Computes two static tile sizes, both multiples of `divisor` and differing
 /// by exactly `divisor`, and their trip counts for loop `dimension` of `op`.
 ///
 /// Fails when the computed tiles do not cover the loop range exactly, which
 /// includes the case where the loop range is smaller than `divisor`.
 ///
 /// Preconditions (asserted): `op` has a static shape, `targetSize` and
 /// `divisor` are strictly positive and `dimension` is a valid loop index.
 FailureOr<StaticMultiSizeSpecification>
 mlir::linalg::computeStaticMultiTileSizes(LinalgOp op, unsigned dimension,
                                           int64_t targetSize, int64_t divisor) {
   assert(!op.hasDynamicShape() &&
          "cannot compute static multi-tile sizes for an op with dynamic shape");
   assert(targetSize > 0 && "target size must be strictly positive");
   assert(divisor > 0 && "divisor must be strictly positive");
   assert(dimension < op.getNumLoops() && "dimension overflow");

   StaticMultiSizeSpecification spec;
   int64_t tripCount = op.getStaticLoopRanges()[dimension];
   // Number of whole `divisor`-sized units in the loop range.
   int64_t a = tripCount / divisor;
   // Target size expressed in `divisor`-sized units, rounded up.
   int64_t t = (targetSize + divisor - 1) / divisor;
   int64_t totalTripCount = (a + t - 1) / t;
   // A loop range smaller than `divisor` gives `a == 0` and therefore a zero
   // total trip count. Bail out instead of dividing by zero below.
   if (totalTripCount == 0)
     return failure();

   spec.lowTileSize = (a / totalTripCount) * divisor;
   spec.highTileSize = spec.lowTileSize + divisor;
   spec.highTripCount = a % totalTripCount;
   spec.lowTripCount = totalTripCount - spec.highTripCount;
   // The two tile sizes must cover the loop range exactly.
   if (spec.lowTileSize * spec.lowTripCount +
           spec.highTileSize * spec.highTripCount !=
       tripCount) {
     return failure();
   }
   return spec;
 }


 /// Emits IR computing two tile sizes and their trip counts for loop
 /// `dimension` of `op`, from `targetSize` and `divisor`.
 ///
 /// Fails if `dimension` is not a loop of `op`. With `emitAssertions` set,
 /// checks are emitted that `targetSize` and `divisor` are strictly positive
 /// and that the computed tiles cover the trip count exactly.
 FailureOr<MultiSizeSpecification>
 mlir::linalg::computeMultiTileSizes(OpBuilder &builder, LinalgOp op,
                                     unsigned dimension, OpFoldResult targetSize,
                                     OpFoldResult divisor, bool emitAssertions) {
   // The requested dimension must be one of the op's loops.
   if (op.getNumLoops() <= dimension)
     return failure();

   // From here on everything is expressed on values.
   Location loc = op.getLoc();
   ImplicitLocOpBuilder b(loc, builder);
   if (emitAssertions) {
     emitIsPositiveIndexAssertion(b, targetSize);
     emitIsPositiveIndexAssertion(b, divisor);
   }
   Value targetSizeValue =
       getValueOrCreateConstantIndexOp(builder, loc, targetSize);
   Value divisorValue = getValueOrCreateConstantIndexOp(builder, loc, divisor);

   // Trip count of the loop for which the tile sizes are computed.
   SmallVector<OpFoldResult> operandDims =
       op.createFlatListOfOperandDims(b, b.getLoc());
   AffineMap dimsToLoops = op.getShapesToLoopsMap();
   SmallVector<OpFoldResult> loopSizes =
       makeComposedFoldedMultiResultAffineApply(b, op.getLoc(), dimsToLoops,
                                                operandDims);
   Value tripCount =
       getValueOrCreateConstantIndexOp(b, op.getLoc(), loopSizes[dimension]);

   // Helper emitting a composed affine.apply of `expr` on `operands`.
   AffineExpr s0 = b.getAffineSymbolExpr(0);
   AffineExpr s1 = b.getAffineSymbolExpr(1);
   AffineExpr s2 = b.getAffineSymbolExpr(2);
   auto emitApply = [&](AffineExpr expr,
                        ArrayRef<OpFoldResult> operands) -> Value {
     return affine::makeComposedAffineApply(b, b.getLoc(), expr, operands);
   };

   // tripCount floordiv divisor.
   Value numUnits = emitApply(s0.floorDiv(s1), {tripCount, divisorValue});
   // ceil(targetSize / divisor).
   Value targetUnits =
       emitApply((s0 + s1 - 1).floorDiv(s1), {targetSizeValue, divisorValue});
   // ceil(numUnits / targetUnits): total number of tiles.
   Value totalTiles =
       emitApply((s0 + s1 - 1).floorDiv(s1), {numUnits, targetUnits});
   // (numUnits floordiv totalTiles) * divisor: the low tile size.
   Value lowSize = emitApply(s0.floorDiv(s1) * s2,
                             {numUnits, totalTiles, divisorValue});
   // numUnits mod totalTiles: number of high tiles.
   Value highCount = emitApply(s0 % s1, {numUnits, totalTiles});
   // totalTiles - highCount: number of low tiles.
   Value lowCount = emitApply(s0 - s1, {totalTiles, highCount});

   MultiSizeSpecification spec;
   spec.lowTileSize = lowSize;
   spec.highTileSize = emitApply(s0 + s1, {lowSize, divisorValue});
   spec.lowTripCount = lowCount;
   spec.highTripCount = highCount;

   // If requested, emit the check that the tile sizes are computed correctly.
   // For example, for iteration dimension size of 15 and the target size 8 it is
   // impossible to find two tile sizes both divisible by 8 that fully cover the
   // original space dimension.
   if (emitAssertions) {
     AffineExpr s3 = builder.getAffineSymbolExpr(3);
     Value coveredSize =
         emitApply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
                                       spec.highTileSize, spec.highTripCount});
     Value isCovered = b.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
                                               coveredSize, tripCount);
     b.create<cf::AssertOp>(
         isCovered, builder.getStringAttr(
                        "could not compute dynamic multi-size tile shapes"));
   }

   return spec;
 }


 /// Returns true when `tileSize`, `numThreads` and `iterationSize` are all
 /// constants and the maximum tile offset, `tileSize * (numThreads - 1)`, is
 /// strictly less than `iterationSize`. Returns false in every other case,
 /// including when any of the three is not a constant.
 static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
                                            OpFoldResult numThreads,
                                            OpFoldResult iterationSize) {
   std::optional<int64_t> tile = getConstantIntValue(tileSize);
   std::optional<int64_t> threads = getConstantIntValue(numThreads);
   std::optional<int64_t> extent = getConstantIntValue(iterationSize);
   if (tile && threads && extent) {
     int64_t maxOffset = *tile * (*threads - 1);
     return maxOffset < *extent;
   }
   return false;
 }


 /// Builds the maximum of all `vals` as a folded affine max over the identity
 /// map with one dimension per value.
 static OpFoldResult buildMax(OpBuilder &b, Location loc,
                              ArrayRef<OpFoldResult> vals) {
   AffineMap identity =
       AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
   return affine::makeComposedFoldedAffineMax(b, loc, identity, vals);
 }


 /// Builds the minimum of all `vals` as a folded affine min over the identity
 /// map with one dimension per value.
 static OpFoldResult buildMin(OpBuilder &b, Location loc,
                              ArrayRef<OpFoldResult> vals) {
   AffineMap identity =
       AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
   return affine::makeComposedFoldedAffineMin(b, loc, identity, vals);
 }


 /// Fill out the `tiledOffsets` and `tiledSizes` to be used to tile to a given
 /// number of threads.
 ///
 /// All IR is created at the start of the body of `forallOp`; the insertion
 /// point of `b` is restored on return. One offset and one size are appended
 /// per entry of `loopRanges`:
 ///   - if the loop has no entry in `numThreads`, or its entry is the constant
 ///     zero, the offset and size of the loop range are used unchanged;
 ///   - otherwise the per-thread tile size is taken from `nominalTileSizes`
 ///     when provided, or computed as `ceilDiv(size, numThreads)`, and the
 ///     offset is `offset + threadId * tileSize`.
 /// Induction variables of `forallOp` and the non-zero entries of `numThreads`
 /// are consumed in order, one per tiled loop. Unless
 /// `omitTileOffsetBoundsCheck` is set (or the check is provably unnecessary
 /// on constants), the tile size is clamped from below by zero.
 static void calculateTileOffsetsAndSizes(
     RewriterBase &b, Location loc, scf::ForallOp forallOp,
     ArrayRef<OpFoldResult> numThreads, SmallVector<Range> loopRanges,
     bool omitTileOffsetBoundsCheck,
     std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
     SmallVector<OpFoldResult> &tiledOffsets,
     SmallVector<OpFoldResult> &tiledSizes) {
   OpBuilder::InsertionGuard g(b);
   b.setInsertionPointToStart(forallOp.getBody(0));

   SmallVector<Value> threadIds = forallOp.getInductionVars();
   // Only loops with a non-zero thread count are tiled; `threadIdIdx` below
   // indexes both this vector and `threadIds`.
   SmallVector<OpFoldResult> nonZeroNumThreads = llvm::filter_to_vector(
       numThreads, [](OpFoldResult ofr) { return !isZeroInteger(ofr); });
   int64_t nLoops = loopRanges.size();
   tiledOffsets.reserve(nLoops);
   tiledSizes.reserve(nLoops);
   for (unsigned loopIdx = 0, threadIdIdx = 0; loopIdx < nLoops; ++loopIdx) {
     bool overflow = loopIdx >= numThreads.size();
     bool isZero = !overflow && isZeroInteger(numThreads[loopIdx]);
     // Degenerate case: take the whole domain.
     if (overflow || isZero) {
       tiledOffsets.push_back(loopRanges[loopIdx].offset);
       tiledSizes.push_back(loopRanges[loopIdx].size);
       continue;
     }

     // Tiled case: compute the offset and size.
     // `i`, `j` are dims and `m`, `n`, `o` symbols; operands of the affine
     // applies below are matched to them in that order.
     AffineExpr i, j, m, n, o;
     bindDims(b.getContext(), i, j);
     bindSymbols(b.getContext(), m, n, o);
     OpFoldResult size = loopRanges[loopIdx].size;
     OpFoldResult offset = loopRanges[loopIdx].offset;
     OpFoldResult threadId = threadIds[threadIdIdx];
     // Symbolic fixed max size per thread.
     // TODO: floor + 0/1 depending on case for better load-balancing.
     OpFoldResult tileSizePerThread =
         nominalTileSizes.has_value()
             ? (*nominalTileSizes)[loopIdx]
             : makeComposedFoldedAffineApply(
                   b, loc, m.ceilDiv(n),
                   ArrayRef<OpFoldResult>{size, nonZeroNumThreads[threadIdIdx]});

     // Dynamic offset shifted by threadId * maxSizePerThread.
     OpFoldResult offsetPerThread = makeComposedFoldedAffineApply(
         b, loc, i + j * m, {offset, threadId, tileSizePerThread});
     // offset + numThreads * tileSizePerThread - size: when this does not fold
     // to zero, the tile size is additionally bounded by what is left of
     // `size` past `offsetPerThread`.
     OpFoldResult residualTileSize = makeComposedFoldedAffineApply(
         b, loc, i + j * m - n,
         {offset, nonZeroNumThreads[threadIdIdx], tileSizePerThread, size});
     if (!isZeroInteger(residualTileSize)) {
       // size - offsetPerThread.
       OpFoldResult sizeMinusOffsetPerThread = makeComposedFoldedAffineApply(
           b, loc, -i + m, {offsetPerThread, size});
       tileSizePerThread =
           buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
     }

     tiledOffsets.push_back(offsetPerThread);
     // TODO: if tileSizePerThread <= 0 early exit.
     // Clamp the tile size to be non-negative unless the caller opted out or
     // the constant check shows the last tile offset stays below `size`.
     if (!omitTileOffsetBoundsCheck &&
         !canOmitTileOffsetInBoundsCheck(tileSizePerThread,
                                         nonZeroNumThreads[threadIdIdx], size))
       tileSizePerThread =
           buildMax(b, loc, {b.getIndexAttr(0), tileSizePerThread});

     tiledSizes.push_back(tileSizePerThread);
     ++threadIdIdx;
   }
 }


 /// Tiles `op` with `tileSizes`, generating the loop nest through
 /// `GenerateLoopNest<LoopTy>::doit`.
 ///
 /// Only the first `op.getNumLoops()` tile sizes are considered. If all of
 /// them are the constant zero, `op` is simply cloned and returned without
 /// loops. Loops with a zero tile size are not tiled. If
 /// `options.interchangeVector` is non-empty it is applied to the tiled loop
 /// ranges and iterator types; if `options.distribution` is set, processor
 /// information is requested for the leading parallel loops.
 ///
 /// Returns the tiled op, the loops owning the induction variables (null for
 /// induction variables that are not block arguments) and the tensor results.
 /// Fails if `op` has no shapes-to-loops map. The insertion point of `b` is
 /// restored on return.
 template <typename LoopTy>
 static FailureOr<TiledLinalgOp>
 tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
                  const LinalgTilingOptions &options) {
   OpBuilder::InsertionGuard g(b);

   auto nLoops = op.getNumLoops();
   // Initial tile sizes may be too big, only take the first nLoops.
   tileSizes = tileSizes.take_front(nLoops);

   // Nothing to tile when every tile size is the constant zero: return a plain
   // clone of the op and its results.
   if (llvm::all_of(tileSizes, [](OpFoldResult ofr) {
         return getConstantIntValue(ofr) == static_cast<int64_t>(0);
       })) {
     TiledLinalgOp tiledOp;
     tiledOp.op = cast<LinalgOp>(b.clone(*op.getOperation()));
     tiledOp.tensorResults.assign(tiledOp.op->result_begin(),
                                  tiledOp.op->result_end());
     return tiledOp;
   }

   // 1. Build the tiled loop ranges.
   SmallVector<OpFoldResult> allShapeSizes =
       op.createFlatListOfOperandDims(b, op.getLoc());
   AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
   if (!shapeSizesToLoopsMap)
     return failure();

   auto [loopRanges, loopIndexToRangeIndex] = makeTiledLoopRanges(
       b, op.getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);

   // Keep the iterator types of the loops that are actually tiled, i.e. those
   // that have an entry in `loopIndexToRangeIndex`.
   SmallVector<utils::IteratorType, 4> iteratorTypes;
   for (const auto &attr : enumerate(op.getIteratorTypesArray())) {
     if (loopIndexToRangeIndex.count(attr.index()))
       iteratorTypes.push_back(attr.value());
   }
   // If interchangeVector is empty, use the identity. Build the permutation map
   // otherwise.
   auto invPermutationMap =
       AffineMap::getMultiDimIdentityMap(tileSizes.size(), b.getContext());
   if (!options.interchangeVector.empty()) {
     // Based on the pruned iterations (due to zero tile size), recompute the
     // interchange vector.
     SmallVector<unsigned, 4> interchangeVector;
     interchangeVector.reserve(options.interchangeVector.size());
     for (auto pos : options.interchangeVector) {
       auto it = loopIndexToRangeIndex.find(pos);
       if (it == loopIndexToRangeIndex.end())
         continue;
       interchangeVector.push_back(it->second);
     }
     // Interchange vector is guaranteed to be a permutation,
     // `inversePermutation` must succeed.
     invPermutationMap = inversePermutation(
         AffineMap::getPermutationMap(interchangeVector, b.getContext()));
     assert(invPermutationMap);
     SmallVector<int64_t> permutation(interchangeVector.begin(),
                                      interchangeVector.end());
     applyPermutationToVector(loopRanges, permutation);
     applyPermutationToVector(iteratorTypes, permutation);
   }

   // Handle distribution. Create a vector of the same size of loops that are to
   // be tiled.
   SmallVector<linalg::ProcInfo> procInfo;
   if (options.distribution) {
     procInfo.resize(
         iteratorTypes.size(),
         linalg::ProcInfo{nullptr, nullptr, linalg::DistributionMethod::None});
     // Collect loop ranges of tiled loops, loops that are parallel.
     // Only the leading run of parallel loops is collected: the scan stops at
     // the first non-parallel iterator.
     SmallVector<Range> parallelLoopRanges;
     for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
       if (!isParallelIterator(iteratorType.value()))
         break;
       parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
     }
     auto returnedProcInfo =
         options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges);
     unsigned procIdIdx = 0;
     // Update the distribution information for the loops.
     for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
       if (!isParallelIterator(iteratorType.value()))
         break;
       procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
     }
   }

   // 2. Create the tiled loops.
   // `res`, `ivs` and `tensorResults` are written by the body builder below
   // and read after the loop nest has been generated.
   LinalgOp res = op;
   SmallVector<Value, 4> ivs, tensorResults;
   auto tiledLoopBodyBuilder =
       [&](OpBuilder &builder, Location loc, ValueRange localIvs,
           ValueRange operandValuesToUse) -> scf::ValueVector {
     ivs.assign(localIvs.begin(), localIvs.end());

     // When an `interchangeVector` is present, it has been applied to the
     // loop ranges and the iterator types. Apply its inverse to the
     // resulting loop `ivs` to match the op definition.
     SmallVector<Value, 4> interchangedIvs;
     if (!options.interchangeVector.empty()) {
       for (AffineExpr result : invPermutationMap.getResults())
         interchangedIvs.push_back(
             ivs[cast<AffineDimExpr>(result).getPosition()]);
     } else {
       interchangedIvs.assign(ivs.begin(), ivs.end());
     }

     // Tile the `operandValuesToUse` that either match the `op` operands
     // themselves or the tile loop arguments forwarding them.
     assert(operandValuesToUse.size() ==
                static_cast<size_t>(op->getNumOperands()) &&
            "expect the number of operands and inputs and outputs to match");
     SmallVector<Value> valuesToTile = operandValuesToUse;
     SmallVector<OpFoldResult> sizeBounds =
         makeComposedFoldedMultiResultAffineApply(b, loc, shapeSizesToLoopsMap,
                                                  allShapeSizes);
     SmallVector<Value> tiledOperands = makeTiledShapes(
         b, loc, op, valuesToTile, getAsOpFoldResult(interchangedIvs), tileSizes,
         sizeBounds,
         /*omitPartialTileCheck=*/false);

     // Clone the op on the tiled operands and insert its results back into
     // the full-size values; these are what the loop body yields.
     SmallVector<Type> resultTensorTypes =
         getTensorOutputTypes(op, tiledOperands);
     res = clone(b, op, resultTensorTypes, tiledOperands);
     tensorResults =
         insertSlicesBack(builder, loc, op, tiledOperands, res->getResults());
     return scf::ValueVector(tensorResults.begin(), tensorResults.end());
   };
   GenerateLoopNest<LoopTy>::doit(b, op.getLoc(), loopRanges, op, iteratorTypes,
                                  tiledLoopBodyBuilder, procInfo);

   // 3. Transform IndexOp results w.r.t. the tiling.
   transformIndexOps(b, res, ivs, loopIndexToRangeIndex);

   // 4. Gather the newly created loops and return them with the new op.
   SmallVector<Operation *, 8> loops;
   loops.reserve(ivs.size());
   for (auto iv : ivs) {
     if (isa<BlockArgument>(iv)) {
       loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
       assert(loops.back() && "no owner found for induction variable!");
     } else {
       // TODO: Instead of doing this, try to recover the ops used instead of the
       // loop.
       loops.push_back(nullptr);
     }
   }

   // 5. Get the tensor results from the outermost loop if available. Otherwise
   // use the previously captured `tensorResults`.
   // The outermost loop is the first non-null entry of `loops`.
   Operation *outermostLoop = nullptr;
   for (Operation *loop : loops)
     if ((outermostLoop = loop))
       break;

   return TiledLinalgOp{
       res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults};
 }


 FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(

     RewriterBase &b, PartialReductionOpInterface op,

     ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,

     std::optional<ArrayAttr> mapping) {

   Location loc = op.getLoc();

   OpBuilder::InsertionGuard g(b);


   // Ops implementing PartialReductionOpInterface are expected to implement

   // TilingInterface.

   // TODO: proper core mechanism to tie interfaces together.

   auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());


   // Ops implementing PartialReductionOpInterface are not necessarily expected

   // to implement TilingInterface.. This cast is unsafe atm.

   // TODO: proper core mechanism to tie interfaces together.

   // TODO: this function requires a pair of interfaces ..

   auto destinationStyleOp =

       dyn_cast<DestinationStyleOpInterface>(op.getOperation());

   if (!destinationStyleOp)

     return b.notifyMatchFailure(op, "not a destination style op");


   // Actually this only work for Linalg ops atm.

   auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());

   if (!linalgOp)

     return b.notifyMatchFailure(op, "not a linalg op");


   SmallVector<Range> iterationDomain = tilingInterfaceOp.getIterationDomain(b);

   if (op->getNumResults() != 1)

     return b.notifyMatchFailure(

         op, "don't support ops with multiple results for now");


   SmallVector<utils::IteratorType> iterators =

       tilingInterfaceOp.getLoopIteratorTypes();

   SmallVector<unsigned> redDims;

   linalgOp.getReductionDims(redDims);

   if (redDims.size() != 1)

     return b.notifyMatchFailure(

         op, "only support ops with one reduction dimension.");

   if (!tileSizes.empty() && tileSizes.size() != numThreads.size())

     return b.notifyMatchFailure(op, "if tile sizes are present it must have as "

                                     "many elements as number of threads");

   int reductionDim = static_cast<int>(redDims.front());


   if (redDims.front() >= numThreads.size())

     return b.notifyMatchFailure(

         op, "reduction dimension must be mapped to threads");


   // 1. Create the inital tensor value.

   FailureOr<SmallVector<Value>> maybeInitTensors =

       op.generateInitialTensorForPartialReduction(b, loc, numThreads,

                                                   reductionDim);

   if (failed(maybeInitTensors))

     return b.notifyMatchFailure(

         op, "Failed to create inital tensors for partial reduction");

   SmallVector<Value> &initTensors = maybeInitTensors.value();


   // Gather destination tensors.

   SmallVector<Value> dest;

   if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))

     return b.notifyMatchFailure(op, "failed to get destination tensors");


   Operation *tiledOp = nullptr;


   SmallVector<OpFoldResult> nonZeroNumThreads = llvm::filter_to_vector(

       numThreads, [](OpFoldResult ofr) { return !isZeroInteger(ofr); });

   SmallVector<Value> materializedNonZeroNumThreads =

       getValueOrCreateConstantIndexOp(b, loc, nonZeroNumThreads);


   // 2. Create the ForallOp with an empty region.

   scf::ForallOp forallOp = b.create<scf::ForallOp>(

       loc, getAsOpFoldResult(materializedNonZeroNumThreads), initTensors,

       mapping);


   // 3. Calculate the tile offsets and sizes for the subsequent loop that will

   // be nested under `forallOp`.

   SmallVector<OpFoldResult> tiledOffsets, tiledSizes;

   calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain,

                                /*omitTileOffsetBoundsCheck =*/false,

                                /*nominalTileSizes=*/std::nullopt, tiledOffsets,

                                tiledSizes);


   // 4b. Clone the tileable op and update its destination operands to use the

   // output bbArgs of the ForallOp.

   SmallVector<Value> tilingResults;

   ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();

   {

     // 4.a. RAII guard, inserting within forallOp, before terminator.

     OpBuilder::InsertionGuard g(b);

     b.setInsertionPoint(forallOp.getTerminator());


     SmallVector<Value> tiledDpsInitOperands;

     for (Value initOperand : destinationStyleOp.getDpsInits()) {

       auto *it = llvm::find(dest, initOperand);

       assert(it != dest.end() && "dest operand not found in dest");

       unsigned destNum = std::distance(dest.begin(), it);

       SmallVector<OpFoldResult> strides(numThreads.size(), b.getIndexAttr(1));

       SmallVector<OpFoldResult> outOffsets(numThreads.size(),

                                            b.getIndexAttr(0));

       SmallVector<OpFoldResult> sizes = tiledSizes;

       sizes[reductionDim] = b.getIndexAttr(1);

       outOffsets[reductionDim] = forallOp.getInductionVars()[0];

       // TODO: use SubsetExtractOpInterface once it is available.

       tiledDpsInitOperands.push_back(b.create<tensor::ExtractSliceOp>(

           loc, cast<RankedTensorType>(initOperand.getType()),

           destBbArgs[destNum], outOffsets, sizes, strides));

     }


     // 4.b. Clone the op and update init operands.

     // We cannot use a IRMapping here because it can replace

     // different OpOperands with the same value.

     Operation *clonedOp = b.clone(*op.getOperation());

     b.modifyOpInPlace(clonedOp, [&]() {

       for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(

                cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),

                tiledDpsInitOperands)) {

         initOperandPtr.set(tiledInitValue);

       }

     });


     // 5. Tile the cloned op and delete the clone.

     if (tileSizes.empty()) {

       FailureOr<TilingResult> tilingResult =

           cast<TilingInterface>(clonedOp).getTiledImplementation(

               b, tiledOffsets, tiledSizes);

       if (failed(tilingResult))

         return clonedOp->emitError("Failed to tile op: ");

       if (tilingResult->tiledOps.size() != 1) {

         return clonedOp->emitError("expected a single produced tiled op, got ")

                << tilingResult->tiledOps.size();

       }

       tiledOp = tilingResult->tiledOps.front();

       tilingResults = tilingResult->tiledValues;

     } else {

       LinalgTilingOptions options;

       FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(

           b, cast<LinalgOp>(clonedOp), tileSizes, options);

       if (failed(maybeTiled))

         return b.notifyMatchFailure(op, "failed tileLinalgOpImpl");


       SmallVector<Value> ids = forallOp.getInductionVars();

       mapLoopToProcessorIds(cast<scf::ForOp>(maybeTiled->loops.back()), ids,

                             materializedNonZeroNumThreads);

       if (maybeTiled->loops.size() != 1) {

         return clonedOp->emitError("expected a single produced loop");

       }

       tiledOp = maybeTiled->op;

       tilingResults = maybeTiled->loops.front()->getResults();

     }


     b.eraseOp(clonedOp);

   }


   // 6. Insert the partial reductions back into a new tensor.

   for (auto [index, result, bbArg] : llvm::zip(

            llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {

     // 6.a. Partial subset information is inserted just before the terminator.

     OpBuilder::InsertionGuard g(b);

     b.setInsertionPoint(forallOp.getTerminator());


     SmallVector<OpFoldResult> resultOffsets, resultSizes;

     if (failed(tilingInterfaceOp.getResultTilePosition(

             b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))

       return op->emitOpError("output offsets couldn't be calculated");

     SmallVector<OpFoldResult> resultOffsetsRank, resultSizesRank;

     int64_t offIdx = 0;

     int64_t sizeIdx = 0;

     for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {

       if (i == reductionDim) {

         resultOffsetsRank.push_back(forallOp.getInductionVars()[0]);

         resultSizesRank.push_back(b.getIndexAttr(1));

         continue;

       }

       resultOffsetsRank.push_back(resultOffsets[offIdx++]);

       resultSizesRank.push_back(resultSizes[sizeIdx++]);

     }

     SmallVector<OpFoldResult> strides(resultSizesRank.size(),

                                       b.getIndexAttr(1));


     // 6.b. Parallel insertions are inserted at the end of the combining

     // terminator.

     b.setInsertionPointToEnd(forallOp.getTerminator().getBody());

     b.create<tensor::ParallelInsertSliceOp>(

         loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);

   }


   // 7. Merge the partial reductions.

   b.setInsertionPointAfter(forallOp);

   FailureOr<MergeResult> mergeResult =

       op.mergeReductions(b, loc, forallOp->getResults(), reductionDim);

   if (failed(mergeResult)) {

     return failure();

   }

   b.replaceOp(op, mergeResult->replacements);


   // 8. Return.

   ForallReductionTilingResult results;

   results.initialValues = initTensors;

   results.loops = forallOp;

   results.parallelTiledOps.push_back(tiledOp);

   results.mergeOps.append(mergeResult->mergeOps);

   return results;

 }


 /// Tiles `op` with the tile sizes produced by
 /// `options.tileSizeComputationFunction`, generating loops of type `LoopTy`.
 ///
 /// Returns failure when `options` carries no tile-size callback. When the
 /// callback yields fewer sizes than `op` has loops, the missing trailing
 /// entries are filled with the index constant 0 before forwarding to the
 /// overload that takes explicit tile sizes.
 template <typename LoopTy>
 FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
     RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options) {
   // The guard restores the caller's insertion point on every exit path.
   OpBuilder::InsertionGuard insertionGuard(b);
   b.setInsertionPoint(op);

   const auto &computeTileSizes = options.tileSizeComputationFunction;
   if (!computeTileSizes)
     return failure();

   // Convention: a tile size of zero means "do not tile this dimension".
   // Padding with zeros is significantly simpler than adjusting affine maps
   // to account for missing dimensions.
   const auto numLoops = op.getNumLoops();
   SmallVector<OpFoldResult> paddedTileSizes =
       getAsOpFoldResult(computeTileSizes(b, op));
   if (paddedTileSizes.size() < numLoops) {
     const auto numMissing = numLoops - paddedTileSizes.size();
     paddedTileSizes.append(numMissing, b.getIndexAttr(0));
   }

   return tileLinalgOpImpl<LoopTy>(b, op, paddedTileSizes, options);
 }


 /// Tiles `op` using the loop type selected by `options.loopType`:
 /// `scf::ForOp` for `LinalgTilingLoopType::Loops` and `scf::ParallelOp` for
 /// `LinalgTilingLoopType::ParallelLoops`. Any other loop type yields failure.
 FailureOr<TiledLinalgOp>
 mlir::linalg::tileLinalgOp(RewriterBase &b, LinalgOp op,
                            const LinalgTilingOptions &options) {
   const auto loopKind = options.loopType;
   if (loopKind == LinalgTilingLoopType::Loops)
     return tileLinalgOpImpl<scf::ForOp>(b, op, options);
   if (loopKind == LinalgTilingLoopType::ParallelLoops)
     return tileLinalgOpImpl<scf::ParallelOp>(b, op, options);
   // No tiling implementation for the remaining loop types.
   return failure();
 }


 namespace {
 /// Type-list helper that registers the canonicalization patterns of every
 /// operation type in `OpTypes`, in the order the types are listed.
 ///
 /// The primary template is only selected for the empty list (any non-empty
 /// list matches the partial specialization below) and registers nothing.
 template <typename... OpTypes>
 struct CanonicalizationPatternList {
   static void insert(RewritePatternSet &) {}
 };

 /// Recursive case: register the patterns of the first op type, then recurse
 /// on the remaining ones.
 template <typename FirstOpTy, typename... RemainingOpTys>
 struct CanonicalizationPatternList<FirstOpTy, RemainingOpTys...> {
   static void insert(RewritePatternSet &patterns) {
     FirstOpTy::getCanonicalizationPatterns(patterns, patterns.getContext());
     CanonicalizationPatternList<RemainingOpTys...>::insert(patterns);
   }
 };
 } // namespace


 /// Builds a fresh pattern set in `ctx` and fills it via
 /// `populateLinalgTilingCanonicalizationPatterns`.
 RewritePatternSet
 mlir::linalg::getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx) {
   RewritePatternSet canonicalizations(ctx);
   populateLinalgTilingCanonicalizationPatterns(canonicalizations);
   return canonicalizations;
 }


 /// Appends to `patterns` the canonicalization patterns of a fixed list of
 /// affine, arith, memref, scf and tensor ops, then those of the Linalg
 /// dialect, and finally those of every Linalg structured op.
 void mlir::linalg::populateLinalgTilingCanonicalizationPatterns(
     RewritePatternSet &patterns) {
   // Non-Linalg ops; the helper registers them in the order listed.
   CanonicalizationPatternList<
       // Affine.
       affine::AffineApplyOp, affine::AffineForOp, affine::AffineMinOp,
       affine::AffineMaxOp,
       // Arith.
       arith::ConstantIndexOp,
       // MemRef.
       memref::SubViewOp, memref::ViewOp,
       // SCF.
       scf::ForOp, scf::ParallelOp,
       // Tensor.
       tensor::CastOp, tensor::EmptyOp, tensor::ExtractSliceOp,
       tensor::InsertSliceOp, tensor::PadOp>::insert(patterns);

   // Dialect-level Linalg canonicalizations.
   MLIRContext *context = patterns.getContext();
   context->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(
       patterns);

   // Per-op canonicalizations for all generated Linalg structured ops.
   CanonicalizationPatternList<
 #define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
       >::insert(patterns);
 }

AffineOps.h

BuiltinOps.h

ControlFlowOps.h

Utils.h

Passes.h

FoldUtils.h

FuncOps.h

GreedyPatternRewriteDriver.h

IndexingUtils.h

doit
DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target, transform::ApplyToEachResultList &results, transform::TransformState &state)
Definition: LinalgTransformOps.cpp:3892

LoopUtils.h

options
static llvm::ManagedStatic< PassManagerOptions > options
Definition: PassManagerOptions.cpp:89

StaticValueUtils.h

canOmitTileOffsetInBoundsCheck
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize)
Returns true if the maximum tile offset tileSize * (numThreads - 1) is less than iterationSize.
Definition: Tiling.cpp:339

emitIsPositiveIndexAssertion
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b, OpFoldResult value)
Asserts that the given index-typed value is strictly positive.
Definition: Tiling.cpp:95

buildMax
static OpFoldResult buildMax(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_max of all the vals.
Definition: Tiling.cpp:351

calculateTileOffsetsAndSizes
static void calculateTileOffsetsAndSizes(RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef< OpFoldResult > numThreads, SmallVector< Range > loopRanges, bool omitTileOffsetBoundsCheck, std::optional< ArrayRef< OpFoldResult >> nominalTileSizes, SmallVector< OpFoldResult > &tiledOffsets, SmallVector< OpFoldResult > &tiledSizes)
Fill out the tiledOffsets and tiledSizes to be used to tile to a given number of threads.
Definition: Tiling.cpp:368

tileLinalgOpImpl
static FailureOr< TiledLinalgOp > tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef< OpFoldResult > tileSizes, const LinalgTilingOptions &options)
Definition: Tiling.cpp:439

buildMin
static OpFoldResult buildMin(OpBuilder &b, Location loc, ArrayRef< OpFoldResult > vals)
Build an affine_min of all the vals.
Definition: Tiling.cpp:359

ValueRange.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::DenseMap
Definition: LLVM.h:55

llvm::SmallVectorImpl
Definition: LLVM.h:74

llvm::SmallVector
Definition: LLVM.h:72

mlir::AffineExpr
Base type for affine expression.
Definition: AffineExpr.h:68

mlir::AffineExpr::floorDiv
AffineExpr floorDiv(uint64_t v) const
Definition: AffineExpr.cpp:921

mlir::AffineMap
A multi-dimensional affine map. Affine maps are immutable, like Types, and they are uniqued.
Definition: AffineMap.h:46

mlir::AffineMap::getMultiDimIdentityMap
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
Definition: AffineMap.cpp:334

mlir::AffineMap::getNumResults
unsigned getNumResults() const
Definition: AffineMap.cpp:402

mlir::AffineMap::getPermutationMap
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
Definition: AffineMap.cpp:264

mlir::Attribute
Attributes are known-constant values of operations.
Definition: Attributes.h:25

mlir::Builder::getIndexAttr
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:106

mlir::Builder::getAffineSymbolExpr
AffineExpr getAffineSymbolExpr(unsigned position)
Definition: Builders.cpp:366

mlir::Builder::getStringAttr
StringAttr getStringAttr(const Twine &bytes)
Definition: Builders.cpp:260

mlir::Builder::getContext
MLIRContext * getContext() const
Definition: Builders.h:55

mlir::ImplicitLocOpBuilder
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Definition: ImplicitLocOpBuilder.h:23

mlir::ImplicitLocOpBuilder::getLoc
Location getLoc() const
Accessors for the implied location.
Definition: ImplicitLocOpBuilder.h:56

mlir::ImplicitLocOpBuilder::create
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
Definition: ImplicitLocOpBuilder.h:66

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76

mlir::Location::getContext
MLIRContext * getContext() const
Return the context this location is uniqued in.
Definition: Location.h:86

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir::OpBuilder::InsertionGuard
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:346

mlir::OpBuilder
This class helps build Operations.
Definition: Builders.h:205

mlir::OpBuilder::clone
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:551

mlir::OpBuilder::setInsertionPointToStart
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429

mlir::OpBuilder::setInsertionPoint
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:396

mlir::OpBuilder::setInsertionPointToEnd
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:434

mlir::OpBuilder::createOrFold
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:518

mlir::OpBuilder::create
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:455

mlir::OpBuilder::setInsertionPointAfter
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:410

mlir::OpFoldResult
This class represents a single result from folding an operation.
Definition: OpDefinition.h:271

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::Operation::emitError
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:268

mlir::Operation::getResults
result_range getResults()
Definition: Operation.h:415

mlir::RewritePatternSet
Definition: PatternMatch.h:772

mlir::RewriterBase
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:358

mlir::RewriterBase::notifyMatchFailure
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Definition: PatternMatch.h:682

mlir::RewriterBase::replaceOp
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
Definition: PatternMatch.cpp:129

mlir::RewriterBase::eraseOp
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
Definition: PatternMatch.cpp:157

mlir::RewriterBase::modifyOpInPlace
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:594

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::arith::ConstantIndexOp
Specialization of arith.constant op that returns an integer of index type.
Definition: Arith.h:93

Linalg.h

Transforms.h

MemRef.h

Transforms.h

Tensor.h

AffineExpr.h

AffineMap.h

mlir::affine
Definition: AffineToStandard.h:23

mlir::affine::makeComposedFoldedMultiResultAffineApply
SmallVector< OpFoldResult > makeComposedFoldedMultiResultAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Variant of makeComposedFoldedAffineApply suitable for multi-result maps.
Definition: AffineOps.cpp:1271

mlir::affine::makeComposedAffineApply
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
Definition: AffineOps.cpp:1175

mlir::affine::makeComposedFoldedAffineMax
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands,...
Definition: AffineOps.cpp:1336

mlir::affine::makeComposedFoldedAffineMin
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
Definition: AffineOps.cpp:1329

mlir::affine::makeComposedFoldedAffineApply
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Definition: AffineOps.cpp:1225

mlir::affine::mapLoopToProcessorIds
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1725

mlir::detail::enumerate
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344

mlir::linalg
Definition: LinalgToStandard.h:24

mlir::linalg::makeTiledShapes
SmallVector< Value > makeTiledShapes(OpBuilder &builder, Location loc, LinalgOp linalgOp, ValueRange valuesToTile, ArrayRef< OpFoldResult > ivs, ArrayRef< OpFoldResult > tileSizes, ArrayRef< OpFoldResult > sizeBounds, bool omitPartialTileCheck)
Creates extract_slice/subview ops for all valuesToTile of the given linalgOp with builder,...
Definition: Utils.cpp:857

mlir::linalg::transformIndexOps
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
Definition: Tiling.cpp:79

mlir::linalg::isParallelIterator
bool isParallelIterator(utils::IteratorType iteratorType)
Check if iterator type has "parallel" semantics.
Definition: Utils.cpp:238

mlir::linalg::populateLinalgTilingCanonicalizationPatterns
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
Definition: Tiling.cpp:861

mlir::linalg::insertSlicesBack
SmallVector< Value > insertSlicesBack(OpBuilder &builder, Location loc, LinalgOp op, ValueRange operands, ValueRange results)
Creates insert_slice ops that insert results back into larger tensors they were originally extracted ...
Definition: Utils.cpp:777

mlir::linalg::makeTiledLoopRanges
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
Definition: Tiling.cpp:50

mlir::linalg::offsetIndices
void offsetIndices(OpBuilder &b, LinalgOp linalgOp, ArrayRef< OpFoldResult > offests)
Add the specified offsets to any linalg.index ops contained in the given linalgOp.
Definition: Utils.cpp:879

mlir::linalg::computeStaticMultiTileSizes
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
Definition: Tiling.cpp:243

mlir::linalg::computeContinuousTileSizes
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
Definition: Tiling.cpp:163

mlir::linalg::computeStaticContinuousTileSizes
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
Definition: Tiling.cpp:112

mlir::linalg::tileReductionUsingForall
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
Definition: Tiling.cpp:595

mlir::linalg::tileLinalgOp
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
Definition: Tiling.cpp:821

mlir::linalg::getLinalgTilingCanonicalizationPatterns
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
Definition: Tiling.cpp:855

mlir::linalg::getTensorOutputTypes
SmallVector< Type > getTensorOutputTypes(LinalgOp op, ValueRange operands)
Returns the list of tensor output types produced when the given structured operation op is applied to...
Definition: Utils.cpp:768

mlir::linalg::computeMultiTileSizes
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
Definition: Tiling.cpp:269

mlir::scf
Definition: SCFToGPU.h:24

mlir::scf::ValueVector
SmallVector< Value > ValueVector
An owning vector of values, handy to return from functions.
Definition: SCF.h:64

mlir::tensor::getOrCreateDestinations
LogicalResult getOrCreateDestinations(OpBuilder &b, Location loc, Operation *op, SmallVector< Value > &result)
This is a helper function for DestinationStyleOpInterface.
Definition: TensorOps.cpp:117

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::getConstantIntValue
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
Definition: StaticValueUtils.cpp:115

mlir::bindDims
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
Definition: AffineExpr.h:311

mlir::inversePermutation
AffineMap inversePermutation(AffineMap map)
Returns a map of codomain to domain dimensions such that the first codomain dimension for a particula...
Definition: AffineMap.cpp:788

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:283

mlir::isZeroInteger
bool isZeroInteger(OpFoldResult v)
Return true if v is an IntegerAttr with value 0.
Definition: StaticValueUtils.cpp:18

mlir::bindSymbols
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
Definition: AffineExpr.h:325

mlir::getValueOrCreateConstantIndexOp
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:112

mlir::clone
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
Definition: StructuredOpsUtils.cpp:197

mlir::getAsOpFoldResult
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
Definition: StaticValueUtils.cpp:79

mlir::Loops
SmallVector< scf::ForOp, 8 > Loops
Tile a nest of standard for loops rooted at rootForOp by finding such parametric tile sizes that the ...
Definition: Utils.h:154

mlir::applyPermutationToVector
void applyPermutationToVector(SmallVector< T, N > &inVec, ArrayRef< int64_t > permutation)
Apply the permutation defined by permutation to inVec.
Definition: IndexingUtils.h:226

mlir::Range
Represents a range (offset, size, and stride) where each element of the triple may be dynamic or stat...
Definition: StaticValueUtils.h:35

mlir::linalg::ContinuousTileSizeSpecification
Definition: Transforms.h:844

mlir::linalg::ForallReductionTilingResult
Transformation information returned after reduction tiling.
Definition: Transforms.h:892

mlir::linalg::ForallReductionTilingResult::mergeOps
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
Definition: Transforms.h:896

mlir::linalg::ForallReductionTilingResult::initialValues
SmallVector< Value > initialValues
Initial values used for partial reductions.
Definition: Transforms.h:898

mlir::linalg::ForallReductionTilingResult::loops
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
Definition: Transforms.h:900

mlir::linalg::ForallReductionTilingResult::parallelTiledOps
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
Definition: Transforms.h:894

mlir::linalg::LinalgTilingOptions
Definition: Transforms.h:189

mlir::linalg::MultiSizeSpecification
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Definition: Transforms.h:839

mlir::linalg::ProcInfo
Callback function type used to get processor ID, and number of processors used for distribution for a...
Definition: Utils.h:306

mlir::linalg::StaticContinuousTileSizeSpecification
Definition: Transforms.h:846

mlir::linalg::StaticMultiSizeSpecification
Definition: Transforms.h:841

mlir::linalg::TiledLinalgOp
Perform standalone tiling of a single LinalgOp by tileSizes.
Definition: Transforms.h:680

mlir::linalg::TiledLinalgOp::op
LinalgOp op
Definition: Transforms.h:681

mlir::linalg::TiledLinalgOp::tensorResults
SmallVector< Value, 4 > tensorResults
Definition: Transforms.h:683

mlir::linalg::detail::ContinuousTileSizeSpecificationBase::tileSizes
SmallVector< T > tileSizes
Tile sizes.
Definition: Transforms.h:828

mlir::linalg::detail::ContinuousTileSizeSpecificationBase::tripCounts
SmallVector< T > tripCounts
Number of tiles associated with each size.
Definition: Transforms.h:830

mlir::linalg::detail::MultiSizeSpecificationBase::lowTripCount
T lowTripCount
Number of tiles associated with each size.
Definition: Transforms.h:822

mlir::linalg::detail::MultiSizeSpecificationBase::lowTileSize
T lowTileSize
Tile sizes.
Definition: Transforms.h:820

mlir::linalg::detail::MultiSizeSpecificationBase::highTileSize
T highTileSize
Definition: Transforms.h:820

mlir::linalg::detail::MultiSizeSpecificationBase::highTripCount
T highTripCount
Definition: Transforms.h:822

j
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.