doxygen/ParallelLoopTiling_8cpp_source.html

 //===- ParallelLoopTiling.cpp - Tiles scf.parallel ------------------------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements loop tiling on parallel loops.

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/SCF/Transforms/Passes.h"

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/Utils/Utils.h"

 #include <cassert>


 namespace mlir {

 #define GEN_PASS_DEF_SCFPARALLELLOOPTILING

 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"

 } // namespace mlir


 using namespace mlir;

 using namespace mlir::scf;


 /// Tile a parallel loop of the form
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
 ///                                            step (%arg4, %arg5)
 ///
 /// into
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
 ///                                            step (%arg4*tileSize[0],
 ///                                                  %arg5*tileSize[1])
 ///     scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
 ///                                          min(%arg5*tileSize[1], %arg3-%i1))
 ///                                      step (%arg4, %arg5)
 ///
 /// or, when no-min-max-bounds is true, into
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
 ///                                            step (%arg4*tileSize[0],
 ///                                                  %arg5*tileSize[1])
 ///     scf.parallel (%j0, %j1) = (0, 0) to (%arg4*tileSize[0],
 ///                                          %arg5*tileSize[1])
 ///                                      step (%arg4, %arg5)
 ///        %inbound = (%j0 * %arg4 + %i0 < %arg2) &&
 ///                   (%j1 * %arg5 + %i1 < %arg3)
 ///        scf.if (%inbound)
 ///          ....
 ///
 /// where the uses of %i0 and %i1 in the loop body are replaced by
 /// %i0 + %j0 and %i1 + %j1.
 ///
 /// The old loop is replaced with the new one.
 ///
 /// \param op             The scf.parallel loop to tile. Its body is moved into
 ///                       the new inner loop and `op` itself is erased.
 /// \param tileSizes      Tile size for each loop dimension, in order. Loop
 ///                       dimensions without a corresponding entry get a tile
 ///                       size of 1. No entry may be 0.
 /// \param noMinMaxBounds When true, inner loops whose bound cannot be proven
 ///                       static use the full tile as upper bound and the body
 ///                       is wrapped in an scf.if in-bound check instead of
 ///                       using an affine.min bound.
 /// \returns The pair (outer loop, inner loop) of newly created loops.
 std::pair<ParallelOp, ParallelOp>
 mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes,
                             bool noMinMaxBounds) {
   // A zero tile size would produce a zero step for the outer loop and a
   // modulo-by-zero in the static-bound test below, so reject it up front.
   assert(llvm::all_of(tileSizes, [](int64_t size) { return size != 0; }) &&
          "tile size cannot be 0");

   OpBuilder b(op);
   auto zero = arith::ConstantIndexOp::create(b, op.getLoc(), 0);

   // Materialize one constant per loop dimension; dimensions that have no
   // user-provided tile size are tiled by 1.
   SmallVector<Value, 2> tileSizeConstants;
   tileSizeConstants.reserve(op.getUpperBound().size());
   for (size_t i = 0, end = op.getUpperBound().size(); i != end; ++i) {
     if (i < tileSizes.size())
       tileSizeConstants.push_back(
           arith::ConstantIndexOp::create(b, op.getLoc(), tileSizes[i]));
     else
       // Just pick 1 for the remaining dimensions.
       tileSizeConstants.push_back(
           arith::ConstantIndexOp::create(b, op.getLoc(), 1));
   }

   // Create the outer loop with adjusted steps (original step * tile size).
   SmallVector<Value, 2> newSteps;
   newSteps.reserve(op.getStep().size());
   for (auto [step, tileSizeConstant] :
        llvm::zip(op.getStep(), tileSizeConstants)) {
     newSteps.push_back(
         arith::MulIOp::create(b, op.getLoc(), step, tileSizeConstant));
   }
   auto outerLoop = ParallelOp::create(b, op.getLoc(), op.getLowerBound(),
                                       op.getUpperBound(), newSteps);
   b.setInsertionPointToStart(outerLoop.getBody());

   // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
   // Operands are (d0, d1, d2) = (tile step, upper bound, outer IV).
   auto minMap = AffineMap::get(
       /*dimCount=*/3, /*symbolCount=*/0,
       {getAffineDimExpr(/*position=*/0, b.getContext()),
        getAffineDimExpr(/*position=*/1, b.getContext()) -
            getAffineDimExpr(/*position=*/2, b.getContext())},
       b.getContext());

   // Create the inner loop with adjusted bounds.
   SmallVector<Value, 2> newBounds;
   newBounds.reserve(op.getUpperBound().size());
   bool needInboundCheck = false;
   for (auto [lowerBound, upperBound, newStep, iv, step, tileSizeConstant] :
        llvm::zip(outerLoop.getLowerBound(), outerLoop.getUpperBound(),
                  outerLoop.getStep(), outerLoop.getInductionVars(),
                  op.getStep(), tileSizeConstants)) {
     // Collect the statically known loop bounds.
     auto lowerBoundConstant =
         lowerBound.getDefiningOp<arith::ConstantIndexOp>();
     auto upperBoundConstant =
         upperBound.getDefiningOp<arith::ConstantIndexOp>();
     auto stepConstant = step.getDefiningOp<arith::ConstantIndexOp>();
     // The tile size constants were created above, so the defining op is
     // always an arith::ConstantIndexOp.
     auto tileSize =
         cast<arith::ConstantIndexOp>(tileSizeConstant.getDefiningOp()).value();
     // If the loop bounds and the loop step are constant and if the number of
     // loop iterations is an integer multiple of the tile size, we use a static
     // bound for the inner loop.
     if (lowerBoundConstant && upperBoundConstant && stepConstant) {
       auto numIterations = llvm::divideCeil(upperBoundConstant.value() -
                                                 lowerBoundConstant.value(),
                                             stepConstant.value());
       if (numIterations % tileSize == 0) {
         newBounds.push_back(newStep);
         continue;
       }
     }

     // For InboundCheck mode, just use the variable outer step and remember
     // that the body has to be guarded.
     if (noMinMaxBounds) {
       newBounds.push_back(newStep);
       needInboundCheck = true;
       continue;
     }

     // Otherwise, we dynamically compute the bound for
     // each iteration of the outer loop.
     newBounds.push_back(
         affine::AffineMinOp::create(b, op.getLoc(), b.getIndexType(), minMap,
                                     ValueRange{newStep, upperBound, iv}));
   }
   auto innerLoop = ParallelOp::create(
       b, op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
       op.getStep());

   if (noMinMaxBounds && needInboundCheck) {
     b.setInsertionPointToStart(innerLoop.getBody());
     // Insert in-bound check, starting from a constant `true` of type i1.
     Value inbound =
         arith::ConstantIntOp::create(b, op.getLoc(), b.getIntegerType(1), 1);
     for (auto [outerUpperBound, outerIV, innerIV, innerStep] :
          llvm::zip(outerLoop.getUpperBound(), outerLoop.getInductionVars(),
                    innerLoop.getInductionVars(), innerLoop.getStep())) {
       // %in_bound = %in_bound &&
       //             (%inner_iv * %inner_step + %outer_iv < %outer_upper_bound)
       //
       // NOTE(review): the inner loop is created with step `op.getStep()`, so
       // %inner_iv already advances by %inner_step, and the body below uses
       // %inner_iv + %outer_iv as index. Multiplying by %inner_step again
       // looks over-conservative for steps other than 1 -- confirm intent.
       Value scaledInnerIV =
           arith::MulIOp::create(b, op.getLoc(), innerIV, innerStep);
       Value index =
           arith::AddIOp::create(b, op.getLoc(), scaledInnerIV, outerIV);
       Value dimInbound = arith::CmpIOp::create(
           b, op.getLoc(), arith::CmpIPredicate::ult, index, outerUpperBound);
       inbound = arith::AndIOp::create(b, op.getLoc(), inbound, dimInbound);
     }
     auto ifInbound = IfOp::create(b, op.getLoc(),
                                   /*resultTypes*/ ArrayRef<Type>{}, inbound,
                                   /*hasElseRegion*/ false);
     // Move the original loop body into the `then` region of the guard.
     ifInbound.getThenRegion().takeBody(op.getRegion());
     Block &thenBlock = ifInbound.getThenRegion().front();
     // Replace the scf.reduce terminator with an scf.yield terminator.
     Operation *reduceOp = thenBlock.getTerminator();
     b.setInsertionPointToEnd(&thenBlock);
     scf::YieldOp::create(b, reduceOp->getLoc());
     reduceOp->erase();
     // Rewire the uses of the old induction variables (still block arguments
     // of the moved body) to inner IV + outer IV, then drop the arguments.
     b.setInsertionPointToStart(innerLoop.getBody());
     for (const auto &ivs : llvm::enumerate(llvm::zip(
              innerLoop.getInductionVars(), outerLoop.getInductionVars()))) {
       auto newIndex = arith::AddIOp::create(
           b, op.getLoc(), std::get<0>(ivs.value()), std::get<1>(ivs.value()));
       thenBlock.getArgument(ivs.index())
           .replaceAllUsesExcept(newIndex, newIndex);
     }
     thenBlock.eraseArguments(0, thenBlock.getNumArguments());
   } else {
     // No guard needed: the original body becomes the inner loop body and its
     // induction variables are offset by the outer induction variables.
     innerLoop.getRegion().takeBody(op.getRegion());
     b.setInsertionPointToStart(innerLoop.getBody());
     for (auto ivs : llvm::zip(innerLoop.getInductionVars(),
                               outerLoop.getInductionVars())) {
       Value innerIndex = std::get<0>(ivs);
       auto newIndex = arith::AddIOp::create(b, op.getLoc(), std::get<0>(ivs),
                                             std::get<1>(ivs));
       // Exclude the add itself so it keeps reading the raw inner IV.
       innerIndex.replaceAllUsesExcept(newIndex, newIndex);
     }
   }

   op.erase();
   return std::make_pair(outerLoop, innerLoop);
 }


 namespace {
 /// Pass that tiles the innermost scf.parallel loops found under the operation
 /// it runs on. Loops carrying reductions are left untouched.
 struct ParallelLoopTiling
     : public impl::SCFParallelLoopTilingBase<ParallelLoopTiling> {
   ParallelLoopTiling() = default;

   /// Build the pass with explicit option values instead of the defaults.
   explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes,
                               bool noMinMaxBounds = false) {
     this->tileSizes = tileSizes;
     this->noMinMaxBounds = noMinMaxBounds;
   }

   /// Validate the tile sizes, then tile every eligible innermost loop.
   void runOnOperation() override {
     // View the option list as a plain array for validation and tiling.
     ArrayRef<int64_t> requestedSizes = tileSizes;
     const bool hasZeroSize =
         llvm::any_of(requestedSizes, [](int64_t size) { return size == 0; });
     if (hasZeroSize) {
       mlir::emitError(mlir::UnknownLoc::get(&Pass::getContext()),
                       "tile size cannot be 0");
       signalPassFailure();
       return;
     }

     SmallVector<ParallelOp, 2> candidates;
     getInnermostParallelLoops(getOperation(), candidates);
     for (ParallelOp loop : candidates) {
       // FIXME: Add reduction support.
       if (loop.getNumReductions() != 0)
         continue;
       tileParallelLoop(loop, requestedSizes, noMinMaxBounds);
     }
   }
 };
 } // namespace


 /// Factory for the parallel loop tiling pass, configured with the given tile
 /// sizes and bound-handling mode.
 std::unique_ptr<Pass>
 mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes,
                                    bool noMinMaxBounds) {
   std::unique_ptr<Pass> tilingPass =
       std::make_unique<ParallelLoopTiling>(tileSizes, noMinMaxBounds);
   return tilingPass;
 }

AffineOps.h

Passes.h

Utils.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SmallVector
Definition: LLVM.h:72

mlir::AffineMap::get
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Definition: MLIRContext.cpp:1203

mlir::Block
Block represents an ordered list of Operations.
Definition: Block.h:33

mlir::Block::getArgument
BlockArgument getArgument(unsigned i)
Definition: Block.h:129

mlir::Block::getNumArguments
unsigned getNumArguments()
Definition: Block.h:128

mlir::Block::getTerminator
Operation * getTerminator()
Get the terminator operation of this block.
Definition: Block.cpp:244

mlir::Block::eraseArguments
void eraseArguments(unsigned start, unsigned num)
Erases 'num' arguments from the index 'start'.
Definition: Block.cpp:201

mlir::Block::front
Operation & front()
Definition: Block.h:153

mlir::Builder::getIntegerType
IntegerType getIntegerType(unsigned width)
Definition: Builders.cpp:66

mlir::Builder::getContext
MLIRContext * getContext() const
Definition: Builders.h:55

mlir::Builder::getIndexType
IndexType getIndexType()
Definition: Builders.cpp:50

mlir::OpBuilder
This class helps build Operations.
Definition: Builders.h:205

mlir::OpBuilder::setInsertionPointToStart
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:429

mlir::OpBuilder::setInsertionPointToEnd
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:434

mlir::Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

mlir::Operation::getLoc
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223

mlir::Operation::erase
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:538

mlir::Pass::getContext
MLIRContext & getContext()
Return the MLIR context for the current operation being transformed.
Definition: Pass.h:177

mlir::ValueRange
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::Value::replaceAllUsesExcept
void replaceAllUsesExcept(Value newValue, const SmallPtrSetImpl< Operation * > &exceptions)
Replace all uses of 'this' value with 'newValue', updating anything in the IR that uses 'this' to use...
Definition: Value.cpp:71

mlir::arith::ConstantIndexOp::create
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359

mlir::arith::ConstantIntOp::create
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Definition: ArithOps.cpp:258

Arith.h

SCF.h

Transforms.h

mlir::detail::enumerate
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344

mlir::detail::divideCeil
llvm::TypeSize divideCeil(llvm::TypeSize numerator, uint64_t denominator)
Divides the known min value of the numerator by the denominator and rounds the result up to the next ...
Definition: DataLayoutInterfaces.cpp:468

mlir::scf
Definition: SCFToGPU.h:24

mlir::scf::tileParallelLoop
std::pair< ParallelOp, ParallelOp > tileParallelLoop(ParallelOp op, llvm::ArrayRef< int64_t > tileSizes, bool noMinMaxBounds)
Tile a parallel loop of the form scf.parallel (i0, i1) = (arg0, arg1) to (arg2, arg3) step (arg4,...

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::createParallelLoopTilingPass
std::unique_ptr< Pass > createParallelLoopTilingPass(llvm::ArrayRef< int64_t > tileSize={}, bool noMinMaxBounds=false)
Creates a pass which tiles innermost parallel loops.

mlir::emitError
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
Definition: Diagnostics.cpp:328

mlir::getInnermostParallelLoops
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
Definition: Utils.cpp:240

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::getAffineDimExpr
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
Definition: AffineExpr.cpp:619