22#define GEN_PASS_DEF_SCFPARALLELLOOPTILING
23#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
// Tiles a parallel loop: builds an outer ParallelOp and an inner ParallelOp
// from `op` and returns them as the pair (outerLoop, innerLoop).
// NOTE(review): this listing is an excerpt. The embedded source line numbers
// skip (57, 59, 63, ...), so the function-name line and several statements are
// not visible here; the comments below describe only what is shown.
57std::pair<ParallelOp, ParallelOp>
59 bool noMinMaxBounds) {
// One tile-size value is collected per upper bound of `op`. Dimensions that
// have an entry in `tileSizes` take the first push_back; presumably the
// remaining dimensions reach the second one (the pushed expressions and the
// branch keyword are cut off in this listing).
63 tileSizeConstants.reserve(op.getUpperBound().size());
64 for (
size_t i = 0, end = op.getUpperBound().size(); i != end; ++i) {
65 if (i < tileSizes.size())
66 tileSizeConstants.push_back(
70 tileSizeConstants.push_back(
// Steps for the outer loop: one arith::MulIOp per (original step, tile-size
// constant) pair. The first operand is the original step; the second operand
// is cut off here (presumably the paired tile-size constant).
76 newSteps.reserve(op.getStep().size());
77 for (
auto step : llvm::zip(op.getStep(), tileSizeConstants)) {
78 newSteps.push_back(arith::MulIOp::create(
b, op.getLoc(), std::get<0>(step),
// The outer loop keeps op's lower and upper bounds and iterates with newSteps.
81 auto outerLoop = ParallelOp::create(
b, op.getLoc(), op.getLowerBound(),
82 op.getUpperBound(), newSteps);
// Everything created from here on is inserted at the start of the outer body.
83 b.setInsertionPointToStart(outerLoop.getBody());
// newBounds collects one inner-loop bound per dimension of the original loop.
95 newBounds.reserve(op.getUpperBound().size());
// Set to true below when, under noMinMaxBounds, newStep is used as the inner
// bound; it later decides whether the moved body gets an in-bounds guard.
96 bool needInboundCheck =
false;
// Walk all dimensions in lock-step: outer bounds/step/IV, original step, and
// the tile-size constant.
97 for (
auto [lowerBound, upperBound, newStep, iv, step, tileSizeConstant] :
98 llvm::zip(outerLoop.getLowerBound(), outerLoop.getUpperBound(),
99 outerLoop.getStep(), outerLoop.getInductionVars(),
100 op.getStep(), tileSizeConstants)) {
// The initializers of the bound/step constants are cut off in this listing.
102 auto lowerBoundConstant =
104 auto upperBoundConstant =
// The tile size is read back from the ConstantIndexOp that defines
// tileSizeConstant.
108 cast<arith::ConstantIndexOp>(tileSizeConstant.getDefiningOp()).
value();
// When lower bound, upper bound and step constants are all available, the
// iteration count is ceil((upper - lower) / step).
112 if (lowerBoundConstant && upperBoundConstant && stepConstant) {
113 auto numIterations = llvm::divideCeil(upperBoundConstant.
value() -
114 lowerBoundConstant.value(),
115 stepConstant.
value());
// The tile size divides the iteration count evenly, so newStep is used as the
// inner bound.
116 if (numIterations % tileSize == 0) {
117 newBounds.push_back(newStep);
// With noMinMaxBounds, newStep is also used as the inner bound, and the need
// for an in-bounds check is recorded.
123 if (noMinMaxBounds) {
124 newBounds.push_back(newStep);
125 needInboundCheck =
true;
// An affine::AffineMinOp of index type over minMap is created in this loop;
// its remaining operands and the control flow leading to it are not visible
// in this excerpt.
132 affine::AffineMinOp::create(
b, op.getLoc(),
b.getIndexType(), minMap,
// Inner loop creation (arguments cut off in this listing).
135 auto innerLoop = ParallelOp::create(
// Guarded path: taken only when min/max bounds are disabled and at least one
// dimension recorded needInboundCheck.
139 if (noMinMaxBounds && needInboundCheck) {
140 b.setInsertionPointToStart(innerLoop.getBody());
// Per dimension: combine innerIV * innerStep with outerIV (the enclosing op is
// cut off; presumably an add producing `index`), compare it with the outer
// upper bound using unsigned less-than, and AND the result into `inbound`.
144 for (
auto [outerUpperBound, outerIV, innerIV, innerStep] :
145 llvm::zip(outerLoop.getUpperBound(), outerLoop.getInductionVars(),
146 innerLoop.getInductionVars(), innerLoop.getStep())) {
151 arith::MulIOp::create(
b, op.getLoc(), innerIV, innerStep), outerIV);
152 Value dimInbound = arith::CmpIOp::create(
153 b, op.getLoc(), arith::CmpIPredicate::ult,
index, outerUpperBound);
154 inbound = arith::AndIOp::create(
b, op.getLoc(), inbound, dimInbound);
// An IfOp is created (operands cut off) and the original loop body is moved
// into its then-region.
156 auto ifInbound = IfOp::create(
b, op.getLoc(),
159 ifInbound.getThenRegion().takeBody(op.getRegion());
160 Block &thenBlock = ifInbound.getThenRegion().
front();
// An scf::YieldOp is created at the end of the then-block, using the location
// of reduceOp (whose definition is not visible in this excerpt).
163 b.setInsertionPointToEnd(&thenBlock);
164 scf::YieldOp::create(
b, reduceOp->
getLoc());
// For each dimension, newIndex = inner IV + outer IV, created at the start of
// the inner loop body. How newIndex is consumed is cut off in this listing.
166 b.setInsertionPointToStart(innerLoop.getBody());
167 for (
const auto &ivs : llvm::enumerate(llvm::zip(
168 innerLoop.getInductionVars(), outerLoop.getInductionVars()))) {
169 auto newIndex = arith::AddIOp::create(
170 b, op.getLoc(), std::get<0>(ivs.value()), std::get<1>(ivs.value()));
// Here the original loop body is moved directly into the inner loop's region
// (presumably the unguarded alternative to the branch above; the branch
// keyword is not visible).
176 innerLoop.getRegion().takeBody(op.getRegion());
177 b.setInsertionPointToStart(innerLoop.getBody());
// For each (inner IV, outer IV) pair an arith::AddIOp is created whose first
// operand is the inner IV; the second operand is cut off in this listing.
178 for (
auto ivs : llvm::zip(innerLoop.getInductionVars(),
179 outerLoop.getInductionVars())) {
180 Value innerIndex = std::get<0>(ivs);
181 auto newIndex = arith::AddIOp::create(
b, op.getLoc(), std::get<0>(ivs),
// Hand both new loops back to the caller, outer first.
188 return std::make_pair(outerLoop, innerLoop);
// Pass object constructed by the factory that follows this struct, from a
// list of tile sizes and the noMinMaxBounds flag.
// NOTE(review): this listing is an excerpt; the base class, the closing braces
// and several statements of this struct are not visible here.
192struct ParallelLoopTiling
194 ParallelLoopTiling() =
default;
// Stores the given tile sizes and noMinMaxBounds flag into the members of the
// same names. noMinMaxBounds defaults to false.
195 explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes,
196 bool noMinMaxBounds =
false) {
197 this->tileSizes = tileSizes;
198 this->noMinMaxBounds = noMinMaxBounds;
// Pass entry point.
201 void runOnOperation()
override {
// Tile sizes are validated first: on failure the pass reports
// "tile size cannot be 0" and signals pass failure. The failing condition
// itself is cut off in this listing; the message indicates a zero tile size.
202 for (
auto tileSize : tileSizes)
205 "tile size cannot be 0");
206 return signalPassFailure();
// parentOp is the operation the pass runs on. innermostPloops is the work
// list; the statement that fills it is not visible here (presumably
// getInnermostParallelLoops on parentOp — confirm against the full source).
208 auto *parentOp = getOperation();
209 SmallVector<ParallelOp, 2> innermostPloops;
// Each collected loop is visited; the visible check selects loops with no
// reductions. The action taken for them is cut off in this listing.
211 for (ParallelOp ploop : innermostPloops) {
213 if (ploop.getNumReductions() == 0)
// Tail of the pass factory: returns a newly allocated ParallelLoopTiling
// constructed from tileSizes and noMinMaxBounds.
// NOTE(review): the line carrying the function name and first parameter is
// missing from this listing.
222 bool noMinMaxBounds) {
223 return std::make_unique<ParallelLoopTiling>(tileSizes, noMinMaxBounds);
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Block represents an ordered list of Operations.
BlockArgument getArgument(unsigned i)
unsigned getNumArguments()
Operation * getTerminator()
Get the terminator operation of this block.
void eraseArguments(unsigned start, unsigned num)
Erases 'num' arguments from the index 'start'.
This class helps build Operations.
Operation is the basic unit of execution within MLIR.
Location getLoc()
The source location the operation was defined or derived from.
void erase()
Remove this operation from its parent block and delete it.
MLIRContext & getContext()
Return the MLIR context for the current operation being transformed.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
void replaceAllUsesExcept(Value newValue, const SmallPtrSetImpl< Operation * > &exceptions)
Replace all uses of 'this' value with 'newValue', updating anything in the IR that uses 'this' to use...
Specialization of arith.constant op that returns an integer of index type.
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
std::pair< ParallelOp, ParallelOp > tileParallelLoop(ParallelOp op, llvm::ArrayRef< int64_t > tileSizes, bool noMinMaxBounds)
Tile a parallel loop of the form scf.parallel (i0, i1) = (arg0, arg1) to (arg2, arg3) step (arg4,...
Include the generated interface declarations.
std::unique_ptr< Pass > createParallelLoopTilingPass(llvm::ArrayRef< int64_t > tileSize={}, bool noMinMaxBounds=false)
Creates a pass which tiles innermost parallel loops.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
bool getInnermostParallelLoops(Operation *rootOp, SmallVectorImpl< scf::ParallelOp > &result)
Get a list of innermost parallel loops contained in rootOp.
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.