#include "llvm/Support/DebugLog.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;
// Extract a Value from a gpu::KernelDim3 triple given its position.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Return the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Return the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return arith::ConstantIndexOp::create(builder, forOp.getLoc(),
                                        forOp.getStepAsInt());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound, materializing instructions if needed.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}
// Check the structural invariants: the bounds of each mapped loop may not
// depend on other loops of the band, and every loop except the innermost one
// must be perfectly nested.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect
    // nesting check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LDBG() << "nothing to map";
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}
namespace {
// Helper structure that holds the common state of the loop-to-GPU conversion
// and is shared between the bound-collection and launch-construction phases.
struct AffineLoopToGpuConverter {
  std::optional<AffineForOp> collectBounds(AffineForOp forOp,
                                           unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges, lower bounds, induction variables, and steps of the loops being
  // mapped, outermost first.
  SmallVector<Value, 6> dims;
  SmallVector<Value, 6> lbs;
  SmallVector<Value, 6> ivs;
  SmallVector<Value, 6> steps;
};
} // namespace
// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed.
// Return the innermost loop to be mapped on success, std::nullopt on failure.
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = arith::SubIOp::create(builder, currentLoop.getLoc(),
                                        upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range = arith::CeilDivSIOp::create(builder, currentLoop.getLoc(), range,
                                         step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
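Concretely, for a loop such as affine.for %i = 0 to %n step 4, the emitted range arithmetic amounts to the sketch below (constant materialization assumed to come from getOrEmitLowerBound/getOrCreateStep); the ceildivsi is skipped when the step is 1:

func.func @range(%n: index) -> index {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  // range = (ub - lb) ceildiv step
  %diff = arith.subi %n, %c0 : index
  %range = arith.ceildivsi %diff, %c4 : index
  return %range : index
}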
// Replace the loop nest rooted at "rootForOp" with a gpu.launch operation.
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? arith::ConstantIndexOp::create(builder, rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop into
  // the launch op.
  auto launchOp =
      gpu::LaunchOp::create(builder, rootForOp.getLoc(), gridSizeX, gridSizeY,
                            gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the
  // gpu launch body block. Do not move the entire block because of the
  // difference in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  gpu::TerminatorOp::create(builder, terminatorLoc, TypeRange());
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop induction variables to use block/thread identifiers.
  // Loops may iterate from LB with step S whereas GPU thread/block ids
  // always iterate from 0 to N with step 1, so each induction variable is
  // replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = arith::MulIOp::create(builder, rootForOp.getLoc(), step, id);

    Value ivReplacement =
        arith::AddIOp::create(builder, rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}
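The net effect on a band mapped to block x and thread x is sketched below (hypothetical names; the original outer loop is assumed to have step 2, hence the multiplication). Each induction variable is rebuilt as lowerBound + id * step inside the launch body:

func.func @after(%lb: index, %range0: index, %range1: index) {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %range0, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %range1, %sy = %c1, %sz = %c1) {
    // Outer iv: %i = %lb + %bx * 2; the inner iv is rebuilt from %tx the
    // same way. The original loop body follows, now using %i.
    %scaled = arith.muli %c2, %bx : index
    %i = arith.addi %lb, %scaled : index
    gpu.terminator
  }
  return
}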
// Generic loop-nest to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                            numThreadDims);
}
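This entry point is exercised through mlir-opt's convert-affine-for-to-gpu pass; a minimal lit-style invocation on a two-deep nest (function body hypothetical, CHECK lines omitted):

// RUN: mlir-opt --convert-affine-for-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s
func.func @nest(%buf: memref<?xf32>, %n: index) {
  affine.for %i = 0 to %n {
    affine.for %j = 0 to %n {
      %v = memref.load %buf[%j] : memref<?xf32>
      memref.store %v, %buf[%j] : memref<?xf32>
    }
  }
  return
}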
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

namespace {
// Pattern lowering an scf.parallel annotated with processor mappings to a
// gpu.launch operation.
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace
/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return arith::ConstantIndexOp::create(rewriter, minOp.getLoc(),
                                              constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no
        // longer work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return arith::ConstantIndexOp::create(rewriter, multiplyOp.getLoc(),
                                              lhs.value() * rhs.value());
      }
  }

  return {};
}
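A worked example (hypothetical IR of the kind tiling produces): for %ub below, the muli case recurses into both operands, the affine.min case returns its constant result 64, and the folded static bound is 64 * 4 = 256.

func.func @tiled_bound(%n: index) -> index {
  %min = affine.min affine_map<(d0) -> (d0, 64)>(%n)
  %c4 = arith.constant 4 : index
  %ub = arith.muli %min, %c4 : index
  return %ub : index
}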
static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

// Map a processor to the index of the corresponding gpu.launch region
// argument: arguments 0-2 are the block ids, 3-5 the thread ids.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}
/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be processed.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
  if (!mapping || parallelOp.getNumResults() > 1)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return arith::ConstantOp::create(rewriter, constOp.getLoc(),
                                       constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // induction variable.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound using an affine map so
      // that it composes nicely with the provided annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = AffineApplyOp::create(
          rewriter, loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, ensureLaunchIndependent(step),
                     ensureLaunchIndependent(lowerBound)});
      // If there was also a bound, insert that, too.
      if (annotation.getBound()) {
        // The bound is only computable if the lower bound and step are
        // constant or defined outside of the launch.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, we
        // can use it in the launch bounds directly. Otherwise try to derive
        // a static approximation.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of"
                  " iterations");
            }
          }
          // Compute the number of iterations as
          // (upperBound - lowerBound) ceilDiv step, again via an affine map
          // so it composes with the annotation's bound map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) -
                rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = AffineApplyOp::create(
              rewriter, loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          if (!bounds.try_emplace(processor, launchBound).second) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = arith::CmpIOp::create(
              rewriter, loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = scf::IfOp::create(rewriter, loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the bodies instruction.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = scf::ForOp::create(rewriter, loc,
                                       cloningMap.lookupOrDefault(lowerBound),
                                       cloningMap.lookupOrDefault(upperBound),
                                       cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  // Include the scf.reduce terminator if it exists and has an operand.
  if (auto terminator = body->getTerminator();
      isa<scf::ReduceOp>(terminator) &&
      terminator->getOperands().size() == 1) {
    worklist.push_back(terminator);
  }
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}
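A representative input for this function, in the style of the pass's tests, is an scf.parallel annotated by the greedy parallel-loop mapper. Each dimension carries a #gpu.loop_dim_map; a dimension whose derived bound is imprecise additionally gets its body wrapped in an scf.if that compares the rescaled hardware id against the real upper bound.

func.func @mapped(%buf: memref<?x?xf32>, %v: f32, %n: index, %m: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  scf.parallel (%i, %j) = (%c0, %c0) to (%n, %m) step (%c1, %c1) {
    memref.store %v, %buf[%i, %j] : memref<?x?xf32>
    scf.reduce
  } {mapping = [
    #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0),
                      bound = (d0) -> (d0)>,
    #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0),
                      bound = (d0) -> (d0)>
  ]}
  return
}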
// Lower an scf.parallel operation into a corresponding gpu.launch operation
// by cloning its body into the launch region and substituting induction
// variables with hardware ids.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for the recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outermost loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      arith::ConstantIndexOp::create(rewriter, parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = gpu::LaunchOp::create(
      rewriter, parallelOp.getLoc(), constantOne, constantOne, constantOne,
      constantOne, constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  gpu::TerminatorOp::create(rewriter, loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  IRMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    // nested or this code is side-effect free. Otherwise we might need
    // predication. We are overly conservative for now and only allow
    // side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
      // Convert the scf.reduce op.
      auto parentLoop = op->getParentOfType<ParallelOp>();
      if (!parentLoop || op->getOperands().size() != 1)
        return failure();
      auto operand = op->getOperands().front();
      auto newValue = cloningMap.lookupOrNull(operand);
      if (!newValue || !operand.getType().isSignlessIntOrFloat())
        return failure();
      // Ensure the reduction region is isolated from above.
      llvm::SetVector<Value> externalValues;
      getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
      if (externalValues.size())
        return failure();
      // Replace by gpu.all_reduce.
      auto gpuRedOp = gpu::AllReduceOp::create(rewriter, loc, newValue);
      cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
      // Copy the reduction region over.
      rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
                                  gpuRedOp.getRegion().begin());
      // Replace scf.reduce.return with gpu.yield.
      auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
      auto ip = rewriter.saveInsertionPoint();
      rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
      rewriter.replaceOpWithNewOp<gpu::YieldOp>(
          scfReturn, scfReturn->getOperands().front());
      rewriter.restoreInsertionPoint(ip);
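      // Net effect of this branch (a sketch with hypothetical names): a
      // single-operand reduction
      //   scf.reduce(%val : f32) {
      //   ^bb0(%lhs: f32, %rhs: f32):
      //     %s = arith.addf %lhs, %rhs : f32
      //     scf.reduce.return %s : f32
      //   }
      // becomes an all-reduce over the cloned value, with the region inlined
      // unchanged except for the terminator:
      //   %red = gpu.all_reduce %newVal {
      //   ^bb0(%lhs: f32, %rhs: f32):
      //     %s = arith.addf %lhs, %rhs : f32
      //     "gpu.yield"(%s) : (f32) -> ()
      //   } : (f32) -> (f32)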
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<ParallelOp>([](ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
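These three entry points are what the ConvertParallelLoopToGpu pass wires together: configure the legality of scf.parallel, apply the pattern via a partial conversion, then strip the visited markers. From the command line the whole flow is driven by a single pass; a minimal lit-style test (function contents hypothetical, CHECK lines omitted):

// RUN: mlir-opt --convert-parallel-loops-to-gpu %s
func.func @scale(%x: memref<?xf32>, %a: f32, %n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  scf.parallel (%i) = (%c0) to (%n) step (%c1) {
    %v = memref.load %x[%i] : memref<?xf32>
    %r = arith.mulf %a, %v : f32
    memref.store %r, %x[%i] : memref<?xf32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0),
                                  bound = (d0) -> (d0)>]}
  return
}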