31#include "llvm/ADT/DenseSet.h"
32#include "llvm/Support/DebugLog.h"
35#define DEBUG_TYPE "loops-to-gpu"
69 llvm_unreachable(
"dim3 position out of bounds");
76 return forOp.getLowerBoundOperands();
81 return forOp.getUpperBoundOperands();
88 forOp.getStepAsInt());
111 Region &limit = forOp.getRegion();
112 for (
unsigned i = 0, e = numDims; i < e; ++i) {
113 Operation *nested = &forOp.getBody()->front();
116 return forOp.emitError(
117 "loops with bounds depending on other mapped loops "
118 "are not supported");
125 auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
126 if (forOp.getBody()->empty() || std::next(begin, 2) != end)
127 return forOp.emitError(
"expected perfectly nested loops in the body");
129 if (!(forOp = dyn_cast<AffineForOp>(nested)))
130 return nested->
emitError(
"expected a nested loop");
136 unsigned numBlockDims,
137 unsigned numThreadDims) {
138 if (numBlockDims < 1 || numThreadDims < 1) {
139 LDBG() <<
"nothing to map";
143 if (numBlockDims > 3) {
144 return forOp.emitError(
"cannot map to more than 3 block dimensions");
146 if (numThreadDims > 3) {
147 return forOp.emitError(
"cannot map to more than 3 thread dimensions");
155struct AffineLoopToGpuConverter {
156 std::optional<AffineForOp> collectBounds(AffineForOp forOp,
159 void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
160 unsigned numBlockDims,
unsigned numThreadDims);
163 SmallVector<Value, 6> dims;
165 SmallVector<Value, 6> lbs;
167 SmallVector<Value, 6> ivs;
169 SmallVector<Value, 6> steps;
178std::optional<AffineForOp>
179AffineLoopToGpuConverter::collectBounds(AffineForOp forOp,
unsigned numLoops) {
180 OpBuilder builder(forOp.getOperation());
181 dims.reserve(numLoops);
182 lbs.reserve(numLoops);
183 ivs.reserve(numLoops);
184 steps.reserve(numLoops);
185 AffineForOp currentLoop = forOp;
186 for (
unsigned i = 0; i < numLoops; ++i) {
187 if (currentLoop.getNumIterOperands() > 0) {
188 currentLoop.emitError(
189 "affine loop with iter_args cannot be converted to GPU kernel");
195 if (!lowerBound || !upperBound) {
199 Value range = arith::SubIOp::create(builder, currentLoop.getLoc(),
200 upperBound, lowerBound);
203 range = arith::CeilDivSIOp::create(builder, currentLoop.getLoc(), range,
205 dims.push_back(range);
207 lbs.push_back(lowerBound);
208 ivs.push_back(currentLoop.getInductionVar());
209 steps.push_back(step);
211 if (i != numLoops - 1)
212 currentLoop = cast<AffineForOp>(¤tLoop.getBody()->front());
221void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
222 AffineForOp innermostForOp,
223 unsigned numBlockDims,
224 unsigned numThreadDims) {
225 OpBuilder builder(rootForOp.getOperation());
229 (numBlockDims < 3 || numThreadDims < 3)
232 Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
233 Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
234 Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
235 Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
236 Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
237 Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
242 gpu::LaunchOp::create(builder, rootForOp.getLoc(), gridSizeX, gridSizeY,
243 gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);
249 Operation &terminator = innermostForOp.getBody()->back();
250 Location terminatorLoc = terminator.
getLoc();
252 builder.setInsertionPointToEnd(innermostForOp.getBody());
253 gpu::TerminatorOp::create(builder, terminatorLoc,
TypeRange());
254 launchOp.getBody().front().getOperations().splice(
255 launchOp.getBody().front().begin(),
256 innermostForOp.getBody()->getOperations());
262 builder.setInsertionPointToStart(&launchOp.getBody().front());
263 auto *lbArgumentIt = lbs.begin();
264 auto *stepArgumentIt = steps.begin();
265 for (
const auto &en : llvm::enumerate(ivs)) {
267 en.index() < numBlockDims
269 :
getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
270 Value step = steps[en.index()];
272 id = arith::MulIOp::create(builder, rootForOp.getLoc(), step,
id);
274 Value ivReplacement =
275 arith::AddIOp::create(builder, rootForOp.getLoc(), *lbArgumentIt,
id);
276 en.value().replaceAllUsesWith(ivReplacement);
277 std::advance(lbArgumentIt, 1);
278 std::advance(stepArgumentIt, 1);
287 unsigned numBlockDims,
288 unsigned numThreadDims) {
292 AffineLoopToGpuConverter converter;
293 auto maybeInnerLoop =
294 converter.collectBounds(forOp, numBlockDims + numThreadDims);
297 converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);
303 unsigned numBlockDims,
304 unsigned numThreadDims) {
305 return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
309struct ParallelToGpuLaunchLowering :
public OpRewritePattern<ParallelOp> {
310 using OpRewritePattern<ParallelOp>::OpRewritePattern;
312 LogicalResult matchAndRewrite(ParallelOp parallelOp,
313 PatternRewriter &rewriter)
const override;
327 if (
auto constExpr = dyn_cast<AffineConstantExpr>(
result)) {
329 constExpr.getValue());
334 if (
auto minOp = upperBound.
getDefiningOp<arith::MinSIOp>()) {
335 for (
Value operand : {minOp.getLhs(), minOp.getRhs()}) {
341 if (
auto multiplyOp = upperBound.
getDefiningOp<arith::MulIOp>()) {
348 if ((
lhs.value() < 0) != (
rhs.value() < 0))
352 lhs.value() *
rhs.value());
360 return processor != gpu::Processor::Sequential;
365 case gpu::Processor::BlockX:
367 case gpu::Processor::BlockY:
369 case gpu::Processor::BlockZ:
371 case gpu::Processor::ThreadX:
373 case gpu::Processor::ThreadY:
375 case gpu::Processor::ThreadZ:
380 "invalid processor type while retrieving launch op argument number");
406 ParallelOp parallelOp, gpu::LaunchOp launchOp,
IRMapping &cloningMap,
415 if (!mapping || parallelOp.getNumResults() > 1)
420 auto launchIndependent = [&launchOp](
Value val) {
421 return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
424 auto ensureLaunchIndependent = [&rewriter,
426 if (launchIndependent(val))
434 for (
auto config : llvm::zip(
435 mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
436 parallelOp.getUpperBound(), parallelOp.getStep())) {
438 Value iv, lowerBound, upperBound, step;
439 std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
441 dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
443 return parallelOp.emitOpError()
444 <<
"expected mapping attribute for lowering to GPU";
446 gpu::Processor processor = annotation.getProcessor();
465 mappedStep = ensureLaunchIndependent(mappedStep);
466 mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
469 if (!mappedStep || !mappedLowerBound) {
471 parallelOp,
"lower bound / step must be constant or defined above "
475 newIndex = AffineApplyOp::create(
476 rewriter, loc, annotation.getMap().compose(lowerAndStep),
477 ValueRange{operand, mappedStep, mappedLowerBound});
480 if (annotation.getBound()) {
488 if (!launchIndependent(lowerBound) &&
496 bool boundIsPrecise = launchIndependent(upperBound) ||
501 if (!boundIsPrecise) {
506 "cannot derive loop-invariant upper bound for number of"
517 Value launchBound = AffineApplyOp::create(
518 rewriter, loc, annotation.getBound().compose(stepMap),
520 ensureLaunchIndependent(
521 cloningMap.lookupOrDefault(upperBound)),
522 ensureLaunchIndependent(
523 cloningMap.lookupOrDefault(lowerBound)),
524 ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
527 if (!bounds.try_emplace(processor, launchBound).second) {
529 parallelOp,
"cannot redefine the bound for processor " +
530 Twine(
static_cast<int64_t>(processor)));
533 if (!boundIsPrecise) {
535 Value originalBound = std::get<3>(config);
536 arith::CmpIOp pred = arith::CmpIOp::create(
537 rewriter, loc, arith::CmpIPredicate::slt, newIndex,
539 scf::IfOp ifOp = scf::IfOp::create(rewriter, loc, pred,
false);
544 worklist.push_back(launchOp.getOperation());
549 auto loopOp = scf::ForOp::create(rewriter, loc,
553 newIndex = loopOp.getInductionVar();
558 worklist.push_back(launchOp.getOperation());
560 cloningMap.
map(iv, newIndex);
565 for (
const auto &namedAttr : parallelOp->getAttrs()) {
567 namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
569 launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
572 Block *body = parallelOp.getBody();
573 worklist.reserve(worklist.size() + body->
getOperations().size());
576 isa<scf::ReduceOp>(terminator) && terminator->
getOperands().size() == 1) {
577 worklist.push_back(terminator);
580 worklist.push_back(&op);
614ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
615 PatternRewriter &rewriter)
const {
621 if (
auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
625 Location loc = parallelOp.getLoc();
627 gpu::LaunchOp launchOp =
628 gpu::LaunchOp::create(rewriter, loc, constantOne, constantOne,
629 constantOne, constantOne, constantOne, constantOne);
631 gpu::TerminatorOp::create(rewriter, loc);
634 IRMapping cloningMap;
635 llvm::DenseMap<gpu::Processor, Value> launchBounds;
636 SmallVector<Operation *, 16> worklist;
638 launchBounds, rewriter)))
642 bool seenSideeffects =
false;
644 bool leftNestingScope =
false;
645 LocalAliasAnalysis aliasAnalysis;
646 llvm::DenseSet<Value> writtenBuffer;
647 while (!worklist.empty()) {
648 Operation *op = worklist.pop_back_val();
654 if (
auto nestedParallel = dyn_cast<ParallelOp>(op)) {
658 if (seenSideeffects) {
659 WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) {
663 auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
664 if (!memEffectInterface)
667 SmallVector<MemoryEffects::EffectInstance> effects;
668 memEffectInterface.getEffects(effects);
670 if (isa<MemoryEffects::Read>(effect.getEffect()) ||
671 isa<MemoryEffects::Write>(effect.getEffect())) {
672 Value baseBuffer = effect.getValue();
675 for (Value val : writtenBuffer) {
676 if (aliasAnalysis.
alias(baseBuffer, val) !=
692 worklist, launchBounds, rewriter)))
694 }
else if (op == launchOp.getOperation()) {
699 leftNestingScope =
true;
700 seenSideeffects =
false;
701 writtenBuffer.clear();
702 }
else if (
auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
709 if (!newValue || !operand.getType().isSignlessIntOrFloat())
712 llvm::SetVector<Value> externalValues;
714 if (!externalValues.empty())
717 auto gpuRedOp = gpu::AllReduceOp::create(rewriter, loc, newValue);
718 cloningMap.
map(parentLoop->getResult(0), gpuRedOp.getResult());
721 gpuRedOp.getRegion().begin());
723 auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
727 scfReturn, scfReturn->getOperands().front());
731 Operation *
clone = rewriter.
clone(*op, cloningMap);
736 if (
auto memEffectInterface =
737 dyn_cast<MemoryEffectOpInterface>(
clone)) {
738 SmallVector<MemoryEffects::EffectInstance> effects;
739 memEffectInterface.getEffects(effects);
741 if (isa<MemoryEffects::Write>(effect.getEffect())) {
742 Value writtenBase = effect.getValue();
747 writtenBuffer.insert(writtenBase);
756 if (seenSideeffects && leftNestingScope)
763 for (
auto bound : launchBounds)
772 patterns.
add<ParallelToGpuLaunchLowering>(patterns.
getContext());
776 target.addLegalDialect<memref::MemRefDialect>();
777 target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
784 op->
walk([](scf::ParallelOp parallelOp) {
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, unsigned numDims)
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder)
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos)
static LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap, SmallVectorImpl< Operation * > &worklist, DenseMap< gpu::Processor, Value > &bounds, PatternRewriter &rewriter)
Modifies the current transformation state to capture the effect of the given scf.parallel operation on index substitutions and the operations to be inserted.
static bool isMappedToProcessor(gpu::Processor processor)
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp)
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder)
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder)
static Value deriveStaticUpperBound(Value upperBound, PatternRewriter &rewriter)
Tries to derive a static upper bound from the defining operation of upperBound.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor)
static constexpr StringLiteral kVisitedAttrName
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp)
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
Base type for affine expression.
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
@ NoAlias
The two locations do not alias at all.
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
OpListType & getOperations()
Operation * getTerminator()
Get the terminator operation of this block.
iterator_range< iterator > without_terminator()
Return an iterator range over the operations within this block, excluding the terminator operation at the end.
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineDimExpr(unsigned position)
This is a utility class for mapping one set of IR entities to another.
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
auto lookupOrNull(T from) const
Lookup a mapped value within the map.
AliasResult alias(Value lhs, Value rhs)
Given two values, return their aliasing behavior.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
InsertPoint saveInsertionPoint() const
Return a saved insertion point.
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the operation using the map that is provided (leaving them alone if no entry is present).
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
void restoreInsertionPoint(InsertPoint ip)
Restore the insert point to a previously saved point.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent insertions to go right after it.
Operation is the basic unit of execution within MLIR.
unsigned getNumRegions()
Returns the number of regions held by this operation.
Location getLoc()
The source location the operation was defined or derived from.
OperandRange operand_range
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers that may be listening.
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
operand_range getOperands()
Returns an iterator on the underlying Value's.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getResults()
void erase()
Remove this operation from its parent block and delete it.
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
This class contains a list of basic blocks and a link to the parent operation it is attached to.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure, and to provide a callback that populates a diagnostic with the reason for the failure.
void inlineRegionBefore(Region ®ion, Region &parent, Region::iterator before)
Move the blocks that belong to "region" before the given position in another region "parent".
OpTy replaceOpWithNewOp(Operation *op, Args &&...args)
Replace the results of the given (original) op with a new op that is created without verification (re...
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
Specialization of arith.constant op that returns an integer of index type.
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
SideEffects::EffectInstance< Effect > EffectInstance
StringRef getMappingAttrName()
Name of the mapping attribute produced by loop mappers.
Value constantOne(OpBuilder &builder, Location loc, Type tp)
Generates a 1-valued constant of the given type.
Include the generated interface declarations.
void finalizeParallelLoopToGPUConversion(Operation *op)
Clean up after applyPartialConversion/applyFullConversion call.
void populateParallelLoopToGPUPatterns(RewritePatternSet &patterns)
Adds the conversion pattern from scf.parallel to gpu.launch to the provided pattern list.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult convertAffineLoopNestToGPULaunch(affine::AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
Convert a perfect affine loop nest with the outermost loop identified by forOp into a gpu::Launch operation.
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
Value lowerAffineUpperBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the upper bound of the given affine loop using standard arithmetic operations...
void getUsedValuesDefinedAbove(Region ®ion, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region or its descendants.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
llvm::DenseMap< KeyT, ValueT, KeyInfoT, BucketT > DenseMap
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
void configureParallelLoopToGPULegality(ConversionTarget &target)
Configures the rewrite target such that only scf.parallel operations that are not rewritten by the provided patterns are legal.
Value lowerAffineLowerBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the lower bound of the given affine loop using standard arithmetic operations...
Utility class for the GPU dialect to represent triples of Values accessible through .x, .y, and .z similarly to CUDA notation.