32 #include "llvm/ADT/Sequence.h"
33 #include "llvm/Support/Debug.h"
36 #define DEBUG_TYPE "loops-to-gpu"
70 llvm_unreachable(
"dim3 position out of bounds");
77 return forOp.getLowerBoundOperands();
82 return forOp.getUpperBoundOperands();
88 return builder.
create<arith::ConstantIndexOp>(forOp.getLoc(),
112 Region &limit = forOp.getRegion();
113 for (
unsigned i = 0, e = numDims; i < e; ++i) {
114 Operation *nested = &forOp.getBody()->front();
118 "loops with bounds depending on other mapped loops "
119 "are not supported");
126 auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
127 if (forOp.getBody()->empty() || std::next(begin, 2) != end)
128 return forOp.emitError(
"expected perfectly nested loops in the body");
130 if (!(forOp = dyn_cast<AffineForOp>(nested)))
131 return nested->
emitError(
"expected a nested loop");
137 unsigned numBlockDims,
138 unsigned numThreadDims) {
139 if (numBlockDims < 1 || numThreadDims < 1) {
140 LLVM_DEBUG(llvm::dbgs() <<
"nothing to map");
144 if (numBlockDims > 3) {
145 return forOp.emitError(
"cannot map to more than 3 block dimensions");
147 if (numThreadDims > 3) {
148 return forOp.emitError(
"cannot map to more than 3 thread dimensions");
156 struct AffineLoopToGpuConverter {
157 std::optional<AffineForOp> collectBounds(AffineForOp forOp,
160 void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
161 unsigned numBlockDims,
unsigned numThreadDims);
179 std::optional<AffineForOp>
180 AffineLoopToGpuConverter::collectBounds(AffineForOp forOp,
unsigned numLoops) {
182 dims.reserve(numLoops);
183 lbs.reserve(numLoops);
184 ivs.reserve(numLoops);
185 steps.reserve(numLoops);
186 AffineForOp currentLoop = forOp;
187 for (
unsigned i = 0; i < numLoops; ++i) {
190 if (!lowerBound || !upperBound) {
194 Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
195 upperBound, lowerBound);
198 range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
199 dims.push_back(range);
201 lbs.push_back(lowerBound);
202 ivs.push_back(currentLoop.getInductionVar());
203 steps.push_back(step);
205 if (i != numLoops - 1)
206 currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
215 void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
216 AffineForOp innermostForOp,
217 unsigned numBlockDims,
218 unsigned numThreadDims) {
219 OpBuilder builder(rootForOp.getOperation());
223 (numBlockDims < 3 || numThreadDims < 3)
224 ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
226 Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
227 Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
228 Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
229 Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
230 Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
231 Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
235 auto launchOp = builder.create<gpu::LaunchOp>(
236 rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
237 blockSizeY, blockSizeZ);
243 Operation &terminator = innermostForOp.getBody()->back();
244 Location terminatorLoc = terminator.getLoc();
246 builder.setInsertionPointToEnd(innermostForOp.getBody());
247 builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
248 launchOp.getBody().front().getOperations().splice(
249 launchOp.getBody().front().begin(),
250 innermostForOp.getBody()->getOperations());
256 builder.setInsertionPointToStart(&launchOp.getBody().front());
257 auto *lbArgumentIt = lbs.begin();
258 auto *stepArgumentIt = steps.begin();
261 en.index() < numBlockDims
263 :
getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
264 Value step = steps[en.index()];
266 id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);
268 Value ivReplacement =
269 builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
271 std::advance(lbArgumentIt, 1);
272 std::advance(stepArgumentIt, 1);
281 unsigned numBlockDims,
282 unsigned numThreadDims) {
286 AffineLoopToGpuConverter converter;
287 auto maybeInnerLoop =
288 converter.collectBounds(forOp, numBlockDims + numThreadDims);
291 converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);
297 unsigned numBlockDims,
298 unsigned numThreadDims) {
315 if (
auto op = upperBound.
getDefiningOp<arith::ConstantIndexOp>()) {
320 for (
const AffineExpr &result : minOp.getMap().getResults()) {
322 return rewriter.
create<arith::ConstantIndexOp>(minOp.getLoc(),
323 constExpr.getValue());
328 if (
auto minOp = upperBound.
getDefiningOp<arith::MinSIOp>()) {
329 for (
Value operand : {minOp.getLhs(), minOp.getRhs()}) {
335 if (
auto multiplyOp = upperBound.
getDefiningOp<arith::MulIOp>()) {
336 if (
auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
339 if (
auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
344 if ((lhs.value() < 0) != (rhs.value() < 0))
347 return rewriter.
create<arith::ConstantIndexOp>(
348 multiplyOp.getLoc(), lhs.value() * rhs.value());
356 return processor != gpu::Processor::Sequential;
361 case gpu::Processor::BlockX:
363 case gpu::Processor::BlockY:
365 case gpu::Processor::BlockZ:
367 case gpu::Processor::ThreadX:
369 case gpu::Processor::ThreadY:
371 case gpu::Processor::ThreadZ:
376 "invalid processor type while retrieving launch op argument number");
402 ParallelOp parallelOp, gpu::LaunchOp launchOp,
IRMapping &cloningMap,
411 if (!mapping || parallelOp.getNumResults() != 0)
416 auto launchIndependent = [&launchOp](
Value val) {
417 return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
420 auto ensureLaunchIndependent = [&rewriter,
422 if (launchIndependent(val))
424 if (
auto constOp = val.getDefiningOp<arith::ConstantOp>())
425 return rewriter.
create<arith::ConstantOp>(constOp.getLoc(),
430 for (
auto config : llvm::zip(
431 mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
432 parallelOp.getUpperBound(), parallelOp.getStep())) {
434 Value iv, lowerBound, upperBound, step;
435 std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
437 dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
439 return parallelOp.emitOpError()
440 <<
"expected mapping attribute for lowering to GPU";
442 gpu::Processor processor = annotation.getProcessor();
456 newIndex = rewriter.
create<AffineApplyOp>(
457 loc, annotation.getMap().compose(lowerAndStep),
461 if (annotation.getBound()) {
469 if (!launchIndependent(lowerBound) &&
470 !isa_and_nonnull<arith::ConstantOp>(lowerBound.
getDefiningOp()))
473 if (!launchIndependent(step) &&
478 bool boundIsPrecise =
479 launchIndependent(upperBound) ||
480 isa_and_nonnull<arith::ConstantOp>(upperBound.
getDefiningOp());
482 PatternRewriter::InsertionGuard guard(rewriter);
484 if (!boundIsPrecise) {
489 "cannot derive loop-invariant upper bound for number of"
500 Value launchBound = rewriter.
create<AffineApplyOp>(
501 loc, annotation.getBound().compose(stepMap),
503 ensureLaunchIndependent(
505 ensureLaunchIndependent(
510 if (bounds.contains(processor)) {
512 parallelOp,
"cannot redefine the bound for processor " +
513 Twine(
static_cast<int64_t
>(processor)));
515 bounds[processor] = launchBound;
517 if (!boundIsPrecise) {
519 Value originalBound = std::get<3>(config);
520 arith::CmpIOp pred = rewriter.
create<arith::CmpIOp>(
521 loc, arith::CmpIPredicate::slt, newIndex,
523 scf::IfOp ifOp = rewriter.
create<scf::IfOp>(loc, pred,
false);
528 worklist.push_back(launchOp.getOperation());
533 auto loopOp = rewriter.
create<scf::ForOp>(
537 newIndex = loopOp.getInductionVar();
542 worklist.push_back(launchOp.getOperation());
544 cloningMap.
map(iv, newIndex);
549 for (
const auto &namedAttr : parallelOp->getAttrs()) {
551 namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
553 launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
556 Block *body = parallelOp.getBody();
557 worklist.reserve(worklist.size() + body->
getOperations().size());
559 worklist.push_back(&op);
593 ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
600 if (
auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
606 rewriter.
create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
607 gpu::LaunchOp launchOp = rewriter.
create<gpu::LaunchOp>(
611 rewriter.
create<gpu::TerminatorOp>(loc);
618 launchBounds, rewriter)))
622 bool seenSideeffects =
false;
624 bool leftNestingScope =
false;
625 while (!worklist.empty()) {
632 if (
auto nestedParallel = dyn_cast<ParallelOp>(op)) {
641 worklist, launchBounds, rewriter)))
643 }
else if (op == launchOp.getOperation()) {
648 leftNestingScope =
true;
649 seenSideeffects =
false;
659 if (seenSideeffects && leftNestingScope)
666 for (
auto bound : launchBounds)
675 patterns.
add<ParallelToGpuLaunchLowering>(patterns.
getContext());
687 op->
walk([](scf::ParallelOp parallelOp) {
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, unsigned numDims)
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder)
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos)
static LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap, SmallVectorImpl< Operation * > &worklist, DenseMap< gpu::Processor, Value > &bounds, PatternRewriter &rewriter)
Modifies the current transformation state to capture the effect of the given scf.parallel operation o...
static bool isMappedToProcessor(gpu::Processor processor)
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp)
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder)
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder)
static Value deriveStaticUpperBound(Value upperBound, PatternRewriter &rewriter)
Tries to derive a static upper bound from the defining operation of upperBound.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor)
static constexpr StringLiteral kVisitedAttrName
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp)
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
An integer constant appearing in an affine expression.
Base type for affine expression.
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
OpListType & getOperations()
iterator_range< iterator > without_terminator()
Return an iterator range over the operation within this block excluding the terminator operation at t...
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineDimExpr(unsigned position)
This class describes a specific conversion target.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
This is a utility class for mapping one set of IR entities to another.
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
unsigned getNumRegions()
Returns the number of regions held by this operation.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
result_range getResults()
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
This class contains a list of basic blocks and a link to the parent operation it is attached to.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the rewriter that the IR failed to be rewritten because of a match failure,...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
void replaceAllUsesWith(Value newValue) const
Replace all uses of 'this' value with the new value, updating anything in the IR that uses 'this' to ...
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
StringRef getMappingAttrName()
Name of the mapping attribute produced by loop mappers.
Value constantOne(OpBuilder &builder, Location loc, Type tp)
Generates a 1-valued constant of the given type.
This header declares functions that assist transformations in the MemRef dialect.
void finalizeParallelLoopToGPUConversion(Operation *op)
Clean up after applyPartialConversion/applyFullConversion call.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
void populateParallelLoopToGPUPatterns(RewritePatternSet &patterns)
Adds the conversion pattern from scf.parallel to gpu.launch to the provided pattern list.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult convertAffineLoopNestToGPULaunch(affine::AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
Convert a perfect affine loop nest with the outermost loop identified by forOp into a gpu::Launch ope...
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
Value lowerAffineUpperBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the upper bound of the given affine loop using standard arithmetic operations...
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
void configureParallelLoopToGPULegality(ConversionTarget &target)
Configures the rewrite target such that only scf.parallel operations that are not rewritten by the pr...
Value lowerAffineLowerBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the lower bound of the given affine loop using standard arithmetic operations...
This class represents an efficient way to signal success or failure.
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Utility class for the GPU dialect to represent triples of Values accessible through ....