32 #include "llvm/ADT/Sequence.h"
33 #include "llvm/Support/Debug.h"
36 #define DEBUG_TYPE "loops-to-gpu"
70 llvm_unreachable(
"dim3 position out of bounds");
77 return forOp.getLowerBoundOperands();
82 return forOp.getUpperBoundOperands();
88 return builder.
create<arith::ConstantIndexOp>(forOp.getLoc(),
89 forOp.getStepAsInt());
112 Region &limit = forOp.getRegion();
113 for (
unsigned i = 0, e = numDims; i < e; ++i) {
114 Operation *nested = &forOp.getBody()->front();
118 "loops with bounds depending on other mapped loops "
119 "are not supported");
126 auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
127 if (forOp.getBody()->empty() || std::next(begin, 2) != end)
128 return forOp.emitError(
"expected perfectly nested loops in the body");
130 if (!(forOp = dyn_cast<AffineForOp>(nested)))
131 return nested->
emitError(
"expected a nested loop");
137 unsigned numBlockDims,
138 unsigned numThreadDims) {
139 if (numBlockDims < 1 || numThreadDims < 1) {
140 LLVM_DEBUG(llvm::dbgs() <<
"nothing to map");
144 if (numBlockDims > 3) {
145 return forOp.emitError(
"cannot map to more than 3 block dimensions");
147 if (numThreadDims > 3) {
148 return forOp.emitError(
"cannot map to more than 3 thread dimensions");
156 struct AffineLoopToGpuConverter {
157 std::optional<AffineForOp> collectBounds(AffineForOp forOp,
160 void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
161 unsigned numBlockDims,
unsigned numThreadDims);
179 std::optional<AffineForOp>
180 AffineLoopToGpuConverter::collectBounds(AffineForOp forOp,
unsigned numLoops) {
182 dims.reserve(numLoops);
183 lbs.reserve(numLoops);
184 ivs.reserve(numLoops);
185 steps.reserve(numLoops);
186 AffineForOp currentLoop = forOp;
187 for (
unsigned i = 0; i < numLoops; ++i) {
190 if (!lowerBound || !upperBound) {
194 Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
195 upperBound, lowerBound);
199 builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step);
200 dims.push_back(range);
202 lbs.push_back(lowerBound);
203 ivs.push_back(currentLoop.getInductionVar());
204 steps.push_back(step);
206 if (i != numLoops - 1)
207 currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
216 void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
217 AffineForOp innermostForOp,
218 unsigned numBlockDims,
219 unsigned numThreadDims) {
220 OpBuilder builder(rootForOp.getOperation());
224 (numBlockDims < 3 || numThreadDims < 3)
225 ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
227 Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
228 Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
229 Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
230 Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
231 Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
232 Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
236 auto launchOp = builder.create<gpu::LaunchOp>(
237 rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
238 blockSizeY, blockSizeZ);
244 Operation &terminator = innermostForOp.getBody()->back();
245 Location terminatorLoc = terminator.getLoc();
247 builder.setInsertionPointToEnd(innermostForOp.getBody());
248 builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
249 launchOp.getBody().front().getOperations().splice(
250 launchOp.getBody().front().begin(),
251 innermostForOp.getBody()->getOperations());
257 builder.setInsertionPointToStart(&launchOp.getBody().front());
258 auto *lbArgumentIt = lbs.begin();
259 auto *stepArgumentIt = steps.begin();
262 en.index() < numBlockDims
264 :
getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
265 Value step = steps[en.index()];
267 id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);
269 Value ivReplacement =
270 builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
272 std::advance(lbArgumentIt, 1);
273 std::advance(stepArgumentIt, 1);
282 unsigned numBlockDims,
283 unsigned numThreadDims) {
287 AffineLoopToGpuConverter converter;
288 auto maybeInnerLoop =
289 converter.collectBounds(forOp, numBlockDims + numThreadDims);
292 converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);
298 unsigned numBlockDims,
299 unsigned numThreadDims) {
307 LogicalResult matchAndRewrite(ParallelOp parallelOp,
316 if (
auto op = upperBound.
getDefiningOp<arith::ConstantIndexOp>()) {
321 for (
const AffineExpr &result : minOp.getMap().getResults()) {
322 if (
auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
323 return rewriter.
create<arith::ConstantIndexOp>(minOp.getLoc(),
324 constExpr.getValue());
329 if (
auto minOp = upperBound.
getDefiningOp<arith::MinSIOp>()) {
330 for (
Value operand : {minOp.getLhs(), minOp.getRhs()}) {
336 if (
auto multiplyOp = upperBound.
getDefiningOp<arith::MulIOp>()) {
337 if (
auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
340 if (
auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
345 if ((lhs.value() < 0) != (rhs.value() < 0))
348 return rewriter.
create<arith::ConstantIndexOp>(
349 multiplyOp.getLoc(), lhs.value() * rhs.value());
357 return processor != gpu::Processor::Sequential;
362 case gpu::Processor::BlockX:
364 case gpu::Processor::BlockY:
366 case gpu::Processor::BlockZ:
368 case gpu::Processor::ThreadX:
370 case gpu::Processor::ThreadY:
372 case gpu::Processor::ThreadZ:
377 "invalid processor type while retrieving launch op argument number");
403 ParallelOp parallelOp, gpu::LaunchOp launchOp,
IRMapping &cloningMap,
412 if (!mapping || parallelOp.getNumResults() != 0)
417 auto launchIndependent = [&launchOp](
Value val) {
418 return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
421 auto ensureLaunchIndependent = [&rewriter,
423 if (launchIndependent(val))
425 if (
auto constOp = val.getDefiningOp<arith::ConstantOp>())
426 return rewriter.
create<arith::ConstantOp>(constOp.getLoc(),
431 for (
auto config : llvm::zip(
432 mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
433 parallelOp.getUpperBound(), parallelOp.getStep())) {
435 Value iv, lowerBound, upperBound, step;
436 std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
438 dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
440 return parallelOp.emitOpError()
441 <<
"expected mapping attribute for lowering to GPU";
443 gpu::Processor processor = annotation.getProcessor();
457 newIndex = rewriter.
create<AffineApplyOp>(
458 loc, annotation.getMap().compose(lowerAndStep),
459 ValueRange{operand, ensureLaunchIndependent(step),
460 ensureLaunchIndependent(lowerBound)});
463 if (annotation.getBound()) {
471 if (!launchIndependent(lowerBound) &&
472 !isa_and_nonnull<arith::ConstantOp>(lowerBound.
getDefiningOp()))
475 if (!launchIndependent(step) &&
480 bool boundIsPrecise =
481 launchIndependent(upperBound) ||
482 isa_and_nonnull<arith::ConstantOp>(upperBound.
getDefiningOp());
484 PatternRewriter::InsertionGuard guard(rewriter);
486 if (!boundIsPrecise) {
491 "cannot derive loop-invariant upper bound for number of"
502 Value launchBound = rewriter.
create<AffineApplyOp>(
503 loc, annotation.getBound().compose(stepMap),
505 ensureLaunchIndependent(
507 ensureLaunchIndependent(
512 if (bounds.contains(processor)) {
514 parallelOp,
"cannot redefine the bound for processor " +
515 Twine(
static_cast<int64_t
>(processor)));
517 bounds[processor] = launchBound;
519 if (!boundIsPrecise) {
521 Value originalBound = std::get<3>(config);
522 arith::CmpIOp pred = rewriter.
create<arith::CmpIOp>(
523 loc, arith::CmpIPredicate::slt, newIndex,
525 scf::IfOp ifOp = rewriter.
create<scf::IfOp>(loc, pred,
false);
530 worklist.push_back(launchOp.getOperation());
535 auto loopOp = rewriter.
create<scf::ForOp>(
539 newIndex = loopOp.getInductionVar();
544 worklist.push_back(launchOp.getOperation());
546 cloningMap.
map(iv, newIndex);
551 for (
const auto &namedAttr : parallelOp->getAttrs()) {
553 namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
555 launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
558 Block *body = parallelOp.getBody();
559 worklist.reserve(worklist.size() + body->
getOperations().size());
561 worklist.push_back(&op);
595 ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
602 if (
auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
608 rewriter.
create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
609 gpu::LaunchOp launchOp = rewriter.
create<gpu::LaunchOp>(
613 rewriter.
create<gpu::TerminatorOp>(loc);
620 launchBounds, rewriter)))
624 bool seenSideeffects =
false;
626 bool leftNestingScope =
false;
627 while (!worklist.empty()) {
634 if (
auto nestedParallel = dyn_cast<ParallelOp>(op)) {
643 worklist, launchBounds, rewriter)))
645 }
else if (op == launchOp.getOperation()) {
650 leftNestingScope =
true;
651 seenSideeffects =
false;
661 if (seenSideeffects && leftNestingScope)
668 for (
auto bound : launchBounds)
677 patterns.
add<ParallelToGpuLaunchLowering>(patterns.
getContext());
689 op->
walk([](scf::ParallelOp parallelOp) {
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp, unsigned numDims)
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder)
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos)
static LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap, SmallVectorImpl< Operation * > &worklist, DenseMap< gpu::Processor, Value > &bounds, PatternRewriter &rewriter)
Modifies the current transformation state to capture the effect of the given scf.parallel operation o...
static bool isMappedToProcessor(gpu::Processor processor)
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp)
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder)
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder)
static Value deriveStaticUpperBound(Value upperBound, PatternRewriter &rewriter)
Tries to derive a static upper bound from the defining operation of upperBound.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor)
static constexpr StringLiteral kVisitedAttrName
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp)
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
Base type for affine expression.
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
OpListType & getOperations()
iterator_range< iterator > without_terminator()
Return an iterator range over the operation within this block excluding the terminator operation at t...
AffineExpr getAffineSymbolExpr(unsigned position)
AffineExpr getAffineDimExpr(unsigned position)
This class describes a specific conversion target.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
This is a utility class for mapping one set of IR entities to another.
auto lookupOrDefault(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
This class helps build Operations.
Block::iterator getInsertionPoint() const
Returns the current insertion point of the builder.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
This class implements the operand iterators for the Operation class.
Operation is the basic unit of execution within MLIR.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
unsigned getNumRegions()
Returns the number of regions held by this operation.
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
result_range getResults()
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
This class contains a list of basic blocks and a link to the parent operation it is attached to.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
void replaceAllUsesWith(Value newValue)
Replace all uses of 'this' value with the new value, updating anything in the IR that uses 'this' to ...
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
StringRef getMappingAttrName()
Name of the mapping attribute produced by loop mappers.
Value constantOne(OpBuilder &builder, Location loc, Type tp)
Generates a 1-valued constant of the given type.
Include the generated interface declarations.
void finalizeParallelLoopToGPUConversion(Operation *op)
Clean up after applyPartialConversion/applyFullConversion call.
void populateParallelLoopToGPUPatterns(RewritePatternSet &patterns)
Adds the conversion pattern from scf.parallel to gpu.launch to the provided pattern list.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult convertAffineLoopNestToGPULaunch(affine::AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims)
Convert a perfect affine loop nest with the outermost loop identified by forOp into a gpu::Launch ope...
bool isMemoryEffectFree(Operation *op)
Returns true if the given operation is free of memory effects.
Value lowerAffineUpperBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the upper bound of the given affine loop using standard arithmetic operations...
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
bool areValuesDefinedAbove(Range values, Region &limit)
Check if all values in the provided range are defined above the limit region.
void configureParallelLoopToGPULegality(ConversionTarget &target)
Configures the rewrite target such that only scf.parallel operations that are not rewritten by the pr...
Value lowerAffineLowerBound(affine::AffineForOp op, OpBuilder &builder)
Emit code that computes the lower bound of the given affine loop using standard arithmetic operations...
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Utility class for the GPU dialect to represent triples of Values accessible through ....