#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Extract an indexed value from KernelDim3: positions 0, 1 and 2 correspond
// to the x, y and z components, respectively.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
}

// Get the lower-bound-related operands of the loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper-bound-related operands of the loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}
// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can be empty.
    if (i == e - 1)
      break;

    // The body of every non-innermost loop must contain exactly the nested
    // loop plus the terminator.
    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}
namespace {
// Helper structure that holds the common state of the loop-to-GPU-kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace
// Check whether the value is a compile-time constant equal to 1.
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<arith::ConstantIndexOp>())
    return def.value() == 1;
  return false;
}
// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// Returns the innermost loop on success, llvm::None on failure.
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
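// Worked example (illustrative only): for "affine.for %i = 4 to 20 step 2",
// collectBounds records lb = 4, step = 2 and range = (20 - 4) / 2 = 8, i.e.
// the loop runs 8 iterations; this range later becomes the corresponding
// grid or block size of the launch.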
// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// The ranges of the mapped loops have already been collected into "dims" by
// collectBounds; "innermostForOp" is the last loop to be mapped.
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation.  If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
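  // Layout reminder (illustrative): "dims" stores the block-mapped ranges
  // first, then the thread-mapped ones.  With numBlockDims = 2 and
  // numThreadDims = 1, dims[0] and dims[1] become gridSizeX/gridSizeY,
  // dims[2] becomes blockSizeX, and the remaining sizes default to constOne.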
  // Create a launch op and move the body region of the innermost loop into
  // the launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the
  // gpu launch body block.  Do not move the entire block because of the
  // difference in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());
  // Remap the loop induction variables to use block/thread identifiers
  // instead.  Loops may iterate from LB with step S whereas GPU thread/block
  // ids always iterate from 0 to N with step 1.  Therefore, each induction
  // variable is replaced with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}
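// Worked example (illustrative only): a loop "affine.for %i = 4 to 20 step 2"
// mapped to the x-dimension of blocks is rewritten so that every use of %i
// inside the launch body reads "%bx * 2 + 4", where %bx is the blockIdx.x
// argument of the gpu.launch region and the grid size along x is 8.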
// Generic loop-to-GPU-kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}
LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                            numThreadDims);
}
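// A minimal usage sketch (hypothetical driver, not part of this file): map
// every outermost affine loop nest in a function to one block and one thread
// dimension.  "funcOp" is an assumed handle to the enclosing function op.
//
//   funcOp->walk([](AffineForOp forOp) {
//     if (!forOp->getParentOfType<AffineForOp>())
//       (void)convertAffineLoopNestToGPULaunch(forOp, /*numBlockDims=*/1,
//                                              /*numThreadDims=*/1);
//   });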
/// Tries to derive a static upper bound from the defining operation of
/// "upperBound".
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>())
    return op;

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>())
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getLhs(), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getRhs(), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no
        // longer hold if multiplied by a negative value, so abort.
        if (lhs.value() < 0 || rhs.value() < 0)
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}
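// Worked example (illustrative only): if the upper bound is defined as
//   %c3  = arith.constant 3 : index
//   %min = affine.min affine_map<(d0) -> (d0, 4)>(%n)
//   %ub  = arith.muli %min, %c3 : index
// the recursion picks the constant expression 4 out of the affine.min map,
// derives 3 from the constant operand, and returns a static bound of 12.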
static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:
    break;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}
/// Modifies the current transformation state to capture the effect of the
/// given scf.parallel operation on index substitutions and the operations to
/// be inserted.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential.
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        mappingAttribute.dyn_cast<gpu::ParallelLoopDimMappingAttr>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/block index as replacement for the loop
      // induction variable.
      Value operand =
          launchOp.body().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and fold the lower bound and step computations in:
      // this computes operand * step + lowerBound.  An affine map is used so
      // that it composes with the map provided by the annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
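      // Composition sketch (illustrative): lowerAndStep is the map
      // (d0)[s0, s1] -> (d0 * s0 + s1), so with an identity annotation map
      // the new index evaluates to hardwareId * step + lowerBound.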
      // If there was also a bound, insert that, too.
      if (annotation.getBound()) {
        // The bound map is fed the number of iterations, i.e.
        // (upperBound - lowerBound) ceilDiv step.  The lower bound must be
        // constant or defined before the launch; otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, it can
        // be used directly; otherwise try to derive a static approximation.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of"
                  " iterations");
            }
          }
          // Compute the number of iterations and compose it with the bound
          // map provided by the annotation.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              (rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                  .ceilDiv(rewriter.getAffineSymbolExpr(1)));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{ensureLaunchIndependent(
                             cloningMap.lookupOrDefault(upperBound)),
                         ensureLaunchIndependent(
                             cloningMap.lookupOrDefault(lowerBound)),
                         ensureLaunchIndependent(
                             cloningMap.lookupOrDefault(step))});
          // TODO: Update the behavior of setMappingAttr when this condition
          // is relaxed; currently a bound may be set at most once per
          // processor.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, so create a surrounding
          // conditional that masks off out-of-range iterations.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again.  We use the launchOp here, as that cannot be
          // contained in the body.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential scf.for loop for dimensions mapped to
      // gpu::Processor::Sequential.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again.  We use the launchOp here, as that cannot be
      // contained in the body.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }
  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}
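// For illustration (a hypothetical input, not part of the original file): a
// mapped loop along the lines of
//
//   scf.parallel (%i) = (%lb) to (%ub) step (%s) {
//     ...
//   } {mapping = [#gpu.loop_dim_map<processor = block_x,
//                                    map = (d0) -> (d0),
//                                    bound = (d0) -> (d0)>]}
//
// contributes the substitution %i -> blockIdx.x * %s + %lb and registers
// roughly ceil((%ub - %lb) / %s) as the launch bound for the x-dimension of
// blocks.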
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for the recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outermost loop.  Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation.  We start with bound one for all grid/block
  // sizes; those are refined later as they are discovered from the mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effecting operation so far; reset when
  // leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Cloning is only correct if there is no further scf.parallel nested or
    // the code is side-effect free; be conservative and allow side-effects
    // only in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side-effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // This also updates the worklist with the loop body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value.  We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise, clone the operation into the launch body.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.  TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side-effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that the launch operation was created successfully, update its
  // bounds with those discovered from the mapping annotations.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}
void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}
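// A minimal driver sketch (hypothetical, not part of this file): run the
// pattern as a partial conversion over a module "module" and clean up the
// visited markers afterwards.
//
//   RewritePatternSet patterns(module.getContext());
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*module.getContext());
//   configureParallelLoopToGPULegality(target);
//   if (succeeded(applyPartialConversion(module, target,
//                                        std::move(patterns))))
//     finalizeParallelLoopToGPUConversion(module);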
void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}