1//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This implements a straightforward conversion of a loop nest into a GPU
10// kernel. The caller is expected to guarantee that the conversion is correct
11// or to further transform the kernel to ensure correctness.
12//
13//===----------------------------------------------------------------------===//
14
15#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
16
17#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
18#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
19#include "mlir/Dialect/Affine/IR/AffineOps.h"
20#include "mlir/Dialect/Arith/IR/Arith.h"
21#include "mlir/Dialect/GPU/IR/GPUDialect.h"
22#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
23#include "mlir/Dialect/MemRef/IR/MemRef.h"
24#include "mlir/Dialect/SCF/IR/SCF.h"
25#include "mlir/IR/AffineExpr.h"
26#include "mlir/IR/Builders.h"
27#include "mlir/IR/IRMapping.h"
28#include "mlir/Interfaces/SideEffectInterfaces.h"
29#include "mlir/Transforms/DialectConversion.h"
30#include "mlir/Transforms/RegionUtils.h"
31#include "llvm/ADT/DenseSet.h"
32#include "llvm/Support/DebugLog.h"
33#include <optional>
34
35#define DEBUG_TYPE "loops-to-gpu"
36
37using namespace mlir;
38using namespace mlir::affine;
39using namespace mlir::scf;
40
41// Name of internal attribute to mark visited operations during conversion.
42//
43// NOTE: The conversion originally used the following legality criteria:
44// `!parallelOp->hasAttr(gpu::getMappingAttrName())`
45// But the provided pattern might reject some cases based on more detailed
46// analysis of the `mapping` attribute.
47// To avoid dialect conversion failure due to a non-converted illegal operation,
48// we use this extra Unit attribute as a marker that the operation was checked
49// by the pattern and should be considered legal in the following legality
50// checks. The `finalizeParallelLoopToGPUConversion` function performs cleanup
51// of this extra attribute and is supposed to be called after the dialect
52// conversion.
53//
54// TODO: Implement a cleaner solution, factoring out the "matching" logic
55// from the pattern and its callees into a separate function that can be called
56// from both the pattern and the op legality check.
57static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";
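//
// Illustrative driver (a sketch, not code from this file): the pattern, the
// legality configuration, and the cleanup described in the note above are
// expected to be combined roughly as follows, where `op` is the operation
// whose body is converted:
//
//   RewritePatternSet patterns(op->getContext());
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*op->getContext());
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(op, target, std::move(patterns))))
//     return failure();
//   finalizeParallelLoopToGPUConversion(op);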
58
59// Extract an indexed value from KernelDim3.
60static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
61 switch (pos) {
62 case 0:
63 return dim3.x;
64 case 1:
65 return dim3.y;
66 case 2:
67 return dim3.z;
68 default:
69 llvm_unreachable("dim3 position out of bounds");
70 }
71 return nullptr;
72}
73
74// Get the lower bound-related operands of a loop operation.
75static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
76 return forOp.getLowerBoundOperands();
77}
78
79// Get the upper bound-related operands of a loop operation.
80static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
81 return forOp.getUpperBoundOperands();
82}
83
84// Get a Value that corresponds to the loop step. If the step is an attribute,
85// materialize a corresponding constant using builder.
86static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
87 return arith::ConstantIndexOp::create(builder, forOp.getLoc(),
88 forOp.getStepAsInt());
89}
90
91// Get a Value for the loop lower bound. If the value requires computation,
92// materialize the instructions using builder.
93static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
94 return lowerAffineLowerBound(forOp, builder);
95}
96
97// Get a Value for the loop upper bound. If the value requires computation,
98// materialize the instructions using builder.
99static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
100 return lowerAffineUpperBound(forOp, builder);
101}
102
103// Check the structure of the loop nest:
104// - there are enough loops to map to numDims;
105// - the loops are perfectly nested;
106// - the loop bounds can be computed above the outermost loop.
107// This roughly corresponds to the "matcher" part of the pattern-based
108// rewriting infrastructure.
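//
// For illustration (assuming numDims == 2), a nest such as
//
//   affine.for %i = 0 to %N {
//     affine.for %j = 0 to %M {
//       "some.op"() : () -> ()
//     }
//   }
//
// is mappable: the inner loop is the only operation in the outer loop's body
// (besides the implicit terminator) and all bounds are defined above the
// outermost loop.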
109static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
110 unsigned numDims) {
111 Region &limit = forOp.getRegion();
112 for (unsigned i = 0, e = numDims; i < e; ++i) {
113 Operation *nested = &forOp.getBody()->front();
114 if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
115 !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
116 return forOp.emitError(
117 "loops with bounds depending on other mapped loops "
118 "are not supported");
119
120 // The innermost loop can have an arbitrary body, skip the perfect nesting
121 // check for it.
122 if (i == e - 1)
123 break;
124
125 auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
126 if (forOp.getBody()->empty() || std::next(begin, 2) != end)
127 return forOp.emitError("expected perfectly nested loops in the body");
128
129 if (!(forOp = dyn_cast<AffineForOp>(nested)))
130 return nested->emitError("expected a nested loop");
131 }
132 return success();
133}
134
135static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
136 unsigned numBlockDims,
137 unsigned numThreadDims) {
138 if (numBlockDims < 1 || numThreadDims < 1) {
139 LDBG() << "nothing to map";
140 return success();
141 }
142
143 if (numBlockDims > 3) {
144 return forOp.emitError("cannot map to more than 3 block dimensions");
145 }
146 if (numThreadDims > 3) {
147 return forOp.emitError("cannot map to more than 3 thread dimensions");
148 }
149 return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
150}
151
152namespace {
153// Helper structure that holds common state of the loop to GPU kernel
154// conversion.
155struct AffineLoopToGpuConverter {
156 std::optional<AffineForOp> collectBounds(AffineForOp forOp,
157 unsigned numLoops);
158
159 void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
160 unsigned numBlockDims, unsigned numThreadDims);
161
162 // Ranges of the loops mapped to blocks or threads.
163 SmallVector<Value, 6> dims;
164 // Lower bounds of the loops mapped to blocks or threads.
165 SmallVector<Value, 6> lbs;
166 // Induction variables of the loops mapped to blocks or threads.
167 SmallVector<Value, 6> ivs;
168 // Steps of the loops mapped to blocks or threads.
169 SmallVector<Value, 6> steps;
170};
171} // namespace
172
173// Collect ranges, bounds, steps and induction variables in preparation for
174// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
175// This may fail if the IR for computing loop bounds cannot be constructed, for
176// example if an affine loop uses semi-affine maps. Return the last loop to be
177// mapped on success, std::nullopt on failure.
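//
// For example (illustrative), a loop "affine.for %i = %lb to %ub step 2"
// contributes range = ceildiv(%ub - %lb, 2), lower bound %lb, step 2 and its
// induction variable to the vectors below; the range later becomes a grid or
// block dimension of the launch.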
178std::optional<AffineForOp>
179AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
180 OpBuilder builder(forOp.getOperation());
181 dims.reserve(numLoops);
182 lbs.reserve(numLoops);
183 ivs.reserve(numLoops);
184 steps.reserve(numLoops);
185 AffineForOp currentLoop = forOp;
186 for (unsigned i = 0; i < numLoops; ++i) {
187 Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
188 Value upperBound = getOrEmitUpperBound(currentLoop, builder);
189 if (!lowerBound || !upperBound) {
190 return std::nullopt;
191 }
192
193 Value range = arith::SubIOp::create(builder, currentLoop.getLoc(),
194 upperBound, lowerBound);
195 Value step = getOrCreateStep(currentLoop, builder);
196 if (getConstantIntValue(step) != static_cast<int64_t>(1))
197 range = arith::CeilDivSIOp::create(builder, currentLoop.getLoc(), range,
198 step);
199 dims.push_back(range);
200
201 lbs.push_back(lowerBound);
202 ivs.push_back(currentLoop.getInductionVar());
203 steps.push_back(step);
204
205 if (i != numLoops - 1)
206 currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
207 }
208 return currentLoop;
209}
210
211// Replace the loop nest rooted at "rootForOp" with a GPU launch operation. This expects
212// "innermostForOp" to point to the last loop to be transformed to the kernel,
213// and to have (numBlockDims + numThreadDims) perfectly nested loops between
214// "rootForOp" and "innermostForOp".
215void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
216 AffineForOp innermostForOp,
217 unsigned numBlockDims,
218 unsigned numThreadDims) {
219 OpBuilder builder(rootForOp.getOperation());
220 // Prepare the grid and block sizes for the launch operation. If there is
221 // no loop mapped to a specific dimension, use constant "1" as its size.
222 Value constOne =
223 (numBlockDims < 3 || numThreadDims < 3)
224 ? arith::ConstantIndexOp::create(builder, rootForOp.getLoc(), 1)
225 : nullptr;
226 Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
227 Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
228 Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
229 Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
230 Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
231 Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
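  // For example (illustrative), with numBlockDims = 1 and numThreadDims = 2,
  // the launch uses grid sizes (dims[0], 1, 1) and block sizes
  // (dims[1], dims[2], 1).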
232
233 // Create a launch op and move the body region of the innermost loop to the
234 // launch op.
235 auto launchOp =
236 gpu::LaunchOp::create(builder, rootForOp.getLoc(), gridSizeX, gridSizeY,
237 gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);
238
239 // Replace the loop terminator (loops contain only a single block) with the
240 // gpu terminator and move the operations from the loop body block to the gpu
241 // launch body block. Do not move the entire block because of the difference
242 // in block arguments.
243 Operation &terminator = innermostForOp.getBody()->back();
244 Location terminatorLoc = terminator.getLoc();
245 terminator.erase();
246 builder.setInsertionPointToEnd(innermostForOp.getBody());
247 gpu::TerminatorOp::create(builder, terminatorLoc, TypeRange());
248 launchOp.getBody().front().getOperations().splice(
249 launchOp.getBody().front().begin(),
250 innermostForOp.getBody()->getOperations());
251
252 // Remap the loop iterators to use block/thread identifiers instead. Loops
253 // may iterate from LB with step S whereas GPU thread/block ids always iterate
254 // from 0 to N with step 1. Therefore, loop induction variables are replaced
255 // with (gpu-thread/block-id * S) + LB.
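  // For example (illustrative), a loop "affine.for %i = %lb to %ub step 4"
  // mapped to the x block dimension rewrites every use of %i into
  // (%block_id_x * 4) + %lb.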
256 builder.setInsertionPointToStart(&launchOp.getBody().front());
257 auto *lbArgumentIt = lbs.begin();
258 auto *stepArgumentIt = steps.begin();
259 for (const auto &en : llvm::enumerate(ivs)) {
260 Value id =
261 en.index() < numBlockDims
262 ? getDim3Value(launchOp.getBlockIds(), en.index())
263 : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
264 Value step = steps[en.index()];
265 if (getConstantIntValue(step) != static_cast<int64_t>(1))
266 id = arith::MulIOp::create(builder, rootForOp.getLoc(), step, id);
267
268 Value ivReplacement =
269 arith::AddIOp::create(builder, rootForOp.getLoc(), *lbArgumentIt, id);
270 en.value().replaceAllUsesWith(ivReplacement);
271 std::advance(lbArgumentIt, 1);
272 std::advance(stepArgumentIt, 1);
273 }
274
275 // We are done and can erase the original outermost loop.
276 rootForOp.erase();
277}
278
279// Generic loop to GPU kernel conversion function.
280static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
281 unsigned numBlockDims,
282 unsigned numThreadDims) {
283 if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
284 return failure();
285
286 AffineLoopToGpuConverter converter;
287 auto maybeInnerLoop =
288 converter.collectBounds(forOp, numBlockDims + numThreadDims);
289 if (!maybeInnerLoop)
290 return failure();
291 converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);
292
293 return success();
294}
295
296LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
297 unsigned numBlockDims,
298 unsigned numThreadDims) {
299 return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
300}
301
302namespace {
303struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
304 using OpRewritePattern<ParallelOp>::OpRewritePattern;
305
306 LogicalResult matchAndRewrite(ParallelOp parallelOp,
307 PatternRewriter &rewriter) const override;
308};
309} // namespace
310
311/// Tries to derive a static upper bound from the defining operation of
312/// `upperBound`.
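/// For example (illustrative), if `upperBound` is defined by
///   %ub = affine.min affine_map<(d0) -> (d0, 64)>(%n)
/// the first constant result of the map, 64, is materialized as an
/// arith.constant of index type and returned. A null Value is returned if no
/// static bound can be derived.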
313static Value deriveStaticUpperBound(Value upperBound,
314 PatternRewriter &rewriter) {
315 if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
316 return op;
317 }
318
319 if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
320 for (const AffineExpr &result : minOp.getMap().getResults()) {
321 if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
322 return arith::ConstantIndexOp::create(rewriter, minOp.getLoc(),
323 constExpr.getValue());
324 }
325 }
326 }
327
328 if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
329 for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
330 if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
331 return staticBound;
332 }
333 }
334
335 if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
336 if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
337 deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
338 .getDefiningOp()))
339 if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
340 deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
341 .getDefiningOp())) {
342 // Assumptions about the upper bound of minimum computations no longer
343 // work if multiplied by mixed signs, so abort in this case.
344 if ((lhs.value() < 0) != (rhs.value() < 0))
345 return {};
346
347 return arith::ConstantIndexOp::create(rewriter, multiplyOp.getLoc(),
348 lhs.value() * rhs.value());
349 }
350 }
351
352 return {};
353}
354
355static bool isMappedToProcessor(gpu::Processor processor) {
356 return processor != gpu::Processor::Sequential;
357}
358
359static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
360 switch (processor) {
361 case gpu::Processor::BlockX:
362 return 0;
363 case gpu::Processor::BlockY:
364 return 1;
365 case gpu::Processor::BlockZ:
366 return 2;
367 case gpu::Processor::ThreadX:
368 return 3;
369 case gpu::Processor::ThreadY:
370 return 4;
371 case gpu::Processor::ThreadZ:
372 return 5;
373 default:;
374 }
375 llvm_unreachable(
376 "invalid processor type while retrieving launch op argument number");
377}
378
379/// Modifies the current transformation state to capture the effect of the given
380/// `scf.parallel` operation on index substitutions and the operations to be
381/// inserted.
382/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
383/// this function will
384/// - compute the loop index based on the hardware id and affine map from the
385/// mapping and update `cloningMap` to substitute all uses.
386/// - derive a new upper bound for the hardware id and augment the provided
387/// `gpu.launch` operation accordingly.
388/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
389/// and update the rewriter to insert into the conditional's body.
390/// If the dimension is mapped to sequential,
391/// - insert a for loop into the body and update the rewriter to insert into
392/// the for loop's body.
393/// - update the `cloningMap` to replace uses of the index with the index of
394/// the new for loop.
395/// In either case,
396/// - append the operations from the loop's body to the worklist, in reverse order.
397/// To note the end of the current scope in case a loop or conditional was
398/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
399/// worklist. This signals the processor of the worklist to pop the rewriter
400/// one scope-level up.
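/// For illustration, a loop dimension annotated with the GPU dialect's
/// mapping attribute as
///   #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0),
///                     bound = (d0) -> (d0)>
/// is mapped to the x block id, with identity maps for both the index
/// substitution and the launch bound.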
401static LogicalResult processParallelLoop(
402 ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
403 SmallVectorImpl<Operation *> &worklist,
404 DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
405 // TODO: Verify that this is a valid GPU mapping.
406 // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
407 ArrayAttr mapping =
408 parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
409
410 // TODO: Support multiple reductions.
411 if (!mapping || parallelOp.getNumResults() > 1)
412 return failure();
413
414 Location loc = parallelOp.getLoc();
415
416 auto launchIndependent = [&launchOp](Value val) {
417 return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
418 };
419
420 auto ensureLaunchIndependent = [&rewriter,
421 launchIndependent](Value val) -> Value {
422 if (launchIndependent(val))
423 return val;
424 if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
425 return arith::ConstantOp::create(rewriter, constOp.getLoc(),
426 constOp.getValue());
427 return {};
428 };
429
430 for (auto config : llvm::zip(
431 mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
432 parallelOp.getUpperBound(), parallelOp.getStep())) {
433 Attribute mappingAttribute;
434 Value iv, lowerBound, upperBound, step;
435 std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
436 auto annotation =
437 dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
438 if (!annotation)
439 return parallelOp.emitOpError()
440 << "expected mapping attribute for lowering to GPU";
441 Value newIndex;
442 gpu::Processor processor = annotation.getProcessor();
443
444 if (isMappedToProcessor(processor)) {
445 // Use the corresponding thread/grid index as replacement for the loop iv.
446 Value operand =
447 launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
448 // Take the index map and add the lower bound and step computations in.
449 // This computes operand * step + lowerBound.
450 // Use an affine map here so that it composes nicely with the provided
451 // annotation.
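      // For example (illustrative), with an identity annotation map, a
      // hardware id %id for a loop with step %c2 and lower bound %c8 becomes
      //   affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%id)[%c2, %c8]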
452 AffineMap lowerAndStep = AffineMap::get(
453 1, 2,
454 rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
455 rewriter.getAffineSymbolExpr(1));
456 // Map through cloningMap first so we use values valid at the launch
457 // scope, then ensure they are launch-independent (or cloned constants).
458 Value mappedStep = cloningMap.lookupOrDefault(step);
459 Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);
460
461 mappedStep = ensureLaunchIndependent(mappedStep);
462 mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
463
464 // If either cannot be made available above the launch, fail gracefully.
465 if (!mappedStep || !mappedLowerBound) {
466 return rewriter.notifyMatchFailure(
467 parallelOp, "lower bound / step must be constant or defined above "
468 "the gpu.launch");
469 }
470
471 newIndex = AffineApplyOp::create(
472 rewriter, loc, annotation.getMap().compose(lowerAndStep),
473 ValueRange{operand, mappedStep, mappedLowerBound});
474 // If there was also a bound, insert that, too.
475 // TODO: Check that we do not assign bounds twice.
476 if (annotation.getBound()) {
477 // We pass as the single operand to the bound-map the number of
478 // iterations, which is (upperBound - lowerBound) ceilDiv step. To
479 // support inner loops with dynamic upper bounds (as generated by e.g.
480 // tiling), try to derive a max for the bounds. If the used bound for
481 // the hardware id is imprecise, wrap the contained code into a
482 // conditional. If the lower-bound is constant or defined before the
483 // launch, we can use it in the launch bounds. Otherwise fail.
484 if (!launchIndependent(lowerBound) &&
485 !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
486 return failure();
487 // The step must also be constant or defined outside of the loop nest.
488 if (!launchIndependent(step) &&
489 !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
490 return failure();
491 // If the upper-bound is constant or defined before the launch, we can
492 // use it in the launch bounds directly. Otherwise, try to derive a bound.
493 bool boundIsPrecise =
494 launchIndependent(upperBound) ||
495 isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
496 {
497 PatternRewriter::InsertionGuard guard(rewriter);
498 rewriter.setInsertionPoint(launchOp);
499 if (!boundIsPrecise) {
500 upperBound = deriveStaticUpperBound(upperBound, rewriter);
501 if (!upperBound) {
502 return rewriter.notifyMatchFailure(
503 parallelOp,
504 "cannot derive loop-invariant upper bound for number of"
505 "iterations");
506 }
507 }
508 // Compute the number of iterations needed. We compute this as the
509 // affine expression (upperBound - lowerBound) ceilDiv step. We use
510 // affine.apply here so that it composes nicely with the provided map.
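          // For example (illustrative), with lowerBound = 0, upperBound = 100
          // and step = 32, this evaluates to ceildiv(100 - 0, 32) = 4
          // hardware ids along this dimension.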
511 AffineMap stepMap = AffineMap::get(
512 1, 2,
513 ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
514 .ceilDiv(rewriter.getAffineSymbolExpr(1))));
515 Value launchBound = AffineApplyOp::create(
516 rewriter, loc, annotation.getBound().compose(stepMap),
517 ValueRange{
518 ensureLaunchIndependent(
519 cloningMap.lookupOrDefault(upperBound)),
520 ensureLaunchIndependent(
521 cloningMap.lookupOrDefault(lowerBound)),
522 ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
523 // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
524 // when this condition is relaxed.
525 if (!bounds.try_emplace(processor, launchBound).second) {
526 return rewriter.notifyMatchFailure(
527 parallelOp, "cannot redefine the bound for processor " +
528 Twine(static_cast<int64_t>(processor)));
529 }
530 }
531 if (!boundIsPrecise) {
532 // We are using an approximation, create a surrounding conditional.
533 Value originalBound = std::get<3>(config);
534 arith::CmpIOp pred = arith::CmpIOp::create(
535 rewriter, loc, arith::CmpIPredicate::slt, newIndex,
536 cloningMap.lookupOrDefault(originalBound));
537 scf::IfOp ifOp = scf::IfOp::create(rewriter, loc, pred, false);
538 rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
539 // Put a sentinel into the worklist so we know when to pop out of the
540 // if body again. We use the launchOp here, as that cannot be part of
541 // the body's operations.
542 worklist.push_back(launchOp.getOperation());
543 }
544 }
545 } else {
546 // Create a sequential for loop.
547 auto loopOp = scf::ForOp::create(rewriter, loc,
548 cloningMap.lookupOrDefault(lowerBound),
549 cloningMap.lookupOrDefault(upperBound),
550 cloningMap.lookupOrDefault(step));
551 newIndex = loopOp.getInductionVar();
552 rewriter.setInsertionPointToStart(loopOp.getBody());
553 // Put a sentinel into the worklist so we know when to pop out of the loop
554 // body again. We use the launchOp here, as that cannot be part of the
555 // body's operations.
556 worklist.push_back(launchOp.getOperation());
557 }
558 cloningMap.map(iv, newIndex);
559 }
560
561 // Propagate custom user-defined optional attributes that can be used at a
562 // later stage, such as extension data for GPU kernel dispatch.
563 for (const auto &namedAttr : parallelOp->getAttrs()) {
564 if (namedAttr.getName() == gpu::getMappingAttrName() ||
565 namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
566 continue;
567 launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
568 }
569
570 Block *body = parallelOp.getBody();
571 worklist.reserve(worklist.size() + body->getOperations().size());
572 // Include the scf.reduce terminator if it exists and has an operand.
573 if (auto terminator = body->getTerminator();
574 isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
575 worklist.push_back(terminator);
576 }
577 for (Operation &op : llvm::reverse(body->without_terminator()))
578 worklist.push_back(&op);
579 return success();
580}
581
582/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
583/// operation.
584///
585/// This essentially transforms a loop nest into a corresponding SIMT function.
586/// The conversion is driven by mapping annotations on the `scf.parallel`
587/// operations. The mapping is provided via a `DictionaryAttribute` named
588/// `mapping`, which has three entries:
589/// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
590/// thread dimensions and 6 is sequential.
591/// - map : An affine map that is used to pre-process hardware ids before
592/// substitution.
593/// - bound : An affine map that is used to compute the bound of the hardware
594/// id based on an upper bound of the number of iterations.
595/// If the `scf.parallel` contains nested `scf.parallel` operations, those
596/// need to be annotated, as well. Structurally, the transformation works by
597/// splicing all operations from nested `scf.parallel` operations into a single
598/// sequence. Indices mapped to hardware ids are substituted with those ids,
599/// whereas sequential mappings result in a sequential for-loop. To have more
600/// flexibility when mapping code to hardware ids, the transform supports two
601/// affine maps. The first `map` is used to compute the actual index for
602/// substitution from the hardware id. The second `bound` is used to compute the
603/// launch dimension for the hardware id from the number of iterations the
604/// mapped loop is performing. Note that the number of iterations might be
605/// imprecise if the corresponding loop bounds are loop-dependent. In such cases,
606/// the hardware id might iterate over additional indices. The transformation
607/// caters for this by predicating the created sequence of instructions on
608/// the actual loop bound. This only works if a static upper bound for the
609/// dynamic loop bound can be derived, currently via analyzing `affine.min`
610/// operations.
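///
/// For illustration (a sketch, not verified IR), an annotated loop such as
///
///   scf.parallel (%i) = (%c0) to (%n) step (%c1) {
///     ...
///   } {mapping = [#gpu.loop_dim_map<processor = block_x,
///                                   map = (d0) -> (d0),
///                                   bound = (d0) -> (d0)>]}
///
/// becomes a `gpu.launch` whose x grid size is derived from %n and whose body
/// uses the x block id in place of %i.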
611LogicalResult
612ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
613 PatternRewriter &rewriter) const {
614 // Mark the operation as visited for recursive legality check.
615 parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());
616
617 // We can only transform starting at the outer-most loop. Launches inside of
618 // parallel loops are not supported.
619 if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
620 return failure();
621 // Create a launch operation. We start with a bound of one for all grid and
622 // block sizes. Those will be refined later as we discover them from mappings.
623 Location loc = parallelOp.getLoc();
624 Value constantOne =
625 arith::ConstantIndexOp::create(rewriter, parallelOp.getLoc(), 1);
626 gpu::LaunchOp launchOp = gpu::LaunchOp::create(
627 rewriter, parallelOp.getLoc(), constantOne, constantOne, constantOne,
628 constantOne, constantOne, constantOne);
629 rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
630 gpu::TerminatorOp::create(rewriter, loc);
631 rewriter.setInsertionPointToStart(&launchOp.getBody().front());
632
633 IRMapping cloningMap;
634 llvm::DenseMap<gpu::Processor, Value> launchBounds;
635 SmallVector<Operation *, 16> worklist;
636 if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
637 launchBounds, rewriter)))
638 return failure();
639
640 // Whether we have seen any side-effects. Reset when leaving an inner scope.
641 bool seenSideeffects = false;
642 // Whether we have left a nesting scope (and hence are no longer innermost).
643 bool leftNestingScope = false;
644 LocalAliasAnalysis aliasAnalysis;
645 llvm::DenseSet<Value> writtenBuffer;
646 while (!worklist.empty()) {
647 Operation *op = worklist.pop_back_val();
648 // Now walk over the body and clone it.
649 // TODO: This is only correct if either there is no further scf.parallel
650 // nested inside, or this code has side effects but the buffers it writes
651 // do not alias the buffers accessed by the inner loop. Otherwise we might
652 // need predication.
653 if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
654 // Before entering a nested scope, make sure that either no side effects
655 // have been seen so far, or the nested operations do not access the
656 // buffers written by the outer scope.
657 if (seenSideeffects) {
658 WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) {
659 if (isMemoryEffectFree(nestedOp))
660 return WalkResult::advance();
661
662 auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
663 if (!memEffectInterface)
664 return WalkResult::advance();
665
666 SmallVector<MemoryEffects::EffectInstance> effects;
667 memEffectInterface.getEffects(effects);
668 for (const MemoryEffects::EffectInstance &effect : effects) {
669 if (isa<MemoryEffects::Read>(effect.getEffect()) ||
670 isa<MemoryEffects::Write>(effect.getEffect())) {
671 Value baseBuffer = effect.getValue();
672 if (!baseBuffer)
673 return WalkResult::interrupt();
674 for (Value val : writtenBuffer) {
675 if (aliasAnalysis.alias(baseBuffer, val) !=
676 AliasResult::NoAlias)
677 return WalkResult::interrupt();
678 }
679 }
680 }
681 }
682 return WalkResult::advance();
683 });
684 if (walkRes.wasInterrupted())
685 return failure();
686 }
687 // A nested scf.parallel needs insertion of code to compute indices.
688 // Insert that now. This will also update the worklist with the loop's
689 // body.
690 if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
691 worklist, launchBounds, rewriter)))
692 return failure();
693 } else if (op == launchOp.getOperation()) {
694 // Found our sentinel value. We have finished the operations from one
695 // nesting level, pop one level back up.
696 auto *parent = rewriter.getInsertionPoint()->getParentOp();
697 rewriter.setInsertionPointAfter(parent);
698 leftNestingScope = true;
699 seenSideeffects = false;
700 writtenBuffer.clear();
701 } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
702 // Convert the scf.reduce op.
703 auto parentLoop = op->getParentOfType<ParallelOp>();
704 if (!parentLoop || op->getOperands().size() != 1)
705 return failure();
706 auto operand = op->getOperands().front();
707 auto newValue = cloningMap.lookupOrNull(operand);
708 if (!newValue || !operand.getType().isSignlessIntOrFloat())
709 return failure();
710 // Ensure reduction region is isolated from above.
711 llvm::SetVector<Value> externalValues;
712 getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
713 if (externalValues.size())
714 return failure();
715 // Replace by gpu.all_reduce.
716 auto gpuRedOp = gpu::AllReduceOp::create(rewriter, loc, newValue);
717 cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
718 // Copy region.
719 rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
720 gpuRedOp.getRegion().begin());
721 // Replace scf.reduce.return with gpu.yield.
722 auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
723 auto ip = rewriter.saveInsertionPoint();
724 rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
725 rewriter.replaceOpWithNewOp<gpu::YieldOp>(
726 scfReturn, scfReturn->getOperands().front());
727 rewriter.restoreInsertionPoint(ip);
728 } else {
729 // Otherwise we copy it over.
730 Operation *clone = rewriter.clone(*op, cloningMap);
731 cloningMap.map(op->getResults(), clone->getResults());
732 // Check for side effects.
733 if (!isMemoryEffectFree(clone)) {
734 // Record the buffer accessed by the operations with write effects.
735 if (auto memEffectInterface =
736 dyn_cast<MemoryEffectOpInterface>(clone)) {
737 SmallVector<MemoryEffects::EffectInstance> effects;
738 memEffectInterface.getEffects(effects);
739 for (const MemoryEffects::EffectInstance &effect : effects) {
740 if (isa<MemoryEffects::Write>(effect.getEffect())) {
741 Value writtenBase = effect.getValue();
742 // Conservatively return failure if we cannot find the written
743 // address.
744 if (!writtenBase)
745 return failure();
746 writtenBuffer.insert(writtenBase);
747 }
748 }
749 }
750 }
751 // TODO: Handle region side effects properly.
752 seenSideeffects |=
753 !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
754 // If we are no longer in the innermost scope, side effects are disallowed.
755 if (seenSideeffects && leftNestingScope)
756 return failure();
757 }
758 }
759
760 // Now that we have succeeded in creating the launch operation, also update the
761 // bounds.
762 for (auto bound : launchBounds)
763 launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
764 std::get<1>(bound));
765
766 rewriter.eraseOp(parallelOp);
767 return success();
768}
769
770void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
771 patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
772}
773
774void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
775 target.addLegalDialect<memref::MemRefDialect>();
776 target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
777 return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
778 parallelOp->hasAttr(kVisitedAttrName);
779 });
780}
781
782void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
783 op->walk([](scf::ParallelOp parallelOp) {
784 parallelOp->removeAttr(kVisitedAttrName);
785 });
786}