//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/DebugLog.h"
#include <optional>

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;

// Name of the internal attribute used to mark visited operations during
// conversion.
//
// NOTE: The conversion originally used the following legality criterion:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on a more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal
// operation, we use this extra unit attribute as a marker that the operation
// was checked by the pattern and should be considered legal in the subsequent
// legality checks. The `finalizeParallelLoopToGPUConversion` function performs
// cleanup of these extra attributes and is supposed to be called after the
// dialect conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return arith::ConstantIndexOp::create(builder, forOp.getLoc(),
                                        forOp.getStepAsInt());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
// - there are enough loops to map to numDims;
// - the loops are perfectly nested;
// - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
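//
// For illustration (a hypothetical example, not taken from a test), a nest
// like the following satisfies these requirements for numDims = 2 because the
// bounds are defined above the outermost loop and the loops are perfectly
// nested:
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to %m {
//       ... arbitrary body ...
//     }
//   }
//
// whereas a nest whose inner bound uses an outer induction variable, e.g.
// `affine.for %j = 0 to %i`, is rejected by the check below.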
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LDBG() << "nothing to map";
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds the common state of the loop-to-GPU-kernel
// conversion.
struct AffineLoopToGpuConverter {
  std::optional<AffineForOp> collectBounds(AffineForOp forOp,
                                           unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, std::nullopt on failure.
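//
// For each mapped loop, the range is computed as ceildiv(ub - lb, step). For
// example (illustrative numbers only), a loop from 2 to 13 with step 3 yields
// a range of ceildiv(11, 3) = 4 hardware ids, matching its 4 iterations.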
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = arith::SubIOp::create(builder, currentLoop.getLoc(),
                                        upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range = arith::CeilDivSIOp::create(builder, currentLoop.getLoc(), range,
                                         step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed to
// the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? arith::ConstantIndexOp::create(builder, rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp =
      gpu::LaunchOp::create(builder, rootForOp.getLoc(), gridSizeX, gridSizeY,
                            gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  gpu::TerminatorOp::create(builder, terminatorLoc, TypeRange());
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always
  // iterate from 0 to N with step 1. Therefore, loop induction variables are
  // replaced with (gpu-thread/block-id * S) + LB.
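  // For example (hypothetical loop and names), a loop
  // `affine.for %i = %lb to %ub step 4` mapped to the x thread dimension has
  // uses of %i rewritten to thread_id_x * 4 + %lb inside the launch body.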
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = arith::MulIOp::create(builder, rootForOp.getLoc(), step, id);

    Value ivReplacement =
        arith::AddIOp::create(builder, rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
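///
/// For example (illustrative IR only, values made up), given
///   %ub = affine.min affine_map<(d0) -> (d0, 64)>(%x)
/// the constant result 64 of the min map is materialized and returned as the
/// static bound.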
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return arith::ConstantIndexOp::create(rewriter, minOp.getLoc(),
                                              constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return arith::ConstantIndexOp::create(rewriter, multiplyOp.getLoc(),
                                              lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and the affine map from
///   the mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support multiple reductions.
  if (!mapping || parallelOp.getNumResults() > 1)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return arith::ConstantOp::create(rewriter, constOp.getLoc(),
                                       constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop
      // iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
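      // Schematically (notation only, not emitted IR), the map built below is
      // (d0)[s0, s1] -> (d0 * s0 + s1), with d0 bound to the hardware id, s0
      // to the step and s1 to the lower bound.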
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = AffineApplyOp::create(
          rewriter, loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, ensureLaunchIndependent(step),
                     ensureLaunchIndependent(lowerBound)});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
          // affine.apply here so that it composes nicely with the provided
          // map.
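          // Schematically (notation only), the map built below is
          // (d0)[s0, s1] -> ((d0 - s0) ceildiv s1), with d0 bound to the
          // upper bound, s0 to the lower bound and s1 to the step.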
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = AffineApplyOp::create(
              rewriter, loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (!bounds.try_emplace(processor, launchBound).second) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = arith::CmpIOp::create(
              rewriter, loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = scf::IfOp::create(rewriter, loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = scf::ForOp::create(rewriter, loc,
                                       cloningMap.lookupOrDefault(lowerBound),
                                       cloningMap.lookupOrDefault(upperBound),
                                       cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  // Include the scf.reduce terminator if it exists and has an operand.
  if (auto terminator = body->getTerminator();
      isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
    worklist.push_back(terminator);
  }
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via an array attribute named `mapping`
/// with one entry per mapped loop dimension; each entry has three fields:
/// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///   thread dimensions and 6 is sequential.
/// - map: an affine map that is used to pre-process hardware ids before
///   substitution.
/// - bound: an affine map that is used to compute the bound of the hardware
///   id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first, `map`, is used to compute the actual index for
/// substitution from the hardware id. The second, `bound`, is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such a
/// case, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
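///
/// For illustration, an annotated one-dimensional loop might look roughly as
/// follows (schematic IR only; the value names are made up and the mapping
/// attribute is shown in the form it typically prints):
///
///   scf.parallel (%iv) = (%lb) to (%ub) step (%step) {
///     ...
///   } {mapping = [#gpu.loop_dim_map<processor = block_x,
///                                    map = (d0) -> (d0),
///                                    bound = (d0) -> (d0)>]}
///
/// This is lowered to a `gpu.launch` whose grid size along x is derived from
/// the number of iterations, with uses of %iv replaced by an expression of
/// the block id.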
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for the recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      arith::ConstantIndexOp::create(rewriter, parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = gpu::LaunchOp::create(
      rewriter, parallelOp.getLoc(), constantOne, constantOne, constantOne,
      constantOne, constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  gpu::TerminatorOp::create(rewriter, loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  IRMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    // nested or this code is side-effect free. Otherwise we might need
    // predication. We are overly conservative for now and only allow
    // side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // sideeffects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
      // Convert the scf.reduce op.
      auto parentLoop = op->getParentOfType<ParallelOp>();
      if (!parentLoop || op->getOperands().size() != 1)
        return failure();
      auto operand = op->getOperands().front();
      auto newValue = cloningMap.lookupOrNull(operand);
      if (!newValue || !operand.getType().isSignlessIntOrFloat())
        return failure();
      // Ensure the reduction region is isolated from above.
      llvm::SetVector<Value> externalValues;
      getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
      if (externalValues.size())
        return failure();
      // Replace by gpu.all_reduce.
      auto gpuRedOp = gpu::AllReduceOp::create(rewriter, loc, newValue);
      cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
      // Copy the region.
      rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
                                  gpuRedOp.getRegion().begin());
      // Replace scf.reduce.return with gpu.yield.
      auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
      auto ip = rewriter.saveInsertionPoint();
      rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
      rewriter.replaceOpWithNewOp<gpu::YieldOp>(
          scfReturn, scfReturn->getOperands().front());
      rewriter.restoreInsertionPoint(ip);
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, sideeffects are disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
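
// A minimal sketch of how a pass might drive this conversion (illustrative
// only; `SampleParallelLoopToGpuPass` and its base-class wiring are made up
// here, and error handling is reduced to signaling pass failure):
//
//   void SampleParallelLoopToGpuPass::runOnOperation() {
//     RewritePatternSet patterns(&getContext());
//     populateParallelLoopToGPUPatterns(patterns);
//     ConversionTarget target(getContext());
//     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//     configureParallelLoopToGPULegality(target);
//     if (failed(applyPartialConversion(getOperation(), target,
//                                       std::move(patterns))))
//       return signalPassFailure();
//     finalizeParallelLoopToGPUConversion(getOperation());
//   }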