MLIR  16.0.0git
KernelOutlining.cpp
Go to the documentation of this file.
1 //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the GPU dialect kernel outlining pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
14 
18 #include "mlir/Dialect/DLTI/DLTI.h"
24 #include "mlir/IR/Builders.h"
25 #include "mlir/IR/Matchers.h"
26 #include "mlir/IR/SymbolTable.h"
27 #include "mlir/Support/LLVM.h"
29 
30 namespace mlir {
31 #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
32 #define GEN_PASS_DEF_GPUKERNELOUTLINING
33 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
34 } // namespace mlir
35 
36 using namespace mlir;
37 
38 template <typename OpTy>
39 static void createForAllDimensions(OpBuilder &builder, Location loc,
40  SmallVectorImpl<Value> &values) {
41  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
42  values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
43 }
44 
45 /// Adds operations generating block/thread ids and grid/block dimensions at the
46 /// beginning of the `launchFuncOpBody` region. Add mapping from argument in
47 /// entry block of `launchOpBody`, to the corresponding result value of the
48 /// added operations.
49 static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
50  Region &launchOpBody,
51  BlockAndValueMapping &map) {
52  OpBuilder builder(loc->getContext());
53  Block &firstBlock = launchOpBody.front();
54  builder.setInsertionPointToStart(&launchFuncOpBody.front());
55  SmallVector<Value, 12> indexOps;
56  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
57  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
58  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
59  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
60  // Replace the leading 12 function args with the respective thread/block index
61  // operations. Iterate backwards since args are erased and indices change.
62  for (const auto &indexOp : enumerate(indexOps))
63  map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
64 }
65 
66 /// Identifies operations that are beneficial to sink into kernels. These
67 /// operations may not have side-effects, as otherwise sinking (and hence
68 /// duplicating them) is not legal.
70  return matchPattern(op, m_Constant()) ||
71  isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
72 }
73 
74 /// For a given operation `op`, computes whether it is beneficial to sink the
75 /// operation into the kernel. An operation can be sunk if doing so does not
76 /// introduce new kernel arguments. Whether a value is already available in the
77 /// kernel (and hence does not introduce new arguments) is checked by
78 /// querying `existingDependencies` and `availableValues`.
79 /// If an operand is not yet available, we recursively check whether it can be
80 /// made available by siking its defining op.
81 /// Operations that are indentified for sinking are added to `beneficiaryOps` in
82 /// the order they should appear in the kernel. Furthermore, `availableValues`
83 /// is updated with results that will be available after sinking the identified
84 /// ops.
86  Operation *op, const SetVector<Value> &existingDependencies,
87  SetVector<Operation *> &beneficiaryOps,
88  llvm::SmallPtrSetImpl<Value> &availableValues,
89  llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
90  if (beneficiaryOps.count(op))
91  return true;
92 
93  if (!isSinkingBeneficiary(op))
94  return false;
95 
96  for (Value operand : op->getOperands()) {
97  // It is already visible in the kernel, keep going.
98  if (availableValues.count(operand))
99  continue;
100  // Else check whether it can be made available via sinking or already is a
101  // dependency.
102  Operation *definingOp = operand.getDefiningOp();
103  if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
104  beneficiaryOps, availableValues,
105  isSinkingBeneficiary)) &&
106  !existingDependencies.count(operand))
107  return false;
108  }
109  // We will sink the operation, mark its results as now available.
110  beneficiaryOps.insert(op);
111  for (Value result : op->getResults())
112  availableValues.insert(result);
113  return true;
114 }
115 
117  gpu::LaunchOp launchOp,
118  llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
119  assert(isSinkingBeneficiary);
120  Region &launchOpBody = launchOp.getBody();
121 
122  // Identify uses from values defined outside of the scope of the launch
123  // operation.
124  SetVector<Value> sinkCandidates;
125  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
126 
127  SetVector<Operation *> toBeSunk;
128  llvm::SmallPtrSet<Value, 4> availableValues;
129  for (Value operand : sinkCandidates) {
130  Operation *operandOp = operand.getDefiningOp();
131  if (!operandOp)
132  continue;
133  extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
134  isSinkingBeneficiary);
135  }
136 
137  // Insert operations so that the defs get cloned before uses.
139  OpBuilder builder(launchOpBody);
140  for (Operation *op : toBeSunk) {
141  Operation *clonedOp = builder.clone(*op, map);
142  // Only replace uses within the launch op.
143  for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
144  replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
145  launchOp.getBody());
146  }
147  return success();
148 }
149 
150 /// Outline the `gpu.launch` operation body into a kernel function. Replace
151 /// `gpu.terminator` operations by `gpu.return` in the generated function.
152 static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
153  StringRef kernelFnName,
154  SetVector<Value> &operands) {
155  Location loc = launchOp.getLoc();
156  // Create a builder with no insertion point, insertion will happen separately
157  // due to symbol table manipulation.
158  OpBuilder builder(launchOp.getContext());
159  Region &launchOpBody = launchOp.getBody();
160 
161  // Identify uses from values defined outside of the scope of the launch
162  // operation.
163  getUsedValuesDefinedAbove(launchOpBody, operands);
164 
165  // Create the gpu.func operation.
166  SmallVector<Type, 4> kernelOperandTypes;
167  kernelOperandTypes.reserve(operands.size());
168  for (Value operand : operands) {
169  kernelOperandTypes.push_back(operand.getType());
170  }
171  FunctionType type =
172  FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
173  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
174  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
175  builder.getUnitAttr());
177 
178  // Map the arguments corresponding to the launch parameters like blockIdx,
179  // threadIdx, etc.
180  Region &outlinedFuncBody = outlinedFunc.getBody();
181  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
182 
183  // Map arguments from gpu.launch region to the arguments of the gpu.func
184  // operation.
185  Block &entryBlock = outlinedFuncBody.front();
186  for (const auto &operand : enumerate(operands))
187  map.map(operand.value(), entryBlock.getArgument(operand.index()));
188 
189  // Clone the region of the gpu.launch operation into the gpu.func operation.
190  // TODO: If cloneInto can be modified such that if a mapping for
191  // a block exists, that block will be used to clone operations into (at the
192  // end of the block), instead of creating a new block, this would be much
193  // cleaner.
194  launchOpBody.cloneInto(&outlinedFuncBody, map);
195 
196  // Branch from entry of the gpu.func operation to the block that is cloned
197  // from the entry block of the gpu.launch operation.
198  Block &launchOpEntry = launchOpBody.front();
199  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
200  builder.setInsertionPointToEnd(&entryBlock);
201  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
202 
203  outlinedFunc.walk([](gpu::TerminatorOp op) {
204  OpBuilder replacer(op);
205  replacer.create<gpu::ReturnOp>(op.getLoc());
206  op.erase();
207  });
208  return outlinedFunc;
209 }
210 
211 gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
212  StringRef kernelFnName,
213  llvm::SmallVectorImpl<Value> &operands) {
214  DenseSet<Value> inputOperandSet;
215  inputOperandSet.insert(operands.begin(), operands.end());
216  SetVector<Value> operandSet(operands.begin(), operands.end());
217  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
218  for (auto operand : operandSet) {
219  if (!inputOperandSet.count(operand))
220  operands.push_back(operand);
221  }
222  return funcOp;
223 }
224 
225 /// Replace `gpu.launch` operations with an `gpu.launch_func` operation
226 /// launching `kernelFunc`. The kernel func contains the body of the
227 /// `gpu.launch` with constant region arguments inlined.
228 static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
229  gpu::GPUFuncOp kernelFunc,
230  ValueRange operands) {
231  OpBuilder builder(launchOp);
232  // The launch op has an optional dynamic shared memory size. If it doesn't
233  // exist, we use zero.
234  Value asyncToken = launchOp.getAsyncToken();
235  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
236  launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
237  launchOp.getBlockSizeOperandValues(),
238  launchOp.getDynamicSharedMemorySize(), operands,
239  asyncToken ? asyncToken.getType() : nullptr,
240  launchOp.getAsyncDependencies());
241  launchOp.replaceAllUsesWith(launchFunc);
242  launchOp.erase();
243 }
244 
245 namespace {
246 /// Pass that moves ops which are likely an index computation into gpu.launch
247 /// body.
248 class GpuLaunchSinkIndexComputationsPass
249  : public impl::GpuLaunchSinkIndexComputationsBase<
250  GpuLaunchSinkIndexComputationsPass> {
251 public:
252  void runOnOperation() override {
253  Operation *op = getOperation();
254  if (op->walk([](gpu::LaunchOp launch) {
255  // Pull in instructions that can be sunk
256  if (failed(sinkOperationsIntoLaunchOp(launch,
257  isLikelyAnIndexComputation)))
258  return WalkResult::interrupt();
259 
260  return WalkResult::advance();
261  }).wasInterrupted())
262  signalPassFailure();
263  }
264 };
265 
/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently in
/// a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  /// Records `dlStr` as the data layout string unless the `data-layout-str`
  /// pass option already carries a value.
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  /// Copy constructor: pass options are not copied by the base class, so the
  /// data layout string (and the spec parsed from it) is forwarded explicitly.
  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr.getValue();
  }

  /// Parses `dataLayoutStr` into `dataLayoutSpec`. Fails if the string does
  /// not parse to an attribute, or parses to an attribute that does not
  /// implement DataLayoutSpecInterface.
  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = resultAttr.dyn_cast<DataLayoutSpecInterface>();
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  /// Outlines every gpu.launch nested in the module's functions into a
  /// gpu.func inside a fresh gpu.module, and rewrites the launch into a
  /// gpu.launch_func targeting it.
  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<func::FuncOp>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        // Kernel name is derived from the enclosing function's name.
        std::string kernelFnName =
            Twine(op->getParentOfType<func::FuncOp>().getName(), "_kernel")
                .str();

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel module.
    // Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    // Clone every symbol transitively referenced from the kernel function into
    // the nested module so it is self-contained. Worklist-driven: each cloned
    // symbol's own symbol uses are scanned in turn.
    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (Optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              symbolUse.getSymbolRef().cast<FlatSymbolRefAttr>().getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  /// Textual data layout specification to attach to created kernel modules.
  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  /// Parsed form of `dataLayoutStr`; null when no layout was provided.
  DataLayoutSpecInterface dataLayoutSpec;
};
389 
390 } // namespace
391 
393  return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
394 }
395 
396 std::unique_ptr<OperationPass<ModuleOp>>
397 mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
398  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
399 }
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector< Value > &operands)
Outline the gpu.launch operation body into a kernel function.
static bool isLikelyAnIndexComputation(Operation *op)
Identifies operations that are beneficial to sink into kernels.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands)
Replace gpu.launch operations with an gpu.launch_func operation launching kernelFunc.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, BlockAndValueMapping &map)
Adds operations generating block/thread ids and grid/block dimensions at the beginning of the launchFuncOpBody region.
static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl< Value > &values)
static bool extractBeneficiaryOps(Operation *op, const SetVector< Value > &existingDependencies, SetVector< Operation * > &beneficiaryOps, llvm::SmallPtrSetImpl< Value > &availableValues, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
For a given operation op, computes whether it is beneficial to sink the operation into the kernel.
Attributes are known-constant values of operations.
Definition: Attributes.h:25
U dyn_cast() const
Definition: Attributes.h:127
MLIRContext * getContext() const
Return the context this attribute belongs to.
Definition: Attributes.cpp:20
Block * lookup(Block *from) const
Lookup a mapped value within the map.
void map(Block *from, Block *to)
Inserts a new mapping for 'from' to 'to'.
Block represents an ordered list of Operations.
Definition: Block.h:30
OpListType::iterator iterator
Definition: Block.h:129
BlockArgument getArgument(unsigned i)
Definition: Block.h:118
UnitAttr getUnitAttr()
Definition: Builders.cpp:99
IndexType getIndexType()
Definition: Builders.cpp:56
A symbol reference with a reference path containing a single element.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:64
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:56
This class helps build Operations.
Definition: Builders.h:198
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:383
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:388
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:422
Operation * clone(Operation &op, BlockAndValueMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:510
Operation is a basic unit of execution within MLIR.
Definition: Operation.h:31
Operation * clone(BlockAndValueMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
Definition: Operation.cpp:558
void setAttr(StringAttr name, Attribute value)
If the an attribute exists with the specified name, change it to the new value.
Definition: Operation.h:395
operand_range getOperands()
Returns an iterator on the underlying Value's.
Definition: Operation.h:295
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
Definition: Operation.h:203
result_range getResults()
Definition: Operation.h:332
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition: Operation.h:574
void erase()
Remove this operation from its parent block and delete it.
Definition: Operation.cpp:418
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
void cloneInto(Region *dest, BlockAndValueMapping &mapper)
Clone the internal blocks from this region into dest.
Definition: Region.cpp:70
Block & front()
Definition: Region.h:65
This class represents a specific symbol use.
Definition: SymbolTable.h:147
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
Definition: SymbolTable.h:23
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
static Optional< UseRange > getSymbolUses(Operation *from)
Get an iterator range for all of the uses, for any symbol, that are nested within the given operation...
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:349
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:85
Type getType() const
Return the type of this value.
Definition: Value.h:114
static WalkResult advance()
Definition: Visitors.h:51
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:230
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
Definition: Matchers.h:329
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
Definition: RegionUtils.cpp:24
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
std::unique_ptr< Pass > createGpuLauchSinkIndexComputationsPass()
Pass that moves ops which are likely an index computation into gpu.launch body.
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
Definition: RegionUtils.cpp:59
LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
Sink operations into the launchOp to reduce the number of values that are used within the region of t...
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
Definition: Matchers.h:255
std::unique_ptr< OperationPass< ModuleOp > > createGpuKernelOutliningPass(StringRef dataLayoutStr=StringRef())
Replaces gpu.launch with gpu.launch_func by moving the region into a separate kernel function.
gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, SmallVectorImpl< Value > &operands)
Get a gpu.func created from outlining the region of a gpu.launch op with the given kernelFnName.
Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context)
This parses a single MLIR attribute to an MLIR context if it was valid.
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26