//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

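// As a rough illustration (schematic IR, not verbatim pass output): a launch
// such as
//
//   func.func @foo(%arg0 : memref<?xf32>) {
//     gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
//       ...
//       gpu.terminator
//     }
//     return
//   }
//
// is rewritten into a `gpu.launch_func` whose target is a `gpu.func` inside a
// newly created, nested `gpu.module`, with the launch region body moved into
// that function.
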
#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <limits>

namespace mlir {
#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
#define GEN_PASS_DEF_GPUKERNELOUTLINING
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value> &values) {
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
    values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
}

/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region. Also adds a mapping from
/// each argument of the entry block of `launchOpBody` to the corresponding
/// result of the added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody, IRMapping &map) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value, 12> indexOps;
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  // Map the leading 12 entry-block arguments of the launch body to the
  // just-created thread/block index operations.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

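// For reference, the entry block of a `gpu.launch` body carries twelve leading
// arguments, and the mapping built above pairs them with the freshly created
// index operations in this order:
//
//   args 0..2  -> gpu.block_id  x/y/z
//   args 3..5  -> gpu.thread_id x/y/z
//   args 6..8  -> gpu.grid_dim  x/y/z
//   args 9..11 -> gpu.block_dim x/y/z
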
/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating them) is not legal.
static bool isLikelyAnIndexComputation(Operation *op) {
  return matchPattern(op, m_Constant()) ||
         isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
static bool extractBeneficiaryOps(
    Operation *op, const SetVector<Value> &existingDependencies,
    SetVector<Operation *> &beneficiaryOps,
    llvm::SmallPtrSetImpl<Value> &availableValues,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
                                               beneficiaryOps, availableValues,
                                               isSinkingBeneficiary)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

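// A small worked example of the recursion above (hypothetical IR): if the
// launch region uses %dim defined outside of it by
//
//   %c0  = arith.constant 0 : index
//   %dim = memref.dim %arg0, %c0 : memref<?xf32>
//
// then sinking %dim requires %c0 as well; the constant is visited recursively,
// accepted, and both ops land in `beneficiaryOps` in def-before-use order. If
// any operand can neither be sunk nor is an existing dependency, the whole
// candidate is rejected.
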
LogicalResult mlir::sinkOperationsIntoLaunchOp(
    gpu::LaunchOp launchOp,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  assert(isSinkingBeneficiary);
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
                          isSinkingBeneficiary);
  }

  // Insert operations so that the defs get cloned before uses.
  IRMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.getBody());
  }
  return success();
}

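// A minimal call-site sketch (assuming `launch` is a gpu::LaunchOp handle;
// the lambda below is an illustrative predicate, not part of this file):
//
//   LogicalResult res = sinkOperationsIntoLaunchOp(
//       launch, [](Operation *op) { return matchPattern(op, m_Constant()); });
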
/// Return the provided KernelDim3 as an array of i32 constants if possible.
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims) {
  SmallVector<int32_t, 3> constants;
  MLIRContext *ctx = dims.x.getContext();
  for (Value v : {dims.x, dims.y, dims.z}) {
    APInt constValue;
    if (!matchPattern(v, m_ConstantInt(&constValue)))
      return nullptr;
    // In the event someone called for a too-large block or grid dimension,
    // don't set bounds as it is likely to cause more confusing behavior.
    if (constValue.ugt(std::numeric_limits<uint32_t>::max()))
      return nullptr;
    constants.push_back(
        constValue.getLimitedValue(std::numeric_limits<uint32_t>::max()));
  }
  return DenseI32ArrayAttr::get(ctx, constants);
}

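// For example (hypothetical values): block sizes given as the constants
// 32, 8 and 1 yield `array<i32: 32, 8, 1>`, while any non-constant dimension,
// or one exceeding uint32_t's range, makes this return a null attribute.
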
/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Set block and grid size bounds if known.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
      loc, kernelFnName, type,
      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());

  // If we can infer bounds on the grid and/or block sizes from the arguments
  // to the launch op, propagate them to the generated kernel. This is safe
  // because multiple launches with the same body are not deduplicated.
  if (auto blockBounds =
          maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
                          blockBounds);
  if (auto gridBounds =
          maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
                          gridBounds);

  IRMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc.
  Region &outlinedFuncBody = outlinedFunc.getBody();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getWorkgroupAttributions(),
                 outlinedFunc.getWorkgroupAttributions()))
    map.map(launchArg, funcArg);
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getPrivateAttributions(),
                 outlinedFunc.getPrivateAttributions()))
    map.map(launchArg, funcArg);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for
  // a block exists, that block will be used to clone operations into (at the
  // end of the block), instead of creating a new block, this would be much
  // cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from entry of the gpu.func operation to the block that is cloned
  // from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);

  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

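// Schematically (not verbatim output, and attribute spellings may differ),
// the outlined function looks like:
//
//   gpu.func @foo_kernel(%captured0 : f32, ...) kernel {
//     %0 = gpu.block_id x
//     ...                 // remaining index ops
//     cf.br ^cloned_entry
//   ^cloned_entry:
//     ...                 // cloned launch body; gpu.terminator -> gpu.return
//   }
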
gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  Value asyncToken = launchOp.getAsyncToken();
  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(),
      launchOp.getDynamicSharedMemorySize(), operands,
      asyncToken ? asyncToken.getType() : nullptr,
      launchOp.getAsyncDependencies());
  launchOp.replaceAllUsesWith(launchFunc);
  launchOp.erase();
}

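// Schematically (not verbatim output), the erased launch is replaced by
// something like:
//
//   gpu.launch_func @foo_kernel::@foo_kernel
//       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
//       args(%captured0 : f32, ...)
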
namespace {
/// Pass that moves ops which are likely an index computation into gpu.launch
/// body.
class GpuLaunchSinkIndexComputationsPass
    : public impl::GpuLaunchSinkIndexComputationsBase<
          GpuLaunchSinkIndexComputationsPass> {
public:
  void runOnOperation() override {
    Operation *op = getOperation();
    if (op->walk([](gpu::LaunchOp launch) {
            // Pull in instructions that can be sunk.
            if (failed(sinkOperationsIntoLaunchOp(launch,
                                                  isLikelyAnIndexComputation)))
              return WalkResult::interrupt();

            return WalkResult::advance();
          }).wasInterrupted())
      signalPassFailure();
  }
};

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently
/// in a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr.getValue();
  }

  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<func::FuncOp>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        std::string kernelFnName =
            Twine(op->getParentOfType<func::FuncOp>().getName(), "_kernel")
                .str();

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel
    // module. Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (std::optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

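  // Example of the worklist walk above (hypothetical symbols): if
  // @foo_kernel calls @helper, that symbol use is discovered, @helper is
  // cloned from the parent module into the new gpu.module, and the clone's
  // own symbol uses are scanned in turn until the module is self-contained.
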
  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace

std::unique_ptr<Pass> mlir::createGpuLauchSinkIndexComputationsPass() {
  return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
}

std::unique_ptr<OperationPass<ModuleOp>>
mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
}
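
// Typical pipeline usage (a sketch; the pass factory names come from the
// definitions above, while the pipeline shape and flag spellings are
// assumptions of this example):
//
//   PassManager pm(&context);
//   pm.addNestedPass<func::FuncOp>(createGpuLauchSinkIndexComputationsPass());
//   pm.addPass(createGpuKernelOutliningPass());
//
// or, from the command line:
//
//   mlir-opt --gpu-launch-sink-index-computations --gpu-kernel-outlining in.mlir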