32#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONSPASS
33#define GEN_PASS_DEF_GPUKERNELOUTLININGPASS
34#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
39template <
typename OpTy>
42 for (
auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
43 values.push_back(OpTy::create(builder, loc, builder.
getIndexType(), dim));
52 bool hasCluster =
false) {
68 for (
const auto &indexOp : enumerate(indexOps))
77 isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
96 if (beneficiaryOps.count(op))
99 if (!isSinkingBeneficiary(op))
104 if (availableValues.count(operand))
108 Operation *definingOp = operand.getDefiningOp();
110 beneficiaryOps, availableValues,
111 isSinkingBeneficiary)) &&
112 !existingDependencies.count(operand))
116 beneficiaryOps.insert(op);
118 availableValues.insert(
result);
123 gpu::LaunchOp launchOp,
125 assert(isSinkingBeneficiary);
126 Region &launchOpBody = launchOp.getBody();
135 for (
Value operand : sinkCandidates) {
136 Operation *operandOp = operand.getDefiningOp();
140 isSinkingBeneficiary);
160 for (
Value v : {dims.
x, dims.
y, dims.
z}) {
166 if (constValue.ugt(std::numeric_limits<uint32_t>::max()))
169 constValue.getLimitedValue(std::numeric_limits<uint32_t>::max()));
178 StringRef kernelFnName,
183 OpBuilder builder(launchOp.getContext());
184 Region &launchOpBody = launchOp.getBody();
192 kernelOperandTypes.reserve(operands.size());
193 for (
Value operand : operands) {
194 kernelOperandTypes.push_back(operand.getType());
197 FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
198 auto outlinedFunc = gpu::GPUFuncOp::create(
199 builder, loc, kernelFnName, type,
202 outlinedFunc.setKernel(
true);
207 if (
auto blockBounds =
209 outlinedFunc.setKnownBlockSizeAttr(blockBounds);
210 if (
auto gridBounds =
212 outlinedFunc.setKnownGridSizeAttr(gridBounds);
213 if (
auto clusterSize = launchOp.getClusterSizeOperandValues()) {
215 outlinedFunc.setKnownClusterSizeAttr(clusterBounds);
223 Region &outlinedFuncBody = outlinedFunc.getBody();
225 launchOp.hasClusterSize());
228 for (
const auto &[launchArg, funcArg] :
229 llvm::zip(launchOp.getWorkgroupAttributionBBArgs(),
230 outlinedFunc.getWorkgroupAttributionBBArgs()))
231 map.
map(launchArg, funcArg);
232 for (
const auto &[launchArg, funcArg] :
233 llvm::zip(launchOp.getPrivateAttributions(),
234 outlinedFunc.getPrivateAttributions()))
235 map.
map(launchArg, funcArg);
240 for (
const auto &operand : enumerate(operands))
241 map.
map(operand.value(), entryBlock.
getArgument(operand.index()));
244 launchOpBody.
cloneInto(&outlinedFuncBody, map);
247 for (
Block &block : launchOpBody) {
249 auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->
getTerminator());
253 gpu::ReturnOp::create(replacer, terminator->getLoc());
262 clonedLaunchOpEntry->
erase();
268 StringRef kernelFnName,
271 inputOperandSet.insert_range(operands);
274 for (
auto operand : operandSet) {
275 if (!inputOperandSet.count(operand))
276 operands.push_back(operand);
285 gpu::GPUFuncOp kernelFunc,
290 Value asyncToken = launchOp.getAsyncToken();
291 std::optional<gpu::KernelDim3> clusterSize =
292 launchOp.getClusterSizeOperandValues();
293 auto launchFunc = gpu::LaunchFuncOp::create(
294 builder, launchOp.getLoc(), kernelFunc,
295 launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(),
296 launchOp.getDynamicSharedMemorySize(), operands,
297 asyncToken ? asyncToken.
getType() :
nullptr,
298 launchOp.getAsyncDependencies(), clusterSize);
299 if (launchOp.getCooperative())
300 launchFunc.setCooperative(
true);
301 launchOp.replaceAllUsesWith(launchFunc);
308class GpuLaunchSinkIndexComputationsPass
310 GpuLaunchSinkIndexComputationsPass> {
314 if (op->
walk([](gpu::LaunchOp launch) {
316 if (failed(sinkOperationsIntoLaunchOp(launch,
317 isLikelyAnIndexComputation)))
318 return WalkResult::interrupt();
320 return WalkResult::advance();
335class GpuKernelOutliningPass
342 if (!dataLayoutStr.empty()) {
347 dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
355 void runOnOperation()
override {
357 bool modified =
false;
358 for (
auto func : getOperation().getOps<SymbolOpInterface>()) {
361 auto funcWalkResult =
func.walk([&](gpu::LaunchOp op) {
363 std::string kernelFnName;
364 if (op.getFunction()) {
365 kernelFnName = op.getFunction()->str();
368 Twine(op->getParentOfType<SymbolOpInterface>().getName(),
373 gpu::GPUFuncOp outlinedFunc =
379 FailureOr<gpu::GPUModuleOp> kernelModule =
380 createKernelModule(op, outlinedFunc, symbolTable);
381 if (failed(kernelModule))
383 symbolTable.insert(*kernelModule, insertPt);
390 if (funcWalkResult.wasInterrupted())
397 getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
403 FailureOr<gpu::GPUModuleOp>
404 createKernelModule(gpu::LaunchOp gpuLaunchOp, gpu::GPUFuncOp kernelFunc,
412 std::string kernelModuleName;
413 gpu::GPUModuleOp kernelModule;
414 if (gpuLaunchOp.getModule()) {
415 kernelModuleName = gpuLaunchOp.getModule()->str();
417 parentSymbolTable.
lookup<gpu::GPUModuleOp>(kernelModuleName);
419 kernelModuleName = kernelFunc.getName();
425 kernelModule = gpu::GPUModuleOp::create(builder, kernelFunc.getLoc(),
432 kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);
434 SymbolTable symbolTable(kernelModule);
435 symbolTable.insert(kernelFunc);
437 SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
438 while (!symbolDefWorklist.empty()) {
439 if (std::optional<SymbolTable::UseRange> symbolUses =
441 for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
448 if (!symbolUse.getSymbolRef().getNestedReferences().empty() &&
450 symbolUse.getSymbolRef().getRootReference())) {
451 symbolUse.getUser()->emitError(
"nested symbol reference '")
452 << symbolUse.getSymbolRef()
453 <<
"' cannot be resolved inside the outlined kernel module; "
454 "gpu-kernel-outlining does not support cross-module symbol "
455 "references inside gpu.launch bodies";
456 kernelModule->erase();
459 StringAttr symbolName = symbolUse.getSymbolRef().getLeafReference();
460 if (symbolTable.lookup(symbolName))
463 Operation *symbolDef = parentSymbolTable.
lookup(symbolName);
466 Operation *symbolDefClone = symbolDef->
clone();
467 symbolDefWorklist.push_back(symbolDefClone);
468 symbolTable.insert(symbolDefClone);
476 DataLayoutSpecInterface dataLayoutSpec;
LogicalResult initialize(unsigned origNumLoops, ArrayRef< ReassociationIndices > foldedIterationDims)
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims)
Return the provided KernelDim3 as an array of i32 constants if possible.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector< Value > &operands)
Outline the gpu.launch operation body into a kernel function.
static bool isLikelyAnIndexComputation(Operation *op)
Identifies operations that are beneficial to sink into kernels.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands)
Replace gpu.launch operations with an gpu.launch_func operation launching kernelFunc.
static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl< Value > &values)
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, IRMapping &map, bool hasCluster=false)
Adds operations generating block/thread ids and grid/block dimensions at the beginning of the launchFuncOpBody region.
static bool extractBeneficiaryOps(Operation *op, const SetVector< Value > &existingDependencies, SetVector< Operation * > &beneficiaryOps, llvm::SmallPtrSetImpl< Value > &availableValues, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
For a given operation op, computes whether it is beneficial to sink the operation into the kernel.
Attributes are known-constant values of operations.
MLIRContext * getContext() const
Return the context this attribute belongs to.
Block represents an ordered list of Operations.
OpListType::iterator iterator
BlockArgument getArgument(unsigned i)
void erase()
Unlink this Block from its parent region and delete it.
OpListType & getOperations()
Operation * getTerminator()
Get the terminator operation of this block.
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the operation using the map that is provided (leaving them alone if no entry is present).
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Operation is the basic unit of execution within MLIR.
operand_range getOperands()
Returns an iterator range over the underlying Values of the operation's operands.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getResults()
Operation * clone(IRMapping &mapper, const CloneOptions &options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation.
virtual void runOnOperation()=0
The polymorphic API that runs the pass over the currently held operation.
void signalPassFailure()
Signal that some invariant was broken when running.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
void cloneInto(Region *dest, IRMapping &mapper)
Clone the internal blocks from this region into dest.
This class allows for representing and managing the symbol table used by operations with the 'SymbolTable' trait.
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
static std::optional< UseRange > getSymbolUses(Operation *from)
Get an iterator range for all of the uses, for any symbol, that are nested within the given operation 'from'.
This class provides an abstraction over the various different ranges of value types.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
MLIRContext * getContext() const
Utility to get the associated MLIRContext that this value is defined in.
Type getType() const
Return the type of this value.
static WalkResult advance()
static WalkResult interrupt()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
detail::constant_int_value_binder m_ConstantInt(IntegerAttr::ValueType *bind_value)
Matches a constant holding a scalar/vector/tensor integer (splat) and writes the integer value to bind_value.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
llvm::DenseSet< ValueT, ValueInfoT > DenseSet
Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context, Type type={}, size_t *numRead=nullptr, bool isKnownNullTerminated=false)
This parses a single MLIR attribute to an MLIR context if it was valid.
llvm::SetVector< T, Vector, Set, N > SetVector
detail::DenseArrayAttrImpl< int32_t > DenseI32ArrayAttr
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill `values` with a list of values defined at the ancestors of the `limit` region and used within `region` or its descendants.
LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
Sink operations into the `launchOp` to reduce the number of values that are used within the region of the launch operation but defined outside of it.
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, SmallVectorImpl< Value > &operands)
Get a gpu.func created from outlining the region of a gpu.launch op with the given kernelFnName.
Utility class for the GPU dialect to represent triples of Values accessible through `.x`, `.y`, and `.z` similarly to CUDA notation.