32 #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONSPASS
33 #define GEN_PASS_DEF_GPUKERNELOUTLININGPASS
34 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
39 template <
typename OpTy>
42 for (
auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
43 values.push_back(OpTy::create(builder, loc, builder.
getIndexType(), dim));
52 bool hasCluster =
false) {
58 createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
59 createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
60 createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
61 createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
63 createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
64 createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
68 for (
const auto &indexOp :
enumerate(indexOps))
77 isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
96 if (beneficiaryOps.count(op))
99 if (!isSinkingBeneficiary(op))
104 if (availableValues.count(operand))
108 Operation *definingOp = operand.getDefiningOp();
110 beneficiaryOps, availableValues,
111 isSinkingBeneficiary)) &&
112 !existingDependencies.count(operand))
116 beneficiaryOps.insert(op);
118 availableValues.insert(result);
123 gpu::LaunchOp launchOp,
125 assert(isSinkingBeneficiary);
126 Region &launchOpBody = launchOp.getBody();
135 for (
Value operand : sinkCandidates) {
136 Operation *operandOp = operand.getDefiningOp();
140 isSinkingBeneficiary);
160 for (
Value v : {dims.
x, dims.
y, dims.
z}) {
178 StringRef kernelFnName,
183 OpBuilder builder(launchOp.getContext());
184 Region &launchOpBody = launchOp.getBody();
192 kernelOperandTypes.reserve(operands.size());
193 for (
Value operand : operands) {
194 kernelOperandTypes.push_back(operand.getType());
198 auto outlinedFunc = gpu::GPUFuncOp::create(
199 builder, loc, kernelFnName, type,
202 outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
208 if (
auto blockBounds =
210 outlinedFunc.setKnownBlockSizeAttr(blockBounds);
211 if (
auto gridBounds =
213 outlinedFunc.setKnownGridSizeAttr(gridBounds);
220 Region &outlinedFuncBody = outlinedFunc.getBody();
222 launchOp.hasClusterSize());
225 for (
const auto &[launchArg, funcArg] :
226 llvm::zip(launchOp.getWorkgroupAttributions(),
227 outlinedFunc.getWorkgroupAttributions()))
228 map.
map(launchArg, funcArg);
229 for (
const auto &[launchArg, funcArg] :
230 llvm::zip(launchOp.getPrivateAttributions(),
231 outlinedFunc.getPrivateAttributions()))
232 map.
map(launchArg, funcArg);
237 for (
const auto &operand :
enumerate(operands))
238 map.
map(operand.value(), entryBlock.
getArgument(operand.index()));
241 launchOpBody.
cloneInto(&outlinedFuncBody, map);
244 for (
Block &block : launchOpBody) {
246 auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->
getTerminator());
250 gpu::ReturnOp::create(replacer, terminator->getLoc());
259 clonedLaunchOpEntry->
erase();
265 StringRef kernelFnName,
268 inputOperandSet.insert_range(operands);
271 for (
auto operand : operandSet) {
272 if (!inputOperandSet.count(operand))
273 operands.push_back(operand);
282 gpu::GPUFuncOp kernelFunc,
287 Value asyncToken = launchOp.getAsyncToken();
288 std::optional<gpu::KernelDim3> clusterSize =
289 launchOp.getClusterSizeOperandValues();
290 auto launchFunc = gpu::LaunchFuncOp::create(
291 builder, launchOp.getLoc(), kernelFunc,
292 launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(),
293 launchOp.getDynamicSharedMemorySize(), operands,
294 asyncToken ? asyncToken.
getType() :
nullptr,
295 launchOp.getAsyncDependencies(), clusterSize);
296 launchOp.replaceAllUsesWith(launchFunc);
303 class GpuLaunchSinkIndexComputationsPass
304 :
public impl::GpuLaunchSinkIndexComputationsPassBase<
305 GpuLaunchSinkIndexComputationsPass> {
307 void runOnOperation()
override {
309 if (op->
walk([](gpu::LaunchOp launch) {
311 if (failed(sinkOperationsIntoLaunchOp(launch,
312 isLikelyAnIndexComputation)))
313 return WalkResult::interrupt();
315 return WalkResult::advance();
330 class GpuKernelOutliningPass
331 :
public impl::GpuKernelOutliningPassBase<GpuKernelOutliningPass> {
335 LogicalResult initialize(
MLIRContext *context)
override {
337 if (!dataLayoutStr.empty()) {
342 dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
350 void runOnOperation()
override {
352 bool modified =
false;
353 for (
auto func : getOperation().getOps<SymbolOpInterface>()) {
356 auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
358 std::string kernelFnName;
359 if (op.getFunction()) {
360 kernelFnName = op.getFunction()->str();
363 Twine(op->getParentOfType<SymbolOpInterface>().getName(),
368 gpu::GPUFuncOp outlinedFunc =
374 auto kernelModule = createKernelModule(op, outlinedFunc, symbolTable);
375 symbolTable.insert(kernelModule, insertPt);
382 if (funcWalkResult.wasInterrupted())
383 return signalPassFailure();
389 getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
395 gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp,
396 gpu::GPUFuncOp kernelFunc,
402 auto *context = getOperation().getContext();
404 std::string kernelModuleName;
405 gpu::GPUModuleOp kernelModule;
406 if (gpuLaunchOp.getModule()) {
407 kernelModuleName = gpuLaunchOp.getModule()->str();
409 parentSymbolTable.
lookup<gpu::GPUModuleOp>(kernelModuleName);
411 kernelModuleName = kernelFunc.
getName();
417 kernelModule = gpu::GPUModuleOp::create(builder, kernelFunc.getLoc(),
424 kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);
427 symbolTable.insert(kernelFunc);
430 while (!symbolDefWorklist.empty()) {
431 if (std::optional<SymbolTable::UseRange> symbolUses =
434 StringAttr symbolName = symbolUse.getSymbolRef().getLeafReference();
435 if (symbolTable.lookup(symbolName))
440 symbolDefWorklist.push_back(symbolDefClone);
441 symbolTable.insert(symbolDefClone);
449 DataLayoutSpecInterface dataLayoutSpec;
static MLIRContext * getContext(OpFoldResult val)
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims)
Return the provided KernelDim3 as an array of i32 constants if possible.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector< Value > &operands)
Outline the gpu.launch operation body into a kernel function.
static bool isLikelyAnIndexComputation(Operation *op)
Identifies operations that are beneficial to sink into kernels.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands)
Replace gpu.launch operations with an gpu.launch_func operation launching kernelFunc.
static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl< Value > &values)
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, IRMapping &map, bool hasCluster=false)
Adds operations generating block/thread ids and grid/block dimensions at the beginning of the launchFuncOpBody region.
static bool extractBeneficiaryOps(Operation *op, const SetVector< Value > &existingDependencies, SetVector< Operation * > &beneficiaryOps, llvm::SmallPtrSetImpl< Value > &availableValues, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
For a given operation op, computes whether it is beneficial to sink the operation into the kernel.
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Attributes are known-constant values of operations.
MLIRContext * getContext() const
Return the context this attribute belongs to.
Block represents an ordered list of Operations.
OpListType::iterator iterator
BlockArgument getArgument(unsigned i)
void erase()
Unlink this Block from its parent region and delete it.
Operation * getTerminator()
Get the terminator operation of this block.
OpListType & getOperations()
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the operation using the map that is provided (leaving them alone if no entry is present).
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Operation is the basic unit of execution within MLIR.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation using the map that is provided (leaving them alone if no entry is present).
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one), in the specified traversal order.
OperationName getName()
The name of an operation is the key identifier for it.
operand_range getOperands()
Returns an iterator on the underlying Value's.
result_range getResults()
This class contains a list of basic blocks and a link to the parent operation it is attached to.
void cloneInto(Region *dest, IRMapping &mapper)
Clone the internal blocks from this region into dest.
This class represents a specific symbol use.
This class allows for representing and managing the symbol table used by operations with the 'SymbolTable' trait.
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
static std::optional< UseRange > getSymbolUses(Operation *from)
Get an iterator range for all of the uses, for any symbol, that are nested within the given operation 'from'.
This class provides an abstraction over the various different ranges of value types.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
MLIRContext * getContext() const
Utility to get the associated MLIRContext that this value is defined in.
Type getType() const
Return the type of this value.
static WalkResult advance()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Builder from ArrayRef<T>.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
detail::constant_int_value_binder m_ConstantInt(IntegerAttr::ValueType *bind_value)
Matches a constant holding a scalar/vector/tensor integer (splat) and writes the integer value to bind_value.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context, Type type={}, size_t *numRead=nullptr, bool isKnownNullTerminated=false)
This parses a single MLIR attribute to an MLIR context if it was valid.
void getUsedValuesDefinedAbove(Region ®ion, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region or its descendants.
LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
Sink operations into the launchOp to reduce the number of values that are used within the region of the operation, but defined outside of the region.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, SmallVectorImpl< Value > &operands)
Get a gpu.func created from outlining the region of a gpu.launch op with the given kernelFnName.
Utility class for the GPU dialect to represent triples of Values accessible through .x, .y, and .z similarly to CUDA notation.