33 #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
34 #define GEN_PASS_DEF_GPUKERNELOUTLINING
35 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
40 template <
typename OpTy>
43 for (
auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
57 createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
58 createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
59 createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
60 createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
63 for (
const auto &indexOp :
enumerate(indexOps))
72 isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
91 if (beneficiaryOps.count(op))
94 if (!isSinkingBeneficiary(op))
99 if (availableValues.count(operand))
103 Operation *definingOp = operand.getDefiningOp();
105 beneficiaryOps, availableValues,
106 isSinkingBeneficiary)) &&
107 !existingDependencies.count(operand))
111 beneficiaryOps.insert(op);
113 availableValues.insert(result);
118 gpu::LaunchOp launchOp,
120 assert(isSinkingBeneficiary);
121 Region &launchOpBody = launchOp.getBody();
130 for (
Value operand : sinkCandidates) {
131 Operation *operandOp = operand.getDefiningOp();
135 isSinkingBeneficiary);
155 for (
Value v : {dims.
x, dims.
y, dims.
z}) {
173 StringRef kernelFnName,
178 OpBuilder builder(launchOp.getContext());
179 Region &launchOpBody = launchOp.getBody();
187 kernelOperandTypes.reserve(operands.size());
188 for (
Value operand : operands) {
189 kernelOperandTypes.push_back(operand.getType());
193 auto outlinedFunc = builder.
create<gpu::GPUFuncOp>(
194 loc, kernelFnName, type,
197 outlinedFunc->
setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
203 if (
auto blockBounds =
205 outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
207 if (
auto gridBounds =
209 outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
216 Region &outlinedFuncBody = outlinedFunc.getBody();
220 for (
const auto &[launchArg, funcArg] :
221 llvm::zip(launchOp.getWorkgroupAttributions(),
222 outlinedFunc.getWorkgroupAttributions()))
223 map.
map(launchArg, funcArg);
224 for (
const auto &[launchArg, funcArg] :
225 llvm::zip(launchOp.getPrivateAttributions(),
226 outlinedFunc.getPrivateAttributions()))
227 map.
map(launchArg, funcArg);
232 for (
const auto &operand :
enumerate(operands))
233 map.
map(operand.value(), entryBlock.
getArgument(operand.index()));
240 launchOpBody.
cloneInto(&outlinedFuncBody, map);
245 Block *clonedLaunchOpEntry = map.
lookup(&launchOpEntry);
247 builder.
create<cf::BranchOp>(loc, clonedLaunchOpEntry);
249 outlinedFunc.
walk([](gpu::TerminatorOp op) {
258 StringRef kernelFnName,
261 inputOperandSet.insert(operands.begin(), operands.end());
264 for (
auto operand : operandSet) {
265 if (!inputOperandSet.count(operand))
266 operands.push_back(operand);
275 gpu::GPUFuncOp kernelFunc,
280 Value asyncToken = launchOp.getAsyncToken();
281 auto launchFunc = builder.
create<gpu::LaunchFuncOp>(
282 launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
283 launchOp.getBlockSizeOperandValues(),
284 launchOp.getDynamicSharedMemorySize(), operands,
285 asyncToken ? asyncToken.
getType() :
nullptr,
286 launchOp.getAsyncDependencies());
294 class GpuLaunchSinkIndexComputationsPass
295 :
public impl::GpuLaunchSinkIndexComputationsBase<
296 GpuLaunchSinkIndexComputationsPass> {
298 void runOnOperation()
override {
300 if (op->
walk([](gpu::LaunchOp launch) {
302 if (failed(sinkOperationsIntoLaunchOp(launch,
303 isLikelyAnIndexComputation)))
304 return WalkResult::interrupt();
306 return WalkResult::advance();
321 class GpuKernelOutliningPass
322 :
public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
324 GpuKernelOutliningPass(StringRef dlStr) {
325 if (!dlStr.empty() && !dataLayoutStr.hasValue())
326 dataLayoutStr = dlStr.str();
329 GpuKernelOutliningPass(
const GpuKernelOutliningPass &other)
330 : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
331 dataLayoutStr = other.dataLayoutStr.getValue();
336 if (!dataLayoutStr.empty()) {
341 dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
349 void runOnOperation()
override {
351 bool modified =
false;
352 for (
auto func : getOperation().getOps<func::FuncOp>()) {
355 auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
357 std::string kernelFnName =
361 gpu::GPUFuncOp outlinedFunc =
367 auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
368 symbolTable.insert(kernelModule, insertPt);
375 if (funcWalkResult.wasInterrupted())
376 return signalPassFailure();
382 getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
388 gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
394 auto *context = getOperation().getContext();
396 auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
397 kernelFunc.getName());
402 kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);
405 symbolTable.insert(kernelFunc);
408 while (!symbolDefWorklist.empty()) {
409 if (std::optional<SymbolTable::UseRange> symbolUses =
412 StringRef symbolName =
413 cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
414 if (symbolTable.lookup(symbolName))
419 symbolDefWorklist.push_back(symbolDefClone);
420 symbolTable.insert(symbolDefClone);
428 Option<std::string> dataLayoutStr{
429 *
this,
"data-layout-str",
430 llvm::cl::desc(
"String containing the data layout specification to be "
431 "attached to the GPU kernel module")};
433 DataLayoutSpecInterface dataLayoutSpec;
439 return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
442 std::unique_ptr<OperationPass<ModuleOp>>
444 return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
static MLIRContext * getContext(OpFoldResult val)
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims)
Return the provided KernelDim3 as an array of i32 constants if possible.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector< Value > &operands)
Outline the gpu.launch operation body into a kernel function.
static bool isLikelyAnIndexComputation(Operation *op)
Identifies operations that are beneficial to sink into kernels.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands)
Replace gpu.launch operations with a gpu.launch_func operation launching kernelFunc.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, IRMapping &map)
Adds operations generating block/thread ids and grid/block dimensions at the beginning of the launchFuncOpBody region.
static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl< Value > &values)
static bool extractBeneficiaryOps(Operation *op, const SetVector< Value > &existingDependencies, SetVector< Operation * > &beneficiaryOps, llvm::SmallPtrSetImpl< Value > &availableValues, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
For a given operation op, computes whether it is beneficial to sink the operation into the kernel.
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Attributes are known-constant values of operations.
MLIRContext * getContext() const
Return the context this attribute belongs to.
Block represents an ordered list of Operations.
OpListType::iterator iterator
BlockArgument getArgument(unsigned i)
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Location getLoc()
The source location the operation was defined or derived from.
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
operand_range getOperands()
Returns an iterator on the underlying Value's.
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
result_range getResults()
void erase()
Remove this operation from its parent block and delete it.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
void cloneInto(Region *dest, IRMapping &mapper)
Clone the internal blocks from this region into dest.
This class represents a specific symbol use.
This class allows for representing and managing the symbol table used by operations with the 'SymbolTable' trait.
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
static std::optional< UseRange > getSymbolUses(Operation *from)
Get an iterator range for all of the uses, for any symbol, that are nested within the given operation...
This class provides an abstraction over the various different ranges of value types.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
MLIRContext * getContext() const
Utility to get the associated MLIRContext that this value is defined in.
Type getType() const
Return the type of this value.
static WalkResult advance()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Builder from ArrayRef<T>.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
This header declares functions that assist transformations in the MemRef dialect.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
detail::constant_int_value_binder m_ConstantInt(IntegerAttr::ValueType *bind_value)
Matches a constant holding a scalar/vector/tensor integer (splat) and writes the integer value to bind_value.
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region &region)
Replace all uses of orig within the given region with replacement.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
std::unique_ptr< Pass > createGpuLauchSinkIndexComputationsPass()
Pass that moves ops which are likely an index computation into gpu.launch body.
Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context, Type type={}, size_t *numRead=nullptr, bool isKnownNullTerminated=false)
This parses a single MLIR attribute to an MLIR context if it was valid.
void getUsedValuesDefinedAbove(Region &region, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
Sink operations into the launchOp to reduce the number of values that are used within the region of the launch operation.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
std::unique_ptr< OperationPass< ModuleOp > > createGpuKernelOutliningPass(StringRef dataLayoutStr=StringRef())
Replaces gpu.launch with gpu.launch_func by moving the region into a separate kernel function.
gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, SmallVectorImpl< Value > &operands)
Get a gpu.func created from outlining the region of a gpu.launch op with the given kernelFnName.
This class represents an efficient way to signal success or failure.
Utility class for the GPU dialect to represent triples of Values accessible through .x, .y, and .z.