44 #include "llvm/Support/FormatVariadic.h"
46 #include "../GPUCommon/GPUOpsLowering.h"
47 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
50 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
51 #include "mlir/Conversion/Passes.h.inc"
61 int64_t intWidth = cast<IntegerType>(value.
getType()).getWidth();
63 auto indexBitwidthType =
66 if (indexBitwidth > intWidth) {
67 return rewriter.
create<LLVM::SExtOp>(loc, indexBitwidthType, value);
69 if (indexBitwidth < intWidth) {
70 return rewriter.
create<LLVM::TruncOp>(loc, indexBitwidthType, value);
78 bool canBeBare =
true;
79 for (
Type type : func.getArgumentTypes())
80 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
86 const unsigned indexBitwidth) {
88 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
89 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
90 Value mbcntLo = rewriter.
create<ROCDL::MbcntLoOp>(loc, int32Type,
92 Value laneId = rewriter.
create<ROCDL::MbcntHiOp>(loc, int32Type,
97 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
98 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
99 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
100 "64-S32-A5-G1-ni:7:8:9";
107 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
109 auto loc = op->getLoc();
115 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
116 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
123 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
124 if (indexBitwidth > 32) {
125 laneId = rewriter.
create<LLVM::SExtOp>(
127 }
else if (indexBitwidth < 32) {
128 laneId = rewriter.
create<LLVM::TruncOp>(
140 amdgpu::Chipset chipset)
145 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
147 LLVM::ConstantRangeAttr bounds =
nullptr;
148 bool isBeforeGfx10 = chipset.majorVersion < 10;
149 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
150 bounds = rewriter.
getAttr<LLVM::ConstantRangeAttr>(
151 32, isBeforeGfx10 ? 64 : 32,
152 op.getUpperBoundAttr().getInt() + 1);
154 Value wavefrontOp = rewriter.
create<ROCDL::WavefrontSizeOp>(
157 *getTypeConverter());
162 const amdgpu::Chipset chipset;
185 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
188 Value initShflValue = adaptor.getValue();
190 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
194 Value width = adaptor.getWidth();
195 Value zero = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 0);
196 Value negwidth = rewriter.
create<LLVM::SubOp>(loc, int32Type, zero, width);
197 Value add = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
198 Value widthOrZeroIfOutside =
199 rewriter.
create<LLVM::AndOp>(loc, int32Type, add, negwidth);
202 switch (op.getMode()) {
203 case gpu::ShuffleMode::UP:
204 dstLane = rewriter.
create<LLVM::SubOp>(loc, int32Type, srcLaneId,
205 adaptor.getOffset());
207 case gpu::ShuffleMode::DOWN:
208 dstLane = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId,
209 adaptor.getOffset());
211 case gpu::ShuffleMode::XOR:
212 dstLane = rewriter.
create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
213 adaptor.getOffset());
215 case gpu::ShuffleMode::IDX:
216 dstLane = adaptor.getOffset();
219 Value isActiveSrcLane = rewriter.
create<LLVM::ICmpOp>(
220 loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
221 Value selectDstLane = rewriter.
create<LLVM::SelectOp>(loc, isActiveSrcLane,
223 Value two = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 2);
224 Value dwordAlignedDstLane =
225 rewriter.
create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
230 for (
Value v : decomposed) {
231 Value res = rewriter.
create<ROCDL::DsBpermuteOp>(loc, int32Type,
232 dwordAlignedDstLane, v);
233 swizzled.emplace_back(res);
237 rewriter.
replaceOp(op, {shflValue, isActiveSrcLane});
243 #include "GPUToROCDL.cpp.inc"
250 struct LowerGpuOpsToROCDLOpsPass final
251 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
252 LowerGpuOpsToROCDLOpsPass() =
default;
253 LowerGpuOpsToROCDLOpsPass(
const std::string &chipset,
unsigned indexBitwidth,
254 bool useBarePtrCallConv,
256 if (this->chipset.getNumOccurrences() == 0)
257 this->chipset = chipset;
258 if (this->indexBitwidth.getNumOccurrences() == 0)
259 this->indexBitwidth = indexBitwidth;
260 if (this->useBarePtrCallConv.getNumOccurrences() == 0)
261 this->useBarePtrCallConv = useBarePtrCallConv;
262 if (this->runtime.getNumOccurrences() == 0)
263 this->runtime = runtime;
267 Base::getDependentDialects(registry);
271 void runOnOperation()
override {
272 gpu::GPUModuleOp m = getOperation();
275 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
276 LLVM::LLVMDialect::getDataLayoutAttrName());
277 if (!llvmDataLayout) {
279 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
282 for (
auto func : m.getOps<func::FuncOp>()) {
283 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
288 if (failed(maybeChipset)) {
290 return signalPassFailure();
295 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
296 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
298 options.overrideIndexBitwidth(indexBitwidth);
300 if (useBarePtrCallConv) {
301 options.useBarePtrCallConv =
true;
303 m.walk([](gpu::GPUFuncOp func) ->
WalkResult {
310 "bare pointer calling convention requires all memrefs to "
311 "have static shape and use the identity map");
312 return signalPassFailure();
328 converter, [](gpu::AddressSpace space) {
330 case gpu::AddressSpace::Global:
332 case gpu::AddressSpace::Workgroup:
334 case gpu::AddressSpace::Private:
337 llvm_unreachable(
"unknown address space enum value");
344 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
345 allowedDialects.end());
347 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
349 if (!allowedDialectsSet.empty() && !allowed)
352 auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
358 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
359 << dialect->getNamespace();
360 return signalPassFailure();
365 iface->populateConvertToLLVMConversionPatterns(target, converter,
377 auto reqdWorkGroupSizeAttrHelper =
378 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
379 auto flatWorkGroupSizeAttrHelper =
380 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
383 m.walk([&](LLVM::LLVMFuncOp op) {
384 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
385 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
388 uint32_t flatSize = 1;
389 for (uint32_t size : blockSizes.asArrayRef()) {
392 StringAttr flatSizeAttr =
394 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
407 target.
addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
408 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
409 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
412 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
415 target.
addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
429 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
430 converter, IndexKind::Block, IntrType::Id);
432 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
433 converter, IndexKind::Grid, IntrType::Id);
436 ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
437 converter, IndexKind::Block, IntrType::Dim);
439 gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
440 converter, IndexKind::Grid, IntrType::Dim);
445 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
446 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
447 rocdlDialect->getKernelAttrHelper().getName(),
448 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
458 patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
459 patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
464 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
466 unsigned indexBitwidth,
467 bool useBarePtrCallConv,
469 return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
470 chipset, indexBitwidth, useBarePtrCallConv, runtime);
static MLIRContext * getContext(OpFoldResult val)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, const unsigned indexBitwidth)
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
MLIRContext * getContext() const
Attr getAttr(Args &&...args)
Get or construct an instance of the attribute Attr with provided arguments.
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
This class describes a specific conversion target.
void addLegalOp(OperationName op)
Register the given operations as legal.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the one provided.
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e.
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Runtime
Potential runtimes for AMD GPU kernels.
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces,...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.