43 #include "../GPUCommon/GPUOpsLowering.h"
44 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
47 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
48 #include "mlir/Conversion/Passes.h.inc"
58 int64_t intWidth = cast<IntegerType>(value.
getType()).getWidth();
60 auto indexBitwidthType =
63 if (indexBitwidth > intWidth) {
64 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
66 if (indexBitwidth < intWidth) {
67 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
75 bool canBeBare =
true;
76 for (
Type type : func.getArgumentTypes())
77 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
87 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.
getUnitAttr());
89 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::LLVMDialect::getRangeAttrName(),
96 Value mbcntLo = ROCDL::MbcntLoOp::create(
97 rewriter, loc, int32Type, minus1, zero, {},
100 Value laneId = ROCDL::MbcntHiOp::create(
101 rewriter, loc, int32Type, minus1, mbcntLo, {},
107 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
108 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
110 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
111 "64-S32-A5-G1-ni:7:8:9";
118 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
132 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
133 if (indexBitwidth > 32) {
134 laneId = LLVM::SExtOp::create(
136 }
else if (indexBitwidth < 32) {
137 laneId = LLVM::TruncOp::create(
149 amdgpu::Chipset chipset)
154 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
156 LLVM::ConstantRangeAttr bounds =
nullptr;
157 bool isBeforeGfx10 = chipset.majorVersion < 10;
158 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
159 bounds = rewriter.
getAttr<LLVM::ConstantRangeAttr>(
160 32, isBeforeGfx10 ? 64 : 32,
161 op.getUpperBoundAttr().getInt() + 1);
163 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
164 rewriter, op.getLoc(), rewriter.
getI32Type(), bounds);
166 *getTypeConverter());
171 const amdgpu::Chipset chipset;
174 static bool isSupportedReadLaneType(
Type type) {
178 isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
179 LLVM::LLVMPointerType>(type);
182 struct GPUSubgroupBroadcastOpToROCDL
187 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
189 Value src = adaptor.getSrc();
190 if (!isSupportedReadLaneType(src.
getType()))
193 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
226 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
229 Value initShflValue = adaptor.getValue();
234 Value width = adaptor.getWidth();
235 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
236 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
237 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
238 Value widthOrZeroIfOutside =
239 LLVM::AndOp::create(rewriter, loc, int32Type,
add, negwidth);
242 switch (op.getMode()) {
243 case gpu::ShuffleMode::UP:
244 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
245 adaptor.getOffset());
247 case gpu::ShuffleMode::DOWN:
248 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
249 adaptor.getOffset());
251 case gpu::ShuffleMode::XOR:
252 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
253 adaptor.getOffset());
255 case gpu::ShuffleMode::IDX:
256 dstLane = adaptor.getOffset();
259 Value isActiveSrcLane = LLVM::ICmpOp::create(
260 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
261 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
263 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
264 Value dwordAlignedDstLane =
265 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
270 for (
Value v : decomposed) {
271 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
272 dwordAlignedDstLane, v);
273 swizzled.emplace_back(res);
277 rewriter.
replaceOp(op, {shflValue, isActiveSrcLane});
283 #include "GPUToROCDL.cpp.inc"
290 struct LowerGpuOpsToROCDLOpsPass final
291 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
292 LowerGpuOpsToROCDLOpsPass() =
default;
293 LowerGpuOpsToROCDLOpsPass(
const std::string &chipset,
unsigned indexBitwidth,
294 bool useBarePtrCallConv,
296 if (this->chipset.getNumOccurrences() == 0)
297 this->chipset = chipset;
298 if (this->indexBitwidth.getNumOccurrences() == 0)
299 this->indexBitwidth = indexBitwidth;
300 if (this->useBarePtrCallConv.getNumOccurrences() == 0)
301 this->useBarePtrCallConv = useBarePtrCallConv;
302 if (this->runtime.getNumOccurrences() == 0)
303 this->runtime = runtime;
307 Base::getDependentDialects(registry);
311 void runOnOperation()
override {
312 gpu::GPUModuleOp m = getOperation();
315 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
316 LLVM::LLVMDialect::getDataLayoutAttrName());
317 if (!llvmDataLayout) {
319 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
322 for (
auto func : m.getOps<func::FuncOp>()) {
323 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
328 if (
failed(maybeChipset)) {
330 return signalPassFailure();
335 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
336 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
338 options.overrideIndexBitwidth(indexBitwidth);
340 if (useBarePtrCallConv) {
341 options.useBarePtrCallConv =
true;
343 m.walk([](gpu::GPUFuncOp func) ->
WalkResult {
350 "bare pointer calling convention requires all memrefs to "
351 "have static shape and use the identity map");
352 return signalPassFailure();
368 converter, [](gpu::AddressSpace space) {
370 case gpu::AddressSpace::Global:
372 case gpu::AddressSpace::Workgroup:
374 case gpu::AddressSpace::Private:
377 llvm_unreachable(
"unknown address space enum value");
384 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
385 allowedDialects.end());
387 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
389 if (!allowedDialectsSet.empty() && !allowed)
392 auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
398 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
399 << dialect->getNamespace();
400 return signalPassFailure();
405 iface->populateConvertToLLVMConversionPatterns(target, converter,
417 auto reqdWorkGroupSizeAttrHelper =
418 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
419 auto flatWorkGroupSizeAttrHelper =
420 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
423 m.walk([&](LLVM::LLVMFuncOp op) {
424 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
425 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
428 uint32_t flatSize = 1;
429 for (uint32_t size : blockSizes.asArrayRef()) {
432 StringAttr flatSizeAttr =
434 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
447 target.
addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
448 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
449 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
452 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
455 target.
addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
469 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
470 converter, IndexKind::Block, IntrType::Id);
472 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
473 converter, IndexKind::Grid, IntrType::Id);
476 ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
477 converter, IndexKind::Block, IntrType::Dim);
479 gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
480 converter, IndexKind::Grid, IntrType::Dim);
485 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
486 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
487 rocdlDialect->getKernelAttrHelper().getName(),
488 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
498 patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
499 GPUSubgroupBroadcastOpToROCDL>(converter);
500 patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
505 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
507 unsigned indexBitwidth,
508 bool useBarePtrCallConv,
510 return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
511 chipset, indexBitwidth, useBarePtrCallConv, runtime);
static Value getZero(OpBuilder &b, Location loc, Type elementType)
Get zero value for an element type.
static MLIRContext * getContext(OpFoldResult val)
static Value getLaneId(RewriterBase &rewriter, Location loc)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
MLIRContext * getContext() const
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Attr getAttr(Args &&...args)
Get or construct an instance of the attribute Attr with provided arguments.
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
This class describes a specific conversion target.
void addLegalOp(OperationName op)
Register the given operations as legal.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e.
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
Operation is the basic unit of execution within MLIR.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
OpTy replaceOpWithNewOp(Operation *op, Args &&...args)
Replace the results of the given (original) op with a new op that is created without verification (re...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
bool isInteger() const
Return true if this is an integer type (with the specified width).
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Runtime
Potential runtimes for AMD GPU kernels.
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces,...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.