46#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
47#include "mlir/Conversion/Passes.h.inc"
59 auto indexBitwidthType =
62 if (indexBitwidth > intWidth) {
63 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
65 if (indexBitwidth < intWidth) {
66 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
74 bool canBeBare =
true;
75 for (
Type type :
func.getArgumentTypes())
76 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
82 auto int32Type = IntegerType::get(rewriter.
getContext(), 32);
86 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.
getUnitAttr());
88 LLVM::LLVMDialect::getRangeAttrName(),
89 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
92 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
95 Value mbcntLo = ROCDL::MbcntLoOp::create(
96 rewriter, loc, int32Type, minus1, zero, {},
99 Value laneId = ROCDL::MbcntHiOp::create(
100 rewriter, loc, int32Type, minus1, mbcntLo, {},
106 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
107 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
109 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
110 "64-S32-A5-G1-ni:7:8:9";
117 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
118 ConversionPatternRewriter &rewriter)
const override {
131 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
132 if (indexBitwidth > 32) {
133 laneId = LLVM::SExtOp::create(
134 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
135 }
else if (indexBitwidth < 32) {
136 laneId = LLVM::TruncOp::create(
137 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
139 rewriter.replaceOp(op, {laneId});
153 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
154 ConversionPatternRewriter &rewriter)
const override {
155 LLVM::ConstantRangeAttr bounds =
nullptr;
157 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
158 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
159 32, isBeforeGfx10 ? 64 : 32,
160 op.getUpperBoundAttr().getInt() + 1);
162 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
163 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
165 *getTypeConverter());
166 rewriter.replaceOp(op, {wavefrontOp});
182 matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
183 ConversionPatternRewriter &rewriter)
const override {
185 auto int32Type = rewriter.getI32Type();
190 LLVM::ConstantRangeAttr bounds;
191 if (
auto upperBoundAttr = op.getUpperBoundAttr())
192 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
194 upperBoundAttr.getInt());
195 subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type, bounds);
200 Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
201 Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
202 Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
203 Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type);
204 Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type);
209 LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
211 LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
212 Value tidYPlusDimYxTidZ =
213 LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
214 Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
215 tidYPlusDimYxTidZ, flags);
216 Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
220 ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
221 subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
227 rewriter.replaceOp(op, subgroupId);
234static bool isSupportedReadLaneType(
Type type) {
236 if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
237 LLVM::LLVMPointerType>(type))
240 if (
auto intType = dyn_cast<IntegerType>(type))
241 return llvm::is_contained({16, 32, 64},
242 static_cast<int>(intType.getWidth()));
244 if (
auto vecType = dyn_cast<VectorType>(type)) {
245 Type elementType = vecType.getElementType();
249 if (vecType.getNumElements() == 2 &&
250 (isa<Float16Type, BFloat16Type>(elementType) ||
258struct GPUSubgroupBroadcastOpToROCDL
263 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
264 ConversionPatternRewriter &rewriter)
const override {
265 Value src = adaptor.getSrc();
266 if (isSupportedReadLaneType(src.
getType())) {
267 Value result = createReadlaneOp(op, adaptor, rewriter, src);
268 rewriter.replaceOp(op,
result);
272 Type i32 = rewriter.getI32Type();
277 return rewriter.notifyMatchFailure(op,
278 "Unexpected decomposition failure");
281 results.reserve(decomposed.size());
282 for (
Value v : decomposed)
283 results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
286 rewriter.replaceOp(op,
result);
291 static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
292 ConversionPatternRewriter &rewriter,
294 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
295 return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.
getType(),
296 src, adaptor.getLane());
298 return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
324 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
325 ConversionPatternRewriter &rewriter)
const override {
327 Value initShflValue = adaptor.getValue();
331 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
332 Value width = adaptor.getWidth();
333 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
334 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
335 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
336 Value widthOrZeroIfOutside =
337 LLVM::AndOp::create(rewriter, loc, int32Type,
add, negwidth);
340 switch (op.getMode()) {
341 case gpu::ShuffleMode::UP:
342 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
343 adaptor.getOffset());
345 case gpu::ShuffleMode::DOWN:
346 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
347 adaptor.getOffset());
349 case gpu::ShuffleMode::XOR:
350 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
351 adaptor.getOffset());
353 case gpu::ShuffleMode::IDX:
354 dstLane = adaptor.getOffset();
357 Value isActiveSrcLane = LLVM::ICmpOp::create(
358 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
359 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
361 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
362 Value dwordAlignedDstLane =
363 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
368 return rewriter.notifyMatchFailure(op,
369 "failed to decompose value to i32");
371 for (
Value v : decomposed) {
372 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
373 dwordAlignedDstLane, v);
374 swizzled.emplace_back(res);
378 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
391 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
392 ConversionPatternRewriter &rewriter)
const override {
396 bool fenceGlobal =
false;
397 bool fenceLDS =
false;
398 std::optional<ArrayAttr> addrSpacesToFence = op.getAddressSpaces();
400 if (addrSpacesToFence) {
401 for (
auto spaceAttr :
402 addrSpacesToFence->getAsRange<gpu::AddressSpaceAttr>()) {
403 switch (spaceAttr.getValue()) {
404 case gpu::AddressSpace::Global:
407 case gpu::AddressSpace::Workgroup:
410 case gpu::AddressSpace::Private:
421 if (fenceLDS && !fenceGlobal) {
423 rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
"local");
424 }
else if (fenceGlobal && !fenceLDS) {
425 mmra = rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
429 constexpr llvm::StringLiteral scope =
"workgroup";
431 bool emitFences = fenceGlobal || fenceLDS;
434 auto relFence = LLVM::FenceOp::create(
435 rewriter, loc, LLVM::AtomicOrdering::release, scope);
437 relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
442 ROCDL::SBarrierOp::create(rewriter, loc);
444 ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
445 ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
449 auto acqFence = LLVM::FenceOp::create(
450 rewriter, loc, LLVM::AtomicOrdering::acquire, scope);
452 acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
456 rewriter.eraseOp(op);
462#include "GPUToROCDL.cpp.inc"
469struct LowerGpuOpsToROCDLOpsPass final
470 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
474 Base::getDependentDialects(registry);
478 void runOnOperation()
override {
479 gpu::GPUModuleOp m = getOperation();
482 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
483 LLVM::LLVMDialect::getDataLayoutAttrName());
484 if (!llvmDataLayout) {
486 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
489 for (
auto func : m.getOps<func::FuncOp>()) {
490 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
495 if (failed(maybeChipset)) {
496 emitError(UnknownLoc::get(ctx),
"Invalid chipset name: " + chipset);
497 return signalPassFailure();
502 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
503 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
505 options.overrideIndexBitwidth(indexBitwidth);
507 if (useBarePtrCallConv) {
508 options.useBarePtrCallConv =
true;
517 "bare pointer calling convention requires all memrefs to "
518 "have static shape and use the identity map");
519 return signalPassFailure();
539 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
540 allowedDialects.end());
542 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
544 if (!allowedDialectsSet.empty() && !allowed)
547 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
553 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
554 << dialect->getNamespace();
555 return signalPassFailure();
560 iface->populateConvertToLLVMConversionPatterns(
target, converter,
569 if (failed(applyPartialConversion(m,
target, std::move(llvmPatterns))))
571 auto *rocdlDialect =
getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
572 auto reqdWorkGroupSizeAttrHelper =
573 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
574 auto flatWorkGroupSizeAttrHelper =
575 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
578 m.walk([&](LLVM::LLVMFuncOp op) {
579 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
580 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
583 uint32_t flatSize = 1;
584 for (uint32_t size : blockSizes.asArrayRef()) {
587 StringAttr flatSizeAttr =
588 StringAttr::get(ctx, Twine(flatSize) +
"," + Twine(flatSize));
589 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
598 target.addIllegalOp<func::FuncOp>();
599 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
600 target.addLegalDialect<ROCDL::ROCDLDialect>();
601 target.addIllegalDialect<gpu::GPUDialect>();
602 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
603 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
604 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
606 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](
Operation *op) {
607 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
610 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
621 populateWithGenerated(patterns);
624 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
625 converter, IndexKind::Block, IntrType::Id);
627 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
628 converter, IndexKind::Grid, IntrType::Id);
631 ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
632 converter, IndexKind::Block, IntrType::Dim);
634 gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
635 converter, IndexKind::Grid, IntrType::Dim);
640 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
641 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
642 rocdlDialect->getKernelAttrHelper().getName(),
643 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
647 }
else if (Runtime::OpenCL ==
runtime) {
654 patterns.
add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
655 GPUSubgroupBroadcastOpToROCDL>(converter);
656 patterns.
add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
657 GPUBarrierOpLowering>(converter, chipset);
static Value getLaneId(RewriterBase &rewriter, Location loc)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
Attributes are known-constant values of operations.
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
MLIRContext * getContext() const
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Utility class for operation conversions targeting the LLVM dialect that match exactly one source operation.
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the entire ecosystem.
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
Operation is the basic unit of execution within MLIR.
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to track mutations and create new operations.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
bool isInteger() const
Return true if this is an integer type (with the specified width).
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
LogicalResult decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType, SmallVectorImpl< Value > &result, bool permitVariablySizedScalars=false)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops.
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vector ops.
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Runtime
Potential runtimes for AMD GPU kernels.
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth from the data layout.
LogicalResult applyPatternsGreedily(Region ®ion, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highest benefit patterns in a greedy worklist driven manner until a fixpoint is reached.
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
void registerConvertToLLVMDependentDialectLoading(DialectRegistry ®istry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.