46#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
47#include "mlir/Conversion/Passes.h.inc"
59 auto indexBitwidthType =
62 if (indexBitwidth > intWidth) {
63 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
65 if (indexBitwidth < intWidth) {
66 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
74 bool canBeBare =
true;
75 for (
Type type :
func.getArgumentTypes())
76 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
82 auto int32Type = IntegerType::get(rewriter.
getContext(), 32);
86 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.
getUnitAttr());
88 LLVM::LLVMDialect::getRangeAttrName(),
89 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
92 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
95 Value mbcntLo = ROCDL::MbcntLoOp::create(
96 rewriter, loc, int32Type, minus1, zero, {},
99 Value laneId = ROCDL::MbcntHiOp::create(
100 rewriter, loc, int32Type, minus1, mbcntLo, {},
114 gpu::Dimension dim,
Operation *contextOp,
115 std::optional<uint32_t> opUpperBound) {
119 auto i32Ty = IntegerType::get(context, 32);
120 auto i64Ty = IntegerType::get(context, 64);
122 if (std::optional<uint32_t> knownDim =
124 return LLVM::ConstantOp::create(rewriter, loc,
127 int32_t dimParam =
static_cast<int32_t
>(dim);
129 StringRef functionName;
131 case gpu::index_lowering::IndexKind::Block:
132 functionName =
"__ockl_get_local_size";
134 case gpu::index_lowering::IndexKind::Grid:
135 functionName =
"__ockl_get_num_groups";
137 case gpu::index_lowering::IndexKind::Cluster:
138 case gpu::index_lowering::IndexKind::Other:
139 llvm_unreachable(
"Not valid index kinds for ockl lookup");
143 auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
145 LLVM::LLVMFuncOp funcOp =
149 Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
151 LLVM::CallOp::create(rewriter, loc, funcOp,
ValueRange{dimConst});
153 LLVM::ConstantRangeAttr range;
155 range = LLVM::ConstantRangeAttr::get(
156 context, APInt(64, 1),
157 APInt(64,
static_cast<uint64_t
>(*opUpperBound) + 1));
158 }
else if (indexKind == gpu::index_lowering::IndexKind::Block) {
160 range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
165 rewriter.
getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
167 return callOp.getResult();
171 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
172 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
174 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
175 "64-S32-A5-G1-ni:7:8:9";
181template <
typename OpTy>
188 matchAndRewrite(OpTy op,
typename OpTy::Adaptor adaptor,
189 ConversionPatternRewriter &rewriter)
const override {
192 std::optional<uint32_t> opUpperBound;
193 if (
auto bound = op.getUpperBound())
194 opUpperBound =
static_cast<uint32_t
>(bound->getZExtValue());
199 *this->getTypeConverter());
200 rewriter.replaceOp(op,
result);
212 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
213 ConversionPatternRewriter &rewriter)
const override {
226 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
227 if (indexBitwidth > 32) {
228 laneId = LLVM::SExtOp::create(
229 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
230 }
else if (indexBitwidth < 32) {
231 laneId = LLVM::TruncOp::create(
232 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
234 rewriter.replaceOp(op, {laneId});
248 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
249 ConversionPatternRewriter &rewriter)
const override {
250 LLVM::ConstantRangeAttr bounds =
nullptr;
252 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
253 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
254 32, isBeforeGfx10 ? 64 : 32,
255 op.getUpperBoundAttr().getInt() + 1);
257 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
258 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
260 *getTypeConverter());
261 rewriter.replaceOp(op, {wavefrontOp});
277 matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
278 ConversionPatternRewriter &rewriter)
const override {
280 auto int32Type = rewriter.getI32Type();
285 LLVM::ConstantRangeAttr bounds;
286 if (
auto upperBoundAttr = op.getUpperBoundAttr())
287 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
289 upperBoundAttr.getInt());
290 subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type, bounds);
295 auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
296 auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
297 auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
298 auto setBoundFromContext = [&](
Operation *tidOp, gpu::Dimension dim) {
299 if (LLVM::ConstantRangeAttr range =
301 op, dim, std::nullopt,
302 gpu::index_lowering::IndexKind::Block,
304 tidOp->
setAttr(
"range", range);
306 setBoundFromContext(tidX, gpu::Dimension::x);
307 setBoundFromContext(tidY, gpu::Dimension::y);
308 setBoundFromContext(tidZ, gpu::Dimension::z);
311 LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
313 auto getBlockDim = [&](gpu::Dimension dim) {
316 dim, op, std::nullopt);
318 LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
321 Value dimX = getBlockDim(gpu::Dimension::x);
322 Value dimY = getBlockDim(gpu::Dimension::y);
327 LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
328 Value tidYPlusDimYxTidZ =
329 LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
330 Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
331 tidYPlusDimYxTidZ, flags);
332 Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
336 ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
337 subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
343 rewriter.replaceOp(op, subgroupId);
350static bool isSupportedReadLaneType(
Type type) {
352 if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
353 LLVM::LLVMPointerType>(type))
356 if (
auto intType = dyn_cast<IntegerType>(type))
357 return llvm::is_contained({16, 32, 64},
358 static_cast<int>(intType.getWidth()));
360 if (
auto vecType = dyn_cast<VectorType>(type)) {
361 Type elementType = vecType.getElementType();
365 if (vecType.getNumElements() == 2 &&
366 (isa<Float16Type, BFloat16Type>(elementType) ||
374struct GPUSubgroupBroadcastOpToROCDL
379 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
380 ConversionPatternRewriter &rewriter)
const override {
381 Value src = adaptor.getSrc();
382 if (isSupportedReadLaneType(src.
getType())) {
383 Value result = createReadlaneOp(op, adaptor, rewriter, src);
384 rewriter.replaceOp(op,
result);
388 Type i32 = rewriter.getI32Type();
393 return rewriter.notifyMatchFailure(op,
394 "Unexpected decomposition failure");
397 results.reserve(decomposed.size());
398 for (
Value v : decomposed)
399 results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
402 rewriter.replaceOp(op,
result);
407 static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
408 ConversionPatternRewriter &rewriter,
410 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
411 return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.
getType(),
412 src, adaptor.getLane());
414 return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
440 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
441 ConversionPatternRewriter &rewriter)
const override {
443 Value initShflValue = adaptor.getValue();
447 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
448 Value width = adaptor.getWidth();
449 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
450 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
451 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
452 Value widthOrZeroIfOutside =
453 LLVM::AndOp::create(rewriter, loc, int32Type,
add, negwidth);
456 switch (op.getMode()) {
457 case gpu::ShuffleMode::UP:
458 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
459 adaptor.getOffset());
461 case gpu::ShuffleMode::DOWN:
462 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
463 adaptor.getOffset());
465 case gpu::ShuffleMode::XOR:
466 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
467 adaptor.getOffset());
469 case gpu::ShuffleMode::IDX:
470 dstLane = adaptor.getOffset();
473 Value isActiveSrcLane = LLVM::ICmpOp::create(
474 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
475 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
477 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
478 Value dwordAlignedDstLane =
479 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
484 return rewriter.notifyMatchFailure(op,
485 "failed to decompose value to i32");
487 for (
Value v : decomposed) {
488 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
489 dwordAlignedDstLane, v);
490 swizzled.emplace_back(res);
494 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
507 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
508 ConversionPatternRewriter &rewriter)
const override {
512 bool fenceGlobal =
false;
513 bool fenceLDS =
false;
514 std::optional<ArrayAttr> addrSpacesToFence = op.getAddressSpaces();
516 if (addrSpacesToFence) {
517 for (
auto spaceAttr :
518 addrSpacesToFence->getAsRange<gpu::AddressSpaceAttr>()) {
519 switch (spaceAttr.getValue()) {
520 case gpu::AddressSpace::Global:
523 case gpu::AddressSpace::Workgroup:
526 case gpu::AddressSpace::Private:
537 if (fenceLDS && !fenceGlobal) {
539 rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
"local");
540 }
else if (fenceGlobal && !fenceLDS) {
541 mmra = rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
545 constexpr llvm::StringLiteral scope =
"workgroup";
547 bool emitFences = fenceGlobal || fenceLDS;
550 auto relFence = LLVM::FenceOp::create(
551 rewriter, loc, LLVM::AtomicOrdering::release, scope);
553 relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
558 ROCDL::SBarrierOp::create(rewriter, loc);
560 ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
561 ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
565 auto acqFence = LLVM::FenceOp::create(
566 rewriter, loc, LLVM::AtomicOrdering::acquire, scope);
568 acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
572 rewriter.eraseOp(op);
578#include "GPUToROCDL.cpp.inc"
585struct LowerGpuOpsToROCDLOpsPass final
590 Base::getDependentDialects(registry);
594 void runOnOperation()
override {
595 gpu::GPUModuleOp m = getOperation();
598 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
599 LLVM::LLVMDialect::getDataLayoutAttrName());
600 if (!llvmDataLayout) {
602 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
605 for (
auto func : m.getOps<func::FuncOp>()) {
606 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
611 if (failed(maybeChipset)) {
612 emitError(UnknownLoc::get(ctx),
"Invalid chipset name: " + chipset);
613 return signalPassFailure();
618 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
619 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
621 options.overrideIndexBitwidth(indexBitwidth);
623 if (useBarePtrCallConv) {
624 options.useBarePtrCallConv =
true;
633 "bare pointer calling convention requires all memrefs to "
634 "have static shape and use the identity map");
635 return signalPassFailure();
655 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
656 allowedDialects.end());
658 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
660 if (!allowedDialectsSet.empty() && !allowed)
663 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
669 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
670 << dialect->getNamespace();
671 return signalPassFailure();
676 iface->populateConvertToLLVMConversionPatterns(
target, converter,
685 if (failed(applyPartialConversion(m,
target, std::move(llvmPatterns))))
687 auto *rocdlDialect =
getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
688 auto reqdWorkGroupSizeAttrHelper =
689 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
690 auto flatWorkGroupSizeAttrHelper =
691 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
694 m.walk([&](LLVM::LLVMFuncOp op) {
695 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
696 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
699 uint32_t flatSize = 1;
700 for (uint32_t size : blockSizes.asArrayRef()) {
703 StringAttr flatSizeAttr =
704 StringAttr::get(ctx, Twine(flatSize) +
"," + Twine(flatSize));
705 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
714 target.addIllegalOp<func::FuncOp>();
715 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
716 target.addLegalDialect<ROCDL::ROCDLDialect>();
717 target.addIllegalDialect<gpu::GPUDialect>();
718 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
719 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
720 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
722 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](
Operation *op) {
723 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
726 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
737 populateWithGenerated(patterns);
740 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
741 converter, IndexKind::Block, IntrType::Id);
743 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
744 converter, IndexKind::Grid, IntrType::Id);
745 patterns.
add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
747 patterns.
add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
752 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
753 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
754 rocdlDialect->getKernelAttrHelper().getName(),
755 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
759 }
else if (Runtime::OpenCL ==
runtime) {
766 patterns.
add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
767 GPUSubgroupBroadcastOpToROCDL>(converter);
768 patterns.
add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
769 GPUBarrierOpLowering>(converter, chipset);
static Value getLaneId(RewriterBase &rewriter, Location loc)
static constexpr int64_t kMaxThreadsPerBlockDim
Maximum number of threads per block dimension on AMD GPUs.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getKnownOrOcklDim(RewriterBase &rewriter, gpu::index_lowering::IndexKind indexKind, gpu::Dimension dim, Operation *contextOp, std::optional< uint32_t > opUpperBound)
Emits a call to an OCKL block/grid size function corresponding to indexKind with argument dim,...
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
Attributes are known-constant values of operations.
IntegerAttr getI64IntegerAttr(int64_t value)
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
MLIRContext * getContext() const
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
A trait used to provide symbol table functionalities to a region operation.
Operation is the basic unit of execution within MLIR.
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Location getLoc()
The source location the operation was defined or derived from.
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
MLIRContext * getContext()
Return the context this operation is associated with.
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
bool isInteger() const
Return true if this is an integer type (with the specified width).
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
LogicalResult decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType, SmallVectorImpl< Value > &result, bool permitVariablySizedScalars=false)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Runtime
Potential runtimes for AMD GPU kernels.
gpu::DimensionKind IndexKind
LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim, std::optional< uint32_t > opUpperBound, IndexKind indexKind, IntrType intrType, unsigned bitWidth)
Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds are found.
std::optional< uint32_t > getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim)
Retrieve the constant bounds for a given dimension and dimension kind from the context surrounding op...
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.