46#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
47#include "mlir/Conversion/Passes.h.inc"
59 auto indexBitwidthType =
62 if (indexBitwidth > intWidth) {
63 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
65 if (indexBitwidth < intWidth) {
66 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
74 bool canBeBare =
true;
75 for (
Type type :
func.getArgumentTypes())
76 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
82 auto int32Type = IntegerType::get(rewriter.
getContext(), 32);
86 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.
getUnitAttr());
88 LLVM::LLVMDialect::getRangeAttrName(),
89 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
92 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
95 Value mbcntLo = ROCDL::MbcntLoOp::create(
96 rewriter, loc, int32Type, minus1, zero, {},
99 Value laneId = ROCDL::MbcntHiOp::create(
100 rewriter, loc, int32Type, minus1, mbcntLo, {},
114 gpu::Dimension dim,
Operation *contextOp,
115 std::optional<uint32_t> opUpperBound) {
119 auto i32Ty = IntegerType::get(context, 32);
120 auto i64Ty = IntegerType::get(context, 64);
122 if (std::optional<uint32_t> knownDim =
124 return LLVM::ConstantOp::create(rewriter, loc,
127 int32_t dimParam =
static_cast<int32_t
>(dim);
129 StringRef functionName;
131 case gpu::index_lowering::IndexKind::Block:
132 functionName =
"__ockl_get_local_size";
134 case gpu::index_lowering::IndexKind::Grid:
135 functionName =
"__ockl_get_num_groups";
137 case gpu::index_lowering::IndexKind::Cluster:
138 case gpu::index_lowering::IndexKind::Other:
139 llvm_unreachable(
"Not valid index kinds for ockl lookup");
143 auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
145 LLVM::LLVMFuncOp funcOp =
149 Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
151 LLVM::CallOp::create(rewriter, loc, funcOp,
ValueRange{dimConst});
153 LLVM::ConstantRangeAttr range;
155 range = LLVM::ConstantRangeAttr::get(
156 context, APInt(64, 1),
157 APInt(64,
static_cast<uint64_t
>(*opUpperBound) + 1));
158 }
else if (indexKind == gpu::index_lowering::IndexKind::Block) {
160 range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
165 rewriter.
getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
167 return callOp.getResult();
171 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
172 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
174 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
175 "64-S32-A5-G1-ni:7:8:9";
181template <
typename OpTy>
188 matchAndRewrite(OpTy op,
typename OpTy::Adaptor adaptor,
189 ConversionPatternRewriter &rewriter)
const override {
192 std::optional<uint32_t> opUpperBound;
193 if (
auto bound = op.getUpperBound())
194 opUpperBound =
static_cast<uint32_t
>(bound->getZExtValue());
199 *this->getTypeConverter());
200 rewriter.replaceOp(op,
result);
212 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
213 ConversionPatternRewriter &rewriter)
const override {
226 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
227 if (indexBitwidth > 32) {
228 laneId = LLVM::SExtOp::create(
229 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
230 }
else if (indexBitwidth < 32) {
231 laneId = LLVM::TruncOp::create(
232 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
234 rewriter.replaceOp(op, {laneId});
248 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
249 ConversionPatternRewriter &rewriter)
const override {
250 LLVM::ConstantRangeAttr bounds =
nullptr;
252 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
253 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
254 32, isBeforeGfx10 ? 64 : 32,
255 op.getUpperBoundAttr().getInt() + 1);
257 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
258 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
260 *getTypeConverter());
261 rewriter.replaceOp(op, {wavefrontOp});
277 matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
278 ConversionPatternRewriter &rewriter)
const override {
280 auto int32Type = rewriter.getI32Type();
285 LLVM::ConstantRangeAttr bounds;
286 if (
auto upperBoundAttr = op.getUpperBoundAttr())
287 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
289 upperBoundAttr.getInt());
290 subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type, bounds);
295 auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
296 auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
297 auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
298 auto setBoundFromContext = [&](
Operation *tidOp, gpu::Dimension dim) {
299 if (LLVM::ConstantRangeAttr range =
301 op, dim, std::nullopt,
302 gpu::index_lowering::IndexKind::Block,
304 tidOp->
setAttr(
"range", range);
306 setBoundFromContext(tidX, gpu::Dimension::x);
307 setBoundFromContext(tidY, gpu::Dimension::y);
308 setBoundFromContext(tidZ, gpu::Dimension::z);
311 LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
313 auto getBlockDim = [&](gpu::Dimension dim) {
316 dim, op, std::nullopt);
318 LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
321 Value dimX = getBlockDim(gpu::Dimension::x);
322 Value dimY = getBlockDim(gpu::Dimension::y);
327 LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
328 Value tidYPlusDimYxTidZ =
329 LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
330 Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
331 tidYPlusDimYxTidZ, flags);
332 Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
336 ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
337 subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
343 rewriter.replaceOp(op, subgroupId);
350static bool isSupportedReadLaneType(
Type type) {
352 if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
353 LLVM::LLVMPointerType>(type))
356 if (
auto intType = dyn_cast<IntegerType>(type))
357 return llvm::is_contained({16, 32, 64},
358 static_cast<int>(intType.getWidth()));
360 if (
auto vecType = dyn_cast<VectorType>(type)) {
361 Type elementType = vecType.getElementType();
365 if (vecType.getNumElements() == 2 &&
366 (isa<Float16Type, BFloat16Type>(elementType) ||
374struct GPUSubgroupBroadcastOpToROCDL
379 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
380 ConversionPatternRewriter &rewriter)
const override {
381 Value src = adaptor.getSrc();
382 if (isSupportedReadLaneType(src.
getType())) {
383 Value result = createReadlaneOp(op, adaptor, rewriter, src);
384 rewriter.replaceOp(op,
result);
388 Type i32 = rewriter.getI32Type();
393 return rewriter.notifyMatchFailure(op,
394 "Unexpected decomposition failure");
397 results.reserve(decomposed.size());
398 for (
Value v : decomposed)
399 results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
402 rewriter.replaceOp(op,
result);
407 static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
408 ConversionPatternRewriter &rewriter,
410 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
411 return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.
getType(),
412 src, adaptor.getLane());
414 return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
424 matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
425 ConversionPatternRewriter &rewriter)
const override {
426 auto intType = cast<IntegerType>(op.getType());
427 unsigned width = intType.getWidth();
431 if (width != 32 && width != 64)
432 return rewriter.notifyMatchFailure(
433 op,
"rocdl.ballot only supports i32 and i64 result types");
435 rewriter.replaceOpWithNewOp<ROCDL::BallotOp>(op, op.getType(),
436 adaptor.getPredicate());
461 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
462 ConversionPatternRewriter &rewriter)
const override {
464 Value initShflValue = adaptor.getValue();
468 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
469 Value width = adaptor.getWidth();
470 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
471 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
472 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
473 Value widthOrZeroIfOutside =
474 LLVM::AndOp::create(rewriter, loc, int32Type,
add, negwidth);
477 switch (op.getMode()) {
478 case gpu::ShuffleMode::UP:
479 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
480 adaptor.getOffset());
482 case gpu::ShuffleMode::DOWN:
483 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
484 adaptor.getOffset());
486 case gpu::ShuffleMode::XOR:
487 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
488 adaptor.getOffset());
490 case gpu::ShuffleMode::IDX:
491 dstLane = adaptor.getOffset();
494 Value isActiveSrcLane = LLVM::ICmpOp::create(
495 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
496 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
498 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
499 Value dwordAlignedDstLane =
500 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
505 return rewriter.notifyMatchFailure(op,
506 "failed to decompose value to i32");
508 for (
Value v : decomposed) {
509 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
510 dwordAlignedDstLane, v);
511 swizzled.emplace_back(res);
515 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
528 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
529 ConversionPatternRewriter &rewriter)
const override {
533 bool fenceGlobal =
false;
534 bool fenceLDS =
false;
535 std::optional<ArrayAttr> addrSpacesToFence = op.getAddressSpaces();
537 if (addrSpacesToFence) {
538 for (
auto spaceAttr :
539 addrSpacesToFence->getAsRange<gpu::AddressSpaceAttr>()) {
540 switch (spaceAttr.getValue()) {
541 case gpu::AddressSpace::Global:
544 case gpu::AddressSpace::Workgroup:
547 case gpu::AddressSpace::Private:
548 case gpu::AddressSpace::Constant:
560 if (fenceLDS && !fenceGlobal) {
562 rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
"local");
563 }
else if (fenceGlobal && !fenceLDS) {
564 mmra = rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
568 constexpr llvm::StringLiteral scope =
"workgroup";
570 bool emitFences = fenceGlobal || fenceLDS;
573 auto relFence = LLVM::FenceOp::create(
574 rewriter, loc, LLVM::AtomicOrdering::release, scope);
576 relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
581 ROCDL::SBarrierOp::create(rewriter, loc);
583 ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
584 ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
588 auto acqFence = LLVM::FenceOp::create(
589 rewriter, loc, LLVM::AtomicOrdering::acquire, scope);
591 acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
595 rewriter.eraseOp(op);
601#include "GPUToROCDL.cpp.inc"
608struct LowerGpuOpsToROCDLOpsPass final
613 Base::getDependentDialects(registry);
617 void runOnOperation()
override {
618 gpu::GPUModuleOp m = getOperation();
621 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
622 LLVM::LLVMDialect::getDataLayoutAttrName());
623 if (!llvmDataLayout) {
625 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
628 for (
auto func : m.getOps<func::FuncOp>()) {
629 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
634 if (failed(maybeChipset)) {
635 emitError(UnknownLoc::get(ctx),
"Invalid chipset name: " + chipset);
636 return signalPassFailure();
641 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
642 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
644 options.overrideIndexBitwidth(indexBitwidth);
646 if (useBarePtrCallConv) {
647 options.useBarePtrCallConv =
true;
656 "bare pointer calling convention requires all memrefs to "
657 "have static shape and use the identity map");
658 return signalPassFailure();
678 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
679 allowedDialects.end());
681 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
683 if (!allowedDialectsSet.empty() && !allowed)
686 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
692 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
693 << dialect->getNamespace();
694 return signalPassFailure();
699 iface->populateConvertToLLVMConversionPatterns(
target, converter,
708 if (failed(applyPartialConversion(m,
target, std::move(llvmPatterns))))
710 auto *rocdlDialect =
getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
711 auto reqdWorkGroupSizeAttrHelper =
712 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
713 auto flatWorkGroupSizeAttrHelper =
714 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
717 m.walk([&](LLVM::LLVMFuncOp op) {
718 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
719 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
722 uint32_t flatSize = 1;
723 for (uint32_t size : blockSizes.asArrayRef()) {
726 StringAttr flatSizeAttr =
727 StringAttr::get(ctx, Twine(flatSize) +
"," + Twine(flatSize));
728 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
737 target.addIllegalOp<func::FuncOp>();
738 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
739 target.addLegalDialect<ROCDL::ROCDLDialect>();
740 target.addIllegalDialect<gpu::GPUDialect>();
741 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
742 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
743 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
745 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](
Operation *op) {
746 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
749 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
760 populateWithGenerated(patterns);
763 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
764 converter, IndexKind::Block, IntrType::Id);
766 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
767 converter, IndexKind::Grid, IntrType::Id);
768 patterns.
add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
770 patterns.
add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
775 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
776 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
777 rocdlDialect->getKernelAttrHelper().getName(),
778 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
782 }
else if (Runtime::OpenCL ==
runtime) {
789 patterns.
add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
790 GPUSubgroupBroadcastOpToROCDL, GPUBallotOpToROCDL>(converter);
791 patterns.
add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
792 GPUBarrierOpLowering>(converter, chipset);
static Value getLaneId(RewriterBase &rewriter, Location loc)
static constexpr int64_t kMaxThreadsPerBlockDim
Maximum number of threads per block dimension on AMD GPUs.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getKnownOrOcklDim(RewriterBase &rewriter, gpu::index_lowering::IndexKind indexKind, gpu::Dimension dim, Operation *contextOp, std::optional< uint32_t > opUpperBound)
Emits a call to an OCKL block/grid size function corresponding to indexKind with argument dim,...
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
Attributes are known-constant values of operations.
IntegerAttr getI64IntegerAttr(int64_t value)
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
MLIRContext * getContext() const
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
A trait used to provide symbol table functionalities to a region operation.
Operation is the basic unit of execution within MLIR.
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Location getLoc()
The source location the operation was defined or derived from.
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
MLIRContext * getContext()
Return the context this operation is associated with.
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
bool isInteger() const
Return true if this is an integer type (with the specified width).
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
LogicalResult decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType, SmallVectorImpl< Value > &result, bool permitVariablySizedScalars=false)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Runtime
Potential runtimes for AMD GPU kernels.
gpu::DimensionKind IndexKind
LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim, std::optional< uint32_t > opUpperBound, IndexKind indexKind, IntrType intrType, unsigned bitWidth)
Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds are found.
std::optional< uint32_t > getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim)
Retrieve the constant bounds for a given dimension and dimension kind from the context surrounding op...
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region ®ion, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
void registerConvertToLLVMDependentDialectLoading(DialectRegistry ®istry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.