47#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
48#include "mlir/Conversion/Passes.h.inc"
60 auto indexBitwidthType =
63 if (indexBitwidth > intWidth) {
64 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
66 if (indexBitwidth < intWidth) {
67 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
75 bool canBeBare =
true;
76 for (
Type type :
func.getArgumentTypes())
77 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
83 auto int32Type = IntegerType::get(rewriter.
getContext(), 32);
87 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.
getUnitAttr());
89 LLVM::LLVMDialect::getRangeAttrName(),
90 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
93 LLVM::LLVMDialect::getRangeAttrName(),
94 LLVM::ConstantRangeAttr::get(rewriter.
getContext(), APInt::getZero(32),
96 Value mbcntLo = ROCDL::MbcntLoOp::create(
97 rewriter, loc, int32Type, minus1, zero, {},
100 Value laneId = ROCDL::MbcntHiOp::create(
101 rewriter, loc, int32Type, minus1, mbcntLo, {},
115 gpu::Dimension dim,
Operation *contextOp,
116 std::optional<uint32_t> opUpperBound) {
120 auto i32Ty = IntegerType::get(context, 32);
121 auto i64Ty = IntegerType::get(context, 64);
123 if (std::optional<uint32_t> knownDim =
125 return LLVM::ConstantOp::create(rewriter, loc,
128 int32_t dimParam =
static_cast<int32_t
>(dim);
130 StringRef functionName;
132 case gpu::index_lowering::IndexKind::Block:
133 functionName =
"__ockl_get_local_size";
135 case gpu::index_lowering::IndexKind::Grid:
136 functionName =
"__ockl_get_num_groups";
138 case gpu::index_lowering::IndexKind::Cluster:
139 case gpu::index_lowering::IndexKind::Other:
140 llvm_unreachable(
"Not valid index kinds for ockl lookup");
144 auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
146 LLVM::LLVMFuncOp funcOp =
150 Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
152 LLVM::CallOp::create(rewriter, loc, funcOp,
ValueRange{dimConst});
154 LLVM::ConstantRangeAttr range;
156 range = LLVM::ConstantRangeAttr::get(
157 context, APInt(64, 1),
158 APInt(64,
static_cast<uint64_t
>(*opUpperBound) + 1));
159 }
else if (indexKind == gpu::index_lowering::IndexKind::Block) {
161 range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
166 rewriter.
getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
168 return callOp.getResult();
172 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
173 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
175 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
176 "64-S32-A5-G1-ni:7:8:9";
182template <
typename OpTy>
189 matchAndRewrite(OpTy op,
typename OpTy::Adaptor adaptor,
190 ConversionPatternRewriter &rewriter)
const override {
193 std::optional<uint32_t> opUpperBound;
194 if (
auto bound = op.getUpperBound())
195 opUpperBound =
static_cast<uint32_t
>(bound->getZExtValue());
200 *this->getTypeConverter());
201 rewriter.replaceOp(op,
result);
213 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
214 ConversionPatternRewriter &rewriter)
const override {
227 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
228 if (indexBitwidth > 32) {
229 laneId = LLVM::SExtOp::create(
230 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
231 }
else if (indexBitwidth < 32) {
232 laneId = LLVM::TruncOp::create(
233 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
235 rewriter.replaceOp(op, {laneId});
249 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
250 ConversionPatternRewriter &rewriter)
const override {
251 LLVM::ConstantRangeAttr bounds =
nullptr;
253 if (
auto upperBoundAttr = op.getUpperBoundAttr()) {
254 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
255 32, isBeforeGfx10 ? 64 : 32,
256 op.getUpperBoundAttr().getInt() + 1);
258 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
259 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
261 *getTypeConverter());
262 rewriter.replaceOp(op, {wavefrontOp});
278 matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
279 ConversionPatternRewriter &rewriter)
const override {
281 auto int32Type = rewriter.getI32Type();
286 LLVM::ConstantRangeAttr bounds;
287 if (
auto upperBoundAttr = op.getUpperBoundAttr())
288 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
290 upperBoundAttr.getInt());
291 subgroupId = ROCDL::WaveId::create(rewriter, loc, int32Type, bounds);
296 auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
297 auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
298 auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
299 auto setBoundFromContext = [&](
Operation *tidOp, gpu::Dimension dim) {
300 if (LLVM::ConstantRangeAttr range =
302 op, dim, std::nullopt,
303 gpu::index_lowering::IndexKind::Block,
305 tidOp->
setAttr(
"range", range);
307 setBoundFromContext(tidX, gpu::Dimension::x);
308 setBoundFromContext(tidY, gpu::Dimension::y);
309 setBoundFromContext(tidZ, gpu::Dimension::z);
312 LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
314 auto getBlockDim = [&](gpu::Dimension dim) {
317 dim, op, std::nullopt);
319 LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
322 Value dimX = getBlockDim(gpu::Dimension::x);
323 Value dimY = getBlockDim(gpu::Dimension::y);
328 LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
329 Value tidYPlusDimYxTidZ =
330 LLVM::AddOp::create(rewriter, loc, int32Type, tidY, dimYxTidZ, flags);
331 Value dimXxInner = LLVM::MulOp::create(rewriter, loc, int32Type, dimX,
332 tidYPlusDimYxTidZ, flags);
333 Value linearized = LLVM::AddOp::create(rewriter, loc, int32Type, tidX,
337 ROCDL::WavefrontSizeOp::create(rewriter, loc, int32Type);
338 subgroupId = LLVM::UDivOp::create(rewriter, loc, int32Type, linearized,
344 rewriter.replaceOp(op, subgroupId);
351static bool isSupportedReadLaneType(
Type type) {
353 if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
354 LLVM::LLVMPointerType>(type))
357 if (
auto intType = dyn_cast<IntegerType>(type))
358 return llvm::is_contained({16, 32, 64},
359 static_cast<int>(intType.getWidth()));
361 if (
auto vecType = dyn_cast<VectorType>(type)) {
362 Type elementType = vecType.getElementType();
366 if (vecType.getNumElements() == 2 &&
367 (isa<Float16Type, BFloat16Type>(elementType) ||
375struct GPUSubgroupBroadcastOpToROCDL
380 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
381 ConversionPatternRewriter &rewriter)
const override {
382 Value src = adaptor.getSrc();
383 if (isSupportedReadLaneType(src.
getType())) {
384 Value result = createReadlaneOp(op, adaptor, rewriter, src);
385 rewriter.replaceOp(op,
result);
389 Type i32 = rewriter.getI32Type();
394 return rewriter.notifyMatchFailure(op,
395 "Unexpected decomposition failure");
398 results.reserve(decomposed.size());
399 for (
Value v : decomposed)
400 results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
403 rewriter.replaceOp(op,
result);
408 static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
409 ConversionPatternRewriter &rewriter,
411 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
412 return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.
getType(),
413 src, adaptor.getLane());
415 return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
425 matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
426 ConversionPatternRewriter &rewriter)
const override {
427 auto intType = cast<IntegerType>(op.getType());
428 unsigned width = intType.getWidth();
432 if (width != 32 && width != 64)
433 return rewriter.notifyMatchFailure(
434 op,
"rocdl.ballot only supports i32 and i64 result types");
436 rewriter.replaceOpWithNewOp<ROCDL::BallotOp>(op, op.getType(),
437 adaptor.getPredicate());
462 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
463 ConversionPatternRewriter &rewriter)
const override {
465 Value initShflValue = adaptor.getValue();
469 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
470 Value width = adaptor.getWidth();
471 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
472 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
473 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
474 Value widthOrZeroIfOutside =
475 LLVM::AndOp::create(rewriter, loc, int32Type,
add, negwidth);
478 switch (op.getMode()) {
479 case gpu::ShuffleMode::UP:
480 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
481 adaptor.getOffset());
483 case gpu::ShuffleMode::DOWN:
484 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
485 adaptor.getOffset());
487 case gpu::ShuffleMode::XOR:
488 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
489 adaptor.getOffset());
491 case gpu::ShuffleMode::IDX:
492 dstLane = adaptor.getOffset();
495 Value isActiveSrcLane = LLVM::ICmpOp::create(
496 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
497 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
499 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
500 Value dwordAlignedDstLane =
501 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
506 return rewriter.notifyMatchFailure(op,
507 "failed to decompose value to i32");
509 for (
Value v : decomposed) {
510 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
511 dwordAlignedDstLane, v);
512 swizzled.emplace_back(res);
516 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
523static void emitFences(std::optional<ArrayAttr> addrSpaces,
524 ConversionPatternRewriter &rewriter,
Location loc,
525 StringRef scope,
bool before) {
526 bool fenceGlobal =
false;
527 bool fenceLDS =
false;
530 for (
auto spaceAttr : addrSpaces->getAsRange<gpu::AddressSpaceAttr>()) {
531 switch (spaceAttr.getValue()) {
532 case gpu::AddressSpace::Global:
535 case gpu::AddressSpace::Workgroup:
538 case gpu::AddressSpace::Private:
539 case gpu::AddressSpace::Constant:
548 if (!fenceGlobal && !fenceLDS)
552 if (fenceLDS && !fenceGlobal)
554 rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
"local");
555 else if (fenceGlobal && !fenceLDS)
557 rewriter.getAttr<LLVM::MMRATagAttr>(
"amdgpu-synchronize-as",
"global");
560 before ? LLVM::AtomicOrdering::release : LLVM::AtomicOrdering::acquire;
561 auto fence = LLVM::FenceOp::create(rewriter, loc, ordering, scope);
563 fence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
566static constexpr int32_t kWholeClusterBarrierId = -3;
567static constexpr int32_t kWholeWorkgroupBarrierId = -1;
576 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
577 ConversionPatternRewriter &rewriter)
const override {
579 gpu::BarrierScope scope = op.getScope();
582 if (scope == gpu::BarrierScope::Subgroup) {
583 emitFences(op.getAddressSpaces(), rewriter, loc,
"wavefront",
585 ROCDL::WaveBarrierOp::create(rewriter, loc);
586 emitFences(op.getAddressSpaces(), rewriter, loc,
"wavefront",
588 rewriter.eraseOp(op);
593 if (scope == gpu::BarrierScope::Cluster) {
595 return op.emitOpError(
"cluster scope barriers require gfx1250+");
596 emitFences(op.getAddressSpaces(), rewriter, loc,
"cluster",
598 ROCDL::BarrierSignalOp::create(rewriter, loc, kWholeClusterBarrierId);
599 ROCDL::BarrierWaitOp::create(
600 rewriter, loc,
static_cast<int16_t
>(kWholeClusterBarrierId));
601 emitFences(op.getAddressSpaces(), rewriter, loc,
"cluster",
603 rewriter.eraseOp(op);
608 assert(scope == gpu::BarrierScope::Workgroup &&
"unsupported scope");
611 if (
Value namedBarrier = adaptor.getNamedBarrier()) {
613 return op.emitOpError(
"named barriers require gfx12+");
615 emitFences(op.getAddressSpaces(), rewriter, loc,
"workgroup",
618 ROCDL::BarrierJoinOp::create(rewriter, loc, namedBarrier);
620 ROCDL::BarrierSignalVarOp::create(rewriter, loc, namedBarrier,
624 ROCDL::BarrierWaitOp::create(rewriter, loc,
static_cast<int16_t
>(1));
625 emitFences(op.getAddressSpaces(), rewriter, loc,
"workgroup",
627 rewriter.eraseOp(op);
632 emitFences(op.getAddressSpaces(), rewriter, loc,
"workgroup",
635 ROCDL::SBarrierOp::create(rewriter, loc);
637 ROCDL::BarrierSignalOp::create(rewriter, loc, kWholeWorkgroupBarrierId);
638 ROCDL::BarrierWaitOp::create(
639 rewriter, loc,
static_cast<int16_t
>(kWholeWorkgroupBarrierId));
641 emitFences(op.getAddressSpaces(), rewriter, loc,
"workgroup",
643 rewriter.eraseOp(op);
648struct GPUInitializeNamedBarrierOpLowering final
658 matchAndRewrite(gpu::InitializeNamedBarrierOp op,
659 gpu::InitializeNamedBarrierOp::Adaptor adaptor,
660 ConversionPatternRewriter &rewriter)
const override {
662 return op.emitOpError(
"named barriers require gfx12+");
667 IntegerAttr countAttr;
669 return op.emitOpError(
670 "named barrier member count must be a constant for ROCDL lowering");
671 int32_t count = countAttr.getInt();
675 auto funcOp = op->getParentOfType<FunctionOpInterface>();
677 return op.emitOpError(
"must be inside a function-like op");
681 return op.emitOpError(
682 "enclosing function-like op must have a symbol-table parent");
684 auto targetTy = LLVM::LLVMTargetExtType::get(
685 rewriter.getContext(),
"amdgcn.named.barrier", {}, {0});
686 auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 3);
690 OpBuilder detachedBuilder(rewriter.getContext());
691 auto globalOp = LLVM::GlobalOp::create(
692 detachedBuilder, loc, targetTy,
false,
693 LLVM::Linkage::Internal,
"__named_barrier",
Attribute(),
697 Region &region = globalOp.getInitializerRegion();
700 auto poison = LLVM::PoisonOp::create(detachedBuilder, loc, targetTy);
701 LLVM::ReturnOp::create(detachedBuilder, loc, poison);
708 rewriter.setInsertionPoint(op);
709 auto addrOf = LLVM::AddressOfOp::create(rewriter, loc, ptrTy, globalName);
712 ROCDL::BarrierInitOp::create(rewriter, loc, addrOf, count);
714 rewriter.replaceOp(op, addrOf.getResult());
720#include "GPUToROCDL.cpp.inc"
727struct LowerGpuOpsToROCDLOpsPass final
728 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
732 Base::getDependentDialects(registry);
736 void runOnOperation()
override {
737 gpu::GPUModuleOp m = getOperation();
740 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
741 LLVM::LLVMDialect::getDataLayoutAttrName());
742 if (!llvmDataLayout) {
744 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
747 for (
auto func : m.getOps<func::FuncOp>()) {
748 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
753 if (failed(maybeChipset)) {
754 emitError(UnknownLoc::get(ctx),
"Invalid chipset name: " + chipset);
755 return signalPassFailure();
760 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
761 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
763 options.overrideIndexBitwidth(indexBitwidth);
765 if (useBarePtrCallConv) {
766 options.useBarePtrCallConv =
true;
775 "bare pointer calling convention requires all memrefs to "
776 "have static shape and use the identity map");
777 return signalPassFailure();
797 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
798 allowedDialects.end());
800 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
802 if (!allowedDialectsSet.empty() && !allowed)
805 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
811 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
812 << dialect->getNamespace();
813 return signalPassFailure();
818 iface->populateConvertToLLVMConversionPatterns(
target, converter,
827 if (failed(applyPartialConversion(m,
target, std::move(llvmPatterns))))
829 auto *rocdlDialect =
getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
830 auto reqdWorkGroupSizeAttrHelper =
831 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
832 auto flatWorkGroupSizeAttrHelper =
833 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
836 m.walk([&](LLVM::LLVMFuncOp op) {
837 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
838 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
841 uint32_t flatSize = 1;
842 for (uint32_t size : blockSizes.asArrayRef()) {
845 StringAttr flatSizeAttr =
846 StringAttr::get(ctx, Twine(flatSize) +
"," + Twine(flatSize));
847 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
856 target.addIllegalOp<func::FuncOp>();
857 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
858 target.addLegalDialect<ROCDL::ROCDLDialect>();
859 target.addIllegalDialect<gpu::GPUDialect>();
860 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
861 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
862 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
864 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](
Operation *op) {
865 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
868 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
879 populateWithGenerated(patterns);
882 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
883 converter, IndexKind::Block, IntrType::Id);
885 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
886 converter, IndexKind::Grid, IntrType::Id);
887 patterns.
add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
889 patterns.
add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
894 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
895 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
896 rocdlDialect->getKernelAttrHelper().getName(),
897 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
901 }
else if (Runtime::OpenCL ==
runtime) {
908 patterns.
add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
909 GPUSubgroupBroadcastOpToROCDL, GPUBallotOpToROCDL>(converter);
910 patterns.
add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
911 GPUBarrierOpLowering, GPUInitializeNamedBarrierOpLowering>(
static Value getLaneId(RewriterBase &rewriter, Location loc)
static constexpr int64_t kMaxThreadsPerBlockDim
Maximum number of threads per block dimension on AMD GPUs.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getKnownOrOcklDim(RewriterBase &rewriter, gpu::index_lowering::IndexKind indexKind, gpu::Dimension dim, Operation *contextOp, std::optional< uint32_t > opUpperBound)
Emits a call to an OCKL block/grid size function corresponding to indexKind with argument dim,...
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
IntegerAttr getI64IntegerAttr(int64_t value)
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
MLIRContext * getContext() const
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
This class helps build Operations.
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes={}, ArrayRef< Location > locs={})
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
A trait used to provide symbol table functionalities to a region operation.
Operation is the basic unit of execution within MLIR.
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Location getLoc()
The source location the operation was defined or derived from.
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
MLIRContext * getContext()
Return the context this operation is associated with.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
StringAttr insert(Operation *symbol, Block::iterator insertPt={})
Insert a new symbol into the table, and rename it as necessary to avoid collisions.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
bool isInteger() const
Return true if this is an integer type (with the specified width).
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
LogicalResult decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType, SmallVectorImpl< Value > &result, bool permitVariablySizedScalars=false)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Runtime
Potential runtimes for AMD GPU kernels.
gpu::DimensionKind IndexKind
LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim, std::optional< uint32_t > opUpperBound, IndexKind indexKind, IntrType intrType, unsigned bitWidth)
Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds are found.
std::optional< uint32_t > getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim)
Retrieve the constant bounds for a given dimension and dimension kind from the context surrounding op...
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.