44#define GEN_PASS_DEF_CONVERTGPUOPSTONVVMOPS
45#include "mlir/Conversion/Passes.h.inc"
// Maps a gpu::ShuffleMode onto the NVVM::ShflKind used when building the NVVM
// shuffle op: XOR -> bfly, UP -> up, DOWN -> down, IDX -> idx.
// NOTE(review): the enclosing `switch` header and the closing braces are not
// visible in this listing; confirm against the complete file.
53static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
55 case gpu::ShuffleMode::XOR:
56 return NVVM::ShflKind::bfly;
57 case gpu::ShuffleMode::UP:
58 return NVVM::ShflKind::up;
59 case gpu::ShuffleMode::DOWN:
60 return NVVM::ShflKind::down;
61 case gpu::ShuffleMode::IDX:
62 return NVVM::ShflKind::idx;
// Every case listed above returns, so control only gets here for a mode value
// that is not one of those enumerators.
64 llvm_unreachable(
"unknown shuffle mode");
// Translates a gpu::AllReduceOperation into the NVVM::ReductionKind consumed
// by the redux lowering. The result is optional; the caller
// (GPUSubgroupReduceOpLowering) reports a match failure when it is empty.
// NOTE(review): the statements following the MUL, MINUI, MAXUI, MINIMUMF and
// MAXIMUMF labels are not visible in this listing. Do not assume those labels
// share the result of the next visible case; confirm against the full file.
67static std::optional<NVVM::ReductionKind>
68convertToNVVMReductionKind(gpu::AllReduceOperation mode) {
70 case gpu::AllReduceOperation::ADD:
71 return NVVM::ReductionKind::ADD;
72 case gpu::AllReduceOperation::MUL:
74 case gpu::AllReduceOperation::MINSI:
75 return NVVM::ReductionKind::MIN;
76 case gpu::AllReduceOperation::MINUI:
78 case gpu::AllReduceOperation::MINNUMF:
79 return NVVM::ReductionKind::MIN;
80 case gpu::AllReduceOperation::MAXSI:
81 return NVVM::ReductionKind::MAX;
82 case gpu::AllReduceOperation::MAXUI:
84 case gpu::AllReduceOperation::MAXNUMF:
85 return NVVM::ReductionKind::MAX;
// The bitwise reductions map one-to-one onto the NVVM kinds of the same name.
86 case gpu::AllReduceOperation::AND:
87 return NVVM::ReductionKind::AND;
88 case gpu::AllReduceOperation::OR:
89 return NVVM::ReductionKind::OR;
90 case gpu::AllReduceOperation::XOR:
91 return NVVM::ReductionKind::XOR;
92 case gpu::AllReduceOperation::MINIMUMF:
93 case gpu::AllReduceOperation::MAXIMUMF:
// Symbol-name prefix of the globals that hold named-barrier ids. It is used
// both to name a newly created global and to recognise existing ones when
// counting them (see createNVVMNamedBarrierIdGlobal).
99static constexpr llvm::StringLiteral kNVVMNamedBarrierIdPrefix =
100 "__named_barrier_id";
// Inclusive range of ids handed out to named barriers. Ids are assigned as
// kNVVMFirstNamedBarrierId + (number of existing barrier globals), and an id
// above kNVVMLastNamedBarrierId is rejected with an op error.
101static constexpr int32_t kNVVMFirstNamedBarrierId = 1;
102static constexpr int32_t kNVVMLastNamedBarrierId = 15;
// Multiplier applied to a named barrier's member count to obtain the thread
// count passed to the barrier (see GPUInitializeNamedBarrierOpToNVVMLowering).
103static constexpr int32_t kNVVMWarpSize = 32;
// Assigns a named-barrier id for `op` and materialises it as an i32
// LLVM::GlobalOp with internal linkage. Returns FailureOr<StringAttr>; the
// caller passes the StringAttr to LLVM::AddressOfOp, so it identifies the
// created global.
// NOTE(review): several statements (the null checks, the symbol-table lookup,
// the loop over existing globals, and everything after the GlobalOp creation)
// are not visible in this listing.
105static FailureOr<StringAttr>
106createNVVMNamedBarrierIdGlobal(gpu::InitializeNamedBarrierOp op,
107 ConversionPatternRewriter &rewriter) {
// The op must be nested in a function-like op; otherwise an error is emitted
// on the op.
108 auto funcOp = op->getParentOfType<FunctionOpInterface>();
110 op.emitOpError(
"must be inside a function-like op");
// A symbol-table op is required as well; its lookup is not visible here.
114 if (!symbolTableOp) {
116 "enclosing function-like op must have a symbol-table parent");
// Count the globals whose symbol name starts with the named-barrier prefix.
120 int32_t numNamedBarriers = 0;
123 if (globalOp.getSymName().starts_with(kNVVMNamedBarrierIdPrefix))
// The next id is the first id offset by the number of barriers already
// present; ids past the last supported one are rejected.
126 int32_t barrierId = kNVVMFirstNamedBarrierId + numNamedBarriers;
127 if (barrierId > kNVVMLastNamedBarrierId) {
128 op.emitOpError(
"NVVM supports at most 15 named barriers per CTA");
// The global is built with a builder that has no insertion point.
// NOTE(review): presumably it is inserted through the symbol table afterwards
// so the name gets uniqued — confirm in the full file.
132 OpBuilder detachedBuilder(rewriter.getContext());
133 Type i32 = rewriter.getI32Type();
134 auto globalOp = LLVM::GlobalOp::create(
135 detachedBuilder, op.getLoc(), i32,
true,
136 LLVM::Linkage::Internal, kNVVMNamedBarrierIdPrefix,
137 rewriter.getI32IntegerAttr(barrierId), 0,
// Conversion pattern that lowers gpu::SubgroupReduceOp to NVVM::ReduxOp.
// It only matches non-clustered, uniform reductions over 32-bit integers
// whose reduction operation has an NVVM::ReductionKind equivalent.
144struct GPUSubgroupReduceOpLowering
146 using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;
149 matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
150 ConversionPatternRewriter &rewriter)
const override {
// Clustered reductions are rejected.
151 if (op.getClusterSize())
152 return rewriter.notifyMatchFailure(
153 op,
"lowering for clustered reduce not implemented");
// The op must be marked uniform.
155 if (!op.getUniform())
156 return rewriter.notifyMatchFailure(
157 op,
"cannot be lowered to redux as the op must be run "
158 "uniformly (entire subgroup).");
// Only i32 operands are handled.
159 if (!op.getValue().getType().isInteger(32))
160 return rewriter.notifyMatchFailure(op,
"unsupported data type");
// Bail out when the reduction operation has no NVVM reduction kind.
162 std::optional<NVVM::ReductionKind> mode =
163 convertToNVVMReductionKind(op.getOp());
164 if (!mode.has_value())
165 return rewriter.notifyMatchFailure(
166 op,
"unsupported reduction mode for redux");
168 Location loc = op->getLoc();
169 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
// i32 constant -1 passed as the last ReduxOp operand.
// NOTE(review): presumably the all-lanes member mask — confirm.
170 Value offset = LLVM::ConstantOp::create(rewriter, loc, int32Type, -1);
172 auto reduxOp = NVVM::ReduxOp::create(rewriter, loc, int32Type,
173 op.getValue(), mode.value(), offset);
// The original op is replaced by the single result of the redux op.
175 rewriter.replaceOp(op, reduxOp->getResult(0));
// Conversion pattern that lowers gpu::ShuffleOp to NVVM::ShflOp.
// NOTE(review): the struct header and several control-flow lines (the `else`
// branches and the guards around the predicate handling) are not visible in
// this listing.
181 using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
202 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
203 ConversionPatternRewriter &rewriter)
const override {
204 Location loc = op->getLoc();
206 auto valueTy = adaptor.getValue().getType();
207 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
208 auto predTy = IntegerType::get(rewriter.getContext(), 1);
210 Value one = LLVM::ConstantOp::create(rewriter, loc, int32Type, 1);
211 Value minusOne = LLVM::ConstantOp::create(rewriter, loc, int32Type, -1);
212 Value thirtyTwo = LLVM::ConstantOp::create(rewriter, loc, int32Type, 32);
// numLeadInactiveLane = 32 - width.
213 Value numLeadInactiveLane = LLVM::SubOp::create(
214 rewriter, loc, int32Type, thirtyTwo, adaptor.getWidth());
// activeMask = (-1) >> numLeadInactiveLane, using a logical shift right.
216 Value activeMask = LLVM::LShrOp::create(rewriter, loc, int32Type, minusOne,
217 numLeadInactiveLane);
// For UP shuffles the mask-and-clamp operand is numLeadInactiveLane.
219 if (op.getMode() == gpu::ShuffleMode::UP) {
221 maskAndClamp = numLeadInactiveLane;
// Otherwise it is width - 1 (the `else` line itself is not visible here).
224 maskAndClamp = LLVM::SubOp::create(rewriter, loc, int32Type,
225 adaptor.getWidth(), one);
// Track whether the second result of the gpu op has any users.
228 bool predIsUsed = !op->getResult(1).use_empty();
229 UnitAttr returnValueAndIsValidAttr =
nullptr;
230 Type resultTy = valueTy;
// NOTE(review): the guard for the next two assignments is not visible;
// presumably they only run when predIsUsed is true — confirm.
232 returnValueAndIsValidAttr = rewriter.getUnitAttr();
233 resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
236 Value shfl = NVVM::ShflOp::create(
237 rewriter, loc, resultTy, activeMask, adaptor.getValue(),
238 adaptor.getOffset(), maskAndClamp, convertShflKind(op.getMode()),
239 returnValueAndIsValidAttr);
// Struct result: field 0 is the shuffled value and field 1 the validity flag;
// both replace the results of the original op.
241 Value shflValue = LLVM::ExtractValueOp::create(rewriter, loc, shfl, 0);
242 Value isActiveSrcLane =
243 LLVM::ExtractValueOp::create(rewriter, loc, shfl, 1);
244 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
// Alternative replacement: only the value result is provided and the second
// result is given a null replacement.
246 rewriter.replaceOp(op, {shfl,
nullptr});
// Conversion pattern that lowers gpu::LaneIdOp to NVVM::LaneIdOp and adapts
// the i32 result to the type converter's index bitwidth.
// NOTE(review): the struct header, the `else` of the bounds selection and the
// declaration of `newOp` are not visible in this listing.
253 using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
256 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
257 ConversionPatternRewriter &rewriter)
const override {
258 auto loc = op->getLoc();
259 MLIRContext *context = rewriter.getContext();
260 LLVM::ConstantRangeAttr bounds =
nullptr;
// When the op carries an upper bound, build a 32-bit constant range from 0 to
// that bound.
261 if (std::optional<APInt> upperBound = op.getUpperBound())
262 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
263 32, 0, upperBound->getZExtValue());
// Fallback range; its arguments are not visible in this listing.
265 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
268 NVVM::LaneIdOp::create(rewriter, loc, rewriter.getI32Type(), bounds);
// Sign-extend when the index type is wider than 32 bits, truncate when it is
// narrower, and leave the value unchanged at exactly 32 bits.
271 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
272 if (indexBitwidth > 32) {
273 newOp = LLVM::SExtOp::create(
274 rewriter, loc, IntegerType::get(context, indexBitwidth), newOp);
275 }
else if (indexBitwidth < 32) {
276 newOp = LLVM::TruncOp::create(
277 rewriter, loc, IntegerType::get(context, indexBitwidth), newOp);
279 rewriter.replaceOp(op, {newOp});
// Conversion pattern that lowers gpu::BallotOp to an NVVM::VoteSyncOp of kind
// `ballot`. Only i32 and i64 result types are accepted.
// NOTE(review): the struct header and the guard around the zero-extension are
// not visible in this listing.
285 using ConvertOpToLLVMPattern<gpu::BallotOp>::ConvertOpToLLVMPattern;
288 matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
289 ConversionPatternRewriter &rewriter)
const override {
290 Location loc = op->getLoc();
291 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
292 auto intType = cast<IntegerType>(op.getType());
293 unsigned width = intType.getWidth();
// Reject every result width other than 32 or 64.
297 if (width != 32 && width != 64)
298 return rewriter.notifyMatchFailure(
299 op,
"nvvm.vote.sync ballot only supports i32 and i64 result types");
// i32 constant -1 (all bits set) used as the vote's mask operand.
302 Value mask = LLVM::ConstantOp::create(rewriter, loc, int32Type,
303 rewriter.getI32IntegerAttr(-1));
305 auto voteKind = NVVM::VoteSyncKindAttr::get(rewriter.getContext(),
306 NVVM::VoteSyncKind::ballot);
// The vote itself always produces an i32.
307 Value
result = NVVM::VoteSyncOp::create(rewriter, loc, int32Type, mask,
308 adaptor.getPredicate(), voteKind);
// Zero-extend the i32 vote result to the op's result type.
// NOTE(review): presumably only done for the 64-bit case — confirm.
311 result = LLVM::ZExtOp::create(rewriter, loc, op.getType(),
result);
313 rewriter.replaceOp(op,
result);
// Conversion pattern that lowers cf::AssertOp to a call to `__assertfail`.
// The assert is isolated in its own block that is skipped when the asserted
// condition holds; message, file name, function name and line number are
// passed to the callee.
// NOTE(review): a number of declarations and closing braces are not visible
// in this listing.
319struct AssertOpToAssertfailLowering
321 using ConvertOpToLLVMPattern<cf::AssertOp>::ConvertOpToLLVMPattern;
324 matchAndRewrite(cf::AssertOp assertOp, cf::AssertOpAdaptor adaptor,
325 ConversionPatternRewriter &rewriter)
const override {
326 MLIRContext *ctx = rewriter.getContext();
327 Location loc = assertOp.getLoc();
328 Type i8Type = typeConverter->convertType(rewriter.getIntegerType(8));
329 Type i32Type = typeConverter->convertType(rewriter.getIntegerType(32));
330 Type i64Type = typeConverter->convertType(rewriter.getIntegerType(64));
331 Type ptrType = LLVM::LLVMPointerType::get(ctx);
332 Type voidType = LLVM::LLVMVoidType::get(ctx);
// Obtain the `__assertfail` function in the enclosing gpu.module with
// signature void(ptr, ptr, i32, ptr, i64), and tag it with the "noreturn"
// passthrough attribute.
335 auto moduleOp = assertOp->getParentOfType<gpu::GPUModuleOp>();
336 auto assertfailType = LLVM::LLVMFunctionType::get(
337 voidType, {ptrType, ptrType, i32Type, ptrType, i64Type});
339 moduleOp, loc, rewriter,
"__assertfail", assertfailType);
340 assertfailDecl.setPassthroughAttr(
341 ArrayAttr::get(ctx, StringAttr::get(ctx,
"noreturn")));
// Split the block twice so that the assert sits alone in `assertBlock`,
// between `beforeBlock` and `afterBlock`.
352 Block *beforeBlock = assertOp->getBlock();
354 rewriter.splitBlock(beforeBlock, assertOp->getIterator());
356 rewriter.splitBlock(assertBlock, ++assertOp->getIterator());
// beforeBlock ends with a conditional branch on the asserted value whose
// first listed destination is afterBlock (the other destination is not
// visible here); assertBlock ends with a branch to afterBlock.
357 rewriter.setInsertionPointToEnd(beforeBlock);
358 cf::CondBranchOp::create(rewriter, loc, adaptor.getArg(), afterBlock,
360 rewriter.setInsertionPointToEnd(assertBlock);
361 cf::BranchOp::create(rewriter, loc, afterBlock);
// The remaining ops are created right before the assert op.
364 rewriter.setInsertionPoint(assertOp);
// Defaults used when the location carries no file or name information.
368 StringRef fileName =
"(unknown)";
369 StringRef funcName =
"(unknown)";
370 int32_t fileLine = 0;
// Unwrap call-site locations down to the callee location.
371 while (
auto callSiteLoc = dyn_cast<CallSiteLoc>(loc))
372 loc = callSiteLoc.getCallee();
// A file/line location provides file name and start line directly; a name
// location provides the function name and, if its child is a file/line
// location, file name and start line as well.
373 if (
auto fileLineColLoc = dyn_cast<FileLineColRange>(loc)) {
374 fileName = fileLineColLoc.getFilename().strref();
375 fileLine = fileLineColLoc.getStartLine();
376 }
else if (
auto nameLoc = dyn_cast<NameLoc>(loc)) {
377 funcName = nameLoc.getName().strref();
378 if (
auto fileLineColLoc =
379 dyn_cast<FileLineColRange>(nameLoc.getChildLoc())) {
380 fileName = fileLineColLoc.getFilename().strref();
381 fileLine = fileLineColLoc.getStartLine();
// Helper: takes the address of a global (in the global's own address space)
// and builds a GEP with indices {0, 0} over the global's type.
386 auto getGlobal = [&](LLVM::GlobalOp global) {
388 Value globalPtr = LLVM::AddressOfOp::create(
389 rewriter, loc, LLVM::LLVMPointerType::get(ctx, global.getAddrSpace()),
390 global.getSymNameAttr());
392 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
393 globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
// Arguments for string constants: the assert message, the file name and the
// function name, each with its own name prefix.
397 rewriter, loc, moduleOp, i8Type,
"assert_message_", assertOp.getMsg()));
399 rewriter, loc, moduleOp, i8Type,
"assert_file_", fileName));
401 rewriter, loc, moduleOp, i8Type,
"assert_func_", funcName));
// Line number as i32 and an i64 constant 1.
// NOTE(review): the use of `c1` is not visible here; given the callee's
// signature it is presumably the trailing i64 argument — confirm.
403 LLVM::ConstantOp::create(rewriter, loc, i32Type, fileLine);
404 Value c1 = LLVM::ConstantOp::create(rewriter, loc, i64Type, 1);
// Replace the assert with the call to the `__assertfail` declaration.
407 SmallVector<Value> arguments{assertMessage, assertFile, assertLine,
409 rewriter.replaceOpWithNewOp<LLVM::CallOp>(assertOp, assertfailDecl,
// Conversion pattern that lowers gpu::BarrierOp to NVVM ops. A barrier with a
// named-barrier operand becomes an NVVM::BarrierOp carrying the barrier id
// and thread count; otherwise the lowering depends on the barrier scope.
// NOTE(review): the base-class clause, the `switch` header and some
// declarations are not visible in this listing.
415struct GPUBarrierOpToNVVMLowering final
420 matchAndRewrite(gpu::BarrierOp op, gpu::BarrierOp::Adaptor adaptor,
421 ConversionPatternRewriter &rewriter)
const override {
// Named barrier: field 0 of the converted operand is the barrier id and
// field 1 the number of threads. The NVVM barrier is created with an empty
// reduction attribute and a null value, and the gpu op is erased.
422 if (Value namedBarrier = adaptor.getNamedBarrier()) {
423 Location loc = op.getLoc();
425 LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 0);
426 Value numberOfThreads =
427 LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 1);
428 NVVM::BarrierOp::create(rewriter, loc, barrierId, numberOfThreads,
429 NVVM::BarrierReductionAttr{}, Value{});
430 rewriter.eraseOp(op);
434 gpu::BarrierScope scope = op.getScope();
// Workgroup scope: an NVVM::BarrierOp built without extra operands.
436 case gpu::BarrierScope::Workgroup:
437 rewriter.replaceOpWithNewOp<NVVM::BarrierOp>(op);
// Subgroup scope: NVVM::SyncWarpOp with an i32 mask constant of 0xFFFFFFFF.
439 case gpu::BarrierScope::Subgroup: {
442 LLVM::ConstantOp::create(rewriter, op.getLoc(), rewriter.getI32Type(),
443 rewriter.getI32IntegerAttr(0xFFFFFFFF));
444 rewriter.replaceOpWithNewOp<NVVM::SyncWarpOp>(op, mask);
// Any other scope is reported as a match failure.
448 return rewriter.notifyMatchFailure(
449 op,
"unsupported scope for NVVM barrier lowering");
// Conversion pattern that lowers gpu::InitializeNamedBarrierOp to a struct
// value holding a barrier id (loaded from a freshly created global) and a
// thread count (member count multiplied by kNVVMWarpSize).
// NOTE(review): the base-class clause and a few declarations (`barrierId`,
// `namedBarrier`, the two position constants) are not visible in this listing.
454struct GPUInitializeNamedBarrierOpToNVVMLowering final
459 matchAndRewrite(gpu::InitializeNamedBarrierOp op,
460 gpu::InitializeNamedBarrierOp::Adaptor adaptor,
461 ConversionPatternRewriter &rewriter)
const override {
462 Location loc = op.getLoc();
463 MLIRContext *ctx = rewriter.getContext();
464 Type i32 = rewriter.getI32Type();
// The result type must be convertible by the type converter.
465 Type namedBarrierType =
466 getTypeConverter()->convertType(op.getResult().getType());
467 if (!namedBarrierType)
468 return rewriter.notifyMatchFailure(op,
"failed to convert result type");
// Create the global that stores this barrier's id; the handling of the
// failure case is not visible here.
470 FailureOr<StringAttr> maybeGlobalName =
471 createNVVMNamedBarrierIdGlobal(op, rewriter);
472 if (
failed(maybeGlobalName))
// Take the address of the global and load its i32 contents.
475 auto addressOf = LLVM::AddressOfOp::create(
476 rewriter, loc, LLVM::LLVMPointerType::get(ctx), *maybeGlobalName);
478 LLVM::LoadOp::create(rewriter, loc, i32, addressOf.getResult());
// numberOfThreads = memberCount * kNVVMWarpSize.
480 Value warpSize = LLVM::ConstantOp::create(
481 rewriter, loc, i32, rewriter.getI32IntegerAttr(kNVVMWarpSize));
482 Value numberOfThreads =
483 LLVM::MulOp::create(rewriter, loc, adaptor.getMemberCount(), warpSize);
// Start from a poison value of the converted type and insert both fields.
486 LLVM::PoisonOp::create(rewriter, loc, namedBarrierType);
489 namedBarrier = LLVM::InsertValueOp::create(rewriter, loc, namedBarrier,
490 barrierId, barrierIdPos);
491 namedBarrier = LLVM::InsertValueOp::create(
492 rewriter, loc, namedBarrier, numberOfThreads, numberOfThreadsPos);
493 rewriter.replaceOp(op, namedBarrier);
// Pass derived from the generated ConvertGpuOpsToNVVMOpsBase; its
// runOnOperation works on a gpu::GPUModuleOp.
503struct LowerGpuOpsToNVVMOpsPass final
504 :
public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
// Registers the dialects this pass depends on, starting with those declared
// by the generated base class. The parameter is a reference named `registry`;
// the previous text had the `&reg` prefix collapsed into the single character
// '®', which left `registry` below undeclared.
// NOTE(review): the remainder of this method is not visible in this listing.
507 void getDependentDialects(DialectRegistry &registry)
const override {
508 Base::getDependentDialects(registry);
// Runs the lowering on the gpu.module this pass is scheduled on: sets up the
// lowering options and type converter, gathers conversion patterns from the
// loaded dialects, and applies a partial conversion.
// NOTE(review): many statements (option construction, the first pattern
// application, `continue`s, the conversion target setup) are not visible in
// this listing.
512 void runOnOperation()
override {
513 gpu::GPUModuleOp m = getOperation();
// Set the emit-C-wrapper attribute on every func.func in the module; the
// attribute value is not visible here.
516 for (
auto func : m.getOps<func::FuncOp>()) {
517 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
// Lowering options take the module's data layout; the index bitwidth and
// bare-pointer calling convention come from the pass options.
524 DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
526 options.overrideIndexBitwidth(indexBitwidth);
527 options.useBarePtrCallConv = useBarePtrCallConv;
// First pattern set; the pass fails if the step it feeds fails (that step is
// not visible here).
533 RewritePatternSet patterns(m.getContext());
537 vector::populateVectorFromElementsUnrollPatterns(patterns);
539 return signalPassFailure();
542 LLVMTypeConverter converter(m.getContext(),
options);
544 RewritePatternSet llvmPatterns(m.getContext());
// Visit every loaded dialect. The math dialect gets special treatment and,
// when the allow-list is non-empty, dialects outside it are filtered out
// (the statements guarded by these two `if`s are not visible).
551 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
552 allowedDialects.end());
553 for (Dialect *dialect :
getContext().getLoadedDialects()) {
555 if (isa<math::MathDialect>(dialect))
558 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
560 if (!allowedDialectsSet.empty() && !allowed)
// Dialects contribute patterns through ConvertToLLVMPatternInterface. A
// diagnostic naming the dialect is emitted and the pass fails when the
// interface is missing; the exact condition is not visible here.
563 auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
569 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
570 << dialect->getNamespace();
571 return signalPassFailure();
576 iface->populateConvertToLLVMConversionPatterns(
target, converter,
// Apply the partial conversion, honouring the pass's pattern-rollback option.
584 ConversionConfig config;
585 config.allowPatternRollback = allowPatternRollback;
587 applyPartialConversion(m,
target, std::move(llvmPatterns), config)))
// Legality setup for the conversion target.
// NOTE(review): the enclosing function header is not visible in this listing;
// presumably this is the body of configureGpuToNVVMConversionLegality.
// func.func and cf.assert are illegal, and so is the whole GPU dialect; the
// LLVM and NVVM dialects are legal.
595 target.addIllegalOp<func::FuncOp>();
596 target.addIllegalOp<cf::AssertOp>();
597 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
598 target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
599 target.addIllegalDialect<gpu::GPUDialect>();
// These LLVM math ops are marked illegal even though the LLVM dialect as a
// whole is legal.
600 target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
601 LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp,
602 LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp,
603 LLVM::RoundEvenOp, LLVM::RoundOp, LLVM::SinOp,
604 LLVM::SincosOp, LLVM::SqrtOp>();
// gpu.yield and gpu.module stay legal despite the GPU dialect being illegal.
607 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
// Returns a literal LLVM struct type with two i32 fields, built in the
// context of `type`.
// NOTE(review): the surrounding conversion callback is not visible in this
// listing; presumably this converts the GPU named-barrier type, whose lowered
// value carries a barrier id and a thread count — confirm.
614 Type i32 = IntegerType::get(type.getContext(), 32);
615 return LLVM::LLVMStructType::getLiteral(type.getContext(), {i32, i32});
// Registers GPUSubgroupReduceOpLowering with the given converter and benefit.
// NOTE(review): the enclosing function header is not visible in this listing.
627 patterns.
add<GPUSubgroupReduceOpLowering>(converter, benefit);
// Registers the GPU-to-NVVM conversion patterns.
// NOTE(review): the enclosing function header and the first line of most
// `patterns.add<...>` calls are not visible in this listing, so only the
// trailing template arguments and constructor arguments appear below.
// Barrier-related patterns.
636 patterns.
add<GPUBarrierOpToNVVMLowering,
637 GPUInitializeNamedBarrierOpToNVVMLowering,
// Index/dimension lowerings. Each registration names the NVVM X/Y/Z ops and
// passes an IndexKind plus IntrType::Id or IntrType::Dim.
// Thread id and block dim use IndexKind::Block.
642 NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>(
643 converter, IndexKind::Block, IntrType::Id, benefit);
646 NVVM::BlockDimYOp, NVVM::BlockDimZOp>>(
647 converter, IndexKind::Block, IntrType::Dim, benefit);
// Cluster id and cluster dim use IndexKind::Other.
650 NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>(
651 converter, IndexKind::Other, IntrType::Id, benefit);
653 gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
654 NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim,
// Block-in-cluster id and cluster dim in blocks use IndexKind::Cluster.
657 gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
658 NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
659 converter, IndexKind::Cluster, IntrType::Id, benefit);
661 gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
662 NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
663 converter, IndexKind::Cluster, IntrType::Dim, benefit);
// Block id and grid dim use IndexKind::Grid.
665 gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
666 converter, IndexKind::Grid, IntrType::Id, benefit);
668 gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
669 converter, IndexKind::Grid, IntrType::Dim, benefit);
// Lane id, ballot and shuffle patterns (the rest of this list is not visible).
670 patterns.
add<GPULaneIdOpToNVVM, GPUBallotOpToNVVM, GPUShuffleOpLowering,
// Arguments of a further registration whose start is not visible: the NVVM
// shared memory space as an unsigned value, and the NVVM kernel, maxntid and
// cluster-dim attribute names.
684 static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared),
686 NVVM::NVVMDialect::getKernelFuncAttrName()),
688 NVVM::NVVMDialect::getMaxntidAttrName()),
690 NVVM::NVVMDialect::getClusterDimAttrName())},
// External model that implements ConvertToLLVMAttrInterface for
// NVVM::NVVMTargetAttr.
// NOTE(review): the method's parameter list, the closing of the struct and
// the body of the out-of-line definition are not visible in this listing.
701struct NVVMTargetConvertToLLVMAttrInterface
702 :
public ConvertToLLVMAttrInterface::ExternalModel<
703 NVVMTargetConvertToLLVMAttrInterface, NVVM::NVVMTargetAttr> {
// Declaration of the interface hook that populates conversion patterns.
705 void populateConvertToLLVMConversionPatterns(
// Out-of-line definition of the hook; its first parameter is the attribute.
711void NVVMTargetConvertToLLVMAttrInterface::
712 populateConvertToLLVMConversionPatterns(
Attribute attr,
// Attaches the external model to NVVMTargetAttr in the context `*ctx`.
// NOTE(review): the enclosing registration function is not visible here.
723 NVVMTargetAttr::attachInterface<NVVMTargetConvertToLLVMAttrInterface>(*ctx);
static llvm::ManagedStatic< PassManagerOptions > options
Attributes are known-constant values of operations.
iterator_range< op_iterator< OpT > > getOps()
Return an iterator range over the operations within this block that are of 'OpT'.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
bool addExtension(TypeID extensionID, std::unique_ptr< DialectExtensionBase > extension)
Add the given extension to the registry.
Conversion from types to the LLVM IR dialect.
MLIRContext & getContext() const
Returns the MLIR context.
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
A trait used to provide symbol table functionalities to a region operation.
Operation is the basic unit of execution within MLIR.
Region & getRegion(unsigned index)
Returns the region held by this operation at position 'index'.
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
StringAttr insert(Operation *symbol, Block::iterator insertPt={})
Insert a new symbol into the table, and rename it as necessary to avoid collisions.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
MMAMatrix represents a matrix held by a subgroup for matrix-matrix multiply accumulate operations.
constexpr int kSharedMemoryAlignmentBit
void registerConvertGpuToNVVMInterface(DialectRegistry &registry)
Registers the ConvertToLLVMAttrInterface interface on the NVVM::NVVMTargetAttr attribute.
gpu::DimensionKind IndexKind
void populateCommonGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Include the generated interface declarations.
detail::DenseArrayAttrImpl< int64_t > DenseI64ArrayAttr
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Type convertMMAToLLVMType(gpu::MMAMatrixType type)
Return the LLVMStructureType corresponding to the MMAMatrixType type.
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
void configureGpuToNVVMTypeConverter(LLVMTypeConverter &converter)
Configure the LLVM type converter to convert types and address spaces from the GPU dialect to NVVM.
void configureGpuToNVVMConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to NVVM.
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuSubgroupReduceOpLoweringPattern(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate GpuSubgroupReduce pattern to NVVM.
void populateGpuToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert from the GPU dialect to NVVM.
LLVM::GlobalOp getOrCreateStringConstant(OpBuilder &b, Location loc, Operation *moduleOp, Type llvmI8, StringRef namePrefix, StringRef str, uint64_t alignment=0, unsigned addrSpace=0)
Create a global that contains the given string.
void populateLibDeviceConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate the given list with patterns that convert from Math to NVVM libdevice calls.
void populateGpuWMMAToNVVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, PatternBenefit benefit=1)
Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
Lowering of gpu.printf to a vprintf standard library.