#define GEN_PASS_DEF_CONVERTGPUOPSTONVVMOPS
#include "mlir/Conversion/Passes.h.inc"
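/// Maps gpu.shuffle modes to the corresponding NVVM shfl kinds. Note that a
/// gpu XOR shuffle corresponds to NVVM's butterfly ("bfly") variant.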
static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
  switch (mode) {
  case gpu::ShuffleMode::XOR:
    return NVVM::ShflKind::bfly;
  case gpu::ShuffleMode::UP:
    return NVVM::ShflKind::up;
  case gpu::ShuffleMode::DOWN:
    return NVVM::ShflKind::down;
  case gpu::ShuffleMode::IDX:
    return NVVM::ShflKind::idx;
  }
  llvm_unreachable("unknown shuffle mode");
}
static std::optional<NVVM::ReductionKind>
convertToNVVMReductionKind(gpu::AllReduceOperation mode) {
  switch (mode) {
  case gpu::AllReduceOperation::ADD:
    return NVVM::ReductionKind::ADD;
  case gpu::AllReduceOperation::MUL:
    return std::nullopt;
  case gpu::AllReduceOperation::MINSI:
    return NVVM::ReductionKind::MIN;
  case gpu::AllReduceOperation::MINUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MINNUMF:
    return NVVM::ReductionKind::MIN;
  case gpu::AllReduceOperation::MAXSI:
    return NVVM::ReductionKind::MAX;
  case gpu::AllReduceOperation::MAXUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MAXNUMF:
    return NVVM::ReductionKind::MAX;
  case gpu::AllReduceOperation::AND:
    return NVVM::ReductionKind::AND;
  case gpu::AllReduceOperation::OR:
    return NVVM::ReductionKind::OR;
  case gpu::AllReduceOperation::XOR:
    return NVVM::ReductionKind::XOR;
  case gpu::AllReduceOperation::MINIMUMF:
  case gpu::AllReduceOperation::MAXIMUMF:
    return std::nullopt;
  }
  return std::nullopt;
}
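/// Lowers a uniform, non-clustered gpu.subgroup_reduce on i32 to a single
/// nvvm.redux.sync instruction using a full-warp membermask (-1). Roughly:
///
///   %r = gpu.subgroup_reduce add %x uniform : (i32) -> i32
/// becomes
///   %mask = llvm.mlir.constant(-1 : i32) : i32
///   %r = nvvm.redux.sync add %x, %mask : i32 -> i32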
struct GPUSubgroupReduceOpLowering
    : public ConvertOpToLLVMPattern<gpu::SubgroupReduceOp> {
  using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (op.getClusterSize())
      return rewriter.notifyMatchFailure(
          op, "lowering for clustered reduce not implemented");

    if (!op.getUniform())
      return rewriter.notifyMatchFailure(
          op, "cannot be lowered to redux as the op must be run "
              "uniformly (entire subgroup).");
    if (!op.getValue().getType().isInteger(32))
      return rewriter.notifyMatchFailure(op, "unsupported data type");

    std::optional<NVVM::ReductionKind> mode =
        convertToNVVMReductionKind(op.getOp());
    if (!mode.has_value())
      return rewriter.notifyMatchFailure(
          op, "unsupported reduction mode for redux");

    Location loc = op->getLoc();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value offset = LLVM::ConstantOp::create(rewriter, loc, int32Type, -1);

    auto reduxOp = NVVM::ReduxOp::create(rewriter, loc, int32Type,
                                         op.getValue(), mode.value(), offset);

    rewriter.replaceOp(op, reduxOp->getResult(0));
    return success();
  }
};
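/// Lowers gpu.shuffle to nvvm.shfl.sync. The dynamic `width` operand is
/// rewritten into the two operands the PTX instruction expects: a bitmask of
/// the active lanes, `(-1) >> (32 - width)`, and a mask-and-clamp value
/// (`32 - width` for shfl.up, `width - 1` for the other modes). A sketch of
/// the emitted IR for the xor/bfly case:
///
///   %active_mask = llvm.lshr %minus_one, %num_lead_inactive_lanes : i32
///   %mask_and_clamp = llvm.sub %width, %one : i32
///   %shfl = nvvm.shfl.sync bfly %active_mask, %value, %offset,
///       %mask_and_clamp : !llvm.struct<(f32, i1)>
///   %value_res = llvm.extractvalue %shfl[0]
///   %valid_res = llvm.extractvalue %shfl[1]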
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();

    auto valueTy = adaptor.getValue().getType();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    auto predTy = IntegerType::get(rewriter.getContext(), 1);

    Value one = LLVM::ConstantOp::create(rewriter, loc, int32Type, 1);
    Value minusOne = LLVM::ConstantOp::create(rewriter, loc, int32Type, -1);
    Value thirtyTwo = LLVM::ConstantOp::create(rewriter, loc, int32Type, 32);
    Value numLeadInactiveLane = LLVM::SubOp::create(
        rewriter, loc, int32Type, thirtyTwo, adaptor.getWidth());
    // Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
    Value activeMask = LLVM::LShrOp::create(rewriter, loc, int32Type, minusOne,
                                            numLeadInactiveLane);
    Value maskAndClamp;
    if (op.getMode() == gpu::ShuffleMode::UP) {
      // Clamp lane: `32 - activeWidth`.
      maskAndClamp = numLeadInactiveLane;
    } else {
      // Clamp lane: `activeWidth - 1`.
      maskAndClamp = LLVM::SubOp::create(rewriter, loc, int32Type,
                                         adaptor.getWidth(), one);
    }

    bool predIsUsed = !op->getResult(1).use_empty();
    UnitAttr returnValueAndIsValidAttr = nullptr;
    Type resultTy = valueTy;
    if (predIsUsed) {
      returnValueAndIsValidAttr = rewriter.getUnitAttr();
      resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
                                                  {valueTy, predTy});
    }
    Value shfl = NVVM::ShflOp::create(
        rewriter, loc, resultTy, activeMask, adaptor.getValue(),
        adaptor.getOffset(), maskAndClamp, convertShflKind(op.getMode()),
        returnValueAndIsValidAttr);
    if (predIsUsed) {
      Value shflValue = LLVM::ExtractValueOp::create(rewriter, loc, shfl, 0);
      Value isActiveSrcLane =
          LLVM::ExtractValueOp::create(rewriter, loc, shfl, 1);
      rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    } else {
      rewriter.replaceOp(op, {shfl, nullptr});
    }
    return success();
  }
};
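/// Lowers gpu.lane_id to the NVVM laneid intrinsic, attaching an LLVM range
/// attribute so later passes know the result lies in [0, 32) (or within the
/// tighter upper bound carried by the op), then extends or truncates the i32
/// result to the configured index bitwidth.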
struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    LLVM::ConstantRangeAttr bounds = nullptr;
    if (std::optional<APInt> upperBound = op.getUpperBound())
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue());
    else
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, /*upper=*/kWarpSize);
    Value newOp =
        NVVM::LaneIdOp::create(rewriter, loc, rewriter.getI32Type(), bounds);
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      newOp = LLVM::SExtOp::create(
          rewriter, loc, IntegerType::get(context, indexBitwidth), newOp);
    } else if (indexBitwidth < 32) {
      newOp = LLVM::TruncOp::create(
          rewriter, loc, IntegerType::get(context, indexBitwidth), newOp);
    }
    rewriter.replaceOp(op, {newOp});
    return success();
  }
};
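/// Lowers cf.assert to a conditional branch around a call to the CUDA device
/// runtime's __assertfail (declared noreturn), passing the message, file,
/// line, and function name recovered from the op's location. The surrounding
/// block is split so the failing path sits in its own block:
///
///   ^before:
///     cf.cond_br %condition, ^after, ^assert
///   ^assert:
///     llvm.call @__assertfail(...)
///     cf.br ^after
///   ^after: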
struct AssertOpToAssertfailLowering
    : public ConvertOpToLLVMPattern<cf::AssertOp> {
  using ConvertOpToLLVMPattern<cf::AssertOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(cf::AssertOp assertOp, cf::AssertOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    MLIRContext *ctx = rewriter.getContext();
    Location loc = assertOp.getLoc();
    Type i8Type = typeConverter->convertType(rewriter.getIntegerType(8));
    Type i32Type = typeConverter->convertType(rewriter.getIntegerType(32));
    Type i64Type = typeConverter->convertType(rewriter.getIntegerType(64));
    Type ptrType = LLVM::LLVMPointerType::get(ctx);
    Type voidType = LLVM::LLVMVoidType::get(ctx);

    // Find or create the __assertfail function declaration.
    auto moduleOp = assertOp->getParentOfType<gpu::GPUModuleOp>();
    auto assertfailType = LLVM::LLVMFunctionType::get(
        voidType, {ptrType, ptrType, i32Type, ptrType, i64Type});
    LLVM::LLVMFuncOp assertfailDecl = getOrDefineFunction(
        moduleOp, loc, rewriter, "__assertfail", assertfailType);
    assertfailDecl.setPassthroughAttr(
        ArrayAttr::get(ctx, StringAttr::get(ctx, "noreturn")));

    // Split blocks and insert a conditional branch around the assert.
    Block *beforeBlock = assertOp->getBlock();
    Block *assertBlock =
        rewriter.splitBlock(beforeBlock, assertOp->getIterator());
    Block *afterBlock =
        rewriter.splitBlock(assertBlock, ++assertOp->getIterator());
    rewriter.setInsertionPointToEnd(beforeBlock);
    cf::CondBranchOp::create(rewriter, loc, adaptor.getArg(), afterBlock,
                             assertBlock);
    rewriter.setInsertionPointToEnd(assertBlock);
    cf::BranchOp::create(rewriter, loc, afterBlock);

    // Continue cf.assert lowering.
    rewriter.setInsertionPoint(assertOp);

    // Populate file name, file line, and function name from the op location.
    StringRef fileName = "(unknown)";
    StringRef funcName = "(unknown)";
    int32_t fileLine = 0;
    while (auto callSiteLoc = dyn_cast<CallSiteLoc>(loc))
      loc = callSiteLoc.getCallee();
    if (auto fileLineColLoc = dyn_cast<FileLineColRange>(loc)) {
      fileName = fileLineColLoc.getFilename().strref();
      fileLine = fileLineColLoc.getStartLine();
    } else if (auto nameLoc = dyn_cast<NameLoc>(loc)) {
      funcName = nameLoc.getName().strref();
      if (auto fileLineColLoc =
              dyn_cast<FileLineColRange>(nameLoc.getChildLoc())) {
        fileName = fileLineColLoc.getFilename().strref();
        fileLine = fileLineColLoc.getStartLine();
      }
    }

    // Create constants for the call arguments.
    auto getGlobal = [&](LLVM::GlobalOp global) {
      // Get a pointer to the global string's first character.
      Value globalPtr = LLVM::AddressOfOp::create(
          rewriter, loc, LLVM::LLVMPointerType::get(ctx, global.getAddrSpace()),
          global.getSymNameAttr());
      Value start =
          LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
                              globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
      return start;
    };
    Value assertMessage = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_message_", assertOp.getMsg()));
    Value assertFile = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_file_", fileName));
    Value assertFunc = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_func_", funcName));
    Value assertLine =
        LLVM::ConstantOp::create(rewriter, loc, i32Type, fileLine);
    Value c1 = LLVM::ConstantOp::create(rewriter, loc, i64Type, 1);

    // Insert the call to __assertfail.
    SmallVector<Value> arguments{assertMessage, assertFile, assertLine,
                                 assertFunc, c1};
    rewriter.replaceOpWithNewOp<LLVM::CallOp>(assertOp, assertfailDecl,
                                              arguments);
    return success();
  }
};
#include "GPUToNVVM.cpp.inc"
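/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalents and converts the module to an LLVM/NVVM
/// dialect module. Index computations are lowered at the bitwidth configured
/// via the pass's index-bitwidth option.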
namespace {
struct LowerGpuOpsToNVVMOpsPass final
    : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  using Base::Base;

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(&getContext()));
    }

    // Customize the bitwidth used for the device-side index computations.
    LowerToLLVMOptions options(
        m.getContext(),
        DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);
    options.useBarePtrCallConv = useBarePtrCallConv;

    // Apply in-dialect lowering first: it replaces ops that need to be
    // lowered further, which a single conversion pass cannot handle.
    {
      RewritePatternSet patterns(m.getContext());
      populateGpuRewritePatterns(patterns);
      vector::populateVectorFromElementsUnrollPatterns(patterns);
      if (failed(applyPatternsGreedily(m, std::move(patterns))))
        return signalPassFailure();
    }

    LLVMTypeConverter converter(m.getContext(), options);
    configureGpuToNVVMTypeConverter(converter);
    RewritePatternSet llvmPatterns(m.getContext());
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : getContext().getLoadedDialects()) {
      // Skip math patterns: NVVM needs its custom libdevice-based lowering.
      if (isa<math::MathDialect>(dialect))
        continue;

      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // An empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly requested but does not
        // implement the interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateLibDeviceConversionPatterns(converter, llvmPatterns);
    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    configureGpuToNVVMConversionLegality(target);

    ConversionConfig config;
    config.allowPatternRollback = allowPatternRollback;
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns),
                                      config)))
      signalPassFailure();
  }
};
} // namespace

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addIllegalOp<cf::AssertOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp,
                      LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp,
                      LLVM::RoundEvenOp, LLVM::RoundOp, LLVM::SinOp,
                      LLVM::SincosOp, LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
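/// Lowering for math.sincos to a libdevice call: a single __nv_sincosf /
/// __nv_sincos call writes both results through stack slots allocated at the
/// function entry, which are then loaded back as SSA values.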
struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
  using ConvertOpToLLVMPattern<math::SincosOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(math::SincosOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value input = adaptor.getOperand();
    Type inputType = input.getType();
    auto convertedInput = maybeExt(input, rewriter);
    auto computeType = convertedInput.getType();

    StringRef sincosFunc;
    if (isa<Float32Type>(computeType)) {
      const arith::FastMathFlags flag = op.getFastmath();
      const bool useApprox =
          mlir::arith::bitEnumContainsAny(flag, arith::FastMathFlags::afn);
      sincosFunc = useApprox ? "__nv_fast_sincosf" : "__nv_sincosf";
    } else if (isa<Float64Type>(computeType)) {
      sincosFunc = "__nv_sincos";
    } else {
      return rewriter.notifyMatchFailure(
          op, "unsupported operand type for sincos");
    }

    auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());

    // Allocate the result slots at the top of the enclosing allocation scope
    // so they dominate all uses.
    Value sinPtr, cosPtr;
    {
      OpBuilder::InsertionGuard guard(rewriter);
      auto *scope =
          op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
      assert(scope && "Expected op to be inside automatic allocation scope");
      rewriter.setInsertionPointToStart(&scope->getRegion(0).front());
      auto one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI32Type(),
                                          rewriter.getI32IntegerAttr(1));
      sinPtr =
          LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0);
      cosPtr =
          LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0);
    }

    createSincosCall(rewriter, loc, sincosFunc, convertedInput, sinPtr, cosPtr,
                     op);

    auto sinResult = LLVM::LoadOp::create(rewriter, loc, computeType, sinPtr);
    auto cosResult = LLVM::LoadOp::create(rewriter, loc, computeType, cosPtr);

    rewriter.replaceOp(op, {maybeTrunc(sinResult, inputType, rewriter),
                            maybeTrunc(cosResult, inputType, rewriter)});
    return success();
  }

private:
  Value maybeExt(Value operand, PatternRewriter &rewriter) const {
    if (isa<Float16Type, BFloat16Type>(operand.getType()))
      return LLVM::FPExtOp::create(rewriter, operand.getLoc(),
                                   Float32Type::get(rewriter.getContext()),
                                   operand);
    return operand;
  }

  Value maybeTrunc(Value operand, Type type, PatternRewriter &rewriter) const {
    if (operand.getType() != type)
      return LLVM::FPTruncOp::create(rewriter, operand.getLoc(), type,
                                     operand);
    return operand;
  }

  void createSincosCall(ConversionPatternRewriter &rewriter, Location loc,
                        StringRef funcName, Value input, Value sinPtr,
                        Value cosPtr, Operation *op) const {
    auto voidType = LLVM::LLVMVoidType::get(rewriter.getContext());
    auto ptrType = sinPtr.getType();

    SmallVector<Type> operandTypes = {input.getType(), ptrType, ptrType};
    auto funcType = LLVM::LLVMFunctionType::get(voidType, operandTypes);

    auto funcAttr = StringAttr::get(op->getContext(), funcName);
    auto funcOp =
        SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(op, funcAttr);

    if (!funcOp) {
      auto parentFunc = op->getParentOfType<FunctionOpInterface>();
      assert(parentFunc && "expected there to be a parent function");
      OpBuilder b(parentFunc);

      auto globalloc = loc->findInstanceOfOrUnknown<FileLineColLoc>();
      funcOp = LLVM::LLVMFuncOp::create(b, globalloc, funcName, funcType);
    }

    SmallVector<Value> callOperands = {input, sinPtr, cosPtr};
    LLVM::CallOp::create(rewriter, loc, funcOp, callOperands);
  }
};
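/// Helpers that register, for one op type, a vector-unrolling pattern plus an
/// OpToFuncCallLowering that maps the op to the libdevice function matching
/// its element type (f32, f64, optionally a fast-math f32 approximation, or
/// an f16/i32 variant).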
template <typename OpTy>
static void populateOpPatterns(const LLVMTypeConverter &converter,
                               RewritePatternSet &patterns,
                               PatternBenefit benefit, StringRef f32Func,
                               StringRef f64Func, StringRef f32ApproxFunc = "",
                               StringRef f16Func = "") {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
                                           f32ApproxFunc, f16Func, benefit);
}

template <typename OpTy>
static void populateIntOpPatterns(const LLVMTypeConverter &converter,
                                  RewritePatternSet &patterns,
                                  PatternBenefit benefit, StringRef i32Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, i32Func, benefit);
}

template <typename OpTy>
static void populateFloatIntOpPatterns(const LLVMTypeConverter &converter,
                                       RewritePatternSet &patterns,
                                       PatternBenefit benefit,
                                       StringRef f32Func, StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
                                           benefit);
}

void mlir::populateGpuSubgroupReduceOpLoweringPattern(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    PatternBenefit benefit) {
  patterns.add<GPUSubgroupReduceOpLowering>(converter, benefit);
}
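/// Maps arith and math ops onto their libdevice equivalents. Ops carrying the
/// `afn` fast-math flag additionally get the `__nv_fast_*` f32 approximation
/// where libdevice provides one.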
597 "__nv_fmaxf",
"__nv_fmax");
599 "__nv_fminf",
"__nv_fmin");
623 "__nv_copysignf",
"__nv_copysign");
625 "__nv_cos",
"__nv_fast_cosf");
633 "__nv_exp",
"__nv_fast_expf");
644 "__nv_finitef",
"__nv_isfinited");
650 "__nv_log",
"__nv_fast_logf");
652 "__nv_log10",
"__nv_fast_log10f");
656 "__nv_log2",
"__nv_fast_log2f");
658 "__nv_pow",
"__nv_fast_powf");
660 "__nv_powif",
"__nv_powi");
664 "__nv_rintf",
"__nv_rint");
668 "__nv_sin",
"__nv_fast_sinf");
674 "__nv_tan",
"__nv_fast_tanf");
679 patterns.add<SincosOpLowering>(converter, benefit);
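/// Collects the patterns that rewrite GPU dialect ops to their NVVM
/// counterparts, including the index lowerings below, which map each
/// dimension of gpu.thread_id, gpu.block_dim, and friends to the matching
/// NVVM special-register op.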
void mlir::populateGpuToNVVMConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    PatternBenefit benefit) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;

  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ThreadIdOp, NVVM::ThreadIdXOp, NVVM::ThreadIdYOp,
      NVVM::ThreadIdZOp>>(converter, IndexKind::Block, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockDimOp, NVVM::BlockDimXOp, NVVM::BlockDimYOp,
      NVVM::BlockDimZOp>>(converter, IndexKind::Block, IntrType::Dim, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterIdOp, NVVM::ClusterIdXOp, NVVM::ClusterIdYOp,
      NVVM::ClusterIdZOp>>(converter, IndexKind::Other, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
      NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim,
                            benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
      NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
      converter, IndexKind::Cluster, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
      NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
      converter, IndexKind::Cluster, IntrType::Dim, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim, benefit);
  patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering,
               AssertOpToAssertfailLowering>(converter, benefit);

  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getKernelFuncAttrName()),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getMaxntidAttrName()),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getClusterDimAttrName())},
      benefit);
}
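/// External model that attaches the ConvertToLLVMAttrInterface to
/// NVVM::NVVMTargetAttr, so the generic convert-to-llvm infrastructure can
/// configure the GPU-to-NVVM lowering from the target attribute alone.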
namespace {
struct NVVMTargetConvertToLLVMAttrInterface
    : public ConvertToLLVMAttrInterface::ExternalModel<
          NVVMTargetConvertToLLVMAttrInterface, NVVM::NVVMTargetAttr> {
  /// Configure the GPU-to-NVVM conversion for this target attribute.
  void populateConvertToLLVMConversionPatterns(
      Attribute attr, ConversionTarget &target,
      LLVMTypeConverter &typeConverter, RewritePatternSet &patterns) const;
};
} // namespace

void NVVMTargetConvertToLLVMAttrInterface::
    populateConvertToLLVMConversionPatterns(Attribute attr,
                                            ConversionTarget &target,
                                            LLVMTypeConverter &typeConverter,
                                            RewritePatternSet &patterns) const {
  configureGpuToNVVMConversionLegality(target);
  configureGpuToNVVMTypeConverter(typeConverter);
  populateGpuToNVVMConversionPatterns(typeConverter, patterns);
}

void mlir::registerConvertGpuToNVVMInterface(DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
    NVVMTargetAttr::attachInterface<NVVMTargetConvertToLLVMAttrInterface>(
        *ctx);
  });
}