#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTONVVMOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;
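/// Maps a gpu.shuffle mode to the equivalent nvvm.shfl.sync kind.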
static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
  switch (mode) {
  case gpu::ShuffleMode::XOR:
    return NVVM::ShflKind::bfly;
  case gpu::ShuffleMode::UP:
    return NVVM::ShflKind::up;
  case gpu::ShuffleMode::DOWN:
    return NVVM::ShflKind::down;
  case gpu::ShuffleMode::IDX:
    return NVVM::ShflKind::idx;
  }
  llvm_unreachable("unknown shuffle mode");
}
static std::optional<NVVM::ReduxKind>
convertReduxKind(gpu::AllReduceOperation mode) {
  switch (mode) {
  case gpu::AllReduceOperation::ADD:
    return NVVM::ReduxKind::ADD;
  case gpu::AllReduceOperation::MUL:
    return std::nullopt;
  case gpu::AllReduceOperation::MINSI:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MINUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MINNUMF:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MAXSI:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::MAXUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MAXNUMF:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::AND:
    return NVVM::ReduxKind::AND;
  case gpu::AllReduceOperation::OR:
    return NVVM::ReduxKind::OR;
  case gpu::AllReduceOperation::XOR:
    return NVVM::ReduxKind::XOR;
  case gpu::AllReduceOperation::MINIMUMF:
  case gpu::AllReduceOperation::MAXIMUMF:
    return std::nullopt;
  }
  return std::nullopt;
}
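/// Lowers a uniform, non-clustered gpu.subgroup_reduce on i32 values to a
/// single nvvm.redux.sync op (membermask -1, i.e. all lanes); every other
/// form is rejected so that more general patterns can handle it.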
struct GPUSubgroupReduceOpLowering
    : public ConvertOpToLLVMPattern<gpu::SubgroupReduceOp> {
  using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (op.getClusterSize())
      return rewriter.notifyMatchFailure(
          op, "lowering for clustered reduce not implemented");

    if (!op.getUniform())
      return rewriter.notifyMatchFailure(
          op, "cannot be lowered to redux as the op must be run "
              "uniformly (entire subgroup).");
    if (!op.getValue().getType().isInteger(32))
      return rewriter.notifyMatchFailure(op, "unsupported data type");

    std::optional<NVVM::ReduxKind> mode = convertReduxKind(op.getOp());
    if (!mode.has_value())
      return rewriter.notifyMatchFailure(
          op, "unsupported reduction mode for redux");

    Location loc = op->getLoc();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    // Full-warp membermask: all 32 bits set.
    Value offset = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);

    auto reduxOp = rewriter.create<NVVM::ReduxOp>(
        loc, int32Type, op.getValue(), mode.value(), offset);

    rewriter.replaceOp(op, reduxOp->getResult(0));
    return success();
  }
};
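/// Lowers gpu.shuffle to nvvm.shfl.sync. The shuffle `width` is first turned
/// into an active-lane mask and a mask-and-clamp value; roughly (a sketch of
/// the emitted IR with illustrative SSA names):
///
///   %active_mask    = llvm.lshr %minus_one, (32 - %width)
///   %mask_and_clamp = llvm.sub %width, %one   // (32 - %width) for mode `up`
///   %res = nvvm.shfl.sync <kind> %active_mask, %value, %offset,
///          %mask_and_clamp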
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();

    auto valueTy = adaptor.getValue().getType();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    auto predTy = IntegerType::get(rewriter.getContext(), 1);

    Value one = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 1);
    Value minusOne = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
    Value thirtyTwo = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 32);
    Value numLeadInactiveLane = rewriter.create<LLVM::SubOp>(
        loc, int32Type, thirtyTwo, adaptor.getWidth());
    // Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
    Value activeMask = rewriter.create<LLVM::LShrOp>(loc, int32Type, minusOne,
                                                     numLeadInactiveLane);
    Value maskAndClamp;
    if (op.getMode() == gpu::ShuffleMode::UP) {
      // Clamp lane: `32 - activeWidth`.
      maskAndClamp = numLeadInactiveLane;
    } else {
      // Clamp lane: `activeWidth - 1`.
      maskAndClamp =
          rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.getWidth(), one);
    }

    bool predIsUsed = !op->getResult(1).use_empty();
    UnitAttr returnValueAndIsValidAttr = nullptr;
    Type resultTy = valueTy;
    if (predIsUsed) {
      returnValueAndIsValidAttr = rewriter.getUnitAttr();
      resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
                                                  {valueTy, predTy});
    }
    Value shfl = rewriter.create<NVVM::ShflOp>(
        loc, resultTy, activeMask, adaptor.getValue(), adaptor.getOffset(),
        maskAndClamp, convertShflKind(op.getMode()), returnValueAndIsValidAttr);
    if (predIsUsed) {
      Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
      Value isActiveSrcLane =
          rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
      rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    } else {
      rewriter.replaceOp(op, {shfl, nullptr});
    }
    return success();
  }
};
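/// Lowers gpu.lane_id to nvvm.read.ptx.sreg.laneid, attaching a constant
/// range attribute (either the op's upper bound or [0, 32)) and resizing the
/// i32 result to the converter's index bitwidth.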
struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    LLVM::ConstantRangeAttr bounds = nullptr;
    if (std::optional<APInt> upperBound = op.getUpperBound())
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue());
    else
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, /*upper=*/32);
    Value newOp =
        rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds);
    // Resize the lane id to the index bitwidth expected by the converter.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      newOp = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    } else if (indexBitwidth < 32) {
      newOp = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    }
    rewriter.replaceOp(op, {newOp});
    return success();
  }
};
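/// Lowers cf.assert to a conditional call to the CUDA device runtime's
/// __assertfail. The pattern splits the current block so that the call is
/// only reached when the condition is false, roughly:
///
///   ^before:
///     cf.cond_br %condition, ^after, ^assert
///   ^assert:
///     llvm.call @__assertfail(...)  // noreturn
///     cf.br ^after
///   ^after: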
struct AssertOpToAssertfailLowering
    : public ConvertOpToLLVMPattern<cf::AssertOp> {
  using ConvertOpToLLVMPattern<cf::AssertOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(cf::AssertOp assertOp, cf::AssertOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    MLIRContext *ctx = rewriter.getContext();
    Location loc = assertOp.getLoc();
    Type i8Type = typeConverter->convertType(rewriter.getIntegerType(8));
    Type i32Type = typeConverter->convertType(rewriter.getIntegerType(32));
    Type i64Type = typeConverter->convertType(rewriter.getIntegerType(64));
    Type ptrType = LLVM::LLVMPointerType::get(ctx);
    Type voidType = LLVM::LLVMVoidType::get(ctx);

    // Find or create the __assertfail function declaration.
    auto moduleOp = assertOp->getParentOfType<gpu::GPUModuleOp>();
    auto assertfailType = LLVM::LLVMFunctionType::get(
        voidType, {ptrType, ptrType, i32Type, ptrType, i64Type});
    LLVM::LLVMFuncOp assertfailDecl = getOrDefineFunction(
        moduleOp, loc, rewriter, "__assertfail", assertfailType);
    assertfailDecl.setPassthroughAttr(
        ArrayAttr::get(ctx, StringAttr::get(ctx, "noreturn")));

    // Split the block around the assert so the failure path is guarded by a
    // conditional branch on the assert condition.
    Block *beforeBlock = assertOp->getBlock();
    Block *assertBlock =
        rewriter.splitBlock(beforeBlock, assertOp->getIterator());
    Block *afterBlock =
        rewriter.splitBlock(assertBlock, ++assertOp->getIterator());
    rewriter.setInsertionPointToEnd(beforeBlock);
    rewriter.create<cf::CondBranchOp>(loc, adaptor.getArg(), afterBlock,
                                      assertBlock);
    rewriter.setInsertionPointToEnd(assertBlock);
    rewriter.create<cf::BranchOp>(loc, afterBlock);
    rewriter.setInsertionPoint(assertOp);

    // Populate the file name, line number, and function name from the
    // location of the assert op.
    StringRef fileName = "(unknown)";
    StringRef funcName = "(unknown)";
    int32_t fileLine = 0;
    while (auto callSiteLoc = dyn_cast<CallSiteLoc>(loc))
      loc = callSiteLoc.getCallee();
    if (auto fileLineColLoc = dyn_cast<FileLineColRange>(loc)) {
      fileName = fileLineColLoc.getFilename().strref();
      fileLine = fileLineColLoc.getStartLine();
    } else if (auto nameLoc = dyn_cast<NameLoc>(loc)) {
      funcName = nameLoc.getName().strref();
      if (auto fileLineColLoc =
              dyn_cast<FileLineColRange>(nameLoc.getChildLoc())) {
        fileName = fileLineColLoc.getFilename().strref();
        fileLine = fileLineColLoc.getStartLine();
      }
    }

    // Build pointers to the global message/file/function strings.
    auto getGlobal = [&](LLVM::GlobalOp global) {
      // Get a pointer to the global string's first element.
      Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
          loc, LLVM::LLVMPointerType::get(ctx, global.getAddrSpace()),
          global.getSymNameAttr());
      Value start =
          rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getGlobalType(),
                                       globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
      return start;
    };
    Value assertMessage = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_message_", assertOp.getMsg()));
    Value assertFile = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_file_", fileName));
    Value assertFunc = getGlobal(getOrCreateStringConstant(
        rewriter, loc, moduleOp, i8Type, "assert_func_", funcName));
    Value assertLine =
        rewriter.create<LLVM::ConstantOp>(loc, i32Type, fileLine);
    Value c1 = rewriter.create<LLVM::ConstantOp>(loc, i64Type, 1);

    // Replace the assert op with a call to __assertfail.
    SmallVector<Value> arguments{assertMessage, assertFile, assertLine,
                                 assertFunc, c1};
    rewriter.replaceOpWithNewOp<LLVM::CallOp>(assertOp, assertfailDecl,
                                              arguments);
    return success();
  }
};
#include "GPUToNVVM.cpp.inc"
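/// Pass lowering a gpu.module and its contents to the NVVM dialect: it first
/// runs the in-dialect GPU rewrite patterns greedily, then performs a partial
/// dialect conversion to LLVM/NVVM.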
struct LowerGpuOpsToNVVMOpsPass final
    : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  using Base::Base;

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(&getContext()));
    }

    // Customize the bitwidth used for device-side index computations.
    LowerToLLVMOptions options(
        m.getContext(),
        DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);
    options.useBarePtrCallConv = useBarePtrCallConv;

    // Apply in-dialect lowering first: it replaces ops that need further
    // lowering, which a single conversion pass cannot express.
    {
      RewritePatternSet patterns(m.getContext());
      populateGpuRewritePatterns(patterns);
      if (failed(applyPatternsGreedily(m, std::move(patterns))))
        return signalPassFailure();
    }
    LLVMTypeConverter converter(m.getContext(), options);
    configureGpuToNVVMTypeConverter(converter);
    RewritePatternSet llvmPatterns(m.getContext());
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : getContext().getLoadedDialects()) {
      // Skip math patterns: NVVM requires custom math lowering to libdevice.
      if (isa<math::MathDialect>(dialect))
        continue;

      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // An empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly requested but does not
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    configureGpuToNVVMConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
  }
};

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FMAOp,
                      LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
                      LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp,
                      LLVM::SinOp, LLVM::SqrtOp>();

  // TODO: Remove once non-root op replacement is supported.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
void mlir::configureGpuToNVVMTypeConverter(LLVMTypeConverter &converter) {
  // NVVM uses alloca in the default address space for private memory, address
  // space 3 for shared (workgroup) memory, and the default address space for
  // global memory.
  populateGpuMemorySpaceAttributeConversions(
      converter, [](gpu::AddressSpace space) -> unsigned {
        switch (space) {
        case gpu::AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case gpu::AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case gpu::AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
        return 0;
      });
  // Lowering for gpu.mma_matrix types.
  converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
    return convertMMAToLLVMType(type);
  });
}
template <typename OpTy>
static void populateOpPatterns(const LLVMTypeConverter &converter,
                               RewritePatternSet &patterns,
                               PatternBenefit benefit, StringRef f32Func,
                               StringRef f64Func, StringRef f32ApproxFunc = "",
                               StringRef f16Func = "") {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
                                           f32ApproxFunc, f16Func,
                                           /*i32Func=*/"", benefit);
}

template <typename OpTy>
static void populateIntOpPatterns(const LLVMTypeConverter &converter,
                                  RewritePatternSet &patterns,
                                  PatternBenefit benefit, StringRef i32Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, "", "", "", "", i32Func,
                                           benefit);
}

template <typename OpTy>
static void populateFloatIntOpPatterns(const LLVMTypeConverter &converter,
                                       RewritePatternSet &patterns,
                                       PatternBenefit benefit,
                                       StringRef f32Func, StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter, benefit);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, "", "",
                                           "", benefit);
}
void mlir::populateGpuSubgroupReduceOpLoweringPattern(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    PatternBenefit benefit) {
  patterns.add<GPUSubgroupReduceOpLowering>(converter, benefit);
}
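// Maps arith/math ops to CUDA libdevice intrinsics. The naming is systematic:
// `__nv_foof` for f32, `__nv_foo` for f64, and, where available,
// `__nv_fast_foof` for the approximate f32 variant.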
void mlir::populateLibDeviceConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    PatternBenefit benefit) {
  populateOpPatterns<arith::RemFOp>(converter, patterns, benefit, "__nv_fmodf",
                                    "__nv_fmod");
  populateOpPatterns<arith::MaxNumFOp>(converter, patterns, benefit,
                                       "__nv_fmaxf", "__nv_fmax");
  populateOpPatterns<arith::MinNumFOp>(converter, patterns, benefit,
                                       "__nv_fminf", "__nv_fmin");

  populateIntOpPatterns<math::AbsIOp>(converter, patterns, benefit, "__nv_abs");
  populateOpPatterns<math::AbsFOp>(converter, patterns, benefit, "__nv_fabsf",
                                   "__nv_fabs");
  populateOpPatterns<math::AcosOp>(converter, patterns, benefit, "__nv_acosf",
                                   "__nv_acos");
  populateOpPatterns<math::AcoshOp>(converter, patterns, benefit, "__nv_acoshf",
                                    "__nv_acosh");
  populateOpPatterns<math::AsinOp>(converter, patterns, benefit, "__nv_asinf",
                                   "__nv_asin");
  populateOpPatterns<math::AsinhOp>(converter, patterns, benefit, "__nv_asinhf",
                                    "__nv_asinh");
  populateOpPatterns<math::AtanOp>(converter, patterns, benefit, "__nv_atanf",
                                   "__nv_atan");
  populateOpPatterns<math::Atan2Op>(converter, patterns, benefit, "__nv_atan2f",
                                    "__nv_atan2");
  populateOpPatterns<math::AtanhOp>(converter, patterns, benefit, "__nv_atanhf",
                                    "__nv_atanh");
  populateOpPatterns<math::CbrtOp>(converter, patterns, benefit, "__nv_cbrtf",
                                   "__nv_cbrt");
  populateOpPatterns<math::CeilOp>(converter, patterns, benefit, "__nv_ceilf",
                                   "__nv_ceil");
  populateOpPatterns<math::CopySignOp>(converter, patterns, benefit,
                                       "__nv_copysignf", "__nv_copysign");
  populateOpPatterns<math::CosOp>(converter, patterns, benefit, "__nv_cosf",
                                  "__nv_cos", "__nv_fast_cosf");
  populateOpPatterns<math::CoshOp>(converter, patterns, benefit, "__nv_coshf",
                                   "__nv_cosh");
  populateOpPatterns<math::ErfOp>(converter, patterns, benefit, "__nv_erff",
                                  "__nv_erf");
  populateOpPatterns<math::ErfcOp>(converter, patterns, benefit, "__nv_erfcf",
                                   "__nv_erfc");
  populateOpPatterns<math::ExpOp>(converter, patterns, benefit, "__nv_expf",
                                  "__nv_exp", "__nv_fast_expf");
  populateOpPatterns<math::Exp2Op>(converter, patterns, benefit, "__nv_exp2f",
                                   "__nv_exp2");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, benefit, "__nv_expm1f",
                                    "__nv_expm1");
  populateOpPatterns<math::FloorOp>(converter, patterns, benefit, "__nv_floorf",
                                    "__nv_floor");
  populateOpPatterns<math::FmaOp>(converter, patterns, benefit, "__nv_fmaf",
                                  "__nv_fma");
  populateOpPatterns<math::IsFiniteOp>(converter, patterns, benefit,
                                       "__nv_finitef", "__nv_isfinited");
  populateOpPatterns<math::IsInfOp>(converter, patterns, benefit, "__nv_isinff",
                                    "__nv_isinfd");
  populateOpPatterns<math::IsNaNOp>(converter, patterns, benefit, "__nv_isnanf",
                                    "__nv_isnand");
  populateOpPatterns<math::LogOp>(converter, patterns, benefit, "__nv_logf",
                                  "__nv_log", "__nv_fast_logf");
  populateOpPatterns<math::Log10Op>(converter, patterns, benefit,
                                    "__nv_log10f", "__nv_log10",
                                    "__nv_fast_log10f");
  populateOpPatterns<math::Log1pOp>(converter, patterns, benefit,
                                    "__nv_log1pf", "__nv_log1p");
  populateOpPatterns<math::Log2Op>(converter, patterns, benefit, "__nv_log2f",
                                   "__nv_log2", "__nv_fast_log2f");
  populateOpPatterns<math::PowFOp>(converter, patterns, benefit, "__nv_powf",
                                   "__nv_pow", "__nv_fast_powf");
  populateFloatIntOpPatterns<math::FPowIOp>(converter, patterns, benefit,
                                            "__nv_powif", "__nv_powi");
  populateOpPatterns<math::RoundOp>(converter, patterns, benefit,
                                    "__nv_roundf", "__nv_round");
  populateOpPatterns<math::RoundEvenOp>(converter, patterns, benefit,
                                        "__nv_rintf", "__nv_rint");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, benefit,
                                    "__nv_rsqrtf", "__nv_rsqrt");
  populateOpPatterns<math::SinOp>(converter, patterns, benefit, "__nv_sinf",
                                  "__nv_sin", "__nv_fast_sinf");
  populateOpPatterns<math::SinhOp>(converter, patterns, benefit, "__nv_sinhf",
                                   "__nv_sinh");
  populateOpPatterns<math::SqrtOp>(converter, patterns, benefit, "__nv_sqrtf",
                                   "__nv_sqrt");
  populateOpPatterns<math::TanOp>(converter, patterns, benefit, "__nv_tanf",
                                  "__nv_tan", "__nv_fast_tanf");
  populateOpPatterns<math::TanhOp>(converter, patterns, benefit, "__nv_tanhf",
                                   "__nv_tanh");
}
void mlir::populateGpuToNVVMConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    PatternBenefit benefit) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;

  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ThreadIdOp, NVVM::ThreadIdXOp, NVVM::ThreadIdYOp,
      NVVM::ThreadIdZOp>>(converter, IndexKind::Block, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockDimOp, NVVM::BlockDimXOp, NVVM::BlockDimYOp,
      NVVM::BlockDimZOp>>(converter, IndexKind::Block, IntrType::Dim, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterIdOp, NVVM::ClusterIdXOp, NVVM::ClusterIdYOp,
      NVVM::ClusterIdZOp>>(converter, IndexKind::Other, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
      NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim,
                            benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
      NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
      converter, IndexKind::Other, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
      NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
      converter, IndexKind::Other, IntrType::Dim, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id, benefit);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim, benefit);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/0,
          /*workgroupAddrSpace=*/static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getKernelFuncAttrName()),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getMaxntidAttrName())},
      benefit);

  populateLibDeviceConversionPatterns(converter, patterns, benefit);
}
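/// External model that lets a #nvvm.target attribute drive the
/// convert-to-llvm infrastructure: it wires the NVVM legality rules, type
/// converter configuration, and pattern population into the generic
/// ConvertToLLVMAttrInterface hooks.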
namespace {
struct NVVMTargetConvertToLLVMAttrInterface
    : public ConvertToLLVMAttrInterface::ExternalModel<
          NVVMTargetConvertToLLVMAttrInterface, NVVM::NVVMTargetAttr> {
  /// Configure the GPU-to-NVVM conversion for this target attribute.
  void populateConvertToLLVMConversionPatterns(
      Attribute attr, ConversionTarget &target,
      LLVMTypeConverter &typeConverter, RewritePatternSet &patterns) const;
};
} // namespace

void NVVMTargetConvertToLLVMAttrInterface::
    populateConvertToLLVMConversionPatterns(Attribute attr,
                                            ConversionTarget &target,
                                            LLVMTypeConverter &typeConverter,
                                            RewritePatternSet &patterns) const {
  configureGpuToNVVMConversionLegality(target);
  configureGpuToNVVMTypeConverter(typeConverter);
  populateGpuToNVVMConversionPatterns(typeConverter, patterns);
}

void mlir::registerConvertGpuToNVVMInterface(DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
    NVVMTargetAttr::attachInterface<NVVMTargetConvertToLLVMAttrInterface>(*ctx);
  });
}