46 #include "llvm/Support/FormatVariadic.h"
48 #include "../GPUCommon/GPUOpsLowering.h"
49 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
50 #include "../GPUCommon/OpToFuncCallLowering.h"
53 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
54 #include "mlir/Conversion/Passes.h.inc"
62 bool canBeBare =
true;
63 for (
Type type : func.getArgumentTypes())
64 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
70 const unsigned indexBitwidth) {
72 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
73 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
74 Value mbcntLo = rewriter.
create<ROCDL::MbcntLoOp>(loc, int32Type,
76 Value laneId = rewriter.
create<ROCDL::MbcntHiOp>(loc, int32Type,
81 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
82 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
83 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
84 "64-S32-A5-G1-ni:7:8:9";
91 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
93 auto loc = op->getLoc();
99 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
100 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
107 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
108 if (indexBitwidth > 32) {
109 laneId = rewriter.
create<LLVM::SExtOp>(
111 }
else if (indexBitwidth < 32) {
112 laneId = rewriter.
create<LLVM::TruncOp>(
140 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
144 if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
146 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
150 Value width = adaptor.getWidth();
151 Value zero = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 0);
152 Value negwidth = rewriter.
create<LLVM::SubOp>(loc, int32Type, zero, width);
153 Value add = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
154 Value widthOrZeroIfOutside =
155 rewriter.
create<LLVM::AndOp>(loc, int32Type, add, negwidth);
160 switch (op.getMode()) {
161 case gpu::ShuffleMode::DOWN:
162 dstLane = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId,
163 adaptor.getOffset());
165 case gpu::ShuffleMode::XOR:
166 dstLane = rewriter.
create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
167 adaptor.getOffset());
169 case gpu::ShuffleMode::IDX:
170 dstLane = adaptor.getOffset();
175 Value isActiveSrcLane = rewriter.
create<LLVM::ICmpOp>(
176 loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
177 Value selectDstLane = rewriter.
create<LLVM::SelectOp>(loc, isActiveSrcLane,
179 Value two = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 2);
180 Value dwordAlignedDstLane =
181 rewriter.
create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
182 Value initShflValue = adaptor.getValue();
183 if (adaptor.getValue().getType().isF32()) {
185 rewriter.
create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
187 Value shflValue = rewriter.
create<ROCDL::DsBpermuteOp>(
188 loc, int32Type, dwordAlignedDstLane, initShflValue);
189 if (adaptor.getValue().getType().isF32()) {
190 shflValue = rewriter.
create<LLVM::BitcastOp>(
191 loc, adaptor.getValue().getType(), shflValue);
193 rewriter.
replaceOp(op, {shflValue, isActiveSrcLane});
199 #include "GPUToROCDL.cpp.inc"
206 struct LowerGpuOpsToROCDLOpsPass
207 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
208 LowerGpuOpsToROCDLOpsPass() =
default;
209 LowerGpuOpsToROCDLOpsPass(
const std::string &chipset,
unsigned indexBitwidth,
210 bool useBarePtrCallConv,
212 if (this->chipset.getNumOccurrences() == 0)
213 this->chipset = chipset;
214 if (this->indexBitwidth.getNumOccurrences() == 0)
215 this->indexBitwidth = indexBitwidth;
216 if (this->useBarePtrCallConv.getNumOccurrences() == 0)
217 this->useBarePtrCallConv = useBarePtrCallConv;
218 if (this->runtime.getNumOccurrences() == 0)
219 this->runtime = runtime;
222 void runOnOperation()
override {
223 gpu::GPUModuleOp m = getOperation();
226 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
227 LLVM::LLVMDialect::getDataLayoutAttrName());
228 if (!llvmDataLayout) {
230 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
233 for (
auto func : m.getOps<func::FuncOp>()) {
234 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
239 if (failed(maybeChipset)) {
241 return signalPassFailure();
246 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
247 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
249 options.overrideIndexBitwidth(indexBitwidth);
251 if (useBarePtrCallConv) {
252 options.useBarePtrCallConv =
true;
254 m.walk([](gpu::GPUFuncOp func) ->
WalkResult {
261 "bare pointer calling convention requires all memrefs to "
262 "have static shape and use the identity map");
263 return signalPassFailure();
279 converter, [](gpu::AddressSpace space) {
281 case gpu::AddressSpace::Global:
283 case gpu::AddressSpace::Workgroup:
285 case gpu::AddressSpace::Private:
288 llvm_unreachable(
"unknown address space enum value");
308 auto reqdWorkGroupSizeAttrHelper =
309 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
310 auto flatWorkGroupSizeAttrHelper =
311 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
314 m.walk([&](LLVM::LLVMFuncOp op) {
315 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
316 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
319 uint32_t flatSize = 1;
320 for (uint32_t size : blockSizes.asArrayRef()) {
323 StringAttr flatSizeAttr =
325 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
338 target.
addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
339 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
340 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
343 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
346 target.
addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
349 template <
typename OpTy>
352 StringRef f64Func, StringRef f32ApproxFunc,
370 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
371 converter, IndexKind::Block, IntrType::Id);
373 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
374 converter, IndexKind::Grid, IntrType::Id);
377 ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
378 converter, IndexKind::Block, IntrType::Dim);
380 gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
381 converter, IndexKind::Grid, IntrType::Dim);
386 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
387 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
388 rocdlDialect->getKernelAttrHelper().getName(),
389 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
399 patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
404 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
406 unsigned indexBitwidth,
407 bool useBarePtrCallConv,
409 return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
410 chipset, indexBitwidth, useBarePtrCallConv, runtime);
static MLIRContext * getContext(OpFoldResult val)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static void populateOpPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef f32Func, StringRef f64Func, StringRef f32ApproxFunc, StringRef f16Func)
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, const unsigned indexBitwidth)
static llvm::ManagedStatic< PassManagerOptions > options
MLIRContext * getContext() const
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
This class describes a specific conversion target.
void addLegalOp(OperationName op)
Register the given operations as legal.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the one provided.
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e. they are not supported by the target and must be converted.
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e. it is not supported by the target and must be converted.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source operation.
The main mechanism for performing data layout queries.
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
void populateExpandBFloat16Patterns(RewritePatternSet &patterns)
Add patterns to expand Arith bf16 patterns to lower level bitcasts/shifts.
void populateArithToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
void populateControlFlowToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Collect the patterns to convert from the ControlFlow dialect to LLVM.
Runtime
Potential runtimes for AMD GPU kernels.
Include the generated interface declarations.
void populateMathToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, bool approximateLog1p=true)
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth from the data layout.
LogicalResult applyPatternsGreedily(Region ®ion, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highest-benefit patterns in a greedy worklist-driven manner until a fixpoint is reached.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void populateFinalizeMemRefToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Collect a set of patterns to convert memory-related operations from the MemRef dialect to the LLVM di...
void populateAMDGPUToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: The ROCDL target does not support the LLVM bfloat type at this time and so this function will a...
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
const FrozenRewritePatternSet & patterns
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute construction methods.
void populateVectorToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, bool reassociateFPReductions=false, bool force32BitVectorIndices=false)
Collect a set of patterns to convert from the Vector dialect to LLVM.
void populateFuncToLLVMConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, const SymbolTable *symbolTable=nullptr)
Collect the patterns to convert from the Func dialect to LLVM.
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Rewriting that replaces SourceOp with a CallOp to f32Func, f64Func, f32ApproxFunc, or f16Func depending on the element type of the op.
Rewriting that unrolls SourceOp to scalars if it's operating on vectors.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.