43 #include "llvm/Support/FormatVariadic.h"
45 #include "../GPUCommon/GPUOpsLowering.h"
46 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
49 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
50 #include "mlir/Conversion/Passes.h.inc"
58 bool canBeBare =
true;
59 for (
Type type : func.getArgumentTypes())
60 if (
auto memrefTy = dyn_cast<BaseMemRefType>(type))
66 const unsigned indexBitwidth) {
68 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
69 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
70 Value mbcntLo = rewriter.
create<ROCDL::MbcntLoOp>(loc, int32Type,
72 Value laneId = rewriter.
create<ROCDL::MbcntHiOp>(loc, int32Type,
77 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
78 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
79 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
80 "64-S32-A5-G1-ni:7:8:9";
87 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
89 auto loc = op->getLoc();
95 Value zero = rewriter.
create<arith::ConstantIntOp>(loc, 0, 32);
96 Value minus1 = rewriter.
create<arith::ConstantIntOp>(loc, -1, 32);
103 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
104 if (indexBitwidth > 32) {
105 laneId = rewriter.
create<LLVM::SExtOp>(
107 }
else if (indexBitwidth < 32) {
108 laneId = rewriter.
create<LLVM::TruncOp>(
136 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
140 if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
142 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
146 Value width = adaptor.getWidth();
147 Value zero = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 0);
148 Value negwidth = rewriter.
create<LLVM::SubOp>(loc, int32Type, zero, width);
149 Value add = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
150 Value widthOrZeroIfOutside =
151 rewriter.
create<LLVM::AndOp>(loc, int32Type, add, negwidth);
156 switch (op.getMode()) {
157 case gpu::ShuffleMode::DOWN:
158 dstLane = rewriter.
create<LLVM::AddOp>(loc, int32Type, srcLaneId,
159 adaptor.getOffset());
161 case gpu::ShuffleMode::XOR:
162 dstLane = rewriter.
create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
163 adaptor.getOffset());
165 case gpu::ShuffleMode::IDX:
166 dstLane = adaptor.getOffset();
171 Value isActiveSrcLane = rewriter.
create<LLVM::ICmpOp>(
172 loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
173 Value selectDstLane = rewriter.
create<LLVM::SelectOp>(loc, isActiveSrcLane,
175 Value two = rewriter.
create<LLVM::ConstantOp>(loc, int32Type, 2);
176 Value dwordAlignedDstLane =
177 rewriter.
create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
178 Value initShflValue = adaptor.getValue();
179 if (adaptor.getValue().getType().isF32()) {
181 rewriter.
create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
183 Value shflValue = rewriter.
create<ROCDL::DsBpermuteOp>(
184 loc, int32Type, dwordAlignedDstLane, initShflValue);
185 if (adaptor.getValue().getType().isF32()) {
186 shflValue = rewriter.
create<LLVM::BitcastOp>(
187 loc, adaptor.getValue().getType(), shflValue);
189 rewriter.
replaceOp(op, {shflValue, isActiveSrcLane});
195 #include "GPUToROCDL.cpp.inc"
202 struct LowerGpuOpsToROCDLOpsPass final
203 :
public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
204 LowerGpuOpsToROCDLOpsPass() =
default;
205 LowerGpuOpsToROCDLOpsPass(
const std::string &chipset,
unsigned indexBitwidth,
206 bool useBarePtrCallConv,
208 if (this->chipset.getNumOccurrences() == 0)
209 this->chipset = chipset;
210 if (this->indexBitwidth.getNumOccurrences() == 0)
211 this->indexBitwidth = indexBitwidth;
212 if (this->useBarePtrCallConv.getNumOccurrences() == 0)
213 this->useBarePtrCallConv = useBarePtrCallConv;
214 if (this->runtime.getNumOccurrences() == 0)
215 this->runtime = runtime;
219 Base::getDependentDialects(registry);
223 void runOnOperation()
override {
224 gpu::GPUModuleOp m = getOperation();
227 auto llvmDataLayout = m->getAttrOfType<StringAttr>(
228 LLVM::LLVMDialect::getDataLayoutAttrName());
229 if (!llvmDataLayout) {
231 m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
234 for (
auto func : m.getOps<func::FuncOp>()) {
235 func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
240 if (failed(maybeChipset)) {
242 return signalPassFailure();
247 ctx,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
248 options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
250 options.overrideIndexBitwidth(indexBitwidth);
252 if (useBarePtrCallConv) {
253 options.useBarePtrCallConv =
true;
255 m.walk([](gpu::GPUFuncOp func) ->
WalkResult {
262 "bare pointer calling convention requires all memrefs to "
263 "have static shape and use the identity map");
264 return signalPassFailure();
280 converter, [](gpu::AddressSpace space) {
282 case gpu::AddressSpace::Global:
284 case gpu::AddressSpace::Workgroup:
286 case gpu::AddressSpace::Private:
289 llvm_unreachable(
"unknown address space enum value");
296 llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
297 allowedDialects.end());
299 bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
301 if (!allowedDialectsSet.empty() && !allowed)
304 auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
310 <<
"dialect does not implement ConvertToLLVMPatternInterface: "
311 << dialect->getNamespace();
312 return signalPassFailure();
317 iface->populateConvertToLLVMConversionPatterns(target, converter,
328 auto reqdWorkGroupSizeAttrHelper =
329 rocdlDialect->getReqdWorkGroupSizeAttrHelper();
330 auto flatWorkGroupSizeAttrHelper =
331 rocdlDialect->getFlatWorkGroupSizeAttrHelper();
334 m.walk([&](LLVM::LLVMFuncOp op) {
335 if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
336 auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
339 uint32_t flatSize = 1;
340 for (uint32_t size : blockSizes.asArrayRef()) {
343 StringAttr flatSizeAttr =
345 flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
358 target.
addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
359 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
360 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
363 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
366 target.
addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
380 ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
381 converter, IndexKind::Block, IntrType::Id);
383 gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
384 converter, IndexKind::Grid, IntrType::Id);
387 ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
388 converter, IndexKind::Block, IntrType::Dim);
390 gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
391 converter, IndexKind::Grid, IntrType::Dim);
396 ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
397 ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
398 rocdlDialect->getKernelAttrHelper().getName(),
399 rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
409 patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
414 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
416 unsigned indexBitwidth,
417 bool useBarePtrCallConv,
419 return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
420 chipset, indexBitwidth, useBarePtrCallConv, runtime);
static MLIRContext * getContext(OpFoldResult val)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, const unsigned indexBitwidth)
static llvm::ManagedStatic< PassManagerOptions > options
MLIRContext * getContext() const
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
This class describes a specific conversion target.
void addLegalOp(OperationName op)
Register the given operations as legal.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the one provided.
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e. they are known to exist but are not supported by the conversion target.
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e. it is known to exist but is not supported by the conversion target.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source operation.
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the entire group.
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a pointer to a specific attribute.
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
A utility result that is used to signal how to proceed with an ongoing walk:
static WalkResult advance()
bool wasInterrupted() const
Returns true if the walk was interrupted.
static WalkResult interrupt()
void populateExpandBFloat16Patterns(RewritePatternSet &patterns)
Add patterns to expand Arith bf16 patterns to lower level bitcasts/shifts.
Runtime
Potential runtimes for AMD GPU kernels.
Include the generated interface declarations.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth from the data layout.
LogicalResult applyPatternsGreedily(Region ®ion, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highest benefit patterns in a greedy worklist driven manner until a fixpoint is reached.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void populateAMDGPUToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: The ROCDL target does not support the LLVM bfloat type at this time and so this function will also lower bfloat values by treating them as i16 (TODO: confirm exact wording against the MLIR AMDGPUToROCDL header).
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry ®istry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute construction methods.
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.