MLIR
21.0.0git
|
#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
#include "../LLVMCommon/MemRefDescriptor.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Casting.h"
#include <optional>
#include "mlir/Conversion/Passes.h.inc"
Go to the source code of this file.
Namespaces | |
mlir | |
Include the generated interface declarations. | |
Macros | |
#define | GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS |
Functions | |
static Value | convertUnsignedToI32 (ConversionPatternRewriter &rewriter, Location loc, Value val) |
Convert an unsigned number val to i32. More... | |
static Value | createI32Constant (ConversionPatternRewriter &rewriter, Location loc, int32_t value) |
static Value | createI1Constant (ConversionPatternRewriter &rewriter, Location loc, bool value) |
static Value | getLinearIndexI32 (ConversionPatternRewriter &rewriter, Location loc, MemRefDescriptor &memRefDescriptor, ValueRange indices, ArrayRef< int64_t > strides) |
Returns the linear index used to access an element in the memref. More... | |
static Value | getNumRecords (ConversionPatternRewriter &rewriter, Location loc, MemRefType memrefType, MemRefDescriptor &memrefDescriptor, ArrayRef< int64_t > strides, uint32_t elementByteWidth) |
Compute the contents of the num_records field for a given memref descriptor - that is, the number of bytes that's one element past the greatest possible valid index into the memref. More... | |
static Value | makeBufferRsrc (ConversionPatternRewriter &rewriter, Location loc, Value basePointer, Value numRecords, bool boundsCheck, amdgpu::Chipset chipset, Value cacheSwizzleStride=nullptr, unsigned addressSpace=8) |
static Value | convertMFMAVectorOperand (ConversionPatternRewriter &rewriter, Location loc, Value input) |
Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention. More... | |
static Value | castMFMAScaleOperand (ConversionPatternRewriter &rewriter, Location loc, Value input) |
Converts the scaled MFMA operands, scalesA and scalesB , from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention. More... | |
static void | wmmaPushInputOperand (ConversionPatternRewriter &rewriter, Location loc, const TypeConverter *typeConverter, bool isUnsigned, Value llvmInput, Value mlirInput, SmallVector< Value, 4 > &operands) |
Push an input operand. More... | |
static void | wmmaPushOutputOperand (ConversionPatternRewriter &rewriter, Location loc, const TypeConverter *typeConverter, Value output, int32_t subwordOffset, bool clamp, SmallVector< Value, 4 > &operands) |
Push the output operand. More... | |
static bool | typeIsExpectedBf8ForChipset (Chipset chipset, Type type) |
Return true if type is the E5M2 variant of an 8-bit float that is supported by the _bf8 instructions on the given chipset . More... | |
static bool | typeIsExpectedFp8ForChipset (Chipset chipset, Type type) |
Return true if type is the E4M3FN variant of an 8-bit float that is supported by the _fp8 instructions on the given chipset . More... | |
static std::optional< StringRef > | mfmaOpToIntrinsic (MFMAOp mfma, Chipset chipset) |
Return the rocdl intrinsic corresponding to a MFMA operation mfma if one exists. More... | |
static std::optional< uint32_t > | mfmaTypeSelectCode (Type mlirElemType) |
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > > | mfmaOpToScaledIntrinsic (Type aType, Type bType, Type destType, uint32_t m, uint32_t n, uint32_t k, uint32_t b, Chipset chipset) |
If there is a scaled MFMA instruction for the input element types aType and bType , output type destType , problem size M, N, K, and B (number of blocks) on the given chipset , return a tuple consisting of the OperationName of the intrinsic and the type codes that need to be passed to that intrinsic. More... | |
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > > | mfmaOpToScaledIntrinsic (MFMAOp mfma, Chipset chipset) |
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > > | mfmaOpToScaledIntrinsic (ScaledMFMAOp smfma, Chipset chipset) |
static std::optional< StringRef > | wmmaOpToIntrinsic (WMMAOp wmma, Chipset chipset) |
Return the rocdl intrinsic corresponding to a WMMA operation wmma if one exists. More... | |
Variables | |
constexpr Chipset | kGfx908 = Chipset(9, 0, 8) |
constexpr Chipset | kGfx90a = Chipset(9, 0, 0xa) |
constexpr Chipset | kGfx942 = Chipset(9, 4, 2) |
constexpr Chipset | kGfx950 = Chipset(9, 5, 0) |
#define GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS |
Definition at line 30 of file AMDGPUToROCDL.cpp.
|
static |
Converts the scaled MFMA operands, scalesA
and scalesB
, from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
Specifically:
If input is an i8 value, zero-extend it to i32.
If input is a vector of length 4 and type i8, cast it to i32.
Note that the type of input has already been LLVM type converted: therefore 8-bit and smaller floats are represented as their corresponding iN integers.
Definition at line 542 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), mlir::Builder::getI32Type(), and mlir::Value::getType().
|
static |
Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
Specifically:
If input is a vector of N <= 8 bytes, bitcast it to a (N * 8)-bit integer.
Note that the type of input has already been LLVM type converted: therefore 8-bit and smaller floats are represented as their corresponding iN integers.
Definition at line 509 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), mlir::detail::divideCeil(), mlir::get(), mlir::Builder::getI16Type(), mlir::Builder::getI32Type(), mlir::Builder::getIntegerType(), and mlir::Value::getType().
|
static |
Convert an unsigned number val
to i32.
Definition at line 44 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), mlir::Builder::getI32Type(), and mlir::Value::getType().
Referenced by getLinearIndexI32(), and getNumRecords().
|
static |
Definition at line 62 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), and mlir::Builder::getI1Type().
Referenced by wmmaPushInputOperand(), and wmmaPushOutputOperand().
|
static |
Definition at line 56 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), and mlir::Builder::getI32Type().
Referenced by getLinearIndexI32(), getNumRecords(), and makeBufferRsrc().
|
static |
Returns the linear index used to access an element in the memref.
Definition at line 69 of file AMDGPUToROCDL.cpp.
References convertUnsignedToI32(), mlir::OpBuilder::create(), createI32Constant(), mlir::detail::enumerate(), mlir::Builder::getI32Type(), and mlir::MemRefDescriptor::stride().
|
static |
Compute the contents of the num_records
field for a given memref descriptor - that is, the number of bytes that's one element past the greatest possible valid index into the memref.
Definition at line 92 of file AMDGPUToROCDL.cpp.
References convertUnsignedToI32(), mlir::OpBuilder::create(), createI32Constant(), max(), mlir::MemRefDescriptor::size(), and mlir::MemRefDescriptor::stride().
|
static |
Definition at line 122 of file AMDGPUToROCDL.cpp.
References mlir::OpBuilder::create(), createI32Constant(), mlir::OpBuilder::createOrFold(), mlir::get(), mlir::Builder::getContext(), mlir::Builder::getI16IntegerAttr(), mlir::Builder::getI16Type(), and kGfx942.
|
static |
Return the rocdl
intrinsic corresponding to a MFMA operation mfma
if one exists.
This includes checking to ensure the intrinsic is supported on the architecture you are compiling for.
Definition at line 658 of file AMDGPUToROCDL.cpp.
|
static |
Definition at line 849 of file AMDGPUToROCDL.cpp.
References mfmaOpToScaledIntrinsic().
|
static |
Definition at line 857 of file AMDGPUToROCDL.cpp.
References mfmaOpToScaledIntrinsic().
|
static |
If there is a scaled MFMA instruction for the input element types aType
and bType
, output type destType
, problem size M, N, K, and B (number of blocks) on the given chipset
, return a tuple consisting of the OperationName of the intrinsic and the type codes that need to be passed to that intrinsic.
Note that this is also used to implement some un-scaled MFMAs, since the compiler represents the ordinary instruction as a "scaled" MFMA with a scale of 0.
Definition at line 821 of file AMDGPUToROCDL.cpp.
Referenced by mfmaOpToScaledIntrinsic().
|
static |
Definition at line 803 of file AMDGPUToROCDL.cpp.
Return true if type
is the E5M2 variant of an 8-bit float that is supported by the _bf8
instructions on the given chipset
.
Definition at line 643 of file AMDGPUToROCDL.cpp.
References mlir::amdgpu::hasOcpFp8(), and kGfx942.
Return true if type
is the E4M3FN variant of an 8-bit float that is supported by the _fp8
instructions on the given chipset
.
Definition at line 650 of file AMDGPUToROCDL.cpp.
References mlir::amdgpu::hasOcpFp8(), and kGfx942.
|
static |
Return the rocdl
intrinsic corresponding to a WMMA operation wmma
if one exists.
This includes checking to ensure the intrinsic is supported on the architecture you are compiling for.
Definition at line 867 of file AMDGPUToROCDL.cpp.
References mlir::amdgpu::Chipset::majorVersion.
|
static |
Push an input operand.
If it is a float type, nothing to do. If it is an integer type, then we need to also push its signedness (1 for signed, 0 for unsigned) and we need to pack the input 16xi8 vector into a 4xi32 vector (or the 8xi8 vector into a 2xi32 one for gfx12+). We also need to convert bfloat inputs to i16 to account for the bfloat intrinsics having been defined before the AMD backend supported bfloat. We similarly need to pack 8-bit float types into integers as if they were i8 (which they are for the backend's purposes).
Definition at line 559 of file AMDGPUToROCDL.cpp.
References mlir::TypeConverter::convertType(), mlir::OpBuilder::create(), createI1Constant(), mlir::OpBuilder::createOrFold(), mlir::get(), mlir::Builder::getI16Type(), mlir::Builder::getI32Type(), mlir::Builder::getIntegerType(), mlir::Type::getIntOrFloatBitWidth(), mlir::Value::getType(), mlir::Type::isBF16(), mlir::Type::isSignedInteger(), and mlir::Type::isUnsignedInteger().
|
static |
Push the output operand.
For many cases this is only pushing the output in the operand list. But when we have f16 -> f16 or bf16 -> bf16 intrinsics, since the same number of VGPRs is used, we need to decide whether to store the result in the upper 16 bits of the VGPRs or in the lower part. To store the result in the lower 16 bits, set subwordOffset to 1; otherwise the result will be stored in the upper part. The subwordOffset must not be set for gfx12, as the instructions have been changed to return fewer registers instead.
Definition at line 622 of file AMDGPUToROCDL.cpp.
References clamp(), mlir::OpBuilder::create(), createI1Constant(), mlir::Builder::getI16Type(), mlir::Value::getType(), mlir::Type::isBF16(), mlir::Type::isF16(), and mlir::Type::isInteger().
Definition at line 38 of file AMDGPUToROCDL.cpp.
Definition at line 39 of file AMDGPUToROCDL.cpp.
Definition at line 40 of file AMDGPUToROCDL.cpp.
Referenced by isSupportedF8(), makeBufferRsrc(), typeIsExpectedBf8ForChipset(), and typeIsExpectedFp8ForChipset().
Definition at line 41 of file AMDGPUToROCDL.cpp.