#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
#include "../LLVMCommon/MemRefDescriptor.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <optional>
#include "mlir/Conversion/Passes.h.inc"

Classes
class	mlir::impl::ConvertAMDGPUToROCDLPassBase< DerivedT >

Namespaces
namespace	mlir
	Include the generated interface declarations.
namespace	mlir::impl
	Attribute collections provide a dictionary-like interface.

Macros
#define	GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS

Functions
std::unique_ptr<::mlir::Pass >	mlir::impl::createConvertAMDGPUToROCDLPass ()
std::unique_ptr<::mlir::Pass >	mlir::impl::createConvertAMDGPUToROCDLPass (ConvertAMDGPUToROCDLPassOptions options)
std::unique_ptr<::mlir::Pass >	mlir::createConvertAMDGPUToROCDLPass ()
std::unique_ptr<::mlir::Pass >	mlir::createConvertAMDGPUToROCDLPass (ConvertAMDGPUToROCDLPassOptions options)
static Value	convertUnsignedToI32 (ConversionPatternRewriter &rewriter, Location loc, Value val)
	Convert an unsigned number val to i32.
static Value	createI32Constant (ConversionPatternRewriter &rewriter, Location loc, int32_t value)
static Value	convertUnsignedToI64 (ConversionPatternRewriter &rewriter, Location loc, Value val)
	Convert an unsigned number val to i64.
static Value	createI64Constant (ConversionPatternRewriter &rewriter, Location loc, int64_t value)
static Value	getLinearIndexI32 (ConversionPatternRewriter &rewriter, Location loc, MemRefDescriptor &memRefDescriptor, ValueRange indices, ArrayRef< int64_t > strides)
	Returns the linear index used to access an element in the memref.
static Value	getNumRecords (ConversionPatternRewriter &rewriter, Location loc, MemRefType memrefType, MemRefDescriptor &memrefDescriptor, ArrayRef< int64_t > strides, int64_t elementByteWidth)
	Compute the contents of the num_records field for a given memref descriptor - that is, the number of bytes that's one element past the greatest possible valid index into the memref.
static Value	makeBufferRsrc (ConversionPatternRewriter &rewriter, Location loc, Value basePointer, Value numRecords, bool boundsCheck, amdgpu::Chipset chipset, Value cacheSwizzleStride=nullptr, unsigned addressSpace=8)
static Value	convertMFMAVectorOperand (ConversionPatternRewriter &rewriter, Location loc, Value input, bool allowBf16=true)
	Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
static Value	castMFMAScaleOperand (ConversionPatternRewriter &rewriter, Location loc, Value input)
	Converts the scaled MFMA operands, scalesA and scalesB, from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
static void	wmmaPushInputOperand (ConversionPatternRewriter &rewriter, Location loc, const TypeConverter *typeConverter, bool isUnsigned, Value llvmInput, Value mlirInput, SmallVectorImpl< Value > &operands, SmallVectorImpl< NamedAttribute > &attrs, StringRef attrName)
	Push an input operand.
static void	wmmaPushOutputOperand (ConversionPatternRewriter &rewriter, Location loc, const TypeConverter *typeConverter, Value output, int32_t subwordOffset, bool clamp, SmallVectorImpl< Value > &operands, SmallVectorImpl< NamedAttribute > &attrs)
	Push the output operand.
static bool	typeIsExpectedBf8ForChipset (Chipset chipset, Type type)
	Return true if type is the E5M2 variant of an 8-bit float that is supported by the _bf8 instructions on the given chipset.
static bool	typeIsExpectedFp8ForChipset (Chipset chipset, Type type)
	Return true if type is the E4M3FN variant of an 8-bit float that is supported by the _fp8 instructions on the given chipset.
static std::optional< StringRef >	mfmaOpToIntrinsic (MFMAOp mfma, Chipset chipset)
	Return the rocdl intrinsic corresponding to a MFMA operation mfma if one exists.
static std::optional< uint32_t >	mfmaTypeSelectCode (Type mlirElemType)
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > >	mfmaOpToScaledIntrinsic (Type aType, Type bType, Type destType, uint32_t m, uint32_t n, uint32_t k, uint32_t b, Chipset chipset)
	If there is a scaled MFMA instruction for the input element types aType and bType, output type destType, problem size M, N, K, and B (number of blocks) on the given chipset, return a tuple consisting of the OperationName of the intrinsic and the type codes that need to be passed to that intrinsic.
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > >	mfmaOpToScaledIntrinsic (MFMAOp mfma, Chipset chipset)
static std::optional< std::tuple< StringRef, uint32_t, uint32_t > >	mfmaOpToScaledIntrinsic (ScaledMFMAOp smfma, Chipset chipset)
static std::optional< StringRef >	wmmaOpToIntrinsicRDNA (Type elemSourceType, Type elemBSourceType, Type elemDestType, uint32_t k, bool isRDNA3)
	Returns the rocdl intrinsic corresponding to a WMMA operation wmma for RDNA3/4 architectures.
static std::optional< StringRef >	wmmaOpToIntrinsicGfx1250 (Type elemSourceType, Type elemBSourceType, Type elemDestType, uint32_t k)
	Return the rocdl intrinsic corresponding to a WMMA operation wmma for the gfx1250 architecture.
static std::optional< StringRef >	wmmaOpToIntrinsic (WMMAOp wmma, Chipset chipset)
	Returns the rocdl intrinsic corresponding to a WMMA operation wmma if one exists.

Variables
constexpr Chipset	kGfx908 = Chipset(9, 0, 8)
constexpr Chipset	kGfx90a = Chipset(9, 0, 0xa)
constexpr Chipset	kGfx942 = Chipset(9, 4, 2)
constexpr Chipset	kGfx950 = Chipset(9, 5, 0)
constexpr Chipset	kGfx1250 = Chipset(12, 5, 0)

Macro Definition Documentation

◆ GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS

#define GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS

Definition at line 35 of file AMDGPUToROCDL.cpp.

Function Documentation

◆ castMFMAScaleOperand()

Value castMFMAScaleOperand	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		Value	input )

static

Converts the scaled MFMA operands, scalesA and scalesB, from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.

Specifically:

If input is an i8 value, zero-extend it to i32.
If input is a vector of length 4 and type i8, cast it to i32.

Note that the type of input has already been LLVM type converted: therefore 8-bit and smaller floats are represented as their corresponding iN integers.

Definition at line 667 of file AMDGPUToROCDL.cpp.

References mlir::Value::getType().

◆ convertMFMAVectorOperand()

Value convertMFMAVectorOperand	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		Value	input,
		bool	allowBf16 = true )

static

Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.

Specifically:

If the element type is bfloat16, bitcast it to i16 unless the rocdl intrinsic allows bf16. Newer MFMAs support bf16 operand types; see the IntrinsicsAMDGPU.td file for reference.
If instead we have a quantity wider than 64 bits, use a <N / 4 x i32>, which is what the f8f6f4 intrinsics use.
If input is a vector of N <= 8 bytes, bitcast it to a (N * 8)-bit integer.

Note that the type of input has already been LLVM type converted: therefore 8-bit and smaller floats are represented as their corresponding iN integers.

Definition at line 631 of file AMDGPUToROCDL.cpp.

References mlir::Value::getType().

◆ convertUnsignedToI32()

Value convertUnsignedToI32	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		Value	val )

static

Convert an unsigned number val to i32.

Definition at line 50 of file AMDGPUToROCDL.cpp.

References mlir::Value::getType().

Referenced by getLinearIndexI32().

◆ convertUnsignedToI64()

Value convertUnsignedToI64	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		Value	val )

static

Convert an unsigned number val to i64.

Definition at line 68 of file AMDGPUToROCDL.cpp.

References mlir::Value::getType().

Referenced by getNumRecords().

◆ createI32Constant()

Value createI32Constant	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		int32_t	value )

static

Definition at line 62 of file AMDGPUToROCDL.cpp.

Referenced by mlir::LLVM::composeValue(), mlir::LLVM::decomposeValue(), getLinearIndexI32(), and makeBufferRsrc().

◆ createI64Constant()

Value createI64Constant	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		int64_t	value )

static

Definition at line 80 of file AMDGPUToROCDL.cpp.

Referenced by getNumRecords().

◆ getLinearIndexI32()

Value getLinearIndexI32	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		MemRefDescriptor &	memRefDescriptor,
		ValueRange	indices,
		ArrayRef< int64_t >	strides )

static

Returns the linear index used to access an element in the memref.

Definition at line 86 of file AMDGPUToROCDL.cpp.

References convertUnsignedToI32(), createI32Constant(), indices, and mlir::MemRefDescriptor::stride().

◆ getNumRecords()

Value getNumRecords	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		MemRefType	memrefType,
		MemRefDescriptor &	memrefDescriptor,
		ArrayRef< int64_t >	strides,
		int64_t	elementByteWidth )

static

Compute the contents of the num_records field for a given memref descriptor - that is, the number of bytes that's one element past the greatest possible valid index into the memref.

Definition at line 109 of file AMDGPUToROCDL.cpp.

References convertUnsignedToI64(), createI64Constant(), mlir::MemRefDescriptor::size(), and mlir::MemRefDescriptor::stride().

◆ makeBufferRsrc()

Value makeBufferRsrc	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		Value	basePointer,
		Value	numRecords,
		bool	boundsCheck,
		amdgpu::Chipset	chipset,
		Value	cacheSwizzleStride = nullptr,
		unsigned	addressSpace = 8 )

static

Definition at line 137 of file AMDGPUToROCDL.cpp.

References createI32Constant(), kGfx942, and mlir::amdgpu::Chipset::majorVersion.

◆ mfmaOpToIntrinsic()

std::optional< StringRef > mfmaOpToIntrinsic	(	MFMAOp	mfma,
		Chipset	chipset )

static

Return the rocdl intrinsic corresponding to a MFMA operation mfma if one exists.

This includes checking to ensure the intrinsic is supported on the architecture you are compiling for.

Definition at line 777 of file AMDGPUToROCDL.cpp.

References b, mlir::getElementTypeOrSelf(), mlir::Type::isBF16(), mlir::Type::isF16(), mlir::Type::isF32(), mlir::Type::isF64(), mlir::Type::isInteger(), kGfx90a, kGfx942, kGfx950, typeIsExpectedBf8ForChipset(), and typeIsExpectedFp8ForChipset().

◆ mfmaOpToScaledIntrinsic() [1/3]

std::optional< std::tuple< StringRef, uint32_t, uint32_t > > mfmaOpToScaledIntrinsic	(	MFMAOp	mfma,
		Chipset	chipset )

static

Definition at line 968 of file AMDGPUToROCDL.cpp.

References mfmaOpToScaledIntrinsic().

◆ mfmaOpToScaledIntrinsic() [2/3]

std::optional< std::tuple< StringRef, uint32_t, uint32_t > > mfmaOpToScaledIntrinsic	(	ScaledMFMAOp	smfma,
		Chipset	chipset )

static

Definition at line 976 of file AMDGPUToROCDL.cpp.

References mfmaOpToScaledIntrinsic().

◆ mfmaOpToScaledIntrinsic() [3/3]

std::optional< std::tuple< StringRef, uint32_t, uint32_t > > mfmaOpToScaledIntrinsic	(	Type	aType,
		Type	bType,
		Type	destType,
		uint32_t	m,
		uint32_t	n,
		uint32_t	k,
		uint32_t	b,
		Chipset	chipset )

static

If there is a scaled MFMA instruction for the input element types aType and bType, output type destType, problem size M, N, K, and B (number of blocks) on the given chipset, return a tuple consisting of the OperationName of the intrinsic and the type codes that need to be passed to that intrinsic.

Note that this is also used to implement some un-scaled MFMAs, since the compiler represents the ordinary instruction as a "scaled" MFMA with a scale of 0.

Definition at line 940 of file AMDGPUToROCDL.cpp.

References b, mlir::getElementTypeOrSelf(), kGfx950, and mfmaTypeSelectCode().

Referenced by mfmaOpToScaledIntrinsic(), and mfmaOpToScaledIntrinsic().

◆ mfmaTypeSelectCode()

std::optional< uint32_t > mfmaTypeSelectCode ( Type mlirElemType )

static

Definition at line 922 of file AMDGPUToROCDL.cpp.

Referenced by mfmaOpToScaledIntrinsic().

◆ typeIsExpectedBf8ForChipset()

bool typeIsExpectedBf8ForChipset	(	Chipset	chipset,
		Type	type )

static

Return true if type is the E5M2 variant of an 8-bit float that is supported by the _bf8 instructions on the given chipset.

Definition at line 762 of file AMDGPUToROCDL.cpp.

References mlir::amdgpu::hasOcpFp8(), and kGfx942.

Referenced by mfmaOpToIntrinsic().

◆ typeIsExpectedFp8ForChipset()

bool typeIsExpectedFp8ForChipset	(	Chipset	chipset,
		Type	type )

static

Return true if type is the E4M3FN variant of an 8-bit float that is supported by the _fp8 instructions on the given chipset.

Definition at line 769 of file AMDGPUToROCDL.cpp.

References mlir::amdgpu::hasOcpFp8(), and kGfx942.

Referenced by mfmaOpToIntrinsic().

◆ wmmaOpToIntrinsic()

std::optional< StringRef > wmmaOpToIntrinsic	(	WMMAOp	wmma,
		Chipset	chipset )

static

Returns the rocdl intrinsic corresponding to a WMMA operation wmma if one exists.

This includes checking to ensure the intrinsic is supported on the architecture you are compiling for.

Definition at line 1135 of file AMDGPUToROCDL.cpp.

References kGfx1250, mlir::amdgpu::Chipset::majorVersion, mlir::amdgpu::Chipset::minorVersion, wmmaOpToIntrinsicGfx1250(), and wmmaOpToIntrinsicRDNA().

◆ wmmaOpToIntrinsicGfx1250()

std::optional< StringRef > wmmaOpToIntrinsicGfx1250	(	Type	elemSourceType,
		Type	elemBSourceType,
		Type	elemDestType,
		uint32_t	k )

static

Return the rocdl intrinsic corresponding to a WMMA operation wmma for the gfx1250 architecture.

Definition at line 1042 of file AMDGPUToROCDL.cpp.

References mlir::Type::isBF16(), mlir::Type::isF16(), mlir::Type::isF32(), and mlir::Type::isInteger().

Referenced by wmmaOpToIntrinsic().

◆ wmmaOpToIntrinsicRDNA()

std::optional< StringRef > wmmaOpToIntrinsicRDNA	(	Type	elemSourceType,
		Type	elemBSourceType,
		Type	elemDestType,
		uint32_t	k,
		bool	isRDNA3 )

static

Returns the rocdl intrinsic corresponding to a WMMA operation wmma for RDNA3/4 architectures.

Definition at line 986 of file AMDGPUToROCDL.cpp.

References mlir::Type::isBF16(), mlir::Type::isF16(), mlir::Type::isF32(), and mlir::Type::isInteger().

Referenced by wmmaOpToIntrinsic().

◆ wmmaPushInputOperand()

void wmmaPushInputOperand	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		const TypeConverter *	typeConverter,
		bool	isUnsigned,
		Value	llvmInput,
		Value	mlirInput,
		SmallVectorImpl< Value > &	operands,
		SmallVectorImpl< NamedAttribute > &	attrs,
		StringRef	attrName )

static

Push an input operand.

If it is a float type, there is nothing to do. If it is an integer type, then we also need to push its signedness (1 for signed, 0 for unsigned) and pack the input 16xi8 vector into a 4xi32 vector (or the 8xi8 vector into a 2xi32 one for gfx12+). We also need to convert bfloat inputs to i16 to account for the bfloat intrinsics having been defined before the AMD backend supported bfloat. We similarly need to pack 8-bit float types into integers as if they were i8 (which they are for the backend's purposes).

Definition at line 684 of file AMDGPUToROCDL.cpp.

References mlir::Type::getIntOrFloatBitWidth(), mlir::Value::getType(), mlir::Type::isSignedInteger(), and mlir::Type::isUnsignedInteger().

◆ wmmaPushOutputOperand()

void wmmaPushOutputOperand	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		const TypeConverter *	typeConverter,
		Value	output,
		int32_t	subwordOffset,
		bool	clamp,
		SmallVectorImpl< Value > &	operands,
		SmallVectorImpl< NamedAttribute > &	attrs )

static

Push the output operand.

In many cases this only pushes the output onto the operand list. But when we have f16 -> f16 or bf16 -> bf16 intrinsics, since the same number of VGPRs is used, we need to decide whether to store the result in the upper 16 bits of the VGPRs or in the lower part. To store the result in the lower 16 bits, set subwordOffset to 1; otherwise the result will be stored in the upper part. The subwordOffset must not be set for gfx12, as the instructions have been changed to return fewer registers instead.

Definition at line 742 of file AMDGPUToROCDL.cpp.

References clamp(), mlir::Value::getType(), mlir::Type::isBF16(), mlir::Type::isF16(), and mlir::Type::isInteger().

Variable Documentation

◆ kGfx1250

Chipset kGfx1250 = Chipset(12, 5, 0)

constexpr

Definition at line 47 of file AMDGPUToROCDL.cpp.

Referenced by wmmaOpToIntrinsic().

◆ kGfx908

Chipset kGfx908 = Chipset(9, 0, 8)

constexpr

Definition at line 43 of file AMDGPUToROCDL.cpp.

◆ kGfx90a

Chipset kGfx90a = Chipset(9, 0, 0xa)

constexpr

Definition at line 44 of file AMDGPUToROCDL.cpp.

Referenced by mfmaOpToIntrinsic().

◆ kGfx942

Chipset kGfx942 = Chipset(9, 4, 2)

constexpr

Definition at line 45 of file AMDGPUToROCDL.cpp.

Referenced by isSupportedF8(), makeBufferRsrc(), mfmaOpToIntrinsic(), typeIsExpectedBf8ForChipset(), and typeIsExpectedFp8ForChipset().

◆ kGfx950

Chipset kGfx950 = Chipset(9, 5, 0)

constexpr

Definition at line 46 of file AMDGPUToROCDL.cpp.

Referenced by mfmaOpToIntrinsic(), mfmaOpToScaledIntrinsic(), and mlir::populateGpuPromoteShuffleToAMDGPUPatterns().

Classes

Namespaces

Macros

Functions

Variables

Macro Definition Documentation

◆ GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS

Function Documentation

◆ castMFMAScaleOperand()

◆ convertMFMAVectorOperand()

◆ convertUnsignedToI32()

◆ convertUnsignedToI64()

◆ createI32Constant()

◆ createI64Constant()

◆ getLinearIndexI32()

◆ getNumRecords()

◆ makeBufferRsrc()

◆ mfmaOpToIntrinsic()

◆ mfmaOpToScaledIntrinsic() [1/3]

◆ mfmaOpToScaledIntrinsic() [2/3]

◆ mfmaOpToScaledIntrinsic() [3/3]

◆ mfmaTypeSelectCode()

◆ typeIsExpectedBf8ForChipset()

◆ typeIsExpectedFp8ForChipset()

◆ wmmaOpToIntrinsic()

◆ wmmaOpToIntrinsicGfx1250()

◆ wmmaOpToIntrinsicRDNA()

◆ wmmaPushInputOperand()

◆ wmmaPushOutputOperand()

Variable Documentation

◆ kGfx1250

◆ kGfx908

◆ kGfx90a

◆ kGfx942

◆ kGfx950