#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>
#include "mlir/Conversion/Passes.h.inc"

Namespaces
	mlir
	Include the generated interface declarations.

Macros
#define	DEBUG_TYPE "nvgpu-to-nvvm"

#define	DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")

#define	DBGSE() (llvm::dbgs())

#define	GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS

Functions
static Value	truncToI32 (ImplicitLocOpBuilder &b, Value value)
	GPU has 32 bit registers, this function truncates values when larger width is not needed. More...

static Type	inferIntrinsicResultType (Type vectorResultType)
	Returns the type for the intrinsic given the vectorResultType of the `gpu.mma.sync` operation. More...

static Value	convertIntrinsicResult (Location loc, Type intrinsicResultType, Type resultType, Value intrinsicResult, RewriterBase &rewriter)
	Convert the SSA result of the NVVM intrinsic `nvvm.mma.sync` (which is always an LLVM struct) into a fragment that is compatible with the vector type of this operation. More...

static SmallVector< Value >	unpackOperandVector (ImplicitLocOpBuilder &b, Value operand, NVVM::MMATypes operandPtxType)
	The `gpu.mma.sync` converter below expects matrix fragment operands to be given as 2D `vectors` where the rows are 32b or 64b wide. More...

static bool	isMbarrierShared (nvgpu::MBarrierGroupType barrierType)
	Returns whether mbarrier object has shared memory address space. More...

Variables
constexpr int	exclude4LSB = 4
	Number of bits that needs to be excluded when building matrix descriptor for wgmma operations. More...

Macro Definition Documentation

◆ DBGS

#define DBGS ( ) (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")

Definition at line 34 of file NVGPUToNVVM.cpp.

◆ DBGSE

#define DBGSE ( ) (llvm::dbgs())

Definition at line 35 of file NVGPUToNVVM.cpp.

◆ DEBUG_TYPE

#define DEBUG_TYPE "nvgpu-to-nvvm"

Definition at line 33 of file NVGPUToNVVM.cpp.

◆ GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS

#define GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS

Definition at line 38 of file NVGPUToNVVM.cpp.

Function Documentation

◆ convertIntrinsicResult()

static Value convertIntrinsicResult	(	Location	loc,
		Type	intrinsicResultType,
		Type	resultType,
		Value	intrinsicResult,
		RewriterBase &	rewriter
	)

static

Convert the SSA result of the NVVM intrinsic nvvm.mma.sync (which is always an LLVM struct) into a fragment that is compatible with the vector type of this operation.

This involves extracting elements from the struct and inserting them into an LLVM array. These extra data-movement operations should be canonicalized away by the LLVM backend.

Definition at line 99 of file NVGPUToNVVM.cpp.

References mlir::OpBuilder::createOrFold(), mlir::detail::enumerate(), mlir::get(), mlir::Builder::getContext(), mlir::Builder::getF16Type(), mlir::Builder::getF32Type(), mlir::Builder::getF64Type(), mlir::Builder::getI32IntegerAttr(), mlir::Builder::getI32Type(), and mlir::Value::getType().

◆ inferIntrinsicResultType()

static Type inferIntrinsicResultType ( Type vectorResultType )

static

Returns the type for the intrinsic given the vectorResultType of the gpu.mma.sync operation.

Definition at line 60 of file NVGPUToNVVM.cpp.

References mlir::get(), and mlir::Type::getContext().

◆ isMbarrierShared()

static bool isMbarrierShared ( nvgpu::MBarrierGroupType barrierType )

static

Returns whether mbarrier object has shared memory address space.

Definition at line 222 of file NVGPUToNVVM.cpp.

◆ truncToI32()

static Value truncToI32	(	ImplicitLocOpBuilder &	b,
		Value	value
	)

static

GPU has 32 bit registers, this function truncates values when larger width is not needed.

Definition at line 50 of file NVGPUToNVVM.cpp.

References mlir::Builder::getI32Type(), mlir::Type::getIntOrFloatBitWidth(), and mlir::Value::getType().

◆ unpackOperandVector()

static SmallVector<Value> unpackOperandVector	(	ImplicitLocOpBuilder &	b,
		Value	operand,
		NVVM::MMATypes	operandPtxType
	)

static

The gpu.mma.sync converter below expects matrix fragment operands to be given as 2D vectors where the rows are 32b or 64b wide.

The nvvm.mma.sync op expects these argments to be a given in a long list of scalars of certain types. This function helps unpack the vector arguments and cast them to the types expected by nvvm.mma.sync.

Definition at line 175 of file NVGPUToNVVM.cpp.

References mlir::get(), mlir::Builder::getF32Type(), mlir::Builder::getF64Type(), mlir::Builder::getI32Type(), mlir::Builder::getI64IntegerAttr(), mlir::Builder::getI64Type(), mlir::Builder::getI8Type(), mlir::Builder::getIntegerType(), and mlir::Value::getType().

Variable Documentation

◆ exclude4LSB

constexpr int exclude4LSB = 4

constexpr

Number of bits that needs to be excluded when building matrix descriptor for wgmma operations.

Definition at line 46 of file NVGPUToNVVM.cpp.

Namespaces

Macros

Functions

Variables

Macro Definition Documentation

◆ DBGS

◆ DBGSE

◆ DEBUG_TYPE

◆ GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS

Function Documentation

◆ convertIntrinsicResult()

◆ inferIntrinsicResultType()

◆ isMbarrierShared()

◆ truncToI32()

◆ unpackOperandVector()

Variable Documentation

◆ exclude4LSB