#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"

Classes
struct	RowColIndexing
	Helper struct to encode a pair of row/column indexings in the form of affine expressions. More...

struct	MmaSyncBuilder
	Helper struct to provide a simple mapping from matmul operations to the corresponding mma.sync operation. More...

struct	HopperBuilder
	Helper to create the base Hopper-specific operations that are reused in various other places. More...

struct	CopyBuilder
	Helper to create the tma operations corresponding to `linalg::CopyOp`. More...

Macros
#define	DEBUG_TYPE "nvgpu-transforms"

#define	DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

#define	DBGSNL() (llvm::dbgs() << "\n")

#define	LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")

#define	GET_OP_LIST

#define	GET_OP_CLASSES

Functions
static bool	hasDefaultMemorySpace (BaseMemRefType type)
	Returns true if the given type has the default memory space. More...

static bool	hasSharedMemorySpace (BaseMemRefType type)
	Returns true if the given type has the shared (workgroup) memory space. More...

static Value	getValueLoadedFromGlobal (Operation *op)
	Returns the value produced by a load from the default memory space. More...

static bool	isStoreToShared (Operation *op, Value v)
	Returns true if the operation is storing the given value into shared memory. More...

static bool	isLoadFromGlobalStoredToShared (Operation *op)
	Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space. More...

static LogicalResult	collectStage0PipeliningOps (scf::ForOp forOp, llvm::SmallPtrSet< Operation *, 16 > &ops)
	Populate `ops` with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory. More...

static void	setAsyncWaitGroupsInFlight (OpBuilder &builder, Operation *op, scf::PipeliningOption::PipelinerPart part, unsigned iteration, unsigned depth)
	Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies. More...

static void	getPipelineStages (scf::ForOp forOp, std::vector< std::pair< Operation , unsigned >> &opsWithPipelineStages, unsigned depth, llvm::SmallPtrSetImpl< Operation > &stage0Ops)
	Hook for the loop pipeliner that populates `ops` with the stage information as follows: More...

static Operation *	replaceOpWithPredicatedOp (RewriterBase &rewriter, Operation *op, Value predicate)
	Hook for the loop pipeliner. More...

static std::tuple< DiagnosedSilenceableFailure, scf::ForOp >	pipelineForSharedCopies (RewriterBase &rewriter, scf::ForOp forOp, int64_t depth, bool epiloguePeeling)
	Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined. More...

template<typename ApplyFn , typename ReduceFn >
static void	foreachIndividualVectorElement (Value vector, ApplyFn applyFn, ReduceFn reduceFn)
	Helper functions to create customizable load and stores operations. More...

static std::tuple< SmallVector< int64_t >, SmallVector< int64_t >, SmallVector< int64_t > >	makeVectorShapes (ArrayRef< int64_t > lhs, ArrayRef< int64_t > rhs, ArrayRef< int64_t > res)

static Attribute	getSharedAddressSpaceAttribute (OpBuilder &b)

Macro Definition Documentation

◆ DBGS

#define DBGS ( ) (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

Definition at line 41 of file NVGPUTransformOps.cpp.

◆ DBGSNL

#define DBGSNL ( ) (llvm::dbgs() << "\n")

Definition at line 42 of file NVGPUTransformOps.cpp.

◆ DEBUG_TYPE

#define DEBUG_TYPE "nvgpu-transforms"

Definition at line 40 of file NVGPUTransformOps.cpp.

◆ GET_OP_CLASSES

#define GET_OP_CLASSES

Definition at line 1163 of file NVGPUTransformOps.cpp.

◆ GET_OP_LIST

#define GET_OP_LIST

◆ LDBG

#define LDBG ( X ) LLVM_DEBUG(DBGS() << (X) << "\n")

Definition at line 43 of file NVGPUTransformOps.cpp.

Function Documentation

◆ collectStage0PipeliningOps()

static LogicalResult collectStage0PipeliningOps	(	scf::ForOp	forOp,
		llvm::SmallPtrSet< Operation *, 16 > &	ops
	)

static

Populate ops with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory.

Specifically, this collects:

all loads from global memory, both sync and async;
the barriers for async loads.

In particular, barriers are omitted if they do not dominate at least one async load for which there is not yet a barrier.

Definition at line 212 of file NVGPUTransformOps.cpp.

References isLoadFromGlobalStoredToShared().

Referenced by pipelineForSharedCopies().

◆ foreachIndividualVectorElement()

template<typename ApplyFn , typename ReduceFn >

static void foreachIndividualVectorElement	(	Value	vector,
		ApplyFn	applyFn,
		ReduceFn	reduceFn
	)

static

Helper functions to create customizable load and stores operations.

The specific shapes of each MMA instruction are passed via the IndexCalculator callback.

Definition at line 655 of file NVGPUTransformOps.cpp.

References mlir::computeStrides(), mlir::delinearize(), mlir::Value::getType(), and vectorShape().

◆ getPipelineStages()

static void getPipelineStages	(	scf::ForOp	forOp,
		std::vector< std::pair< Operation *, unsigned >> &	opsWithPipelineStages,
		unsigned	depth,
		llvm::SmallPtrSetImpl< Operation * > &	stage0Ops
	)

static

Hook for the loop pipeliner that populates ops with the stage information as follows:

operations in stage0Ops (typically loads from global memory and related barriers) are at stage 0;
operations in the backward slice of any stage0Ops are all at stage 0;
other operations are at stage depth;
the internal order of the pipelined loop has ops at stage depth first, then those at stage 0, with relative order within each group preserved.

Definition at line 283 of file NVGPUTransformOps.cpp.

References mlir::getBackwardSlice(), mlir::Operation::getBlock(), and options.

Referenced by pipelineForSharedCopies().

◆ getSharedAddressSpaceAttribute()

static Attribute getSharedAddressSpaceAttribute ( OpBuilder & b )

static

Definition at line 933 of file NVGPUTransformOps.cpp.

References mlir::get(), and mlir::Builder::getContext().

Referenced by HopperBuilder::buildAndInitBarrierInSharedMemory(), and HopperBuilder::buildGlobalMemRefDescriptor().

◆ getValueLoadedFromGlobal()

static Value getValueLoadedFromGlobal ( Operation * op )

static

Returns the value produced by a load from the default memory space.

Returns null if the operation is not such a load.

Definition at line 169 of file NVGPUTransformOps.cpp.

References hasDefaultMemorySpace().

Referenced by isLoadFromGlobalStoredToShared().

◆ hasDefaultMemorySpace()

static bool hasDefaultMemorySpace ( BaseMemRefType type )

static

Returns true if the given type has the default memory space.

Definition at line 155 of file NVGPUTransformOps.cpp.

References mlir::BaseMemRefType::getMemorySpace(), and mlir::BaseMemRefType::getMemorySpaceAsInt().

Referenced by getValueLoadedFromGlobal().

◆ hasSharedMemorySpace()

static bool hasSharedMemorySpace ( BaseMemRefType type )

static

Returns true if the given type has the shared (workgroup) memory space.

Definition at line 160 of file NVGPUTransformOps.cpp.

References mlir::BaseMemRefType::getMemorySpace().

Referenced by isStoreToShared().

◆ isLoadFromGlobalStoredToShared()

static bool isLoadFromGlobalStoredToShared ( Operation * op )

static

Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space.

Definition at line 194 of file NVGPUTransformOps.cpp.

References mlir::Value::getUsers(), getValueLoadedFromGlobal(), mlir::Value::hasOneUse(), and isStoreToShared().

Referenced by collectStage0PipeliningOps().

◆ isStoreToShared()

static bool isStoreToShared	(	Operation *	op,
		Value	v
	)

static

Returns true if the operation is storing the given value into shared memory.

Definition at line 182 of file NVGPUTransformOps.cpp.

References hasSharedMemorySpace().

Referenced by isLoadFromGlobalStoredToShared().

◆ makeVectorShapes()

static std::tuple<SmallVector<int64_t>, SmallVector<int64_t>, SmallVector<int64_t> > makeVectorShapes	(	ArrayRef< int64_t >	lhs,
		ArrayRef< int64_t >	rhs,
		ArrayRef< int64_t >	res
	)

static

Definition at line 744 of file NVGPUTransformOps.cpp.

◆ pipelineForSharedCopies()

static std::tuple<DiagnosedSilenceableFailure, scf::ForOp> pipelineForSharedCopies	(	RewriterBase &	rewriter,
		scf::ForOp	forOp,
		int64_t	depth,
		bool	epiloguePeeling
	)

static

Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined.

Doesn't affect other loops. Returns a pair containing the error state and the pipelined op, the latter being null in case of any failure. The error state contains a definite error if the IR has been modified and a silenceable error otherwise.

Definition at line 358 of file NVGPUTransformOps.cpp.

References collectStage0PipeliningOps(), mlir::DiagnosedSilenceableFailure::definiteFailure(), mlir::emitSilenceableFailure(), getPipelineStages(), options, mlir::scf::pipelineForLoop(), replaceOpWithPredicatedOp(), setAsyncWaitGroupsInFlight(), mlir::OpBuilder::setInsertionPoint(), and mlir::DiagnosedSilenceableFailure::success().

◆ replaceOpWithPredicatedOp()

static Operation* replaceOpWithPredicatedOp	(	RewriterBase &	rewriter,
		Operation *	op,
		Value	predicate
	)

static

Hook for the loop pipeliner.

Replaces op with a predicated version and returns the resulting operation. Returns the original op if the predication isn't necessary for the given op. Returns null if predication is needed but not supported.

Definition at line 314 of file NVGPUTransformOps.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::isMemoryEffectFree(), and mlir::RewriterBase::replaceOp().

Referenced by pipelineForSharedCopies().

◆ setAsyncWaitGroupsInFlight()

static void setAsyncWaitGroupsInFlight	(	OpBuilder &	builder,
		Operation *	op,
		scf::PipeliningOption::PipelinerPart	part,
		unsigned	iteration,
		unsigned	depth
	)

static

Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies.

Definition at line 249 of file NVGPUTransformOps.cpp.

References mlir::scf::PipeliningOption::Epilogue, mlir::scf::PipeliningOption::Kernel, and mlir::scf::PipeliningOption::Prologue.

Referenced by pipelineForSharedCopies().

Classes

Macros

Functions

Macro Definition Documentation

◆ DBGS

◆ DBGSNL

◆ DEBUG_TYPE

◆ GET_OP_CLASSES

◆ GET_OP_LIST

◆ LDBG

Function Documentation

◆ collectStage0PipeliningOps()

◆ foreachIndividualVectorElement()

◆ getPipelineStages()

◆ getSharedAddressSpaceAttribute()

◆ getValueLoadedFromGlobal()

◆ hasDefaultMemorySpace()

◆ hasSharedMemorySpace()

◆ isLoadFromGlobalStoredToShared()

◆ isStoreToShared()

◆ makeVectorShapes()

◆ pipelineForSharedCopies()

◆ replaceOpWithPredicatedOp()

◆ setAsyncWaitGroupsInFlight()