MLIR  20.0.0git
Classes | Macros | Functions
NVGPUTransformOps.cpp File Reference
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"

Go to the source code of this file.

Classes

struct  RowColIndexing
 Helper struct to encode a pair of row/column indexings in the form of affine expressions. More...
 
struct  MmaSyncBuilder
 Helper struct to provide a simple mapping from matmul operations to the corresponding mma.sync operation. More...
 
struct  HopperBuilder
 Helper to create the base Hopper-specific operations that are reused in various other places. More...
 
struct  CopyBuilder
 Helper to create the tma operations corresponding to linalg::CopyOp. More...
 

Macros

#define DEBUG_TYPE   "nvgpu-transforms"
 
#define DBGS()   (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
#define DBGSNL()   (llvm::dbgs() << "\n")
 
#define LDBG(X)   LLVM_DEBUG(DBGS() << (X) << "\n")
 
#define GET_OP_LIST
 
#define GET_OP_CLASSES
 

Functions

static bool hasDefaultMemorySpace (BaseMemRefType type)
 Returns true if the given type has the default memory space. More...
 
static bool hasSharedMemorySpace (BaseMemRefType type)
 Returns true if the given type has the shared (workgroup) memory space. More...
 
static Value getValueLoadedFromGlobal (Operation *op)
 Returns the value produced by a load from the default memory space. More...
 
static bool isStoreToShared (Operation *op, Value v)
 Returns true if the operation is storing the given value into shared memory. More...
 
static bool isLoadFromGlobalStoredToShared (Operation *op)
 Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space. More...
 
static LogicalResult collectStage0PipeliningOps (scf::ForOp forOp, llvm::SmallPtrSet< Operation *, 16 > &ops)
 Populate ops with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory. More...
 
static void setAsyncWaitGroupsInFlight (OpBuilder &builder, Operation *op, scf::PipeliningOption::PipelinerPart part, unsigned iteration, unsigned depth)
 Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies. More...
 
static void getPipelineStages (scf::ForOp forOp, std::vector< std::pair< Operation *, unsigned >> &opsWithPipelineStages, unsigned depth, llvm::SmallPtrSetImpl< Operation * > &stage0Ops)
 Hook for the loop pipeliner that populates ops with the stage information as follows: More...
 
static OperationreplaceOpWithPredicatedOp (RewriterBase &rewriter, Operation *op, Value predicate)
 Hook for the loop pipeliner. More...
 
static std::tuple< DiagnosedSilenceableFailure, scf::ForOp > pipelineForSharedCopies (RewriterBase &rewriter, scf::ForOp forOp, int64_t depth, bool epiloguePeeling)
 Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined. More...
 
template<typename ApplyFn , typename ReduceFn >
static void foreachIndividualVectorElement (Value vector, ApplyFn applyFn, ReduceFn reduceFn)
 Helper functions to create customizable load and stores operations. More...
 
static std::tuple< SmallVector< int64_t >, SmallVector< int64_t >, SmallVector< int64_t > > makeVectorShapes (ArrayRef< int64_t > lhs, ArrayRef< int64_t > rhs, ArrayRef< int64_t > res)
 
static Attribute getSharedAddressSpaceAttribute (OpBuilder &b)
 

Macro Definition Documentation

◆ DBGS

#define DBGS ( )    (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

Definition at line 41 of file NVGPUTransformOps.cpp.

◆ DBGSNL

#define DBGSNL ( )    (llvm::dbgs() << "\n")

Definition at line 42 of file NVGPUTransformOps.cpp.

◆ DEBUG_TYPE

#define DEBUG_TYPE   "nvgpu-transforms"

Definition at line 40 of file NVGPUTransformOps.cpp.

◆ GET_OP_CLASSES

#define GET_OP_CLASSES

Definition at line 1160 of file NVGPUTransformOps.cpp.

◆ GET_OP_LIST

#define GET_OP_LIST

◆ LDBG

#define LDBG (   X)    LLVM_DEBUG(DBGS() << (X) << "\n")

Definition at line 43 of file NVGPUTransformOps.cpp.

Function Documentation

◆ collectStage0PipeliningOps()

static LogicalResult collectStage0PipeliningOps ( scf::ForOp  forOp,
llvm::SmallPtrSet< Operation *, 16 > &  ops 
)
static

Populate ops with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory.

Specifically, this collects:

  1. all loads from global memory, both sync and async;
  2. the barriers for async loads.

In particular, barriers are omitted if they do not dominate at least one async load for which there is not yet a barrier.

Definition at line 212 of file NVGPUTransformOps.cpp.

References isLoadFromGlobalStoredToShared().

Referenced by pipelineForSharedCopies().

◆ foreachIndividualVectorElement()

template<typename ApplyFn , typename ReduceFn >
static void foreachIndividualVectorElement ( Value  vector,
ApplyFn  applyFn,
ReduceFn  reduceFn 
)
static

Helper functions to create customizable load and stores operations.

The specific shapes of each MMA instruction are passed via the IndexCalculator callback.

Definition at line 652 of file NVGPUTransformOps.cpp.

References mlir::computeStrides(), mlir::delinearize(), mlir::Value::getType(), and vectorShape().

◆ getPipelineStages()

static void getPipelineStages ( scf::ForOp  forOp,
std::vector< std::pair< Operation *, unsigned >> &  opsWithPipelineStages,
unsigned  depth,
llvm::SmallPtrSetImpl< Operation * > &  stage0Ops 
)
static

Hook for the loop pipeliner that populates ops with the stage information as follows:

  • operations in stage0Ops (typically loads from global memory and related barriers) are at stage 0;
  • operations in the backward slice of any stage0Ops are all at stage 0;
  • other operations are at stage depth;
  • the internal order of the pipelined loop has ops at stage depth first, then those at stage 0, with relative order within each group preserved.

Definition at line 283 of file NVGPUTransformOps.cpp.

References mlir::getBackwardSlice(), mlir::Operation::getBlock(), and options.

Referenced by pipelineForSharedCopies().

◆ getSharedAddressSpaceAttribute()

static Attribute getSharedAddressSpaceAttribute ( OpBuilder b)
static

◆ getValueLoadedFromGlobal()

static Value getValueLoadedFromGlobal ( Operation op)
static

Returns the value produced by a load from the default memory space.

Returns null if the operation is not such a load.

Definition at line 169 of file NVGPUTransformOps.cpp.

References hasDefaultMemorySpace().

Referenced by isLoadFromGlobalStoredToShared().

◆ hasDefaultMemorySpace()

static bool hasDefaultMemorySpace ( BaseMemRefType  type)
static

Returns true if the given type has the default memory space.

Definition at line 155 of file NVGPUTransformOps.cpp.

References mlir::BaseMemRefType::getMemorySpace(), and mlir::BaseMemRefType::getMemorySpaceAsInt().

Referenced by getValueLoadedFromGlobal().

◆ hasSharedMemorySpace()

static bool hasSharedMemorySpace ( BaseMemRefType  type)
static

Returns true if the given type has the shared (workgroup) memory space.

Definition at line 160 of file NVGPUTransformOps.cpp.

References mlir::BaseMemRefType::getMemorySpace().

Referenced by isStoreToShared().

◆ isLoadFromGlobalStoredToShared()

static bool isLoadFromGlobalStoredToShared ( Operation op)
static

Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space.

Definition at line 194 of file NVGPUTransformOps.cpp.

References mlir::Value::getUsers(), getValueLoadedFromGlobal(), mlir::Value::hasOneUse(), and isStoreToShared().

Referenced by collectStage0PipeliningOps().

◆ isStoreToShared()

static bool isStoreToShared ( Operation op,
Value  v 
)
static

Returns true if the operation is storing the given value into shared memory.

Definition at line 182 of file NVGPUTransformOps.cpp.

References hasSharedMemorySpace().

Referenced by isLoadFromGlobalStoredToShared().

◆ makeVectorShapes()

static std::tuple<SmallVector<int64_t>, SmallVector<int64_t>, SmallVector<int64_t> > makeVectorShapes ( ArrayRef< int64_t >  lhs,
ArrayRef< int64_t >  rhs,
ArrayRef< int64_t >  res 
)
static

Definition at line 741 of file NVGPUTransformOps.cpp.

◆ pipelineForSharedCopies()

static std::tuple<DiagnosedSilenceableFailure, scf::ForOp> pipelineForSharedCopies ( RewriterBase rewriter,
scf::ForOp  forOp,
int64_t  depth,
bool  epiloguePeeling 
)
static

Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined.

Doesn't affect other loops. Returns a pair containing the error state and the pipelined op, the latter being null in case of any failure. The error state contains a definite error if the IR has been modified and a silenceable error otherwise.

Definition at line 355 of file NVGPUTransformOps.cpp.

References collectStage0PipeliningOps(), mlir::DiagnosedSilenceableFailure::definiteFailure(), mlir::emitSilenceableFailure(), getPipelineStages(), options, mlir::scf::pipelineForLoop(), replaceOpWithPredicatedOp(), setAsyncWaitGroupsInFlight(), mlir::OpBuilder::setInsertionPoint(), and mlir::DiagnosedSilenceableFailure::success().

◆ replaceOpWithPredicatedOp()

static Operation* replaceOpWithPredicatedOp ( RewriterBase rewriter,
Operation op,
Value  predicate 
)
static

Hook for the loop pipeliner.

Replaces op with a predicated version and returns the resulting operation. Returns the original op if the predication isn't necessary for the given op. Returns null if predication is needed but not supported.

Definition at line 311 of file NVGPUTransformOps.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::isMemoryEffectFree(), and mlir::RewriterBase::replaceOp().

Referenced by pipelineForSharedCopies().

◆ setAsyncWaitGroupsInFlight()

static void setAsyncWaitGroupsInFlight ( OpBuilder builder,
Operation op,
scf::PipeliningOption::PipelinerPart  part,
unsigned  iteration,
unsigned  depth 
)
static

Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies.

Definition at line 249 of file NVGPUTransformOps.cpp.

References mlir::scf::PipeliningOption::Epilogue, mlir::scf::PipeliningOption::Kernel, and mlir::scf::PipeliningOption::Prologue.

Referenced by pipelineForSharedCopies().