MLIR
20.0.0git
|
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"
Go to the source code of this file.
Classes | |
struct | RowColIndexing |
Helper struct to encode a pair of row/column indexings in the form of affine expressions. More... | |
struct | MmaSyncBuilder |
Helper struct to provide a simple mapping from matmul operations to the corresponding mma.sync operation. More... | |
struct | HopperBuilder |
Helper to create the base Hopper-specific operations that are reused in various other places. More... | |
struct | CopyBuilder |
Helper to create the tma operations corresponding to linalg::CopyOp . More... | |
Macros | |
#define | DEBUG_TYPE "nvgpu-transforms" |
#define | DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
#define | DBGSNL() (llvm::dbgs() << "\n") |
#define | LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n") |
#define | GET_OP_LIST |
#define | GET_OP_CLASSES |
Functions | |
static bool | hasDefaultMemorySpace (BaseMemRefType type) |
Returns true if the given type has the default memory space. More... | |
static bool | hasSharedMemorySpace (BaseMemRefType type) |
Returns true if the given type has the shared (workgroup) memory space. More... | |
static Value | getValueLoadedFromGlobal (Operation *op) |
Returns the value produced by a load from the default memory space. More... | |
static bool | isStoreToShared (Operation *op, Value v) |
Returns true if the operation is storing the given value into shared memory. More... | |
static bool | isLoadFromGlobalStoredToShared (Operation *op) |
Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space. More... | |
static LogicalResult | collectStage0PipeliningOps (scf::ForOp forOp, llvm::SmallPtrSet< Operation *, 16 > &ops) |
Populate ops with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory. More... | |
static void | setAsyncWaitGroupsInFlight (OpBuilder &builder, Operation *op, scf::PipeliningOption::PipelinerPart part, unsigned iteration, unsigned depth) |
Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies. More... | |
static void | getPipelineStages (scf::ForOp forOp, std::vector< std::pair< Operation *, unsigned >> &opsWithPipelineStages, unsigned depth, llvm::SmallPtrSetImpl< Operation * > &stage0Ops) |
Hook for the loop pipeliner that populates ops with the stage information as follows: More... | |
static Operation * | replaceOpWithPredicatedOp (RewriterBase &rewriter, Operation *op, Value predicate) |
Hook for the loop pipeliner. More... | |
static std::tuple< DiagnosedSilenceableFailure, scf::ForOp > | pipelineForSharedCopies (RewriterBase &rewriter, scf::ForOp forOp, int64_t depth, bool epiloguePeeling) |
Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined. More... | |
template<typename ApplyFn , typename ReduceFn > | |
static void | foreachIndividualVectorElement (Value vector, ApplyFn applyFn, ReduceFn reduceFn) |
Helper functions to create customizable load and stores operations. More... | |
static std::tuple< SmallVector< int64_t >, SmallVector< int64_t >, SmallVector< int64_t > > | makeVectorShapes (ArrayRef< int64_t > lhs, ArrayRef< int64_t > rhs, ArrayRef< int64_t > res) |
static Attribute | getSharedAddressSpaceAttribute (OpBuilder &b) |
#define DBGS | ( | ) | (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
Definition at line 41 of file NVGPUTransformOps.cpp.
#define DBGSNL | ( | ) | (llvm::dbgs() << "\n") |
Definition at line 42 of file NVGPUTransformOps.cpp.
#define DEBUG_TYPE "nvgpu-transforms" |
Definition at line 40 of file NVGPUTransformOps.cpp.
#define GET_OP_CLASSES |
Definition at line 1160 of file NVGPUTransformOps.cpp.
#define GET_OP_LIST |
#define LDBG | ( | X | ) | LLVM_DEBUG(DBGS() << (X) << "\n") |
Definition at line 43 of file NVGPUTransformOps.cpp.
|
static |
Populate ops
with the set of operations that belong to the stage 0 of the pipelined version of the given loop when pipelining copies to shared memory.
Specifically, this collects:
In particular, barriers are omitted if they do not dominate at least one async load for which there is not yet a barrier.
Definition at line 212 of file NVGPUTransformOps.cpp.
References isLoadFromGlobalStoredToShared().
Referenced by pipelineForSharedCopies().
|
static |
Helper functions to create customizable load and stores operations.
The specific shapes of each MMA instruction are passed via the IndexCalculator callback.
Definition at line 652 of file NVGPUTransformOps.cpp.
References mlir::computeStrides(), mlir::delinearize(), mlir::Value::getType(), and vectorShape().
|
static |
Hook for the loop pipeliner that populates ops
with the stage information as follows:
stage0Ops
(typically loads from global memory and related barriers) are at stage 0;depth
;depth
first, then those at stage 0, with relative order within each group preserved. Definition at line 283 of file NVGPUTransformOps.cpp.
References mlir::getBackwardSlice(), mlir::Operation::getBlock(), and options.
Referenced by pipelineForSharedCopies().
Definition at line 930 of file NVGPUTransformOps.cpp.
References mlir::get(), and mlir::Builder::getContext().
Referenced by HopperBuilder::buildAndInitBarrierInSharedMemory(), and HopperBuilder::buildGlobalMemRefDescriptor().
Returns the value produced by a load from the default memory space.
Returns null if the operation is not such a load.
Definition at line 169 of file NVGPUTransformOps.cpp.
References hasDefaultMemorySpace().
Referenced by isLoadFromGlobalStoredToShared().
|
static |
Returns true if the given type has the default memory space.
Definition at line 155 of file NVGPUTransformOps.cpp.
References mlir::BaseMemRefType::getMemorySpace(), and mlir::BaseMemRefType::getMemorySpaceAsInt().
Referenced by getValueLoadedFromGlobal().
|
static |
Returns true if the given type has the shared (workgroup) memory space.
Definition at line 160 of file NVGPUTransformOps.cpp.
References mlir::BaseMemRefType::getMemorySpace().
Referenced by isStoreToShared().
|
static |
Returns true if the operation is a load from the default memory space the result of which is only stored into the shared memory space.
Definition at line 194 of file NVGPUTransformOps.cpp.
References mlir::Value::getUsers(), getValueLoadedFromGlobal(), mlir::Value::hasOneUse(), and isStoreToShared().
Referenced by collectStage0PipeliningOps().
Returns true if the operation is storing the given value into shared memory.
Definition at line 182 of file NVGPUTransformOps.cpp.
References hasSharedMemorySpace().
Referenced by isLoadFromGlobalStoredToShared().
|
static |
Definition at line 741 of file NVGPUTransformOps.cpp.
|
static |
Applies loop pipelining with the given depth to the given loop so that copies into the shared memory are pipelined.
Doesn't affect other loops. Returns a pair containing the error state and the pipelined op, the latter being null in case of any failure. The error state contains a definite error if the IR has been modified and a silenceable error otherwise.
Definition at line 355 of file NVGPUTransformOps.cpp.
References collectStage0PipeliningOps(), mlir::DiagnosedSilenceableFailure::definiteFailure(), mlir::emitSilenceableFailure(), getPipelineStages(), options, mlir::scf::pipelineForLoop(), replaceOpWithPredicatedOp(), setAsyncWaitGroupsInFlight(), mlir::OpBuilder::setInsertionPoint(), and mlir::DiagnosedSilenceableFailure::success().
|
static |
Hook for the loop pipeliner.
Replaces op with a predicated version and returns the resulting operation. Returns the original op if the predication isn't necessary for the given op. Returns null if predication is needed but not supported.
Definition at line 311 of file NVGPUTransformOps.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::isMemoryEffectFree(), and mlir::RewriterBase::replaceOp().
Referenced by pipelineForSharedCopies().
|
static |
Hook for the loop pipeliner that sets the "num groups in flight" attribute of async wait operations corresponding to pipelined shared memory copies.
Definition at line 249 of file NVGPUTransformOps.cpp.
References mlir::scf::PipeliningOption::Epilogue, mlir::scf::PipeliningOption::Kernel, and mlir::scf::PipeliningOption::Prologue.
Referenced by pipelineForSharedCopies().