MLIR 22.0.0git
MMAUtils.cpp
//===- MMAUtils.cpp - MLIR NVGPU dialect utils for MMA operations----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/NVGPU/Utils/MMAUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"

using namespace mlir;
using namespace mlir::nvgpu;

/// There are always 4 threads per [128|256|512] bit row.
static constexpr int64_t kThreadsPerRow = 4;
static constexpr int64_t kNumRowsPerTile = 8;

static bool isAccumulatorOrResult(MatMulOperandRole operandType) {
  return operandType == MatMulOperandRole::C;
}

/// Returns the number of registers which compose a matrix fragment held by a
/// single thread.
static int64_t inferNumRegistersPerMatrixFragment(const WarpMatrixInfo &type) {
  int64_t lineSize = inferTileWidthInBits(type);
  auto shape = type.vectorType.getShape();
  return (shape[0] / kNumRowsPerTile) *
         (shape[1] * type.vectorType.getElementType().getIntOrFloatBitWidth()) /
         lineSize;
}
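// Illustrative example: for a 16x16 f16 `A` fragment, inferTileWidthInBits
// returns 128, giving (16 / 8) * (16 * 16) / 128 = 4 registers per thread.
// With 2 f16 elements per 32-bit register, the 32 threads of a warp together
// cover all 16 * 16 = 256 elements.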

/// Returns the number of 8 x [128|256|512] bit tiles that compose the given
/// operand shape.
static std::array<int64_t, 2> getTileShape(ArrayRef<int64_t> operandShape,
                                           Type elementType,
                                           int64_t lineSizeBits) {
  // For each 8x128bit square, a thread is responsible for one 32bit register.
  return {operandShape[0] / kNumRowsPerTile,
          (operandShape[1] * elementType.getIntOrFloatBitWidth()) /
              lineSizeBits};
}
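// Illustrative example: a 16x16 f16 operand with lineSizeBits = 128 yields
// {16 / 8, (16 * 16) / 128} = {2, 2}, i.e. a 2x2 grid of 8-row x 128-bit
// tiles.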

/// Returns the first user of the `op` that is vector.contract. If no
/// vector.contract user exists, return failure.
FailureOr<vector::ContractionOp> nvgpu::getUserContract(Operation *op) {
  for (Operation *user : op->getUsers()) {
    if (auto contractOp = dyn_cast<vector::ContractionOp>(user))
      return contractOp;
  }
  return failure();
}

FailureOr<WarpMatrixInfo> nvgpu::getWarpMatrixInfo(Operation *op) {
  WarpMatrixInfo info;

  // Determine the vector type at warp-level.
  if (vector::TransferWriteOp writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    info.vectorType = writeOp.getVectorType();
  } else if (isa<vector::TransferReadOp, vector::ContractionOp,
                 vector::ExtractStridedSliceOp, arith::ConstantOp>(op)) {
    info.vectorType = cast<VectorType>(op->getResult(0).getType());
  } else {
    return op->emitError()
           << "unhandled operation type in nvgpu.mma.sync conversion path";
  }

  // Determine the operand role. We assume it is an accumulator/result unless
  // it is directly consumed by a `vector.contract` op.
  info.operandRole = MatMulOperandRole::C;
  FailureOr<vector::ContractionOp> contractOp = getUserContract(op);
  if (failed(contractOp))
    return info;

  if ((*contractOp).getLhs() == op->getResult(0))
    info.operandRole = MatMulOperandRole::A;
  else if ((*contractOp).getRhs() == op->getResult(0))
    info.operandRole = MatMulOperandRole::B;

  return info;
}
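// For instance, an op whose result feeds the LHS of a vector.contract is
// classified as MatMulOperandRole::A and the RHS as role B; producers of the
// accumulator operand (or ops with no vector.contract user at all) keep the
// default role C.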

int64_t nvgpu::inferTileWidthInBits(const WarpMatrixInfo &type) {
  bool isAcc = isAccumulatorOrResult(type.operandRole);
  Type elType = type.vectorType.getElementType();
  if (isAcc && elType.getIntOrFloatBitWidth() == 32) {
    return 256;
  }
  if (elType.getIntOrFloatBitWidth() == 64) {
    return isAcc ? 512 : 256;
  }
  return 128;
}
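// Summary of the above: 32-bit accumulators use 256-bit rows, 64-bit
// accumulators 512-bit rows, 64-bit A/B operands 256-bit rows, and all other
// element types (f16, i8, i4, f32 operands) the default 128-bit row.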

FailureOr<FragmentElementInfo>
nvgpu::getMmaSyncRegisterType(const WarpMatrixInfo &type) {
  MLIRContext *ctx = type.vectorType.getContext();
  const bool isAccum = isAccumulatorOrResult(type.operandRole);

  Type elType = type.vectorType.getElementType();
  if (elType.isF16()) {
    return FragmentElementInfo{VectorType::get(2, Float16Type::get(ctx)), 2, 32,
                               inferNumRegistersPerMatrixFragment(type)};
  }

  // f64 operand
  Type f64Ty = Float64Type::get(ctx);
  if (elType.isF64()) {
    return isAccum
               ? FragmentElementInfo{VectorType::get(2, f64Ty), 2, 128,
                                     inferNumRegistersPerMatrixFragment(type)}
               : FragmentElementInfo{f64Ty, 1, 64,
                                     inferNumRegistersPerMatrixFragment(type)};
  }

  // int8 operand
  if (elType.isInteger(8)) {
    return FragmentElementInfo{VectorType::get(4, IntegerType::get(ctx, 8)), 4,
                               32, inferNumRegistersPerMatrixFragment(type)};
  }

  // int4 operand
  if (elType.isInteger(4)) {
    return FragmentElementInfo{VectorType::get(8, IntegerType::get(ctx, 4)), 8,
                               32, inferNumRegistersPerMatrixFragment(type)};
  }

  // Integer 32bit acc operands
  if (elType.isInteger(32)) {
    return FragmentElementInfo{VectorType::get(2, IntegerType::get(ctx, 32)), 2,
                               64, inferNumRegistersPerMatrixFragment(type)};
  }

  // Floating point 32bit operands
  if (elType.isF32()) {
    Type f32Ty = Float32Type::get(ctx);
    return isAccum
               ? FragmentElementInfo{VectorType::get(2, f32Ty), 2, 64,
                                     inferNumRegistersPerMatrixFragment(type)}
               : FragmentElementInfo{f32Ty, 1, 32,
                                     inferNumRegistersPerMatrixFragment(type)};
  }
  return failure();
}
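// Note the accumulator/operand split above: e.g. an f32 accumulator fragment
// is modeled as vector<2xf32> chunks occupying 64 bits of register space,
// while an f32 A/B operand is held as scalar f32 values in single 32-bit
// registers; the register count always comes from
// inferNumRegistersPerMatrixFragment.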

static AffineMap getRegisterIndexToTileOffsetMap(int64_t lineSize,
                                                 Type elementType,
                                                 ArrayRef<int64_t> operandShape,
                                                 bool isAccumulator,
                                                 int64_t elementsPerRegister,
                                                 AffineExpr logicalValueId) {
  const int64_t elementsPerLine =
      lineSize / elementType.getIntOrFloatBitWidth();
  const std::array<int64_t, 2> num8x128bTiles =
      getTileShape(operandShape, elementType, lineSize);
  AffineExpr registerIdx = logicalValueId.floorDiv(elementsPerRegister);
  return AffineMap::get(
      2, 0,
      {(registerIdx % num8x128bTiles[0]) * 8,
       (registerIdx.floorDiv(num8x128bTiles[0])) * elementsPerLine},
      elementType.getContext());
}
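// Worked example: for a 16x16 f16 operand, lineSize = 128, so
// elementsPerLine = 8, num8x128bTiles = {2, 2}, and elementsPerRegister = 2.
// Logical value ids 0-1 then map to tile offset (0, 0), ids 2-3 to (8, 0),
// ids 4-5 to (0, 8), and ids 6-7 to (8, 8).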

FailureOr<AffineMap>
nvgpu::getLaneIdAndValueIdToOperandCoord(OpBuilder &builder, Location loc,
                                         const WarpMatrixInfo &fragmentType) {
  Type elementType = fragmentType.vectorType.getElementType();
  ArrayRef<int64_t> operandShape = fragmentType.vectorType.getShape();
  FailureOr<FragmentElementInfo> regInfo = getMmaSyncRegisterType(fragmentType);
  if (failed(regInfo))
    return failure();

  const int64_t elementBitWidth = elementType.getIntOrFloatBitWidth();
  const int64_t elementsPerRegister =
      regInfo->registerWidthBits / elementBitWidth;
  const int64_t lineSize = inferTileWidthInBits(fragmentType);

  AffineExpr laneId, logicalValueIdDim;
  bindDims(builder.getContext(), laneId, logicalValueIdDim);

  // Determine what register logicalValueId corresponds to. Use that as a
  // linear index into the coordinate mapping `index -> (tile row, tile col)`.
  AffineMap registerIndexToTileCoord = getRegisterIndexToTileOffsetMap(
      lineSize, elementType, operandShape,
      isAccumulatorOrResult(fragmentType.operandRole), elementsPerRegister,
      logicalValueIdDim);

  auto makeMap = [&](ArrayRef<AffineExpr> dimExprs) -> AffineMap {
    return AffineMap::get(2, 0, dimExprs, builder.getContext());
  };

  auto tileRow = registerIndexToTileCoord.getResult(0);
  auto tileCol = registerIndexToTileCoord.getResult(1);
  return makeMap({tileRow + laneId.floorDiv(kThreadsPerRow),
                  tileCol + (laneId % kThreadsPerRow) * elementsPerRegister +
                      (logicalValueIdDim % elementsPerRegister)});
}
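// Worked example: continuing the 16x16 f16 operand above, lane 5 with logical
// value id 0 maps to row = 0 + 5 floordiv 4 = 1 and
// col = 0 + (5 mod 4) * 2 + 0 = 2, i.e. that lane's first register starts at
// element (1, 2) of the fragment.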

FailureOr<LdMatrixParams> nvgpu::getLdMatrixParams(const WarpMatrixInfo &type,
                                                   bool transpose) {
  LdMatrixParams params;
  Type elType = type.vectorType.getElementType();
  params.fragmentType = type.vectorType;
  if (type.operandRole == MatMulOperandRole::A ||
      type.operandRole == MatMulOperandRole::C) {
    params.targetLayout = NVVM::MMALayout::row;
  } else {
    params.targetLayout = NVVM::MMALayout::col;
  }
  ArrayRef<int64_t> shape = type.vectorType.getShape();
  params.contiguousDimType = transpose ? vector::IteratorType::parallel
                                       : vector::IteratorType::reduction;

  if (params.contiguousDimType == vector::IteratorType::reduction) {
    params.numTiles = (shape[0] / kNumRowsPerTile) *
                      ((shape[1] * elType.getIntOrFloatBitWidth()) / 128);
  } else {
    params.numTiles = (shape[1] / kNumRowsPerTile) *
                      ((shape[0] * elType.getIntOrFloatBitWidth()) / 128);
  }

  if (params.numTiles == 0)
    return failure();

  return params;
}
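// Illustrative example: a non-transposed 16x16 f16 operand gives numTiles =
// (16 / 8) * ((16 * 16) / 128) = 4, i.e. four 8-row x 128-bit tiles to be
// loaded by the ldmatrix op.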

FailureOr<AffineMap>
nvgpu::getLaneIdToLdMatrixMatrixCoord(OpBuilder &builder, Location loc,
                                      const LdMatrixParams &params) {
  // One thread per 128b row.
  const int bitsPerElement = static_cast<int>(
      params.fragmentType.getElementType().getIntOrFloatBitWidth());
  const int kElementsPer128b = (128 / bitsPerElement);
  ArrayRef<int64_t> operandShape = params.fragmentType.getShape();
  AffineExpr d0 = getAffineDimExpr(0, builder.getContext());

  auto makeMap = [&](ArrayRef<AffineExpr> dimExprs) -> AffineMap {
    return AffineMap::get(1, 0, dimExprs, builder.getContext());
  };

  // Index `idx` in vectorType `operandShape` maps to the strided dimension of
  // the `srcMemref` memory of the LdMatrixOp.
  int idx =
      (params.contiguousDimType == vector::IteratorType::reduction) ? 0 : 1;

  // Affine exprs in the strided and contiguous dimensions encode the
  // coordinate mapping for the element a thread points to for the warp-wide
  // LdMatrixOp.
  AffineExpr strided = d0 % (operandShape[idx]);
  AffineExpr contiguous = d0.floorDiv(operandShape[idx]) * (kElementsPer128b);

  // This case corresponds to row-major matrixA or col-major matrixB or
  // row-major matrixC. This is when the memory layout in `srcMemref`
  // matches the mma.sync hardware vector register operand layout.
  if (params.contiguousDimType == vector::IteratorType::reduction)
    return makeMap({strided, contiguous});

  // This case corresponds to col-major matrixA or row-major matrixB or
  // col-major matrixC. This is when the memory layout in `srcMemref` does not
  // match the mma.sync hardware vector register operand layout.
  if (params.contiguousDimType == vector::IteratorType::parallel)
    return makeMap({contiguous, strided});

  return failure();
}
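// Worked example: for a 16x16 f16 fragment in the non-transposed (reduction)
// case, kElementsPer128b = 8 and operandShape[0] = 16, so lane 17 maps to
// (17 mod 16, (17 floordiv 16) * 8) = (1, 8), the start of the second 128-bit
// segment of row 1.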

bool nvgpu::canLowerToWarpMatrixOperation(vector::TransferReadOp op) {
  if (op.getMask() || op.hasOutOfBoundsDim())
    return false;
  VectorType type = op.getType();
  // The result type should be 2D. Note that it is possible to expand support
  // so that we are robust to extra unit dimensions that failed to fold, but
  // that would significantly increase downstream code complexity in the
  // conversion step. For now, we rely on other patterns to ensure canonical
  // 2D form is used when targeting the `nvgpu.mma.sync` lowering path.
  if (!type.hasStaticShape() || type.getRank() != 2)
    return false;

  // Currently we can't support reads on tensor types because we need stride
  // information to ensure correctness of downstream assumptions. It is
  // possible to enable this if the caller can assert that the tensor will be
  // lowered in a particular manner.
  auto sourceType = dyn_cast<MemRefType>(op.getBase().getType());
  if (!sourceType)
    return false;

  // Check that the last dimension of the read is contiguous. Note that it is
  // possible to expand support for this by scalarizing all the loads during
  // conversion.
  auto [strides, offset] = sourceType.getStridesAndOffset();
  return strides.back() == 1;
}
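// In short, a transfer_read qualifies only if it is unmasked and in-bounds,
// produces a statically shaped 2-D vector, reads from a memref rather than a
// tensor, and that memref's innermost stride is 1.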

bool nvgpu::canLowerToWarpMatrixOperation(vector::TransferWriteOp op) {
  if (op.getMask() || op.hasOutOfBoundsDim() || op.getTransferRank() == 0)
    return false;
  VectorType type = op.getVectorType();
  if (!type.hasStaticShape() || type.getRank() != 2)
    return false;
  // TODO: Currently we rely on lowering to a `vector.store` operation. We
  // could support the transposed write case by lowering to scalarized
  // `memref.store` operations.
  if (!op.getPermutationMap().isMinorIdentity())
    return false;
  // Currently we can't support writes to tensor types because we need stride
  // information to ensure correctness of downstream assumptions.
  auto sourceType = dyn_cast<MemRefType>(op.getBase().getType());
  if (!sourceType)
    return false;

  // Check that the last dimension of the target memref is contiguous. Note
  // that it is possible to expand support for this by scalarizing all the
  // stores during conversion.
  auto [strides, offset] = sourceType.getStridesAndOffset();
  return strides.back() == 1;
}