NVGPUTransformOps.cpp
//===- NVGPUTransformOps.cpp - Implementation of NVGPU transform ops ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"

using namespace mlir;
using namespace mlir::linalg;
using namespace mlir::nvgpu;
using namespace mlir::NVVM;
using namespace mlir::transform;

#define DEBUG_TYPE "nvgpu-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define DBGSNL() (llvm::dbgs() << "\n")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  /// device-side async tokens cannot be materialized in nvvm. We just
  /// convert them to a dummy i32 type in order to easily drop them during
  /// conversion.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](gpu::AddressSpace space) -> unsigned {
        switch (space) {
        case gpu::AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case gpu::AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case gpu::AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
        return 0;
      });
  llvmTypeConverter.addConversion(
      [&](nvgpu::DeviceAsyncTokenType type) -> Type {
        return llvmTypeConverter.convertType(
            IntegerType::get(type.getContext(), 32));
      });
  llvmTypeConverter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
    return llvmTypeConverter.convertType(
        IntegerType::get(type.getContext(), 64));
  });
  llvmTypeConverter.addConversion(
      [&](nvgpu::WarpgroupAccumulatorType type) -> Type {
        Type elemType = type.getFragmented().getElementType();
        int64_t sizeM = type.getFragmented().getDimSize(0);
        int64_t sizeN = type.getFragmented().getDimSize(1);

        unsigned numMembers;
        if (elemType.isF32() || elemType.isInteger(32))
          numMembers = sizeN / 2;
        else if (elemType.isF16())
          numMembers = sizeN / 4;
        else
          llvm_unreachable("unsupported type for warpgroup accumulator");

        SmallVector<Type> innerStructBody;
        for (unsigned i = 0; i < numMembers; i++)
          innerStructBody.push_back(elemType);
        auto innerStructType = LLVM::LLVMStructType::getLiteral(
            type.getContext(), innerStructBody);

        SmallVector<Type> structBody;
        for (int i = 0; i < sizeM; i += kWgmmaSizeM)
          structBody.push_back(innerStructType);

        auto convertedType =
            LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
        return llvmTypeConverter.convertType(convertedType);
      });
  llvmTypeConverter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type {
    return llvmTypeConverter.convertType(
        getMBarrierMemrefType(type.getContext(), type));
  });
  llvmTypeConverter.addConversion(
      [&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type {
        return llvmTypeConverter.convertType(
            IntegerType::get(type.getContext(), 64));
      });
  llvmTypeConverter.addConversion(
      [&](nvgpu::TensorMapDescriptorType type) -> Type {
        return LLVM::LLVMPointerType::get(type.getContext());
      });
  populateNVGPUToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
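
// Worked example (illustrative note, not part of the upstream file): with
// kWgmmaSizeM == 64, a WarpgroupAccumulatorType fragmented as
// vector<128x64xf32> has sizeM = 128, sizeN = 64 and f32 elements, so
// numMembers = 64 / 2 = 32. The conversion therefore produces an outer LLVM
// struct of 128 / 64 = 2 inner literal structs, each holding 32 f32 members.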

LogicalResult
transform::ApplyNVGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

//===---------------------------------------------------------------------===//
// CreateAsyncGroupsOp
//===---------------------------------------------------------------------===//

void transform::CreateAsyncGroupsOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  transform::consumesHandle(getTarget(), effects);
  transform::producesHandle(getResult(), effects);
  transform::modifiesPayload(effects);
}

DiagnosedSilenceableFailure transform::CreateAsyncGroupsOp::applyToOne(
    TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, TransformState &state) {
  nvgpu::createAsyncGroups(rewriter, target, getBypassL1());
  results.push_back(target);
  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// PipelineSharedMemoryCopiesOp
//===----------------------------------------------------------------------===//

/// Returns true if the given type has the default memory space.
static bool hasDefaultMemorySpace(BaseMemRefType type) {
  return !type.getMemorySpace() || type.getMemorySpaceAsInt() == 0;
}

/// Returns true if the given type has the shared (workgroup) memory space.
static bool hasSharedMemorySpace(BaseMemRefType type) {
  auto space =
      dyn_cast_if_present<gpu::AddressSpaceAttr>(type.getMemorySpace());
  return space &&
         space.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace();
}

/// Returns the value produced by a load from the default memory space. Returns
/// null if the operation is not such a load.
static Value getValueLoadedFromGlobal(Operation *op) {
  // TODO: consider an interface or leveraging the memory effects interface.
  auto load = dyn_cast<vector::TransferReadOp>(op);
  if (!load)
    return nullptr;

  auto loadType = dyn_cast<MemRefType>(load.getSource().getType());
  if (!loadType || !hasDefaultMemorySpace(loadType))
    return nullptr;
  return load;
}

/// Returns true if the operation is storing the given value into shared memory.
static bool isStoreToShared(Operation *op, Value v) {
  // TODO: consider an interface or leveraging the memory effects interface.
  auto store = dyn_cast<vector::TransferWriteOp>(op);
  if (!store || store.getVector() != v)
    return false;

  auto storeType = dyn_cast<MemRefType>(store.getSource().getType());
  return storeType && hasSharedMemorySpace(storeType);
}

/// Returns true if the operation is a load from the default memory space the
/// result of which is only stored into the shared memory space.
static bool isLoadFromGlobalStoredToShared(Operation *op) {
  Value loaded = getValueLoadedFromGlobal(op);
  if (!loaded || !loaded.hasOneUse())
    return false;

  return isStoreToShared(*loaded.getUsers().begin(), loaded);
}

/// Populate `ops` with the set of operations that belong to the stage 0 of the
/// pipelined version of the given loop when pipelining copies to shared memory.
/// Specifically, this collects:
///
///   1. all loads from global memory, both sync and async;
///   2. the barriers for async loads.
///
/// In particular, barriers are omitted if they do not dominate at least one
/// async load for which there is not yet a barrier.
static LogicalResult
collectStage0PipeliningOps(scf::ForOp forOp,
                           llvm::SmallPtrSet<Operation *, 16> &ops) {

  llvm::SmallPtrSet<Operation *, 16> barriers;
  for (Operation &op : *forOp.getBody()) {
    // Bail on nested ops for now.
    if (op.getNumRegions() > 0)
      return failure();

    if (isa<gpu::BarrierOp>(op)) {
      barriers.insert(&op);
      continue;
    }

    if (isa<nvgpu::DeviceAsyncCopyOp, nvgpu::DeviceAsyncCreateGroupOp>(op)) {
      ops.insert(&op);
      ops.insert(std::make_move_iterator(barriers.begin()),
                 std::make_move_iterator(barriers.end()));
      assert(barriers.empty() &&
             "expected to have moved the barriers into another set");
      continue;
    }

    if (isLoadFromGlobalStoredToShared(&op)) {
      ops.insert(&op);
      continue;
    }
  }

  return success();
}

/// Hook for the loop pipeliner that sets the "num groups in flight" attribute
/// of async wait operations corresponding to pipelined shared memory copies.
// TODO: this currently assumes that there are no groups that could be in flight
// in the existing code.
static void
setAsyncWaitGroupsInFlight(OpBuilder &builder, Operation *op,
                           scf::PipeliningOption::PipelinerPart part,
                           unsigned iteration, unsigned depth) {
  // Based on the order of copies within the loop we need to set the number
  // of copies in flight, unless it is already set.
  auto waitOp = dyn_cast<nvgpu::DeviceAsyncWaitOp>(op);
  if (!waitOp || waitOp.getNumGroups())
    return;

  int numGroupInFlight = 0;
  if (part == scf::PipeliningOption::PipelinerPart::Kernel ||
      part == scf::PipeliningOption::PipelinerPart::Prologue) {
    numGroupInFlight = depth - 1;
  } else {
    // By construction there should be no wait op in the prologue as all the
    // wait should be in the last stage.
    // Based on the schedule we pick we know how many groups are in flight for
    // each iteration of the epilogue.
    numGroupInFlight = depth - 1 - iteration;
  }
  waitOp.setNumGroups(numGroupInFlight);
}
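
// Worked example (illustrative note, not part of the upstream file): with
// depth == 3, a nvgpu.device_async_wait in the kernel or prologue is tagged
// with numGroups = 3 - 1 = 2, while epilogue iterations 0 and 1 get
// 3 - 1 - 0 = 2 and 3 - 1 - 1 = 1 respectively, draining the in-flight copy
// groups as the loop winds down.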

/// Hook for the loop pipeliner that populates `ops` with the stage information
/// as follows:
///
///   - operations in `stage0Ops` (typically loads from global memory and
///     related barriers) are at stage 0;
///   - operations in the backward slice of any stage0Ops are all at stage 0;
///   - other operations are at stage `depth`;
///   - the internal order of the pipelined loop has ops at stage `depth` first,
///     then those at stage 0, with relative order within each group preserved.
///
static void getPipelineStages(
    scf::ForOp forOp,
    std::vector<std::pair<Operation *, unsigned>> &opsWithPipelineStages,
    unsigned depth, llvm::SmallPtrSetImpl<Operation *> &stage0Ops) {
  SetVector<Operation *> dependencies;
  BackwardSliceOptions options([&](Operation *visited) {
    return visited->getBlock() == forOp.getBody();
  });
  options.inclusive = true;
  for (Operation &op : forOp.getBody()->getOperations()) {
    if (stage0Ops.contains(&op))
      getBackwardSlice(&op, &dependencies, options);
  }

  for (Operation &op : forOp.getBody()->getOperations()) {
    if (!dependencies.contains(&op) && !isa<scf::YieldOp>(op))
      opsWithPipelineStages.emplace_back(&op, depth);
  }
  for (Operation &op : forOp.getBody()->getOperations()) {
    if (dependencies.contains(&op))
      opsWithPipelineStages.emplace_back(&op, 0);
  }
}
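
// Worked example (illustrative note, not part of the upstream file): with
// depth == 3 and a loop body {%i = affine.apply ..., async copy using %i,
// compute, yield}, the async copy is a stage-0 op, %i sits in its backward
// slice and is therefore also assigned stage 0, the compute op is assigned
// stage 3, and the emitted order lists the stage-3 ops before the stage-0 ops.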

/// Hook for the loop pipeliner. Replaces op with a predicated version and
/// returns the resulting operation. Returns the original op if the predication
/// isn't necessary for the given op. Returns null if predication is needed but
/// not supported.
static Operation *replaceOpWithPredicatedOp(RewriterBase &rewriter,
                                            Operation *op, Value predicate) {
  // Some operations may be fine to execute "speculatively" more times than the
  // original number of iterations, in particular side-effect free operations
  // and barriers, even if they cannot be predicated.
  if (isMemoryEffectFree(op) ||
      isa<gpu::BarrierOp, nvgpu::DeviceAsyncCreateGroupOp,
          nvgpu::DeviceAsyncWaitOp>(op)) {
    return op;
  }

  // Otherwise, only async copies can currently be predicated.
  auto asyncCopyOp = dyn_cast<nvgpu::DeviceAsyncCopyOp>(op);
  if (!asyncCopyOp)
    return nullptr;

  // Create srcElement Value based on `predicate`. The next lines generate
  // the following code:
  //
  //   srcElement = (pred) ? prevSrcElements : 0;
  //
  Location loc = asyncCopyOp->getLoc();
  Value dstElements =
      rewriter.create<arith::ConstantOp>(loc, asyncCopyOp.getDstElementsAttr());
  Value originalSrcElement =
      asyncCopyOp.getSrcElements() ? asyncCopyOp.getSrcElements() : dstElements;
  Value c0Index = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  auto srcElements = rewriter.create<arith::SelectOp>(
      loc, predicate, originalSrcElement, c0Index);
  auto asyncCopyZeroFillOp = rewriter.create<nvgpu::DeviceAsyncCopyOp>(
      loc, nvgpu::DeviceAsyncTokenType::get(asyncCopyOp.getContext()),
      asyncCopyOp.getDst(), asyncCopyOp.getDstIndices(), asyncCopyOp.getSrc(),
      asyncCopyOp.getSrcIndices(), asyncCopyOp.getDstElements(), srcElements,
      UnitAttr());
  rewriter.replaceOp(asyncCopyOp, asyncCopyZeroFillOp);
  return asyncCopyZeroFillOp;
}

/// Applies loop pipelining with the given depth to the given loop so that
/// copies into the shared memory are pipelined. Doesn't affect other loops.
/// Returns a pair containing the error state and the pipelined op, the latter
/// being null in case of any failure. The error state contains a definite error
/// if the IR has been modified and a silenceable error otherwise.
static std::tuple<DiagnosedSilenceableFailure, scf::ForOp>
pipelineForSharedCopies(RewriterBase &rewriter, scf::ForOp forOp, int64_t depth,
                        bool epiloguePeeling) {
  llvm::SmallPtrSet<Operation *, 16> stage0Ops;
  if (failed(collectStage0PipeliningOps(forOp, stage0Ops))) {
    return std::make_tuple(
        emitSilenceableFailure(forOp, "cannot find stage 0 ops for pipelining"),
        scf::ForOp());
  }
  if (stage0Ops.empty()) {
    return std::make_tuple(
        emitSilenceableFailure(forOp, "no shared memory copy"), scf::ForOp());
  }

  scf::PipeliningOption options;
  unsigned maxDepth = depth;
  auto setAnnotation = [&](Operation *op,
                           scf::PipeliningOption::PipelinerPart part,
                           unsigned iteration) {
    return setAsyncWaitGroupsInFlight(rewriter, op, part, iteration, maxDepth);
  };
  options.getScheduleFn =
      [&](scf::ForOp schedulingFor,
          std::vector<std::pair<Operation *, unsigned>> &ops) {
        if (schedulingFor != forOp)
          return;
        return getPipelineStages(forOp, ops, maxDepth, stage0Ops);
      };
  options.annotateFn = setAnnotation;
  if (!epiloguePeeling) {
    options.peelEpilogue = false;
    options.predicateFn = replaceOpWithPredicatedOp;
  }

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forOp);
  bool modifiedIR;
  FailureOr<scf::ForOp> maybePipelined =
      pipelineForLoop(rewriter, forOp, options, &modifiedIR);
  if (succeeded(maybePipelined)) {
    return std::make_tuple(DiagnosedSilenceableFailure::success(),
                           *maybePipelined);
  }
  return std::make_tuple(
      modifiedIR
          ? DiagnosedSilenceableFailure::definiteFailure()
          : emitSilenceableFailure(forOp, "pipelining preconditions failed"),
      scf::ForOp());
}

DiagnosedSilenceableFailure PipelineSharedMemoryCopiesOp::applyToOne(
    TransformRewriter &rewriter, scf::ForOp forOp,
    ApplyToEachResultList &results, TransformState &state) {
  auto [diag, pipelined] = pipelineForSharedCopies(
      rewriter, forOp, static_cast<int64_t>(getDepth()), getPeelEpilogue());
  if (diag.succeeded()) {
    results.push_back(pipelined);
    return DiagnosedSilenceableFailure::success();
  }
  if (diag.isDefiniteFailure()) {
    auto diag = emitDefiniteFailure("irreversible pipelining failure");
    if (!getPeelEpilogue()) {
      diag.attachNote(forOp->getLoc()) << "couldn't predicate?";
      diag.attachNote(getLoc()) << "try setting " << getPeelEpilogueAttrName();
    }
    return diag;
  }

  return std::move(diag);
}

//===----------------------------------------------------------------------===//
// RewriteMatmulAsMmaSyncOp
//===----------------------------------------------------------------------===//

/// Helper struct to encode a pair of row/column indexings in the form of
/// affine expressions.
struct RowColIndexing : private std::pair<AffineExpr, AffineExpr> {
  RowColIndexing(AffineExpr row, AffineExpr col)
      : std::pair<AffineExpr, AffineExpr>(row, col) {}

  AffineExpr row() const { return first; };
  AffineExpr col() const { return second; };

  void print(llvm::raw_ostream &os) const {
    os << "- indexing: " << first << ", " << second;
  }
};
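
// Worked example (illustrative note, not part of the upstream file): for the
// m16n8k4 tf32 LHS mapping defined below, lane 5 has
// groupID = 5 floordiv 4 = 1 and threadIDInGroup = 5 mod 4 = 1, so it owns
// the A elements at (row, col) = (1, 1) and (9, 1).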

/// Helper struct to provide a simple mapping from matmul operations to the
/// corresponding mma.sync operation. This is constrained to the case where the
/// matmul matches the mma.sync operation 1-1.
struct MmaSyncBuilder {
  MmaSyncBuilder(OpBuilder &b, Location loc, OpFoldResult laneId)
      : b(b), loc(loc), laneId(laneId) {}

  using IndexCalculator =
      std::function<SmallVector<RowColIndexing>(MLIRContext *)>;

  /// Create the mma.sync operation corresponding to `linalgOp` along with all
  /// the supporting load/store and vector operations.
  FailureOr<Operation *> buildMmaSync(LinalgOp linalgOp);

private:
  struct MmaSyncInfo {
    std::tuple<IndexCalculator, IndexCalculator, IndexCalculator> indexFns;
    std::tuple<SmallVector<int64_t>, SmallVector<int64_t>, SmallVector<int64_t>>
        vectorShapes;
    SmallVector<int64_t> mmaShape;
    bool tf32Enabled;
  };

  /// Return the specific index calculator for the given `linalgOp` or failure
  /// if the op is not supported. This is the toplevel switch that should just
  /// be Tablegen'd in the future.
  FailureOr<MmaSyncInfo> getIndexCalculators(ArrayRef<int64_t> opShape,
                                             TypeRange elementalTypes);

  //===--------------------------------------------------------------------===//
  // Instruction-specific row, column indexing expression builders.
  // These should all be declaratively specified via Tablegen in the future.
  // The Tablegen specification should be as straightforward as possible to
  // only model the existing size and type combinations.
  //===--------------------------------------------------------------------===//
  //
  // TODO: Tablegen all this.
  //===--------------------------------------------------------------------===//
  // m16n8k4 tf32 case.
  //===--------------------------------------------------------------------===//
  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  /// row =  groupID         for a0
  ///        groupID + 8     for a1
  /// col =  threadIDInGroup
  static SmallVector<RowColIndexing> m16n8k4tf32Lhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{groupID, threadIDInGroup},
            RowColIndexing{groupID + 8, threadIDInGroup}};
  }

  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  /// row =  threadIDInGroup
  /// col =  groupID
  static SmallVector<RowColIndexing> m16n8k4tf32Rhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{threadIDInGroup, groupID}};
  }

  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  /// row =  groupID                            for c0 and c1
  ///        groupID + 8                        for c2 and c3
  /// col =  (threadIDInGroup * 2) + (i & 0x1)  for ci where i = {0,..,3}
  static SmallVector<RowColIndexing> m16n8k4tf32Res(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{groupID, threadIDInGroup * 2 + 0},
            RowColIndexing{groupID, threadIDInGroup * 2 + 1},
            RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0},
            RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}};
  }

  //===--------------------------------------------------------------------===//
  // m16n8k16 f16 case.
  //===--------------------------------------------------------------------===//
  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  ///
  /// row =  groupID                                for ai where 0 <= i < 2 || 4 <= i < 6
  ///        groupID + 8                            otherwise
  ///
  /// col =  (threadIDInGroup * 2) + (i & 0x1)      for ai where i < 4
  ///        (threadIDInGroup * 2) + (i & 0x1) + 8  for ai where i >= 4
  static SmallVector<RowColIndexing> m16n8k16f16Lhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{groupID, threadIDInGroup * 2 + 0},         // i == 0
      RowColIndexing{groupID, threadIDInGroup * 2 + 1},         // i == 1
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0},     // i == 2
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1},     // i == 3
      RowColIndexing{groupID, threadIDInGroup * 2 + 0 + 8},     // i == 4
      RowColIndexing{groupID, threadIDInGroup * 2 + 1 + 8},     // i == 5
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0 + 8}, // i == 6
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1 + 8}  // i == 7
    };
    // clang-format on
  }

  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  ///
  /// row =  (threadIDInGroup * 2) + (i & 0x1)      for bi where i < 2
  ///        (threadIDInGroup * 2) + (i & 0x1) + 8  for bi where i >= 2
  ///
  /// col =  groupID
  static SmallVector<RowColIndexing> m16n8k16f16Rhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{threadIDInGroup * 2 + 0, groupID},     // i == 0
      RowColIndexing{threadIDInGroup * 2 + 1, groupID},     // i == 1
      RowColIndexing{threadIDInGroup * 2 + 0 + 8, groupID}, // i == 2
      RowColIndexing{threadIDInGroup * 2 + 1 + 8, groupID}  // i == 3
    };
    // clang-format on
  }

  /// From the NVIDIA doc:
  /// groupID          = %laneid >> 2
  /// threadIDInGroup  = %laneid % 4
  ///
  /// row =  groupID      for ci where i < 2
  ///        groupID + 8  for ci where i >= 2
  ///
  /// col =  (threadIDInGroup * 2) + (i & 0x1)  for ci where i = {0,..,3}
  static SmallVector<RowColIndexing> m16n8k16f16Res(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{groupID, threadIDInGroup * 2 + 0},     // i == 0
      RowColIndexing{groupID, threadIDInGroup * 2 + 1},     // i == 1
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0}, // i == 2
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}  // i == 3
    };
    // clang-format on
  }

  //===--------------------------------------------------------------------===//
  /// Helper functions to create customizable load and stores operations. The
  /// specific shapes of each MMA instruction are passed via the
  /// IndexCalculator callback.
  //===--------------------------------------------------------------------===//
  /// Build a list of memref.load operations indexed at `(row, col)` indices
  /// that make sense for a particular MMA instruction and specified via the
  /// IndexCalculator callback.
  SmallVector<Value> buildMemRefLoads(OpBuilder &b, Location loc,
                                      OpFoldResult laneId, Value memref,
                                      const IndexCalculator &indexFn);

  /// Perform a distributed load of a vector operand of `vectorShape` for a
  /// particular MMA instruction whose `(row, col)` indices are specified via
  /// the IndexCalculator callback. Each `laneId` loads the subportion of the
  /// data that makes sense for the particular MMA operation.
  /// The `vectorShape` matches existing NVGPU dialect op specification but
  /// could also be flattened in the future if needed for simplification.
  Value buildMmaSyncMemRefLoadOperand(OpBuilder &b, Location loc,
                                      OpFoldResult laneId, Value memref,
                                      IndexCalculator indexFn,
                                      ArrayRef<int64_t> vectorShape);

  /// Build a list of memref.store operations indexed at `(row, col)` indices
  /// that make sense for a particular MMA instruction and specified via the
  /// IndexCalculator callback.
  SmallVector<Operation *> buildMemRefStores(OpBuilder &b, Location loc,
                                             ValueRange toStore,
                                             OpFoldResult laneId, Value memref,
                                             const IndexCalculator &indexFn);

  /// Perform a distributed store of a vector operand of `vectorShape` for a
  /// particular MMA instruction whose `(row, col)` indices are specified via
  /// the IndexCalculator callback. Each `laneId` stores the subportion of the
  /// data that makes sense for the particular MMA operation.
  /// The `vectorShape` matches existing NVGPU dialect op specification but
  /// could also be flattened in the future if needed for simplification.
  SmallVector<Operation *> buildMmaSyncMemRefStoreOperand(
      OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId,
      Value memref, IndexCalculator indexFn, ArrayRef<int64_t> vectorShape);

  OpBuilder &b;
  Location loc;
  OpFoldResult laneId;
};

//===--------------------------------------------------------------------===//
/// Helper functions to create customizable load and stores operations. The
/// specific shapes of each MMA instruction are passed via the
/// IndexCalculator callback.
//===--------------------------------------------------------------------===//

template <typename ApplyFn, typename ReduceFn>
static void foreachIndividualVectorElement(Value vector, ApplyFn applyFn,
                                           ReduceFn reduceFn) {
  VectorType vectorType = cast<VectorType>(vector.getType());
  auto vectorShape = vectorType.getShape();
  auto strides = computeStrides(vectorShape);
  for (int64_t idx = 0, e = vectorShape[0] * strides[0]; idx < e; ++idx) {
    auto indices = delinearize(idx, strides);
    reduceFn(applyFn(vector, idx, indices), idx, indices);
  }
}
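
// Worked example (illustrative note, not part of the upstream file): for a
// vector<2x2xf16> operand, computeStrides yields {2, 1} and the loop visits
// linear indices 0..3, delinearized to {0,0}, {0,1}, {1,0}, {1,1}; applyFn
// produces the scalar for each position and reduceFn folds it back, e.g. via
// vector.insert in buildMmaSyncMemRefLoadOperand below.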

SmallVector<Value>
MmaSyncBuilder::buildMemRefLoads(OpBuilder &b, Location loc,
                                 OpFoldResult laneId, Value memref,
                                 const IndexCalculator &indexFn) {
  auto aff = [&](AffineExpr e) {
    return affine::makeComposedFoldedAffineApply(b, loc, e, laneId);
  };
  SmallVector<Value> res;
  SmallVector<RowColIndexing> indexings = indexFn(b.getContext());
  for (auto indexing : indexings) {
    Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row()));
    Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col()));
    auto load = b.create<memref::LoadOp>(loc, memref, ValueRange{row, col});
    res.push_back(load);
  }
  return res;
}

Value MmaSyncBuilder::buildMmaSyncMemRefLoadOperand(
    OpBuilder &b, Location loc, OpFoldResult laneId, Value memref,
    IndexCalculator indexFn, ArrayRef<int64_t> vectorShape) {
  auto loads = buildMemRefLoads(b, loc, laneId, memref, std::move(indexFn));

  Type elementType = getElementTypeOrSelf(memref.getType());
  auto vt = VectorType::get(vectorShape, elementType);
  Value res = b.create<vector::SplatOp>(loc, vt, loads[0]);
  foreachIndividualVectorElement(
      res,
      /*applyFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        return loads[linearIdx];
      },
      /*reduceFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        res = b.create<vector::InsertOp>(loc, v, res, indices);
      });

  return res;
}

SmallVector<Operation *> MmaSyncBuilder::buildMemRefStores(
    OpBuilder &b, Location loc, ValueRange toStore, OpFoldResult laneId,
    Value memref, const IndexCalculator &indexFn) {
  auto aff = [&](AffineExpr e) {
    return affine::makeComposedFoldedAffineApply(b, loc, e, laneId);
  };
  SmallVector<Operation *> res;
  for (auto [indexing, val] :
       llvm::zip_equal(indexFn(b.getContext()), toStore)) {
    Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row()));
    Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col()));
    Operation *store =
        b.create<memref::StoreOp>(loc, val, memref, ValueRange{row, col});
    res.push_back(store);
  }
  return res;
}

SmallVector<Operation *> MmaSyncBuilder::buildMmaSyncMemRefStoreOperand(
    OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId,
    Value memref, IndexCalculator indexFn, ArrayRef<int64_t> vectorShape) {
  SmallVector<Value> toStore;
  toStore.reserve(32);
  foreachIndividualVectorElement(
      vectorToStore,
      /*applyFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        return b.create<vector::ExtractOp>(loc, vectorToStore, indices);
      },
      /*reduceFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        toStore.push_back(v);
      });
  return buildMemRefStores(b, loc, toStore, laneId, memref, std::move(indexFn));
}

static std::tuple<SmallVector<int64_t>, SmallVector<int64_t>,
                  SmallVector<int64_t>>
makeVectorShapes(ArrayRef<int64_t> lhs, ArrayRef<int64_t> rhs,
                 ArrayRef<int64_t> res) {
  SmallVector<int64_t> vlhs{lhs.begin(), lhs.end()};
  SmallVector<int64_t> vrhs{rhs.begin(), rhs.end()};
  SmallVector<int64_t> vres{res.begin(), res.end()};
  return std::make_tuple(vlhs, vrhs, vres);
}

FailureOr<MmaSyncBuilder::MmaSyncInfo>
MmaSyncBuilder::getIndexCalculators(ArrayRef<int64_t> opShape,
                                    TypeRange elementalTypes) {
  // TODO: Tablegen all this.
  Type f16 = b.getF16Type();
  Type f32 = b.getF32Type();
  if (opShape == ArrayRef<int64_t>{16, 8, 4} &&
      elementalTypes == TypeRange{f32, f32, f32}) {
    return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k4tf32Lhs,
                                       &MmaSyncBuilder::m16n8k4tf32Rhs,
                                       &MmaSyncBuilder::m16n8k4tf32Res),
                       makeVectorShapes({2, 1}, {1, 1}, {2, 2}),
                       SmallVector<int64_t>{opShape.begin(), opShape.end()},
                       /*tf32Enabled=*/true};
  }
  // This is the version with f16 accumulation.
  // TODO: version with f32 accumulation.
  if (opShape == ArrayRef<int64_t>{16, 8, 16} &&
      elementalTypes == TypeRange{f16, f16, f16}) {
    return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k16f16Lhs,
                                       &MmaSyncBuilder::m16n8k16f16Rhs,
                                       &MmaSyncBuilder::m16n8k16f16Res),
                       makeVectorShapes({4, 2}, {2, 2}, {2, 2}),
                       SmallVector<int64_t>{opShape.begin(), opShape.end()},
                       /*tf32Enabled=*/false};
  }
  return failure();
}

FailureOr<Operation *> MmaSyncBuilder::buildMmaSync(LinalgOp linalgOp) {
  Value lhsMemRef = linalgOp.getDpsInputOperand(0)->get();
  Value rhsMemRef = linalgOp.getDpsInputOperand(1)->get();
  Value resMemRef = linalgOp.getDpsInitOperand(0)->get();
  assert(cast<MemRefType>(lhsMemRef.getType()).getRank() == 2 &&
         "expected lhs to be a 2D memref");
  assert(cast<MemRefType>(rhsMemRef.getType()).getRank() == 2 &&
         "expected rhs to be a 2D memref");
  assert(cast<MemRefType>(resMemRef.getType()).getRank() == 2 &&
         "expected res to be a 2D memref");

  int64_t m = cast<MemRefType>(lhsMemRef.getType()).getShape()[0];
  int64_t n = cast<MemRefType>(rhsMemRef.getType()).getShape()[1];
  int64_t k = cast<MemRefType>(lhsMemRef.getType()).getShape()[1];
  Type lhsType = getElementTypeOrSelf(lhsMemRef.getType());
  Type rhsType = getElementTypeOrSelf(rhsMemRef.getType());
  Type resType = getElementTypeOrSelf(resMemRef.getType());

  FailureOr<MmaSyncInfo> maybeInfo =
      getIndexCalculators({m, n, k}, {lhsType, rhsType, resType});
  if (failed(maybeInfo))
    return failure();

  MmaSyncInfo info = *maybeInfo;
  auto [lhsIndexFn, rhsIndexFn, resIndexFn] = info.indexFns;
  auto [lhsShape, rhsShape, resShape] = info.vectorShapes;
  Value lhs = buildMmaSyncMemRefLoadOperand(b, loc, laneId, lhsMemRef,
                                            lhsIndexFn, lhsShape);
  Value rhs = buildMmaSyncMemRefLoadOperand(b, loc, laneId, rhsMemRef,
                                            rhsIndexFn, rhsShape);
  Value res = buildMmaSyncMemRefLoadOperand(b, loc, laneId, resMemRef,
                                            resIndexFn, resShape);
  res = b.create<nvgpu::MmaSyncOp>(loc, lhs, rhs, res, info.mmaShape,
                                   info.tf32Enabled);
  buildMmaSyncMemRefStoreOperand(b, loc, res, laneId, resMemRef, resIndexFn,
                                 resShape);
  return res.getDefiningOp();
}
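
// Worked example (illustrative note, not part of the upstream file): a
// linalg.matmul on memref<16x4xf32>, memref<4x8xf32> and memref<16x8xf32>
// yields (m, n, k) = (16, 8, 4), which matches the m16n8k4 tf32 entry above:
// each lane assembles a vector<2x1xf32> LHS fragment, a vector<1x1xf32> RHS
// fragment and a vector<2x2xf32> accumulator before issuing nvgpu.mma.sync.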

DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne(
    transform::TransformRewriter &rewriter, LinalgOp linalgOp,
    transform::ApplyToEachResultList &results,
    transform::TransformState &state) {
  bool fail = true;
  // TODO: more robust detection of matmulOp, with transposes etc.
  if (isa_and_nonnull<linalg::MatmulOp>(linalgOp.getOperation())) {
    Location loc = linalgOp.getLoc();
    // TODO: more robust computation of laneId, for now assume a single warp.
    Value laneId = rewriter.create<gpu::ThreadIdOp>(
        loc, rewriter.getIndexType(), gpu::Dimension::x);
    if (succeeded(MmaSyncBuilder(rewriter, loc, laneId).buildMmaSync(linalgOp)))
      fail = false;
  }

  if (fail) {
    DiagnosedSilenceableFailure diag = emitSilenceableError()
                                       << "unsupported target op: " << linalgOp;
    diag.attachNote(linalgOp->getLoc()) << "target op";
    return diag;
  }

  rewriter.eraseOp(linalgOp);
  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// Hopper builders.
//===----------------------------------------------------------------------===//

/// Helper to create the base Hopper-specific operations that are reused in
/// various other places.
struct HopperBuilder {
  HopperBuilder(RewriterBase &rewriter, Location loc)
      : rewriter(rewriter), loc(loc) {}

  TypedValue<nvgpu::MBarrierGroupType>
  buildAndInitBarrierInSharedMemory(OpFoldResult numThreads);

  /// Create tma descriptor op to initiate transfer from global to shared
  /// memory. This must be done before the launch op, on the host.
  TypedValue<nvgpu::TensorMapDescriptorType>
  buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
                              gpu::LaunchOp launchOp);

  /// Build a tma load from global memory to shared memory using `barrier` to
  /// synchronize. Return the number of bytes that will be transferred.
  OpFoldResult
  buildTmaAsyncLoad(TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
                    TypedValue<MemRefType> sharedMemref,
                    TypedValue<nvgpu::MBarrierGroupType> barrier,
                    SmallVectorImpl<Operation *> &loadOps);
  void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierGroupType> barrier,
                            ArrayRef<OpFoldResult> sizes);

  /// If threadIdx.x == 0 does TMA request + wait, else just wait.
  /// Return the operation that performs the transfer on thread0.
  // TODO: In the future, don't hardcode to thread 0 but elect a leader.
  SmallVector<Operation *> buildPredicateLoadsOnThread0(
      ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
      ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
      TypedValue<nvgpu::MBarrierGroupType> barrier);

  void buildTryWaitParity(TypedValue<nvgpu::MBarrierGroupType> barrier);

  RewriterBase &rewriter;
  Location loc;
};

SmallVector<Operation *> HopperBuilder::buildPredicateLoadsOnThread0(
    ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
    ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
    TypedValue<nvgpu::MBarrierGroupType> barrier) {
  SmallVector<Operation *> loadOps;
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  Value tidx = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);
  Value cond =
      rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, tidx, zero);
  // clang-format off
  rewriter.create<scf::IfOp>(
    /*location=*/loc,
    /*conditional=*/cond,
    /*thenBuilder=*/
    [&](OpBuilder &lb, Location loc) {
      SmallVector<OpFoldResult> sizes;
      sizes.reserve(globalDescriptors.size());
      for (auto [desc, shmem] : llvm::zip_equal(
              globalDescriptors, sharedMemBuffers)) {
        OpFoldResult sz = buildTmaAsyncLoad(desc, shmem, barrier, loadOps);
        sizes.push_back(sz);
      }
      // TODO: Note that cutlass predeclares the barrier arrive tx before the tma.async.load.
      // This may or may not have perf implications.
      buildBarrierArriveTx(barrier, sizes);
      rewriter.create<scf::YieldOp>(loc);
    },
    /*elseBuilder=*/
    [&](OpBuilder &lb, Location loc) {
      // TODO: is this for no-thread divergence?
      // Should we just yield the size and hoist?
      buildBarrierArriveTx(barrier, getAsIndexOpFoldResult(rewriter.getContext(), 0));
      rewriter.create<scf::YieldOp>(loc);
    });
  // clang-format on
  return loadOps;
}

static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) {
  return gpu::AddressSpaceAttr::get(
      b.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
  // return b.getI64IntegerAttr(static_cast<int64_t>(kSharedMemorySpace));
}

TypedValue<nvgpu::MBarrierGroupType>
HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) {
  auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
  Value barrier = rewriter.create<nvgpu::MBarrierCreateOp>(
      loc,
      nvgpu::MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace));
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  rewriter.create<nvgpu::MBarrierInitOp>(
      loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads),
      zero, Value());
  rewriter.create<gpu::BarrierOp>(loc);
  return cast<TypedValue<nvgpu::MBarrierGroupType>>(barrier);
}

TypedValue<nvgpu::TensorMapDescriptorType>
HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
                                           gpu::LaunchOp launchOp) {
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(launchOp);
  Value unrankedMemRef = rewriter.create<memref::CastOp>(
      loc,
      UnrankedMemRefType::get(memref.getType().getElementType(),
                              memref.getType().getMemorySpace()),
      memref);
  SmallVector<OpFoldResult> mixedSizes =
      memref::getMixedSizes(rewriter, loc, memref);
  SmallVector<Value> sizes =
      getValueOrCreateConstantIndexOp(rewriter, loc, mixedSizes);

  auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
  Value desc = rewriter.create<nvgpu::TmaCreateDescriptorOp>(
      loc,
      nvgpu::TensorMapDescriptorType::get(
          rewriter.getContext(),
          MemRefType::Builder(memref.getType())
              .setMemorySpace(sharedMemorySpace),
          TensorMapSwizzleKind::SWIZZLE_NONE,
          TensorMapL2PromoKind::L2PROMO_NONE, TensorMapOOBKind::OOB_ZERO,
          TensorMapInterleaveKind::INTERLEAVE_NONE),
      unrankedMemRef, sizes);
  return cast<TypedValue<nvgpu::TensorMapDescriptorType>>(desc);
}

OpFoldResult HopperBuilder::buildTmaAsyncLoad(
    TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
    TypedValue<MemRefType> sharedMemref,
    TypedValue<nvgpu::MBarrierGroupType> barrier,
    SmallVectorImpl<Operation *> &loadOps) {
  MLIRContext *ctx = rewriter.getContext();
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  Operation *loadOp = rewriter.create<nvgpu::TmaAsyncLoadOp>(
      loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}, zero,
      Value(), Value());
  loadOps.push_back(loadOp);
  auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref);
  SmallVector<AffineExpr> symbols(mixedSizes.size());
  bindSymbolsList(ctx, llvm::MutableArrayRef{symbols});
  AffineExpr prodExprInBytes =
      computeProduct(ctx, symbols) *
      (sharedMemref.getType().getElementTypeBitWidth() / 8);
  auto res = affine::makeComposedFoldedAffineApply(rewriter, loc,
                                                   prodExprInBytes, mixedSizes);
  return res;
}
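
// Worked example (illustrative note, not part of the upstream file): for a
// shared memref<128x64xf16>, the folded size product is 128 * 64 = 8192
// elements with a 16-bit element type, so buildTmaAsyncLoad returns
// 8192 * (16 / 8) = 16384 bytes, which later feeds the
// mbarrier.arrive.expect_tx transaction count.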

void HopperBuilder::buildBarrierArriveTx(
    TypedValue<nvgpu::MBarrierGroupType> barrier,
    ArrayRef<OpFoldResult> mixedSizes) {
  assert(!mixedSizes.empty() && "expected non-empty sizes");
  MLIRContext *ctx = rewriter.getContext();
  SmallVector<AffineExpr> symbols(mixedSizes.size());
  bindSymbolsList(ctx, llvm::MutableArrayRef{symbols});
  AffineExpr sumExpr = computeSum(ctx, symbols);
  OpFoldResult size =
      affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes);
  Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size);
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal, zero,
                                                   Value());
}

void HopperBuilder::buildTryWaitParity(
    TypedValue<nvgpu::MBarrierGroupType> barrier) {
  Type i1 = rewriter.getI1Type();
  Value parity = rewriter.create<LLVM::ConstantOp>(loc, i1, 0);
  // 10M is an arbitrary, not too small or too big number to specify the number
  // of ticks before retry.
  // TODO: hoist this in a default dialect constant.
  Value ticksBeforeRetry =
      rewriter.create<arith::ConstantIndexOp>(loc, 10000000);
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  rewriter.create<nvgpu::MBarrierTryWaitParityOp>(loc, barrier, parity,
                                                  ticksBeforeRetry, zero);
}

//===----------------------------------------------------------------------===//
// RewriteCopyAsTmaOp
//===----------------------------------------------------------------------===//

/// Helper to create the tma operations corresponding to `linalg::CopyOp`.
struct CopyBuilder : public HopperBuilder {
  CopyBuilder(RewriterBase &rewriter, Location loc)
      : HopperBuilder(rewriter, loc) {}

  SmallVector<Operation *> rewrite(ArrayRef<Operation *> copyOps);
};

SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
  MLIRContext *ctx = rewriter.getContext();
  if (copyOps.empty())
    return SmallVector<Operation *>();

  auto launchOp = copyOps.front()->getParentOfType<gpu::LaunchOp>();
  assert(launchOp && "expected launch op");

  // 1. Init a barrier object in shared memory.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(copyOps.front());
  AffineExpr bx, by, bz;
  bindSymbols(ctx, bx, by, bz);
  AffineExpr prod = computeProduct(ctx, ArrayRef<AffineExpr>{bx, by, bz});
  OpFoldResult numThreads = affine::makeComposedFoldedAffineApply(
      rewriter, loc, prod,
      ArrayRef<OpFoldResult>{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(),
                             launchOp.getBlockSizeZ()});

  TypedValue<nvgpu::MBarrierGroupType> barrier =
      buildAndInitBarrierInSharedMemory(numThreads);

  SmallVector<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescs;
  SmallVector<TypedValue<MemRefType>> shmems;
  for (Operation *op : copyOps) {
    auto copyOp = cast<linalg::CopyOp>(op);
    auto inMemRef =
        cast<TypedValue<MemRefType>>(copyOp.getDpsInputOperand(0)->get());
    assert(inMemRef.getType().getRank() == 2 &&
           "expected in to be a 2D memref");

    // 2. Build global memory descriptor.
    TypedValue<nvgpu::TensorMapDescriptorType> globalDesc =
        buildGlobalMemRefDescriptor(inMemRef, launchOp);
    globalDescs.push_back(globalDesc);

    // 3. Shared memory and descriptor for the tmp array.
    auto shmem =
        cast<TypedValue<MemRefType>>(copyOp.getDpsInitOperand(0)->get());
    shmems.push_back(shmem);
  }

  // 4. Load in from global memory to shared memory using tma.
  OpBuilder::InsertionGuard g2(rewriter);
  rewriter.setInsertionPoint(copyOps.front());
  SmallVector<Operation *> results =
      buildPredicateLoadsOnThread0(globalDescs, shmems, barrier);

  // 5. Spin-loop until data is ready.
  buildTryWaitParity(barrier);

  // 6. Erase the ops that have now been rewritten.
  for (Operation *op : copyOps)
    rewriter.eraseOp(op);

  return results;
}
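
// Worked example (illustrative note, not part of the upstream file): for a
// gpu.launch with block sizes (128, 1, 1), numThreads folds to
// 128 * 1 * 1 = 128, which is used as the arrival count when initializing the
// shared-memory mbarrier above.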

DiagnosedSilenceableFailure
transform::RewriteCopyAsTmaOp::apply(transform::TransformRewriter &rewriter,
                                     transform::TransformResults &results,
                                     transform::TransformState &state) {
  auto payloadOps = state.getPayloadOps(getTarget());
  gpu::LaunchOp commonLaunchOp;
  Operation *firstOp, *failingOp;
  if (llvm::any_of(payloadOps, [&](Operation *op) {
        if (!commonLaunchOp) {
          commonLaunchOp = op->getParentOfType<gpu::LaunchOp>();
          firstOp = op;
        }
        auto fail = !op->getParentOfType<gpu::LaunchOp>() ||
                    commonLaunchOp != op->getParentOfType<gpu::LaunchOp>() ||
                    !isa<linalg::CopyOp>(op);
        if (fail)
          failingOp = op;
        return fail;
      })) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "target ops must be linalg::CopyOp nested under a common "
           "gpu.LaunchOp to be rewritten because the tma descriptors need to "
           "be created on the host.\nBut got: "
        << *firstOp << "\nand " << *failingOp;
    return diag;
  }

  // TODO: more robust detection of copy, with transposes etc.
  CopyBuilder(rewriter, getLoc()).rewrite(llvm::to_vector(payloadOps));

  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//

namespace {
class NVGPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          NVGPUTransformDialectExtension> {
public:
  NVGPUTransformDialectExtension() {
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<affine::AffineDialect>();
    declareGeneratedDialect<nvgpu::NVGPUDialect>();
    declareGeneratedDialect<NVVM::NVVMDialect>();
    declareGeneratedDialect<vector::VectorDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"

void mlir::nvgpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<NVGPUTransformDialectExtension>();
}