//===- NVGPUTransformOps.cpp - Implementation of NVGPU transform ops ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"
using namespace mlir;
using namespace mlir::linalg;
using namespace mlir::nvgpu;
using namespace mlir::NVVM;
using namespace mlir::transform;

#define DEBUG_TYPE "nvgpu-transforms"

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

void ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  /// Device-side async tokens cannot be materialized in nvvm. We just
  /// convert them to a dummy i32 type in order to easily drop them during
  /// conversion.
  populateCommonGPUTypeAndAttributeConversions(llvmTypeConverter);
  llvmTypeConverter.addConversion([&](DeviceAsyncTokenType type) -> Type {
    return llvmTypeConverter.convertType(
        IntegerType::get(type.getContext(), 32));
  });
  llvmTypeConverter.addConversion([&](MBarrierTokenType type) -> Type {
    return llvmTypeConverter.convertType(
        IntegerType::get(type.getContext(), 64));
  });
  llvmTypeConverter.addConversion([&](WarpgroupAccumulatorType type) -> Type {
    Type elemType = type.getFragmented().getElementType();
    int64_t sizeM = type.getFragmented().getDimSize(0);
    int64_t sizeN = type.getFragmented().getDimSize(1);

    unsigned numMembers;
    if (elemType.isF32() || elemType.isInteger(32))
      numMembers = sizeN / 2;
    else if (elemType.isF16())
      numMembers = sizeN / 4;
    else
      llvm_unreachable("unsupported type for warpgroup accumulator");

    SmallVector<Type> innerStructBody;
    for (unsigned i = 0; i < numMembers; i++)
      innerStructBody.push_back(elemType);
    auto innerStructType =
        LLVM::LLVMStructType::getLiteral(type.getContext(), innerStructBody);

    SmallVector<Type> structBody;
    for (int i = 0; i < sizeM; i += kWgmmaSizeM)
      structBody.push_back(innerStructType);

    auto convertedType =
        LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
    return llvmTypeConverter.convertType(convertedType);
  });
  llvmTypeConverter.addConversion([&](MBarrierGroupType type) -> Type {
    return llvmTypeConverter.convertType(
        getMBarrierMemrefType(type.getContext(), type));
  });
  llvmTypeConverter.addConversion(
      [&](WarpgroupMatrixDescriptorType type) -> Type {
        return llvmTypeConverter.convertType(
            IntegerType::get(type.getContext(), 64));
      });
  llvmTypeConverter.addConversion([&](TensorMapDescriptorType type) -> Type {
    return LLVM::LLVMPointerType::get(type.getContext());
  });
  populateNVGPUToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
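
// Worked example (illustrative only, not used by the conversion above),
// assuming kWgmmaSizeM == 64 as defined by the NVGPU dialect: a
// WarpgroupAccumulatorType wrapping a 128x128 f32 fragment converts to a
// struct of 128/64 == 2 inner structs, each holding 128/2 == 64 f32 members.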

LogicalResult ApplyNVGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

//===---------------------------------------------------------------------===//
// CreateAsyncGroupsOp
//===---------------------------------------------------------------------===//

void CreateAsyncGroupsOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  consumesHandle(getTargetMutable(), effects);
  producesHandle(getOperation()->getOpResults(), effects);
  modifiesPayload(effects);
}

DiagnosedSilenceableFailure
CreateAsyncGroupsOp::applyToOne(TransformRewriter &rewriter, Operation *target,
                                ApplyToEachResultList &results,
                                TransformState &state) {
  createAsyncGroups(rewriter, target, getBypassL1());
  results.push_back(target);
  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// PipelineSharedMemoryCopiesOp
//===----------------------------------------------------------------------===//

/// Returns true if the given type has the default memory space.
static bool hasDefaultMemorySpace(BaseMemRefType type) {
  return !type.getMemorySpace() || type.getMemorySpaceAsInt() == 0;
}

/// Returns true if the given type has the shared (workgroup) memory space.
static bool hasSharedMemorySpace(BaseMemRefType type) {
  auto space =
      dyn_cast_if_present<gpu::AddressSpaceAttr>(type.getMemorySpace());
  return space &&
         space.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace();
}

/// Returns the value produced by a load from the default memory space. Returns
/// null if the operation is not such a load.
static Value getValueLoadedFromGlobal(Operation *op) {
  // TODO: consider an interface or leveraging the memory effects interface.
  auto load = dyn_cast<vector::TransferReadOp>(op);
  if (!load)
    return nullptr;

  auto loadType = dyn_cast<MemRefType>(load.getBase().getType());
  if (!loadType || !hasDefaultMemorySpace(loadType))
    return nullptr;
  return load;
}

/// Returns true if the operation is storing the given value into shared
/// memory.
static bool isStoreToShared(Operation *op, Value v) {
  // TODO: consider an interface or leveraging the memory effects interface.
  auto store = dyn_cast<vector::TransferWriteOp>(op);
  if (!store || store.getVector() != v)
    return false;

  auto storeType = dyn_cast<MemRefType>(store.getBase().getType());
  return storeType && hasSharedMemorySpace(storeType);
}

/// Returns true if the operation is a load from the default memory space the
/// result of which is only stored into the shared memory space.
static bool isLoadFromGlobalStoredToShared(Operation *op) {
  Value loaded = getValueLoadedFromGlobal(op);
  if (!loaded || !loaded.hasOneUse())
    return false;

  return isStoreToShared(*loaded.getUsers().begin(), loaded);
}

/// Populate `ops` with the set of operations that belong to the stage 0 of the
/// pipelined version of the given loop when pipelining copies to shared
/// memory. Specifically, this collects:
///
///   1. all loads from global memory, both sync and async;
///   2. the barriers for async loads.
///
/// In particular, barriers are omitted if they do not dominate at least one
/// async load for which there is not yet a barrier.
static LogicalResult
collectStage0PipeliningOps(scf::ForOp forOp,
                           llvm::SmallPtrSet<Operation *, 16> &ops) {

  llvm::SmallPtrSet<Operation *, 16> barriers;
  for (Operation &op : *forOp.getBody()) {
    // Bail on nested ops for now.
    if (op.getNumRegions() > 0)
      return failure();

    if (isa<gpu::BarrierOp>(op)) {
      barriers.insert(&op);
      continue;
    }

    if (isa<DeviceAsyncCopyOp, DeviceAsyncCreateGroupOp>(op)) {
      ops.insert(&op);
      ops.insert(std::make_move_iterator(barriers.begin()),
                 std::make_move_iterator(barriers.end()));
      assert(barriers.empty() &&
             "expected to have moved the barriers into another set");
      continue;
    }

    if (isLoadFromGlobalStoredToShared(&op)) {
      ops.insert(&op);
      continue;
    }
  }

  return success();
}

/// Hook for the loop pipeliner that sets the "num groups in flight" attribute
/// of async wait operations corresponding to pipelined shared memory copies.
// TODO: this currently assumes that there are no groups that could be in
// flight in the existing code.
static void
setAsyncWaitGroupsInFlight(OpBuilder &builder, Operation *op,
                           scf::PipeliningOption::PipelinerPart part,
                           unsigned iteration, unsigned depth) {
  // Based on the order of copies within the loop we need to set the number
  // of copies in flight, unless it is already set.
  auto waitOp = dyn_cast<DeviceAsyncWaitOp>(op);
  if (!waitOp || waitOp.getNumGroups())
    return;

  int numGroupInFlight = 0;
  if (part == scf::PipeliningOption::PipelinerPart::Kernel ||
      part == scf::PipeliningOption::PipelinerPart::Prologue) {
    numGroupInFlight = depth - 1;
  } else {
    // By construction there should be no wait op in the prologue as all the
    // waits should be in the last stage.
    assert(part == scf::PipeliningOption::PipelinerPart::Epilogue);
    // Based on the schedule we pick we know how many groups are in flight for
    // each iteration of the epilogue.
    numGroupInFlight = depth - 1 - iteration;
  }
  waitOp.setNumGroups(numGroupInFlight);
}
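
// For example, with depth == 3 the wait in the pipelined kernel keeps
// depth - 1 == 2 groups in flight, while the peeled epilogue waits keep
// 2, 1 and 0 groups in flight for epilogue iterations 0, 1 and 2.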

/// Hook for the loop pipeliner that populates `ops` with the stage information
/// as follows:
///
///   - operations in `stage0Ops` (typically loads from global memory and
///     related barriers) are at stage 0;
///   - operations in the backward slice of any stage0Ops are all at stage 0;
///   - other operations are at stage `depth`;
///   - the internal order of the pipelined loop has ops at stage `depth` first,
///     then those at stage 0, with relative order within each group preserved.
static void getPipelineStages(
    scf::ForOp forOp,
    std::vector<std::pair<Operation *, unsigned>> &opsWithPipelineStages,
    unsigned depth, llvm::SmallPtrSetImpl<Operation *> &stage0Ops) {
  SetVector<Operation *> dependencies;
  BackwardSliceOptions options([&](Operation *visited) {
    return visited->getBlock() == forOp.getBody();
  });
  options.inclusive = true;
  for (Operation &op : forOp.getBody()->getOperations()) {
    if (stage0Ops.contains(&op)) {
      LogicalResult result = getBackwardSlice(&op, &dependencies, options);
      assert(result.succeeded() && "expected a backward slice");
      (void)result;
    }
  }

  for (Operation &op : forOp.getBody()->getOperations()) {
    if (!dependencies.contains(&op) && !isa<scf::YieldOp>(op))
      opsWithPipelineStages.emplace_back(&op, depth);
  }
  for (Operation &op : forOp.getBody()->getOperations()) {
    if (dependencies.contains(&op))
      opsWithPipelineStages.emplace_back(&op, 0);
  }
}
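
// For instance, with depth == 1 the global-to-shared copies (and the index
// computations they depend on) stay at stage 0 while the rest of the loop
// body moves to stage 1, so the pipelined kernel issues the copies for
// iteration i + 1 while computing on the data of iteration i.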

/// Hook for the loop pipeliner. Replaces op with a predicated version and
/// returns the resulting operation. Returns the original op if the predication
/// isn't necessary for the given op. Returns null if predication is needed but
/// not supported.
static Operation *replaceOpWithPredicatedOp(RewriterBase &rewriter,
                                            Operation *op, Value predicate) {
  // Some operations may be fine to execute "speculatively" more times than the
  // original number of iterations, in particular side-effect free operations
  // and barriers, even if they cannot be predicated.
  if (isMemoryEffectFree(op) ||
      isa<gpu::BarrierOp, DeviceAsyncCreateGroupOp, DeviceAsyncWaitOp>(op)) {
    return op;
  }

  // Otherwise, only async copies can currently be predicated.
  auto asyncCopyOp = dyn_cast<DeviceAsyncCopyOp>(op);
  if (!asyncCopyOp)
    return nullptr;

  // Create srcElement Value based on `predicate`. The next lines generate
  // the following code:
  //
  //   srcElement = (pred) ? prevSrcElements : 0;
  //
  Location loc = asyncCopyOp->getLoc();
  Value dstElements = arith::ConstantOp::create(
      rewriter, loc, asyncCopyOp.getDstElementsAttr());
  Value originalSrcElement =
      asyncCopyOp.getSrcElements() ? asyncCopyOp.getSrcElements() : dstElements;
  Value c0Index = arith::ConstantIndexOp::create(rewriter, loc, 0);
  auto srcElements = arith::SelectOp::create(rewriter, loc, predicate,
                                             originalSrcElement, c0Index);
  auto asyncCopyZeroFillOp = DeviceAsyncCopyOp::create(
      rewriter, loc, DeviceAsyncTokenType::get(asyncCopyOp.getContext()),
      asyncCopyOp.getDst(), asyncCopyOp.getDstIndices(), asyncCopyOp.getSrc(),
      asyncCopyOp.getSrcIndices(), asyncCopyOp.getDstElements(), srcElements,
      UnitAttr());
  rewriter.replaceOp(asyncCopyOp, asyncCopyZeroFillOp);
  return asyncCopyZeroFillOp;
}

/// Applies loop pipelining with the given depth to the given loop so that
/// copies into the shared memory are pipelined. Doesn't affect other loops.
/// Returns a pair containing the error state and the pipelined op, the latter
/// being null in case of any failure. The error state contains a definite
/// error if the IR has been modified and a silenceable error otherwise.
static std::tuple<DiagnosedSilenceableFailure, scf::ForOp>
pipelineForSharedCopies(RewriterBase &rewriter, scf::ForOp forOp, int64_t depth,
                        bool epiloguePeeling) {
  llvm::SmallPtrSet<Operation *, 16> stage0Ops;
  if (failed(collectStage0PipeliningOps(forOp, stage0Ops))) {
    return std::make_tuple(
        emitSilenceableFailure(forOp, "cannot find stage 0 ops for pipelining"),
        scf::ForOp());
  }
  if (stage0Ops.empty()) {
    return std::make_tuple(
        emitSilenceableFailure(forOp, "no shared memory copy"), scf::ForOp());
  }

  scf::PipeliningOption options;
  unsigned maxDepth = depth;
  auto setAnnotation = [&](Operation *op,
                           scf::PipeliningOption::PipelinerPart part,
                           unsigned iteration) {
    return setAsyncWaitGroupsInFlight(rewriter, op, part, iteration, maxDepth);
  };
  options.getScheduleFn =
      [&](scf::ForOp schedulingFor,
          std::vector<std::pair<Operation *, unsigned>> &ops) {
        if (schedulingFor != forOp)
          return;
        return getPipelineStages(forOp, ops, maxDepth, stage0Ops);
      };
  options.annotateFn = setAnnotation;
  if (!epiloguePeeling) {
    options.peelEpilogue = false;
    options.predicateFn = replaceOpWithPredicatedOp;
  }

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forOp);
  bool modifiedIR;
  FailureOr<scf::ForOp> maybePipelined =
      pipelineForLoop(rewriter, forOp, options, &modifiedIR);
  if (succeeded(maybePipelined)) {
    return std::make_tuple(DiagnosedSilenceableFailure::success(),
                           *maybePipelined);
  }
  return std::make_tuple(
      modifiedIR
          ? DiagnosedSilenceableFailure::definiteFailure()
          : emitSilenceableFailure(forOp, "pipelining preconditions failed"),
      scf::ForOp());
}

DiagnosedSilenceableFailure PipelineSharedMemoryCopiesOp::applyToOne(
    TransformRewriter &rewriter, scf::ForOp forOp,
    ApplyToEachResultList &results, TransformState &state) {
  auto [diag, pipelined] = pipelineForSharedCopies(
      rewriter, forOp, static_cast<int64_t>(getDepth()), getPeelEpilogue());
  if (diag.succeeded()) {
    results.push_back(pipelined);
    return DiagnosedSilenceableFailure::success();
  }
  if (diag.isDefiniteFailure()) {
    auto diag = emitDefiniteFailure("irreversible pipelining failure");
    if (!getPeelEpilogue()) {
      diag.attachNote(forOp->getLoc()) << "couldn't predicate?";
      diag.attachNote(getLoc()) << "try setting " << getPeelEpilogueAttrName();
    }
    return diag;
  }

  return std::move(diag);
}

//===----------------------------------------------------------------------===//
// RewriteMatmulAsMmaSyncOp
//===----------------------------------------------------------------------===//

/// Helper struct to encode a pair of row/column indexings in the form of
/// affine expressions.
struct RowColIndexing : private std::pair<AffineExpr, AffineExpr> {
  RowColIndexing(AffineExpr row, AffineExpr col)
      : std::pair<AffineExpr, AffineExpr>(row, col) {}

  AffineExpr row() const { return first; }
  AffineExpr col() const { return second; }

  void print(llvm::raw_ostream &os) const {
    os << "- indexing: " << first << ", " << second;
  }
};

/// Helper struct to provide a simple mapping from matmul operations to the
/// corresponding mma.sync operation. This is constrained to the case where the
/// matmul matches the mma.sync operation 1-1.
struct MmaSyncBuilder {
  MmaSyncBuilder(OpBuilder &b, Location loc, OpFoldResult laneId)
      : b(b), loc(loc), laneId(laneId) {}

  using IndexCalculator =
      std::function<SmallVector<RowColIndexing>(MLIRContext *)>;

  /// Create the mma.sync operation corresponding to `linalgOp` along with all
  /// the supporting load/store and vector operations.
  FailureOr<Operation *> buildMmaSync(LinalgOp linalgOp);

private:
  struct MmaSyncInfo {
    std::tuple<IndexCalculator, IndexCalculator, IndexCalculator> indexFns;
    std::tuple<SmallVector<int64_t>, SmallVector<int64_t>, SmallVector<int64_t>>
        vectorShapes;
    SmallVector<int64_t> mmaShape;
    bool tf32Enabled;
  };

  /// Return the specific index calculator for the given `linalgOp` or failure
  /// if the op is not supported. This is the toplevel switch that should just
  /// be Tablegen'd in the future.
  FailureOr<MmaSyncInfo> getIndexCalculators(ArrayRef<int64_t> opShape,
                                             TypeRange elementalTypes);

  //===--------------------------------------------------------------------===//
  // Instruction-specific row, column indexing expression builders.
  // These should all be declaratively specified via Tablegen in the future.
  // The Tablegen specification should be as straightforward as possible to
  // only model the existing size and type combinations.
  //===--------------------------------------------------------------------===//
  //
  // TODO: Tablegen all this.
  //===--------------------------------------------------------------------===//
  // m16n8k4 tf32 case.
  //===--------------------------------------------------------------------===//
  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///   row = groupID      for a0
  ///         groupID + 8  for a1
  ///   col = threadIDInGroup
  static SmallVector<RowColIndexing> m16n8k4tf32Lhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{groupID, threadIDInGroup},
            RowColIndexing{groupID + 8, threadIDInGroup}};
  }
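
  // Worked example (illustrative): for %laneid == 5, groupID == 1 and
  // threadIDInGroup == 1, so that lane provides a0 at (row, col) == (1, 1)
  // and a1 at (9, 1) of the 16x4 tf32 LHS fragment.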

  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///   row = threadIDInGroup
  ///   col = groupID
  static SmallVector<RowColIndexing> m16n8k4tf32Rhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{threadIDInGroup, groupID}};
  }

  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///   row = groupID      for c0 and c1
  ///         groupID + 8  for c2 and c3
  ///   col = (threadIDInGroup * 2) + (i & 0x1)  for ci where i = {0,..,3}
  static SmallVector<RowColIndexing> m16n8k4tf32Res(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    return {RowColIndexing{groupID, threadIDInGroup * 2 + 0},
            RowColIndexing{groupID, threadIDInGroup * 2 + 1},
            RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0},
            RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}};
  }

  //===--------------------------------------------------------------------===//
  // m16n8k16 f16 case.
  //===--------------------------------------------------------------------===//
  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///
  ///   row = groupID      for ai where 0 <= i < 2 || 4 <= i < 6
  ///         groupID + 8  otherwise
  ///
  ///   col = (threadIDInGroup * 2) + (i & 0x1)      for ai where i <  4
  ///         (threadIDInGroup * 2) + (i & 0x1) + 8  for ai where i >= 4
  static SmallVector<RowColIndexing> m16n8k16f16Lhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{groupID, threadIDInGroup * 2 + 0},         // i == 0
      RowColIndexing{groupID, threadIDInGroup * 2 + 1},         // i == 1
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0},     // i == 2
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1},     // i == 3
      RowColIndexing{groupID, threadIDInGroup * 2 + 0 + 8},     // i == 4
      RowColIndexing{groupID, threadIDInGroup * 2 + 1 + 8},     // i == 5
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0 + 8}, // i == 6
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1 + 8}  // i == 7
    };
    // clang-format on
  }

  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///
  ///   row = (threadIDInGroup * 2) + (i & 0x1)      for bi where i <  2
  ///         (threadIDInGroup * 2) + (i & 0x1) + 8  for bi where i >= 2
  ///
  ///   col = groupID
  static SmallVector<RowColIndexing> m16n8k16f16Rhs(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{threadIDInGroup * 2 + 0, groupID},     // i == 0
      RowColIndexing{threadIDInGroup * 2 + 1, groupID},     // i == 1
      RowColIndexing{threadIDInGroup * 2 + 0 + 8, groupID}, // i == 2
      RowColIndexing{threadIDInGroup * 2 + 1 + 8, groupID}  // i == 3
    };
    // clang-format on
  }

  /// From the NVIDIA doc:
  ///   groupID         = %laneid >> 2
  ///   threadIDInGroup = %laneid % 4
  ///
  ///   row = groupID      for ci where i <  2
  ///         groupID + 8  for ci where i >= 2
  ///
  ///   col = (threadIDInGroup * 2) + (i & 0x1)  for ci where i = {0,..,3}
  static SmallVector<RowColIndexing> m16n8k16f16Res(MLIRContext *ctx) {
    auto dim = getAffineDimExpr(0, ctx);
    AffineExpr groupID = dim.floorDiv(4);
    AffineExpr threadIDInGroup = dim % 4;
    // clang-format off
    return {
      RowColIndexing{groupID, threadIDInGroup * 2 + 0},     // i == 0
      RowColIndexing{groupID, threadIDInGroup * 2 + 1},     // i == 1
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0}, // i == 2
      RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}  // i == 3
    };
    // clang-format on
  }

  //===--------------------------------------------------------------------===//
  /// Helper functions to create customizable load and store operations. The
  /// specific shapes of each MMA instruction are passed via the
  /// IndexCalculator callback.
  //===--------------------------------------------------------------------===//
  /// Build a list of memref.load operations indexed at `(row, col)` indices
  /// that make sense for a particular MMA instruction and specified via the
  /// IndexCalculator callback.
  SmallVector<Value> buildMemRefLoads(OpBuilder &b, Location loc,
                                      OpFoldResult laneId, Value memref,
                                      const IndexCalculator &indexFn);

  /// Perform a distributed load of a vector operand of `vectorShape` for a
  /// particular MMA instruction whose `(row, col)` indices are specified via
  /// the IndexCalculator callback. Each `laneId` loads the subportion of the
  /// data that makes sense for the particular MMA operation.
  /// The `vectorShape` matches existing NVGPU dialect op specification but
  /// could also be flattened in the future if needed for simplification.
  Value buildMmaSyncMemRefLoadOperand(OpBuilder &b, Location loc,
                                      OpFoldResult laneId, Value memref,
                                      IndexCalculator indexFn,
                                      ArrayRef<int64_t> vectorShape);

  /// Build a list of memref.store operations indexed at `(row, col)` indices
  /// that make sense for a particular MMA instruction and specified via the
  /// IndexCalculator callback.
  SmallVector<Operation *> buildMemRefStores(OpBuilder &b, Location loc,
                                             ValueRange toStore,
                                             OpFoldResult laneId, Value memref,
                                             const IndexCalculator &indexFn);

  /// Perform a distributed store of a vector operand of `vectorShape` for a
  /// particular MMA instruction whose `(row, col)` indices are specified via
  /// the IndexCalculator callback. Each `laneId` stores the subportion of the
  /// data that makes sense for the particular MMA operation.
  /// The `vectorShape` matches existing NVGPU dialect op specification but
  /// could also be flattened in the future if needed for simplification.
  SmallVector<Operation *> buildMmaSyncMemRefStoreOperand(
      OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId,
      Value memref, IndexCalculator indexFn, ArrayRef<int64_t> vectorShape);

  OpBuilder &b;
  Location loc;
  OpFoldResult laneId;
};

//===--------------------------------------------------------------------===//
/// Helper functions to create customizable load and store operations. The
/// specific shapes of each MMA instruction are passed via the
/// IndexCalculator callback.
//===--------------------------------------------------------------------===//

template <typename ApplyFn, typename ReduceFn>
static void foreachIndividualVectorElement(Value vector, ApplyFn applyFn,
                                           ReduceFn reduceFn) {
  VectorType vectorType = cast<VectorType>(vector.getType());
  auto vectorShape = vectorType.getShape();
  auto strides = computeStrides(vectorShape);
  for (int64_t idx = 0, e = vectorShape[0] * strides[0]; idx < e; ++idx) {
    auto indices = delinearize(idx, strides);
    reduceFn(applyFn(vector, idx, indices), idx, indices);
  }
}
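
// For example, iterating over a vector<2x2xf16> visits the elements in
// row-major order: linear indices 0..3 delinearize to {0,0}, {0,1}, {1,0}
// and {1,1} respectively.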

SmallVector<Value>
MmaSyncBuilder::buildMemRefLoads(OpBuilder &b, Location loc,
                                 OpFoldResult laneId, Value memref,
                                 const IndexCalculator &indexFn) {
  auto aff = [&](AffineExpr e) {
    return affine::makeComposedFoldedAffineApply(b, loc, e, laneId);
  };
  SmallVector<Value> res;
  SmallVector<RowColIndexing> indexings = indexFn(b.getContext());
  for (auto indexing : indexings) {
    Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row()));
    Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col()));
    auto load = memref::LoadOp::create(b, loc, memref, ValueRange{row, col});
    res.push_back(load);
  }
  return res;
}

Value MmaSyncBuilder::buildMmaSyncMemRefLoadOperand(
    OpBuilder &b, Location loc, OpFoldResult laneId, Value memref,
    IndexCalculator indexFn, ArrayRef<int64_t> vectorShape) {
  auto loads = buildMemRefLoads(b, loc, laneId, memref, indexFn);

  Type elementType = getElementTypeOrSelf(memref.getType());
  auto vt = VectorType::get(vectorShape, elementType);
  Value res = vector::BroadcastOp::create(b, loc, vt, loads[0]);
  foreachIndividualVectorElement(
      res,
      /*applyFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        return loads[linearIdx];
      },
      /*reduceFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        res = vector::InsertOp::create(b, loc, v, res, indices);
      });

  return res;
}

SmallVector<Operation *> MmaSyncBuilder::buildMemRefStores(
    OpBuilder &b, Location loc, ValueRange toStore, OpFoldResult laneId,
    Value memref, const IndexCalculator &indexFn) {
  auto aff = [&](AffineExpr e) {
    return affine::makeComposedFoldedAffineApply(b, loc, e, laneId);
  };
  SmallVector<Operation *> res;
  for (auto [indexing, val] :
       llvm::zip_equal(indexFn(b.getContext()), toStore)) {
    Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row()));
    Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col()));
    Operation *store =
        memref::StoreOp::create(b, loc, val, memref, ValueRange{row, col});
    res.push_back(store);
  }
  return res;
}

SmallVector<Operation *> MmaSyncBuilder::buildMmaSyncMemRefStoreOperand(
    OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId,
    Value memref, IndexCalculator indexFn, ArrayRef<int64_t> vectorShape) {
  SmallVector<Value> toStore;
  toStore.reserve(32);
  foreachIndividualVectorElement(
      vectorToStore,
      /*applyFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        return vector::ExtractOp::create(b, loc, vectorToStore, indices);
      },
      /*reduceFn=*/
      [&](Value v, int64_t linearIdx, ArrayRef<int64_t> indices) {
        toStore.push_back(v);
      });
  return buildMemRefStores(b, loc, toStore, laneId, memref, indexFn);
}

static std::tuple<SmallVector<int64_t>, SmallVector<int64_t>,
                  SmallVector<int64_t>>
makeVectorShapes(ArrayRef<int64_t> lhs, ArrayRef<int64_t> rhs,
                 ArrayRef<int64_t> res) {
  SmallVector<int64_t> vlhs(lhs);
  SmallVector<int64_t> vrhs(rhs);
  SmallVector<int64_t> vres(res);
  return std::make_tuple(vlhs, vrhs, vres);
}

FailureOr<MmaSyncBuilder::MmaSyncInfo>
MmaSyncBuilder::getIndexCalculators(ArrayRef<int64_t> opShape,
                                    TypeRange elementalTypes) {
  // TODO: Tablegen all this.
  Type f16 = b.getF16Type();
  Type f32 = b.getF32Type();
  if (opShape == ArrayRef<int64_t>{16, 8, 4} &&
      elementalTypes == TypeRange{f32, f32, f32}) {
    return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k4tf32Lhs,
                                       &MmaSyncBuilder::m16n8k4tf32Rhs,
                                       &MmaSyncBuilder::m16n8k4tf32Res),
                       makeVectorShapes({2, 1}, {1, 1}, {2, 2}),
                       SmallVector<int64_t>{opShape},
                       /*tf32Enabled=*/true};
  }
  // This is the version with f16 accumulation.
  // TODO: version with f32 accumulation.
  if (opShape == ArrayRef<int64_t>{16, 8, 16} &&
      elementalTypes == TypeRange{f16, f16, f16}) {
    return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k16f16Lhs,
                                       &MmaSyncBuilder::m16n8k16f16Rhs,
                                       &MmaSyncBuilder::m16n8k16f16Res),
                       makeVectorShapes({4, 2}, {2, 2}, {2, 2}),
                       SmallVector<int64_t>{opShape},
                       /*tf32Enabled=*/false};
  }
  return failure();
}
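
// For example, a 16x8x4 matmul whose LHS, RHS and accumulator are all f32
// takes the tf32 path above: each lane owns a 2x1 LHS fragment, a 1x1 RHS
// fragment and a 2x2 accumulator fragment, and tf32Enabled is set on the
// resulting mma.sync.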

FailureOr<Operation *> MmaSyncBuilder::buildMmaSync(LinalgOp linalgOp) {
  Value lhsMemRef = linalgOp.getDpsInputOperand(0)->get();
  Value rhsMemRef = linalgOp.getDpsInputOperand(1)->get();
  Value resMemRef = linalgOp.getDpsInitOperand(0)->get();
  assert(cast<MemRefType>(lhsMemRef.getType()).getRank() == 2 &&
         "expected lhs to be a 2D memref");
  assert(cast<MemRefType>(rhsMemRef.getType()).getRank() == 2 &&
         "expected rhs to be a 2D memref");
  assert(cast<MemRefType>(resMemRef.getType()).getRank() == 2 &&
         "expected res to be a 2D memref");

  int64_t m = cast<MemRefType>(lhsMemRef.getType()).getShape()[0];
  int64_t n = cast<MemRefType>(rhsMemRef.getType()).getShape()[1];
  int64_t k = cast<MemRefType>(lhsMemRef.getType()).getShape()[1];
  Type lhsType = getElementTypeOrSelf(lhsMemRef.getType());
  Type rhsType = getElementTypeOrSelf(rhsMemRef.getType());
  Type resType = getElementTypeOrSelf(resMemRef.getType());

  FailureOr<MmaSyncInfo> maybeInfo =
      getIndexCalculators({m, n, k}, {lhsType, rhsType, resType});
  if (failed(maybeInfo))
    return failure();

  const MmaSyncInfo &info = *maybeInfo;
  auto [lhsIndexFn, rhsIndexFn, resIndexFn] = info.indexFns;
  auto [lhsShape, rhsShape, resShape] = info.vectorShapes;
  Value lhs = buildMmaSyncMemRefLoadOperand(b, loc, laneId, lhsMemRef,
                                            lhsIndexFn, lhsShape);
  Value rhs = buildMmaSyncMemRefLoadOperand(b, loc, laneId, rhsMemRef,
                                            rhsIndexFn, rhsShape);
  Value res = buildMmaSyncMemRefLoadOperand(b, loc, laneId, resMemRef,
                                            resIndexFn, resShape);
  res =
      MmaSyncOp::create(b, loc, lhs, rhs, res, info.mmaShape, info.tf32Enabled);
  buildMmaSyncMemRefStoreOperand(b, loc, res, laneId, resMemRef, resIndexFn,
                                 resShape);
  return res.getDefiningOp();
}

DiagnosedSilenceableFailure RewriteMatmulAsMmaSyncOp::applyToOne(
    TransformRewriter &rewriter, LinalgOp linalgOp,
    ApplyToEachResultList &results, TransformState &state) {
  bool fail = true;
  // TODO: more robust detection of matmulOp, with transposes etc.
  if (isa_and_nonnull<linalg::MatmulOp>(linalgOp.getOperation())) {
    // Do not let matmuls with extended semantics (user-defined indexing maps)
    // through this transform.
    if (linalgOp.hasUserDefinedMaps()) {
      return emitSilenceableError()
             << "only matmul ops with non-extended semantics are supported";
    }
    Location loc = linalgOp.getLoc();
    // TODO: more robust computation of laneId, for now assume a single warp.
    Value laneId = gpu::ThreadIdOp::create(
        rewriter, loc, rewriter.getIndexType(), gpu::Dimension::x);
    if (succeeded(MmaSyncBuilder(rewriter, loc, laneId).buildMmaSync(linalgOp)))
      fail = false;
  }

  if (fail) {
    DiagnosedSilenceableFailure diag = emitSilenceableError()
                                       << "unsupported target op: " << linalgOp;
    diag.attachNote(linalgOp->getLoc()) << "target op";
    return diag;
  }

  rewriter.eraseOp(linalgOp);
  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// Hopper builders.
//===----------------------------------------------------------------------===//

/// Helper to create the base Hopper-specific operations that are reused in
/// various other places.
struct HopperBuilder {
  HopperBuilder(RewriterBase &rewriter, Location loc)
      : rewriter(rewriter), loc(loc) {}

  TypedValue<MBarrierGroupType>
  buildAndInitBarrierInSharedMemory(OpFoldResult numThreads);

  /// Create tma descriptor op to initiate transfer from global to shared
  /// memory. This must be done before the launch op, on the host.
  TypedValue<TensorMapDescriptorType>
  buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
                              gpu::LaunchOp launchOp);

  /// Build a tma load from global memory to shared memory using `barrier` to
  /// synchronize. Return the number of bytes that will be transferred.
  OpFoldResult buildTmaAsyncLoad(TypedValue<TensorMapDescriptorType> globalDesc,
                                 TypedValue<MemRefType> sharedMemref,
                                 TypedValue<MBarrierGroupType> barrier,
                                 SmallVectorImpl<Operation *> &loadOps);
  void buildBarrierArriveTx(TypedValue<MBarrierGroupType> barrier,
                            ArrayRef<OpFoldResult> sizes);

  /// If threadIdx.x == 0 does TMA request + wait, else just wait.
  /// Return the operation that performs the transfer on thread0.
  // TODO: In the future, don't hardcode to thread 0 but elect a leader.
  SmallVector<Operation *> buildPredicateLoadsOnThread0(
      ArrayRef<TypedValue<TensorMapDescriptorType>> globalDescriptors,
      ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
      TypedValue<MBarrierGroupType> barrier);

  void buildTryWaitParity(TypedValue<MBarrierGroupType> barrier);

  RewriterBase &rewriter;
  Location loc;
};

SmallVector<Operation *> HopperBuilder::buildPredicateLoadsOnThread0(
    ArrayRef<TypedValue<TensorMapDescriptorType>> globalDescriptors,
    ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
    TypedValue<MBarrierGroupType> barrier) {
  SmallVector<Operation *> loadOps;
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  Value tidx = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
  Value cond = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq,
                                     tidx, zero);
  // clang-format off
  scf::IfOp::create(rewriter,
    /*location=*/loc,
    /*conditional=*/cond,
    /*thenBuilder=*/
    [&](OpBuilder &lb, Location loc) {
      SmallVector<OpFoldResult> sizes;
      sizes.reserve(globalDescriptors.size());
      for (auto [desc, shmem] : llvm::zip_equal(
              globalDescriptors, sharedMemBuffers)) {
        OpFoldResult sz = buildTmaAsyncLoad(desc, shmem, barrier, loadOps);
        sizes.push_back(sz);
      }
      // TODO: Note that cutlass predeclares the barrier arrive tx before the tma.async.load.
      // This may or may not have perf implications.
      buildBarrierArriveTx(barrier, sizes);
      scf::YieldOp::create(rewriter, loc);
    },
    /*elseBuilder=*/
    [&](OpBuilder &lb, Location loc) {
      // TODO: is this for no-thread divergence?
      // Should we just yield the size and hoist?
      buildBarrierArriveTx(barrier, getAsIndexOpFoldResult(rewriter.getContext(), 0));
      scf::YieldOp::create(rewriter, loc);
    });
  // clang-format on
  return loadOps;
}

static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) {
  return gpu::AddressSpaceAttr::get(
      b.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
  // return b.getI64IntegerAttr(static_cast<int64_t>(kSharedMemorySpace));
}

TypedValue<MBarrierGroupType>
HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) {
  auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
  Value barrier = MBarrierCreateOp::create(
      rewriter, loc,
      MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace));
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  nvgpu::MBarrierInitOp::create(
      rewriter, loc, barrier,
      getValueOrCreateConstantIndexOp(rewriter, loc, numThreads), zero,
      Value());
  gpu::BarrierOp::create(rewriter, loc);
  return cast<TypedValue<MBarrierGroupType>>(barrier);
}

TypedValue<TensorMapDescriptorType>
HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
                                           gpu::LaunchOp launchOp) {
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(launchOp);
  Value unrankedMemRef = memref::CastOp::create(
      rewriter, loc,
      UnrankedMemRefType::get(memref.getType().getElementType(),
                              memref.getType().getMemorySpace()),
      memref);
  SmallVector<OpFoldResult> mixedSizes =
      memref::getMixedSizes(rewriter, loc, memref);
  SmallVector<Value> sizes =
      getValueOrCreateConstantIndexOp(rewriter, loc, mixedSizes);

  auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
  Value desc = TmaCreateDescriptorOp::create(
      rewriter, loc,
      TensorMapDescriptorType::get(rewriter.getContext(),
                                   MemRefType::Builder(memref.getType())
                                       .setMemorySpace(sharedMemorySpace),
                                   TensorMapSwizzleKind::SWIZZLE_NONE,
                                   TensorMapL2PromoKind::L2PROMO_NONE,
                                   TensorMapOOBKind::OOB_ZERO,
                                   TensorMapInterleaveKind::INTERLEAVE_NONE),
      unrankedMemRef, sizes);
  return cast<TypedValue<TensorMapDescriptorType>>(desc);
}

OpFoldResult HopperBuilder::buildTmaAsyncLoad(
    TypedValue<TensorMapDescriptorType> globalDesc,
    TypedValue<MemRefType> sharedMemref,
    TypedValue<MBarrierGroupType> barrier,
    SmallVectorImpl<Operation *> &loadOps) {
  MLIRContext *ctx = rewriter.getContext();
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  Operation *loadOp =
      TmaAsyncLoadOp::create(rewriter, loc, sharedMemref, barrier, globalDesc,
                             ValueRange{zero, zero}, zero, Value(), Value());
  loadOps.push_back(loadOp);
  auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref);
  SmallVector<AffineExpr> symbols(mixedSizes.size());
  bindSymbolsList(ctx, llvm::MutableArrayRef{symbols});
  AffineExpr prodExprInBytes =
      computeProduct(ctx, symbols) *
      (sharedMemref.getType().getElementTypeBitWidth() / 8);
  auto res = affine::makeComposedFoldedAffineApply(rewriter, loc,
                                                   prodExprInBytes, mixedSizes);
  return res;
}

void HopperBuilder::buildBarrierArriveTx(TypedValue<MBarrierGroupType> barrier,
                                         ArrayRef<OpFoldResult> mixedSizes) {
  assert(!mixedSizes.empty() && "expected non-empty sizes");
  MLIRContext *ctx = rewriter.getContext();
  SmallVector<AffineExpr> symbols(mixedSizes.size());
  bindSymbolsList(ctx, llvm::MutableArrayRef{symbols});
  AffineExpr sumExpr = computeSum(ctx, symbols);
  OpFoldResult size =
      affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes);
  Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size);
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  nvgpu::MBarrierArriveExpectTxOp::create(rewriter, loc, barrier, sizeVal, zero,
                                          Value());
}

void HopperBuilder::buildTryWaitParity(TypedValue<MBarrierGroupType> barrier) {
  Type i1 = rewriter.getI1Type();
  Value parity = LLVM::ConstantOp::create(rewriter, loc, i1, 0);
  // 10M is an arbitrary, not too small or too big number to specify the number
  // of ticks before retry.
  // TODO: hoist this in a default dialect constant.
  Value ticksBeforeRetry =
      arith::ConstantIndexOp::create(rewriter, loc, 10000000);
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
  nvgpu::MBarrierTryWaitParityOp::create(rewriter, loc, barrier, parity,
                                         ticksBeforeRetry, zero);
}

//===----------------------------------------------------------------------===//
// RewriteCopyAsTmaOp
//===----------------------------------------------------------------------===//

/// Helper to create the tma operations corresponding to `linalg::CopyOp`.
struct CopyBuilder : public HopperBuilder {
  CopyBuilder(RewriterBase &rewriter, Location loc)
      : HopperBuilder(rewriter, loc) {}

  SmallVector<Operation *> rewrite(ArrayRef<Operation *> copyOps);
};

SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
  MLIRContext *ctx = rewriter.getContext();
  if (copyOps.empty())
    return SmallVector<Operation *>();

  auto launchOp = copyOps.front()->getParentOfType<gpu::LaunchOp>();
  assert(launchOp && "expected launch op");

  // 1. Init a barrier object in shared memory.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(copyOps.front());
  AffineExpr bx, by, bz;
  bindSymbols(ctx, bx, by, bz);
  AffineExpr prod = computeProduct(ctx, ArrayRef<AffineExpr>{bx, by, bz});
  OpFoldResult numThreads = affine::makeComposedFoldedAffineApply(
      rewriter, loc, prod,
      ArrayRef<OpFoldResult>{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(),
                             launchOp.getBlockSizeZ()});

  TypedValue<MBarrierGroupType> barrier =
      buildAndInitBarrierInSharedMemory(numThreads);

  SmallVector<TypedValue<MemRefType>> shmems;
  SmallVector<TypedValue<TensorMapDescriptorType>> globalDescs;
  for (Operation *op : copyOps) {
    auto copyOp = cast<linalg::CopyOp>(op);
    auto inMemRef =
        cast<TypedValue<MemRefType>>(copyOp.getDpsInputOperand(0)->get());
    assert(inMemRef.getType().getRank() == 2 &&
           "expected in to be a 2D memref");

    // 2. Build global memory descriptor.
    TypedValue<TensorMapDescriptorType> globalDesc =
        buildGlobalMemRefDescriptor(inMemRef, launchOp);
    globalDescs.push_back(globalDesc);

    // 3. Shared memory and descriptor for the tmp array.
    auto shmem =
        cast<TypedValue<MemRefType>>(copyOp.getDpsInitOperand(0)->get());
    shmems.push_back(shmem);
  }

  // 4. Load in from global memory to shared memory using tma.
  OpBuilder::InsertionGuard g2(rewriter);
  rewriter.setInsertionPoint(copyOps.front());
  SmallVector<Operation *> results =
      buildPredicateLoadsOnThread0(globalDescs, shmems, barrier);

  // 5. Spin-loop until data is ready.
  buildTryWaitParity(barrier);

  // 6. Erase the ops that have now been rewritten.
  for (Operation *op : copyOps)
    rewriter.eraseOp(op);

  return results;
}

DiagnosedSilenceableFailure
RewriteCopyAsTmaOp::apply(TransformRewriter &rewriter,
                          TransformResults &results, TransformState &state) {
  auto payloadOps = state.getPayloadOps(getTarget());
  gpu::LaunchOp commonLaunchOp;
  Operation *firstOp, *failingOp;
  if (llvm::any_of(payloadOps, [&](Operation *op) {
        if (!commonLaunchOp) {
          commonLaunchOp = op->getParentOfType<gpu::LaunchOp>();
          firstOp = op;
        }
        auto fail = !op->getParentOfType<gpu::LaunchOp>() ||
                    commonLaunchOp != op->getParentOfType<gpu::LaunchOp>() ||
                    !isa<linalg::CopyOp>(op);
        if (fail)
          failingOp = op;
        return fail;
      })) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "target ops must be linalg::CopyOp nested under a common "
           "gpu.LaunchOp to be rewritten because the tma descriptors need to "
           "be created on the host.\nBut got: "
        << *firstOp << "\nand " << *failingOp;
    return diag;
  }

  // TODO: more robust detection of copy, with transposes etc.
  CopyBuilder(rewriter, getLoc()).rewrite(llvm::to_vector(payloadOps));

  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//

namespace {
class NVGPUTransformDialectExtension
    : public TransformDialectExtension<NVGPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(NVGPUTransformDialectExtension)

  NVGPUTransformDialectExtension() {
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<affine::AffineDialect>();
    declareGeneratedDialect<NVGPUDialect>();
    declareGeneratedDialect<NVVM::NVVMDialect>();
    declareGeneratedDialect<vector::VectorDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc"

void mlir::nvgpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<NVGPUTransformDialectExtension>();
}
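
// Example usage (a sketch, not part of this file): a tool that wants these
// transform ops available can attach the extension to its registry before
// building the context; the exact namespace of
// registerTransformDialectExtension is assumed from this file's context.
//
//   mlir::DialectRegistry registry;
//   mlir::nvgpu::registerTransformDialectExtension(registry);
//   mlir::MLIRContext context(registry);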