CreateAsyncGroups.cpp
//===- CreateAsyncGroups.cpp - Create async device copies -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Utils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "llvm/ADT/SetVector.h"
using namespace mlir;

/// Return "true" if the given vector transfer op is contiguous and suitable
/// for replacement with an async copy.
template <typename OpTy>
static bool isContiguousXferOp(OpTy op) {
  return op.getPermutationMap().isMinorIdentity() && op.isDimInBounds(0) &&
         op.hasPureBufferSemantics() &&
         cast<MemRefType>(nvgpu::getMemrefOperand(op).getType())
             .isLastDimUnitStride();
}
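
// Illustrative example (added for exposition, not part of the upstream file):
// a 1-D read such as the following is considered contiguous, assuming %src is
// a row-major memref whose innermost dimension has unit stride:
//
//   %v = vector.transfer_read %src[%i, %j], %pad {in_bounds = [true]}
//       : memref<128x64xf32>, vector<4xf32>
//
// A transposing permutation_map or a non-unit innermost stride makes
// isContiguousXferOp return "false".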

/// Return "true" if the given op is a contiguous and suitable
/// vector.transfer_write or vector.store op.
static bool isContiguousStore(Operation *write) {
  if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(write))
    return isContiguousXferOp(transferWrite) && !transferWrite.getMask();
  // vector.store ops are always contiguous.
  return isa<vector::StoreOp>(write);
}

/// Return "true" if the given op is a contiguous and suitable
/// vector.transfer_read or vector.load op.
static bool isContiguousRead(Operation *read) {
  if (auto transferRead = dyn_cast<vector::TransferReadOp>(read))
    return isContiguousXferOp(transferRead);
  // vector.load ops are always contiguous.
  return isa<vector::LoadOp>(read);
}
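
// For orientation (example added here, not in the upstream file): the pattern
// this transform ultimately rewrites is a contiguous 1-D load from global
// memory whose result is immediately stored into workgroup (shared) memory:
//
//   %v = vector.load %global[%i] : memref<1024xf32>, vector<4xf32>
//   vector.store %v, %shared[%j]
//       : memref<256xf32, #gpu.address_space<workgroup>>, vector<4xf32>
//
// The two predicates above gate which load/store ops may participate.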

namespace {
/// A vector.create_mask op and extract position.
struct TransferMask {
  vector::CreateMaskOp createMaskOp;
  SmallVector<int64_t> extractPosition;
};
} // namespace

/// If the given vector load op has a mask that is defined by
/// vector.create_mask, return that op.
static FailureOr<TransferMask> getMaskOp(Operation *loadOp) {
  auto transferRead = dyn_cast<vector::TransferReadOp>(loadOp);
  if (!transferRead || !transferRead.getMask())
    return TransferMask{{}, {}};
  assert(transferRead.getMask().getType().getRank() == 1 &&
         "expected 1-D mask");

  // Case 1: Mask is the result of a vector.create_mask.
  if (auto maskOp =
          transferRead.getMask().getDefiningOp<vector::CreateMaskOp>())
    return TransferMask{maskOp, {}};

  // Case 2: Mask is the result of a vector.extract(vector.create_mask).
  if (auto extractOp =
          transferRead.getMask().getDefiningOp<vector::ExtractOp>())
    if (auto maskOp =
            extractOp.getSource().getDefiningOp<vector::CreateMaskOp>())
      return TransferMask{maskOp,
                          SmallVector<int64_t>(extractOp.getStaticPosition())};

  // All other cases: not supported.
  return failure();
}
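
// Example of the two supported mask shapes (illustrative, not from the
// upstream file), assuming %sz0 and %sz1 are index values:
//
//   Case 1: %mask = vector.create_mask %sz0 : vector<4xi1>
//   Case 2: %m2d  = vector.create_mask %sz0, %sz1 : vector<2x4xi1>
//           %mask = vector.extract %m2d[0] : vector<4xi1> from vector<2x4xi1>
//
// Any other mask producer makes getMaskOp return failure().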

/// Build an SSA value that represents the number of read elements.
static Value buildNumReadElements(OpBuilder &b, Location loc,
                                  Operation *readOp) {
  FailureOr<TransferMask> transferMask = getMaskOp(readOp);
  assert(succeeded(transferMask) && "invalid transfer mask");

  // No mask => no num_read_elements.
  if (!transferMask->createMaskOp)
    return Value();

  // No extract: return size of "ones" segment in the mask.
  if (transferMask->extractPosition.empty()) {
    assert(transferMask->createMaskOp.getNumOperands() == 1 &&
           "expected single operand");
    return transferMask->createMaskOp.getOperand(0);
  }

  // vector.extract(vector.create_mask).
  // If extract_pos < num_ones, take number of elements from the least
  // significant dimension. (Do this for all dimensions and bit-AND the
  // conditions.)
  assert(transferMask->createMaskOp.getVectorType().getRank() -
                 transferMask->extractPosition.size() ==
             1 &&
         "expected N-D -> (N-1)-D extract");
  Value cond;
  // Note: There is one more `sz` than `pos`. The loop ends with the last `pos`.
  for (auto [pos, sz] : llvm::zip(transferMask->extractPosition,
                                  transferMask->createMaskOp->getOperands())) {
    Value cmp =
        arith::CmpIOp::create(b, loc, arith::CmpIPredicate::slt,
                              arith::ConstantIndexOp::create(b, loc, pos), sz);
    if (!cond) {
      cond = cmp;
      continue;
    }
    cond = arith::AndIOp::create(b, loc, cmp, cond);
  }
  return arith::SelectOp::create(
      b, loc, cond, transferMask->createMaskOp->getOperands().back(),
      arith::ConstantIndexOp::create(b, loc, 0));
}
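
// Sketch of the IR produced for Case 2 above (illustrative; the value names
// are made up): for extractPosition = [0] and create_mask operands
// (%sz0, %sz1), the generated num_read_elements is roughly:
//
//   %c0   = arith.constant 0 : index
//   %cmp  = arith.cmpi slt, %c0, %sz0 : index
//   %nums = arith.select %cmp, %sz1, %c0 : index
//
// i.e. the innermost mask size if the extracted row lies inside the "ones"
// region, and 0 otherwise.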

/// Return "true" if a copy of the given memref/vector types can be lowered to
/// a supported async copy (cp.async).
static bool resultsInSupportedAsyncCopy(MemRefType memrefType,
                                        VectorType vecType) {
  assert(vecType.getRank() == 1 && "expected 1-D vector");
  constexpr int64_t kSupportedCpAsyncAlignmentsInBytes[3] = {4, 8, 16};

  // Condition 1: the copy size must be supported.
  bool supportedCopySize = false;
  int64_t numElements = vecType.getNumElements();
  Type elementType = vecType.getElementType();
  for (int64_t alignmentInBytes : kSupportedCpAsyncAlignmentsInBytes) {
    if (alignmentInBytes * 8 ==
        numElements * elementType.getIntOrFloatBitWidth()) {
      supportedCopySize = true;
      break;
    }
  }
  if (!supportedCopySize)
    return false;

  // TODO: Condition 2: the alignments must be supported. For cp.async the
  // NVIDIA doc (section 6.4.1) says: "The address must be naturally aligned to
  // a multiple of the access size. If an address is not properly aligned, the
  // resulting behavior is undefined.".
  return true;
}
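
// Worked example (added for clarity): cp.async only supports 4-, 8-, and
// 16-byte transfers, so vector<4xf32> (4 x 32 bits = 16 bytes) and
// vector<8xf16> (8 x 16 bits = 16 bytes) pass the size check, while
// vector<3xf32> (12 bytes) and vector<2xi8> (2 bytes) do not.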

void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op,
                              bool bypassL1) {
  llvm::SmallSetVector<Operation *, 16> copyToSharedMem;

  // Look for all the copies that can be converted to async copy ops.
  op->walk([&](Operation *writeOp) {
    // Look for contiguous 1D vector store into shared memory.
    if (!isContiguousStore(writeOp))
      return;
    Value vectorVal = nvgpu::getValueStored(writeOp);
    if (cast<VectorType>(vectorVal.getType()).getRank() != 1)
      return;
    Value storeBase = nvgpu::getMemrefOperand(writeOp);
    if (!nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(
            cast<MemRefType>(storeBase.getType())))
      return;

    // The stored vector must originate from a contiguous 1D vector load.
    Operation *readOp = vectorVal.getDefiningOp();
    if (readOp == nullptr || !isContiguousRead(readOp))
      return;
    Value loadBase = nvgpu::getMemrefOperand(readOp);
    // Should be reading from global memory (not shared memory).
    if (nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(
            cast<MemRefType>(loadBase.getType())))
      return;

    // Look for compatible mask and padding.
    if (auto transferRead = dyn_cast<vector::TransferReadOp>(readOp)) {
      if (Value mask = transferRead.getMask()) {
        if (getConstantIntValue(transferRead.getPadding()) !=
            static_cast<int64_t>(0))
          return;
        if (failed(getMaskOp(readOp)))
          return;
      }
    }

    // Check whether both accesses are supported before we emit: this is
    // necessary to ensure the correctness of DeviceAsyncCopyOp.
    VectorType vecType = cast<VectorType>(vectorVal.getType());

    if (!resultsInSupportedAsyncCopy(cast<MemRefType>(loadBase.getType()),
                                     vecType) ||
        !resultsInSupportedAsyncCopy(cast<MemRefType>(storeBase.getType()),
                                     vecType))
      return;

    copyToSharedMem.insert(writeOp);
    return;
  });

  while (!copyToSharedMem.empty()) {
    // Start a group with the first write.
    SmallVector<Operation *> group;
    Operation *writeOp = *copyToSharedMem.begin();
    copyToSharedMem.remove(writeOp);
    group.push_back(writeOp);
    Operation *nextNode = writeOp;

    // Look in the next nodes for more copies to add to the same group.
    while ((nextNode = nextNode->getNextNode())) {
      // Ignore ops without side effects.
      auto memInterface = dyn_cast<MemoryEffectOpInterface>(nextNode);
      if (memInterface && memInterface.hasNoEffect() &&
          !nextNode->hasTrait<OpTrait::HasRecursiveMemoryEffects>())
        continue;
      // Ignore reads from a different address space.
      if (isa<vector::TransferReadOp, vector::LoadOp>(nextNode)) {
        Operation *readOp = nextNode;
        Value memrefOperand = nvgpu::getMemrefOperand(readOp);
        if (!nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(
                cast<MemRefType>(memrefOperand.getType()))) {
          continue;
        }
      }
      if (copyToSharedMem.count(nextNode)) {
        // Found another copy, add it to the group.
        copyToSharedMem.remove(nextNode);
        group.push_back(nextNode);
        continue;
      }
      // If the op is anything else, stop accumulating ops into the group.
      break;
    }

    // Emit the group.
    SmallVector<Value> tokens;
    for (Operation *writeOp : group) {
      rewriter.setInsertionPoint(writeOp);
      Value vectorVal = nvgpu::getValueStored(writeOp);
      auto vectorType = cast<VectorType>(vectorVal.getType());
      int64_t numElements = vectorType.getNumElements();
      Operation *readOp = vectorVal.getDefiningOp();
      Value storeBase = nvgpu::getMemrefOperand(writeOp);
      Value loadBase = nvgpu::getMemrefOperand(readOp);
      Value numReadElements =
          buildNumReadElements(rewriter, writeOp->getLoc(), readOp);
      auto dstMemref = cast<MemRefType>(storeBase.getType());
      int64_t sizeInBytes =
          (dstMemref.getElementTypeBitWidth() * numElements) / 8;
      // bypass_l1 is only possible with 16-byte transfers.
      Value token = nvgpu::DeviceAsyncCopyOp::create(
          rewriter, writeOp->getLoc(),
          nvgpu::DeviceAsyncTokenType::get(op->getContext()),
          /*dst=*/storeBase, /*dstIndices=*/nvgpu::getIndices(writeOp),
          /*src=*/loadBase,
          /*srcIndices=*/nvgpu::getIndices(readOp),
          /*dstElements=*/rewriter.getIndexAttr(numElements),
          /*srcElements=*/numReadElements,
          /*bypassL1=*/bypassL1 && sizeInBytes == 16 ? rewriter.getUnitAttr()
                                                     : UnitAttr());
      tokens.push_back(token);
    }

    // Create the group and wait for it right after.
    Value groupToken = nvgpu::DeviceAsyncCreateGroupOp::create(
        rewriter, op->getLoc(),
        nvgpu::DeviceAsyncTokenType::get(op->getContext()), tokens);
    nvgpu::DeviceAsyncWaitOp::create(rewriter, op->getLoc(), groupToken,
                                     nullptr);
    // Clean up old stores.
    for (Operation *writeOp : group)
      rewriter.eraseOp(writeOp);
  }
}
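
// End-to-end sketch of the rewrite (illustrative only; exact operand order and
// attribute spelling follow the nvgpu dialect): a qualifying pair such as
//
//   %v = vector.transfer_read %global[%i], %f0 {in_bounds = [true]}
//       : memref<1024xf32>, vector<4xf32>
//   vector.transfer_write %v, %shared[%j] {in_bounds = [true]}
//       : vector<4xf32>, memref<256xf32, #gpu.address_space<workgroup>>
//
// becomes roughly
//
//   %t = nvgpu.device_async_copy %global[%i], %shared[%j], 4
//       : memref<1024xf32> to memref<256xf32, #gpu.address_space<workgroup>>
//   %g = nvgpu.device_async_create_group %t
//   nvgpu.device_async_wait %g
//
// with consecutive copies sharing one create_group/wait pair.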