OptimizeSharedMemory.cpp
//===- OptimizeSharedMemory.cpp - MLIR NVGPU pass implementation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements transforms to optimize accesses to shared memory.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/Passes.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"

namespace mlir {
namespace nvgpu {
#define GEN_PASS_DEF_OPTIMIZESHAREDMEMORY
#include "mlir/Dialect/NVGPU/Passes.h.inc"
} // namespace nvgpu
} // namespace mlir

using namespace mlir;
using namespace mlir::nvgpu;

/// The size of a shared memory line according to NV documentation.
constexpr int64_t kSharedMemoryLineSizeBytes = 128;
/// We optimize for 128-bit accesses, but this can be made an argument in the
/// future.
constexpr int64_t kDefaultVectorSizeBits = 128;

/// Uses `srcIndexValue` to permute `tgtIndexValue` via
/// `result = xor(floordiv(srcIdxVal, permuteEveryN),
///               floordiv(tgtIdxVal, vectorSize)) * vectorSize
///           + tgtIdxVal % vectorSize`
/// This is done using an optimized sequence of `arith` operations.
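/// Illustrative example (an assumed shape, not part of the contract): for a
/// `memref<?x32xf16>` a row spans 64 bytes, so `permuteEveryN = 2` and a
/// vector holds 8 elements. Rows 0-1 keep their vector order, rows 2-3 xor
/// the vector index with 1, rows 4-5 with 2, rows 6-7 with 3, and the
/// pattern repeats every 8 rows.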
static Value permuteVectorOffset(OpBuilder &b, Location loc,
                                 ArrayRef<Value> indices, MemRefType memrefTy,
                                 int64_t srcDim, int64_t tgtDim) {
  // Adjust the src index to change how often the permutation changes
  // if necessary.
  Value src = indices[srcDim];

  // We only want to permute every N iterations of the target dim where N is
  // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
  const int64_t permuteEveryN = std::max<int64_t>(
      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
                                        memrefTy.getElementTypeBitWidth()) /
                                       8));
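  // E.g. (illustrative): for f16 elements and a target dim of size 32, a row
  // spans 32 * 16 / 8 = 64 bytes, so permuteEveryN = 128 / 64 = 2 and the
  // xor pattern advances only every other row.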

  // clang-format off
  // Index bit representation (b0 = least significant bit) for dim(1)
  // of a `memref<?x?xDT>` is as follows:
  //   N := log2(128/elementSizeBits)
  //   M := log2(dimSize(1))
  // then
  //   bits[0:N] = sub-vector element offset
  //   bits[N:M] = vector index
  // clang-format on
  int64_t n =
      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
  int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
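  // E.g. (illustrative): for f16 elements, n = log2(128 / 16) = 3; with
  // dimSize(tgtDim) = 32, m = log2(32) = 5, so bits [3:5) select one of the
  // four 128-bit vectors within a row.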

  // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
  int64_t mask = (1LL << (m - n)) - 1;
  if (permuteEveryN > 1)
    mask = mask << llvm::Log2_64(permuteEveryN);
  Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
  srcBits = b.create<arith::AndIOp>(loc, src, srcBits);

  // Use the src bits to permute the target bits b[N:M] containing the
  // vector offset.
  if (permuteEveryN > 1) {
    int64_t shlBits = n - llvm::Log2_64(permuteEveryN);
    if (shlBits > 0) {
      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, shlBits);
      srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
    } else if (shlBits < 0) {
      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, -1 * shlBits);
      srcBits = b.createOrFold<arith::ShRUIOp>(loc, srcBits, finalShiftVal);
    }
  } else {
    Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, n);
    srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
  }
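  // At this point `srcBits` is aligned so that its set bits fall within
  // bits [n:m) of the index, i.e. the xor below only permutes the vector
  // index and leaves the sub-vector element offset untouched.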

  Value permutedVectorIdx =
      b.create<arith::XOrIOp>(loc, indices[tgtDim], srcBits);
  return permutedVectorIdx;
}

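/// Replace the index for `tgtDim` with the permuted value computed by
/// `permuteVectorOffset`, using the index of `srcDim` as the permutation
/// source.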
static void transformIndices(OpBuilder &builder, Location loc,
                             SmallVector<Value, 4> &indices,
                             MemRefType memrefTy, int64_t srcDim,
                             int64_t tgtDim) {
  indices[tgtDim] =
      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
}

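/// Return the indexing operands of a supported shared memory access op.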
static Operation::operand_range getIndices(Operation *op) {
  if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
    return ldmatrixOp.getIndices();
  if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
    return copyOp.getDstIndices();
  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
    return loadOp.getIndices();
  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
    return storeOp.getIndices();
  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
    return vectorReadOp.getIndices();
  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
    return vectorStoreOp.getIndices();
  llvm_unreachable("unsupported op type");
}

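/// Replace the indexing operands of a supported shared memory access op.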
static void setIndices(Operation *op, ArrayRef<Value> indices) {
  if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
    return ldmatrixOp.getIndicesMutable().assign(indices);
  if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
    return copyOp.getDstIndicesMutable().assign(indices);
  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
    return loadOp.getIndicesMutable().assign(indices);
  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
    return storeOp.getIndicesMutable().assign(indices);
  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
    return vectorReadOp.getIndicesMutable().assign(indices);
  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
    return vectorStoreOp.getIndicesMutable().assign(indices);
  llvm_unreachable("unsupported op type");
}

/// Return all operations within `parentOp` that read from or write to
/// `shmMemRef`.
static LogicalResult
getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
                      SmallVector<Operation *, 16> &readOps,
                      SmallVector<Operation *, 16> &writeOps) {
  parentOp->walk([&](Operation *op) {
    MemoryEffectOpInterface iface = dyn_cast<MemoryEffectOpInterface>(op);
    if (!iface)
      return;
    std::optional<MemoryEffects::EffectInstance> effect =
        iface.getEffectOnValue<MemoryEffects::Read>(shmMemRef);
    if (effect) {
      readOps.push_back(op);
      return;
    }
    effect = iface.getEffectOnValue<MemoryEffects::Write>(shmMemRef);
    if (effect)
      writeOps.push_back(op);
  });

  // Restrict to a supported set of ops. We also require at least 2D access,
  // although this could be relaxed.
  if (llvm::any_of(readOps, [](Operation *op) {
        return !isa<memref::LoadOp, vector::LoadOp, nvgpu::LdMatrixOp>(op) ||
               getIndices(op).size() < 2;
      }))
    return failure();
  if (llvm::any_of(writeOps, [](Operation *op) {
        return !isa<memref::StoreOp, vector::StoreOp, nvgpu::DeviceAsyncCopyOp>(
                   op) ||
               getIndices(op).size() < 2;
      }))
    return failure();

  return success();
}

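/// Entry point of the transform: rewrite all supported reads/writes of
/// `memrefValue` nested under `parentOp` with xor-swizzled indices. A sketch
/// of the effect on the IR (illustrative, assuming a `memref<64x16xf16, 3>`
/// in shared memory; the constants depend on the shape and element type):
///
///   %v = memref.load %shm[%i, %j] : memref<64x16xf16, 3>
///
/// becomes
///
///   %c4 = arith.constant 4 : index
///   %0  = arith.andi %i, %c4 : index
///   %c1 = arith.constant 1 : index
///   %1  = arith.shli %0, %c1 : index
///   %j2 = arith.xori %j, %1 : index
///   %v  = memref.load %shm[%i, %j2] : memref<64x16xf16, 3>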
mlir::LogicalResult
mlir::nvgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
                                                Value memrefValue) {
  auto memRefType = memrefValue.getType().dyn_cast<MemRefType>();
  if (!memRefType || !NVGPUDialect::hasSharedMemoryAddressSpace(memRefType))
    return failure();

  // Abort if the given value has any sub-views; we do not do any alias
  // analysis.
  bool hasSubView = false;
  parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
  if (hasSubView)
    return failure();

  // Check whether the transform is needed given the assumption of 128-bit
  // accesses: skip if dim[rank-1] is small enough that at least
  // `threadGroupSize` rows fit in a single 128-byte line.
  const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
  const int64_t rowsPerLine =
      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
      rowSize;
  const int64_t threadGroupSize =
      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
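  // With the defaults this is 1 << (7 - log2(128 / 8)) = 8: eight threads
  // each making a 128-bit (16-byte) access cover one 128-byte line.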
  if (rowsPerLine >= threadGroupSize)
    return failure();

  // Get sets of operations within the function that read/write to shared
  // memory.
  SmallVector<Operation *, 16> shmReadOps;
  SmallVector<Operation *, 16> shmWriteOps;
  if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
                                   shmWriteOps)))
    return failure();

  if (shmReadOps.empty() || shmWriteOps.empty())
    return failure();

  OpBuilder builder(parentOp->getContext());

  int64_t tgtDim = memRefType.getRank() - 1;
  int64_t srcDim = memRefType.getRank() - 2;

  // Transform indices for the ops writing to shared memory.
  while (!shmWriteOps.empty()) {
    Operation *shmWriteOp = shmWriteOps.back();
    shmWriteOps.pop_back();
    builder.setInsertionPoint(shmWriteOp);

    auto indices = getIndices(shmWriteOp);
    SmallVector<Value, 4> transformedIndices(indices.begin(), indices.end());
    transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
                     memRefType, srcDim, tgtDim);
    setIndices(shmWriteOp, transformedIndices);
  }

  // Transform indices for the ops reading from shared memory.
  while (!shmReadOps.empty()) {
    Operation *shmReadOp = shmReadOps.back();
    shmReadOps.pop_back();
    builder.setInsertionPoint(shmReadOp);

    auto indices = getIndices(shmReadOp);
    SmallVector<Value, 4> transformedIndices(indices.begin(), indices.end());
    transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
                     memRefType, srcDim, tgtDim);
    setIndices(shmReadOp, transformedIndices);
  }

  return success();
}

namespace {
class OptimizeSharedMemoryPass
    : public nvgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
public:
  OptimizeSharedMemoryPass() = default;

  void runOnOperation() override {
    Operation *op = getOperation();
    SmallVector<memref::AllocOp> shmAllocOps;
    op->walk([&](memref::AllocOp allocOp) {
      if (!NVGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
        return;
      shmAllocOps.push_back(allocOp);
    });
    for (auto allocOp : shmAllocOps) {
      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
                                                    allocOp.getMemref())))
        return;
    }
  }
};
} // namespace

std::unique_ptr<Pass> mlir::nvgpu::createOptimizeSharedMemoryPass() {
  return std::make_unique<OptimizeSharedMemoryPass>();
}
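
// Typical usage (assuming the pass is registered under the
// `nvgpu-optimize-shared-memory` flag, as in the upstream Passes.td):
//
//   mlir-opt --nvgpu-optimize-shared-memory input.mlir
//
// or, programmatically:
//
//   pm.addPass(nvgpu::createOptimizeSharedMemoryPass());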