MLIR  14.0.0git
MemoryPromotion.cpp
//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"

#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}
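
// Illustrative note (not part of the upstream file): for a hypothetical
// memref<?x32xf32> operand, the code above pads the rank-2 copy loops with one
// trivial single-iteration loop so that all three workgroup dimensions are
// used, and maps the loops to threads innermost-first. After
// mapLoopToProcessorIds, the nest looks roughly like:
//
//   scf.for %i = %tid_z to %c1 step %bdim_z {      // padding loop -> thread z
//     scf.for %j = %tid_y to %d0 step %bdim_y {    // outer dim -> thread y
//       scf.for %k = %tid_x to %c32 step %bdim_x { // innermost dim -> thread x
//         %v = load %from[%j, %k]
//         store %v, %to[%j, %k]
//       }
//     }
//   }
//
// Mapping the innermost loop to thread dimension x makes consecutive threads
// access consecutive elements, which is what enables coalescing.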

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately before
/// the terminator of the first block of the region. The region is expected to
/// have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both copies
/// are inserted unconditionally; an analysis would be required to only copy
/// live-in and live-out values when necessary. This copies the entire memref
/// pointed to by "from". In case a smaller block would be sufficient, the
/// caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
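
// A minimal usage sketch (illustrative, not part of the upstream file): a
// hypothetical pass that promotes every statically shaped memref argument of a
// gpu.func. The pass name is made up; the calls mirror the API used above.
namespace {
struct PromoteAllKernelArgsPass
    : public PassWrapper<PromoteAllKernelArgsPass,
                         OperationPass<gpu::GPUFuncOp>> {
  void runOnOperation() override {
    gpu::GPUFuncOp func = getOperation();
    for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
      // Only statically shaped memref arguments can be promoted (see the
      // assertion in promoteToWorkgroupMemory above).
      auto type = func.getArgument(i).getType().dyn_cast<MemRefType>();
      if (type && type.hasStaticShape())
        promoteToWorkgroupMemory(func, i);
    }
  }
};
} // namespace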