MemoryPromotion.cpp
//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = cast<MemRefType>(from.getType());
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = arith::ConstantIndexOp::create(b, 0);
  Value one = arith::ConstantIndexOp::create(b, 1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(gpu::ThreadIdOp::create(b, indexType, dim));
    blockDims.push_back(gpu::BlockDimOp::create(b, indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
        Value loaded = memref::LoadOp::create(b, loc, from, activeIvs);
        memref::StoreOp::create(b, loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                                  {blockDims[en.index()]});
  }
}
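
// As documented on affine::mapLoopToProcessorIds, the mapping above embeds the
// processor id and processor count into each mapped loop, roughly:
//   new lower bound = old lower bound + threadId * step
//   new step        = old step * blockDim
// With the zero lower bounds and unit steps built here, the innermost copy
// loops thus become "for %i = <thread-id> to %ub step <block-dim>", which is
// the structure shown in the comment on insertCopies below.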

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to only
/// copy live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". If a smaller block would be sufficient, the
/// caller can create a subview of the memref and promote that instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = cast<MemRefType>(from.getType());
  auto toType = cast<MemRefType>(to.getType());
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(region.hasOneBlock() && "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  gpu::BarrierOp::create(b);

  b.setInsertionPoint(&region.front().back());
  gpu::BarrierOp::create(b);
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = dyn_cast<MemRefType>(value.getType());
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
      op->getContext(), gpu::AddressSpace::Workgroup);
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
                                    MemRefLayoutAttrInterface{},
                                    Attribute(workgroupMemoryAddressSpace));
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
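
A minimal usage sketch (not part of this file): a pass running on gpu.func ops
can call promoteToWorkgroupMemory on selected arguments. The pass name, the
dependent-dialect list, and the choice to promote every statically shaped
memref argument are illustrative assumptions, not an existing MLIR pass.

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Pass/Pass.h"

namespace {
// Hypothetical pass: promote every statically shaped memref argument of each
// gpu.func it visits to a workgroup-memory attribution.
struct PromoteGpuFuncArgsPass
    : public PassWrapper<PromoteGpuFuncArgsPass,
                         OperationPass<gpu::GPUFuncOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteGpuFuncArgsPass)

  // The inserted copy loops use scf, memref, arith, and affine operations
  // (assumed dependency set based on the code above), so make sure those
  // dialects are loaded before the pass runs.
  void getDependentDialects(DialectRegistry &registry) const override {
    registry.insert<arith::ArithDialect, affine::AffineDialect,
                    memref::MemRefDialect, scf::SCFDialect>();
  }

  void runOnOperation() override {
    gpu::GPUFuncOp func = getOperation();
    for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
      auto memrefType = dyn_cast<MemRefType>(func.getArgument(i).getType());
      if (memrefType && memrefType.hasStaticShape())
        promoteToWorkgroupMemory(func, i);
    }
  }
};
} // namespace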