MLIR  22.0.0git
MemoryPromotion.cpp
Go to the documentation of this file.
1 //===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities that allow one to create IR moving the data
10 // across different levels of the GPU memory hierarchy.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"
15 
16 #include "mlir/Dialect/Affine/LoopUtils.h"
17 #include "mlir/Dialect/Arith/IR/Arith.h"
18 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
19 #include "mlir/Dialect/MemRef/IR/MemRef.h"
20 #include "mlir/Dialect/SCF/IR/SCF.h"
21 #include "mlir/IR/ImplicitLocOpBuilder.h"
22 using namespace mlir;
23 using namespace mlir::gpu;
24 
25 /// Emits the (imperfect) loop nest performing the copy between "from" and "to"
26 /// values using the bounds derived from the "from" value. Emits at least
27 /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
28 /// single-iteration loops. Maps the innermost loops to thread dimensions, in
29 /// reverse order to enable access coalescing in the innermost loop.
30 static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
31  auto memRefType = cast<MemRefType>(from.getType());
32  auto rank = memRefType.getRank();
33 
34  SmallVector<Value, 4> lbs, ubs, steps;
35  // Index constants shared by all loop bounds below. (These definitions were
36  // missing in the broken listing; `zero`/`one` were used but never declared.)
37  Value zero = arith::ConstantIndexOp::create(b, 0);
38  Value one = arith::ConstantIndexOp::create(b, 1);
39 
40  // Make sure we have enough loops to use all thread dimensions, these trivial
41  // loops should be outermost and therefore inserted first.
42  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
43  unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
44  lbs.resize(extraLoops, zero);
45  ubs.resize(extraLoops, one);
46  steps.resize(extraLoops, one);
47  }
48 
49  // Add existing bounds: one unit-step loop per memref dimension, with the
50  // upper bound taken from the corresponding dimension of "from".
51  lbs.append(rank, zero);
52  ubs.reserve(lbs.size());
53  steps.reserve(lbs.size());
54  for (auto idx = 0; idx < rank; ++idx) {
55  ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
56  steps.push_back(one);
57  }
58 
59  // Obtain thread identifiers and block sizes, necessary to map to them.
60  auto indexType = b.getIndexType();
61  SmallVector<Value, 3> threadIds, blockDims;
62  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
63  threadIds.push_back(gpu::ThreadIdOp::create(b, indexType, dim));
64  blockDims.push_back(gpu::BlockDimOp::create(b, indexType, dim));
65  }
66 
67  // Produce the loop nest with copies. Capture the induction variables so the
68  // innermost ones can be mapped to thread ids after the nest is built.
69  SmallVector<Value, 8> ivs(lbs.size());
70  mlir::scf::buildLoopNest(
71  b, b.getLoc(), lbs, ubs, steps,
72  [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
73  ivs.assign(loopIvs.begin(), loopIvs.end());
74  // Only the last `rank` induction variables index the memrefs; the
75  // leading ones belong to the trivial single-iteration loops.
76  auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
77  Value loaded = memref::LoadOp::create(b, loc, from, activeIvs);
78  memref::StoreOp::create(b, loc, loaded, to, activeIvs);
79  });
80 
81  // Map the innermost loops to threads in reverse order, so the x dimension
82  // (fastest-varying thread id) drives the innermost loop for coalescing.
83  for (const auto &en :
84  llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
85  GPUDialect::getNumWorkgroupDimensions())))) {
86  Value v = en.value();
87  auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
88  affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
89  {blockDims[en.index()]});
90  }
91 }
85 
86 /// Emits the loop nests performing the copy to the designated location in the
87 /// beginning of the region, and from the designated location immediately before
88 /// the terminator of the first block of the region. The region is expected to
89 /// have one block. This boils down to the following structure
90 ///
91 /// ^bb(...):
92 /// <loop-bound-computation>
93 /// for %arg0 = ... to ... step ... {
94 /// ...
95 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
96 /// %0 = load %from[%arg0, ..., %argN]
97 /// store %0, %to[%arg0, ..., %argN]
98 /// }
99 /// ...
100 /// }
101 /// gpu.barrier
102 /// <... original body ...>
103 /// gpu.barrier
104 /// for %arg0 = ... to ... step ... {
105 /// ...
106 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
107 /// %1 = load %to[%arg0, ..., %argN]
108 /// store %1, %from[%arg0, ..., %argN]
109 /// }
110 /// ...
111 /// }
112 ///
113 /// Inserts the barriers unconditionally since different threads may be copying
114 /// values and reading them. An analysis would be required to eliminate barriers
115 /// in case where value is only used by the thread that copies it. Both copies
116 /// are inserted unconditionally, an analysis would be required to only copy
117 /// live-in and live-out values when necessary. This copies the entire memref
118 /// pointed to by "from". In case a smaller block would be sufficient, the
119 /// caller can create a subview of the memref and promote it instead.
120 static void insertCopies(Region &region, Location loc, Value from, Value to) {
121  auto srcType = cast<MemRefType>(from.getType());
122  auto dstType = cast<MemRefType>(to.getType());
123  (void)srcType;
124  (void)dstType;
125  assert(srcType.getShape() == dstType.getShape());
126  assert(srcType.getRank() != 0);
127  assert(region.hasOneBlock() && "unstructured control flow not supported");
128 
129  // Copy-in: fill the promoted buffer at the top of the block, then emit a
130  // barrier so every thread observes the copied data before the body runs.
131  auto builder = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
132  insertCopyLoops(builder, from, to);
133  gpu::BarrierOp::create(builder);
134 
135  // Copy-out: barrier first so all writes to the buffer are visible, then
136  // copy back to the original memref right before the terminator.
137  builder.setInsertionPoint(&region.front().back());
138  gpu::BarrierOp::create(builder);
139  insertCopyLoops(builder, to, from);
140 }
137 
138 /// Promotes a function argument to workgroup memory in the given function. The
139 /// copies will be inserted in the beginning and in the end of the function.
140 void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
141  Value original = op.getArgument(arg);
142  auto origType = dyn_cast<MemRefType>(original.getType());
143  assert(origType && origType.hasStaticShape() && "can only promote memrefs");
144 
145  // Build the type of the buffer in workgroup memory: same shape and element
146  // type as the argument, default layout, workgroup address space.
147  auto wgAddrSpace = gpu::AddressSpaceAttr::get(op->getContext(),
148  gpu::AddressSpace::Workgroup);
149  auto promotedType =
150  MemRefType::get(origType.getShape(), origType.getElementType(),
151  MemRefLayoutAttrInterface{}, Attribute(wgAddrSpace));
152  Value promoted = op.addWorkgroupAttribution(promotedType, original.getLoc());
153 
154  // Redirect existing uses to the attribution before emitting the copy loops,
155  // so that the copies themselves still reference the original argument.
156  original.replaceAllUsesWith(promoted);
157  insertCopies(op.getBody(), op.getLoc(), original, promoted);
158 }
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to)
Emits the (imperfect) loop nest performing the copy between "from" and "to" values using the bounds derived from the "from" value.
static void insertCopies(Region &region, Location loc, Value from, Value to)
Emits the loop nests performing the copy to the designated location in the beginning of the region, and from the designated location immediately before the terminator of the first block of the region.
Attributes are known-constant values of operations.
Definition: Attributes.h:25
Operation & back()
Definition: Block.h:152
IndexType getIndexType()
Definition: Builders.cpp:50
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Definition: Builders.h:621
Location getLoc() const
Accessors for the implied location.
Definition: Builders.h:654
static ImplicitLocOpBuilder atBlockBegin(Location loc, Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the first operation in the block but still ins...
Definition: Builders.h:631
void createOrFold(llvm::SmallVectorImpl< Value > &results, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:672
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
Operation * getParentOp()
Return the parent operation this region is attached to.
Definition: Region.h:200
Block & front()
Definition: Region.h:65
bool hasOneBlock()
Return true if this region has exactly one block.
Definition: Region.h:68
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:105
void replaceAllUsesWith(Value newValue)
Replace all uses of 'this' value with the new value, updating anything in the IR that uses 'this' to ...
Definition: Value.h:149
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:24
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:39
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition: ArithOps.cpp:359
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1718
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ValueRange iterArgs, function_ref< ValueVector(OpBuilder &, Location, ValueRange, ValueRange)> bodyBuilder=nullptr)
Creates a perfect nest of "for" loops, i.e.
Definition: SCF.cpp:707
Include the generated interface declarations.
void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg)
Promotes a function argument to workgroup memory in the given function.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...