MLIR  20.0.0git
MemoryPromotion.cpp
Go to the documentation of this file.
1 //===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities that allow one to create IR moving the data
10 // across different levels of the GPU memory hierarchy.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 
#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
23 
24 using namespace mlir;
25 using namespace mlir::gpu;
26 
27 /// Emits the (imperfect) loop nest performing the copy between "from" and "to"
28 /// values using the bounds derived from the "from" value. Emits at least
29 /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
30 /// single-iteration loops. Maps the innermost loops to thread dimensions, in
31 /// reverse order to enable access coalescing in the innermost loop.
32 static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
33  auto memRefType = cast<MemRefType>(from.getType());
34  auto rank = memRefType.getRank();
35 
36  SmallVector<Value, 4> lbs, ubs, steps;
37  Value zero = b.create<arith::ConstantIndexOp>(0);
38  Value one = b.create<arith::ConstantIndexOp>(1);
39 
40  // Make sure we have enough loops to use all thread dimensions, these trivial
41  // loops should be outermost and therefore inserted first.
42  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
43  unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
44  lbs.resize(extraLoops, zero);
45  ubs.resize(extraLoops, one);
46  steps.resize(extraLoops, one);
47  }
48 
49  // Add existing bounds.
50  lbs.append(rank, zero);
51  ubs.reserve(lbs.size());
52  steps.reserve(lbs.size());
53  for (auto idx = 0; idx < rank; ++idx) {
54  ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
55  steps.push_back(one);
56  }
57 
58  // Obtain thread identifiers and block sizes, necessary to map to them.
59  auto indexType = b.getIndexType();
60  SmallVector<Value, 3> threadIds, blockDims;
61  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
62  threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
63  blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
64  }
65 
66  // Produce the loop nest with copies.
67  SmallVector<Value, 8> ivs(lbs.size());
69  b, b.getLoc(), lbs, ubs, steps,
70  [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
71  ivs.assign(loopIvs.begin(), loopIvs.end());
72  auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
73  Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
74  b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
75  });
76 
77  // Map the innermost loops to threads in reverse order.
78  for (const auto &en :
79  llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
80  GPUDialect::getNumWorkgroupDimensions())))) {
81  Value v = en.value();
82  auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
83  affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
84  {blockDims[en.index()]});
85  }
86 }
87 
88 /// Emits the loop nests performing the copy to the designated location in the
89 /// beginning of the region, and from the designated location immediately before
90 /// the terminator of the first block of the region. The region is expected to
91 /// have one block. This boils down to the following structure
92 ///
93 /// ^bb(...):
94 /// <loop-bound-computation>
95 /// for %arg0 = ... to ... step ... {
96 /// ...
97 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
98 /// %0 = load %from[%arg0, ..., %argN]
99 /// store %0, %to[%arg0, ..., %argN]
100 /// }
101 /// ...
102 /// }
103 /// gpu.barrier
104 /// <... original body ...>
105 /// gpu.barrier
106 /// for %arg0 = ... to ... step ... {
107 /// ...
108 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
109 /// %1 = load %to[%arg0, ..., %argN]
110 /// store %1, %from[%arg0, ..., %argN]
111 /// }
112 /// ...
113 /// }
114 ///
115 /// Inserts the barriers unconditionally since different threads may be copying
116 /// values and reading them. An analysis would be required to eliminate barriers
117 /// in case where value is only used by the thread that copies it. Both copies
118 /// are inserted unconditionally, an analysis would be required to only copy
119 /// live-in and live-out values when necessary. This copies the entire memref
120 /// pointed to by "from". In case a smaller block would be sufficient, the
121 /// caller can create a subview of the memref and promote it instead.
122 static void insertCopies(Region &region, Location loc, Value from, Value to) {
123  auto fromType = cast<MemRefType>(from.getType());
124  auto toType = cast<MemRefType>(to.getType());
125  (void)fromType;
126  (void)toType;
127  assert(fromType.getShape() == toType.getShape());
128  assert(fromType.getRank() != 0);
129  assert(llvm::hasSingleElement(region) &&
130  "unstructured control flow not supported");
131 
132  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
133  insertCopyLoops(b, from, to);
134  b.create<gpu::BarrierOp>();
135 
136  b.setInsertionPoint(&region.front().back());
137  b.create<gpu::BarrierOp>();
138  insertCopyLoops(b, to, from);
139 }
140 
141 /// Promotes a function argument to workgroup memory in the given function. The
142 /// copies will be inserted in the beginning and in the end of the function.
143 void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
144  Value value = op.getArgument(arg);
145  auto type = dyn_cast<MemRefType>(value.getType());
146  assert(type && type.hasStaticShape() && "can only promote memrefs");
147 
148  // Get the type of the buffer in the workgroup memory.
149  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
150  op->getContext(), gpu::AddressSpace::Workgroup);
151  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
152  MemRefLayoutAttrInterface{},
153  Attribute(workgroupMemoryAddressSpace));
154  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());
155 
156  // Replace the uses first since only the original uses are currently present.
157  // Then insert the copies.
158  value.replaceAllUsesWith(attribution);
159  insertCopies(op.getBody(), op.getLoc(), value, attribution);
160 }
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to)
Emits the (imperfect) loop nest performing the copy between "from" and "to" values using the bounds d...
static void insertCopies(Region &region, Location loc, Value from, Value to)
Emits the loop nests performing the copy to the designated location in the beginning of the region,...
Attributes are known-constant values of operations.
Definition: Attributes.h:25
Operation & back()
Definition: Block.h:152
IndexType getIndexType()
Definition: Builders.cpp:95
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
Location getLoc() const
Accessors for the implied location.
static ImplicitLocOpBuilder atBlockBegin(Location loc, Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the first operation in the block but still ins...
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
void createOrFold(llvm::SmallVectorImpl< Value > &results, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66
This class helps build Operations.
Definition: Builders.h:216
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
Operation * getParentOp()
Return the parent operation this region is attached to.
Definition: Region.h:200
Block & front()
Definition: Region.h:65
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:129
void replaceAllUsesWith(Value newValue)
Replace all uses of 'this' value with the new value, updating anything in the IR that uses 'this' to ...
Definition: Value.h:173
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:26
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:41
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1725
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ValueRange iterArgs, function_ref< ValueVector(OpBuilder &, Location, ValueRange, ValueRange)> bodyBuilder=nullptr)
Creates a perfect nest of "for" loops, i.e.
Definition: SCF.cpp:687
Include the generated interface declarations.
void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg)
Promotes a function argument to workgroup memory in the given function.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...