MLIR  22.0.0git
MemoryPromotion.cpp
Go to the documentation of this file.
1 //===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements utilities that allow one to create IR moving the data
10 // across different levels of the GPU memory hierarchy.
11 //
12 //===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
24 using namespace mlir;
25 using namespace mlir::gpu;
26 
27 /// Emits the (imperfect) loop nest performing the copy between "from" and "to"
28 /// values using the bounds derived from the "from" value. Emits at least
29 /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
30 /// single-iteration loops. Maps the innermost loops to thread dimensions, in
31 /// reverse order to enable access coalescing in the innermost loop.
32 static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
33  auto memRefType = cast<MemRefType>(from.getType());
34  auto rank = memRefType.getRank();
35 
36  SmallVector<Value, 4> lbs, ubs, steps;
37  Value zero = b.create<arith::ConstantIndexOp>(0);
38  Value one = b.create<arith::ConstantIndexOp>(1);
39 
40  // Make sure we have enough loops to use all thread dimensions, these trivial
41  // loops should be outermost and therefore inserted first.
42  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
43  unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
44  lbs.resize(extraLoops, zero);
45  ubs.resize(extraLoops, one);
46  steps.resize(extraLoops, one);
47  }
48 
49  // Add existing bounds.
50  lbs.append(rank, zero);
51  ubs.reserve(lbs.size());
52  steps.reserve(lbs.size());
53  for (auto idx = 0; idx < rank; ++idx) {
54  ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
55  steps.push_back(one);
56  }
57 
58  // Obtain thread identifiers and block sizes, necessary to map to them.
59  auto indexType = b.getIndexType();
60  SmallVector<Value, 3> threadIds, blockDims;
61  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
62  threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
63  blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
64  }
65 
66  // Produce the loop nest with copies.
67  SmallVector<Value, 8> ivs(lbs.size());
69  b, b.getLoc(), lbs, ubs, steps,
70  [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
71  ivs.assign(loopIvs.begin(), loopIvs.end());
72  auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
73  Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
74  b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
75  });
76 
77  // Map the innermost loops to threads in reverse order.
78  for (const auto &en :
79  llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
80  GPUDialect::getNumWorkgroupDimensions())))) {
81  Value v = en.value();
82  auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
83  affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
84  {blockDims[en.index()]});
85  }
86 }
87 
88 /// Emits the loop nests performing the copy to the designated location in the
89 /// beginning of the region, and from the designated location immediately before
90 /// the terminator of the first block of the region. The region is expected to
91 /// have one block. This boils down to the following structure
92 ///
93 /// ^bb(...):
94 /// <loop-bound-computation>
95 /// for %arg0 = ... to ... step ... {
96 /// ...
97 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
98 /// %0 = load %from[%arg0, ..., %argN]
99 /// store %0, %to[%arg0, ..., %argN]
100 /// }
101 /// ...
102 /// }
103 /// gpu.barrier
104 /// <... original body ...>
105 /// gpu.barrier
106 /// for %arg0 = ... to ... step ... {
107 /// ...
108 /// for %argN = <thread-id-x> to ... step <block-dim-x> {
109 /// %1 = load %to[%arg0, ..., %argN]
110 /// store %1, %from[%arg0, ..., %argN]
111 /// }
112 /// ...
113 /// }
114 ///
115 /// Inserts the barriers unconditionally since different threads may be copying
116 /// values and reading them. An analysis would be required to eliminate barriers
117 /// in case where value is only used by the thread that copies it. Both copies
118 /// are inserted unconditionally, an analysis would be required to only copy
119 /// live-in and live-out values when necessary. This copies the entire memref
120 /// pointed to by "from". In case a smaller block would be sufficient, the
121 /// caller can create a subview of the memref and promote it instead.
122 static void insertCopies(Region &region, Location loc, Value from, Value to) {
123  auto fromType = cast<MemRefType>(from.getType());
124  auto toType = cast<MemRefType>(to.getType());
125  (void)fromType;
126  (void)toType;
127  assert(fromType.getShape() == toType.getShape());
128  assert(fromType.getRank() != 0);
129  assert(llvm::hasSingleElement(region) &&
130  "unstructured control flow not supported");
131 
132  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
133  insertCopyLoops(b, from, to);
134  b.create<gpu::BarrierOp>();
135 
136  b.setInsertionPoint(&region.front().back());
137  b.create<gpu::BarrierOp>();
138  insertCopyLoops(b, to, from);
139 }
140 
141 /// Promotes a function argument to workgroup memory in the given function. The
142 /// copies will be inserted in the beginning and in the end of the function.
143 void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
144  Value value = op.getArgument(arg);
145  auto type = dyn_cast<MemRefType>(value.getType());
146  assert(type && type.hasStaticShape() && "can only promote memrefs");
147 
148  // Get the type of the buffer in the workgroup memory.
149  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
150  op->getContext(), gpu::AddressSpace::Workgroup);
151  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
152  MemRefLayoutAttrInterface{},
153  Attribute(workgroupMemoryAddressSpace));
154  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());
155 
156  // Replace the uses first since only the original uses are currently present.
157  // Then insert the copies.
158  value.replaceAllUsesWith(attribution);
159  insertCopies(op.getBody(), op.getLoc(), value, attribution);
160 }
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to)
Emits the (imperfect) loop nest performing the copy between "from" and "to" values using the bounds derived from the "from" value.
static void insertCopies(Region &region, Location loc, Value from, Value to)
Emits the loop nests performing the copy to the designated location in the beginning of the region, and from the designated location immediately before the terminator of the first block of the region.
Attributes are known-constant values of operations.
Definition: Attributes.h:25
Operation & back()
Definition: Block.h:152
IndexType getIndexType()
Definition: Builders.cpp:50
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create&lt;&gt; method without specifying the location each time.
Definition: Builders.h:621
Location getLoc() const
Accessors for the implied location.
Definition: Builders.h:654
static ImplicitLocOpBuilder atBlockBegin(Location loc, Block *block, Listener *listener=nullptr)
Create a builder and set the insertion point to before the first operation in the block but still ins...
Definition: Builders.h:631
OpTy create(Args &&...args)
Create an operation of specific op type at the current insertion point and location.
Definition: Builders.h:664
void createOrFold(llvm::SmallVectorImpl< Value > &results, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:672
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
This class contains a list of basic blocks and a link to the parent operation it is attached to.
Definition: Region.h:26
Operation * getParentOp()
Return the parent operation this region is attached to.
Definition: Region.h:200
Block & front()
Definition: Region.h:65
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:105
void replaceAllUsesWith(Value newValue)
Replace all uses of 'this' value with the new value, updating anything in the IR that uses 'this' to ...
Definition: Value.h:149
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:26
Region * getParentRegion()
Return the Region in which this Value is defined.
Definition: Value.cpp:41
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1725
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
LoopNest buildLoopNest(OpBuilder &builder, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ValueRange iterArgs, function_ref< ValueVector(OpBuilder &, Location, ValueRange, ValueRange)> bodyBuilder=nullptr)
Creates a perfect nest of "for" loops, i.e.
Definition: SCF.cpp:693
Include the generated interface declarations.
void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg)
Promotes a function argument to workgroup memory in the given function.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...