MLIR  14.0.0git
MemoryPromotion.cpp
//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"

#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}
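
// Illustrative note (not part of the upstream file): for a hypothetical
// memref<?x32xf32> operand, the code above pads the rank-2 copy loops with one
// trivial single-iteration loop so that all three workgroup dimensions are
// used, and maps the loops to threads innermost-first. After
// mapLoopToProcessorIds, the nest looks roughly like:
//
//   scf.for %i = %tid_z to %c1 step %bdim_z {      // padding loop -> thread z
//     scf.for %j = %tid_y to %d0 step %bdim_y {    // outer dim -> thread y
//       scf.for %k = %tid_x to %c32 step %bdim_x { // innermost dim -> thread x
//         %v = load %from[%j, %k]
//         store %v, %to[%j, %k]
//       }
//     }
//   }
//
// Mapping the innermost loop to thread dimension x makes consecutive threads
// access consecutive elements, which is what enables coalescing.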

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately before
/// the terminator of the first block of the region. The region is expected to
/// have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both copies
/// are inserted unconditionally; an analysis would be required to only copy
/// live-in and live-out values when necessary. This copies the entire memref
/// pointed to by "from". In case a smaller block would be sufficient, the
/// caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
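
// A minimal usage sketch (illustrative, not part of the upstream file): a
// hypothetical pass that promotes every statically shaped memref argument of a
// gpu.func. The pass name is made up; the calls mirror the API used above.
namespace {
struct PromoteAllKernelArgsPass
    : public PassWrapper<PromoteAllKernelArgsPass,
                         OperationPass<gpu::GPUFuncOp>> {
  void runOnOperation() override {
    gpu::GPUFuncOp func = getOperation();
    for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
      // Only statically shaped memref arguments can be promoted (see the
      // assertion in promoteToWorkgroupMemory above).
      auto type = func.getArgument(i).getType().dyn_cast<MemRefType>();
      if (type && type.hasStaticShape())
        promoteToWorkgroupMemory(func, i);
    }
  }
};
} // namespace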