MLIR 22.0.0git
AffineDataCopyGeneration.cpp
//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, and in
// reduced use of hardware prefetch streams and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
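//
// As a rough illustration from the command line (option spellings follow this
// pass's registration; the values are illustrative assumptions, not defaults):
//
//   mlir-opt tiled.mlir \
//       -affine-data-copy-generate="generate-dma=false fast-mem-space=1 fast-mem-capacity=512"
//
// which packs the memref regions accessed in each function into buffers in
// memory space 1, sized to fit a ~512 KiB budget, using point-wise copy loops
// instead of DMAs.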
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original loads/stores to instead load from/store to the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing into inner
/// levels if necessary to determine at what depth copies need to be placed so
/// that the allocated buffers fit within the memory capacity provided.
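///
/// A minimal sketch of programmatic use (the memory space ids and sizes below
/// are illustrative assumptions, not defaults):
///
///   pm.addNestedPass<func::FuncOp>(affine::createAffineDataCopyGenerationPass(
///       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
///       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/512 * 1024));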
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for ops nested within).
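///
/// For example (an illustrative sketch), a block of the form
///   affine.load ...; affine.store ...; affine.for ... { ... }; affine.load ...
/// is partitioned into three ranges: the leading load/store sequence, the
/// affine.for op by itself, and the trailing load.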
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOps) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin = llvm::find_if(*block, [&](Operation &op) {
    return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
           copyNests.count(&op) == 0;
  });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy for loop, we will split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which the footprint fits in fast memory
      // capacity. If the footprint can't be calculated, we assume for now that
      // it fits. Recurse inside if the footprint for 'forOp' exceeds capacity,
      // or when skipNonUnitStrideLoops is set and the step size is not one.
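      // For instance (an illustrative sketch): with fastMemoryCapacity set to
      // 256 (KiB), an outer loop whose computed footprint is 1 MiB triggers
      // recursion, and copies are instead generated around an inner loop whose
      // footprint (say, 128 KiB) fits within the 256 KiB budget.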
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
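        // Illustrative shape of what the call below generates (a sketch under
        // assumed names, not verbatim output):
        //   %fastBuf = memref.alloc() : memref<...>   // buffer in fast space
        //   <copy-in loop nest: slow memref -> %fastBuf>
        //   affine.for %i ... { ... accesses rewritten to use %fastBuf ... }
        //   <copy-out loop nest: %fastBuf -> slow memref>
        //   memref.dealloc %fastBuf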
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);

  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained loads/stores, and the latter could anyway also be
  // canonicalized.
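  // Illustrative effect (a sketch): promoting a one-trip copy loop such as
  //   affine.for %i = 4 to 5 { affine.load %fastBuf[%i - 4] ... }
  // substitutes the single value of %i, after which the load/store
  // canonicalization patterns fold the access down to affine.load %fastBuf[0].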
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  (void)applyOpPatternsGreedily(
      copyOps, frozenPatterns,
      GreedyRewriteConfig().setStrictness(
          GreedyRewriteStrictness::ExistingAndNewOps));
}