AffineDataCopyGeneration.cpp
//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
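    // Note: the fastMemoryCapacity pass option is interpreted in KiB, so the
    // byte count given to the constructor is scaled down here and converted
    // back to bytes in runOnBlock().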
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load ops are handled for now.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
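
// Illustrative sketch (not part of the original file): one way the factory
// above might be scheduled on a function-level pass pipeline. The PassManager
// setup and the option values below are assumptions made only for this
// example.
//
//   mlir::PassManager pm(module->getContext());
//   pm.addNestedPass<mlir::func::FuncOp>(
//       mlir::affine::createAffineDataCopyGenerationPass(
//           /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//           /*minDmaTransferSize=*/1024,
//           /*fastMemCapacityBytes=*/32 * 1024));
//   if (failed(pm.run(module)))
//     /* handle the failure */;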

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

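  // The capacity option is stored in KiB (see the constructor above); convert
  // it back to bytes unless it is the "unlimited" sentinel (uint64_t max).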
  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOps) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
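  //
  // Illustration (added example, not from the original source): for a block of
  // the form
  //   affine.load ; affine.store ; affine.for {...} ; affine.load ; return
  // the leading load and store form one range that ends at the affine.for, the
  // affine.for is handled on its own (or recursed into when its footprint
  // exceeds the fast memory capacity), and the trailing load forms a final
  // range that ends just before the terminator.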

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. Recurse inside if the footprint for 'forOp' exceeds
      // capacity, or when skipNonUnitStrideLoops is set and the step size is
      // not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - thus we don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
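  // Strict mode below restricts the greedy driver to the collected copy
  // load/store ops (plus any ops created while rewriting them) rather than
  // revisiting the whole function.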
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
}
Block
Block represents an ordered list of Operations.
Definition: Block.h:31

bool empty()
Definition: Block.h:146

iterator end()
Definition: Block.h:142

iterator begin()
Definition: Block.h:141

FrozenRewritePatternSet
This class represents a frozen set of patterns that can be processed by a pattern applicator.

GreedyRewriteConfig
This class allows control over how the GreedyPatternRewriteDriver works.

GreedyRewriteStrictness strictMode
Strict mode can restrict the ops that are added to the worklist during the rewrite.

OpBuilder
This class helps build Operations.
Definition: Builders.h:215

OpTrait::IsTerminator
This class provides the API for ops that are known to be terminators.
Definition: OpDefinition.h:764

Operation
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88

Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of an AffineForOp to its containing block if the loop was known to have a single iteration.
Definition: LoopUtils.cpp:118

LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [`begin', `end').
Definition: LoopUtils.cpp:2264

std::unique_ptr< OperationPass< func::FuncOp > > createAffineDataCopyGenerationPass(unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace=0, int minDmaTransferSize=1024, uint64_t fastMemCapacityBytes=std::numeric_limits< uint64_t >::max())
Performs packing (or explicit copying) of accessed memref regions into buffers in the specified faster memory space through either pointwise copies or DMA operations.

std::optional< int64_t > getMemoryFootprintBytes(AffineForOp forOp, int memorySpace=-1)
Gets the memory footprint of all data touched in the specified memory space in bytes; if the memory space is unspecified, considers all memory spaces.
Definition: Utils.cpp:1953

LogicalResult applyOpPatternsAndFold(ArrayRef< Operation * > ops, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr, bool *allErased=nullptr)
Rewrite the specified ops by repeatedly applying the highest benefit patterns in a greedy worklist driven manner.

GreedyRewriteStrictness::ExistingAndNewOps
Only pre-existing and newly created ops are processed.

AffineCopyOptions
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:165