AffineDataCopyGeneration.cpp (MLIR 17.0.0git)
//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, and
// in reduced use of hardware prefetch streams and reduced false sharing. It
// is also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//
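
// Illustrative usage (a sketch, not part of the original file): the pass is
// registered as 'affine-data-copy-generate' and is typically run from
// mlir-opt on already tiled code, e.g.:
//
//   mlir-opt --affine-data-copy-generate="fast-mem-space=2 fast-mem-capacity=16" \
//       input.mlir
//
// The option names above are assumed from the pass's tablegen definition; see
// the affine dialect's Passes.td for the authoritative list.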

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public impl::AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
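    // The 'fastMemoryCapacity' pass option is expressed in KiB; convert the
    // byte count supplied by the caller (runOnBlock converts it back to bytes
    // before use).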
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's are handled for now.
/// TODO: extend this to store op's.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass(unsigned slowMemorySpace,
                                         unsigned fastMemorySpace,
                                         unsigned tagMemorySpace,
                                         int minDmaTransferSize,
                                         uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
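
// An illustrative sketch (not part of the original file): a pass pipeline can
// instantiate this pass programmatically; the trailing parameters take the
// defaults declared with the pass (tag memory space 0, a minimum DMA transfer
// size of 1024, and unlimited fast memory capacity):
//
//   mlir::PassManager pm(&context);
//   pm.addNestedPass<mlir::func::FuncOp>(
//       mlir::createAffineDataCopyGenerationPass(/*slowMemorySpace=*/0,
//                                                /*fastMemorySpace=*/1));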

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
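///
/// For example (an illustrative sketch, not from the original file), a block
/// of the form:
///
///   affine.load ...    // begins a load/store range
///   arith.addf ...
///   affine.store ...   // ends that range
///   affine.for ...     // a range of its own (may be recursed into)
///   affine.load ...    // begins the next range
///
/// would be partitioned into three copy ranges.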
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

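  // A 'fastMemoryCapacity' equal to the uint64_t maximum acts as a "no limit"
  // sentinel; any other value is the pass option in KiB, converted back to
  // bytes below.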
  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO: An 'affine.if' operation is currently treated like any other
  // non-loop operation. 'affine.if''s could have 'affine.for's in them;
  // treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we will split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now it fits. Recurse
      // inside if footprint for 'forOp' exceeds capacity, or when
      // skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
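  // 'ExistingAndNewOps' restricts the rewrite driver to the collected copy
  // ops plus any ops created while rewriting them; the rest of the function
  // is left untouched.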
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
}