AffineDataCopyGeneration.cpp
//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware that explicitly manages levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//
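
// Illustrative sketch (assumed example IR, not taken from this file): given a
// loop nest that repeatedly reads a memref living in the slow (default)
// memory space, e.g.
//
//   affine.for %i = 0 to 1024 {
//     affine.for %j = 0 to 1024 {
//       %v = affine.load %A[%i, %j] : memref<1024x1024xf32>
//     }
//   }
//
// the pass allocates a buffer in the fast memory space, emits a copy-in loop
// nest (or a DMA pair) before the nest, rewrites the load to read from the
// fast buffer, and frees the buffer afterwards; stores are handled
// symmetrically with a copy-out placed after the nest.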

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original loads/stores to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
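    // The fastMemoryCapacity pass option is interpreted in KiB, so convert the
    // byte count passed to this constructor (runOnBlock converts it back to
    // bytes).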
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
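
// Example (hypothetical driver code, not part of this file): a pipeline could
// schedule this pass on every function with, say, a 32 KiB fast-memory budget
// and copies generated between memory spaces 0 (slow) and 1 (fast):
//
//   pm.addNestedPass<func::FuncOp>(createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/32 * 1024));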

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
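/// For instance (illustrative only), a block whose operations are
/// {load, store, affine.for, load} is partitioned into the ranges
/// [load, store], [affine.for], and [load], and copies are generated for each
/// range independently.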
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
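  // Bundle the pass options that affineDataCopyGenerate consumes for each
  // range in this block.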
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straight-line code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed not to exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin = llvm::find_if(*block, [&](Operation &op) {
    return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
           copyNests.count(&op) == 0;
  });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy for loop, we will split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now it fits. Recurse
      // inside if footprint for 'forOp' exceeds capacity, or when
      // skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update consumed
        // capacity here. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = arith::ConstantIndexOp::create(topBuilder, f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained loads/stores, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  (void)applyOpPatternsGreedily(
      copyOps, frozenPatterns,
      GreedyRewriteConfig().setStrictness(
          GreedyRewriteStrictness::ExistingAndNewOps));
}