MLIR  22.0.0git
LoopUtils.h
Go to the documentation of this file.
1 //===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This header file defines prototypes for various loop transformation utility
10 // methods: these are not passes by themselves but are used either by passes,
11 // optimization sequences, or in turn by other transformation utilities.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
16 #define MLIR_DIALECT_AFFINE_LOOPUTILS_H
17 
18 #include "mlir/IR/Block.h"
19 #include "mlir/Support/LLVM.h"
21 #include <optional>
22 
23 namespace mlir {
24 class AffineMap;
25 class LoopLikeOpInterface;
26 class OpBuilder;
27 class Value;
28 class ValueRange;
29 
30 namespace func {
31 class FuncOp;
32 } // namespace func
33 
34 namespace scf {
35 class ForOp;
36 class ParallelOp;
37 } // namespace scf
38 
39 namespace affine {
40 class AffineForOp;
41 struct MemRefRegion;
42 
43 /// Unrolls this for operation completely if the trip count is known to be
44 /// constant. Returns failure otherwise.
45 LogicalResult loopUnrollFull(AffineForOp forOp);
46 
47 /// Unrolls this for operation by the specified unroll factor. Returns failure
48 /// if the loop cannot be unrolled either due to restrictions or due to invalid
49 /// unroll factors. Requires positive loop bounds and step. If specified,
50 /// annotates the Ops in each unrolled iteration by applying `annotateFn`.
51 /// When `cleanUpUnroll` is true, we can ensure the cleanup loop is unrolled
52 /// regardless of the unroll factor.
53 LogicalResult loopUnrollByFactor(
54  AffineForOp forOp, uint64_t unrollFactor,
55  function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
56  bool cleanUpUnroll = false);
57 
58 /// Unrolls this loop by the specified unroll factor or its trip count,
59 /// whichever is lower.
60 LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);
61 
62 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
63 /// in it from outermost to innermost.
64 bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);
65 
66 /// Get perfectly nested sequence of loops starting at root of loop nest
67 /// (the first op being another AffineFor, and the second op - a terminator).
68 /// A loop is perfectly nested iff: the first op in the loop's body is another
69 /// AffineForOp, and the second op is a terminator).
70 void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
71  AffineForOp root);
72 
73 /// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
74 /// with iteration arguments performing supported reductions and its inner loops
75 /// can have iteration arguments. Returns success if the loop is successfully
76 /// unroll-jammed.
77 LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
78  uint64_t unrollJamFactor);
79 
80 /// Unrolls and jams this loop by the specified factor or by the trip count (if
81 /// constant), whichever is lower.
82 LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
83  uint64_t unrollJamFactor);
84 
85 /// Promotes the loop body of a AffineForOp to its containing block if the loop
86 /// was known to have a single iteration.
87 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
88 
89 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
90 /// their body into the containing Block.
91 void promoteSingleIterationLoops(func::FuncOp f);
92 
93 /// Skew the operations in an affine.for's body with the specified
94 /// operation-wise shifts. The shifts are with respect to the original execution
95 /// order, and are multiplied by the loop 'step' before being applied. If
96 /// `unrollPrologueEpilogue` is set, fully unroll the prologue and epilogue
97 /// loops when possible.
98 LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
99  bool unrollPrologueEpilogue = false);
100 
101 /// Tiles the specified band of perfectly nested loops creating tile-space loops
102 /// and intra-tile loops. A band is a contiguous set of loops. This utility
103 /// doesn't check for the validity of tiling itself, but just performs it.
104 LogicalResult
106  ArrayRef<unsigned> tileSizes,
107  SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
108 
109 /// Tiles the specified band of perfectly nested loops creating tile-space
110 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
111 /// is a contiguous set of loops.
112 LogicalResult tilePerfectlyNestedParametric(
114  SmallVectorImpl<AffineForOp> *tiledNest = nullptr);
115 
116 /// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
117 /// and 'forOpB' are part of a perfectly nested sequence of loops.
118 void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);
119 
120 /// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
121 /// nested sequence of loops in 'loops', would violate dependences (loop 'i' in
122 /// 'loops' is mapped to location 'j = 'loopPermMap[i]' in the interchange).
124  ArrayRef<unsigned> loopPermMap);
125 
126 /// Performs a loop permutation on a perfectly nested loop nest `inputNest`
127 /// (where the contained loops appear from outer to inner) as specified by the
128 /// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
129 /// 'loopPermMap[i]', where positions 0, 1, ... are from the outermost position
130 /// to inner. Returns the position in `inputNest` of the AffineForOp that
131 /// becomes the new outermost loop of this nest. This method always succeeds,
132 /// asserts out on invalid input / specifications.
133 unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
134  ArrayRef<unsigned> permMap);
135 
136 // Sinks all sequential loops to the innermost levels (while preserving
137 // relative order among them) and moves all parallel loops to the
138 // outermost (while again preserving relative order among them).
139 // Returns AffineForOp of the root of the new loop nest after loop interchanges.
140 AffineForOp sinkSequentialLoops(AffineForOp forOp);
141 
142 /// Performs tiling fo imperfectly nested loops (with interchange) by
143 /// strip-mining the `forOps` by `sizes` and sinking them, in their order of
144 /// occurrence in `forOps`, under each of the `targets`.
145 /// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
146 /// nested immediately under each of `targets`.
148  ArrayRef<uint64_t> sizes,
149  ArrayRef<AffineForOp> targets);
150 
151 /// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
152 /// and sinking them, in their order of occurrence in `forOps`, under `target`.
153 /// Returns the new AffineForOps, one per `forOps`, nested immediately under
154 /// `target`.
156  ArrayRef<uint64_t> sizes, AffineForOp target);
157 
158 /// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
160  // True if DMAs should be generated instead of point-wise copies.
162  // The slower memory space from which data is to be moved.
163  unsigned slowMemorySpace;
164  // Memory space of the faster one (typically a scratchpad).
165  unsigned fastMemorySpace;
166  // Memory space to place tags in: only meaningful for DMAs.
167  unsigned tagMemorySpace;
168  // Capacity of the fast memory space in bytes.
170 };
171 
172 /// Performs explicit copying for the contiguous sequence of operations in the
173 /// block iterator range [`begin', `end'), where `end' can't be past the
174 /// terminator of the block (since additional operations are potentially
175 /// inserted right before `end`. `copyOptions` provides various parameters, and
176 /// the output argument `copyNests` is the set of all copy nests inserted, each
177 /// represented by its root affine.for. Since we generate alloc's and dealloc's
178 /// for all fast buffers (before and after the range of operations resp. or at a
179 /// hoisted position), all of the fast memory capacity is assumed to be
180 /// available for processing this block range. When 'filterMemRef' is specified,
181 /// copies are only generated for the provided MemRef. Returns success if the
182 /// explicit copying succeeded for all memrefs on which affine load/stores were
183 /// encountered. For memrefs for whose element types a size in bytes can't be
184 /// computed (`index` type), their capacity is not accounted for and the
185 /// `fastMemCapacityBytes` copy option would be non-functional in such cases.
187  const AffineCopyOptions &copyOptions,
188  std::optional<Value> filterMemRef,
189  DenseSet<Operation *> &copyNests);
190 
191 /// A convenience version of affineDataCopyGenerate for all ops in the body of
192 /// an AffineForOp.
193 LogicalResult affineDataCopyGenerate(AffineForOp forOp,
194  const AffineCopyOptions &copyOptions,
195  std::optional<Value> filterMemRef,
196  DenseSet<Operation *> &copyNests);
197 
198 /// Result for calling generateCopyForMemRegion.
200  // Number of bytes used by alloc.
201  uint64_t sizeInBytes;
202 
203  // The newly created buffer allocation.
205 
206  // Generated loop nest for copying data between the allocated buffer and the
207  // original memref.
209 };
210 
211 /// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
212 /// with a single memref region. `memrefRegion` is supposed to contain analysis
213 /// information within analyzedOp. The generated prologue and epilogue always
214 /// surround `analyzedOp`.
215 ///
216 /// Note that `analyzedOp` is a single op for API convenience, and the
217 /// [begin, end) version can be added as needed.
218 ///
219 /// Also note that certain options in `copyOptions` aren't looked at anymore,
220 /// like slowMemorySpace.
221 LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
222  Operation *analyzedOp,
223  const AffineCopyOptions &copyOptions,
224  CopyGenerateResult &result);
225 
226 /// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
227 /// `loops` contains a list of perfectly nested loops outermost to innermost
228 /// that are normalized (step one and lower bound of zero) and with bounds and
229 /// steps independent of any loop induction variable involved in the nest.
230 /// Coalescing affine.for loops is not always possible, i.e., the result may not
231 /// be representable using affine.for.
232 LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);
233 
234 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
235 /// size given by `numProcessors`. This is achieved by embedding the SSA values
236 /// corresponding to `processorIds` and `numProcessors` into the bounds and step
237 /// of the `forOp`. No check is performed on the legality of the rewrite, it is
238 /// the caller's responsibility to ensure legality.
239 ///
240 /// Requires that `processorIds` and `numProcessors` have the same size and that
241 /// for each idx, `processorIds`[idx] takes, at runtime, all values between 0
242 /// and `numProcessors`[idx] - 1. This corresponds to traditional use cases for:
243 /// 1. GPU (threadIdx, get_local_id(), ...)
244 /// 2. MPI (MPI_Comm_rank)
245 /// 3. OpenMP (omp_get_thread_num)
246 ///
247 /// Example:
248 /// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
249 /// numProcessors = [gridDim.x, blockDim.x], the loop:
250 ///
251 /// ```
252 /// scf.for %i = %lb to %ub step %step {
253 /// ...
254 /// }
255 /// ```
256 ///
257 /// is rewritten into a version resembling the following pseudo-IR:
258 ///
259 /// ```
260 /// scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
261 /// to %ub step %gridDim.x * blockDim.x * %step {
262 /// ...
263 /// }
264 /// ```
265 void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
266  ArrayRef<Value> numProcessors);
267 
268 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
269 void gatherLoops(func::FuncOp func,
270  std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);
271 
272 /// Creates an AffineForOp while ensuring that the lower and upper bounds are
273 /// canonicalized, i.e., unused and duplicate operands are removed, any constant
274 /// operands propagated/folded in, and duplicate bound maps dropped.
276  ValueRange lbOperands,
277  AffineMap lbMap,
278  ValueRange ubOperands,
279  AffineMap ubMap, int64_t step = 1);
280 
281 /// Separates full tiles from partial tiles for a perfect nest `nest` by
282 /// generating a conditional guard that selects between the full tile version
283 /// and the partial tile version using an AffineIfOp. The original loop nest
284 /// is replaced by this guarded two version form.
285 ///
286 /// affine.if (cond)
287 /// // full_tile
288 /// else
289 /// // partial tile
290 ///
291 LogicalResult
293  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
294 
295 /// Walk an affine.for to find a band to coalesce.
296 LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);
297 
298 /// Count the number of loops surrounding `operand` such that operand could be
299 /// hoisted above.
300 /// Stop counting at the first loop over which the operand cannot be hoisted.
301 /// This counts any LoopLikeOpInterface, not just affine.for.
302 int64_t numEnclosingInvariantLoops(OpOperand &operand);
303 } // namespace affine
304 } // namespace mlir
305 
306 #endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
OpListType::iterator iterator
Definition: Block.h:140
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:205
This class represents an operand of an operation.
Definition: Value.h:257
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
LogicalResult coalesceLoops(MutableArrayRef< AffineForOp > loops)
Replace a perfect nest of "for" loops with a single linearized loop.
Definition: LoopUtils.cpp:1605
LogicalResult loopUnrollFull(AffineForOp forOp)
Unrolls this for operation completely if the trip count is known to be constant.
Definition: LoopUtils.cpp:871
LogicalResult promoteIfSingleIteration(AffineForOp forOp)
Promotes the loop body of a AffineForOp to its containing block if the loop was known to have a singl...
Definition: LoopUtils.cpp:119
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, std::optional< Value > filterMemRef, DenseSet< Operation * > &copyNests)
Performs explicit copying for the contiguous sequence of operations in the block iterator range [‘beg...
Definition: LoopUtils.cpp:2300
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor or by the trip count (if constant),...
Definition: LoopUtils.cpp:1061
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor, function_ref< void(unsigned, Operation *, OpBuilder)> annotateFn=nullptr, bool cleanUpUnroll=false)
Unrolls this for operation by the specified unroll factor.
Definition: LoopUtils.cpp:995
void gatherLoops(func::FuncOp func, std::vector< SmallVector< AffineForOp, 2 >> &depthToLoops)
Gathers all AffineForOps in 'func.func' grouped by loop depth.
Definition: LoopUtils.cpp:2545
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef< AffineForOp > loops)
Returns true if loops is a perfectly nested loop nest, where loops appear in it from outermost to inn...
Definition: LoopUtils.cpp:1361
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, ValueRange ubOperands, AffineMap ubMap, int64_t step=1)
Creates an AffineForOp while ensuring that the lower and upper bounds are canonicalized,...
Definition: LoopUtils.cpp:2557
void getPerfectlyNestedLoops(SmallVectorImpl< AffineForOp > &nestedLoops, AffineForOp root)
Get perfectly nested sequence of loops starting at root of loop nest (the first op being another Affi...
Definition: LoopUtils.cpp:856
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef< uint64_t > shifts, bool unrollPrologueEpilogue=false)
Skew the operations in an affine.for's body with the specified operation-wise shifts.
Definition: LoopUtils.cpp:230
bool isValidLoopInterchangePermutation(ArrayRef< AffineForOp > loops, ArrayRef< unsigned > loopPermMap)
Checks if the loop interchange permutation 'loopPermMap', of the perfectly nested sequence of loops i...
Definition: LoopUtils.cpp:1336
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion, Operation *analyzedOp, const AffineCopyOptions &copyOptions, CopyGenerateResult &result)
generateCopyForMemRegion is similar to affineDataCopyGenerate, but works with a single memref region.
Definition: LoopUtils.cpp:2501
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor)
Unrolls this loop by the specified unroll factor or its trip count, whichever is lower.
Definition: LoopUtils.cpp:886
unsigned permuteLoops(ArrayRef< AffineForOp > inputNest, ArrayRef< unsigned > permMap)
Performs a loop permutation on a perfectly nested loop nest inputNest (where the contained loops appe...
Definition: LoopUtils.cpp:1383
LogicalResult loopUnrollJamByFactor(AffineForOp forOp, uint64_t unrollJamFactor)
Unrolls and jams this loop by the specified factor.
Definition: LoopUtils.cpp:1084
LogicalResult tilePerfectlyNestedParametric(MutableArrayRef< AffineForOp > input, ArrayRef< Value > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops,...
Definition: LoopUtils.cpp:814
void promoteSingleIterationLoops(func::FuncOp f)
Promotes all single iteration AffineForOp's in the Function, i.e., moves their body into the containi...
int64_t numEnclosingInvariantLoops(OpOperand &operand)
Count the number of loops surrounding operand such that operand could be hoisted above.
Definition: LoopUtils.cpp:2817
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef< Value > processorId, ArrayRef< Value > numProcessors)
Maps forOp for execution on a parallel grid of virtual processorIds of size given by numProcessors.
Definition: LoopUtils.cpp:1718
SmallVector< SmallVector< AffineForOp, 8 >, 8 > tile(ArrayRef< AffineForOp > forOps, ArrayRef< uint64_t > sizes, ArrayRef< AffineForOp > targets)
Performs tiling fo imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: LoopUtils.cpp:1584
AffineForOp sinkSequentialLoops(AffineForOp forOp)
Definition: LoopUtils.cpp:1453
LogicalResult tilePerfectlyNested(MutableArrayRef< AffineForOp > input, ArrayRef< unsigned > tileSizes, SmallVectorImpl< AffineForOp > *tiledNest=nullptr)
Tiles the specified band of perfectly nested loops creating tile-space loops and intra-tile loops.
Definition: LoopUtils.cpp:772
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB)
Performs loop interchange on 'forOpA' and 'forOpB'.
Definition: LoopUtils.cpp:1279
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op)
Walk an affine.for to find a band to coalesce.
Definition: LoopUtils.cpp:2769
LogicalResult separateFullTiles(MutableArrayRef< AffineForOp > nest, SmallVectorImpl< AffineForOp > *fullTileNest=nullptr)
Separates full tiles from partial tiles for a perfect nest nest by generating a conditional guard tha...
Definition: LoopUtils.cpp:2715
Include the generated interface declarations.
llvm::function_ref< Fn > function_ref
Definition: LLVM.h:152
Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
Definition: LoopUtils.h:159
Result for calling generateCopyForMemRegion.
Definition: LoopUtils.h:199
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
Definition: Utils.h:481