MLIR  20.0.0git
TileUsingInterface.h
Go to the documentation of this file.
1 //===- TileUsingInterface.h - Tiling ops using TilingInterface --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
10 #define MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
11 
14 #include "mlir/IR/PatternMatch.h"
19 
20 #include <deque>
21 
22 namespace mlir {
23 class Operation;
24 class RewriterBase;
25 class TilingInterface;
26 } // namespace mlir
27 
28 namespace mlir {
29 namespace scf {
30 
32  std::function<SmallVector<OpFoldResult>(OpBuilder &, Operation *)>;
33 
34 /// Options to use to control tiling.
36  /// Computation function that returns the tile sizes to use for each loop.
37  /// Returning a tile size of zero implies no tiling for that loop. If the
38  /// size of the returned vector is smaller than the number of loops, the inner
39  /// loops are not tiled. If the size of the returned vector is larger, then
40  /// the vector is truncated to number of loops.
42 
45  tileSizeComputationFunction = std::move(fun);
46  return *this;
47  }
48  /// Convenience function to set the `tileSizeComputationFunction` to a
49  /// function that computes tile sizes at the point they are needed. Allows
50  /// proper interaction with folding.
52 
53  /// Computation function that returns the number of threads to use for
54  /// each loop. Returning a num threads of zero implies no tiling for that
55  /// loop. If the size of the returned vector is smaller than the number of
56  /// loops, the inner loops are not tiled. If the size of the returned vector
57  /// is larger, then the vector is truncated to number of loops. Note: This
58  /// option is only supported with loopType set to `LoopType::ForallOp`. If the
59  /// tile size function is not specified while the num threads computation is,
60  /// then the tile size is determined automatically to map at most one tile per
61  /// thread.
63 
66  numThreadsComputationFunction = std::move(fun);
67  return *this;
68  }
69  /// Convenience function to set the `numThreadsComputationFunction` to a
70  /// function that computes num threads at the point they are needed.
72 
73  /// The interchange vector to reorder the tiled loops.
76  interchangeVector = llvm::to_vector(interchange);
77  return *this;
78  }
79 
80  /// Specify which loop construct to use for tile and fuse.
81  enum class LoopType { ForOp, ForallOp };
84  loopType = type;
85  return *this;
86  }
87 
88  /// Specify mapping of loops to devices. This is only respected when the loop
89  /// constructs support such a mapping (like `scf.forall`). Will be ignored
90  /// when using loop constructs that dont support such a mapping (like
91  /// `scf.for`)
94  mappingVector = llvm::to_vector(mapping);
95  return *this;
96  }
97 };
98 
99 /// Transformation information returned after tiling.
101  /// Tiled operations that are generated during tiling. The order does not
102  /// matter except the last op. The replacements are expected to be the results
103  /// of the last op.
105  /// The `scf.for` operations that iterate over the tiles.
107  /// Values to use as replacements for the untiled op. Is the same size as the
108  /// number of results of the untiled op.
110  /// Slices generated after tiling that can be used for fusing with the tiled
111  /// producer.
113 };
114 
115 /// Method to tile an op that implements the `TilingInterface` using
116 /// `scf.for` for iterating over the tiles.
117 FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
118  TilingInterface op,
119  const SCFTilingOptions &options);
120 
121 /// Options used to control tile + fuse.
123  /// The tiling options used to control the tiling of the consumer.
127  return *this;
128  }
129 
130  /// Control function to check if a slice needs to be fused or not,
131  /// The control function receives
132  /// 1) the slice along which fusion is to be done,
133  /// 2) the producer value that is to be fused
134  /// 3) a boolean value set to `true` if the fusion is from
135  /// a destination operand.
136  /// The control function returns an `std::optiona<ControlFnResult>`.
137  /// If the return value is `std::nullopt`, that implies no fusion
138  /// is to be performed along that slice.
140  /// Set to true if the loop nest has to return a replacement value
141  /// for the fused producer.
143  };
144  using ControlFnTy = std::function<std::optional<ControlFnResult>(
145  tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
146  bool isDestinationOperand)>;
147  /// The default control function implements greedy fusion without yielding
148  /// a replacement for any of the fused results.
149  ControlFnTy fusionControlFn = [](tensor::ExtractSliceOp, OpResult,
150  bool) -> std::optional<ControlFnResult> {
151  return ControlFnResult{};
152  };
154  fusionControlFn = controlFn;
155  return *this;
156  }
157 
158  /// An optional set of rewrite patterns to apply to the results of tiling
159  /// before fusion. This will track deleted and newly inserted
160  /// `tensor.extract_slice` ops and update the worklist.
161  std::optional<FrozenRewritePatternSet> cleanupPatterns = std::nullopt;
162 };
163 
164 /// Fuse the producer of the source of `candidateSliceOp` by computing the
165 /// required slice of the producer in-place. Note that the method
166 /// replaces the uses of `candidateSliceOp` with the tiled and fused producer
167 /// value but does not delete the slice operation.
169  OpResult origProducer; // Original untiled producer.
170  Value tiledAndFusedProducer; // Tile and fused producer value.
173 };
174 std::optional<SCFFuseProducerOfSliceResult>
176  tensor::ExtractSliceOp candidateSliceOp,
178 
179 /// Reconstruct the fused producer from within the tiled-and-fused code. Based
180 /// on the slice of the producer computed in place it is possible that within
181 /// the loop nest same slice of the producer is computed multiple times. It is
182 /// in general not possible to recompute the value of the fused producer from
183 /// the tiled loop code in such cases. For the cases where no slice of the
184 /// producer is computed in a redundant fashion it is possible to reconstruct
185 /// the value of the original producer from within the tiled loop. It is upto
186 /// the caller to ensure that the producer is not computed redundantly within
187 /// the tiled loop nest. For example, consider
188 ///
189 /// ```mlir
190 /// %0 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
191 /// %1 = linalg.matmul ins(%0, ..) outs(...) -> tensor<?x?x?f32>
192 /// ```
193 ///
194 /// If `%1` is tiled in a 2D fashion and `%0` is fused with it, the resulting IR
195 /// is,
196 ///
197 /// ```mlir
198 /// %t1_0 = scf.for .... iter_args(%arg0 = ...) {
199 /// %t1_1 = scf.for ... iter_args(%arg1 = %arg0) {
200 /// ...
201 /// %t1_2 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
202 /// %t1_3 = linalg.matmul ins(%t1_2, ...)
203 /// %t1_4 = tensor.insert_slice %t1_3 into %arg1 ...
204 /// scf.yield %t1_4
205 /// }
206 /// scf.yield %t1_1
207 /// }
208 /// ```
209 ///
210 /// Here `%t1_2` is the same for all iterations of the inner `scf.for`. Instead
211 /// if `%1` were tiled only along the rows, the resultant code would be
212 ///
213 /// ```mlir
214 /// %t2_0 = scf.for .... iter_args(%arg0 = ...) {
215 /// ...
216 /// %t2_1 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
217 /// %t2_2 = linalg.matmul ins(%t2_1, ...)
218 /// %t2_3 = tensor.insert_slice %t2_2 into %arg0 ...
219 /// scf.yield %t2_3
220 /// }
221 /// ```
222 ///
223 /// Here there is no intersection in the different slices of `%t2_1` computed
224 /// across iterations of the `scf.for`. In such cases, the value of the original
225 /// `%0` can be reconstructed from within the loop body. This is useful in cases
226 /// where `%0` had other uses as well. If not reconstructed from within the loop
227 /// body, uses of `%0` could not be replaced, making it still live and the
228 /// fusion immaterial.
229 ///
230 /// The @param `yieldResultNumber` decides which result would be yield. If not
231 /// given, yield all `opResult` of fused producer.
232 ///
233 /// The method returns the list of new slices added during the process (which
234 /// can be used to fuse along).
235 FailureOr<SmallVector<Operation *>> yieldReplacementForFusedProducer(
236  RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp,
237  scf::SCFFuseProducerOfSliceResult fusedProducerInfo,
239  ArrayRef<unsigned> yieldResultNumber = ArrayRef<unsigned>{});
240 
241 /// Transformation information returned after tile and fuse.
243  /// List of untiled operations that were fused with the tiled consumer.
245  /// List of tiled and fused operations generated. The first one in this list
246  /// is guaranteed to be the tiled operations generated during tiling of the
247  /// generated operation.
249  /// The `scf.for` operations that iterate over the tiles.
251  /// The replacement values to use for the tiled and fused operations.
253 };
254 
255 /// Method to tile and fuse a sequence of operations, by tiling the consumer
256 /// and fusing its producers. Note that this assumes that it is valid to
257 /// tile+fuse the producer into the innermost tiled loop. Its up to the caller
258 /// to ensure that the tile sizes provided make this fusion valid.
259 ///
260 /// For example, for the following sequence
261 ///
262 /// ```mlir
263 /// %0 =
264 /// %1 = linalg.fill ... outs(%0 : ... )
265 /// %2 = linalg.matmul ... outs(%1 : ...) ...
266 /// ```
267 ///
268 /// it is legal to fuse the fill with the matmul only if the matmul is tiled
269 /// along the parallel dimensions and not the reduction dimension, i.e. the tile
270 /// size for the reduction dimension should be 0. The resulting fused
271 /// transformation is
272 ///
273 /// ```mlir
274 /// %1 = scf.for ... iter_args(%arg0 = %0)
275 /// %2 = tensor.extract_slice %arg0
276 /// %3 = linalg.fill .. outs(%2 : ... )
277 /// %4 = linalg.matmul .. outs(%3 : ...)
278 /// }
279 /// ```
280 FailureOr<SCFTileAndFuseResult>
282  TilingInterface consumer,
284 
285 /// Fuse the consumer of the source of `candidateSliceOp` by computing the
286 /// required slice of the consumer in-place. Note that the method
287 /// replaces the uses of `candidateSliceOp` with the tiled and fused consumer
288 /// value but does not delete the slice operation.
290  OpOperand *origConsumerOperand; // Original untiled consumer's operand.
291  OpOperand
292  *tiledAndFusedConsumerOperand; // Tiled and fused consumer's operand.
294 };
295 FailureOr<scf::SCFFuseConsumerOfSliceResult>
296 tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp);
297 
298 /// Method to lower an `op` that implements the `TilingInterface` to
299 /// loops/scalars.
300 FailureOr<SmallVector<scf::ForOp>>
301 lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op);
302 
303 /// Transformation information returned after reduction tiling.
305  /// The partial reduction tiled op generated.
307  /// The final reduction operation merging all the partial reductions.
309  /// Initial values used for reduction.
311  /// The loop operations that iterate over the tiles.
313  /// The replacements to use for the results of the tiled operation.
315 };
316 
317 /// Method to tile a reduction and generate a parallel op within a serial loop.
318 /// Each of the partial reductions are calculated in parallel. Then after the
319 /// loop all the partial reduction are merged into a final reduction.
320 /// For example for the following sequence
321 ///
322 /// ```mlir
323 /// %0 = linalg.generic %in ["parallel", "reduction"]
324 /// : tensor<7x9xf32> -> tensor<7xf32>
325 /// ```
326 ///
327 /// into:
328 ///
329 /// ```mlir
330 /// %0 = linalg.fill ... : tensor<7x4xf32>
331 /// %1 = scf.for ... iter_args(%arg0 = %0)
332 /// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> -> tensor<7x?xf32>
333 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
334 /// %4 = linalg.generic %2, %3 ["parallel", "parallel"]
335 /// : tensor<7x?xf32> -> tensor<7x?xf32>
336 /// %5 = tensor.insert_slice %3, %0[0, 0] : tensor<7x4xf32>
337 /// }
338 /// %6 = linalg.generic %1 ["parallel", "reduction"]
339 /// : tensor<7x4xf32> -> tensor<7xf32>
340 /// ```
341 FailureOr<scf::SCFReductionTilingResult>
342 tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op,
343  ArrayRef<OpFoldResult> tileSize);
344 
345 } // namespace scf
346 } // namespace mlir
347 
348 #endif // MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
static llvm::ManagedStatic< PassManagerOptions > options
This class helps build Operations.
Definition: Builders.h:215
This class represents an operand of an operation.
Definition: Value.h:267
This is a value defined by a result of an operation.
Definition: Value.h:457
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
FailureOr< scf::SCFReductionTilingResult > tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > tileSize)
Method to tile a reduction and generate a parallel op within a serial loop.
FailureOr< SCFTilingResult > tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const SCFTilingOptions &options)
Method to tile an op that implements the TilingInterface using scf.for for iterating over the tiles.
FailureOr< scf::SCFFuseConsumerOfSliceResult > tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp)
Implementation of fusing consumer of a single slice by computing the slice of the consumer in-place f...
FailureOr< SmallVector< scf::ForOp > > lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op)
Method to lower an op that implements the TilingInterface to loops/scalars.
FailureOr< SmallVector< Operation * > > yieldReplacementForFusedProducer(RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp, scf::SCFFuseProducerOfSliceResult fusedProducerInfo, MutableArrayRef< LoopLikeOpInterface > loops, ArrayRef< unsigned > yieldResultNumber=ArrayRef< unsigned >{})
Reconstruct the fused producer from within the tiled-and-fused code.
FailureOr< SCFTileAndFuseResult > tileConsumerAndFuseProducersUsingSCF(RewriterBase &rewriter, TilingInterface consumer, const SCFTileAndFuseOptions &options)
Method to tile and fuse a sequence of operations, by tiling the consumer and fusing its producers.
std::function< SmallVector< OpFoldResult >(OpBuilder &, Operation *)> SCFTileSizeComputationFunction
std::optional< SCFFuseProducerOfSliceResult > tileAndFuseProducerOfSlice(RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp, MutableArrayRef< LoopLikeOpInterface > loops)
Implementation of fusing producer of a single slice by computing the slice of the producer in-place.
Include the generated interface declarations.
Fuse the consumer of the source of candidateSliceOp by computing the required slice of the consumer i...
Fuse the producer of the source of candidateSliceOp by computing the required slice of the producer i...
SmallVector< Operation * > generatedSlices
Transformation information returned after reduction tiling.
SmallVector< Value > replacements
The replacements to use for the results of the tiled operation.
SmallVector< Value > initialValues
Initial values used for reduction.
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
SmallVector< LoopLikeOpInterface > loops
The loop operations that iterate over the tiles.
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
Control function to check if a slice needs to be fused or not, The control function receives 1) the s...
bool yieldProducerReplacement
Set to true if the loop nest has to return a replacement value for the fused producer.
Options used to control tile + fuse.
SCFTilingOptions tilingOptions
The tiling options used to control the tiling of the consumer.
std::optional< FrozenRewritePatternSet > cleanupPatterns
An optional set of rewrite patterns to apply to the results of tiling before fusion.
SCFTileAndFuseOptions & setTilingOptions(SCFTilingOptions options)
ControlFnTy fusionControlFn
The default control function implements greedy fusion without yielding a replacement for any of the f...
std::function< std::optional< ControlFnResult >(tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer, bool isDestinationOperand)> ControlFnTy
SCFTileAndFuseOptions & setFusionControlFn(ControlFnTy controlFn)
Transformation information returned after tile and fuse.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
llvm::SetVector< Operation * > fusedProducers
List of untiled operations that were fused with the tiled consumer.
llvm::DenseMap< Value, Value > replacements
The replacement values to use for the tiled and fused operations.
llvm::SetVector< Operation * > tiledAndFusedOps
List of tiled and fused operations generated.
Options to use to control tiling.
SCFTileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes to use for each loop.
SCFTilingOptions & setTileSizeComputationFunction(SCFTileSizeComputationFunction fun)
SCFTilingOptions & setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun)
SCFTilingOptions & setNumThreads(ArrayRef< OpFoldResult > numThreads)
Convenience function to set the numThreadsComputationFunction to a function that computes num threads...
SCFTilingOptions & setInterchange(ArrayRef< int64_t > interchange)
SCFTilingOptions & setTileSizes(ArrayRef< OpFoldResult > tileSizes)
Convenience function to set the tileSizeComputationFunction to a function that computes tile sizes at...
SCFTileSizeComputationFunction numThreadsComputationFunction
Computation function that returns the number of threads to use for each loop.
SmallVector< int64_t > interchangeVector
The interchange vector to reorder the tiled loops.
LoopType
Specify which loop construct to use for tile and fuse.
SmallVector< Attribute > mappingVector
Specify mapping of loops to devices.
SCFTilingOptions & setLoopType(LoopType type)
SCFTilingOptions & setMapping(ArrayRef< Attribute > mapping)
Transformation information returned after tiling.
SmallVector< Operation * > tiledOps
Tiled operations that are generated during tiling.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
SmallVector< Operation * > generatedSlices
Slices generated after tiling that can be used for fusing with the tiled producer.
SmallVector< Value > replacements
Values to use as replacements for the untiled op.