MLIR  22.0.0git
TileUsingInterface.h
Go to the documentation of this file.
1 //===- TileUsingInterface.h - Tiling ops using TilingInterface --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
10 #define MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
11 
14 #include "mlir/IR/PatternMatch.h"
19 
20 #include <deque>
21 
22 namespace mlir {
23 class Operation;
24 class RewriterBase;
25 class TilingInterface;
26 } // namespace mlir
27 
28 namespace mlir {
29 namespace scf {
30 
32  std::function<SmallVector<OpFoldResult>(OpBuilder &, Operation *)>;
33 
34 /// Options to use to control tiling.
36  /// Specify which loop construct to use for tile and fuse.
37  enum class LoopType { ForOp, ForallOp, CustomOp };
40  loopType = type;
41  return *this;
42  }
43 
44  /// Computation function that returns the tile sizes to use for each loop.
45  /// Returning a tile size of zero implies no tiling for that loop. If the
46  /// size of the returned vector is smaller than the number of loops, the inner
47  /// loops are not tiled. If the size of the returned vector is larger, then
48  /// the vector is truncated to number of loops.
50 
53  tileSizeComputationFunction = std::move(fun);
54  return *this;
55  }
56  /// Convenience function to set the `tileSizeComputationFunction` to a
57  /// function that computes tile sizes at the point they are needed. Allows
58  /// proper interaction with folding.
60 
61  /// The interchange vector to reorder the tiled loops.
64  interchangeVector = llvm::to_vector(interchange);
65  return *this;
66  }
67 
68  //-------------------------------------------------------------------------//
69  // Options related to tiling using `scf.forall`.
70  //-------------------------------------------------------------------------//
71 
72  /// Computation function that returns the number of threads to use for
73  /// each loop. Returning a num threads of zero implies no tiling for that
74  /// loop. If the size of the returned vector is smaller than the number of
75  /// loops, the inner loops are not tiled. If the size of the returned vector
76  /// is larger, then the vector is truncated to number of loops. Note: This
77  /// option is only supported with loopType set to `LoopType::ForallOp`. If the
78  /// tile size function is not specified while the num threads computation is,
79  /// then the tile size is determined automatically to map at most one tile per
80  /// thread.
82 
85  numThreadsComputationFunction = std::move(fun);
86  return *this;
87  }
88  /// Convenience function to set the `numThreadsComputationFunction` to a
89  /// function that computes num threads at the point they are needed.
91 
92  /// Specify mapping of loops to devices. This is only respected when the loop
93  /// constructs support such a mapping (like `scf.forall`). Will be ignored
94  /// when using loop constructs that don't support such a mapping (like
95  /// `scf.for`)
98  mappingVector = llvm::to_vector(mapping);
99  return *this;
100  }
101 
102  //-------------------------------------------------------------------------//
103  // Options related to reduction tiling
104  //-------------------------------------------------------------------------//
105 
106  /// Specify how reduction dimensions should be tiled.
111  reductionStrategy = strategy;
112  return *this;
113  }
114 
115  /// Specify the reduction dimensions to be tiled. Note that this needs to be
116  /// specified. If left unspecified, then none of the reduction dimensions are
117  /// tiled.
120  reductionDims.clear();
121  reductionDims.insert(dims.begin(), dims.end());
122  return *this;
123  }
124 
125  //-------------------------------------------------------------------------//
126  // Options related to tiling using custom loop.
127  //-------------------------------------------------------------------------//
128 
129  // For generating the inter-tile loops using a custom loop, two callback
130  // functions are needed
131  // 1. That generates the "loop header", i.e. the loop that iterates over the
132  // different tiles.
133  // 2. That generates the loop terminator
134  //
135  // For `scf.forall` case the call back to generate loop header would generate
136  //
137  // ```mlir
138  // scf.forall (...) = ... {
139  // ..
140  // }
141  // ```
142  //
143  // and the call back to generate the loop terminator would generate the
144  // `scf.in_parallel` region
145  //
146  // ```mlir
147  // scf.forall (...) = ... {
148  // scf.in_parallel {
149  // tensor.parallel_insert_slice ...
150  // }
151  // }
152  // ```
153  //
154 
155  // Information that is to be returned by loop header callback needed for the
156  // rest of the tiled code generation.
157  // - `loops`: The generated loops
158  // - `tileOffset`: The values that represent the offset of the iteration space
159  // tile.
160  // - `tileSizes` : The values that represent the size of the iteration space
161  // tile.
162  // - `destinationTensors` : The tensors to use as destinations during tiling.
168  };
169 
170  // Type of the callback function that generates the loop headers.
171  // - `loopRanges` : Values that represent the full size of the iteration space
172  // being tiled.
173  // - `givenTileSizes` : The tile sizes that are to be used to tile the
174  // iteration space.
175  // - `destinationTensors` : The tensors to use as destinations for the results
176  // of the tiled loop for loops that implement
177  // `DestinationStyleOpInterface`.
178  // Returns the `CustomLoopHeaderInfo` object (described above). It is expected
179  // that this function sets the insertion point of `rewriter` to the program
180  // point where the intra-tile loop computation is to be generated.
181  using GenerateLoopHeaderFn = std::function<FailureOr<CustomLoopHeaderInfo>(
182  RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
183  ArrayRef<OpFoldResult> givenTileSizes, ValueRange destinationTensors)>;
184 
185  // Type of the callback function that generates the loop terminator.
186  // - `loops` : generated loops from the GenerateLoopHeaderFn callback
187  // - `tiledResults` : Tiles of the result computed for the iteration space
188  // tile.
189  // - `resultOffsets` : For each of the `tiledResults`, the offset at which
190  // the result tile is to be "inserted" back into the
191  // destination tensor.
192  // - `resultSizes` : For each of the `tiledResults`, the size of the result
193  // tile that is to be "inserted" back into the destination
194  // tensor.
195  // Returns a `LogicalResult` signalling success or failure of the generation.
196  using GenerateLoopTerminatorFn = std::function<LogicalResult(
198  ValueRange tiledResults,
199  ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
200  ArrayRef<SmallVector<OpFoldResult>> resultSizes,
201  ValueRange destinationTensors)>;
202 
203  // Callback function to generate the inter-tile loop header.
205  // Callback function to generate the inter-tile loop terminator.
207  // Helper function to set the callbacks for inter-tile loop header and
208  // terminator functions when using a custom operation for the loop.
211  GenerateLoopTerminatorFn terminatorFn) {
212  generateLoopHeaderFn = std::move(headerFn);
213  generateLoopTerminatorFn = std::move(terminatorFn);
214  return *this;
215  }
216 };
217 
218 /// Transformation information returned after tiling.
220  /// Tiled operations that are generated during tiling. The order does not
221  /// matter except the last op. The replacements are expected to be the results
222  /// of the last op.
224  /// The initial destination values passed to the tiled operations.
226  /// The loop operations that iterate over the tiles.
228  /// Values to use as replacements for the untiled op. Is the same size as the
229  /// number of results of the untiled op.
231  /// Slices generated after tiling that can be used for fusing with the tiled
232  /// producer.
234  /// In cases where there is an additional merge step after tiling
235  /// return the merged ops after tiling. This list is empty when reduction
236  /// tiling strategy is
237  /// `scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction`.
239 };
240 
241 /// Method to tile an op that implements the `TilingInterface` using
242 /// `scf.for` for iterating over the tiles.
243 FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
244  TilingInterface op,
245  const SCFTilingOptions &options);
246 
247 /// Options used to control tile + fuse.
249  /// The tiling options used to control the tiling of the consumer.
253  return *this;
254  }
255 
256  /// Control function to check if a slice needs to be fused or not,
257  /// The control function receives
258  /// 1) the slice along which fusion is to be done,
259  /// 2) the producer value that is to be fused
260  /// 3) a boolean value set to `true` if the fusion is from
261  /// a destination operand.
262  /// The control function returns an `std::optional<ControlFnResult>`.
263  /// If the return value is `std::nullopt`, that implies no fusion
264  /// is to be performed along that slice.
266  /// Set to true if the loop nest has to return a replacement value
267  /// for the fused producer.
269  };
270  using ControlFnTy = std::function<std::optional<ControlFnResult>(
271  tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
272  bool isDestinationOperand)>;
273  /// The default control function implements greedy fusion without yielding
274  /// a replacement for any of the fused results.
275  ControlFnTy fusionControlFn = [](tensor::ExtractSliceOp, OpResult,
276  bool) -> std::optional<ControlFnResult> {
277  return ControlFnResult{};
278  };
280  fusionControlFn = controlFn;
281  return *this;
282  }
283 
284  /// An optional set of rewrite patterns to apply to the results of tiling
285  /// before fusion. This will track deleted and newly inserted
286  /// `tensor.extract_slice` ops and update the worklist.
287  std::optional<FrozenRewritePatternSet> cleanupPatterns = std::nullopt;
288 };
289 
290 /// Fuse the producer of the source of `candidateSliceOp` by computing the
291 /// required slice of the producer in-place. Note that the method
292 /// replaces the uses of `candidateSliceOp` with the tiled and fused producer
293 /// value but does not delete the slice operation.
295  OpResult origProducer; // Original untiled producer.
296  Value tiledAndFusedProducer; // Tile and fused producer value.
299 };
300 std::optional<SCFFuseProducerOfSliceResult>
302  tensor::ExtractSliceOp candidateSliceOp,
304 
305 /// Reconstruct the fused producer from within the tiled-and-fused code. Based
306 /// on the slice of the producer computed in place it is possible that within
307 /// the loop nest same slice of the producer is computed multiple times. It is
308 /// in general not possible to recompute the value of the fused producer from
309 /// the tiled loop code in such cases. For the cases where no slice of the
310 /// producer is computed in a redundant fashion it is possible to reconstruct
311 /// the value of the original producer from within the tiled loop. It is up to
312 /// the caller to ensure that the producer is not computed redundantly within
313 /// the tiled loop nest. For example, consider
314 ///
315 /// ```mlir
316 /// %0 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
317 /// %1 = linalg.matmul ins(%0, ..) outs(...) -> tensor<?x?xf32>
318 /// ```
319 ///
320 /// If `%1` is tiled in a 2D fashion and `%0` is fused with it, the resulting IR
321 /// is,
322 ///
323 /// ```mlir
324 /// %t1_0 = scf.for .... iter_args(%arg0 = ...) {
325 /// %t1_1 = scf.for ... iter_args(%arg1 = %arg0) {
326 /// ...
327 /// %t1_2 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
328 /// %t1_3 = linalg.matmul ins(%t1_2, ...)
329 /// %t1_4 = tensor.insert_slice %t1_3 into %arg1 ...
330 /// scf.yield %t1_4
331 /// }
332 /// scf.yield %t1_1
333 /// }
334 /// ```
335 ///
336 /// Here `%t1_2` is the same for all iterations of the inner `scf.for`. Instead
337 /// if `%1` were tiled only along the rows, the resultant code would be
338 ///
339 /// ```mlir
340 /// %t2_0 = scf.for .... iter_args(%arg0 = ...) {
341 /// ...
342 /// %t2_1 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
343 /// %t2_2 = linalg.matmul ins(%t2_1, ...)
344 /// %t2_3 = tensor.insert_slice %t2_2 into %arg0 ...
345 /// scf.yield %t2_3
346 /// }
347 /// ```
348 ///
349 /// Here there is no intersection in the different slices of `%t2_1` computed
350 /// across iterations of the `scf.for`. In such cases, the value of the original
351 /// `%0` can be reconstructed from within the loop body. This is useful in cases
352 /// where `%0` had other uses as well. If not reconstructed from within the loop
353 /// body, uses of `%0` could not be replaced, making it still live and the
354 /// fusion immaterial.
355 ///
356 /// The @param `yieldResultNumber` decides which results are yielded. If not
357 /// given, all `opResult`s of the fused producer are yielded.
358 ///
359 /// The method returns the list of new slices added during the process (which
360 /// can be used to fuse along).
361 FailureOr<SmallVector<Operation *>> yieldReplacementForFusedProducer(
362  RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp,
363  scf::SCFFuseProducerOfSliceResult fusedProducerInfo,
365  ArrayRef<unsigned> yieldResultNumber = ArrayRef<unsigned>{});
366 
367 /// Transformation information returned after tile and fuse.
369  /// List of untiled operations that were fused with the tiled consumer.
371  /// List of tiled and fused operations generated. The first element is always
372  /// the tiled version of the original consumer operation processed by
373  /// `tileConsumerAndFuseProducersUsingSCF`, followed by any operations that
374  /// were fused with it.
376  /// The loop operations that iterate over the tiles.
378  /// The replacement values to use for the tiled and fused operations.
380 };
381 
382 /// Method to tile and fuse a sequence of operations, by tiling the consumer
383 /// and fusing its producers. Note that this assumes that it is valid to
384 /// tile+fuse the producer into the innermost tiled loop. It's up to the caller
385 /// to ensure that the tile sizes provided make this fusion valid.
386 ///
387 /// For example, for the following sequence
388 ///
389 /// ```mlir
390 /// %0 =
391 /// %1 = linalg.fill ... outs(%0 : ... )
392 /// %2 = linalg.matmul ... outs(%1 : ...) ...
393 /// ```
394 ///
395 /// it is legal to fuse the fill with the matmul only if the matmul is tiled
396 /// along the parallel dimensions and not the reduction dimension, i.e. the tile
397 /// size for the reduction dimension should be 0. The resulting fused
398 /// transformation is
399 ///
400 /// ```mlir
401 /// %1 = scf.for ... iter_args(%arg0 = %0)
402 /// %2 = tensor.extract_slice %arg0
403 /// %3 = linalg.fill .. outs(%2 : ... )
404 /// %4 = linalg.matmul .. outs(%3 : ...)
405 /// }
406 /// ```
407 FailureOr<SCFTileAndFuseResult>
409  TilingInterface consumer,
411 
412 /// Fuse the consumer of `candidateSlices` by computing the required slice of
413 /// the consumer in-place. All the entries of `candidateSlices` are expected to
414 /// map to the same consumer. The method returns an error if the consumer cannot
415 /// be tiled in a manner that is consistent for all the passed slices. Note that
416 /// the method replaces the uses of `candidateSlices` with the tiled and fused
417 /// consumer value but does not delete the slice operations.
419  // Original untiled consumer operands.
421  // Tiled and fused consumer operands.
424 };
425 FailureOr<scf::SCFFuseConsumerOfSliceResult>
427  ArrayRef<Operation *> candidateSlices,
429 
430 /// Method to lower an `op` that implements the `TilingInterface` to
431 /// loops/scalars.
432 FailureOr<SmallVector<scf::ForOp>>
433 lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op);
434 
435 /// Method to tile a reduction and generate a parallel op within a serial loop.
436 /// Each of the partial reductions is calculated in parallel. Then, after the
437 /// loop, all the partial reductions are merged into a final reduction.
438 /// For example, the following sequence
439 ///
440 /// ```mlir
441 /// %0 = linalg.generic %in ["parallel", "reduction"]
442 /// : tensor<7x9xf32> -> tensor<7xf32>
443 /// ```
444 ///
445 /// is transformed into:
446 ///
447 /// ```mlir
448 /// %0 = linalg.fill ... : tensor<7x4xf32>
449 /// %1 = scf.for ... iter_args(%arg0 = %0)
450 /// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> -> tensor<7x?xf32>
451 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
452 /// %4 = linalg.generic %2, %3 ["parallel", "parallel"]
453 /// : tensor<7x?xf32> -> tensor<7x?xf32>
454 /// %5 = tensor.insert_slice %3, %0[0, 0] : tensor<7x4xf32>
455 /// }
456 /// %6 = linalg.generic %1 ["parallel", "reduction"]
457 /// : tensor<7x4xf32> -> tensor<7xf32>
458 /// ```
459 FailureOr<scf::SCFTilingResult>
460 tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op,
461  ArrayRef<OpFoldResult> tileSizes);
462 
463 } // namespace scf
464 } // namespace mlir
465 
466 #endif // MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
static llvm::ManagedStatic< PassManagerOptions > options
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:207
This is a value defined by a result of an operation.
Definition: Value.h:447
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:368
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
FailureOr< scf::SCFTilingResult > tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > tileSizes)
Method to tile a reduction and generate a parallel op within a serial loop.
FailureOr< SCFTilingResult > tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const SCFTilingOptions &options)
Method to tile an op that implements the TilingInterface using scf.for for iterating over the tiles.
FailureOr< scf::SCFFuseConsumerOfSliceResult > tileAndFuseConsumerOfSlices(RewriterBase &rewriter, ArrayRef< Operation * > candidateSlices, MutableArrayRef< LoopLikeOpInterface > loops)
Implementation of fusing consumer of a single slice by computing the slice of the consumer in-place f...
FailureOr< SmallVector< scf::ForOp > > lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op)
Method to lower an op that implements the TilingInterface to loops/scalars.
FailureOr< SmallVector< Operation * > > yieldReplacementForFusedProducer(RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp, scf::SCFFuseProducerOfSliceResult fusedProducerInfo, MutableArrayRef< LoopLikeOpInterface > loops, ArrayRef< unsigned > yieldResultNumber=ArrayRef< unsigned >{})
Reconstruct the fused producer from within the tiled-and-fused code.
FailureOr< SCFTileAndFuseResult > tileConsumerAndFuseProducersUsingSCF(RewriterBase &rewriter, TilingInterface consumer, const SCFTileAndFuseOptions &options)
Method to tile and fuse a sequence of operations, by tiling the consumer and fusing its producers.
std::function< SmallVector< OpFoldResult >(OpBuilder &, Operation *)> SCFTileSizeComputationFunction
std::optional< SCFFuseProducerOfSliceResult > tileAndFuseProducerOfSlice(RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp, MutableArrayRef< LoopLikeOpInterface > loops)
Implementation of fusing producer of a single slice by computing the slice of the producer in-place.
Include the generated interface declarations.
ReductionTilingStrategy
Tiling can be thought of as splitting a dimension into 2 and materializing the outer dimension as a l...
Fuse the consumer candidateSlices by computing the required slice of the consumer in-place.
SmallVector< OpOperand * > tiledAndFusedConsumerOperands
SmallVector< OpOperand * > origConsumerOperands
Fuse the producer of the source of candidateSliceOp by computing the required slice of the producer i...
SmallVector< Operation * > generatedSlices
Control function to check if a slice needs to be fused or not, The control function receives 1) the s...
bool yieldProducerReplacement
Set to true if the loop nest has to return a replacement value for the fused producer.
Options used to control tile + fuse.
SCFTilingOptions tilingOptions
The tiling options used to control the tiling of the consumer.
std::optional< FrozenRewritePatternSet > cleanupPatterns
An optional set of rewrite patterns to apply to the results of tiling before fusion.
SCFTileAndFuseOptions & setTilingOptions(SCFTilingOptions options)
ControlFnTy fusionControlFn
The default control function implements greedy fusion without yielding a replacement for any of the f...
std::function< std::optional< ControlFnResult >(tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer, bool isDestinationOperand)> ControlFnTy
SCFTileAndFuseOptions & setFusionControlFn(ControlFnTy controlFn)
Transformation information returned after tile and fuse.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
llvm::SetVector< Operation * > fusedProducers
List of untiled operations that were fused with the tiled consumer.
llvm::DenseMap< Value, Value > replacements
The replacement values to use for the tiled and fused operations.
llvm::SetVector< Operation * > tiledAndFusedOps
List of tiled and fused operations generated.
Options to use to control tiling.
SCFTileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes to use for each loop.
SCFTilingOptions & setTileSizeComputationFunction(SCFTileSizeComputationFunction fun)
SCFTilingOptions & setCustomLoopGenerationFns(GenerateLoopHeaderFn headerFn, GenerateLoopTerminatorFn terminatorFn)
GenerateLoopHeaderFn generateLoopHeaderFn
SCFTilingOptions & setReductionTilingStrategy(ReductionTilingStrategy strategy)
SCFTilingOptions & setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun)
SCFTilingOptions & setNumThreads(ArrayRef< OpFoldResult > numThreads)
Convenience function to set the numThreadsComputationFunction to a function that computes num threads...
SCFTilingOptions & setInterchange(ArrayRef< int64_t > interchange)
SCFTilingOptions & setTileSizes(ArrayRef< OpFoldResult > tileSizes)
Convenience function to set the tileSizeComputationFunction to a function that computes tile sizes at...
SCFTilingOptions & setReductionDims(ArrayRef< unsigned > dims)
SetVector< unsigned > reductionDims
Specify the reduction dimensions to be tiled.
SCFTileSizeComputationFunction numThreadsComputationFunction
Computation function that returns the number of threads to use for each loop.
GenerateLoopTerminatorFn generateLoopTerminatorFn
SmallVector< int64_t > interchangeVector
The interchange vector to reorder the tiled loops.
LoopType
Specify which loop construct to use for tile and fuse.
SmallVector< Attribute > mappingVector
Specify mapping of loops to devices.
SCFTilingOptions & setLoopType(LoopType type)
SCFTilingOptions & setMapping(ArrayRef< Attribute > mapping)
ReductionTilingStrategy reductionStrategy
Specify how reduction dimensions should be tiled.
std::function< FailureOr< CustomLoopHeaderInfo >(RewriterBase &rewriter, Location loc, ArrayRef< Range > loopRanges, ArrayRef< OpFoldResult > givenTileSizes, ValueRange destinationTensors)> GenerateLoopHeaderFn
std::function< LogicalResult(RewriterBase &rewriter, Location loc, ArrayRef< LoopLikeOpInterface > loops, ValueRange tiledResults, ArrayRef< SmallVector< OpFoldResult > > resultOffsets, ArrayRef< SmallVector< OpFoldResult > > resultSizes, ValueRange destinationTensors)> GenerateLoopTerminatorFn
Transformation information returned after tiling.
SmallVector< Operation * > tiledOps
Tiled operations that are generated during tiling.
SmallVector< Value > initialValues
The initial destination values passed to the tiled operations.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
SmallVector< Operation * > generatedSlices
Slices generated after tiling that can be used for fusing with the tiled producer.
SmallVector< Value > replacements
Values to use as replacements for the untiled op.
SmallVector< Operation * > mergeOps
In cases where there as an additional merge step after tiling return the merged ops after tiling.