MLIR  22.0.0git
TileUsingInterface.h
Go to the documentation of this file.
1 //===- TileUsingInterface.h - Tiling ops using TilingInterface --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
10 #define MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
11 
14 #include "mlir/IR/PatternMatch.h"
19 
20 #include <deque>
21 
22 namespace mlir {
23 class Operation;
24 class RewriterBase;
25 class TilingInterface;
26 } // namespace mlir
27 
28 namespace mlir {
29 namespace scf {
30 
32  std::function<SmallVector<OpFoldResult>(OpBuilder &, Operation *)>;
33 
34 /// Options to use to control tiling.
36  /// Specify which loop construct to use for tile and fuse.
37  enum class LoopType { ForOp, ForallOp, CustomOp };
40  loopType = type;
41  return *this;
42  }
43 
44  /// Computation function that returns the tile sizes to use for each loop.
45  /// Returning a tile size of zero implies no tiling for that loop. If the
46  /// size of the returned vector is smaller than the number of loops, the inner
47  /// loops are not tiled. If the size of the returned vector is larger, then
48  /// the vector is truncated to number of loops.
50 
53  tileSizeComputationFunction = std::move(fun);
54  return *this;
55  }
56  /// Convenience function to set the `tileSizeComputationFunction` to a
57  /// function that computes tile sizes at the point they are needed. Allows
58  /// proper interaction with folding.
60 
61  /// The interchange vector to reorder the tiled loops.
64  interchangeVector = llvm::to_vector(interchange);
65  return *this;
66  }
67 
68  //-------------------------------------------------------------------------//
69  // Options related to tiling using `scf.forall`.
70  //-------------------------------------------------------------------------//
71 
72  /// Computation function that returns the number of threads to use for
73  /// each loop. Returning a num threads of zero implies no tiling for that
74  /// loop. If the size of the returned vector is smaller than the number of
75  /// loops, the inner loops are not tiled. If the size of the returned vector
76  /// is larger, then the vector is truncated to number of loops. Note: This
77  /// option is only supported with loopType set to `LoopType::ForallOp`. If the
78  /// tile size function is not specified while the num threads computation is,
79  /// then the tile size is determined automatically to map at most one tile per
80  /// thread.
82 
85  numThreadsComputationFunction = std::move(fun);
86  return *this;
87  }
88  /// Convenience function to set the `numThreadsComputationFunction` to a
89  /// function that computes num threads at the point they are needed.
91 
92  /// Specify mapping of loops to devices. This is only respected when the loop
93  /// constructs support such a mapping (like `scf.forall`). Will be ignored
94  /// when using loop constructs that don't support such a mapping (like
95  /// `scf.for`).
98  mappingVector = llvm::to_vector(mapping);
99  return *this;
100  }
101 
102  //-------------------------------------------------------------------------//
103  // Options related to reduction tiling.
104  //-------------------------------------------------------------------------//
105 
106  /// Specify how reduction dimensions should be tiled.
111  reductionStrategy = strategy;
112  return *this;
113  }
114 
115  /// Specify the reduction dimensions to be tiled. Note that this needs to be
116  /// specified. If left unspecified, then none of the reduction dimensions are
117  /// tiled.
120  reductionDims.clear();
121  reductionDims.insert(dims.begin(), dims.end());
122  return *this;
123  }
124 
125  //-------------------------------------------------------------------------//
126  // Options related to tiling using custom loop.
127  //-------------------------------------------------------------------------//
128 
129  // For generating the inter-tile loops using a custom loop, two callback
130  // functions are needed
131  // 1. That generates the "loop header", i.e. the loop that iterates over the
132  // different tiles.
133  // 2. That generates the loop terminator
134  //
135  // For `scf.forall` case the call back to generate loop header would generate
136  //
137  // ```mlir
138  // scf.forall (...) = ... {
139  // ..
140  // }
141  // ```
142  //
143  // and the call back to generate the loop terminator would generate the
144  // `scf.in_parallel` region
145  //
146  // ```mlir
147  // scf.forall (...) = ... {
148  // scf.in_parallel {
149  // tensor.parallel_insert_slice ...
150  // }
151  // }
152  // ```
153  //
154 
155  // Information that is to be returned by loop header callback needed for the
156  // rest of the tiled code generation.
157  // - `loops`: The generated loops
158  // - `tileOffset`: The values that represent the offset of the iteration space
159  // tile.
160  // - `tileSizes` : The values that represent the size of the iteration space
161  // tile.
162  // - `destinationTensors` : The tensors to use as destinations during tiling.
168  };
169 
170  // Type of the callback function that generates the loop headers.
171  // - `loopRanges` : Values that represent the full size of the iteration space
172  // being tiled.
173  // - `givenTileSizes` : The tile sizes that are to be used to tile the
174  // iteration space.
175  // - `destinationTensors` : The tensors to use as destinations for the results
176  // of the tiled loop for loops that implement
177  // `DestinationStyleOpInterface`.
178  // Returns the `CustomLoopHeaderInfo` object (described above). It is expected
179  // that this function sets the insertion point of `rewriter` to the program
180  // point where the intra-tile loop computation is to be generated.
181  using GenerateLoopHeaderFn = std::function<FailureOr<CustomLoopHeaderInfo>(
182  RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
183  ArrayRef<OpFoldResult> givenTileSizes, ValueRange destinationTensors)>;
184 
185  // Type of the callback function that generates the loop terminator.
186  // - `tiledResults` : Tiles of the result computed for the iteration space
187  // tile.
188  // - `resultOffsets` : For each of the `tiledResults`, the offset at which
189  // the result tile is to be "inserted" back into the
190  // destination tensor.
191  // - `resultSizes` : For each of the `tiledResults`, the size of the result
192  // tile that is to be "inserted" back into the destination
193  // tensor.
194  // Returns a `LogicalResult` signalling whether the terminator was generated.
195  using GenerateLoopTerminatorFn = std::function<LogicalResult(
196  RewriterBase &rewriter, Location loc, ValueRange tiledResults,
197  ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
198  ArrayRef<SmallVector<OpFoldResult>> resultSizes,
199  ValueRange destinationTensors)>;
200 
201  // Callback function to generate the inter-tile loop header.
203  // Callback function to generate the inter-tile loop terminator.
205  // Helper function to set the callbacks for inter-tile loop header and
206  // terminator functions when using a custom operation for the loop.
209  GenerateLoopTerminatorFn terminatorFn) {
210  generateLoopHeaderFn = std::move(headerFn);
211  generateLoopTerminatorFn = std::move(terminatorFn);
212  return *this;
213  }
214 };
215 
216 /// Transformation information returned after tiling.
218  /// Tiled operations that are generated during tiling. The order does not
219  /// matter except the last op. The replacements are expected to be the results
220  /// of the last op.
222  /// The initial destination values passed to the tiled operations.
224  /// The loop operations that iterate over the tiles.
226  /// Values to use as replacements for the untiled op. Is the same size as the
227  /// number of results of the untiled op.
229  /// Slices generated after tiling that can be used for fusing with the tiled
230  /// producer.
232  /// In cases where there is an additional merge step after tiling
233  /// return the merged ops after tiling. This list is empty when reduction
234  /// tiling strategy is
235  /// `scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction`.
237 };
238 
239 /// Method to tile an op that implements the `TilingInterface` using
240 /// `scf.for` for iterating over the tiles.
241 FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
242  TilingInterface op,
243  const SCFTilingOptions &options);
244 
245 /// Options used to control tile + fuse.
247  /// The tiling options used to control the tiling of the consumer.
251  return *this;
252  }
253 
254  /// Control function to check if a slice needs to be fused or not,
255  /// The control function receives
256  /// 1) the slice along which fusion is to be done,
257  /// 2) the producer value that is to be fused
258  /// 3) a boolean value set to `true` if the fusion is from
259  /// a destination operand.
260  /// The control function returns a `std::optional<ControlFnResult>`.
261  /// If the return value is `std::nullopt`, that implies no fusion
262  /// is to be performed along that slice.
264  /// Set to true if the loop nest has to return a replacement value
265  /// for the fused producer.
267  };
268  using ControlFnTy = std::function<std::optional<ControlFnResult>(
269  tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
270  bool isDestinationOperand)>;
271  /// The default control function implements greedy fusion without yielding
272  /// a replacement for any of the fused results.
273  ControlFnTy fusionControlFn = [](tensor::ExtractSliceOp, OpResult,
274  bool) -> std::optional<ControlFnResult> {
275  return ControlFnResult{};
276  };
278  fusionControlFn = controlFn;
279  return *this;
280  }
281 
282  /// An optional set of rewrite patterns to apply to the results of tiling
283  /// before fusion. This will track deleted and newly inserted
284  /// `tensor.extract_slice` ops and update the worklist.
285  std::optional<FrozenRewritePatternSet> cleanupPatterns = std::nullopt;
286 };
287 
288 /// Fuse the producer of the source of `candidateSliceOp` by computing the
289 /// required slice of the producer in-place. Note that the method
290 /// replaces the uses of `candidateSliceOp` with the tiled and fused producer
291 /// value but does not delete the slice operation.
293  OpResult origProducer; // Original untiled producer.
294  Value tiledAndFusedProducer; // Tile and fused producer value.
297 };
298 std::optional<SCFFuseProducerOfSliceResult>
300  tensor::ExtractSliceOp candidateSliceOp,
302 
303 /// Reconstruct the fused producer from within the tiled-and-fused code. Based
304 /// on the slice of the producer computed in place it is possible that within
305 /// the loop nest same slice of the producer is computed multiple times. It is
306 /// in general not possible to recompute the value of the fused producer from
307 /// the tiled loop code in such cases. For the cases where no slice of the
308 /// producer is computed in a redundant fashion it is possible to reconstruct
309 /// the value of the original producer from within the tiled loop. It is up to
310 /// the caller to ensure that the producer is not computed redundantly within
311 /// the tiled loop nest. For example, consider
312 ///
313 /// ```mlir
314 /// %0 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
315 /// %1 = linalg.matmul ins(%0, ..) outs(...) -> tensor<?x?xf32>
316 /// ```
317 ///
318 /// If `%1` is tiled in a 2D fashion and `%0` is fused with it, the resulting IR
319 /// is,
320 ///
321 /// ```mlir
322 /// %t1_0 = scf.for .... iter_args(%arg0 = ...) {
323 /// %t1_1 = scf.for ... iter_args(%arg1 = %arg0) {
324 /// ...
325 /// %t1_2 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
326 /// %t1_3 = linalg.matmul ins(%t1_2, ...)
327 /// %t1_4 = tensor.insert_slice %t1_3 into %arg1 ...
328 /// scf.yield %t1_4
329 /// }
330 /// scf.yield %t1_1
331 /// }
332 /// ```
333 ///
334 /// Here `%t1_2` is the same for all iterations of the inner `scf.for`. Instead
335 /// if `%1` were tiled only along the rows, the resultant code would be
336 ///
337 /// ```mlir
338 /// %t2_0 = scf.for .... iter_args(%arg0 = ...) {
339 /// ...
340 /// %t2_1 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
341 /// %t2_2 = linalg.matmul ins(%t2_1, ...)
342 /// %t2_3 = tensor.insert_slice %t2_2 into %arg0 ...
343 /// scf.yield %t2_3
344 /// }
345 /// ```
346 ///
347 /// Here there is no intersection in the different slices of `%t2_1` computed
348 /// across iterations of the `scf.for`. In such cases, the value of the original
349 /// `%0` can be reconstructed from within the loop body. This is useful in cases
350 /// where `%0` had other uses as well. If not reconstructed from within the loop
351 /// body, uses of `%0` could not be replaced, making it still live and the
352 /// fusion immaterial.
353 ///
354 /// The @param `yieldResultNumber` decides which results would be yielded. If
355 /// not given, all `opResult`s of the fused producer are yielded.
356 ///
357 /// The method returns the list of new slices added during the process (which
358 /// can be used to fuse along).
359 FailureOr<SmallVector<Operation *>> yieldReplacementForFusedProducer(
360  RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp,
361  scf::SCFFuseProducerOfSliceResult fusedProducerInfo,
363  ArrayRef<unsigned> yieldResultNumber = ArrayRef<unsigned>{});
364 
365 /// Transformation information returned after tile and fuse.
367  /// List of untiled operations that were fused with the tiled consumer.
369  /// List of tiled and fused operations generated. The first element is always
370  /// the tiled version of the original consumer operation processed by
371  /// `tileConsumerAndFuseProducersUsingSCF`, followed by any operations that
372  /// were fused with it.
374  /// The loop operations that iterate over the tiles.
376  /// The replacement values to use for the tiled and fused operations.
378 };
379 
380 /// Method to tile and fuse a sequence of operations, by tiling the consumer
381 /// and fusing its producers. Note that this assumes that it is valid to
382 /// tile+fuse the producer into the innermost tiled loop. It's up to the caller
383 /// to ensure that the tile sizes provided make this fusion valid.
384 ///
385 /// For example, for the following sequence
386 ///
387 /// ```mlir
388 /// %0 =
389 /// %1 = linalg.fill ... outs(%0 : ... )
390 /// %2 = linalg.matmul ... outs(%1 : ...) ...
391 /// ```
392 ///
393 /// it is legal to fuse the fill with the matmul only if the matmul is tiled
394 /// along the parallel dimensions and not the reduction dimension, i.e. the tile
395 /// size for the reduction dimension should be 0. The resulting fused
396 /// transformation is
397 ///
398 /// ```mlir
399 /// %1 = scf.for ... iter_args(%arg0 = %0)
400 /// %2 = tensor.extract_slice %arg0
401 /// %3 = linalg.fill .. outs(%2 : ... )
402 /// %4 = linalg.matmul .. outs(%3 : ...)
403 /// }
404 /// ```
405 FailureOr<SCFTileAndFuseResult>
407  TilingInterface consumer,
409 
410 /// Fuse the consumer `candidateSlices` by computing the required slice of the
411 /// consumer in-place. All the entries of `candidateSlices` are expected to map
412 /// to the same consumer. The method returns an error if the consumer cannot be
413 /// tiled in a manner that is consistent for all the passed slices. Note that
414 /// the method replaces the uses of `candidateSlices` with the tiled and fused
415 /// consumer value but does not delete the slice operations.
417  // Original untiled consumer operands.
419  // Tiled and fused consumer operands.
422 };
423 FailureOr<scf::SCFFuseConsumerOfSliceResult>
425  ArrayRef<Operation *> candidateSlices,
427 
428 /// Method to lower an `op` that implements the `TilingInterface` to
429 /// loops/scalars.
430 FailureOr<SmallVector<scf::ForOp>>
431 lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op);
432 
433 /// Method to tile a reduction and generate a parallel op within a serial loop.
434 /// Each of the partial reductions are calculated in parallel. Then after the
435 /// loop all the partial reductions are merged into a final reduction.
436 /// For example for the following sequence
437 ///
438 /// ```mlir
439 /// %0 = linalg.generic %in ["parallel", "reduction"]
440 /// : tensor<7x9xf32> -> tensor<7xf32>
441 /// ```
442 ///
443 /// into:
444 ///
445 /// ```mlir
446 /// %0 = linalg.fill ... : tensor<7x4xf32>
447 /// %1 = scf.for ... iter_args(%arg0 = %0)
448 /// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> -> tensor<7x?xf32>
449 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
450 /// %4 = linalg.generic %2, %3 ["parallel", "parallel"]
451 /// : tensor<7x?xf32> -> tensor<7x?xf32>
452 /// %5 = tensor.insert_slice %4, %arg0[0, 0] : tensor<7x4xf32>
453 /// }
454 /// %6 = linalg.generic %1 ["parallel", "reduction"]
455 /// : tensor<7x4xf32> -> tensor<7xf32>
456 /// ```
457 FailureOr<scf::SCFTilingResult>
458 tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op,
459  ArrayRef<OpFoldResult> tileSizes);
460 
461 } // namespace scf
462 } // namespace mlir
463 
464 #endif // MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
static llvm::ManagedStatic< PassManagerOptions > options
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
This class helps build Operations.
Definition: Builders.h:207
This is a value defined by a result of an operation.
Definition: Value.h:447
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:368
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
FailureOr< scf::SCFTilingResult > tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > tileSizes)
Method to tile a reduction and generate a parallel op within a serial loop.
FailureOr< SCFTilingResult > tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const SCFTilingOptions &options)
Method to tile an op that implements the TilingInterface using scf.for for iterating over the tiles.
FailureOr< scf::SCFFuseConsumerOfSliceResult > tileAndFuseConsumerOfSlices(RewriterBase &rewriter, ArrayRef< Operation * > candidateSlices, MutableArrayRef< LoopLikeOpInterface > loops)
Implementation of fusing consumer of a single slice by computing the slice of the consumer in-place f...
FailureOr< SmallVector< scf::ForOp > > lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op)
Method to lower an op that implements the TilingInterface to loops/scalars.
FailureOr< SmallVector< Operation * > > yieldReplacementForFusedProducer(RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp, scf::SCFFuseProducerOfSliceResult fusedProducerInfo, MutableArrayRef< LoopLikeOpInterface > loops, ArrayRef< unsigned > yieldResultNumber=ArrayRef< unsigned >{})
Reconstruct the fused producer from within the tiled-and-fused code.
FailureOr< SCFTileAndFuseResult > tileConsumerAndFuseProducersUsingSCF(RewriterBase &rewriter, TilingInterface consumer, const SCFTileAndFuseOptions &options)
Method to tile and fuse a sequence of operations, by tiling the consumer and fusing its producers.
std::function< SmallVector< OpFoldResult >(OpBuilder &, Operation *)> SCFTileSizeComputationFunction
std::optional< SCFFuseProducerOfSliceResult > tileAndFuseProducerOfSlice(RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp, MutableArrayRef< LoopLikeOpInterface > loops)
Implementation of fusing producer of a single slice by computing the slice of the producer in-place.
Include the generated interface declarations.
ReductionTilingStrategy
Tiling can be thought of as splitting a dimension into 2 and materializing the outer dimension as a l...
Fuse the consumer candidateSlices by computing the required slice of the consumer in-place.
SmallVector< OpOperand * > tiledAndFusedConsumerOperands
SmallVector< OpOperand * > origConsumerOperands
Fuse the producer of the source of candidateSliceOp by computing the required slice of the producer i...
SmallVector< Operation * > generatedSlices
Control function to check if a slice needs to be fused or not, The control function receives 1) the s...
bool yieldProducerReplacement
Set to true if the loop nest has to return a replacement value for the fused producer.
Options used to control tile + fuse.
SCFTilingOptions tilingOptions
The tiling options used to control the tiling of the consumer.
std::optional< FrozenRewritePatternSet > cleanupPatterns
An optional set of rewrite patterns to apply to the results of tiling before fusion.
SCFTileAndFuseOptions & setTilingOptions(SCFTilingOptions options)
ControlFnTy fusionControlFn
The default control function implements greedy fusion without yielding a replacement for any of the f...
std::function< std::optional< ControlFnResult >(tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer, bool isDestinationOperand)> ControlFnTy
SCFTileAndFuseOptions & setFusionControlFn(ControlFnTy controlFn)
Transformation information returned after tile and fuse.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
llvm::SetVector< Operation * > fusedProducers
List of untiled operations that were fused with the tiled consumer.
llvm::DenseMap< Value, Value > replacements
The replacement values to use for the tiled and fused operations.
llvm::SetVector< Operation * > tiledAndFusedOps
List of tiled and fused operations generated.
Options to use to control tiling.
SCFTileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes to use for each loop.
SCFTilingOptions & setTileSizeComputationFunction(SCFTileSizeComputationFunction fun)
std::function< LogicalResult(RewriterBase &rewriter, Location loc, ValueRange tiledResults, ArrayRef< SmallVector< OpFoldResult > > resultOffsets, ArrayRef< SmallVector< OpFoldResult > > resultSizes, ValueRange destinationTensors)> GenerateLoopTerminatorFn
SCFTilingOptions & setCustomLoopGenerationFns(GenerateLoopHeaderFn headerFn, GenerateLoopTerminatorFn terminatorFn)
GenerateLoopHeaderFn generateLoopHeaderFn
SCFTilingOptions & setReductionTilingStrategy(ReductionTilingStrategy strategy)
SCFTilingOptions & setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun)
SCFTilingOptions & setNumThreads(ArrayRef< OpFoldResult > numThreads)
Convenience function to set the numThreadsComputationFunction to a function that computes num threads...
SCFTilingOptions & setInterchange(ArrayRef< int64_t > interchange)
SCFTilingOptions & setTileSizes(ArrayRef< OpFoldResult > tileSizes)
Convenience function to set the tileSizeComputationFunction to a function that computes tile sizes at...
SCFTilingOptions & setReductionDims(ArrayRef< unsigned > dims)
SetVector< unsigned > reductionDims
Specify the reduction dimensions to be tiled.
SCFTileSizeComputationFunction numThreadsComputationFunction
Computation function that returns the number of threads to use for each loop.
GenerateLoopTerminatorFn generateLoopTerminatorFn
SmallVector< int64_t > interchangeVector
The interchange vector to reorder the tiled loops.
LoopType
Specify which loop construct to use for tile and fuse.
SmallVector< Attribute > mappingVector
Specify mapping of loops to devices.
SCFTilingOptions & setLoopType(LoopType type)
SCFTilingOptions & setMapping(ArrayRef< Attribute > mapping)
ReductionTilingStrategy reductionStrategy
Specify how reduction dimensions should be tiled.
std::function< FailureOr< CustomLoopHeaderInfo >(RewriterBase &rewriter, Location loc, ArrayRef< Range > loopRanges, ArrayRef< OpFoldResult > givenTileSizes, ValueRange destinationTensors)> GenerateLoopHeaderFn
Transformation information returned after tiling.
SmallVector< Operation * > tiledOps
Tiled operations that are generated during tiling.
SmallVector< Value > initialValues
The initial destination values passed to the tiled operations.
SmallVector< LoopLikeOpInterface > loops
The scf.for operations that iterate over the tiles.
SmallVector< Operation * > generatedSlices
Slices generated after tiling that can be used for fusing with the tiled producer.
SmallVector< Value > replacements
Values to use as replacements for the untiled op.
SmallVector< Operation * > mergeOps
In cases where there is an additional merge step after tiling return the merged ops after tiling.