MLIR  20.0.0git
Transforms.h
Go to the documentation of this file.
1 //===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
11 
12 #include <utility>
13 
23 #include "mlir/IR/PatternMatch.h"
26 #include "llvm/ADT/SmallBitVector.h"
27 #include "llvm/ADT/SmallSet.h"
28 
29 namespace mlir {
30 namespace bufferization {
31 class AllocTensorOp;
32 class OneShotAnalysisState;
33 } // namespace bufferization
34 
35 namespace linalg {
36 
37 class LinalgOp;
38 
39 //===----------------------------------------------------------------------===//
40 // Utils.
41 //===----------------------------------------------------------------------===//
42 
43 /// Return vector::CombiningKind for the given op.
44 std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp);
45 
46 //===----------------------------------------------------------------------===//
47 // Bufferization-related transforms.
48 //===----------------------------------------------------------------------===//
49 
51  enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 };
53 
54  enum class MemcpyOp {
56  MemrefCopy = 1,
57  LinalgCopy = 2
58  };
60 
61  /// If set to "true", only the destination tensor operands are bufferized to
62  /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the
63  /// targeted op itself.
65 
66  /// If set to "true", a `memref.dealloc` operation will be emitted for each
67  /// allocated buffer. Otherwise, the memory is leaked, which is useful if
68  /// the buffer deallocation pipeline should be run after bufferization is
69  /// done.
70  bool emitDealloc = false;
71 };
72 
73 /// Materialize a buffer allocation for the given tensor.pad op and lower the
74 /// op to linalg.fill/linalg.generic + bufferization.materialize_in_destination.
75 /// E.g.:
76 ///
77 /// %0 = tensor.pad low[%l] high[%h] %t ...
78 ///
79 /// is lowered to:
80 ///
81 /// %alloc = memref.alloc
82 /// linalg.fill ... outs(%alloc)
83 /// %subview = memref.subview %alloc [%l] [...] [1]
84 /// bufferization.materialize_in_destination %t in %subview
85 /// %0 = bufferization.to_tensor %alloc restrict writable
86 ///
87 /// In addition to rewriting the IR as shown above, this function returns the
88 /// newly allocated buffer. The `insertionPoint` parameter can be used to
89 /// specify a custom insertion point for the buffer allocation.
92  tensor::PadOp padOp, Attribute memorySpace = {},
93  Operation *insertionPoint = nullptr);
94 
95 /// Materialize a buffer allocation for the given vector.mask op and bufferize
96 /// the op, including its region. E.g.:
97 ///
98 /// %0 = vector.mask {
99 /// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32>
100 /// } : vector<16xi1> -> tensor<?xf32>
101 ///
102 /// is lowered to:
103 ///
104 /// %alloc = memref.alloc
105 /// bufferization.materialize_in_destination %t in %alloc
106 /// vector.mask {
107 /// vector.transfer_write %v, %alloc : vector<16xf32>, memref<?xf32>
108 /// } : vector<16xi1>
109 /// %0 = bufferization.to_tensor %alloc restrict writable
110 ///
111 /// In addition to rewriting the IR as shown above, this function returns the
112 /// newly allocated buffer. The `insertionPoint` parameter can be used to
113 /// specify a custom insertion point for the buffer allocation.
115  const BufferizeToAllocationOptions &options,
116  vector::MaskOp maskOp, Attribute memorySpace = {},
117  Operation *insertionPoint = nullptr);
118 
119 /// Materialize a buffer allocation for the given bufferization.alloc_tensor op
120 /// and lower the op to memref.alloc + memref.tensor_store.
121 ///
122 /// In addition to rewriting the IR, this function returns the newly allocated
123 /// buffer. The `insertionPoint` parameter can be used to specify a custom
124 /// insertion point for the buffer allocation.
125 Value bufferizeToAllocation(RewriterBase &rewriter,
126  const BufferizeToAllocationOptions &options,
127  bufferization::AllocTensorOp allocTensorOp,
128  Attribute memorySpace = {},
129  Operation *insertionPoint = nullptr);
130 
131 /// Bufferize the given op with tensor semantics and materialize the result in
132 /// a newly allocated buffer.
133 ///
134 /// Only bufferizable ops that bufferize to a memory write or have an
135 /// aliasing OpOperand (and do not themselves bufferize to an allocation) are
136 /// supported. They are bufferized using their BufferizableOpInterface
137 /// implementation.
138 ///
139 /// Selected ops that bufferize to an allocation (or need special handling) are
140 /// also supported:
141 /// - tensor.pad
142 /// - vector.mask
143 ///
144 /// This function returns the newly allocated buffer. The `insertionPoint`
145 /// parameter can be used to specify a custom insertion point for the buffer
146 /// allocation.
147 Value bufferizeToAllocation(RewriterBase &rewriter,
148  const BufferizeToAllocationOptions &options,
149  Operation *op, Attribute memorySpace = {},
150  Operation *insertionPoint = nullptr);
151 
152 /// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a
153 /// LinalgOp. This transform looks for LinalgOps that have an unused output
154 /// operand and an input operand that is rooted in a tensor::EmptyOp. The
155 /// tensor::EmptyOp uses are replaced with the output operand and the two
156 /// operands of the LinalgOp are swapped.
157 ///
158 /// Example:
159 /// %0 = tensor.empty()
160 /// %1 = linalg.matmul ins(...) outs(%0)
161 /// %2 = linalg.generic ins(%1) outs(%dest) {
162 /// ^bb0(%in: f32, %out: f32):
163 /// // out not used
164 /// }
165 ///
166 /// The IR is transformed as follows:
167 /// %0 = tensor.empty()
168 /// %1 = linalg.matmul ins(...) outs(%dest)
169 /// %2 = linalg.generic ins(%0) outs(%1) {
170 /// ^bb0(%in: f32, %out: f32):
171 /// // Use %out instead of %in
172 /// }
173 ///
174 /// The "ins" operand has no uses inside the body of the LinalgOp and can be
175 /// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp
176 /// can also fold away.
178  RewriterBase &rewriter, Operation *op,
179  bufferization::OneShotAnalysisState &state);
180 
181 //===----------------------------------------------------------------------===//
182 // Structs that configure the behavior of various transformations.
183 //===----------------------------------------------------------------------===//
184 
186  std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>;
187 
189  /// Computation function that returns the tile sizes for each operation.
190  /// Delayed construction of constant tile sizes should occur to interoperate
191  /// with folding.
193 
196  tileSizeComputationFunction = std::move(fun);
197  return *this;
198  }
199  /// Set the `tileSizeComputationFunction` to return the values `ts`. The
200  /// values must not fold away when tiling. Otherwise, use a more robust
201  /// `tileSizeComputationFunction`.
203  tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; };
204  return *this;
205  }
206  /// Convenience function to set the `tileSizeComputationFunction` to a
207  /// function that computes tile sizes at the point they are needed. Allows
208  /// proper interaction with folding.
210 
211  /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions.
212  /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together.
214 
215  /// The interchange vector to reorder the tiled loops.
217 
219  interchangeVector.assign(interchange.begin(), interchange.end());
220  return *this;
221  }
222 
223  /// The type of tile loops to generate.
225 
227  loopType = lt;
228  return *this;
229  }
230 
231  /// When specified, specifies distribution of generated tile loops to
232  /// processors.
233  std::optional<LinalgLoopDistributionOptions> distribution;
234 
237  distribution = std::move(distributionOptions);
238  return *this;
239  }
240 
241  /// Specification markers of how to distribute the `linalg.tiled_loop`.
243 
245  distributionTypes.assign(types.begin(), types.end());
246  return *this;
247  }
248 
249  /// Peel the specified loops.
251 
253  peeledLoops.clear();
254  peeledLoops.append(loops.begin(), loops.end());
255  return *this;
256  }
257 };
258 
260  /// Tile sizes used to tile the root operation.
263  tileSizes.assign(ts.begin(), ts.end());
264  return *this;
265  }
266  /// Tile interchange used to permute the tile loops.
268  /// When specified, specifies distribution of generated tile loops to
269  /// processors.
270  std::optional<LinalgLoopDistributionOptions> tileDistribution;
273  tileDistribution = std::move(distributionOptions);
274  return *this;
275  }
276 };
277 
279  /// A padding value for every operand.
282  paddingValues.assign(pv.begin(), pv.end());
283  return *this;
284  }
285  /// A list of iterator dimensions to pad.
288  paddingDimensions.assign(pd.begin(), pd.end());
289  return *this;
290  }
291  /// A list of multiples to which each padding dimension should be padded to.
292  std::optional<SmallVector<int64_t>> padToMultipleOf;
294  padToMultipleOf.emplace(m.begin(), m.end());
295  return *this;
296  }
297  /// A flag for every operand to mark the PadOp as nofold which enables
298  /// packing for statically shaped operands.
301  packPaddings.assign(pp.begin(), pp.end());
302  return *this;
303  }
304  /// A number of loops to hoist the PadOp out for every operand.
307  hoistPaddings.assign(hp.begin(), hp.end());
308  return *this;
309  }
310  /// A permutation vector for every operand used to transpose the packed
311  /// PadOp results.
315  transposePaddings.assign(tp.begin(), tp.end());
316  return *this;
317  }
318  enum class CopyBackOp : int8_t {
319  None = 0,
321  LinalgCopy = 2
322  };
323  /// The op to be used for copying the padded result to the original
324  /// destination tensor.
327  copyBackOp = op;
328  return *this;
329  }
330 };
331 
332 /// Callback function type used to perform the allocation for the promoted
333 /// `subView`. In `boundingSubViewSize` a best attempt is made to find the
334 /// smallest constant value for the size of the buffer needed for each
335 /// dimension. If that is not possible, it contains the dynamic size of the
336 /// subview. The callback should return the buffer to use.
337 using AllocBufferCallbackFn = std::function<std::optional<Value>(
338  OpBuilder &b, memref::SubViewOp subView,
339  ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>;
340 
341 /// Callback function type used to deallocate the buffers used to hold the
342 /// promoted subview.
344  std::function<LogicalResult(OpBuilder &b, Value buffer)>;
345 
346 /// Callback function type used to insert copy from original subview to
347 /// subview of the promoted region for the read operands/subview of promoted
348 /// region to original subview for the results. The copy has to happen from
349 /// `src` to `dst`.
351  std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
352 
354  /// Indices of subViews to promote. If `std::nullopt`, try to promote all
355  /// operands.
356  std::optional<DenseSet<unsigned>> operandsToPromote;
359  operandsToPromote->insert(operands.begin(), operands.end());
360  return *this;
361  }
362  /// If ith element of `useFullTiles` is true the full view should be used
363  /// for the promoted buffer of the ith operand in `operandsToPromote`.
364  /// Otherwise the partial view will be used. The decision is defaulted to
365  /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and
366  /// for operands missing from `useFullTileBuffers`.
367  std::optional<llvm::SmallBitVector> useFullTileBuffers;
369  unsigned size = useFullTiles.size();
370  llvm::SmallBitVector tmp(size, false);
371  for (unsigned i = 0; i < size; ++i)
372  tmp[i] = useFullTiles[i];
373  useFullTileBuffers = tmp;
374  return *this;
375  }
376  /// If true all operands unspecified by `useFullTileBuffers` will use the
377  /// full view, otherwise the partial view.
381  return *this;
382  }
383  /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment.
384  std::optional<unsigned> alignment;
386  alignment = align;
387  return *this;
388  }
389  /// Memory space of promoted buffer. If `std::nullopt` do not specify memory
390  /// space.
391  std::optional<Attribute> memorySpace;
393  memorySpace = memorySpc;
394  return *this;
395  }
396  /// Use alloca with the default allocation scheme.
397  bool useAlloca = false;
399  useAlloca = use;
400  return *this;
401  }
402  /// Callback function to do the allocation of the promoted buffer. If
403  /// std::nullopt, then the default allocation scheme of allocating a
404  /// memref<?xi8> buffer followed by a view operation is used.
405  std::optional<AllocBufferCallbackFn> allocationFn;
406  std::optional<DeallocBufferCallbackFn> deallocationFn;
409  DeallocBufferCallbackFn const &deallocFn) {
410  allocationFn = allocFn;
411  deallocationFn = deallocFn;
412  return *this;
413  }
414  /// Callback function to do the copy of data to and from the promoted
415  /// subview. If std::nullopt then a memref.copy is used.
416  std::optional<CopyCallbackFn> copyInFn;
417  std::optional<CopyCallbackFn> copyOutFn;
419  CopyCallbackFn const &copyOut) {
420  copyInFn = copyIn;
421  copyOutFn = copyOut;
422  return *this;
423  }
424 };
425 
426 /// Split Reduction options.
428  // Ratio used to split the reduction dimension. If the ratio is <= 1,
429  // nothing will be done.
430  int64_t ratio = 0;
431  // Index where the extra dimension is added to the intermediate tensor
432  // shape.
433  unsigned index = 0;
434  // If the inner dimension after splitting is parallel or reduction.
435  bool innerParallel = false;
436 };
437 
438 /// Function signature to control reduction splitting. This returns
439 /// `SplitReductionOptions`.
440 // TODO: don't use unsigned unless doing bit manipulation.
442  std::function<SplitReductionOptions(LinalgOp op)>;
443 
444 //===----------------------------------------------------------------------===//
445 // Preconditions that ensure the corresponding transformation succeeds and can
446 // be applied as a rewrite pattern.
447 //===----------------------------------------------------------------------===//
448 
449 /// Return true if two `linalg.generic` operations with producer/consumer
450 /// relationship through `fusedOperand` can be fused using elementwise op
451 /// fusion.
452 bool areElementwiseOpsFusable(OpOperand *fusedOperand);
453 
454 /// Promote memref.subviews feeding linalg-on-buffers operations.
455 LogicalResult promoteSubviewsPrecondition(Operation *op,
457 
458 /// Return success if the operation can be vectorized.
459 LogicalResult vectorizeOpPrecondition(Operation *op,
460  ArrayRef<int64_t> inputVectorSizes = {},
461  ArrayRef<bool> inputScalableVecDims = {},
462  bool vectorizeNDExtract = false,
463  bool flatten1DDepthwiseConv = false);
464 
465 //===----------------------------------------------------------------------===//
466 // Transformations exposed as functional-style API calls.
467 //===----------------------------------------------------------------------===//
468 
470 
471 /// Transformation to drop unit-extent dimensions from `linalg.generic`
472 /// operations.
475 
478 
479  using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>;
481  if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
482  return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
483  }
484  if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) {
485  return llvm::to_vector(
486  llvm::seq<unsigned>(0, padOp.getSourceType().getRank()));
487  }
488  return SmallVector<unsigned>{};
489  };
490 };
492  linalg::GenericOp resultOp;
494 };
495 FailureOr<DropUnitDimsResult> dropUnitDims(RewriterBase &rewriter,
496  GenericOp genericOp,
498 
499 /// Fuse two `linalg.generic` operations that have a producer-consumer
500 /// relationship captured through `fusedOperand`. The method expects
501 /// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.
505 };
506 FailureOr<ElementwiseOpFusionResult>
507 fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand);
508 
509 /// Returns a set of indices of the producer's results which would
510 /// be preserved after the fusion.
511 /// * There is a chance that the implementation of the transformation does not
512 /// agree with the result of this method. This function gives a prediction based
513 /// on an optimized fusion.
514 llvm::SmallDenseSet<int> getPreservedProducerResults(GenericOp producer,
515  GenericOp consumer,
516  OpOperand *fusedOperand);
517 
518 /// Try to peel and canonicalize loop `op` and return the new result.
519 /// Also applies affine_min/max bounds simplification on the fly where relevant.
520 // TODO: Add support for scf.parallel and affine.for loops.
522 
523 /// Peel `loops` and apply affine_min/max bounds simplification on the fly
524 /// where relevant.
525 void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
526 
527 /// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands
528 /// to a static bounding box. The original `opToPad` is cloned and operates on
529 /// the padded tensors.
530 ///
531 /// * "options.padToMultipleOf" indicates that each padding dimension should be
532 /// padded to the specified multiple.
533 /// * Use "options.paddingValues" and "options.packPaddings" to set padding
534 /// value and nofold attribute of the created tensor::PadOps, respectively.
535 /// * The unpadded results (extracted slice of the cloned operation) are
536 /// returned via `replacements`.
537 /// * The tensor::PadOps are returned via `padOps`.
538 /// * "options.copyBackOp" specifies the op type for copying back the unpadded
539 /// result to the original destination tensor.
540 LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
542  LinalgOp &paddedOp,
543  SmallVector<Value> &replacements,
545 
546 namespace detail {
547 
548 /// Helper struct to hold the results of building a packing loop nest.
552  GenericOp maybeTransposeOp;
553  tensor::PadOp hoistedPadOp;
554 };
555 
556 /// Build the packing loop nest required to hoist `opToHoist` above
557 /// `outermostEnclosingForOp`.
558 /// The loop nest is built just before `outermostEnclosingForOp`.
559 FailureOr<PackingResult>
560 buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
561  scf::ForOp outermostEnclosingForOp,
562  ArrayRef<int64_t> transposeVector);
563 
564 } // namespace detail
565 
566 /// Mechanically hoist padding operations on tensors by `numLoops` into a new,
567 /// generally larger tensor. This achieves packing of multiple padding ops into
568 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
569 /// in the packing loop so the caller can continue reasoning about the padding
570 /// operation. If `transposeVector` is non-empty, hoist padding introduces a
571 /// GenericOp to transpose the padded tensor before inserting it into the packed
572 /// tensor. A `transposeVector` can change the storage order of the padded
573 /// tensor but does not change the order of the pack or compute loops.
574 ///
575 /// TODO: In the future, we should consider rewriting as a tensor.pack after
576 /// hoisting since this abstraction is now available.
577 ///
578 /// Example in pseudo-mlir:
579 /// =======================
580 ///
581 /// If hoistPaddingOnTensors is called with `numLoops` = 2 on the following IR.
582 /// ```
583 /// scf.for (%i, %j, %k)
584 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
585 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
586 /// ^bb0( ... ):
587 /// linalg.yield %pad
588 /// } : tensor<?x?xf32> to tensor<4x8xf32>
589 /// compute(%0)
590 /// ```
591 ///
592 /// IR resembling the following is produced:
593 ///
594 /// ```
595 /// scf.for (%i) {
596 /// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32>
597 /// %packed = scf.for (%k) iter_args(%p : %packed_init) {
598 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
599 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
600 /// ^bb0( ... ):
601 /// linalg.yield %pad
602 /// } : tensor<?x?xf32> to tensor<4x8xf32>
603 /// %1 = tensor.insert_slice %0 ...
604 /// : tensor<4x8xf32> to tensor<?x4x8xf32>
605 /// scf.yield %1: tensor<?x4x8xf32>
606 /// } -> tensor<?x4x8xf32>
607 /// scf.for (%j, %k) {
608 /// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
609 /// tensor<?x4x8xf32> to tensor<4x8xf32>
610 /// compute(%st0)
611 /// }
612 /// }
613 /// ```
614 FailureOr<Value>
615 hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
616  int64_t numLoops, ArrayRef<int64_t> transposeVector,
617  tensor::PadOp &hoistedOp,
618  SmallVectorImpl<GenericOp> &transposeOps);
619 /// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
620 FailureOr<Value>
621 hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
622  ArrayRef<int64_t> transposeVector,
623  tensor::PadOp &hoistedOp,
624  SmallVectorImpl<GenericOp> &transposeOps);
625 
626 /// Apply padding and hoisting to `linalgOp` according to the configuration
627 /// specified in `options`.
628 FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter,
629  LinalgOp linalgOp,
631 
632 /// Split the given `op` into two parts along the given iteration space
633 /// `dimension` at the specified `splitPoint`, and return the two parts.
634 /// If the second part is statically known to be empty, do not create it
635 /// and return nullptr instead. Error state is signalled by returning
636 /// a pair of nullptrs.
637 ///
638 /// For example, the following op:
639 ///
640 /// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>)
641 /// outs(%2 : tensor<128x64xf32>)
642 ///
643 /// split along the first dimension at position 42 will result in:
644 ///
645 /// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1]
646 /// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1]
647 /// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>)
648 /// outs(%4 : tensor<42x64xf32>)
649 /// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1]
650 ///
651 /// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1]
652 /// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1]
653 /// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>)
654 /// outs(%8 : tensor<86x64xf32>)
655 /// tensor.insert_slice %9 into %6[42, 0][86, 64][1, 1]
656 ///
657 /// Note that there is no simplification other than constant propagation applied
658 /// to slice extraction and insertion.
659 std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter,
660  TilingInterface op,
661  unsigned dimension,
662  OpFoldResult splitPoint);
663 
664 /// Perform standalone tiling of a single LinalgOp by `tileSizes`
665 /// and permute the loop nest according to `interchangeVector`.
666 /// The permutation is expressed as a list of integers that specify
667 /// the new ordering of the loop nest. The length of `interchangeVector`
668 /// must be equal to the length of `tileSizes`.
669 /// An empty vector is interpreted as the identity permutation and the
670 /// transformation returns early.
671 ///
672 /// Return a struct containing the tiled loops in the specified order
673 /// and the cloned op if successful, std::nullopt otherwise.
674 ///
675 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by
676 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
677 /// integers, in the range 0..`tileSizes.size()` without duplications
678 /// (i.e. `[1,1,2]` is an invalid permutation).
680  LinalgOp op;
683 };
684 FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op,
686 
687 /// Interchange the `iterator_types` and `iterator_maps` dimensions and adapts
688 /// the index accesses of `op`. This is an in-place transformation controlled
689 /// by `interchangeVector`. An empty vector is interpreted as the identity
690 /// permutation and the transformation returns early.
691 ///
692 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with
693 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
694 /// integers, in the range 0..`op.rank` without duplications
695 /// (i.e. `[1,1,2]` is an invalid permutation).
696 ///
697 /// Return failure if the permutation is not valid.
698 FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
699  GenericOp genericOp,
700  ArrayRef<unsigned> interchangeVector);
701 
702 /// Create a GenericOp from the given named operation `namedOp` and replace
703 /// namedOp.
704 /// Return failure if `namedOp` is a GenericOp or misses a region builder.
705 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
706  LinalgOp namedOp);
707 
708 /// Create a namedOp from the given GenericOp and replace the GenericOp.
709 /// Currently we can specialize only trivial linalg copy operations.
710 FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter,
711  GenericOp genericOp);
712 
713 /// Create a new buffer using the `allocationFn` provided. The size of this
714 /// buffer is the smallest constant bounding size along each dimension that
715 /// can be computed for the size of the result of `subView`. Returns the
716 /// allocated buffer as `fullLocalView` and the view that matches the size of
717 /// the result of subview operation as `partialLocalView`.
721 };
722 FailureOr<PromotionInfo>
723 promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
724  const AllocBufferCallbackFn &allocationFn,
725  DataLayout &layout);
726 
727 /// Promote the `subViews` into a new buffer allocated at the insertion point
728 /// `b`. Promotion occurs in 3 steps:
729 /// 1. Create a new buffer for a full tile (i.e. not clipped at the
730 /// boundary).
731 /// 2. Take a full view on the buffer.
732 /// 3. Take a partial slice of the full view in step 2. and copy into it.
733 ///
734 /// Return the modified linalg op (the modification happens in place) as well
735 /// as all the copy ops created.
736 FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
738 
739 /// Allocate the subview in the GPU workgroup memory.
740 std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
741  memref::SubViewOp subview,
742  ArrayRef<Value> sizeBounds,
743  DataLayout &);
744 
745 /// In case of GPU workgroup memory there is no need to deallocate.
746 LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
747 
748 /// Create Memref copy operations and add gpu barrier guards before and after
749 /// the copy operation to ensure data integrity.
750 LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
751 
752 /// Allocate the subview in the GPU private memory.
753 std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
754  memref::SubViewOp subview,
755  ArrayRef<Value> sizeBounds,
756  DataLayout &);
757 
758 /// Normal copy between `src` and `dst`.
759 LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
760 
761 /// In case of GPU private memory there is no need to deallocate since the
762 /// memory is freed when going outside of the scope.
763 LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
764 
765 /// Emit a suitable vector form for an operation. If provided,
766 /// `inputVectorSizes` are used to vectorize this operation. `inputVectorSizes`
767 /// must match the rank of the iteration space of the operation and the sizes
768 /// must be smaller than or equal to their counterpart iteration space sizes,
769 /// if static. `inputVectorSizes` also allows the vectorization of operations
770 /// with dynamic shapes.
771 LogicalResult vectorize(RewriterBase &rewriter, Operation *op,
772  ArrayRef<int64_t> inputVectorSizes = {},
773  ArrayRef<bool> inputScalableVecDims = {},
774  bool vectorizeNDExtract = false,
775  bool flatten1DDepthwiseConv = false);
776 
777 /// Emit a suitable vector form for a Copy op with fully static shape.
778 LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
779 
780 /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`.
781 FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter,
782  LinalgOp linalgOp);
783 
784 /// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`.
785 FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter,
786  LinalgOp linalgOp);
787 
788 /// Emit a loop nest of `affine.for` with the proper body for `linalgOp`.
789 FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter,
790  LinalgOp linalgOp);
791 
792 /// Creates a number of ranges equal to the number of non-zeros in `tileSizes`.
793 /// One for each loop of the LinalgOp that is tiled. The `tileSizes` argument
794 /// has one entry per surrounding loop. It uses zero as the convention that a
795 /// particular loop is not tiled. This convention simplifies implementations
796 /// by avoiding affine map manipulations. The returned ranges correspond to
797 /// the loop ranges, in the proper order, that are tiled and for which new
798 /// loops will be created. Also the function returns a map from loop indices
799 /// of the LinalgOp to the corresponding non-empty range indices of newly
800 /// created loops.
802 std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
804  ArrayRef<OpFoldResult> allShapeSizes,
805  ArrayRef<OpFoldResult> allTileSizes);
806 
807 namespace detail {
808 template <typename T>
810  /// Tile sizes.
812  /// Number of tiles associated with each size.
814 };
815 
816 template <typename T>
818  /// Tile sizes.
820  /// Number of tiles associated with each size.
822 };
823 
824 } // namespace detail
825 
826 /// A description of a multi-size tiling comprising tile sizes and numbers of
827 /// tiles, expressed as Values which may or may not be constant. Multi-size
828 /// currently means two-size.
830  : public detail::MultiSizeSpecificationBase<Value> {};
832  : public detail::MultiSizeSpecificationBase<int64_t> {};
833 
837  : public detail::ContinuousTileSizeSpecificationBase<int64_t> {};
838 
839 /// Emits the IR computing the multi-sized tiling specification with two tile
840 /// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such
841 /// that there exist numbers of tiles with these sizes that fully cover the
842 /// given iteration space `dimension` of the structured `op`.
843 ///
844 /// The computation is as follows:
845 ///
846 /// b = originalTripCount floordiv sizeDivisor
847 /// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor
848 /// d = (b + t - 1) floordiv t
849 /// s = (b floordiv d) * sizeDivisor
850 /// v = b % d
851 /// u = d - v
852 ///
853 /// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of
854 /// the corresponding tiles are `u` and `v`, respectively. Alternatively,
855 ///
856 /// s * u + (s + sizeDivisor) * v == original size,
857 /// where s mod sizeDivisor = 0.
858 ///
859 /// Expects all values to be positive. In some cases with the target tile size
860 /// sufficiently close to the dimension shape and non-unit divisor, it is
861 /// impossible to compute such sizes. If `emitAssertions` is set, also emit the
862 /// assertion that size computation succeeded.
863 ///
864 /// Returns the specification consisting of both tile values and the number of
865 /// tiles of each size.
866 FailureOr<MultiSizeSpecification>
867 computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension,
868  OpFoldResult targetSize, OpFoldResult divisor,
869  bool emitAssertions = true);
870 FailureOr<StaticMultiSizeSpecification>
871 computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize,
872  int64_t divisor);
873 
874 FailureOr<StaticContinuousTileSizeSpecification>
875 computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension,
876  unsigned targetSize);
877 FailureOr<ContinuousTileSizeSpecification>
878 computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
879  unsigned dimension, OpFoldResult targetSize,
880  bool emitAssertions);
881 
882 /// Transformation information returned after reduction tiling.
884  /// The partial reduction tiled op generated.
886  /// The final reduction operation merging all the partial reductions.
888  /// Initial values used for partial reductions.
890  /// The `scf.forall` operation that iterates over the tiles.
891  scf::ForallOp loops;
892 };
893 
894 /// Method to tile a reduction to parallel iterations computing partial
895 /// reductions. After the loop, all the partial reductions are merged into a
896 /// final reduction. For example, this transforms the following sequence
897 ///
898 /// ```mlir
899 /// %0 = linalg.generic %in ["parallel", "reduction"]
900 /// : tensor<7x9xf32> -> tensor<7xf32>
901 /// ```
902 ///
903 /// into:
904 ///
905 /// ```mlir
906 /// %0 = linalg.fill ... : tensor<7x4xf32>
907 /// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0)
908 /// -> (tensor<7x4xf32>) {
909 /// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> to tensor<7xf32>
910 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
911 /// %4 = linalg.generic %2, %3 ["parallel", "reduction"]
912 /// : tensor<7x?xf32> -> tensor<7xf32>
913 /// %5 = tensor.insert_slice %4, %arg0[0, %iv] : tensor<7x4xf32>
914 /// }
915 /// %6 = linalg.generic %1 ["parallel", "reduction"]
916 /// : tensor<7x4xf32> -> tensor<7xf32>
917 /// ```
918 FailureOr<ForallReductionTilingResult>
919 tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op,
920  ArrayRef<OpFoldResult> numThreads,
921  ArrayRef<OpFoldResult> tileSizes = {},
922  std::optional<ArrayAttr> mapping = std::nullopt);
923 
924 /// All indices returned by IndexOp should be invariant with respect to
925 /// tiling. Therefore, if an operation is tiled, we have to transform the
926 /// indices accordingly, i.e. offset them by the values of the corresponding
927 /// induction variables that are captured implicitly in the body of the op.
928 ///
929 /// Example. `linalg.generic` before tiling:
930 ///
931 /// #id_2d = (i, j) -> (i, j)
932 /// #pointwise_2d_trait = {
933 /// indexing_maps = [#id_2d, #id_2d],
934 /// iterator_types = ["parallel", "parallel"]
935 /// }
936 /// linalg.generic #pointwise_2d_trait %operand, %result {
937 /// ^bb0(%operand_in: f32, %result_in: f32):
938 /// %i = linalg.index 0 : index
939 /// %j = linalg.index 1 : index
940 /// <some operations that use %i, %j>
941 /// }: memref<50x100xf32>, memref<50x100xf32>
942 ///
943 /// After tiling pass with tiles sizes 10 and 25:
944 ///
945 /// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2)
946 ///
947 /// %c1 = arith.constant 1 : index
948 /// %c0 = arith.constant 0 : index
949 /// %c25 = arith.constant 25 : index
950 /// %c10 = arith.constant 10 : index
951 /// operand_dim_0 = dim %operand, 0 : memref<50x100xf32>
952 /// operand_dim_1 = dim %operand, 1 : memref<50x100xf32>
953 /// scf.for %k = %c0 to operand_dim_0 step %c10 {
954 /// scf.for %l = %c0 to operand_dim_1 step %c25 {
955 /// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1]
956 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
957 /// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1]
958 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
959 /// linalg.generic #pointwise_2d_trait %4, %5 {
960 /// ^bb0(%operand_in: f32, %result_in: f32):
961 /// %i = linalg.index 0 : index
962 /// %j = linalg.index 1 : index
963 /// // Indices `k` and `l` are implicitly captured in the body.
964 /// // Index `i` is offset by %k.
965 /// %transformed_i = arith.addi %i, %k : index
966 /// // Index `j` is offset by %l.
967 /// %transformed_j = arith.addi %j, %l : index
968 /// // Every use of %i, %j is replaced with %transformed_i, %transformed_j.
969 /// <some operations that use %transformed_i, %transformed_j>
970 /// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided>
971 /// }
972 /// }
973 ///
974 /// TODO: Investigate whether mixing implicit and explicit indices
975 /// does not lead to losing information.
976 void transformIndexOps(RewriterBase &b, LinalgOp op,
978  const LoopIndexToRangeIndexMap &loopIndexToRangeIndex);
979 
980 /// Apply transformation to split the single linalg op reduction into a
981 /// parallel and reduction dimension. Then create a new linalg.generic op
982 /// doing the rest of the reduction. Return the new linalg op with an extra
983 /// parallel dimension or failure if the transformation didn't happen.
984 ///
985 /// Example:
986 /// ```
987 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
988 /// affine_map<(d0) -> ()>],
989 /// iterator_types = ["reduction"]}
990 /// ins(%in : tensor<32xf32>)
991 /// outs(%out : tensor<f32>) {
992 /// ^bb0(%arg1: f32, %arg2: f32):
993 /// %y = arith.addf %arg1, %arg2 : f32
994 /// linalg.yield %y : f32
995 /// } -> tensor<f32>
996 /// ```
997 /// To:
998 /// ```
999 /// %cst = arith.constant 0.000000e+00 : f32
1000 /// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
1001 /// %1 = tensor.empty [4] : tensor<4xf32>
1002 /// %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
1003 /// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
1004 /// affine_map<(d0, d1) -> (d0)>],
1005 /// iterator_types = ["parallel", "reduction"]}
1006 /// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
1007 /// ^bb0(%arg3: f32, %arg4: f32):
1008 /// %5 = arith.addf %arg3, %arg4 : f32
1009 /// linalg.yield %5 : f32
1010 /// } -> tensor<4xf32>
1011 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
1012 /// affine_map<(d0) -> ()>],
1013 /// iterator_types = ["reduction"]}
1014 /// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) {
1015 /// ^bb0(%arg3: f32, %arg4: f32):
1016 /// %5 = arith.addf %arg3, %arg4 : f32
1017 /// linalg.yield %5 : f32
1018 /// } -> tensor<f32>
1019 /// ```
1022  FillOp fillOp;
1023  LinalgOp splitLinalgOp;
1025 };
1026 FailureOr<SplitReductionResult>
1027 splitReduction(RewriterBase &b, LinalgOp op,
1028  const ControlSplitReductionFn &controlSplitReductionFn,
1029  bool useAlloc = false);
1030 
1031 /// Scaling-based implementation of the split reduction transformation.
1032 /// Instead of introducing an ExpandShapeOp, this rewrites a reduction
1033 /// dimension `k` into `k * scale + kk`.
1034 ///
1035 /// Example:
1036 /// ```
1037 /// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
1038 /// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
1039 /// ```
1040 ///
1041 /// Is transformed to:
1042 ///
1043 /// ```
1044 /// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
1045 /// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
1046 /// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
1047 /// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
1048 /// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
1049 /// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
1050 /// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32>
1051 /// %cst = arith.constant 0.000000e+00 : f32
1052 /// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
1053 /// tensor<16x32x64xf32>
1054 /// %2 = tensor.empty [64, 4] : tensor<64x4xi1>
1055 ///
1056 /// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
1057 /// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
1058 /// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>,
1059 /// tensor<64x4xi1>)
1060 /// outs(%1 : tensor<16x32x64xf32>) {
1061 /// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
1062 /// %5 = arith.mulf %arg3, %arg4 : f32
1063 /// %6 = arith.addf %arg6, %5 : f32
1064 /// linalg.yield %6 : f32
1065 /// } -> tensor<16x32x64xf32>
1066 ///
1067 /// %4 = linalg.generic {indexing_maps = [#map4, #map5],
1068 /// iterator_types = ["parallel", "parallel", "reduction"]}
1069 /// ins(%3 : tensor<16x32x64xf32>)
1070 /// outs(%C : tensor<16x32xf32>) {
1071 /// ^bb0(%arg3: f32, %arg4: f32):
1072 /// %5 = arith.addf %arg3, %arg4 : f32
1073 /// linalg.yield %5 : f32
1074 /// } -> tensor<16x32xf32>
1075 ///
1076 /// return %4 : tensor<16x32xf32>
1077 /// ```
1078 FailureOr<SplitReductionResult>
1079 splitReductionByScaling(RewriterBase &b, LinalgOp op,
1080  const ControlSplitReductionFn &controlSplitReductionFn,
1081  bool useAlloc = false);
1082 
1083 /// Return `true` if a given sequence of dimensions are contiguous in the
1084 /// range of the specified indexing map.
1086 /// Return `true` if all sequences of dimensions specified in `dimSequences` are
1087 /// contiguous in all the ranges of the `maps`.
1089  ArrayRef<ReassociationIndices> dimSequences);
1090 
1093  LinalgOp collapsedOp;
1094 };
1095 
1096 /// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition
1097 /// to calling this method is that for each list in `foldedIterationDims`, the
1098 /// sequence of dimensions is contiguous in domains of all `indexing_maps` of
1099 /// the `linalgOp`. This can be checked using `areDimSequencePreserved` method.
1100 /// When valid, the method also collapses the operands of the op. Returns
1101 /// replacement values of the results of the original `linalgOp` by inserting
1102 /// reshapes to get back values of compatible types.
1103 FailureOr<CollapseResult>
1104 collapseOpIterationDims(LinalgOp op,
1105  ArrayRef<ReassociationIndices> foldedIterationDims,
1106  RewriterBase &rewriter);
1107 
1109  tensor::PadOp padOp;
1110  tensor::ExpandShapeOp expandShapeOp;
1111  linalg::TransposeOp transposeOp;
1112 };
1113 
1114 /// Rewrite pack as pad + reshape + transpose.
1115 FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter,
1116  tensor::PackOp packOp);
1117 
1119  tensor::EmptyOp emptyOp;
1120  linalg::TransposeOp transposeOp;
1121  tensor::CollapseShapeOp collapseShapeOp;
1122  tensor::ExtractSliceOp extractSliceOp;
1123 };
1124 
1125 /// Rewrite unpack as empty + transpose + reshape + extract_slice.
1126 FailureOr<LowerUnPackOpResult> lowerUnPack(RewriterBase &rewriter,
1127  tensor::UnPackOp unPackOp);
1128 
1129 /// Struct to hold the result of a `pack` call.
1130 struct PackResult {
1132  linalg::LinalgOp packedLinalgOp;
1134 };
1135 /// Implement packing of a single LinalgOp by `packedSizes`.
1136 /// There must be one packedSizes entry per `linalgOp` iterator.
1137 /// Return the packed Linalg op on success, failure otherwise.
1138 FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1139  ArrayRef<OpFoldResult> packedSizes);
1140 
1141 /// Struct to hold the result of a `packTranspose` call.
1143  tensor::PackOp transposedPackOp;
1144  linalg::LinalgOp transposedLinalgOp;
1145  tensor::UnPackOp transposedUnPackOp;
1146 };
1147 /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the
1148 /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements.
1149 /// Return failure if any of the following holds:
1150 /// 1. the `packOp` does not have the `linalgOp` as its unique use.
1151 /// 2. the `maybeUnPackOp`, if specified, is not a consumer of the result tied
1152 /// to the unique `packOp` use.
1153 /// 3. `outerPerm` (resp. `innerPerm`) is neither empty nor a valid permutation
1154 /// of `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`).
1155 FailureOr<PackTransposeResult>
1156 packTranspose(RewriterBase &rewriter, tensor::PackOp packOp,
1157  linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp,
1158  ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm);
1159 
1160 /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m
1161 /// and n are proper parallel dimensions and k is a proper reduction
1162 /// dimension. Packing occurs by rewriting the op as a linalg.generic and
1163 /// calling linalg::pack by `mnkPackedSizes`. The order of the packed
1164 /// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2}
1165 /// to reorder {m, n, k} into one of the 6 possible forms. The outer
1166 /// dimensions of the operands are not permuted at this time, this is left for
1167 /// future work.
1168 FailureOr<PackResult>
1169 packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp,
1170  ArrayRef<OpFoldResult> mnkPackedSizes,
1171  ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf,
1172  ArrayRef<int64_t> mnkOrder);
1173 
1175  /// Minor block factors (mb, nb, kb) for packing relayout where mb, nb are
1176  /// the parallel dimensions and kb is the reduction dimension.
1178 
1179  /// If true, allows packing of dimensions that only partially fit into the
1180  /// block factors.
1181  bool allowPadding = true;
1182 
1183  /// Next multiples of the packing sizes.
1185 
1186  /// Permutation of matmul (M, N, K) dimensions order.
1188 
1189  /// Transpose LHS outer block layout [MB][KB] -> [KB][MB].
1191 
1192  /// Transpose LHS inner block layout [mb][kb] -> [kb][mb].
1194 
1195  /// Transpose RHS outer block layout [KB][NB] -> [NB][KB].
1197 
1198  /// Transpose RHS inner block layout [kb][nb] -> [nb][kb].
1200 };
1201 
1202 /// Function type which is used to control matmul packing.
1203 /// It is expected to return valid packing configuration for each operation.
1204 /// Lack of packing options indicates that no valid configuration could be
1205 /// assigned and the operation will not be packed.
1207  std::function<std::optional<BlockPackMatmulOptions>(linalg::LinalgOp)>;
1208 
1209 /// Pack a matmul operation into blocked 4D layout.
1210 ///
1211 /// Relayout a matmul operation into blocked layout with two levels of
1212 /// subdivision:
1213 /// - major 2D blocks - outer dimensions, consist of minor blocks
1214 /// - minor 2D blocks - inner dimensions, consist of scalar elements
1215 ///
1216 /// A 2D matmul MxNxK gets reshaped into blocked 4D representation
1217 /// as: [MB][NB][mb][nb] += [MB][KB][mb][kb] * [NB][KB][nb][kb]
1218 /// where the (MB, NB, KB) dimensions represent the major blocks,
1219 /// and the (mb, nb, kb) are the minor blocks of their respective
1220 /// original 2D dimensions (M, N, K).
1221 ///
1222 /// Depending on the initial operands' data layout and the specified
1223 /// packing options, the major blocks dimensions might get transposed
1224 /// e.g., [MB][KB] -> [KB][MB]. The minor blocks can also be transposed
1225 /// e.g., [mb][kb] -> [kb][mb].
1226 /// Any present batch dimensions remain unchanged.
1227 /// The final result is unpacked back to the original shape.
1228 ///
1229 /// Return failure if no valid packing options are provided.
1230 FailureOr<PackResult>
1231 blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1232  const ControlBlockPackMatmulFn &controlPackMatmul);
1233 
1234 /// Rewrite tensor.from_elements to linalg.generic.
1235 FailureOr<Operation *>
1237  tensor::FromElementsOp fromElementsOp);
1238 
1239 /// Rewrite tensor.generate to linalg.generic.
1240 FailureOr<Operation *>
1242  tensor::GenerateOp generateOp);
1243 
1244 /// Rewrite tensor.pad to linalg.generic + tensor.insert_slice.
1245 FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1246  tensor::PadOp padOp);
1247 
1248 /// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing)
1249 /// and linalg.matmul.
1250 ///
1251 /// A convolution operation can be written as a matrix-matrix multiplication by
1252 /// unfolding the cross-correlation between input and filter and explicitly copy
1253 /// overlapped sliding window inputs.
1254 ///
1255 /// Consider 2D input X with single channel input and output and 2x2 filter W:
1256 /// [x(0, 0) , x(0, 1) , ..., x(0, n) ]
1257 /// [x(1, 0) , x(1, 1) , ..., x(1, n) ]
1258 /// [. , . ,. , . ] [w(0, 0), w(0, 1)]
1259 /// [. , . , . , . ] (conv) [w(1, 0), w(1, 1)]
1260 /// [. , . , ., . ]
1261 /// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)]
1262 ///
1263 /// The packed input data (img2col) is a matrix with |rows| = output spatial
1264 /// size, |columns| = filter spatial size. To compute the output Y(i, j) we need
1265 /// to calculate the dot product between the filter window at input X(x, y) and
1266 /// the filter, which will look like the following, where the l.h.s is the
1267 /// img2col matrix and the r.h.s is the flattened filter:
1268 ///
1269 /// [x(0,0), x(0,1), x(1,0), x(1,1)]
1270 /// [x(0,1), x(1,1), x(0,2), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)]
1271 /// [x(0,1), x(1,1), x(0,2), x(1,2)]
1272 /// [ . , . , . , . ]
1273 ///
1274 /// In general for 2D case with (N, H, W, C) input and (Kh, Kw, C, D) filter
1275 /// and output (N, Ho, Wo, D) the convolution is the following matrix-matrix
1276 /// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each of the N
1277 /// inputs. For the case where N > 1 it's a batched matrix-matrix
1278 /// multiplication.
1279 ///
1280 /// On success, return both the operation that produces the img2col tensor and
1281 /// the final operation of the sequence that replaces the original convolution.
1282 FailureOr<std::pair<Operation *, Operation *>>
1283 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp);
1284 
1285 /// Same as the above but for Fhwc channel orderings in the filter. In this case
1286 /// the matrix multiplication is actually a row-wise dot-product rather than a
1287 /// row-column dot-product. This is to avoid transposing the filter matrix which
1288 /// would be required for a regular matrix multiplication to produce the correct
1289 /// output dimensions.
1290 FailureOr<std::pair<Operation *, Operation *>>
1291 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp);
1292 
1293 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except there is no
1294 /// reduction among the input channels so each convolution can be a
1295 /// matrix-vector product and by transposing both input and filter so channels
1296 /// are outermost the computation is a batched matrix-vector product.
1297 FailureOr<std::pair<Operation *, Operation *>>
1298 rewriteInIm2Col(RewriterBase &rewriter,
1299  linalg::DepthwiseConv2DNhwcHwcOp convOp);
1300 
1301 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except because the
1302 /// channels are to the left of the image shape dimensions, the position of the
1303 /// contraction dimension in the resulting matmul is reversed. This swaps the
1304 /// LHS and RHS of the matmul when compared with nhwc (i.e. (D, C x Kh x Kw) *
1305 /// (C x Kh x Kw, Ho x Wo))
1306 FailureOr<std::pair<Operation *, Operation *>>
1307 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp);
1308 
1309 /// Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by
1310 /// materializing transpose.
1311 FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1312  linalg::Conv2DNhwcFhwcOp op);
1313 FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1314  linalg::Conv2DNhwcFhwcQOp op);
1315 
1316 /// Convert Linalg matmul ops to transposed variants.
1317 FailureOr<Operation *> transposeMatmul(RewriterBase &rewriter,
1318  linalg::MatmulOp op,
1319  bool transposeLHS = true);
1320 FailureOr<Operation *> transposeBatchMatmul(RewriterBase &rewriter,
1321  linalg::BatchMatmulOp op,
1322  bool transposeLHS = true);
1323 
1324 /// Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm
1325 /// F(m x m, r x r). m is the dimension size of output and r is the dimension
1326 /// size of filter.
1327 FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
1328  linalg::Conv2DNhwcFhwcOp op, int64_t m,
1329  int64_t r);
1330 
1331 /// Rewrite linalg.winograd_filter_transform. The data layout of the filter is
1332 /// FHWC. The transformation matrix is 2-dimensional. We need to extract H x W
1333 /// from FHWC first. We generate 2 levels of loops to iterate on F and C. After
1334 /// the rewriting, we get
1335 ///
1336 /// scf.for %f = lo_f to hi_f step 1
1337 /// scf.for %c = lo_c to hi_c step 1
1338 /// %extracted = extract filter<h x w> from filter<f x h x w x c>
1339 /// %ret = linalg.matmul G, %extracted
1340 /// %ret = linalg.matmul %ret, GT
1341 /// %inserted = insert %ret into filter<h x w x c x f>
1342 FailureOr<Operation *>
1344  linalg::WinogradFilterTransformOp op);
1345 
1346 /// Rewrite linalg.winograd_input_transform. The data layout of the input is
1347 /// NHWC. The transformation matrix is 2-dimensional. We need to extract H x W
1348 /// from NHWC first. We generate 4 levels of loops to iterate on N, C, tileH,
1349 /// and tileW. After the rewriting, we get
1350 ///
1351 /// scf.for %h = 0 to tileH step 1
1352 /// scf.for %w = 0 to tileW step 1
1353 /// scf.for %n = 0 to N step 1
1354 /// scf.for %c = 0 to C step 1
1355 /// %extracted = extract %extracted<alphaH x alphaW> from
1356 /// %input<N x H x W x C>
1357 /// at [%n, (%h x m), (%w x m), %c]
1358 /// %ret = linalg.matmul BT, %extracted
1359 /// %ret = linalg.matmul %ret, B
1360 /// %inserted = insert %ret<alphaH x alphaW> into
1361 /// %output<alphaH x alphaW x tileH x tileW x N x C>
1362 /// at [0, 0, %h, %w, %n, %c]
1363 FailureOr<Operation *>
1365  linalg::WinogradInputTransformOp op);
1366 
1367 /// Rewrite linalg.winograd_output_transform. The data layout of the output is
1368 /// HWNF. The transformation matrix is 2-dimensional. We need to extract H x W
1369 /// from HWNF first. We generate 4 levels of loops to iterate on N, F, tileH,
1370 /// and tileW. After the transformation, we get
1371 ///
1372 /// scf.for %h = 0 to tileH step 1
1373 /// scf.for %w = 0 to tileW step 1
1374 /// scf.for %n = 0 to N step 1
1375 /// scf.for %f = 0 to F step 1
1376 /// %extracted = extract %extracted<alphaH x alphaW> from
1377 /// %input<alphaH x alphaW x tileH x tileW x N x F>
1378 /// at [0, 0, %h, %w, %n, %f]
1379 /// %ret = linalg.matmul AT, %extracted
1380 /// %ret = linalg.matmul %ret, A
1381 /// %inserted = insert %ret<alphaH x alphaW> into
1382 /// output<N x H x W x F>
1383 /// at [%n, (%h x m), (%w x m), %f]
1384 FailureOr<Operation *>
1386  linalg::WinogradOutputTransformOp op);
1387 
1388 //===----------------------------------------------------------------------===//
1389 // Rewrite patterns wrapping transformations.
1390 // TODO: every single such pattern should be a close to noop wrapper around a
1391 // functional-style API call.
1392 //===----------------------------------------------------------------------===//
1393 
1394 /// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D
1395 /// convolution ops.
1396 template <typename Conv2DOp, typename Conv1DOp>
1398  : public OpRewritePattern<Conv2DOp> {
1400 
1401  FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp,
1402  PatternRewriter &rewriter) const;
1403 
1404  LogicalResult matchAndRewrite(Conv2DOp convOp,
1405  PatternRewriter &rewriter) const override {
1406  return returningMatchAndRewrite(convOp, rewriter);
1407  }
1408 };
1409 
1410 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
1411  Conv1DNwcWcfOp>;
1412 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
1413  Conv1DNcwFcwOp>;
1414 
1415 /// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh)
1416 /// dimensions into 1-D depthwise convolution ops.
1418  : public OpRewritePattern<DepthwiseConv2DNhwcHwcOp> {
1420  PatternBenefit benefit = 1)
1421  : OpRewritePattern<DepthwiseConv2DNhwcHwcOp>(context, benefit) {}
1422 
1423  FailureOr<DepthwiseConv1DNwcWcOp>
1424  returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1425  PatternRewriter &rewriter) const;
1426 
1427  LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1428  PatternRewriter &rewriter) const override {
1429  return returningMatchAndRewrite(convOp, rewriter);
1430  }
1431 };
1432 
1433 struct DownscaleConv2DOp final : public OpRewritePattern<Conv2DOp> {
1435  : OpRewritePattern<Conv2DOp>(context, benefit) {}
1436 
1437  FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp,
1438  PatternRewriter &rewriter) const;
1439 
1440  LogicalResult matchAndRewrite(Conv2DOp convOp,
1441  PatternRewriter &rewriter) const override {
1442  return returningMatchAndRewrite(convOp, rewriter);
1443  }
1444 };
1445 
1446 ///
1447 /// Linalg generalization pattern.
1448 ///
1449 /// Apply the `generalization` transformation as a pattern.
1450 /// See `generalization` for more details.
1451 //
1452 // TODO: Automatic default pattern class that just unwraps a function
1453 // returning FailureOr<GenericOp>.
1455  : public OpInterfaceRewritePattern<LinalgOp> {
1457 
1458  /// `matchAndRewrite` implementation that returns the significant
1459  /// transformed pieces of IR.
1460  FailureOr<GenericOp>
1461  returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const {
1462  return generalizeNamedOp(rewriter, op);
1463  }
1464 
1465  LogicalResult matchAndRewrite(LinalgOp op,
1466  PatternRewriter &rewriter) const override {
1467  return returningMatchAndRewrite(op, rewriter);
1468  }
1469 };
1470 
1471 struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
1473 
1474  FailureOr<GenericOp>
1475  returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const {
1476  return specializeGenericOp(rewriter, op);
1477  }
1478 
1479  LogicalResult matchAndRewrite(GenericOp op,
1480  PatternRewriter &rewriter) const override {
1481  return returningMatchAndRewrite(op, rewriter);
1482  }
1483 };
1484 
1485 /// Vectorization pattern for memref::CopyOp.
1486 struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> {
1488 
1489  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
1490  PatternRewriter &rewriter) const override;
1491 };
1492 
1494  std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>;
1495 
1496 /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
1497 /// InsertSliceOp. For now, only constant padding values are supported.
1498 /// `OptimizeCopyFn` can be used to customize copying step optimization.
1499 struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> {
1501  OptimizeCopyFn optimizeCopyFn = nullptr,
1502  PatternBenefit benefit = 1)
1503  : OpRewritePattern<tensor::PadOp>(context, benefit),
1504  optimizeCopyFn(std::move(optimizeCopyFn)) {}
1505  LogicalResult matchAndRewrite(tensor::PadOp padOp,
1506  PatternRewriter &rewriter) const override;
1507 
1508 protected:
1510  Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp,
1511  Value dest,
1512  const SmallVector<Value> &dynSizes) const;
1513 };
1514 
1515 /// Rewrites a tensor::PackOp into a sequence of tensor.pad + linalg.transpose +
1516 /// tensor.insert_slice ops, where the tensor::PackOp has outer dims being all
1517 /// 1s.
1519  : public OpRewritePattern<tensor::PackOp> {
1521  LogicalResult matchAndRewrite(tensor::PackOp packOp,
1522  PatternRewriter &rewriter) const override;
1523 };
1524 
1525 /// Rewrites a tensor::UnPackOp into a sequence of rank-reduced extract_slice op
1526 /// + transpose op + insert_slice op, where the tensor::UnPackOp has outer dims
1527 /// being all 1s.
1529  : public OpRewritePattern<tensor::UnPackOp> {
1531  LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp,
1532  PatternRewriter &rewriter) const override;
1533 };
1534 
1535 /// Match and rewrite for the pattern:
1536 /// ```
1537 /// %alloc = ...
1538 /// [optional] %view = memref.view %alloc ...
1539 /// %subView = subview %allocOrView ...
1540 /// [optional] linalg.fill(%allocOrView, %cst) ...
1541 /// ...
1542 /// memref.copy(%in, %subView) ...
1543 /// vector.transfer_read %allocOrView[...], %cst ...
1544 /// ```
1545 /// into
1546 /// ```
1547 /// [unchanged] %alloc = ...
1548 /// [unchanged] [optional] %view = memref.view %alloc ...
1549 /// [unchanged] [unchanged] %subView = subview %allocOrView ...
1550 /// ...
1551 /// vector.transfer_read %in[...], %cst ...
1552 /// ```
1553 /// Where there is no interleaved use between memref.copy and transfer_read as
1554 /// well as no interleaved use between linalg.fill and memref.copy (if
1555 /// linalg.fill is specified).
1556 /// This is a custom rewrite to forward partial reads (with optional fills) to
1557 /// vector.transfer_read.
1559  : public OpRewritePattern<vector::TransferReadOp> {
1561 
1562  LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
1563  PatternRewriter &rewriter) const override;
1564 };
1565 
1566 /// Match and rewrite for the pattern:
1567 /// ```
1568 /// %alloc = ...
1569 /// [optional] %view = memref.view %alloc ...
1570 /// %subView = subview %allocOrView...
1571 /// ...
1572 /// vector.transfer_write %..., %allocOrView[...]
1573 /// memref.copy(%subView, %out)
1574 /// ```
1575 /// into
1576 /// ```
1577 /// [unchanged] %alloc = ...
1578 /// [unchanged] [optional] %view = memref.view %alloc ...
1579 /// [unchanged] %subView = subview %allocOrView...
1580 /// ...
1581 /// vector.transfer_write %..., %out[...]
1582 /// ```
1583 /// Where there is no interleaved use between transfer_write and memref.copy.
1584 /// This is a custom rewrite to forward partial writes to
1585 /// vector.transfer_write.
1587  : public OpRewritePattern<vector::TransferWriteOp> {
1589 
1590  LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
1591  PatternRewriter &rewriter) const override;
1592 };
1593 
1594 /// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
1596  : public OpRewritePattern<tensor::ExtractSliceOp> {
1597  /// A function to control pattern application and rewrite logic.
1598  ///
1599  /// The function will be given the slice op and should return:
1600  /// - std::nullopt: to fail the match and not apply the pattern;
1601  /// - true: to apply the pattern with zero slice guard;
1602  /// - false: to apply the pattern without zero slice guard.
1603  ///
1604  /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice
1605  /// guard.
1606  using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>;
1607 
1609  ControlFn controlFn = nullptr,
1610  PatternBenefit benefit = 1)
1611  : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {}
1612 
1613  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
1614  PatternRewriter &rewriter) const override;
1615 
1616 private:
1617  ControlFn controlFn;
1618 };
1619 
1620 //===----------------------------------------------------------------------===//
1621 // Populate functions.
1622 //===----------------------------------------------------------------------===//
1623 
1624 /// Canonicalization patterns relevant to apply after tiling patterns. These
1625 /// are applied automatically by the tiling pass but need to be applied
1626 /// manually when tiling is called programmatically.
1629 
1630 /// Linalg generalization patterns
1631 
1632 /// Populates `patterns` with patterns to convert spec-generated named ops to
1633 /// linalg.generic ops.
1635 
1636 /// Populates `patterns` with patterns to convert linalg.generic ops to named
1637 /// ops where possible. A linalg.generic can represent wide range and complex
1638 /// computations for which equivalent linalg named op may not exist e.g.
1639 /// linalg.generic that takes a tensor and computes a polynomial such as:
1640 /// p(x) = an*x^n + ... + a1x + a0
1641 /// There is no equivalent named op to convert to. Many such cases exist.
1643  RewritePatternSet &patterns);
1644 
1645 /// Linalg decompose convolutions patterns
1646 
1647 /// Populates patterns to decompose high-D convolution ops into low-D ones.
1648 /// This is a step in progressive lowering for convolution ops, afterwards we
1649 /// can vectorize the low-D convolution ops.
1651  PatternBenefit benefit = 1);
1652 
1653 /// Populates patterns to transform linalg.conv_2d_xxx operations into
1654 /// linalg.generic (for img2col packing) and linalg.matmul.
1655 /// \see rewriteInIm2Col for more details.
1657 
1658 /// Populates `patterns` with patterns that vectorize tensor.pad.
1659 /// These patterns are meant to apply in a complementary fashion. Benefits
1660 /// are used to encode a certain ordering of pattern application. To avoid
1661 /// scattering magic constants throughout the code base, the patterns must be
1662 /// added with this function. `baseBenefit` can be used to offset the benefit
1663 /// of all tensor::PadOp vectorization patterns by a certain value.
1665  PatternBenefit baseBenefit = 1);
1666 
1667 /// Populate patterns for splitting a `LinalgOp` with multiple statements within
1668 /// its payload into multiple `GenericOp` that have a single statement.
1669 /// The option `removeDeadArgsAndResults` adds patterns to remove dead arguments
1670 /// and results from the generated decomposed ops. This is default `true` since
1671 /// the core decomposition patterns rely on these clean up patterns. It is set
1672 /// to false only for testing purposes.
1674  bool removeDeadArgsAndResults = true);
1675 
1676 /// Populate patterns that convert non-destination-style ops to destination
1677 /// style ops.
1679 
1680 /// Populate patterns for vectorizing low-D convolution ops. This is a step in
1681 /// progressive lowering for convolution ops, it assumes high-D convolution ops
1682 /// were decomposed previously.
1684  PatternBenefit benefit = 1);
1685 
1686 /// Populate patterns that convert `ElementwiseMappable` ops to linalg
1687 /// parallel loops.
1689 
1690 /// Populate patterns that are only useful in the context of sparse tensors.
1692 
1693 /// Function type which is used to control when to stop fusion. It is expected
1694 /// that OpOperand is not modified in the callback. The OpOperand is not marked
1695 /// as const to allow callers to use non-const methods.
1696 using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>;
1697 
1698 /// Patterns for fusing linalg operation on tensors.
1699 
1700 /// Pattern to fuse `linalg.generic` -> `linalg.generic` operations
1701 /// when both operations are fusable elementwise operations.
1703  RewritePatternSet &patterns,
1704  const ControlFusionFn &controlElementwiseOpFusion);
1705 
1706 /// Function type which is used to control propagation of tensor.pack/unpack
1707 /// ops.
1708 using ControlPropagationFn = std::function<bool(OpOperand *opOperand)>;
1709 
1710 /// Patterns to bubble up or down data layout ops across other operations.
1712  RewritePatternSet &patterns,
1713  const ControlPropagationFn &controlPackUnPackPropagation);
1714 
1715 /// Pattern to remove dead operands and results of `linalg.generic` operations.
1716 /// This is effectively DCE for a linalg op.
1718 
1719 /// Patterns to promote inputs to outputs and remove unused inputs of
1720 /// `linalg.generic` ops.
1722 
1723 /// Function type to control generic op dimension collapsing. It is expected
1724 /// to return an array of `ReassociationIndices` representing dimensions that
1725 /// should be merged.
1727  std::function<SmallVector<ReassociationIndices>(linalg::LinalgOp)>;
1728 
1729 /// Pattern to collapse dimensions in a linalg.generic op. This will collapse
1730 /// tensor operands when needed and expand back the result tensors.
1732  RewritePatternSet &patterns,
1733  const GetCollapsableDimensionsFn &controlCollapseDimensions);
1734 
1735 /// Patterns to fold an expanding (collapsing) tensor_reshape operation with its
1736 /// producer (consumer) generic operation by expanding the dimensionality of the
1737 /// loop in the generic op.
1739  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1740 
1741 /// Patterns to fold an expanding tensor.expand_shape operation with its
1742 /// producer generic operation by collapsing the dimensions of the generic op.
1744  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1745 
1746 /// Patterns to constant fold Linalg operations.
1748  const ControlFusionFn &controlFn);
1749 
1750 /// Pattern to fuse a `tensor.pad` operation with the producer of its source,
1751 /// if the producer is a `linalg` operation with all parallel iterator types.
1753  RewritePatternSet &patterns);
1754 
1755 /// Patterns to convert from one named op to another. These can be seen as
1756 /// canonicalizations of named ops into another named op.
1758 
1759 /// Patterns to fold unit-extent dimensions in operands/results of linalg ops on
1760 /// tensors via reassociative reshape ops.
1763 
1764 /// A pattern that converts init operands to input operands.
1766 
1767 /// Patterns that are used to inline constant operands into linalg generic ops.
1769 
1770 /// Patterns that are used to bubble up extract slice op above linalg op.
1772 
1773 /// Adds patterns that swap tensor.extract_slice(linalg.fill(%cst, %init)) into
1774 /// linalg.fill(%cst, tensor.extract_slice(%init)).
1776 
1777 /// Patterns to apply `splitReduction` below.
1779  RewritePatternSet &patterns,
1780  const ControlSplitReductionFn &controlSplitReductionFn,
1781  bool useAlloc = false);
1782 
1783 /// Patterns to convert Linalg matmul ops to transposed variants.
1785  bool transposeLHS = true);
1786 
1787 /// Patterns to block pack Linalg matmul ops.
1789  const ControlBlockPackMatmulFn &controlFn);
1790 
1791 /// Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
1792 void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m,
1793  int64_t r);
1794 
1795 /// Patterns to decompose Winograd operators.
1797 
1798 /// Adds patterns that reduce the rank of named contraction ops that have
1799 /// unit dimensions in the operand(s) by converting to a sequence of
1800 /// `collapse_shape`,
1801 /// `<corresponding linalg named op>`, `expand_shape` (if on tensors). For
1802 /// example a `linalg.batch_matmul` with unit batch size will convert to
1803 /// `linalg.matmul` and a `linalg.matvec` with unit spatial dim in lhs will
1804 /// convert to a `linalg.dot`.
1806 
1807 } // namespace linalg
1808 } // namespace mlir
1809 
1810 #endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
static llvm::ManagedStatic< PassManagerOptions > options
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46
Attributes are known-constant values of operations.
Definition: Attributes.h:25
The main mechanism for performing data layout queries.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
This class helps build Operations.
Definition: Builders.h:212
This class represents a single result from folding an operation.
Definition: OpDefinition.h:268
This class represents an operand of an operation.
Definition: Value.h:267
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
Definition: PatternMatch.h:34
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:785
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
FailureOr< PackingResult > buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, scf::ForOp outermostEnclosingForOp, ArrayRef< int64_t > transposeVector)
Build the packing loop nest required to hoist opToHoist above outermostEnclosingForOp.
void populateLinalgNamedOpConversionPatterns(RewritePatternSet &patterns)
Patterns to convert from one named op to another.
void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns)
A pattern that converts init operands to input operands.
FailureOr< GenericOp > generalizeNamedOp(RewriterBase &rewriter, LinalgOp namedOp)
Create a GenericOp from the given named operation namedOp and replace namedOp.
void populateTransposeMatmulPatterns(RewritePatternSet &patterns, bool transposeLHS=true)
Patterns to convert Linalg matmul ops to transposed variants.
void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns)
Adds patterns that reduce the rank of named contraction ops that have unit dimensions in the operand(...
LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad, const LinalgPaddingOptions &options, LinalgOp &paddedOp, SmallVector< Value > &replacements, SmallVector< tensor::PadOp > &padOps)
Pad the iterator dimensions paddingDimensions of all opToPad operands to a static bounding box.
Definition: Padding.cpp:153
void populateSplitReductionPattern(RewritePatternSet &patterns, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Patterns to apply splitReduction below.
void populateFuseTensorPadWithProducerLinalgOpPatterns(RewritePatternSet &patterns)
Pattern to fuse a tensor.pad operation with the producer of its source, if the producer is a linalg o...
FailureOr< std::pair< Operation *, Operation * > > rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp)
Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing) and linalg....
bool areDimSequencesPreserved(ArrayRef< AffineMap > maps, ArrayRef< ReassociationIndices > dimSequences)
Return true if all sequences of dimensions specified in dimSequences are contiguous in all the ranges...
FailureOr< LowerUnPackOpResult > lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp)
Rewrite unpack as empty + transpose + reshape + extract_slice.
Definition: Transforms.cpp:354
void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns)
Patterns that are used to bubble up extract slice op above linalg op.
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
Definition: Tiling.cpp:78
std::function< std::optional< Value >(OpBuilder &b, memref::SubViewOp subView, ArrayRef< Value > boundingSubViewSize, DataLayout &layout)> AllocBufferCallbackFn
Callback function type used to perform the allocation for the promoted subView.
Definition: Transforms.h:339
void populateBlockPackMatmulPatterns(RewritePatternSet &patterns, const ControlBlockPackMatmulFn &controlFn)
Patterns to block pack Linalg matmul ops.
void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns)
Populates patterns to transform linalg.conv_2d_xxx operations into linalg.generic (for img2col packin...
FailureOr< Operation * > decomposeWinogradFilterTransformOp(RewriterBase &rewriter, linalg::WinogradFilterTransformOp op)
Rewrite linalg.winograd_filter_transform.
DenseMap< int, int > LoopIndexToRangeIndexMap
Creates a number of ranges equal to the number of non-zero in tileSizes.
Definition: Transforms.h:801
std::optional< Value > allocateWorkgroupMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU workgroup memory.
Definition: Promotion.cpp:470
Value bufferizeToAllocation(RewriterBase &rewriter, const BufferizeToAllocationOptions &options, tensor::PadOp padOp, Attribute memorySpace={}, Operation *insertionPoint=nullptr)
Materialize a buffer allocation for the given tensor.pad op and lower the op to linalg....
std::function< bool(OpOperand *fusedOperand)> ControlFusionFn
Function type which is used to control when to stop fusion.
Definition: Transforms.h:1696
bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence)
Return true if a given sequence of dimensions are contiguous in the range of the specified indexing m...
FailureOr< LinalgOp > specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp)
Create a namedOp from the given GenericOp and replace the GenericOp.
Definition: Specialize.cpp:260
void populateFoldReshapeOpsByCollapsingPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding tensor.expand_shape operation with its producer generic operation by co...
LinalgTilingLoopType
The type of loops to be generated during tiling.
Definition: Utils.h:103
std::function< LogicalResult(OpBuilder &b, Value buffer)> DeallocBufferCallbackFn
Callback function type used to deallocate the buffers used to hold the promoted subview.
Definition: Transforms.h:344
void populateDataLayoutPropagationPatterns(RewritePatternSet &patterns, const ControlPropagationFn &controlPackUnPackPropagation)
Patterns to bubble up or down data layout ops across other operations.
void populatePadOpVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit baseBenefit=1)
Populates patterns with patterns that vectorize tensor.pad.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
Definition: Tiling.cpp:864
LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value)
In case of GPU private memory there is no need to deallocate since the memory is freed when going out...
Definition: Promotion.cpp:511
void populateSparseTensorRewriting(RewritePatternSet &patterns)
Populate patterns that are only useful in the context of sparse tensors.
FailureOr< Operation * > decomposeWinogradOutputTransformOp(RewriterBase &rewriter, linalg::WinogradOutputTransformOp op)
Rewrite linalg.winograd_output_transform.
FailureOr< ElementwiseOpFusionResult > fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand)
llvm::SmallDenseSet< int > getPreservedProducerResults(GenericOp producer, GenericOp consumer, OpOperand *fusedOperand)
Returns a set of indices of the producer's results which would be preserved after the fusion.
FailureOr< PromotionInfo > promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView, const AllocBufferCallbackFn &allocationFn, DataLayout &layout)
Definition: Promotion.cpp:238
std::optional< Value > allocateGPUPrivateMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU private memory.
Definition: Promotion.cpp:495
FailureOr< Operation * > rewriteInDestinationPassingStyle(RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp)
Rewrite tensor.from_elements to linalg.generic.
FailureOr< DropUnitDimsResult > dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, const ControlDropUnitDims &options)
FailureOr< PackResult > blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp, const ControlBlockPackMatmulFn &controlPackMatmul)
Pack a matmul operation into blocked 4D layout.
void peelLoops(RewriterBase &rewriter, ArrayRef< scf::ForOp > loops)
Peel 'loops' and applies affine_min/max bounds simplification on the fly where relevant.
Definition: Transforms.cpp:75
FailureOr< Operation * > winogradConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op, int64_t m, int64_t r)
Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm F(m x m, r x r).
void populateConvertToDestinationStylePatterns(RewritePatternSet &patterns)
Populate patterns that convert non-destination-style ops to destination style ops.
FailureOr< Operation * > transposeConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op)
Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by materializing transpose.
void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns, ControlDropUnitDims &options)
Patterns to fold unit-extent dimensions in operands/results of linalg ops on tensors via reassociativ...
LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst)
Create Memref copy operations and add gpu barrier guards before and after the copy operation to ensur...
Definition: Promotion.cpp:486
std::function< SmallVector< Value, 4 >(OpBuilder &, Operation *)> TileSizeComputationFunction
Definition: Transforms.h:186
std::function< LogicalResult(RewriterBase &, tensor::PadOp, Value)> OptimizeCopyFn
Definition: Transforms.h:1494
FailureOr< Value > hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops, ArrayRef< int64_t > transposeVector, tensor::PadOp &hoistedOp, SmallVectorImpl< GenericOp > &transposeOps)
Mechanically hoist padding operations on tensors by numLoops into a new, generally larger tensor.
void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns)
Populate patterns that convert ElementwiseMappable ops to linalg parallel loops.
LogicalResult linalgOpAnchoredEmptyTensorEliminationStep(RewriterBase &rewriter, Operation *op, bufferization::OneShotAnalysisState &state)
Try to eliminate tensor::EmptyOps inside op that are anchored on a LinalgOp.
FailureOr< LinalgLoops > linalgOpToLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.for with the proper body for linalgOp.
Definition: Loops.cpp:368
LogicalResult vectorize(RewriterBase &rewriter, Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false)
Emit a suitable vector form for an operation.
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
Definition: Tiling.cpp:49
FailureOr< Operation * > transposeBatchMatmul(RewriterBase &rewriter, linalg::BatchMatmulOp op, bool transposeLHS=true)
Pattern to replace.
LogicalResult promoteSubviewsPrecondition(Operation *op, LinalgPromotionOptions options)
Promote memref.subviews feeding linalg-on-buffers operations.
Definition: Promotion.cpp:399
LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst)
Normal copy between src and dst.
Definition: Promotion.cpp:503
void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Linalg decompose convolutions patterns.
void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns)
Patterns to decompose Winograd operators.
void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate patterns for vectorizing low-D convolution ops.
LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp)
Emit a suitable vector form for a Copy op with fully static shape.
LogicalResult vectorizeOpPrecondition(Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false)
Return success if the operation can be vectorized.
FailureOr< GenericOp > interchangeGenericOp(RewriterBase &rewriter, GenericOp genericOp, ArrayRef< unsigned > interchangeVector)
Interchange the iterator_types and iterator_maps dimensions and adapts the index accesses of op.
Definition: Interchange.cpp:50
void populateCollapseDimensions(RewritePatternSet &patterns, const GetCollapsableDimensionsFn &controlCollapseDimensions)
Pattern to collapse dimensions in a linalg.generic op.
bool areElementwiseOpsFusable(OpOperand *fusedOperand)
Return true if two linalg.generic operations with producer/consumer relationship through fusedOperand...
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
Definition: Tiling.cpp:242
FailureOr< LinalgLoops > linalgOpToAffineLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of affine.for with the proper body for linalgOp.
Definition: Loops.cpp:363
void populateEraseUnusedOperandsAndResultsPatterns(RewritePatternSet &patterns)
Pattern to remove dead operands and results of linalg.generic operations.
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
Definition: Tiling.cpp:162
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
Definition: Tiling.cpp:111
std::function< LogicalResult(OpBuilder &b, Value src, Value dst)> CopyCallbackFn
Callback function type used to insert copy from original subview to subview of the promoted region fo...
Definition: Transforms.h:351
FailureOr< SplitReductionResult > splitReduction(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
FailureOr< LinalgOp > padAndHoistLinalgOp(RewriterBase &rewriter, LinalgOp linalgOp, const LinalgPaddingOptions &options)
Apply padding and hoisting to linalgOp according to the configuration specified in options.
Definition: Padding.cpp:265
void populateDecomposeLinalgOpsPattern(RewritePatternSet &patterns, bool removeDeadArgsAndResults=true)
Populate patterns for splitting a LinalgOp with multiple statements within its payload into multiple ...
std::function< bool(OpOperand *opOperand)> ControlPropagationFn
Function type which is used to control propagation of tensor.pack/unpack ops.
Definition: Transforms.h:1708
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
Definition: Tiling.cpp:596
FailureOr< PackResult > packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp, ArrayRef< OpFoldResult > mnkPackedSizes, ArrayRef< int64_t > mnkPaddedSizesNextMultipleOf, ArrayRef< int64_t > mnkOrder)
Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m and n are proper parallel d...
Definition: Transforms.cpp:766
FailureOr< PackResult > pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, ArrayRef< OpFoldResult > packedSizes)
Implement packing of a single LinalgOp by packedSizes.
Definition: Transforms.cpp:477
void populateEraseUnnecessaryInputsPatterns(RewritePatternSet &patterns)
Patterns to promote inputs to outputs and remove unused inputs of linalg.generic ops.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
Definition: Tiling.cpp:824
std::function< SmallVector< ReassociationIndices >(linalg::LinalgOp)> GetCollapsableDimensionsFn
Function type to control generic op dimension collapsing.
Definition: Transforms.h:1727
FailureOr< LowerPackResult > lowerPack(RewriterBase &rewriter, tensor::PackOp packOp)
Rewrite pack as pad + reshape + transpose.
Definition: Transforms.cpp:219
void populateFoldReshapeOpsByExpansionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding (collapsing) tensor_reshape operation with its producer (consumer) gene...
void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns)
Adds patterns that swap tensor.extract_slice(linalg.fill(cst, init)) into linalg.fill(cst,...
void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns)
Patterns that are used to inline constant operands into linalg generic ops.
FailureOr< LinalgOp > promoteSubViews(OpBuilder &b, LinalgOp op, const LinalgPromotionOptions &options)
Promote the subViews into a new buffer allocated at the insertion point b.
Definition: Promotion.cpp:421
void populateConstantFoldLinalgOperations(RewritePatternSet &patterns, const ControlFusionFn &controlFn)
Patterns to constant fold Linalg operations.
std::function< SplitReductionOptions(LinalgOp op)> ControlSplitReductionFn
Function signature to control reduction splitting.
Definition: Transforms.h:442
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value)
In case of GPU group memory there is no need to deallocate.
Definition: Promotion.cpp:479
FailureOr< Operation * > transposeMatmul(RewriterBase &rewriter, linalg::MatmulOp op, bool transposeLHS=true)
Convert Linalg matmul ops to transposed variants.
void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns)
Linalg generalization patterns.
void populateLinalgGenericOpsSpecializationPatterns(RewritePatternSet &patterns)
Populates patterns with patterns to convert linalg.generic ops to named ops where possible.
Definition: Specialize.cpp:328
void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, int64_t r)
Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
std::function< std::optional< BlockPackMatmulOptions >(linalg::LinalgOp)> ControlBlockPackMatmulFn
Function type which is used to control matmul packing.
Definition: Transforms.h:1207
std::optional< vector::CombiningKind > getCombinerOpKind(Operation *combinerOp)
Return vector::CombiningKind for the given op.
SmallVector< Value > peelLoop(RewriterBase &rewriter, Operation *op)
Try to peel and canonicalize loop op and return the new result.
Definition: Transforms.cpp:59
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
Definition: Tiling.cpp:858
FailureOr< CollapseResult > collapseOpIterationDims(LinalgOp op, ArrayRef< ReassociationIndices > foldedIterationDims, RewriterBase &rewriter)
Collapses dimensions of linalg.generic/linalg.copy operation.
FailureOr< Operation * > decomposeWinogradInputTransformOp(RewriterBase &rewriter, linalg::WinogradInputTransformOp op)
Rewrite linalg.winograd_input_transform.
FailureOr< PackTransposeResult > packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, ArrayRef< int64_t > outerPerm, ArrayRef< int64_t > innerPerm)
Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the transposed PackOp -> LinalgOp ...
Definition: Transforms.cpp:675
std::pair< TilingInterface, TilingInterface > splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension, OpFoldResult splitPoint)
Split the given op into two parts along the given iteration space dimension at the specified splitPoi...
Definition: Split.cpp:67
void populateElementwiseOpsFusionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlElementwiseOpFusion)
Patterns for fusing linalg operation on tensors.
FailureOr< SplitReductionResult > splitReductionByScaling(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Scaling-based implementation of the split reduction transformation.
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
Definition: Tiling.cpp:268
FailureOr< LinalgLoops > linalgOpToParallelLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.parallel with the proper body for linalgOp.
Definition: Loops.cpp:375
Include the generated interface declarations.
ArrayRef< int64_t > ReassociationIndicesRef
OpInterfaceRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting a...
Definition: PatternMatch.h:373
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:358
SmallVector< int64_t, 3 > mnkOrder
Permutation of matmul (M, N, K) dimensions order.
Definition: Transforms.h:1187
SmallVector< int64_t, 3 > blockFactors
Minor block factors (mb, nb, kb) for packing relayout where mb, nb are the parallel dimensions and kb...
Definition: Transforms.h:1177
bool rhsTransposeOuterBlocks
Transpose RHS outer block layout [KB][NB] -> [NB][KB].
Definition: Transforms.h:1196
bool lhsTransposeInnerBlocks
Transpose LHS inner block layout [mb][kb] -> [kb][mb].
Definition: Transforms.h:1193
SmallVector< int64_t, 3 > mnkPaddedSizesNextMultipleOf
Next multiples of the packing sizes.
Definition: Transforms.h:1184
bool lhsTransposeOuterBlocks
Transpose LHS outer block layout [MB][KB] -> [KB][MB].
Definition: Transforms.h:1190
bool allowPadding
If true, allows packing of dimensions that only partially fit into the block factors.
Definition: Transforms.h:1181
bool rhsTransposeInnerBlocks
Transpose RHS inner block layout [kb][nb] -> [nb][kb].
Definition: Transforms.h:1199
bool bufferizeDestinationOnly
If set to "true", only the destination tensor operands are bufferized to a new allocation (and wrappe...
Definition: Transforms.h:64
bool emitDealloc
If set to "true", a memref.dealloc operation will be emitted for each allocated buffer.
Definition: Transforms.h:70
SmallVector< Value > results
Definition: Transforms.h:1092
Transformation to drop unit-extent dimensions from linalg.generic operations.
Definition: Transforms.h:473
RankReductionStrategy rankReductionStrategy
Definition: Transforms.h:476
std::function< SmallVector< unsigned >(Operation *)> ControlFnTy
Definition: Transforms.h:479
Vectorization pattern for memref::CopyOp.
Definition: Transforms.h:1486
LogicalResult matchAndRewrite(memref::CopyOp copyOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:917
LogicalResult matchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1440
FailureOr< Conv1DOp > returningMatchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const
DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit=1)
Definition: Transforms.h:1434
Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh) dimensions into 1-D depthwise c...
Definition: Transforms.h:1418
FailureOr< DepthwiseConv1DNwcWcOp > returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, PatternRewriter &rewriter) const
LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1427
DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context, PatternBenefit benefit=1)
Definition: Transforms.h:1419
Rewrites 2-D convolution ops with size-1 window dimensions into 1-D convolution ops.
Definition: Transforms.h:1398
LogicalResult matchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1404
FailureOr< Conv1DOp > returningMatchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const
SmallVector< Value > replacements
Definition: Transforms.h:493
Fuse two linalg.generic operations that have a producer-consumer relationship captured through fusedO...
Definition: Transforms.h:502
llvm::DenseMap< Value, Value > replacements
Definition: Transforms.h:504
Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
Definition: Transforms.h:1596
std::function< std::optional< bool >(tensor::ExtractSliceOp)> ControlFn
A function to control pattern application and rewrite logic.
Definition: Transforms.h:1606
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:996
ExtractSliceOfPadTensorSwapPattern(MLIRContext *context, ControlFn controlFn=nullptr, PatternBenefit benefit=1)
Definition: Transforms.h:1608
Transformation information returned after reduction tiling.
Definition: Transforms.h:883
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
Definition: Transforms.h:887
SmallVector< Value > initialValues
Initial values used for partial reductions.
Definition: Transforms.h:889
scf::ForallOp loops
The scf.forall operation that iterates over the tiles.
Definition: Transforms.h:891
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
Definition: Transforms.h:885
Rewrites a tensor::PackOp into a sequence of tensor.pad + linalg.transpose + tensor....
Definition: Transforms.h:1519
LogicalResult matchAndRewrite(tensor::PackOp packOp, PatternRewriter &rewriter) const override
Rewrites a tensor::UnPackOp into a sequence of rank-reduced extract_slice op.
Definition: Transforms.h:1529
LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp, PatternRewriter &rewriter) const override
Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and InsertSliceOp.
Definition: Transforms.h:1499
LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:941
Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp, Value dest, const SmallVector< Value > &dynSizes) const
Filling dest using FillOp constant padding value if possible.
Definition: Transforms.cpp:924
GeneralizePadOpPattern(MLIRContext *context, OptimizeCopyFn optimizeCopyFn=nullptr, PatternBenefit benefit=1)
Definition: Transforms.h:1500
Match and rewrite for the pattern:
Definition: Transforms.h:1559
LogicalResult matchAndRewrite(vector::TransferReadOp xferOp, PatternRewriter &rewriter) const override
TODO: use interfaces, side-effects and aliasing analysis as appropriate, when available.
Match and rewrite for the pattern:
Definition: Transforms.h:1587
LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp, PatternRewriter &rewriter) const override
TODO: use interfaces, side-effects and aliasing analysis as appropriate, when available.
Linalg generalization pattern.
Definition: Transforms.h:1455
LogicalResult matchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const override
Definition: Transforms.h:1465
FailureOr< GenericOp > returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const
matchAndRewrite implementation that returns the significant transformed pieces of IR.
Definition: Transforms.h:1461
Options that allow distribution of loops generated in Linalg transforms to processors while generatin...
Definition: Utils.h:304
SmallVector< Attribute > paddingValues
A padding value for every operand.
Definition: Transforms.h:280
LinalgPaddingOptions & setPadToMultipleOf(ArrayRef< int64_t > m)
Definition: Transforms.h:293
SmallVector< bool > packPaddings
A flag for every operand to mark the PadOp as nofold which enables packing for statically shaped oper...
Definition: Transforms.h:299
std::optional< SmallVector< int64_t > > padToMultipleOf
A list of multiples to which each padding dimension should be padded to.
Definition: Transforms.h:292
LinalgPaddingOptions & setPaddingDimensions(ArrayRef< int64_t > pd)
Definition: Transforms.h:287
LinalgPaddingOptions & setTransposePaddings(ArrayRef< SmallVector< int64_t >> tp)
Definition: Transforms.h:314
SmallVector< SmallVector< int64_t > > transposePaddings
A permutation vector for every operand used to transpose the packed PadOp results.
Definition: Transforms.h:312
LinalgPaddingOptions & setPaddingValues(ArrayRef< Attribute > pv)
Definition: Transforms.h:281
LinalgPaddingOptions & setPackPaddings(ArrayRef< bool > pp)
Definition: Transforms.h:300
LinalgPaddingOptions & setCopyBackOp(CopyBackOp op)
Definition: Transforms.h:326
LinalgPaddingOptions & setHoistPaddings(ArrayRef< int64_t > hp)
Definition: Transforms.h:306
SmallVector< int64_t > hoistPaddings
A number of loops to hoist the PadOp out for every operand.
Definition: Transforms.h:305
SmallVector< int64_t > paddingDimensions
A list of iterator dimensions to pad.
Definition: Transforms.h:286
CopyBackOp copyBackOp
The op to be used for copying the padded result to the original destination tensor.
Definition: Transforms.h:325
std::optional< unsigned > alignment
Alignment of promoted buffer. If std::nullopt, do not specify alignment.
Definition: Transforms.h:384
LinalgPromotionOptions & setUseFullTileBuffersByDefault(bool use)
Definition: Transforms.h:379
bool useAlloca
Use alloca with the default allocation scheme.
Definition: Transforms.h:397
LinalgPromotionOptions & setAlignment(unsigned align)
Definition: Transforms.h:385
std::optional< Attribute > memorySpace
Memory space of promoted buffer.
Definition: Transforms.h:391
std::optional< CopyCallbackFn > copyOutFn
Definition: Transforms.h:417
std::optional< CopyCallbackFn > copyInFn
Callback function to do the copy of data to and from the promoted subview.
Definition: Transforms.h:416
LinalgPromotionOptions & setUseAlloca(bool use)
Definition: Transforms.h:398
std::optional< DenseSet< unsigned > > operandsToPromote
Indices of subViews to promote.
Definition: Transforms.h:356
LinalgPromotionOptions & setCopyInOutFns(CopyCallbackFn const &copyIn, CopyCallbackFn const &copyOut)
Definition: Transforms.h:418
LinalgPromotionOptions & setUseFullTileBuffers(ArrayRef< bool > useFullTiles)
Definition: Transforms.h:368
std::optional< AllocBufferCallbackFn > allocationFn
Callback function to do the allocation of the promoted buffer.
Definition: Transforms.h:405
bool useFullTileBuffersDefault
If true all operands unspecified by useFullTileBuffers will use the full view, otherwise the partial ...
Definition: Transforms.h:378
std::optional< DeallocBufferCallbackFn > deallocationFn
Definition: Transforms.h:406
LinalgPromotionOptions & setMemorySpace(Attribute memorySpc)
Definition: Transforms.h:392
LinalgPromotionOptions & setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn, DeallocBufferCallbackFn const &deallocFn)
Definition: Transforms.h:408
std::optional< llvm::SmallBitVector > useFullTileBuffers
If ith element of useFullTiles is true the full view should be used for the promoted buffer of the it...
Definition: Transforms.h:367
LinalgPromotionOptions & setOperandsToPromote(ArrayRef< int64_t > operands)
Definition: Transforms.h:357
LogicalResult matchAndRewrite(GenericOp op, PatternRewriter &rewriter) const override
Definition: Transforms.h:1479
FailureOr< GenericOp > returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const
Definition: Transforms.h:1475
std::optional< LinalgLoopDistributionOptions > tileDistribution
When specified, specifies distribution of generated tile loops to processors.
Definition: Transforms.h:270
LinalgTilingAndFusionOptions & setTileSizes(ArrayRef< int64_t > ts)
Definition: Transforms.h:262
SmallVector< int64_t > tileInterchange
Tile interchange used to permute the tile loops.
Definition: Transforms.h:267
LinalgTilingAndFusionOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition: Transforms.h:272
SmallVector< int64_t > tileSizes
Tile sizes used to tile the root operation.
Definition: Transforms.h:261
LinalgTilingOptions & setLoopType(LinalgTilingLoopType lt)
Definition: Transforms.h:226
LinalgTilingOptions & setDistributionTypes(ArrayRef< StringRef > types)
Definition: Transforms.h:244
LinalgTilingOptions & setInterchange(ArrayRef< unsigned > interchange)
Definition: Transforms.h:218
LinalgTilingLoopType loopType
The type of tile loops to generate.
Definition: Transforms.h:224
LinalgTilingOptions & setTileSizeComputationFunction(TileSizeComputationFunction fun)
Definition: Transforms.h:195
LinalgTilingOptions & setTileSizes(const SmallVector< Value, 4 > &ts)
Set the tileSizeComputationFunction to return the values ts.
Definition: Transforms.h:202
LinalgTilingOptions & setPeeledLoops(ArrayRef< int64_t > loops)
Definition: Transforms.h:252
SmallVector< int64_t > peeledLoops
Peel the specified loops.
Definition: Transforms.h:250
LinalgTilingOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition: Transforms.h:236
SmallVector< unsigned, 4 > interchangeVector
The interchange vector to reorder the tiled loops.
Definition: Transforms.h:216
TileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes for each operation.
Definition: Transforms.h:192
LinalgTilingOptions & scalarizeDynamicDims()
Tile all dynamic dimensions by 1.
std::optional< LinalgLoopDistributionOptions > distribution
When specified, specifies distribution of generated tile loops to processors.
Definition: Transforms.h:233
SmallVector< StringRef, 2 > distributionTypes
Specification markers of how to distribute the linalg.tiled_loop.
Definition: Transforms.h:242
linalg::TransposeOp transposeOp
Definition: Transforms.h:1111
tensor::ExpandShapeOp expandShapeOp
Definition: Transforms.h:1110
tensor::ExtractSliceOp extractSliceOp
Definition: Transforms.h:1122
linalg::TransposeOp transposeOp
Definition: Transforms.h:1120
tensor::CollapseShapeOp collapseShapeOp
Definition: Transforms.h:1121
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Definition: Transforms.h:830
Struct to hold the result of a pack call.
Definition: Transforms.h:1130
linalg::LinalgOp packedLinalgOp
Definition: Transforms.h:1132
SmallVector< tensor::PackOp > packOps
Definition: Transforms.h:1131
SmallVector< tensor::UnPackOp > unPackOps
Definition: Transforms.h:1133
Struct to hold the result of a packTranspose call.
Definition: Transforms.h:1142
linalg::LinalgOp transposedLinalgOp
Definition: Transforms.h:1144
tensor::UnPackOp transposedUnPackOp
Definition: Transforms.h:1145
Create a new buffer using the allocationFn provided.
Definition: Transforms.h:718
Split Reduction options.
Definition: Transforms.h:427
Apply transformation to split the single linalg op reduction into a parallel and reduction dimension.
Definition: Transforms.h:1020
Perform standalone tiling of a single LinalgOp by tileSizes.
Definition: Transforms.h:679
SmallVector< Operation *, 8 > loops
Definition: Transforms.h:681
SmallVector< Value, 4 > tensorResults
Definition: Transforms.h:682
SmallVector< T > tripCounts
Number of tiles associated with each size.
Definition: Transforms.h:821
T lowTripCount
Number of tiles associated with each size.
Definition: Transforms.h:813
Helper struct to hold the results of building a packing loop nest.
Definition: Transforms.h:549
SmallVector< OpFoldResult > strides
Definition: Transforms.h:550
SmallVector< Value > leadingPackedTensorIndexings
Definition: Transforms.h:551
SmallVector< Value > clonedLoopIvs
Definition: Transforms.h:551
SmallVector< OpFoldResult > sizes
Definition: Transforms.h:550
SmallVector< OpFoldResult > offsets
Definition: Transforms.h:550