1 //===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
11 
12 #include <utility>
13 
22 #include "mlir/IR/PatternMatch.h"
26 #include "llvm/ADT/SmallBitVector.h"
27 #include "llvm/ADT/SmallSet.h"
28 
29 namespace mlir {
30 namespace bufferization {
31 class OneShotAnalysisState;
32 } // namespace bufferization
33 
34 namespace linalg {
35 
36 class LinalgOp;
37 
38 //===----------------------------------------------------------------------===//
39 // Utils.
40 //===----------------------------------------------------------------------===//
41 
42 /// Return vector::CombiningKind for the given op.
43 std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp);
44 
45 //===----------------------------------------------------------------------===//
46 // Bufferization-related transforms.
47 //===----------------------------------------------------------------------===//
48 
49 struct BufferizeToAllocationOptions {
50  enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 };
51  AllocOp allocOp = AllocOp::MemrefAlloc;
52 
53  enum class MemcpyOp { MemrefTensorStore = 0, MemrefCopy = 1, LinalgCopy = 2 };
54  MemcpyOp memcpyOp = MemcpyOp::MemrefTensorStore;
55 
56  /// If set to "true", only the destination tensor operands are bufferized to
57  /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the
58  /// targeted op itself.
59  bool bufferizeDestinationOnly = false;
60 
61  /// If set to "true", a `memref.dealloc` operation will be emitted for each
62  /// allocated buffer. Otherwise, the memory is leaked, which is useful if
63  /// the buffer deallocation pipeline should be run after bufferization is
64  /// done.
65  bool emitDealloc = false;
66 };
67 
68 /// Materialize a buffer allocation for the given tensor.pad op and lower the
69 /// op to linalg.fill/linalg.generic + memref.tensor_store. E.g.:
70 ///
71 /// %0 = tensor.pad low[%l] high[%h] %t ...
72 ///
73 /// is lowered to:
74 ///
75 /// %alloc = memref.alloc
76 /// linalg.fill ... outs(%alloc)
77 /// %subview = memref.subview %alloc [%l] [...] [1]
78 /// memref.tensor_store %t, %subview
79 /// %0 = bufferization.to_tensor %alloc restrict writable
80 ///
81 /// In addition to rewriting the IR as shown above, this function returns the
82 /// newly allocated buffer. The `insertionPoint` parameter can be used to
83 /// specify a custom insertion point for the buffer allocation.
84 Value bufferizeToAllocation(RewriterBase &rewriter,
85  const BufferizeToAllocationOptions &options,
86  tensor::PadOp padOp, Attribute memorySpace = {},
87  Operation *insertionPoint = nullptr);
88 
89 /// Materialize a buffer allocation for the given vector.mask op and bufferize
90 /// the op, including its region. E.g.:
91 ///
92 /// %0 = vector.mask {
93 /// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32>
94 /// } : vector<16xi1> -> tensor<?xf32>
95 ///
96 /// is lowered to:
97 ///
98 /// %alloc = memref.alloc
99 /// memref.tensor_store %t, %subview
100 /// vector.mask {
101 /// vector.transfer_write %arg0, %alloc : vector<16xf32>, memref<?xf32>
102 /// } : vector<16xi1>
103 /// %0 = bufferization.to_tensor %alloc restrict writable
104 ///
105 /// In addition to rewriting the IR as shown above, this function returns the
106 /// newly allocated buffer. The `insertionPoint` parameter can be used to
107 /// specify a custom insertion point for the buffer allocation.
108 Value bufferizeToAllocation(RewriterBase &rewriter,
109  const BufferizeToAllocationOptions &options,
110  vector::MaskOp maskOp, Attribute memorySpace = {},
111  Operation *insertionPoint = nullptr);
112 
113 /// Bufferize the given op with tensor semantics and materialize the result in
114 /// a newly allocated buffer.
115 ///
116 /// Only bufferizable ops that bufferize to a memory write or have an
117 /// aliasing OpOperand (and do not themselves bufferize to an allocation) are
118 /// supported. They are bufferized using their BufferizableOpInterface
119 /// implementation.
120 ///
121 /// Selected ops that bufferize to an allocation (or need special handling) are
122 /// also supported:
123 /// - tensor.pad
124 /// - vector.mask
125 ///
126 /// This function returns the newly allocated buffer. The `insertionPoint`
127 /// parameter can be used to specify a custom insertion point for the buffer
128 /// allocation.
129 Value bufferizeToAllocation(RewriterBase &rewriter,
130  const BufferizeToAllocationOptions &options,
131  Operation *op, Attribute memorySpace = {},
132  Operation *insertionPoint = nullptr);
133 
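// Illustrative usage sketch (not part of the upstream header): configuring
// `BufferizeToAllocationOptions` and calling the generic entry point above.
// `rewriter` and `op` (a supported op such as tensor.pad or vector.mask) are
// assumed to be provided by the surrounding transform.
//
//   BufferizeToAllocationOptions opts;
//   opts.allocOp = BufferizeToAllocationOptions::AllocOp::MemrefAlloca;
//   opts.memcpyOp = BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
//   opts.emitDealloc = false; // leave deallocation to a later pipeline
//   Value buffer = bufferizeToAllocation(rewriter, opts, op,
//                                        /*memorySpace=*/Attribute(),
//                                        /*insertionPoint=*/nullptr);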
134 /// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a
135 /// LinalgOp. This transform looks for LinalgOps that have an unused output
136 /// operand and an input operand that is rooted in a tensor::EmptyOp. The
137 /// tensor::EmptyOp uses are replaced with the output operand and the two
138 /// operands of the LinalgOp are swapped.
139 ///
140 /// Example:
141 /// %0 = tensor.empty()
142 /// %1 = linalg.matmul ins(...) outs(%0)
143 /// %2 = linalg.generic ins(%1) outs(%dest) {
144 /// ^bb0(%in: f32, %out: f32):
145 /// // out not used
146 /// }
147 ///
148 /// The IR is transformed as follows:
149 /// %0 = tensor.empty()
150 /// %1 = linalg.matmul ins(...) outs(%dest)
151 /// %2 = linalg.generic ins(%0) outs(%1) {
152 /// ^bb0(%in: f32, %out: f32):
153 /// // Use %out instead of %in
154 /// }
155 ///
156 /// The "ins" operand has no uses inside the body of the LinalgOp and can be
157 /// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp
158 /// can also fold away.
159 LogicalResult linalgOpAnchoredEmptyTensorEliminationStep(
160  RewriterBase &rewriter, Operation *op,
161  bufferization::OneShotAnalysisState &state);
162 
163 //===----------------------------------------------------------------------===//
164 // Structs that configure the behavior of various transformations.
165 //===----------------------------------------------------------------------===//
166 
167 using TileSizeComputationFunction =
168  std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>;
169 
170 struct LinalgTilingOptions {
171  /// Computation function that returns the tile sizes for each operation.
172  /// Delayed construction of constant tile sizes should occur to interoperate
173  /// with folding.
174  TileSizeComputationFunction tileSizeComputationFunction = nullptr;
175 
176  LinalgTilingOptions &
177  setTileSizeComputationFunction(TileSizeComputationFunction fun) {
178  tileSizeComputationFunction = std::move(fun);
179  return *this;
180  }
181  /// Set the `tileSizeComputationFunction` to return the values `ts`. The
182  /// values must not fold away when tiling. Otherwise, use a more robust
183  /// `tileSizeComputationFunction`.
184  LinalgTilingOptions &setTileSizes(const SmallVector<Value, 4> &ts) {
185  tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; };
186  return *this;
187  }
188  /// Convenience function to set the `tileSizeComputationFunction` to a
189  /// function that computes tile sizes at the point they are needed. Allows
190  /// proper interaction with folding.
191  LinalgTilingOptions &setTileSizes(ArrayRef<int64_t> ts);
192 
193  /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions.
194  /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together.
195  LinalgTilingOptions &scalarizeDynamicDims();
196 
197  /// The interchange vector to reorder the tiled loops.
198  SmallVector<unsigned, 4> interchangeVector = {};
199 
200  LinalgTilingOptions &setInterchange(ArrayRef<unsigned> interchange) {
201  interchangeVector.assign(interchange.begin(), interchange.end());
202  return *this;
203  }
204 
205  /// The type of tile loops to generate.
206  LinalgTilingLoopType loopType = LinalgTilingLoopType::Loops;
207 
208  LinalgTilingOptions &setLoopType(LinalgTilingLoopType lt) {
209  loopType = lt;
210  return *this;
211  }
212 
213  /// When specified, specifies distribution of generated tile loops to
214  /// processors.
215  std::optional<LinalgLoopDistributionOptions> distribution;
216 
217  LinalgTilingOptions &
218  setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) {
219  distribution = std::move(distributionOptions);
220  return *this;
221  }
222 
223  /// Specification markers of how to distribute the `linalg.tiled_loop`.
224  SmallVector<StringRef, 2> distributionTypes = {};
225 
226  LinalgTilingOptions &setDistributionTypes(ArrayRef<StringRef> types) {
227  distributionTypes.assign(types.begin(), types.end());
228  return *this;
229  }
230 
231  /// Peel the specified loops.
232  SmallVector<int64_t> peeledLoops;
233 
234  LinalgTilingOptions &setPeeledLoops(ArrayRef<int64_t> loops) {
235  peeledLoops.clear();
236  peeledLoops.append(loops.begin(), loops.end());
237  return *this;
238  }
239 };
240 
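// Illustrative usage sketch (not part of the upstream header): the options
// structs in this file follow a chainable setter style. The concrete tile
// sizes and interchange below are hypothetical.
//
//   LinalgTilingOptions tilingOptions;
//   tilingOptions.setTileSizes({8, 16, 0}) // 0 means "do not tile this loop"
//       .setInterchange({1, 0, 2})
//       .setLoopType(LinalgTilingLoopType::Loops)
//       .setPeeledLoops({0});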
241 struct LinalgTilingAndFusionOptions {
242  /// Tile sizes used to tile the root operation.
243  SmallVector<int64_t> tileSizes;
244  LinalgTilingAndFusionOptions &setTileSizes(ArrayRef<int64_t> ts) {
245  tileSizes.assign(ts.begin(), ts.end());
246  return *this;
247  }
248  /// Tile interchange used to permute the tile loops.
249  SmallVector<int64_t> tileInterchange;
250  /// When specified, specifies distribution of generated tile loops to
251  /// processors.
252  std::optional<LinalgLoopDistributionOptions> tileDistribution;
253  LinalgTilingAndFusionOptions &
254  setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) {
255  tileDistribution = std::move(distributionOptions);
256  return *this;
257  }
258 };
259 
260 struct LinalgPaddingOptions {
261  /// A padding value for every operand.
262  SmallVector<Attribute> paddingValues;
263  LinalgPaddingOptions &setPaddingValues(ArrayRef<Attribute> pv) {
264  paddingValues.assign(pv.begin(), pv.end());
265  return *this;
266  }
267  /// A list of iterator dimensions to pad.
268  SmallVector<int64_t> paddingDimensions;
269  LinalgPaddingOptions &setPaddingDimensions(ArrayRef<int64_t> pd) {
270  paddingDimensions.assign(pd.begin(), pd.end());
271  return *this;
272  }
273  /// A list of multiples to which each padding dimension is padded.
274  std::optional<SmallVector<int64_t>> padToMultipleOf;
275  LinalgPaddingOptions &setPadToMultipleOf(ArrayRef<int64_t> m) {
276  padToMultipleOf.emplace(m.begin(), m.end());
277  return *this;
278  }
279  /// A flag for every operand to mark the PadOp as nofold, which enables
280  /// packing for statically shaped operands.
281  SmallVector<bool> packPaddings;
282  LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) {
283  packPaddings.assign(pp.begin(), pp.end());
284  return *this;
285  }
286  /// A number of loops to hoist the PadOp out for every operand.
287  SmallVector<int64_t> hoistPaddings;
288  LinalgPaddingOptions &setHoistPaddings(ArrayRef<int64_t> hp) {
289  hoistPaddings.assign(hp.begin(), hp.end());
290  return *this;
291  }
292  /// A permutation vector for every operand used to transpose the packed
293  /// PadOp results.
294  SmallVector<SmallVector<int64_t>> transposePaddings;
295  LinalgPaddingOptions &
296  setTransposePaddings(ArrayRef<SmallVector<int64_t>> tp) {
297  transposePaddings.assign(tp.begin(), tp.end());
298  return *this;
299  }
300  enum class CopyBackOp : int8_t {
301  None = 0,
302  BufferizationMaterializeInDestination = 1,
303  LinalgCopy = 2
304  };
305  /// The op to be used for copying the padded result to the original
306  /// destination tensor.
307  CopyBackOp copyBackOp = CopyBackOp::BufferizationMaterializeInDestination;
308  LinalgPaddingOptions &setCopyBackOp(CopyBackOp op) {
309  copyBackOp = op;
310  return *this;
311  }
312 };
313 
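// Illustrative usage sketch (not part of the upstream header): padding all
// three iterator dimensions of a matmul to multiples of 16. The zero-valued
// attributes and the concrete numbers are hypothetical; `builder` is assumed.
//
//   LinalgPaddingOptions paddingOptions;
//   paddingOptions.setPaddingDimensions({0, 1, 2})
//       .setPadToMultipleOf({16, 16, 16})
//       .setPaddingValues({builder.getF32FloatAttr(0.0),
//                          builder.getF32FloatAttr(0.0),
//                          builder.getF32FloatAttr(0.0)})
//       .setCopyBackOp(LinalgPaddingOptions::CopyBackOp::LinalgCopy);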
314 /// Callback function type used to perform the allocation for the promoted
315 /// `subView`. In `boundingSubViewSize` a best attempt is made to find the
316 /// smallest constant value for the size of the buffer needed for each
317 /// dimension. If that is not possible, it contains the dynamic size of the
318 /// subview. The callback should return the buffer to use.
319 using AllocBufferCallbackFn = std::function<std::optional<Value>(
320  OpBuilder &b, memref::SubViewOp subView,
321  ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>;
322 
323 /// Callback function type used to deallocate the buffers used to hold the
324 /// promoted subview.
325 using DeallocBufferCallbackFn =
326  std::function<LogicalResult(OpBuilder &b, Value buffer)>;
327 
328 /// Callback function type used to insert copy from original subview to
329 /// subview of the promoted region for the read operands/subview of promoted
330 /// region to original subview for the results. The copy has to happen from
331 /// `src` to `dst`.
332 using CopyCallbackFn =
333  std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
334 
335 struct LinalgPromotionOptions {
336  /// Indices of subViews to promote. If `std::nullopt`, try to promote all
337  /// operands.
338  std::optional<DenseSet<unsigned>> operandsToPromote;
339  LinalgPromotionOptions &setOperandsToPromote(ArrayRef<int64_t> operands) {
340  operandsToPromote = DenseSet<unsigned>();
341  operandsToPromote->insert(operands.begin(), operands.end());
342  return *this;
343  }
344  /// If ith element of `useFullTiles` is true the full view should be used
345  /// for the promoted buffer of the ith operand in `operandsToPromote`.
346  /// Otherwise the partial view will be used. The decision is defaulted to
347  /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and
348  /// for operands missing from `useFullTileBuffers`.
349  std::optional<llvm::SmallBitVector> useFullTileBuffers;
350  LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef<bool> useFullTiles) {
351  unsigned size = useFullTiles.size();
352  llvm::SmallBitVector tmp(size, false);
353  for (unsigned i = 0; i < size; ++i)
354  tmp[i] = useFullTiles[i];
355  useFullTileBuffers = tmp;
356  return *this;
357  }
358  /// If true all operands unspecified by `useFullTileBuffers` will use the
359  /// full view, otherwise the partial view.
360  bool useFullTileBuffersDefault = false;
361  LinalgPromotionOptions &setUseFullTileBuffersByDefault(bool use) {
362  useFullTileBuffersDefault = use;
363  return *this;
364  }
365  /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment.
366  std::optional<unsigned> alignment;
367  LinalgPromotionOptions &setAlignment(unsigned align) {
368  alignment = align;
369  return *this;
370  }
371  /// Memory space of promoted buffer. If `std::nullopt` do not specify memory
372  /// space.
373  std::optional<Attribute> memorySpace;
374  LinalgPromotionOptions &setMemorySpace(Attribute memorySpc) {
375  memorySpace = memorySpc;
376  return *this;
377  }
378  /// Use alloca with the default allocation scheme.
379  bool useAlloca = false;
380  LinalgPromotionOptions &setUseAlloca(bool use) {
381  useAlloca = use;
382  return *this;
383  }
384  /// Callback function to do the allocation of the promoted buffer. If
385  /// std::nullopt, then the default allocation scheme of allocating a
386  /// memref<?xi8> buffer followed by a view operation is used.
387  std::optional<AllocBufferCallbackFn> allocationFn;
388  std::optional<DeallocBufferCallbackFn> deallocationFn;
389  LinalgPromotionOptions &
390  setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn,
391  DeallocBufferCallbackFn const &deallocFn) {
392  allocationFn = allocFn;
393  deallocationFn = deallocFn;
394  return *this;
395  }
396  /// Callback function to do the copy of data to and from the promoted
397  /// subview. If std::nullopt then a memref.copy is used.
398  std::optional<CopyCallbackFn> copyInFn;
399  std::optional<CopyCallbackFn> copyOutFn;
400  LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const &copyIn,
401  CopyCallbackFn const &copyOut) {
402  copyInFn = copyIn;
403  copyOutFn = copyOut;
404  return *this;
405  }
406 };
407 
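// Illustrative usage sketch (not part of the upstream header): promoting the
// two input operands of a tiled op into aligned, full-tile buffers. The
// operand indices and the alignment value are hypothetical.
//
//   LinalgPromotionOptions promotionOptions;
//   promotionOptions.setOperandsToPromote({0, 1})
//       .setUseFullTileBuffersByDefault(true)
//       .setAlignment(16)
//       .setUseAlloca(false);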
408 /// Split Reduction options.
409 struct SplitReductionOptions {
410  // Ratio used to split the reduction dimension. If the ratio is <= 1,
411  // nothing will be done.
412  int64_t ratio = 0;
413  // Index where the extra dimension is added to the intermediate tensor
414  // shape.
415  unsigned index = 0;
416  // If the inner dimension after splitting is parallel or reduction.
417  bool innerParallel = false;
418 };
419 
420 /// Function signature to control reduction splitting. This returns
421 /// `SplitReductionOptions`.
422 // TODO: don't use unsigned unless doing bit manipulation.
423 using ControlSplitReductionFn =
424  std::function<SplitReductionOptions(LinalgOp op)>;
425 
426 //===----------------------------------------------------------------------===//
427 // Preconditions that ensure the corresponding transformation succeeds and can
428 // be applied as a rewrite pattern.
429 //===----------------------------------------------------------------------===//
430 
431 /// Return true if two `linalg.generic` operations with producer/consumer
432 /// relationship through `fusedOperand` can be fused using elementwise op
433 /// fusion.
434 bool areElementwiseOpsFusable(OpOperand *fusedOperand);
435 
436 /// Promote memref.subviews feeding linalg-on-buffers operations.
437 LogicalResult promoteSubviewsPrecondition(Operation *op,
438  LinalgPromotionOptions options);
439 
440 /// Return success if the operation can be vectorized.
441 LogicalResult vectorizeOpPrecondition(Operation *op,
442  ArrayRef<int64_t> inputVectorSizes = {},
443  ArrayRef<bool> inputScalableVecDims = {},
444  bool vectorizeNDExtract = false);
445 
446 //===----------------------------------------------------------------------===//
447 // Transformations exposed as functional-style API calls.
448 //===----------------------------------------------------------------------===//
449 
450 using LinalgLoops = SmallVector<Operation *, 4>;
451 
452 /// Transformation to drop unit-extent dimensions from `linalg.generic`
453 /// operations.
454 struct ControlDropUnitDims {
455  enum class RankReductionStrategy { ReassociativeReshape, ExtractInsertSlice };
456 
457  RankReductionStrategy rankReductionStrategy =
458  RankReductionStrategy::ReassociativeReshape;
459 
460  using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>;
461  ControlFnTy controlFn = [](Operation *op) {
462  if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
463  return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
464  }
465  return SmallVector<unsigned>{};
466  };
467 };
468 LogicalResult dropUnitDims(RewriterBase &rewriter, GenericOp genericOp,
469  const ControlDropUnitDims &options);
470 
471 /// Fuse two `linalg.generic` operations that have a producer-consumer
472 /// relationship captured through `fusedOperand`. The method expects
473 /// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.
474 struct ElementwiseOpFusionResult {
475  Operation *fusedOp;
476  llvm::DenseMap<Value, Value> replacements;
477 };
478 FailureOr<ElementwiseOpFusionResult>
479 fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand);
480 
481 /// Try to peel and canonicalize loop `op` and return the new result.
482 /// Also applies affine_min/max bounds simplification on the fly where relevant.
483 // TODO: Add support for scf.parallel and affine.for loops.
484 SmallVector<Value> peelLoop(RewriterBase &rewriter, Operation *op);
485 
486 /// Peel 'loops' and applies affine_min/max bounds simplification on the fly
487 /// where relevant.
488 void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
489 
490 /// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands
491 /// to a static bounding box. The original `opToPad` is cloned and operates on
492 /// the padded tensors.
493 ///
494 /// * "options.padToMultipleOf" indicates that each padding dimension should be
495 /// padded to the specified multiple.
496 /// * Use "options.paddingValues" and "options.packPaddings" to set padding
497 /// value and nofold attribute of the created tensor::PadOps, respectively.
498 /// * The unpadded results (extracted slice of the cloned operation) are
499 /// returned via `replacements`.
500 /// * The tensor::PadOps are returned via `padOps`.
501 /// * "options.copyBackOp" specifies the op type for copying back the unpadded
502 /// result to the original destination tensor.
503 LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
504  const LinalgPaddingOptions &options,
505  LinalgOp &paddedOp,
506  SmallVector<Value> &replacements,
507  SmallVector<tensor::PadOp> &padOps);
508 
509 namespace detail {
510 
511 /// Helper struct to hold the results of building a packing loop nest.
512 struct PackingResult {
513  SmallVector<OpFoldResult> offsets, sizes, strides;
514  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
515  GenericOp maybeTransposeOp;
516  tensor::PadOp hoistedPadOp;
517 };
518 
519 /// Build the packing loop nest required to hoist `opToHoist` above
520 /// `outermostEnclosingForOp`.
521 /// The loop nest is built just before `outermostEnclosingForOp`.
522 FailureOr<PackingResult>
523 buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
524  scf::ForOp outermostEnclosingForOp,
525  ArrayRef<int64_t> transposeVector);
526 
527 } // namespace detail
528 
529 /// Mechanically hoist padding operations on tensors by `numLoops` into a new,
530 /// generally larger tensor. This achieves packing of multiple padding ops into
531 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
532 /// in the packing loop so the caller can continue reasoning about the padding
533 /// operation. If `transposeVector` is non-empty, hoist padding introduces a
534 /// GenericOp to transpose the padded tensor before inserting it into the packed
535 /// tensor. A `transposeVector` can change the storage order of the padded
536 /// tensor but does not change the order of the pack or compute loops.
537 ///
538 /// TODO: In the future, we should consider rewriting as a tensor.pack after
539 /// hoisting since this abstraction is now available.
540 ///
541 /// Example in pseudo-mlir:
542 /// =======================
543 ///
544 /// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
545 /// ```
546 /// scf.for (%i, %j, %k)
547 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
548 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
549 /// ^bb0( ... ):
550 /// linalg.yield %pad
551 /// } : tensor<?x?xf32> to tensor<4x8xf32>
552 /// compute(%0)
553 /// ```
554 ///
555 /// IR resembling the following is produced:
556 ///
557 /// ```
558 /// scf.for (%i) {
559 /// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32>
560 /// %packed = scf.for (%k) iter_args(%p : %packed_init) {
561 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
562 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
563 /// ^bb0( ... ):
564 /// linalg.yield %pad
565 /// } : tensor<?x?xf32> to tensor<4x8xf32>
566 /// %1 = tensor.insert_slice %0 ...
567 /// : tensor<4x8xf32> to tensor<?x4x8xf32>
568 /// scf.yield %1: tensor<?x4x8xf32>
569 /// } -> tensor<?x4x8xf32>
570 /// scf.for (%j, %k) {
571 /// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
572 /// tensor<?x4x8xf32> to tensor<4x8xf32>
573 /// compute(%st0)
574 /// }
575 /// }
576 /// ```
577 FailureOr<Value>
578 hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
579  int64_t numLoops, ArrayRef<int64_t> transposeVector,
580  tensor::PadOp &hoistedOp,
581  SmallVectorImpl<GenericOp> &transposeOps);
582 /// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
583 FailureOr<Value>
584 hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
585  ArrayRef<int64_t> transposeVector,
586  tensor::PadOp &hoistedOp,
587  SmallVectorImpl<GenericOp> &transposeOps);
588 
589 /// Apply padding and hoisting to `linalgOp` according to the configuration
590 /// specified in `options`.
591 FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter,
592  LinalgOp linalgOp,
593  const LinalgPaddingOptions &options);
594 
595 /// Split the given `op` into two parts along the given iteration space
596 /// `dimension` at the specified `splitPoint`, and return the two parts.
597 /// If the second part is statically known to be empty, do not create it
598 /// and return nullptr instead. Error state is signalled by returning
599 /// a pair of nullptrs.
600 ///
601 /// For example, the following op:
602 ///
603 /// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>)
604 /// outs(%2 : tensor<128x64xf32>)
605 ///
606 /// split along the first dimension at position 42 will result in:
607 ///
608 /// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1]
609 /// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1]
610 /// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>)
611 /// outs(%4 : tensor<42x64xf32>)
612 /// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1]
613 ///
614 /// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1]
615 /// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1]
616 /// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>)
617 /// outs(%8 : tensor<86x64xf32>)
618 /// tensor.insert_slice %9 into %6[42, 0][86, 64][1, 1]
619 ///
620 /// Note that there is no simplification other than constant propagation applied
621 /// to slice extraction and insertion.
622 std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter,
623  TilingInterface op,
624  unsigned dimension,
625  OpFoldResult splitPoint);
626 
627 /// Perform standalone tiling of a single LinalgOp by `tileSizes`.
628 /// and permute the loop nest according to `interchangeVector`
629 /// The permutation is expressed as a list of integers that specify
630 /// the new ordering of the loop nest. The length of `interchangeVector`
631 /// must be equal to the length of `tileSizes`.
632 /// An empty vector is interpreted as the identity permutation and the
633 /// transformation returns early.
634 ///
635 /// Return a struct containing the tiled loops in the specified order
636 /// and the cloned op if successful, std::nullopt otherwise.
637 ///
638 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by
639 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
640 /// integers, in the range 0..`tileSizes.size()` without duplications
641 /// (i.e. `[1,1,2]` is an invalid permutation).
642 struct TiledLinalgOp {
643  LinalgOp op;
644  SmallVector<Operation *, 8> loops;
645  SmallVector<Value, 4> tensorResults;
646 };
647 FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op,
648  const LinalgTilingOptions &options);
649 
650 /// Interchange the `iterator_types` and `iterator_maps` dimensions and adapts
651 /// the index accesses of `op`. This is an in-place transformation controlled
652 /// by `interchangeVector`. An empty vector is interpreted as the identity
653 /// permutation and the transformation returns early.
654 ///
655 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with
656 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
657 /// integers, in the range 0..`op.rank` without duplications
658 /// (i.e. `[1,1,2]` is an invalid permutation).
659 ///
660 /// Return failure if the permutation is not valid.
661 FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
662  GenericOp genericOp,
663  ArrayRef<unsigned> interchangeVector);
664 
665 /// Create a GenericOp from the given named operation `namedOp` and replace
666 /// namedOp.
667 /// Return failure if `namedOp` is a GenericOp or misses a region builder.
668 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
669  LinalgOp namedOp);
670 
671 /// Create a new buffer using the `allocationFn` provided. The size of this
672 /// buffer is the smallest constant bounding size along each dimension that
673 /// can be computed for the size of the result of `subView`. Returns the
674 /// allocated buffer as `fullLocalView` and the view that matches the size of
675 /// the result of subview operation as `partialLocalView`.
676 struct PromotionInfo {
677  Value fullLocalView;
678  Value partialLocalView;
679 };
680 FailureOr<PromotionInfo>
681 promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
682  const AllocBufferCallbackFn &allocationFn,
683  DataLayout &layout);
684 
685 /// Promote the `subViews` into a new buffer allocated at the insertion point
686 /// `b`. Promotion occurs in 3 steps:
687 /// 1. Create a new buffer for a full tile (i.e. not clipped at the
688 /// boundary).
689 /// 2. Take a full view on the buffer.
690 /// 3. Take a partial slice of the full view in step 2. and copy into it.
691 ///
692 /// Return the modified linalg op (the modification happens in place) as well
693 /// as all the copy ops created.
694 FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
695  const LinalgPromotionOptions &options);
696 
697 /// Allocate the subview in the GPU workgroup memory.
698 std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
699  memref::SubViewOp subview,
700  ArrayRef<Value> sizeBounds,
701  DataLayout &);
702 
703 /// In case of GPU group memory there is no need to deallocate.
704 LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
705 
706 /// Create Memref copy operations and add gpu barrier guards before and after
707 /// the copy operation to ensure data integrity.
708 LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
709 
710 /// Allocate the subview in the GPU private memory.
711 std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
712  memref::SubViewOp subview,
713  ArrayRef<Value> sizeBounds,
714  DataLayout &);
715 
716 /// Normal copy between src and dst.
717 LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
718 
719 /// In case of GPU private memory there is no need to deallocate since the
720 /// memory is freed when going outside of the scope.
721 LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
722 
723 /// Emit a suitable vector form for an operation. If provided,
724 /// `inputVectorSizes` are used to vectorize this operation. `inputVectorSizes`
725 /// must match the rank of the iteration space of the operation and the sizes
726 /// must be less than or equal to their counterpart iteration space sizes, if
727 /// static. `inputVectorSizes` also allows the vectorization of operations with
728 /// dynamic shapes.
729 LogicalResult vectorize(RewriterBase &rewriter, Operation *op,
730  ArrayRef<int64_t> inputVectorSizes = {},
731  ArrayRef<bool> inputScalableVecDims = {},
732  bool vectorizeNDExtract = false);
733 
734 /// Emit a suitable vector form for a Copy op with fully static shape.
735 LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
736 
737 /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`.
738 FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter,
739  LinalgOp linalgOp);
740 
741 /// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`.
742 FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter,
743  LinalgOp linalgOp);
744 
745 /// Emit a loop nest of `affine.for` with the proper body for `linalgOp`.
746 FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter,
747  LinalgOp linalgOp);
748 
749 /// Creates a number of ranges equal to the number of non-zero in `tileSizes`.
750 /// One for each loop of the LinalgOp that is tiled. The `tileSizes` argument
751 /// has one entry per surrounding loop. It uses zero as the convention that a
752 /// particular loop is not tiled. This convention simplifies implementations
753 /// by avoiding affine map manipulations. The returned ranges correspond to
754 /// the loop ranges, in the proper order, that are tiled and for which new
755 /// loops will be created. Also the function returns a map from loop indices
756 /// of the LinalgOp to the corresponding non-empty range indices of newly
757 /// created loops.
758 using LoopIndexToRangeIndexMap = DenseMap<int, int>;
759 std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
760 makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,
761  ArrayRef<OpFoldResult> allShapeSizes,
762  ArrayRef<OpFoldResult> allTileSizes);
763 
764 namespace detail {
765 template <typename T>
766 struct MultiSizeSpecificationBase {
767  /// Tile sizes.
768  T lowTileSize, highTileSize;
769  /// Number of tiles associated with each size.
770  T lowTripCount, highTripCount;
771 };
772 } // namespace detail
773 
774 /// A description of a multi-size tiling comprising tile sizes and numbers of
775 /// tiles, expressed as Values which may or may not be constant. Multi-size
776 /// currently means two-size.
777 struct MultiSizeSpecification
778  : public detail::MultiSizeSpecificationBase<Value> {};
779 struct StaticMultiSizeSpecification
780  : public detail::MultiSizeSpecificationBase<int64_t> {};
781 
782 /// Emits the IR computing the multi-sized tiling specification with two tile
783 /// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such
784 /// that there exist numbers of tiles with these sizes that fully cover the
785 /// given iteration space `dimension` of the structured `op`.
786 ///
787 /// The computation is as follows:
788 ///
789 /// b = originalTripCount floordiv sizeDivisor
790 /// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor
791 /// d = (b + t - 1) floordiv t
792 /// s = (b floordiv d) * sizeDivisor
793 /// v = b % d
794 /// u = d - v
795 ///
796 /// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of
797 /// the corresponding tiles are `u` and `v`, respectively. Alternatively,
798 ///
799 /// s * u + (s + sizeDivisor) * v == original size,
800 /// where s mod sizeDivisor = 0.
801 ///
802 /// Expects all values to be positive. In some cases with the target tile size
803 /// sufficiently close to the dimension shape and non-unit divisor, it is
804 /// impossible to compute such sizes. If `emitAssertion` is set, also emit the
805 /// assertion that size computation succeeded.
806 ///
807 /// Returns the specification consisting of both tile values and the number of
808 /// tiles of each size.
809 FailureOr<MultiSizeSpecification>
810 computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension,
811  OpFoldResult targetSize, OpFoldResult divisor,
812  bool emitAssertions = true);
813 FailureOr<StaticMultiSizeSpecification>
814 computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize,
815  int64_t divisor);
816 
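// Worked example of the computation above (illustrative, not part of the
// upstream header). For an original trip count of 99, targetSize = 21 and
// sizeDivisor = 3:
//
//   b = 99 floordiv 3 = 33
//   t = (21 + 3 - 1) floordiv 3 = 7
//   d = (33 + 7 - 1) floordiv 7 = 5
//   s = (33 floordiv 5) * 3 = 18
//   v = 33 mod 5 = 3,  u = 5 - 3 = 2
//
// i.e. 2 tiles of size 18 and 3 tiles of size 21, and indeed
// 18 * 2 + 21 * 3 == 99, so the full iteration space is covered.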
817 /// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying
818 /// tiling by `numThreads`.
819 /// If non-empty, the `mapping` is added as an attribute to the
820 /// resulting `scf.forall`.
821 /// Zero tile sizes indicate that the dimension is not tiled, and can be
822 /// thought of as tiling by the full size of data. It is the user's
823 /// responsibility to ensure that `numThreads` is a valid tiling specification
824 /// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case).
825 struct ForallTilingResult {
826  Operation *tileOp;
827  Operation *tiledOp;
828 };
829 FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder,
830  TilingInterface op,
831  ArrayRef<OpFoldResult> numThreads,
832  std::optional<ArrayAttr> mapping);
833 
834 /// Same as `tileToForallOp`, but calculate the number of threads
835 /// required using the given tileSizes.
836 FailureOr<ForallTilingResult>
837 tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
838  ArrayRef<OpFoldResult> tileSizes,
839  std::optional<ArrayAttr> mapping);
840 
841 /// Transformation information returned after reduction tiling.
842 struct ForallReductionTilingResult {
843  /// The partial reduction tiled op generated.
844  Operation *parallelTiledOp;
845  /// The final reduction operation merging all the partial reductions.
846  Operation *mergeOp;
847  /// The op initializing the tensor used for partial reductions.
848  Operation *initialOp;
849  /// The `scf.forall` operation that iterates over the tiles.
850  scf::ForallOp loops;
851 };
852 
853 /// Method to tile a reduction to parallel iterations computing partial
854 /// reductions. After the loop all the partial reduction are merged into a final
855 /// reduction. For example for the following sequence
856 ///
857 /// ```mlir
858 /// %0 = linalg.generic %in ["parallel", "reduction"]
859 /// : tensor<7x9xf32> -> tensor<7xf32>
860 /// ```
861 ///
862 /// into:
863 ///
864 /// ```mlir
865 /// %0 = linalg.fill ... : tensor<7x4xf32>
866 /// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0)
867 /// -> (tensor<7x4xf32>) {
868 /// %2 = tensor.extract_slice %arg3 : tensor<7x4xf32> to tensor<7xf32>
869 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
870 /// %4 = linalg.generic %2, %3 ["parallel", "reduction"]
871 /// : tensor<7x?xf32> -> tensor<7xf32>
872 /// %5 = tensor.insert_slice %3, %arg0[0, %iv] : tensor<7x4xf32>
873 /// }
874 /// %6 = linalg.generic %1 ["parallel", "reduction"]
875 /// : tensor<7x4xf32> -> tensor<7xf32>
876 /// ```
877 FailureOr<ForallReductionTilingResult>
878 tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op,
879  ArrayRef<OpFoldResult> numThreads,
880  ArrayRef<OpFoldResult> tileSizes = {},
881  std::optional<ArrayAttr> mapping = std::nullopt);
882 
883 /// All indices returned by IndexOp should be invariant with respect to
884 /// tiling. Therefore, if an operation is tiled, we have to transform the
885 /// indices accordingly, i.e. offset them by the values of the corresponding
886 /// induction variables that are captured implicitly in the body of the op.
887 ///
888 /// Example. `linalg.generic` before tiling:
889 ///
890 /// #id_2d = (i, j) -> (i, j)
891 /// #pointwise_2d_trait = {
892 /// indexing_maps = [#id_2d, #id_2d],
893 /// iterator_types = ["parallel", "parallel"]
894 /// }
895 /// linalg.generic #pointwise_2d_trait %operand, %result {
896 /// ^bb0(%operand_in: f32, %result_in: f32):
897 /// %i = linalg.index 0 : index
898 /// %j = linalg.index 1 : index
899 /// <some operations that use %i, %j>
900 /// }: memref<50x100xf32>, memref<50x100xf32>
901 ///
902 /// After tiling pass with tiles sizes 10 and 25:
903 ///
904 /// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2)
905 ///
906 /// %c1 = arith.constant 1 : index
907 /// %c0 = arith.constant 0 : index
908 /// %c25 = arith.constant 25 : index
909 /// %c10 = arith.constant 10 : index
910 /// operand_dim_0 = dim %operand, 0 : memref<50x100xf32>
911 /// operand_dim_1 = dim %operand, 1 : memref<50x100xf32>
912 /// scf.for %k = %c0 to operand_dim_0 step %c10 {
913 /// scf.for %l = %c0 to operand_dim_1 step %c25 {
914 /// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1]
915 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
916 /// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1]
917 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
918 /// linalg.generic pointwise_2d_trait %4, %5 {
919 /// ^bb0(%operand_in: f32, %result_in: f32):
920 /// %i = linalg.index 0 : index
921 /// %j = linalg.index 1 : index
922 /// // Indices `k` and `l` are implicitly captured in the body.
923 /// %transformed_i = arith.addi %i, %k : index // index `i` is offset by %k
924 /// %transformed_j = arith.addi %j, %l : index // index `j` is offset by %l
925 /// // Every use of %i, %j is replaced with %transformed_i, %transformed_j
926 /// <some operations that use %transformed_i, %transformed_j>
929 /// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided>
930 /// }
931 /// }
932 ///
933 /// TODO: Investigate whether mixing implicit and explicit indices
934 /// does not lead to losing information.
935 void transformIndexOps(RewriterBase &b, LinalgOp op,
936  SmallVectorImpl<Value> &ivs,
937  const LoopIndexToRangeIndexMap &loopIndexToRangeIndex);
938 
939 /// Apply transformation to split the single linalg op reduction into a
940 /// parallel and reduction dimension. Then create a new linalg.generic op
941 /// doing the rest of the reduction. Return the new linalg op with an extra
942 /// parallel dimension or failure if the transformation didn't happen.
943 ///
944 /// Example:
945 /// ```
946 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
947 /// affine_map<(d0) -> ()>],
948 /// iterator_types = ["reduction"]}
949 /// ins(%in : tensor<32xf32>)
950 /// outs(%out : tensor<f32>) {
951 /// ^bb0(%arg1: f32, %arg2: f32):
952 /// %y = arith.addf %arg1, %arg2 : f32
953 /// linalg.yield %y : f32
954 /// } -> tensor<f32>
955 /// ```
956 /// To:
957 /// ```
958 /// %cst = arith.constant 0.000000e+00 : f32
959 /// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
960 /// %1 = tensor.empty [4] : tensor<4xf32>
961 /// %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
962 /// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
963 /// affine_map<(d0, d1) -> (d0)>],
964 /// iterator_types = ["parallel", "reduction"]}
965 /// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
966 /// ^bb0(%arg3: f32, %arg4: f32):
967 /// %5 = arith.addf %arg3, %arg4 : f32
968 /// linalg.yield %5 : f32
969 /// } -> tensor<4xf32>
970 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
971 /// affine_map<(d0) -> ()>],
972 /// iterator_types = ["reduction"]}
973 /// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) {
974 /// ^bb0(%arg3: f32, %arg4: f32):
975 /// %5 = arith.addf %arg3, %arg4 : f32
976 /// linalg.yield %5 : f32
977 /// } -> tensor<f32>
978 /// ```
979 struct SplitReductionResult {
980  Operation *initOrAlloc;
981  FillOp fillOp;
982  LinalgOp splitLinalgOp;
983  LinalgOp resultCombiningLinalgOp;
984 };
985 FailureOr<SplitReductionResult>
986 splitReduction(RewriterBase &b, LinalgOp op,
987  const ControlSplitReductionFn &controlSplitReductionFn,
988  bool useAlloc = false);
989 
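// Illustrative usage sketch (not part of the upstream header): splitting a
// reduction by a factor of 4 along a new outer dimension using a control
// callback. `rewriter` and `linalgOp` are assumed to be available.
//
//   ControlSplitReductionFn control = [](LinalgOp op) {
//     // ratio, index of the extra dimension, innerParallel.
//     return SplitReductionOptions{/*ratio=*/4, /*index=*/0,
//                                  /*innerParallel=*/false};
//   };
//   FailureOr<SplitReductionResult> result =
//       splitReduction(rewriter, linalgOp, control);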
990 /// Scaling-based implementation of the split reduction transformation.
991 /// Instead of introducing an ExpandShapeOp, this rewrites a reduction
992 /// dimension `k` into `k * scale + kk`.
993 ///
994 /// Example:
995 /// ```
996 /// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
997 /// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
998 /// ```
999 ///
1000 /// Is transformed to:
1001 ///
1002 /// ```
1003 /// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
1004 /// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
1005 /// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
1006 /// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
1007 /// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
1008 /// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
1009 /// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32>
1010 /// %cst = arith.constant 0.000000e+00 : f32
1011 /// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
1012 /// tensor<16x32x64xf32>
1013 /// %2 = tensor.empty [64, 4] : tensor<64x4xi1>
1014 ///
1015 /// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
1016 /// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
1017 /// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>,
1018 /// tensor<64x4xi1>)
1019 /// outs(%1 : tensor<16x32x64xf32>) {
1020 /// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
1021 /// %5 = arith.mulf %arg3, %arg4 : f32
1022 /// %6 = arith.addf %arg6, %5 : f32
1023 /// linalg.yield %6 : f32
1024 /// } -> tensor<16x32x64xf32>
1025 ///
1026 /// %4 = linalg.generic {indexing_maps = [#map4, #map5],
1027 /// iterator_types = ["parallel", "parallel", "reduction"]}
1028 /// ins(%3 : tensor<16x32x64xf32>)
1029 /// outs(%C : tensor<16x32xf32>) {
1030 /// ^bb0(%arg3: f32, %arg4: f32):
1031 /// %5 = arith.addf %arg3, %arg4 : f32
1032 /// linalg.yield %5 : f32
1033 /// } -> tensor<16x32xf32>
1034 ///
1035 /// return %4 : tensor<16x32xf32>
1036 /// ```
1037 FailureOr<SplitReductionResult>
1038 splitReductionByScaling(RewriterBase &b, LinalgOp op,
1039  const ControlSplitReductionFn &controlSplitReductionFn,
1040  bool useAlloc = false);
1041 
1042 /// Return `true` if a given sequence of dimensions is contiguous in the
1043 /// range of the specified indexing map.
1044 bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence);
1045 /// Return `true` if all sequences of dimensions specified in `dimSequences` are
1046 /// contiguous in all the ranges of the `maps`.
1047 bool areDimSequencesPreserved(ArrayRef<AffineMap> maps,
1048  ArrayRef<ReassociationIndices> dimSequences);
1049 
1050 /// Collapses dimensions of linalg.generic operation. A precondition to
1051 /// calling this method is that for each list in `foldedIterationDim`, the
1052 /// sequence of dimensions is contiguous in domains of all `indexing_maps` of
1053 /// the `genericOp`. This can be checked using `areDimSequencePreserved` method.
1054 /// When valid, the method also collapses the operands of the op. Returns
1055 /// replacement values of the results of the original `genericOp` by inserting
1056 /// reshapes to get back values of compatible types.
1058  GenericOp genericOp, ArrayRef<ReassociationIndices> foldedIterationDims,
1059  RewriterBase &rewriter);
1060 
1061 struct LowerPackResult {
1062  tensor::PadOp padOp;
1063  tensor::ExpandShapeOp expandShapeOp;
1064  linalg::TransposeOp transposeOp;
1065 };
1066 
1067 /// Rewrite pack as pad + reshape + transpose.
1068 FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter,
1069  tensor::PackOp packOp);
1070 
1071 struct LowerUnPackOpResult {
1072  tensor::EmptyOp emptyOp;
1073  linalg::TransposeOp transposeOp;
1074  tensor::CollapseShapeOp collapseShapeOp;
1075  tensor::ExtractSliceOp extractSliceOp;
1076 };
1077 
1078 /// Rewrite pack as empty + transpose + reshape + extract_slice.
1079 FailureOr<LowerUnPackOpResult> lowerUnPack(RewriterBase &rewriter,
1080  tensor::UnPackOp unPackOp);
1081 
1082 /// Struct to hold the result of a `pack` call.
1083 struct PackResult {
1084  SmallVector<tensor::PackOp> packOps;
1085  linalg::LinalgOp packedLinalgOp;
1086  SmallVector<tensor::UnPackOp> unPackOps;
1087 };
1088 /// Implement packing of a single LinalgOp by `packedSizes`.
1089 /// There must be one packedSizes entry per `linalgOp` iterator.
1090 /// Return the packed Linalg op on success, failure otherwise.
1091 FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1092  ArrayRef<OpFoldResult> packedSizes);
1093 
1094 /// Struct to hold the result of a `packTranspose` call.
1095 struct PackTransposeResult {
1096  tensor::PackOp transposedPackOp;
1097  linalg::LinalgOp transposedLinalgOp;
1098  tensor::UnPackOp transposedUnPackOp;
1099 };
1100 /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the
1101 /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements.
1102 /// Return failure if either:
1103 /// 1. the `packOp` does not have the `linalgOp` as its unique use.
1104 /// 2. the `maybeUnPackOp`, if specified must be a consumer of the result tied
1105 /// to the unique `packOp` use.
1106 /// 3. `outerPerm` (resp. `innerPerm`) must be valid permutations of
1107 /// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty.
1108 FailureOr<PackTransposeResult>
1109 packTranspose(RewriterBase &rewriter, tensor::PackOp packOp,
1110  linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp,
1111  ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm);
1112 
1113 /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m
1114 /// and n are proper parallel dimensions and k is a proper reduction
1115 /// dimension. Packing occurs by rewriting the op as a linalg.generic and
1116 /// calling linalg::pack by `mnkPackedSizes`. The order of the packed
1117 /// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2}
1118 /// to reorder {m, n, k} into one of the 8 possible forms. The outer
1119 /// dimensions of the operands are not permuted at this time, this is left for
1120 /// future work.
1121 FailureOr<PackResult>
1122 packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp,
1123  ArrayRef<OpFoldResult> mnkPackedSizes,
1124  ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf,
1125  ArrayRef<int64_t> mnkOrder);
1126 
1127 /// Rewrite tensor.from_elements to linalg.generic.
1128 FailureOr<Operation *>
1129 rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1130  tensor::FromElementsOp fromElementsOp);
1131 
1132 /// Rewrite tensor.generate to linalg.generic.
1133 FailureOr<Operation *>
1134 rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1135  tensor::GenerateOp generateOp);
1136 
1137 /// Rewrite tensor.pad to linalg.generic + tensor.insert_slice.
1138 FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1139  tensor::PadOp padOp);
1140 
1141 /// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing)
1142 /// and linalg.matmul.
1143 ///
1144 /// A convolution operation can be written as a matrix-matrix multiplication by
1145 /// unfolding the cross-correlation between input and filter and explicitly
1146 /// copying overlapped sliding window inputs.
1147 ///
1148 /// Consider 2D input X with single channel input and output and 2x2 filter W:
1149 /// [x(0, 0) , x(0, 1) , ..., x(0, n) ]
1150 /// [x(1, 0) , x(1, 1) , ..., x(1, n) ]
1151 /// [. , . ,. , . ] [w(0, 0), w(0, 1)]
1152 /// [. , . , . , . ] (conv) [w(1, 0), w(1, 1)]
1153 /// [. , . , ., . ]
1154 /// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)]
1155 ///
1156 /// The packed input data (img2col) is a matrix with |rows| = output spatial
1157 /// size, |columns| = filter spatial size. To compute the output Y(i, j) we need
1158 /// to calculate the dot product between the filter window at input X(x, y) and the
1159 /// filter which will look like the following where r.h.s is the img2col matrix
1160 /// and l.h.s is the flattened filter:
1161 ///
1162 /// [x(0,0), x(0,1), x(1,0), x(1,1)]
1163 /// [x(0,1), x(1,1), x(0,2), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)]
1164 /// [x(0,1), x(1,1), x(0,2), x(1,2)]
1165 /// [ . , . , . , . ]
1166 ///
1167 /// In general for 2D case with (N, H, W, C) input and (Kh, Kw, C, D) filter
1168 /// and output (N, Ho, Wo, D) the convolution is the following matrix-matrix
1169 /// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each input in
1170 /// the N inputs. For the case where N > 1 it is a batched matrix-matrix
1171 /// multiplication.
1172 ///
1173 /// On success, return both the operation that produces the img2col tensor and
1174 /// the final operation of the sequence that replaces the original convolution.
1175 FailureOr<std::pair<Operation *, Operation *>>
1176 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp);
1177 
1178 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except there is no
1179 /// reduction among the input channels so each convolution can be a
1180 /// matrix-vector product and by transposing both input filter so channels are
1181 /// outer most the computation is a batched matrix-vector product.
1182 FailureOr<std::pair<Operation *, Operation *>>
1183 rewriteInIm2Col(RewriterBase &rewriter,
1184  linalg::DepthwiseConv2DNhwcHwcOp convOp);
1185 
1186 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except because the
1187 /// channels are to the left of the image shape dimensions, the position of the
1188 /// contraction dimension in the resulting matmul is reversed. This swaps the
1189 /// LHS and RHS of the matmul when compared with nhwc (i.e. (D, C x Kh x Kw) *
1190 /// (C x Kh x Kw, Ho x Wo))
1191 FailureOr<std::pair<Operation *, Operation *>>
1192 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp);
1193 
1194 //===----------------------------------------------------------------------===//
1195 // Rewrite patterns wrapping transformations.
1196 // TODO: every single such pattern should be a close to noop wrapper around a
1197 // functional-style API call.
1198 //===----------------------------------------------------------------------===//
1199 
1200 /// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D
1201 /// convolution ops.
1202 template <typename Conv2DOp, typename Conv1DOp>
1204  : public OpRewritePattern<Conv2DOp> {
1206 
1208  PatternRewriter &rewriter) const;
1209 
1211  PatternRewriter &rewriter) const override {
1212  return returningMatchAndRewrite(convOp, rewriter);
1213  }
1214 };
1215 
1216 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
1217  Conv1DNwcWcfOp>;
1218 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
1219  Conv1DNcwFcwOp>;
1220 
1221 /// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh)
1222 /// dimensions into 1-D depthwise convolution ops.
1224  : public OpRewritePattern<DepthwiseConv2DNhwcHwcOp> {
1226  PatternBenefit benefit = 1)
1227  : OpRewritePattern<DepthwiseConv2DNhwcHwcOp>(context, benefit) {}
1228 
1230  returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1231  PatternRewriter &rewriter) const;
1232 
1233  LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1234  PatternRewriter &rewriter) const override {
1235  return returningMatchAndRewrite(convOp, rewriter);
1236  }
1237 };
1238 
1239 struct DownscaleConv2DOp final : public OpRewritePattern<Conv2DOp> {
1241  : OpRewritePattern<Conv2DOp>(context, benefit) {}
1242 
1244  PatternRewriter &rewriter) const;
1245 
1247  PatternRewriter &rewriter) const override {
1248  return returningMatchAndRewrite(convOp, rewriter);
1249  }
1250 };
1251 
1252 ///
1253 /// Linalg generalization pattern.
1254 ///
1255 /// Apply the `generalization` transformation as a pattern.
1256 /// See `generalization` for more details.
1257 //
1258 // TODO: Automatic default pattern class that just unwraps a function
1259 // returning FailureOr<GenericOp>.
1261  : public OpInterfaceRewritePattern<LinalgOp> {
1263 
1264  /// `matchAndRewrite` implementation that returns the significant
1265  /// transformed pieces of IR.
1267  returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const {
1268  return generalizeNamedOp(rewriter, op);
1269  }
1270 
1272  PatternRewriter &rewriter) const override {
1273  return returningMatchAndRewrite(op, rewriter);
1274  }
1275 };
1276 
1277 /// Vectorization pattern for memref::CopyOp.
1278 struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> {
1280 
1281  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
1282  PatternRewriter &rewriter) const override;
1283 };
1284 
1285 using OptimizeCopyFn =
1286  std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>;
1287 
1288 /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
1289 /// InsertSliceOp. For now, only constant padding values are supported.
1290 /// `OptimizeCopyFn` can be used to customize copying step optimization.
1291 struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> {
1293  OptimizeCopyFn optimizeCopyFn = nullptr,
1294  PatternBenefit benefit = 1)
1295  : OpRewritePattern<tensor::PadOp>(context, benefit),
1296  optimizeCopyFn(std::move(optimizeCopyFn)) {}
1297  LogicalResult matchAndRewrite(tensor::PadOp padOp,
1298  PatternRewriter &rewriter) const override;
1299 
1300 protected:
1302  Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp,
1303  Value dest,
1304  const SmallVector<Value> &dynSizes) const;
1305 };
1306 
1307 /// Rewrites a tensor::PackOp into a sequence of tensor.pad + linalg.transpose +
1308 /// tensor.insert_slice ops, where the tensor::PackOp has outer dims being all
1309 /// 1s.
1311  : public OpRewritePattern<tensor::PackOp> {
1313  LogicalResult matchAndRewrite(tensor::PackOp packOp,
1314  PatternRewriter &rewriter) const override;
1315 };
1316 
1317 /// Rewrites a tensor::UnPackOp into a sequence of rank-reduced extract_slice op
1318 /// + transpose op + insert_slice op, where the tensor::UnPackOp has outer dims
1319 /// being all 1s.
1321  : public OpRewritePattern<tensor::UnPackOp> {
1323  LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp,
1324  PatternRewriter &rewriter) const override;
1325 };
1326 
1327 /// Match and rewrite for the pattern:
1328 /// ```
1329 /// %alloc = ...
1330 /// [optional] %view = memref.view %alloc ...
1331 /// %subView = subview %allocOrView ...
1332 /// [optional] linalg.fill(%allocOrView, %cst) ...
1333 /// ...
1334 /// memref.copy(%in, %subView) ...
1335 /// vector.transfer_read %allocOrView[...], %cst ...
1336 /// ```
1337 /// into
1338 /// ```
1339 /// [unchanged] %alloc = ...
1340 /// [unchanged] [optional] %view = memref.view %alloc ...
1341 /// [unchanged] [unchanged] %subView = subview %allocOrView ...
1342 /// ...
1343 /// vector.transfer_read %in[...], %cst ...
1344 /// ```
1345 /// Where there is no interleaved use between memref.copy and transfer_read as
1346 /// well as no interleaved use between linalg.fill and memref.copy (if
1347 /// linalg.fill is specified).
1348 /// This is a custom rewrite to forward partial reads (with optional fills) to
1349 /// vector.transfer_read.
1351  : public OpRewritePattern<vector::TransferReadOp> {
1353 
1354  LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
1355  PatternRewriter &rewriter) const override;
1356 };
1357 
1358 /// Match and rewrite for the pattern:
1359 /// ```
1360 /// %alloc = ...
1361 /// [optional] %view = memref.view %alloc ...
1362 /// %subView = subview %allocOrView...
1363 /// ...
1364 /// vector.transfer_write %..., %allocOrView[...]
1365 /// memref.copy(%subView, %out)
1366 /// ```
1367 /// into
1368 /// ```
1369 /// [unchanged] %alloc = ...
1370 /// [unchanged] [optional] %view = memref.view %alloc ...
1371 /// [unchanged] %subView = subview %allocOrView...
1372 /// ...
1373 /// vector.transfer_write %..., %out[...]
1374 /// ```
1375 /// Where there is no interleaved use between transfer_write and memref.copy.
1376 /// This is a custom rewrite to forward partial writes to
1377 /// vector.transfer_write.
1379  : public OpRewritePattern<vector::TransferWriteOp> {
1381 
1382  LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
1383  PatternRewriter &rewriter) const override;
1384 };
1385 
1386 /// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
1388  : public OpRewritePattern<tensor::ExtractSliceOp> {
1389  /// A function to control pattern application and rewrite logic.
1390  ///
1391  /// The function will be given the slice op and should return:
1392  /// - std::nullopt: to fail the match and not apply the pattern;
1393  /// - true: to apply the pattern with zero slice guard;
1394  /// - false: to apply the pattern without zero slice guard.
1395  ///
1396  /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice
1397  /// guard.
1398  using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>;
1399 
1401  ControlFn controlFn = nullptr,
1402  PatternBenefit benefit = 1)
1403  : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {}
1404 
1405  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
1406  PatternRewriter &rewriter) const override;
1407 
1408 private:
1409  ControlFn controlFn;
1410 };
1411 
1412 //===----------------------------------------------------------------------===//
1413 // Populate functions.
1414 //===----------------------------------------------------------------------===//
1415 
1416 /// Canonicalization patterns relevant to apply after tiling patterns. These
1417 /// are applied automatically by the tiling pass but need to be applied
1418 /// manually when tiling is called programmatically.
1421 
1422 /// Linalg generalization patterns
1423 
1424 /// Populates `patterns` with patterns to convert spec-generated named ops to
1425 /// linalg.generic ops.
1427 
1428 /// Linalg decompose convolutions patterns
1429 
1430 /// Populates patterns to decompose high-D convolution ops into low-D ones.
1431 /// This is a step in progressive lowering for convolution ops, afterwards we
1432 /// can vectorize the low-D convolution ops.
1434  PatternBenefit benefit = 1);
1435 
1436 /// Populates patterns to transform linalg.conv_2d_xxx operations into
1437 /// linalg.generic (for img2col packing) and linalg.matmul.
1438 /// \see rewriteInIm2Col for more details.
1440 
1441 /// Populates `patterns` with patterns that vectorize tensor.pad.
1442 /// These patterns are meant to apply in a complementary fashion. Benefits
1443 /// are used to encode a certain ordering of pattern application. To avoid
1444 /// scattering magic constants throughout the code base, the patterns must be
1445 /// added with this function. `baseBenefit` can be used to offset the benefit
1446 /// of all tensor::PadOp vectorization patterns by a certain value.
1448  PatternBenefit baseBenefit = 1);
1449 
1450 /// Populate patterns for splitting a `LinalgOp` with multiple statements within
1451 /// its payload into multiple `GenericOp` that have a single statement.
1452 /// The option `removeDeadArgsAndResults` adds patterns to remove dead arguments
1453 /// and results from the generated decomposed ops. This defaults to `true` since
1454 /// the core decomposition patterns rely on these cleanup patterns. It is set
1455 /// to false only for testing purposes.
1457  bool removeDeadArgsAndResults = true);
1458 
1459 /// Populate patterns that convert non-destination-style ops to destination
1460 /// style ops.
1462 
1463 /// Populate patterns for vectorizing low-D convolution ops. This is a step in
1464 /// progressive lowering for convolution ops; it assumes high-D convolution ops
1465 /// were decomposed previously.
1467  PatternBenefit benefit = 1);
1468 
1469 /// Populate patterns that convert `ElementwiseMappable` ops to linalg
1470 /// parallel loops.
1472 
1473 /// Populate patterns that are only useful in the context of sparse tensors.
1475 
1476 /// Function type which is used to control when to stop fusion. It is expected
1477 /// that OpOperand is not modified in the callback. The OpOperand is not marked
1478 /// as const to allow callers to use non-const methods.
1479 using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>;
1480 
1481 /// Patterns for fusing linalg operation on tensors.
1482 
1483 /// Pattern to fuse `linalg.generic` -> `linalg.generic` operations
1484 /// when both operations are fusable elementwise operations.
1485 void populateElementwiseOpsFusionPatterns(
1486  RewritePatternSet &patterns,
1487  const ControlFusionFn &controlElementwiseOpFusion);
1488 
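// Illustrative usage sketch (not part of the upstream header): collecting the
// elementwise fusion patterns into a RewritePatternSet and applying them
// greedily inside a pass. `funcOp` is assumed; the control callback here
// simply allows every fusion opportunity.
//
//   RewritePatternSet patterns(funcOp.getContext());
//   ControlFusionFn controlFn = [](OpOperand *fusedOperand) { return true; };
//   populateElementwiseOpsFusionPatterns(patterns, controlFn);
//   if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns))))
//     signalPassFailure();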
1489 /// Function type which is used to control propagation of tensor.pack/unpack
1490 /// ops.
1491 using ControlPropagationFn = std::function<bool(Operation *op)>;
1492 
1493 /// Patterns to bubble up or down data layout ops across other operations.
1495  RewritePatternSet &patterns,
1496  const ControlPropagationFn &controlPackUnPackPropagation);
1497 
1498 /// Pattern to remove dead operands and results of `linalg.generic` operations.
1499 /// This is effectively DCE for a linalg op.
1501 
1502 /// Patterns to promote inputs to outputs and remove unused inputs of
1503 /// `linalg.generic` ops.
1505 
1506 /// Function type to control generic op dimension collapsing. It is expected
1507 /// to return an array of `ReassociationIndices` representing dimensions that
1508 /// should be merged.
1510  std::function<SmallVector<ReassociationIndices>(linalg::GenericOp)>;
1511 
1512 /// Pattern to collapse dimensions in a linalg.generic op. This will collapse
1513 /// tensor operands when needed and expand back the result tensors.
1514 void populateCollapseDimensions(
1515  RewritePatternSet &patterns,
1516  const GetCollapsableDimensionsFn &controlCollapseDimensions);
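A sketch of a `GetCollapsableDimensionsFn` (the choice of dimensions 1 and 2 and the helper name are illustrative assumptions): collapse two adjacent loop dimensions of any generic op with at least three loops.

    void addCollapseDimensionsPatterns(mlir::RewritePatternSet &patterns) {
      mlir::linalg::GetCollapsableDimensionsFn control =
          [](mlir::linalg::GenericOp op) {
            mlir::SmallVector<mlir::ReassociationIndices> collapse;
            // Merge loop dimensions 1 and 2 when the op has enough loops.
            if (op.getNumLoops() >= 3)
              collapse.push_back(mlir::ReassociationIndices{1, 2});
            return collapse;
          };
      mlir::linalg::populateCollapseDimensions(patterns, control);
    }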
1517 
1518 /// Patterns to fold an expanding (collapsing) tensor_reshape operation with its
1519 /// producer (consumer) generic operation by expanding the dimensionality of the
1520 /// loop in the generic op.
1521 void populateFoldReshapeOpsByExpansionPatterns(
1522  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1523 
1524 /// Patterns to fold an expanding tensor.expand_shape operation with its
1525 /// producer generic operation by collapsing the dimensions of the generic op.
1526 void populateFoldReshapeOpsByCollapsingPatterns(
1527  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1528 
1529 /// Patterns to constant fold Linalg operations.
1530 void populateConstantFoldLinalgOperations(RewritePatternSet &patterns,
1531  const ControlFusionFn &controlFn);
1532 
1533 /// Pattern to fuse a `tensor.pad` operation with the producer of its source,
1534 /// if the producer is a `linalg` operation with all parallel iterator types.
1535 void populateFuseTensorPadWithProducerLinalgOpPatterns(
1536  RewritePatternSet &patterns);
1537 
1538 /// Patterns to convert from one named op to another. These can be seen as
1539 /// canonicalizations of named ops into another named op.
1540 void populateLinalgNamedOpConversionPatterns(RewritePatternSet &patterns);
1541 
1542 /// Patterns to fold unit-extent dimensions in operands/results of linalg ops on
1543 /// tensors via reassociative reshape ops.
1544 void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns,
1545  ControlDropUnitDims &options);
1546 
1547 /// A pattern that converts init operands to input operands.
1548 void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns);
1549 
1550 /// Patterns that are used to inline constant operands into linalg generic ops.
1551 void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns);
1552 
1553 /// Patterns that are used to bubble up extract slice op above linalg op.
1554 void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns);
1555 
1556 /// Adds patterns that swap tensor.extract_slice(linalg.fill(%cst, %init)) into
1557 /// linalg.fill(%cst, tensor.extract_slice(%init)).
1558 void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns);
1559 
1560 /// Patterns to apply `splitReduction` below.
1561 void populateSplitReductionPattern(
1562  RewritePatternSet &patterns,
1563  const ControlSplitReductionFn &controlSplitReductionFn,
1564  bool useAlloc = false);
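A hedged sketch of a `ControlSplitReductionFn` (the field order of `SplitReductionOptions` is taken from its declaration earlier in this header, and the ratio of 8 is arbitrary): split every matched reduction by a fixed factor on its first reduction dimension.

    void addSplitReductionPatterns(mlir::RewritePatternSet &patterns) {
      mlir::linalg::ControlSplitReductionFn control =
          [](mlir::linalg::LinalgOp op) {
            // Split by an (arbitrary) ratio of 8 along dimension index 0,
            // keeping the extra parallel dimension outermost.
            return mlir::linalg::SplitReductionOptions{/*ratio=*/8, /*index=*/0,
                                                       /*innerParallel=*/false};
          };
      mlir::linalg::populateSplitReductionPattern(patterns, control);
    }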
1565 
1566 } // namespace linalg
1567 } // namespace mlir
1568 
1569 #endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H