MLIR  22.0.0git
Transforms.h
Go to the documentation of this file.
1 //===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
11 
12 #include <utility>
13 
23 #include "mlir/IR/OpDefinition.h"
24 #include "mlir/IR/PatternMatch.h"
27 #include "llvm/ADT/SmallBitVector.h"
28 #include "llvm/ADT/SmallSet.h"
29 
30 namespace mlir {
31 namespace bufferization {
32 class AllocTensorOp;
33 class OneShotAnalysisState;
34 class BufferizationState;
35 } // namespace bufferization
36 
37 namespace linalg {
38 
39 class LinalgOp;
40 enum class WinogradConv2DFmr : uint32_t;
41 
42 //===----------------------------------------------------------------------===//
43 // Utils.
44 //===----------------------------------------------------------------------===//
45 
46 /// Return vector::CombiningKind for the given op.
47 std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp);
48 
49 //===----------------------------------------------------------------------===//
50 // Bufferization-related transforms.
51 //===----------------------------------------------------------------------===//
52 
54  enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 };
55  AllocOp allocOp = AllocOp::MemrefAlloc;
56 
57  enum class MemcpyOp {
58  MaterializeInDestination = 0,
59  MemrefCopy = 1,
60  LinalgCopy = 2
61  };
62  MemcpyOp memcpyOp = MemcpyOp::MaterializeInDestination;
63 
64  /// If set to "true", only the destination tensor operands are bufferized to
65  /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the
66  /// targeted op itself.
67  bool bufferizeDestinationOnly = false;
68 
69  /// If set to "true", a `memref.dealloc` operation will be emitted for each
70  /// allocated buffer. Otherwise, the memory is leaked, which is useful if
71  /// the buffer deallocation pipeline should be run after bufferization is
72  /// done.
73  bool emitDealloc = false;
74 };
75 
76 /// Materialize a buffer allocation for the given tensor.pad op and lower the
77 /// op to linalg.fill/linalg.generic + bufferization.materialize_in_destination.
78 /// E.g.:
79 ///
80 /// %0 = tensor.pad low[%l] high[%h] %t ...
81 ///
82 /// is lowered to:
83 ///
84 /// %alloc = memref.alloc
85 /// linalg.fill ... outs(%alloc)
86 /// %subview = memref.subview %alloc [%l] [...] [1]
87 /// bufferization.materialize_in_destination %t in %subview
88 /// %0 = bufferization.to_tensor %alloc restrict writable
89 ///
90 /// In addition to rewriting the IR as shown above, this function returns the
91 /// newly allocated buffer. The `insertionPoint` parameter can be used to
92 /// specify a custom insertion point for the buffer allocation.
95  tensor::PadOp padOp, Attribute memorySpace = {},
96  Operation *insertionPoint = nullptr);
97 
98 /// Materialize a buffer allocation for the given vector.mask op and bufferize
99 /// the op, including its region. E.g.:
100 ///
101 /// %0 = vector.mask {
102 /// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32>
103 /// } : vector<16xi1> -> tensor<?xf32>
104 ///
105 /// is lowered to:
106 ///
107 /// %alloc = memref.alloc
108 /// bufferization.materialize_in_destination %t in %subview
109 /// vector.mask {
110 /// vector.transfer_write %arg0, %alloc : vector<16xf32>, memref<?xf32>
111 /// } : vector<16xi1>
112 /// %0 = bufferization.to_tensor %alloc restrict writable
113 ///
114 /// In addition to rewriting the IR as shown above, this function returns the
115 /// newly allocated buffer. The `insertionPoint` parameter can be used to
116 /// specify a custom insertion point for the buffer allocation.
118  const BufferizeToAllocationOptions &options,
119  vector::MaskOp maskOp, Attribute memorySpace = {},
120  Operation *insertionPoint = nullptr);
121 
122 /// Materialize a buffer allocation for the given bufferization.alloc_tensor op
123 /// and lower the op to memref.alloc + memref.tensor_store.
124 ///
125 /// In addition to rewriting the IR, this function returns the newly allocated
126 /// buffer. The `insertionPoint` parameter can be used to specify a custom
127 /// insertion point for the buffer allocation.
128 Value bufferizeToAllocation(RewriterBase &rewriter,
129  const BufferizeToAllocationOptions &options,
130  bufferization::AllocTensorOp allocTensorOp,
131  Attribute memorySpace = {},
132  Operation *insertionPoint = nullptr);
133 
134 /// Bufferize the given op with tensor semantics and materialize the result in
135 /// a newly allocated buffer.
136 ///
137 /// Only bufferizable ops that bufferize to a memory write or have an
138 /// aliasing OpOperand (and do not themselves bufferize to an allocation) are
139 /// supported. They are bufferized using their BufferizableOpInterface
140 /// implementation.
141 ///
142 /// Selected ops that bufferize to an allocation (or need special handling) are
143 /// also supported:
144 /// - tensor.pad
145 /// - vector.mask
146 ///
147 /// This function returns the newly allocated buffer. The `insertionPoint`
148 /// parameter can be used to specify a custom insertion point for the buffer
149 /// allocation.
150 Value bufferizeToAllocation(RewriterBase &rewriter,
151  const BufferizeToAllocationOptions &options,
152  Operation *op, Attribute memorySpace = {},
153  Operation *insertionPoint = nullptr);
154 
155 /// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a
156 /// LinalgOp. This transforms looks for LinalgOps that have an unused output
157 /// operand and an input operand that is rooted in a tensor::EmptyOp. The
158 /// tensor::EmptyOp uses are replaced with the output operand and the two
159 /// operands of the LinalgOp are swapped.
160 ///
161 /// Example:
162 /// %0 = tensor.empty()
163 /// %1 = linalg.matmul ins(...) outs(%0)
164 /// %2 = linalg.generic ins(%1) outs(%dest) {
165 /// ^bb0(%in: f32, %out: f32):
166 /// // out not used
167 /// }
168 ///
169 /// The IR is transformed as follows:
170 /// %0 = tensor.empty()
171 /// %1 = linalg.matmul ins(...) outs(%dest)
172 /// %2 = linalg.generic ins(%0) outs(%1) {
173 /// ^bb0(%in: f32, %out: f32):
174 /// // Use %out instead of %in
175 /// }
176 ///
177 /// The "ins" operand has no uses inside the body of the LinalgOp and can be
178 /// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp
179 /// can also fold away.
181  RewriterBase &rewriter, Operation *op,
182  bufferization::OneShotAnalysisState &state);
183 
184 //===----------------------------------------------------------------------===//
185 // Structs that configure the behavior of various transformations.
186 //===----------------------------------------------------------------------===//
187 
189  std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>;
190 
192  /// Computation function that returns the tile sizes for each operation.
193  /// Delayed construction of constant tile sizes should occur to interoperate
194  /// with folding.
196 
199  tileSizeComputationFunction = std::move(fun);
200  return *this;
201  }
202  /// Set the `tileSizeComputationFunction` to return the values `ts`. The
203  /// values must not fold away when tiling. Otherwise, use a more robust
204  /// `tileSizeComputationFunction`.
206  tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; };
207  return *this;
208  }
209  /// Convenience function to set the `tileSizeComputationFunction` to a
210  /// function that computes tile sizes at the point they are needed. Allows
211  /// proper interaction with folding.
213 
214  /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions.
215  /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together.
217 
218  /// The interchange vector to reorder the tiled loops.
220 
222  interchangeVector.assign(interchange.begin(), interchange.end());
223  return *this;
224  }
225 
226  /// The type of tile loops to generate.
228 
230  loopType = lt;
231  return *this;
232  }
233 
234  /// When specified, specifies distribution of generated tile loops to
235  /// processors.
236  std::optional<LinalgLoopDistributionOptions> distribution;
237 
240  distribution = std::move(distributionOptions);
241  return *this;
242  }
243 
244  /// Specification markers of how to distribute the `linalg.tiled_loop`.
246 
248  distributionTypes.assign(types.begin(), types.end());
249  return *this;
250  }
251 
252  /// Peel the specified loops.
254 
256  peeledLoops.clear();
257  peeledLoops.append(loops.begin(), loops.end());
258  return *this;
259  }
260 };
261 
263  /// Tile sizes used to tile the root operation.
266  tileSizes.assign(ts.begin(), ts.end());
267  return *this;
268  }
269  /// Tile interchange used to permute the tile loops.
271  /// When specified, specifies distribution of generated tile loops to
272  /// processors.
273  std::optional<LinalgLoopDistributionOptions> tileDistribution;
276  tileDistribution = std::move(distributionOptions);
277  return *this;
278  }
279 };
280 
282  /// A padding value for every operand.
285  paddingValues.assign(pv.begin(), pv.end());
286  return *this;
287  }
288  /// A list of iterator dimensions to pad.
291  paddingDimensions.assign(pd.begin(), pd.end());
292  return *this;
293  }
294  /// A list of multiples to which each padding dimension should be padded to.
295  std::optional<SmallVector<int64_t>> padToMultipleOf;
297  padToMultipleOf.emplace(m.begin(), m.end());
298  return *this;
299  }
300  /// A mapping between an operand and shape dim, and a size for a padding
301  /// dimension. Each size is expected to be greater or equal than the
302  /// corresponding shape dim. If no value is provided then the constant upper
303  /// bound will be used.
305  LinalgPaddingOptions &setSizeToPadTo(unsigned operandIndex, unsigned dimIndex,
306  OpFoldResult size) {
307  assert(size && "expected non-null size");
308  sizeToPadTo[{operandIndex, dimIndex}] = size;
309  return *this;
310  }
311  /// Given the operand index and shape dim it returns the size to pad to.
312  OpFoldResult getSizeToPadTo(unsigned operandIndex, unsigned dimIndex) const {
313  return sizeToPadTo.lookup_or(
314  std::pair<unsigned, unsigned>(operandIndex, dimIndex), nullptr);
315  }
316 
317  /// A flag for every operand to mark the PadOp as nofold which enables
318  /// packing for statically shaped operands.
321  nofoldFlags.assign(pp.begin(), pp.end());
322  return *this;
323  }
324  /// A number of loops to hoist the PadOp out for every operand.
327  hoistPaddings.assign(hp.begin(), hp.end());
328  return *this;
329  }
330  /// A permutation vector for every operand used to transpose the packed
331  /// PadOp results.
335  transposePaddings.assign(tp.begin(), tp.end());
336  return *this;
337  }
338  enum class CopyBackOp : int8_t {
339  None = 0,
341  LinalgCopy = 2
342  };
343  /// The op to be used for copying the padded result to the original
344  /// destination tensor.
347  copyBackOp = op;
348  return *this;
349  }
350 };
351 
353  /// A padding value for every operand.
356  paddingValues.assign(pv.begin(), pv.end());
357  return *this;
358  }
359  /// A list of iterator dimensions sizes to pad to.
362  paddingSizes.assign(m.begin(), m.end());
363  return *this;
364  }
365  /// Pad iterator `paddingDimension[i]` to next multiple of `paddingSizes[i]`
366  /// if true. Otherwise pad to `paddingSizes[i]`.
369  padToMultipleOf = b;
370  return *this;
371  }
372 };
373 
374 /// Callback function type used to perform the allocation for the promoted
375 /// `subView`. In `boundingSubViewsize` a best attempt is made to find the
376 /// smallest constant value for the size of the buffer needed for each
377 /// dimension. If that is not possible, contains the dynamic size of the
378 /// subview. The call back should return the buffer to use.
379 using AllocBufferCallbackFn = std::function<std::optional<Value>(
380  OpBuilder &b, memref::SubViewOp subView,
381  ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>;
382 
383 /// Callback function type used to deallocate the buffers used to hold the
384 /// promoted subview.
386  std::function<LogicalResult(OpBuilder &b, Value buffer)>;
387 
388 /// Callback function type used to insert copy from original subview to
389 /// subview of the promoted region for the read operands/subview of promoted
390 /// region to original subview for the results. The copy has to happen from
391 /// `src` to `dst`.
393  std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
394 
396  /// Indices of subViews to promote. If `std::nullopt`, try to promote all
397  /// operands.
398  std::optional<DenseSet<unsigned>> operandsToPromote;
401  operandsToPromote->insert_range(operands);
402  return *this;
403  }
404  /// If ith element of `useFullTiles` is true the full view should be used
405  /// for the promoted buffer of the ith operand in `operandsToPromote`.
406  /// Otherwise the partial view will be used. The decision is defaulted to
407  /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and
408  /// for operands missing from `useFullTileBuffers`.
409  std::optional<llvm::SmallBitVector> useFullTileBuffers;
411  unsigned size = useFullTiles.size();
412  llvm::SmallBitVector tmp(size, false);
413  for (unsigned i = 0; i < size; ++i)
414  tmp[i] = useFullTiles[i];
415  useFullTileBuffers = tmp;
416  return *this;
417  }
418  /// If true all operands unspecified by `useFullTileBuffers` will use the
419  /// full view, otherwise the partial view.
423  return *this;
424  }
425  /// If true, buffers will be allocated with the original subview size. This
426  /// may result in more dynamic allocations, in case of dynamic sizes.
429  useOriginalSubviewSize = originalSize;
430  return *this;
431  }
432  /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment.
433  std::optional<unsigned> alignment;
435  alignment = align;
436  return *this;
437  }
438  /// Memory space of promoted buffer. If `std::nullopt` do not specify memory
439  /// space.
440  std::optional<Attribute> memorySpace;
442  memorySpace = memorySpc;
443  return *this;
444  }
445  /// Use alloca with the default allocation scheme.
446  bool useAlloca = false;
448  useAlloca = use;
449  return *this;
450  }
451  /// Callback function to do the allocation of the promoted buffer. If
452  /// std::nullopt, then the default allocation scheme of allocating a
453  /// memref<?xi8> buffer followed by a view operation is used.
454  std::optional<AllocBufferCallbackFn> allocationFn;
455  std::optional<DeallocBufferCallbackFn> deallocationFn;
458  DeallocBufferCallbackFn const &deallocFn) {
459  allocationFn = allocFn;
460  deallocationFn = deallocFn;
461  return *this;
462  }
463  /// Callback function to do the copy of data to and from the promoted
464  /// subview. If std::nullopt then a memref.copy is used.
465  std::optional<CopyCallbackFn> copyInFn;
466  std::optional<CopyCallbackFn> copyOutFn;
468  CopyCallbackFn const &copyOut) {
469  copyInFn = copyIn;
470  copyOutFn = copyOut;
471  return *this;
472  }
473 };
474 
475 /// Split Reduction options.
477  // Ratio used to split the reduction dimension. If the ratio is <= 1,
478  // nothing will be done.
479  int64_t ratio = 0;
480  // Index where the extra dimension is added to the intermediate tensor
481  // shape.
482  unsigned index = 0;
483  // If the inner dimension after splitting is parallel or reduction.
484  bool innerParallel = false;
485 };
486 
487 /// Function signature to control reduction splitting. This returns
488 /// `SplitReductionOptions`.
489 // TODO: don't use unsigned unless doing bit manipulation.
491  std::function<SplitReductionOptions(LinalgOp op)>;
492 
493 //===----------------------------------------------------------------------===//
494 // Preconditions that ensure the corresponding transformation succeeds and can
495 // be applied as a rewrite pattern.
496 //===----------------------------------------------------------------------===//
497 
498 /// Return true if two `linalg.generic` operations with producer/consumer
499 /// relationship through `fusedOperand` can be fused using elementwise op
500 /// fusion.
501 bool areElementwiseOpsFusable(OpOperand *fusedOperand);
502 
503 /// Promote memref.subviews feeding linalg-on-buffers operations.
504 LogicalResult promoteSubviewsPrecondition(Operation *op,
506 
507 /// Return success if the operation can be vectorized.
508 LogicalResult vectorizeOpPrecondition(Operation *op,
509  ArrayRef<int64_t> inputVectorSizes = {},
510  ArrayRef<bool> inputScalableVecDims = {},
511  bool vectorizeNDExtract = false,
512  bool flatten1DDepthwiseConv = false);
513 
514 //===----------------------------------------------------------------------===//
515 // Transformations exposed as functional-style API calls.
516 //===----------------------------------------------------------------------===//
517 
519 
520 /// Transformation to drop unit-extent dimensions from `linalg.generic`
521 /// operations.
524 
527 
528  using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>;
530  if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
531  return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
532  }
533  if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) {
534  return llvm::to_vector(
535  llvm::seq<unsigned>(0, padOp.getSourceType().getRank()));
536  }
537  return SmallVector<unsigned>{};
538  };
539 };
540 
542  IndexingMapOpInterface resultOp;
544 };
545 using DroppedUnitDimsBuilder = std::function<IndexingMapOpInterface(
546  Location loc, OpBuilder &, IndexingMapOpInterface,
547  ArrayRef<Value> newOperands, ArrayRef<AffineMap> newIndexingMaps,
548  const llvm::SmallDenseSet<unsigned> &droppedDims)>;
549 
550 FailureOr<DropUnitDimsResult>
551 dropUnitDims(RewriterBase &rewriter, IndexingMapOpInterface op,
552  const DroppedUnitDimsBuilder &droppedUnitDimsBuilder,
554 FailureOr<DropUnitDimsResult> dropUnitDims(RewriterBase &rewriter,
555  GenericOp genericOp,
557 
558 /// Fuse two `linalg.generic` operations that have a producer-consumer
559 /// relationship captured through `fusedOperand`. The method expects
560 /// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.
564 };
565 FailureOr<ElementwiseOpFusionResult>
566 fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand);
567 
568 /// Returns a set of indices of the producer's results which would
569 /// be preserved after the fusion.
570 /// * There is a chance that the implementation of the transformation does not
571 /// agree with the result of this method. This function gives a prediction based
572 /// on an optimized fusion.
573 llvm::SmallDenseSet<int> getPreservedProducerResults(GenericOp producer,
574  GenericOp consumer,
575  OpOperand *fusedOperand);
576 
577 /// Try to peel and canonicalize loop `op` and return the new result.
578 /// Also applies affine_min/max bounds simplification on the fly where relevant.
579 // TODO: Add support for scf.parallel and affine.for loops.
581 
582 /// Peel 'loops' and applies affine_min/max bounds simplification on the fly
583 /// where relevant.
584 void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
585 
586 /// Pad the iterator dimensions `options.paddingDimensions` of all `opToPad`
587 /// operands to a static bounding box. The original `opToPad` is cloned and
588 /// operates on the padded tensors.
589 ///
590 /// * "options.padToMultipleOf" indicates that each padding dimension should be
591 /// padded to the specified multiple.
592 /// * Use "options.paddingValues" and "options.nofoldFlags" to set padding
593 /// value and nofold attribute of the created tensor::PadOps, respectively.
594 /// * The unpadded results (extracted slice of the cloned operation) are
595 /// returned via `replacements`.
596 /// * The tensor::PadOps are returned via `padOps`.
597 /// * "options.copyBackOp" specifies the op type for copying back the unpadded
598 /// result to the original destination tensor.
599 LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
601  LinalgOp &paddedOp,
602  SmallVector<Value> &replacements,
604 
605 /// Helper function to compute the padded shape of the given value `v` of
606 /// `RankedTensorType` given:
607 /// - the `indexingSizes` as a list of OpFoldResult.
608 /// - an `indexingMap` that encodes how the padded shape varies with
609 /// increases in `indexingSizes`.
610 /// The implementation iteratively combines increases from contributing using
611 /// affine.apply operations.
612 /// The `indexingMap` + `indexingSizes` encoding suits StructuredOps and
613 /// provides a gentle portability path for Linalg-like ops with affine maps.
614 /// The padded shape is computed by evaluating the maximum accessed index per
615 /// dimension, which may involve multiplying by constant factors derived from
616 /// the affine indexing expressions. Currently, only a limited set of projected
617 /// permuation indexing maps are supported, such as
618 /// - affine_map<(d0, d1, d2) -> (d0, d1)>
619 /// - affine_map<(d0, d1, d2) -> (d0, d1 + d2)>
620 /// - affine_map<(d0, d1) -> (d0 * 3 + d1)>
621 /// In the future, more general interfaces can be devised to encode similar
622 /// shape evolutions and map between an op and its operands.
625  AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes,
627 
629  std::function<FailureOr<SmallVector<OpFoldResult>>(
631  const PadTilingInterfaceOptions &)>;
632 
633 /// Specific helper for Linalg ops.
634 FailureOr<SmallVector<OpFoldResult>> computeIndexingMapOpInterfacePaddedShape(
635  RewriterBase &rewriter, OpOperand &operandToPad,
636  ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options);
637 
638 /// Pad the iterator dimensions `options.paddingDimensions` of `opToPad`.
639 ///
640 /// * "options.paddingSizes" indicates that each padding dimension should be
641 /// padded to the specified padding size.
642 /// * "options.padToMultipleOf" indicates that the paddingSizes should be
643 // interpreted as the bounding box (dynamic) value to pad to.
644 /// * Use "options.paddingValues" to set the padding value of the created
645 // tensor::PadOp.
646 /// * The tensor::PadOp is returned on success.
647 
648 FailureOr<TilingInterface>
649 rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
650  const PadTilingInterfaceOptions &constOptions,
652  const PadSizeComputationFunction &computePaddingSizeFun =
654 
655 namespace detail {
656 
657 /// Helper struct to hold the results of building a packing loop nest.
661  TransposeOp maybeTransposeOp;
662  tensor::PadOp hoistedPadOp;
663 };
664 
665 /// Build the packing loop nest required to hoist `opToHoist` above
666 /// `outermostEnclosingForOp`.
667 /// The loop nest is built just before `outermostEnclosingForOp`.
668 FailureOr<PackingResult>
669 buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
670  scf::ForOp outermostEnclosingForOp,
671  ArrayRef<int64_t> transposeVector);
672 
673 } // namespace detail
674 
675 /// Mechanically hoist padding operations on tensors by `numLoops` into a new,
676 /// generally larger tensor. This achieves packing of multiple padding ops into
677 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
678 /// in the packing loop so the caller can continue reasoning about the padding
679 /// operation. If `transposeVector` is non-empty, hoist padding introduces a
680 /// TransposeOp to transpose the padded tensor before inserting it into the
681 /// packed tensor. A `transposeVector` can change the storage order of the
682 /// padded tensor but does not change the order of the pack or compute loops.
683 ///
684 /// TODO: In the future, we should consider rewriting as a linalg.pack after
685 /// hoisting since this abstraction is now available.
686 ///
687 /// Example in pseudo-mlir:
688 /// =======================
689 ///
690 /// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
691 /// ```
692 /// scf.for (%i, %j, %k)
693 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
694 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
695 /// ^bb0( ... ):
696 /// linalg.yield %pad
697 /// } : tensor<?x?xf32> to tensor<4x8xf32>
698 /// compute(%0)
699 /// ```
700 ///
701 /// IR resembling the following is produced:
702 ///
703 /// ```
704 /// scf.for (%i) {
705 /// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32>
706 /// %packed = scf.for (%k) iter_args(%p : %packed_init) {
707 /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
708 /// %0 = tensor.pad %st0 low[0, 0] high[...] {
709 /// ^bb0( ... ):
710 /// linalg.yield %pad
711 /// } : tensor<?x?xf32> to tensor<4x8xf32>
712 /// %1 = tensor.insert_slice %0 ...
713 /// : tensor<4x8xf32> to tensor<?x4x8xf32>
714 /// scf.yield %1: tensor<?x4x8xf32>
715 /// } -> tensor<?x4x8xf32>
716 /// scf.for (%j, %k) {
717 /// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
718 /// tensor<?x4x8xf32> to tensor<4x8xf32>
719 /// compute(%st0)
720 /// }
721 /// }
722 /// ```
723 FailureOr<Value>
724 hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
725  int64_t numLoops, ArrayRef<int64_t> transposeVector,
726  tensor::PadOp &hoistedOp,
727  SmallVectorImpl<TransposeOp> &transposeOps);
728 /// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
729 FailureOr<Value>
730 hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
731  ArrayRef<int64_t> transposeVector,
732  tensor::PadOp &hoistedOp,
733  SmallVectorImpl<TransposeOp> &transposeOps);
734 
735 /// Apply padding and hoisting to `linalgOp` according to the configuration
736 /// specified in `options`.
737 FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter,
738  LinalgOp linalgOp,
740 
741 /// Split the given `op` into two parts along the given iteration space
742 /// `dimension` at the specified `splitPoint`, and return the two parts.
743 /// If the second part is statically known to be empty, do not create it
744 /// and return nullptr instead. Error state is signalled by returning
745 /// a pair of nullptrs.
746 ///
747 /// For example, the following op:
748 ///
749 /// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>)
750 /// outs(%2 : tensor<128x64xf32>)
751 ///
752 /// split along the first dimension at position 42 will result in:
753 ///
754 /// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1]
755 /// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1]
756 /// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>)
757 /// outs(%5 : tensor<42x64xf32>)
758 /// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1]
759 ///
760 /// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1]
761 /// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1]
762 /// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>)
763 /// outs(%8 : tensor<86x64xf32>)
764 /// tensor.insert_slice %5 into %6[42, 0][86, 64][1, 1]
765 ///
766 /// Note that there is no simplification other than constant propagation applied
767 /// to slice extraction and insertion.
768 std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter,
769  TilingInterface op,
770  unsigned dimension,
771  OpFoldResult splitPoint);
772 
773 /// Perform standalone tiling of a single LinalgOp by `tileSizes`.
774 /// and permute the loop nest according to `interchangeVector`
775 /// The permutation is expressed as a list of integers that specify
776 /// the new ordering of the loop nest. The length of `interchangeVector`
777 /// must be equal to the length of `tileSizes`.
778 /// An empty vector is interpreted as the identity permutation and the
779 /// transformation returns early.
780 ///
781 /// Return a struct containing the tiled loops in the specified order
782 /// and the cloned op if successful, std::nullopt otherwise.
783 ///
784 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by
785 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
786 /// integers, in the range 0..`tileSizes.size()` without duplications
787 /// (i.e. `[1,1,2]` is an invalid permutation).
789  LinalgOp op;
792 };
793 FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op,
795 
796 /// Interchange the `iterator_types` and `iterator_maps` dimensions and adapts
797 /// the index accesses of `op`. This is an in-place transformation controlled
798 /// by `interchangeVector`. An empty vector is interpreted as the identity
799 /// permutation and the transformation returns early.
800 ///
801 /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with
802 /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
803 /// integers, in the range 0..`op.rank` without duplications
804 /// (i.e. `[1,1,2]` is an invalid permutation).
805 ///
806 /// Return failure if the permutation is not valid.
807 FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
808  GenericOp genericOp,
809  ArrayRef<unsigned> interchangeVector);
810 
811 /// Create a GenericOp from the given named operation `linalgOp` and replace
812 /// the given `linalgOp`.
813 /// Return failure if `linalgOp` is a GenericOp or misses a region builder.
814 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
815  LinalgOp linalgOp);
816 
817 /// Create a namedOp from the given GenericOp and replace the GenericOp.
818 /// Currently we can specialize only trivial linalg copy operations.
819 FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter,
820  GenericOp genericOp);
821 
822 /// Create a new buffer using the `allocationFn` provided. The size of this
823 /// buffer is either the original subview size when 'useOriginalSubviewSize' is
824 /// set to true or the smallest constant bounding size along each dimension that
825 /// can be computed for the size of the result of `subView`. Returns the
826 /// allocated buffer as `fullLocalView` and the view that matches the size of
827 /// the result of subview operation as `partialLocalView`.
831 };
832 FailureOr<PromotionInfo>
833 promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
834  bool useOriginalSubviewSize,
835  const AllocBufferCallbackFn &allocationFn,
836  DataLayout &layout);
837 
838 /// Promote the `subViews` into a new buffer allocated at the insertion point
839 /// `b`. Promotion occurs in 3 steps:
840 /// 1. Create a new buffer for a full tile (i.e. not clipped at the
841 /// boundary).
842 /// 2. Take a full view on the buffer.
843 /// 3. Take a partial slice of the full view in step 2. and copy into it.
844 ///
845 /// Return the modified linalg op (the modification happens in place) as well
846 /// as all the copy ops created.
847 FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
849 
850 /// Allocate the subview in the GPU workgroup memory.
851 std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
852  memref::SubViewOp subview,
853  ArrayRef<Value> sizeBounds,
854  DataLayout &);
855 
856 /// In case of GPU group memory there is no need to deallocate.
857 LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
858 
859 /// Create Memref copy operations and add gpu barrier guards before and after
860 /// the copy operation to ensure data integrity.
861 LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
862 
863 /// Allocate the subview in the GPU private memory.
864 std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
865  memref::SubViewOp subview,
866  ArrayRef<Value> sizeBounds,
867  DataLayout &);
868 
869 /// Normal copy to between src and dst.
870 LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
871 
872 /// In case of GPU private memory there is no need to deallocate since the
873 /// memory is freed when going outside of the scope.
874 LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
875 
876 /// Return true if there's dedicated logic in the Linalg Vectorizer to
877 /// vectorize this Op, false otherwise.
878 ///
879 /// Note that this helper merely implements a very high level check and that the
880 /// vectorizer also requires various additional pre-conditions to be met for it
881 /// to work (these are checked by the vectorizer itself).
883 
884 /// Transformation information returned after vectorizing.
886  /// Results of the vectorization transform to replace the original operation.
888 };
889 /// Returns a `VectorizationResult` containing the results of the vectorized op,
890 /// or failure if the transformation fails. If provided, `inputVectorSizes` are
891 /// used to vectorize this operation. `inputVectorSizes` must match the rank of
892 /// the iteration space of the operation and the input vector sizes must be
893 /// greater than or equal to their counterpart iteration space sizes, if static.
894 /// `inputVectorShapes` also allows the vectorization of operations with dynamic
895 /// shapes.
896 /// Optionally, `createNamedContraction` can force compatible contractions to be
897 /// vectorized directly to vector.contract operation.
898 FailureOr<VectorizationResult>
899 vectorize(RewriterBase &rewriter, Operation *op,
900  ArrayRef<int64_t> inputVectorSizes = {},
901  ArrayRef<bool> inputScalableVecDims = {},
902  bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false,
903  bool assumeDynamicDimsMatchVecSizes = false,
904  bool createNamedContraction = false);
905 
906 /// Emit a suitable vector form for a Copy op with fully static shape.
907 LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
908 
909 /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`.
910 FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter,
911  LinalgOp linalgOp);
912 
913 /// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`.
914 FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter,
915  LinalgOp linalgOp);
916 
917 /// Emit a loop nest of `affine.for` with the proper body for `linalgOp`.
918 FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter,
919  LinalgOp linalgOp);
920 
921 /// Creates a number of ranges equal to the number of non-zero in `tileSizes`.
922 /// One for each loop of the LinalgOp that is tiled. The `tileSizes` argument
923 /// has one entry per surrounding loop. It uses zero as the convention that a
924 /// particular loop is not tiled. This convention simplifies implementations
925 /// by avoiding affine map manipulations. The returned ranges correspond to
926 /// the loop ranges, in the proper order, that are tiled and for which new
927 /// loops will be created. Also the function returns a map from loop indices
928 /// of the LinalgOp to the corresponding non-empty range indices of newly
929 /// created loops.
931 std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
933  ArrayRef<OpFoldResult> allShapeSizes,
934  ArrayRef<OpFoldResult> allTileSizes);
935 
936 namespace detail {
937 template <typename T>
939  /// Tile sizes.
941  /// Number of tiles associated with each size.
943 };
944 
945 template <typename T>
947  /// Tile sizes.
949  /// Number of tiles associated with each size.
951 };
952 
953 } // namespace detail
954 
955 /// A description of a multi-size tiling comprising tile sizes and numbers of
956 /// tiles, expressed as Values which may or may not be constant. Multi-size
957 /// currently means two-size.
959  : public detail::MultiSizeSpecificationBase<Value> {};
961  : public detail::MultiSizeSpecificationBase<int64_t> {};
962 
966  : public detail::ContinuousTileSizeSpecificationBase<int64_t> {};
967 
968 /// Emits the IR computing the multi-sized tiling specification with two tile
969 /// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such
970 /// that there exist numbers of tiles with these sizes that fully cover the
971 /// given iteration space `dimension` of the structured `op`.
972 ///
973 /// The computation is as follows:
974 ///
975 /// b = originalTripCount floordiv sizeDivisor
976 /// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor
977 /// d = (b + t - 1) floordiv t
978 /// s = (b floordiv d) * sizeDivisor
979 /// v = b % d
980 /// u = d - v
981 ///
982 /// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of
983 /// the corresponding tiles are `u` and `v`, respectively. Alternatively,
984 ///
985 /// s * u + (s + sizeDivisor) * v == original size,
986 /// where s mod sizeDivisor = 0.
987 ///
988 /// Expects all values to be positive. In some cases with the target tile size
989 /// sufficiently close to the dimension shape and non-unit divisor, it is
990 /// impossible to compute such sizes. If `emitAssertion` is set, also emit the
991 /// assertion that size computation succeeded.
992 ///
993 /// Returns the specification consisting of both tile values and the number of
994 /// tiles of each size.
995 FailureOr<MultiSizeSpecification>
996 computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension,
997  OpFoldResult targetSize, OpFoldResult divisor,
998  bool emitAssertions = true);
999 FailureOr<StaticMultiSizeSpecification>
1000 computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize,
1001  int64_t divisor);
1002 
1003 FailureOr<StaticContinuousTileSizeSpecification>
1004 computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension,
1005  unsigned targetSize);
1006 FailureOr<ContinuousTileSizeSpecification>
1007 computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
1008  unsigned dimension, OpFoldResult targetSize,
1009  bool emitAssertions);
1010 
1011 /// Transformation information returned after reduction tiling.
1013  /// The partial reduction tiled op generated.
1015  /// The final reduction operation merging all the partial reductions.
1017  /// Initial values used for partial reductions.
1019  /// The `scf.forall` operation that iterate over the tiles.
1020  scf::ForallOp loops;
1021 };
1022 
1023 /// Method to tile a reduction to parallel iterations computing partial
1024 /// reductions. After the loop all the partial reduction are merged into a final
1025 /// reduction. For example for the following sequence
1026 ///
1027 /// ```mlir
1028 /// %0 = linalg.generic %in ["parallel", "reduction"]
1029 /// : tensor<7x9xf32> -> tensor<7xf32>
1030 /// ```
1031 ///
1032 /// into:
1033 ///
1034 /// ```mlir
1035 /// %0 = linalg.fill ... : tensor<7x4xf32>
1036 /// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0)
1037 /// -> (tensor<7x4xf32>) {
1038 /// %2 = tensor.extract_slice %arg3 : tensor<7x4xf32> to tensor<7xf32>
1039 /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
1040 /// %4 = linalg.generic %2, %3 ["parallel", "reduction"]
1041 /// : tensor<7x?xf32> -> tensor<7xf32>
1042 /// %5 = tensor.insert_slice %3, %arg0[0, %iv] : tensor<7x4xf32>
1043 /// }
1044 /// %6 = linalg.generic %1 ["parallel", "reduction"]
1045 /// : tensor<7x4xf32> -> tensor<7xf32>
1046 /// ```
1047 FailureOr<ForallReductionTilingResult>
1048 tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op,
1049  ArrayRef<OpFoldResult> numThreads,
1050  ArrayRef<OpFoldResult> tileSizes = {},
1051  std::optional<ArrayAttr> mapping = std::nullopt);
1052 
1053 /// All indices returned by IndexOp should be invariant with respect to
1054 /// tiling. Therefore, if an operation is tiled, we have to transform the
1055 /// indices accordingly, i.e. offset them by the values of the corresponding
1056 /// induction variables that are captured implicitly in the body of the op.
1057 ///
1058 /// Example. `linalg.generic` before tiling:
1059 ///
1060 /// #id_2d = (i, j) -> (i, j)
1061 /// #pointwise_2d_trait = {
1062 /// indexing_maps = [#id_2d, #id_2d],
1063 /// iterator_types = ["parallel", "parallel"]
1064 /// }
1065 /// linalg.generic #pointwise_2d_trait %operand, %result {
1066 /// ^bb0(%operand_in: f32, %result_in: f32):
1067 /// %i = linalg.index 0 : index
1068 /// %j = linalg.index 1 : index
1069 /// <some operations that use %i, %j>
1070 /// }: memref<50x100xf32>, memref<50x100xf32>
1071 ///
1072 /// After tiling pass with tiles sizes 10 and 25:
1073 ///
1074 /// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2)
1075 ///
1076 /// %c1 = arith.constant 1 : index
1077 /// %c0 = arith.constant 0 : index
1078 /// %c25 = arith.constant 25 : index
1079 /// %c10 = arith.constant 10 : index
1080 /// operand_dim_0 = dim %operand, 0 : memref<50x100xf32>
1081 /// operand_dim_1 = dim %operand, 1 : memref<50x100xf32>
1082 /// scf.for %k = %c0 to operand_dim_0 step %c10 {
1083 /// scf.for %l = %c0 to operand_dim_1 step %c25 {
1084 /// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1]
1085 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
1086 /// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1]
1087 /// : memref<50x100xf32> to memref<?x?xf32, #strided>
1088 /// linalg.generic pointwise_2d_trait %4, %5 {
1089 /// ^bb0(%operand_in: f32, %result_in: f32):
1090 /// %i = linalg.index 0 : index
1091 /// %j = linalg.index 1 : index
1092 /// // Indices `k` and `l` are implicitly captured in the body.
1093 /// %transformed_i = arith.addi %i, %k : index // index `i` is offset by
1094 /// %k %transformed_j = arith.addi %j, %l : index // index `j` is offset
1095 /// by %l
1096 /// // Every use of %i, %j is replaced with %transformed_i,
1097 /// %transformed_j <some operations that use %transformed_i,
1098 /// %transformed_j>
1099 /// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided>
1100 /// }
1101 /// }
1102 ///
1103 /// TODO: Investigate whether mixing implicit and explicit indices
1104 /// does not lead to losing information.
1105 void transformIndexOps(RewriterBase &b, LinalgOp op,
1107  const LoopIndexToRangeIndexMap &loopIndexToRangeIndex);
1108 
1109 /// Apply transformation to split the single linalg op reduction into a
1110 /// parallel and reduction dimension. Then create a new linalg.generic op
1111 /// doing the rest of the reduction. Return the new linalg op with an extra
1112 /// parallel dimension or failure if the transformation didn't happen.
1113 ///
1114 /// Example:
1115 /// ```
1116 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
1117 /// affine_map<(d0) -> ()>],
1118 /// iterator_types = ["reduction"]}
1119 /// ins(%in : tensor<32xf32>)
1120 /// outs(%out : tensor<f32>) {
1121 /// ^bb0(%arg1: f32, %arg2: f32):
1122 /// %y = arith.addf %arg1, %arg2 : f32
1123 /// linalg.yield %y : f32
1124 /// } -> tensor<f32>
1125 /// ```
1126 /// To:
1127 /// ```
1128 /// %cst = arith.constant 0.000000e+00 : f32
1129 /// %0 = tensor.expand_shape %in [[0, 1]]: tensor<32xf32> into tensor<4x8xf32>
1130 /// %1 = tensor.empty [4] : tensor<4xf32>
1131 /// %2 = linalg.fill ins(%cst : f32)
1132 /// outs(%1 : tensor<4xf32>) -> tensor<4xf32>
1133 /// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
1134 /// affine_map<(d0, d1) -> (d0)>],
1135 /// iterator_types = ["parallel", "reduction"]}
1136 /// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
1137 /// ^bb0(%arg3: f32, %arg5: f32):
1138 /// %5 = arith.addf %arg3, %arg4 : f32
1139 /// linalg.yield %5 : f32
1140 /// } -> tensor<4xf32>
1141 /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
1142 /// affine_map<(d0) -> ()>],
1143 /// iterator_types = ["reduction"]}
1144 /// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) {
1145 /// ^bb0(%arg3: f32, %arg4: f32):
1146 /// %5 = arith.addf %arg3, %arg4 : f32
1147 /// linalg.yield %5 : f32
1148 /// } -> tensor<f32>
1149 /// ```
1152  FillOp fillOp;
1153  LinalgOp splitLinalgOp;
1155 };
1156 FailureOr<SplitReductionResult>
1157 splitReduction(RewriterBase &b, LinalgOp op,
1158  const ControlSplitReductionFn &controlSplitReductionFn,
1159  bool useAlloc = false);
1160 
1161 /// Scaling-based implementation of the split reduction transformation.
1162 /// Instead of introducing an ExpandShapeOp, this rewrites a reduction
1163 /// dimension `k` into `k * scale + kk`.
1164 ///
1165 /// Example:
1166 /// ```
1167 /// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
1168 /// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
1169 /// ```
1170 ///
1171 /// Is transformed to:
1172 ///
1173 /// ```
1174 /// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
1175 /// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
1176 /// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
1177 /// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
1178 /// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
1179 /// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
1180 /// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32>
1181 /// %cst = arith.constant 0.000000e+00 : f32
1182 /// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
1183 /// tensor<16x32x64xf32>
1184 /// %2 = tensor.empty [64, 4] : tensor<64x4xi1>
1185 ///
1186 /// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
1187 /// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
1188 /// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>,
1189 /// tensor<64x4xi1>)
1190 /// outs(%1 : tensor<16x32x64xf32>) {
1191 /// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
1192 /// %5 = arith.mulf %arg3, %arg4 : f32
1193 /// %6 = arith.addf %arg6, %5 : f32
1194 /// linalg.yield %6 : f32
1195 /// } -> tensor<16x32x64xf32>
1196 ///
1197 /// %4 = linalg.generic {indexing_maps = [#map4, #map5],
1198 /// iterator_types = ["parallel", "parallel", "reduction"]}
1199 // ins(%3 : tensor<16x32x64xf32>)
1200 /// outs(%C : tensor<16x32xf32>) {
1201 /// ^bb0(%arg3: f32, %arg4: f32):
1202 /// %5 = arith.addf %arg3, %arg4 : f32
1203 /// linalg.yield %5 : f32
1204 /// } -> tensor<16x32xf32>
1205 ///
1206 /// return %4 : tensor<16x32xf32>
1207 /// ```
1208 FailureOr<SplitReductionResult>
1209 splitReductionByScaling(RewriterBase &b, LinalgOp op,
1210  const ControlSplitReductionFn &controlSplitReductionFn,
1211  bool useAlloc = false);
1212 
1213 /// Return `true` if a given sequence of dimensions are contiguous in the
1214 /// range of the specified indexing map.
1216 /// Return `true` if all sequences of dimensions specified in `dimSequences` are
1217 /// contiguous in all the ranges of the `maps`.
1219  ArrayRef<ReassociationIndices> dimSequences);
1220 
1223  LinalgOp collapsedOp;
1224 };
1225 
1226 /// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition
1227 /// to calling this method is that for each list in `foldedIterationDim`, the
1228 /// sequence of dimensions is contiguous in domains of all `indexing_maps` of
1229 /// the `linalgOp`. This can be checked using `areDimSequencePreserved` method.
1230 /// When valid, the method also collapses the operands of the op. Returns
1231 /// replacement values of the results of the original `linalgOp` by inserting
1232 /// reshapes to get back values of compatible types.
1233 FailureOr<CollapseResult>
1234 collapseOpIterationDims(LinalgOp op,
1235  ArrayRef<ReassociationIndices> foldedIterationDims,
1236  RewriterBase &rewriter);
1237 
1239  tensor::PadOp padOp;
1240  tensor::ExpandShapeOp expandShapeOp;
1241  linalg::TransposeOp transposeOp;
1242 };
1243 
1244 /// Rewrite pack as pad + reshape + transpose.
1245 FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter,
1246  linalg::PackOp packOp,
1247  bool lowerPadLikeWithInsertSlice = true);
1248 
1250  tensor::EmptyOp emptyOp;
1251  linalg::TransposeOp transposeOp;
1252  tensor::CollapseShapeOp collapseShapeOp;
1253  tensor::ExtractSliceOp extractSliceOp;
1254 };
1255 
1256 /// Rewrite pack as empty + transpose + reshape + extract_slice.
1257 FailureOr<LowerUnPackOpResult>
1258 lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp,
1259  bool lowerUnpadLikeWithExtractSlice = true);
1260 
1261 /// Struct to hold the result of a `pack` call.
1262 struct PackResult {
1264  linalg::LinalgOp packedLinalgOp;
1266 };
1267 /// Implement packing of a single LinalgOp by `packedSizes`.
1268 /// There must be one packedSizes entry per `linalgOp` iterator.
1269 /// Return the packed Linalg op on success, failure otherwise.
1270 FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1271  ArrayRef<OpFoldResult> packedSizes);
1272 
1273 /// Struct to hold the result of a `packTranspose` call.
1275  linalg::PackOp transposedPackOp;
1276  linalg::LinalgOp transposedLinalgOp;
1277  linalg::UnPackOp transposedUnPackOp;
1278 };
1279 /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the
1280 /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements.
1281 /// Return failure if either:
1282 /// 1. the `packOp` does not have the `linalgOp` as its unique use.
1283 /// 2. the `maybeUnPackOp`, if specified must be a consumer of the result tied
1284 /// to the unique `packOp` use.
1285 /// 3. `outerPerm` (resp. `innerPerm`) must be valid permutations of
1286 /// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty.
1287 FailureOr<PackTransposeResult>
1288 packTranspose(RewriterBase &rewriter, linalg::PackOp packOp,
1289  linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp,
1290  ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm);
1291 
1292 /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m
1293 /// and n are proper parallel dimensions and k is a proper reduction
1294 /// dimension. Packing occurs by rewriting the op as a linalg.generic and
1295 /// calling linalg::pack by `mnkPackedSizes`. The order of the packed
1296 /// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2}
1297 /// to reorder {m, n, k} into one of the 8 possible forms. The outer
1298 /// dimensions of the operands are not permuted at this time, this is left for
1299 /// future work.
1300 FailureOr<PackResult>
1301 packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp,
1302  ArrayRef<OpFoldResult> mnkPackedSizes,
1303  ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf,
1304  ArrayRef<int64_t> mnkOrder);
1305 
1307  /// Minor block factors (mb, nb, kb) for packing relayout where mb, mn are
1308  /// the parallel dimensions and kb is the reduction dimension.
1310 
1311  /// If true, allows packing of dimensions that only partially fit into the
1312  /// block factors.
1313  bool allowPadding = true;
1314 
1315  /// Next multiples of the packing sizes.
1317 
1318  /// Permutation of matmul (M, N, K) dimensions order.
1320 
1321  /// Transpose LHS outer block layout [MB][KB] -> [KB][MB].
1323 
1324  /// Transpose LHS inner block layout [mb][kb] -> [kb][mb].
1326 
1327  /// Transpose RHS outer block layout [KB][NB] -> [NB][KB].
1329 
1330  /// Transpose RHS inner block layout [kb][nb] -> [nb][kb].
1332 };
1333 
1334 /// Function type which is used to control matmul packing.
1335 /// It is expected to return valid packing configuration for each operation.
1336 /// Lack of packing options indicates that no valid configuration could be
1337 /// assigned and the operation will not be packed.
1339  std::function<std::optional<BlockPackMatmulOptions>(linalg::LinalgOp)>;
1340 
1341 /// Pack a matmul operation into blocked 4D layout.
1342 ///
1343 /// Relayout a matmul operation into blocked layout with two levels of
1344 /// subdivision:
1345 /// - major 2D blocks - outer dimensions, consist of minor blocks
1346 /// - minor 2D blocks - inner dimensions, consist of scalar elements
1347 ///
1348 /// A 2D matmul MxNxK gets reshaped into blocked 4D representation
1349 /// as: [MB][NB][mb][nb] += [MB][KB][mb][kb] * [NB][KB][nb][kb]
1350 /// where the (MB, NB, KB) dimensions represent the major blocks,
1351 /// and the (mb, nb, kb) are the minor blocks of their respective
1352 /// original 2D dimensions (M, N, K).
1353 ///
1354 /// Depending on the initial operands' data layout and the specified
1355 /// packing options, the major blocks dimensions might get transposed
1356 /// e.g., [MB][KB] -> [KB][MB]. The minor blocks can also be transposed
1357 /// e.g., [mb][kb] -> [kb][mb].
1358 /// Any present batch dimensions remain unchanged.
1359 /// The final result is unpacked back to the original shape.
1360 ///
1361 /// Return failure if no valid packing options are provided.
1362 FailureOr<PackResult>
1363 blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1364  const ControlBlockPackMatmulFn &controlPackMatmul);
1365 
1366 /// Rewrite tensor.from_elements to linalg.generic.
1367 FailureOr<Operation *>
1369  tensor::FromElementsOp fromElementsOp);
1370 
1371 /// Rewrite tensor.generate to linalg.generic.
1372 FailureOr<Operation *>
1374  tensor::GenerateOp generateOp);
1375 
1376 /// Rewrite tensor.pad to linalg.generic + tensor.insert_slice.
1377 FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1378  tensor::PadOp padOp);
1379 
1380 /// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing)
1381 /// and linalg.matmul.
1382 ///
1383 /// A convolution operation can be written as a matrix-matrix multiplication by
1384 /// unfolding the cross-correlation between input and filter and explicitly copy
1385 /// overlapped sliding window inputs.
1386 ///
1387 /// Consider 2D input X with single channel input and output and 2x2 filter W:
1388 /// [x(0, 0) , x(0, 1) , ..., x(0, n) ]
1389 /// [x(1, 0) , x(1, 1) , ..., x(1, n) ]
1390 /// [. , . ,. , . ] [w(0, 0), w(0, 1)]
1391 /// [. , . , . , . ] (conv) [w(1, 0), w(1, 1)]
1392 /// [. , . , ., . ]
1393 /// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)]
1394 ///
1395 /// The packed input data (img2col) is a matrix with |rows| = output spatial
1396 /// size, |columns| = filter spatial size. To compute the output Y(i, j) we need
1397 /// to calculate the dot product between filter window at input X(x, y)) and the
1398 /// filter which will look like the following where r.h.s is the img2col matrix
1399 /// and l.h.s is the flattened filter:
1400 ///
1401 /// [x(0,0), x(0,1), x(1,0), x(1,1)]
1402 /// [x(0,1), x(1,1), x(0,2), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)]
1403 /// [x(0,1), x(1,1), x(0,2), x(1,2)]
1404 /// [ . , . , . , . ]
1405 ///
1406 /// In general for 2D case with (N, H, W, C) input and (Kh, Kw, C, D) filter
1407 /// and output (N, Ho, Wo, D) the convolution is the following matrix-matrix
1408 /// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each input in
1409 /// the N input. For the case where N > 1 its a batched matrix-matrix
1410 /// multiplication.
1411 ///
1412 /// On success, return both the operation that produces the img2col tensor and
1413 /// the final operation of the sequence that replaces the original convolution.
1414 FailureOr<std::pair<Operation *, Operation *>>
1415 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp);
1416 
1417 /// Same as the above but for Fhwc channel orderings in the filter. In this case
1418 /// the matrix multiplication is actually a row-wise dot-product rather than a
1419 /// row-column dot-product. This is to avoid transposing the filter matrix which
1420 /// would be required for a regular matrix multiplication to produce the correct
1421 /// output dimensions.
1422 FailureOr<std::pair<Operation *, Operation *>>
1423 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp);
1424 
1425 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except there is no
1426 /// reduction among the input channels so each convolution can be a
1427 /// matrix-vector product and by transposing both input filter so channels are
1428 /// outer most the computation is a batched matrix-vector product.
1429 FailureOr<std::pair<Operation *, Operation *>>
1430 rewriteInIm2Col(RewriterBase &rewriter,
1431  linalg::DepthwiseConv2DNhwcHwcOp convOp);
1432 
1433 /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except because the
1434 /// channels are to the left of the image shape dimensions, the position of the
1435 /// contraction dimension in the resulting matmul is reversed. This swaps the
1436 /// LHS and RHS of the matmul when compared with nhwc (i.e. (D, C x Kh x Kw) *
1437 /// (C x Kh x Kw, Ho x Wo))
1438 FailureOr<std::pair<Operation *, Operation *>>
1439 rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp);
1440 
1441 /// Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by
1442 /// materializing transpose.
1443 FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1444  linalg::Conv2DNhwcFhwcOp op);
1445 FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1446  linalg::Conv2DNhwcFhwcQOp op);
1447 
1448 /// Convert Linalg matmul ops to transposed variants.
1449 FailureOr<Operation *> transposeMatmul(RewriterBase &rewriter,
1450  linalg::MatmulOp op,
1451  bool transposeLHS = true);
1452 FailureOr<Operation *> transposeBatchMatmul(RewriterBase &rewriter,
1453  linalg::BatchMatmulOp op,
1454  bool transposeLHS = true);
1455 
1456 /// Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm
1457 /// F(m x m, r x r). m is the dimension size of output and r is the dimension
1458 /// size of filter.
1459 FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
1460  linalg::Conv2DNhwcFhwcOp op,
1461  WinogradConv2DFmr fmr);
1462 
1463 /// Rewrite linalg.winograd_filter_transform. The data layout of the filter is
1464 /// FHWC. The transformation matrix is 2-dimension. We need to extract H x W
1465 /// from FHWC first. We generate 2 levels of loops to iterate on F and C. After
1466 /// the rewriting, we get
1467 ///
1468 /// scf.for %f = lo_f to hi_f step 1
1469 /// scf.for %c = lo_c to hi_c step 1
1470 /// %extracted = extract filter<h x w> from filter<f x h x w x c>
1471 /// %ret = linalg.matmul G, %extracted
1472 /// %ret = linalg.matmul %ret, GT
1473 /// %inserted = insert %ret into filter<h x w x c x f>
1474 FailureOr<Operation *>
1476  linalg::WinogradFilterTransformOp op);
1477 
1478 /// Rewrite linalg.winograd_input_transform. The data layout of the input is
1479 /// NHWC. The transformation matrix is 2-dimension. We need to extract H x W
1480 /// from NHWC first. We generate 4 levels of loops to iterate on N, C, tileH,
1481 /// and tileW. After the rewriting, we get
1482 ///
1483 /// scf.for %h = 0 to tileH step 1
1484 /// scf.for %w = 0 to tileW step 1
1485 /// scf.for %n = 0 to N step 1
1486 /// scf.for %c = 0 to C step 1
1487 /// %extracted = extract %extracted<alphaH x alphaW> from
1488 /// %input<N x H x W x C>
1489 /// at [%n, (%h x m), (%w x m), %c]
1490 /// %ret = linalg.matmul BT, %extracted
1491 /// %ret = linalg.matmul %ret, B
1492 /// %inserted = insert %ret<alphaH x alphaW> into
1493 /// %output<alphaH x alphaW x tileH x tileW x N x C>
1494 /// at [0, 0, %h, %w, %n, %c]
1495 FailureOr<Operation *>
1497  linalg::WinogradInputTransformOp op);
1498 
1499 /// Rewrite linalg.winograd_output_transform. The data layout of the output is
1500 /// HWNF. The transformation matrix is 2-dimension. We need to extract H x W
1501 /// from HWNF first. We generate 4 levels of loops to iterate on N, F, tileH,
1502 /// and tileW. After the transformation, we get
1503 ///
1504 /// scf.for %h = 0 to tileH step 1
1505 /// scf.for %w = 0 to tileW step 1
1506 /// scf.for %n = 0 to N step 1
1507 /// scf.for %f = 0 to F step 1
1508 /// %extracted = extract %extracted<alphaH x alphaW> from
1509 /// %input<alphaH x alphaW x tileH x tileW x N x F>
1510 /// at [0, 0, %h, %w, %n, %f]
1511 /// %ret = linalg.matmul AT, %extracted
1512 /// %ret = linalg.matmul %ret, A
1513 /// %inserted = insert %ret<alphaH x alphaW> into
1514 /// output<N x H x W x F>
1515 /// at [%n, (%h x m), (%w x m), %f]
1516 FailureOr<Operation *>
1518  linalg::WinogradOutputTransformOp op);
1519 
1520 /// Method to deduplicate operands and remove dead results of `linalg.generic`
1521 /// operations. This is effectively DCE for a linalg.generic op. If there is
1522 /// deduplication of operands orremoval of results, replaces the `genericOp`
1523 /// with a new op and returns it. Returns the same operation if there is no
1524 /// deduplication/removal.
1525 FailureOr<linalg::GenericOp> deduplicateOperandsAndRemoveDeadResults(
1526  RewriterBase &rewriter, linalg::GenericOp genericOp, bool removeOutputs);
1527 
1528 //===----------------------------------------------------------------------===//
1529 // Rewrite patterns wrapping transformations.
1530 // TODO: every single such pattern should be a close to noop wrapper around a
1531 // functional-stye API call.
1532 //===----------------------------------------------------------------------===//
1533 
1534 /// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D
1535 /// convolution ops.
1536 template <typename Conv2DOp, typename Conv1DOp>
1538  : public OpRewritePattern<Conv2DOp> {
1540 
1541  FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp,
1542  PatternRewriter &rewriter) const;
1543 
1544  LogicalResult matchAndRewrite(Conv2DOp convOp,
1545  PatternRewriter &rewriter) const override {
1546  return returningMatchAndRewrite(convOp, rewriter);
1547  }
1548 };
1549 
1550 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
1551  Conv1DNwcWcfOp>;
1552 extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
1553  Conv1DNcwFcwOp>;
1554 
1555 /// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh)
1556 /// dimensions into 1-D depthwise convolution ops.
1558  : public OpRewritePattern<DepthwiseConv2DNhwcHwcOp> {
1560  PatternBenefit benefit = 1)
1561  : OpRewritePattern<DepthwiseConv2DNhwcHwcOp>(context, benefit) {}
1562 
1563  FailureOr<DepthwiseConv1DNwcWcOp>
1564  returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1565  PatternRewriter &rewriter) const;
1566 
1567  LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp,
1568  PatternRewriter &rewriter) const override {
1569  return returningMatchAndRewrite(convOp, rewriter);
1570  }
1571 };
1572 
1573 struct DownscaleConv2DOp final : public OpRewritePattern<Conv2DOp> {
1575  : OpRewritePattern<Conv2DOp>(context, benefit) {}
1576 
1577  FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp,
1578  PatternRewriter &rewriter) const;
1579 
1580  LogicalResult matchAndRewrite(Conv2DOp convOp,
1581  PatternRewriter &rewriter) const override {
1582  return returningMatchAndRewrite(convOp, rewriter);
1583  }
1584 };
1585 
1586 ///
1587 /// Linalg generalization pattern.
1588 ///
1589 /// Apply the `generalization` transformation as a pattern.
1590 /// See `generalization` for more details.
1591 //
1592 // TODO: Automatic default pattern class that just unwraps a function
1593 // returning FailureOr<GenericOp>.
1595  : public OpInterfaceRewritePattern<LinalgOp> {
1597 
1598  /// `matchAndRewrite` implementation that returns the significant
1599  /// transformed pieces of IR.
1600  FailureOr<GenericOp>
1601  returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const {
1602  return generalizeNamedOp(rewriter, op);
1603  }
1604 
1605  LogicalResult matchAndRewrite(LinalgOp op,
1606  PatternRewriter &rewriter) const override {
1607  return returningMatchAndRewrite(op, rewriter);
1608  }
1609 };
1610 
1611 struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
1613 
1614  FailureOr<GenericOp>
1615  returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const {
1616  return specializeGenericOp(rewriter, op);
1617  }
1618 
1619  LogicalResult matchAndRewrite(GenericOp op,
1620  PatternRewriter &rewriter) const override {
1621  return returningMatchAndRewrite(op, rewriter);
1622  }
1623 };
1624 
1625 /// Vectorization pattern for memref::CopyOp.
1626 struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> {
1628 
1629  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
1630  PatternRewriter &rewriter) const override;
1631 };
1632 
1634  std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>;
1635 
1636 /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
1637 /// InsertSliceOp. For now, only constant padding values are supported.
1638 struct DecomposePadOpPattern : public OpRewritePattern<tensor::PadOp> {
1640  : OpRewritePattern<tensor::PadOp>(context, benefit) {}
1641  LogicalResult matchAndRewrite(tensor::PadOp padOp,
1642  PatternRewriter &rewriter) const override;
1643 
1644 protected:
1645  Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp,
1646  Value dest,
1647  const SmallVector<Value> &dynSizes) const;
1648 };
1649 
1650 /// Rewrites a linalg::PackOp into a sequence of:
1651 /// * tensor::PadOp + linalg::TransposeOp + tensor::EmptyOp +
1652 /// tensor::InsertSliceOp ops.
1653 ///
1654 /// Requires that all the outer dims of the input linalg::PackOp are 1.
1655 ///
1656 /// Before:
1657 /// ```
1658 /// %packed = linalg.pack %input
1659 /// padding_value(%pad : f32)
1660 /// inner_dims_pos = [1, 0]
1661 /// inner_tiles = [2, %high]
1662 /// into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32>
1663 /// ```
1664 ///
1665 /// After:
1666 /// ```
1667 /// // PadOp
1668 /// %padded = tensor.pad %arg0 low[0, 0] high[%0, 1] {
1669 /// ^bb0(...):
1670 /// tensor.yield %arg2 : f32
1671 /// } : tensor<5x1xf32> to tensor<?x2xf32>
1672 /// // EmptyOp + TransposeOp
1673 /// %empty = tensor.empty(%arg3) : tensor<2x?xf32>
1674 /// %transposed = linalg.transpose
1675 /// ins(%extracted_slice : tensor<?x2xf32>)
1676 /// outs(%empty : tensor<2x?xf32>)
1677 /// permutation = [1, 0]
1678 /// // InsertSliceOp
1679 /// %inserted_slice = tensor.insert_slice %transposed
1680 /// into %arg1[0, 0, 0, 0] [1, 1, 2, %tile_dim_1] [1, 1, 1, 1]
1681 /// : tensor<2x?xf32> into tensor<1x1x2x?xf32>
1682 /// ```
1684  : public OpRewritePattern<linalg::PackOp> {
1686  LogicalResult matchAndRewrite(linalg::PackOp packOp,
1687  PatternRewriter &rewriter) const override;
1688 };
1689 
1690 /// Rewrites a linalg::UnPackOp into a sequence of rank-reduced
1691 /// * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp
1692 ///
1693 /// Requires that all the tiled outer dims of the input linalg::PackOp are 1.
1694 ///
1695 /// Before:
1696 /// ```
1697 /// %packed = linalg.unpack %input
1698 /// inner_dims_pos = [1, 0]
1699 /// inner_tiles = [2, 8]
1700 /// into %output : tensor<1x1x2x8xf32> -> tensor<5x1xf32>
1701 /// ```
1702 ///
1703 /// After:
1704 /// ```
1705 /// // Rank-reduced extract to obtain the tile
1706 /// %slice = tensor.extract_slice %arg0[0, 0, 0, 0] [1, 1, 2, 8] [1, 1, 1, 1]
1707 /// : tensor<1x1x2x8xf32> to tensor<2x8xf32>
1708 /// // EmptyOp + TransposeOp
1709 /// %init = tensor.empty() : tensor<8x2xf32>
1710 /// %transposed = linalg.transpose
1711 /// ins(%extracted_slice : tensor<2x8xf32>)
1712 /// outs(%0 : tensor<8x2xf32>) permutation = [1, 0]
1713 /// // Extract a slice matching the specified output size
1714 /// %result = tensor.extract_slice %transposed[0, 0] [5, 1] [1, 1]
1715 /// : tensor<8x2xf32> to tensor<5x1xf32>
1716 /// ```
1718  : public OpRewritePattern<linalg::UnPackOp> {
1720  LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp,
1721  PatternRewriter &rewriter) const override;
1722 };
1723 
1724 /// Match and rewrite for the pattern:
1725 /// ```
1726 /// %alloc = ...
1727 /// [optional] %view = memref.view %alloc ...
1728 /// %subView = subview %allocOrView ...
1729 /// [optional] linalg.fill(%allocOrView, %cst) ...
1730 /// ...
1731 /// memref.copy(%in, %subView) ...
1732 /// vector.transfer_read %allocOrView[...], %cst ...
1733 /// ```
1734 /// into
1735 /// ```
1736 /// [unchanged] %alloc = ...
1737 /// [unchanged] [optional] %view = memref.view %alloc ...
1738 /// [unchanged] [unchanged] %subView = subview %allocOrView ...
1739 /// ...
1740 /// vector.transfer_read %in[...], %cst ...
1741 /// ```
1742 /// Where there is no interleaved use between memref.copy and transfer_read as
1743 /// well as no interleaved use between linalg.fill and memref.copy (if
1744 /// linalg.fill is specified).
1745 /// This is a custom rewrite to forward partial reads (with optional fills) to
1746 /// vector.transfer_read.
1748  : public OpRewritePattern<vector::TransferReadOp> {
1750 
1751  LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
1752  PatternRewriter &rewriter) const override;
1753 };
1754 
1755 /// Match and rewrite for the pattern:
1756 /// ```
1757 /// %alloc = ...
1758 /// [optional] %view = memref.view %alloc ...
1759 /// %subView = subview %allocOrView...
1760 /// ...
1761 /// vector.transfer_write %..., %allocOrView[...]
1762 /// memref.copy(%subView, %out)
1763 /// ```
1764 /// into
1765 /// ```
1766 /// [unchanged] %alloc = ...
1767 /// [unchanged] [optional] %view = memref.view %alloc ...
1768 /// [unchanged] %subView = subview %allocOrView...
1769 /// ...
1770 /// vector.transfer_write %..., %out[...]
1771 /// ```
1772 /// Where there is no interleaved use between transfer_write and memref.copy.
1773 /// This is a custom rewrite to forward partial writes to
1774 /// vector.transfer_write.
1776  : public OpRewritePattern<vector::TransferWriteOp> {
1778 
1779  LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
1780  PatternRewriter &rewriter) const override;
1781 };
1782 
1783 /// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
1785  : public OpRewritePattern<tensor::ExtractSliceOp> {
1786  /// A function to control pattern application and rewrite logic.
1787  ///
1788  /// The function will be given the slice op and should return:
1789  /// - std::nullopt: to fail the match and not apply the pattern;
1790  /// - true: to apply the pattern with zero slice guard;
1791  /// - false: to apply the pattern without zero slice guard.
1792  ///
1793  /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice
1794  /// guard.
1795  using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>;
1796 
1798  ControlFn controlFn = nullptr,
1799  PatternBenefit benefit = 1)
1800  : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {}
1801 
1802  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
1803  PatternRewriter &rewriter) const override;
1804 
1805 private:
1806  ControlFn controlFn;
1807 };
1808 
1809 //===----------------------------------------------------------------------===//
1810 // Populate functions.
1811 //===----------------------------------------------------------------------===//
1812 
1813 /// Canonicalization patterns relevant to apply after tiling patterns. These
1814 /// are applied automatically by the tiling pass but need to be applied
1815 /// manually when tiling is called programmatically.
1818 
1819 /// Linalg generalization patterns
1820 
1821 /// Populates `patterns` with patterns to convert spec-generated named ops to
1822 /// linalg.generic ops.
1824 
1825 /// Populates `patterns` with patterns to convert linalg.generic ops to named
1826 /// ops where possible. A linalg.generic can represent wide range and complex
1827 /// computations for which equivalent linalg named op may not exist e.g.
1828 /// linalg.generic that takes a tensor and computes a polynomial such as:
1829 /// p(x) = an*x^n + ... + a1x + a0
1830 /// There is no equivalent named op to convert to. Many such cases exist.
1833 
1834 /// Populates `patterns` that convert linalg named ops e.g. `linalg.add`
1835 /// to equivalent `linalg.elementwise`.
1837 
1838 /// Populates `patterns` with patterns that fold operations like
1839 /// `linalg.transform` into elementwise op map.
1841 
1842 /// Linalg decompose convolutions patterns
1843 
1844 /// Populates patterns to decompose high-D convolution ops into low-D ones.
1845 /// This is a step in progressive lowering for convolution ops, afterwards we
1846 /// can vectorize the low-D convolution ops.
1848  PatternBenefit benefit = 1);
1849 
1850 /// Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g.
1851 /// tensor.pad, linalg.transpose, tensor.{insert|extract}_slice. Require all
1852 /// outer dims to be unit.
1854 
1855 /// Populates patterns to decompose tensor.pad into e.g.
1856 /// tensor.empty, linalg.fill, tensor.insert_slice.
1858 
1859 /// Populates patterns to transform linalg.conv_2d_xxx operations into
1860 /// linalg.generic (for img2col packing) and linalg.matmul.
1861 /// \see rewriteInIm2Col for more details.
1863 
1864 /// Populates `patterns` with patterns that vectorize tensor.pad.
1865 /// These patterns are meant to apply in a complementary fashion. Benefits
1866 /// are used to encode a certain ordering of pattern application. To avoid
1867 /// scattering magic constants throughout the code base, the patterns must be
1868 /// added with this function. `baseBenefit` can be used to offset the benefit
1869 /// of all tensor::PadOp vectorization patterns by a certain value.
1871  PatternBenefit baseBenefit = 1);
1872 
1873 /// Populate patterns for splitting a `LinalgOp` with multiple statements within
1874 /// its payload into multiple `GenericOp` that have a single statement.
1875 /// The option `removeDeadArgsAndResults` adds patterns to remove dead arguments
1876 /// and results from the generated decomposed ops. This is default `true` since
1877 /// the core decomposition patterns relies on these clean up patterns. It is set
1878 /// to false only for testing purposes.
1880  bool removeDeadArgsAndResults = true);
1881 
1882 /// Populate patterns that convert non-destination-style ops to destination
1883 /// style ops.
1885 
1886 /// Populate patterns for vectorizing low-D convolution ops. This is a step in
1887 /// progressive lowering for convolution ops, it assume high-D convolution ops
1888 /// were decomposed previously.
1890  PatternBenefit benefit = 1);
1891 
1892 /// Populate patterns that convert `ElementwiseMappable` ops to linalg
1893 /// parallel loops.
1895 
1896 /// Populate patterns that are only useful in the context of sparse tensors.
1898 
1899 /// Function type which is used to control when to stop fusion. It is expected
1900 /// that OpOperand is not modified in the callback. The OpOperand is not marked
1901 /// as const to allow callers to use non-const methods.
1902 using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>;
1903 
1904 /// Patterns for fusing linalg operation on tensors.
1905 
1906 /// Pattern to fuse `linalg.generic` -> `linalg.generic` operations
1907 /// when both operations are fusable elementwise operations.
1910  const ControlFusionFn &controlElementwiseOpFusion);
1911 
1912 /// Function type which is used to control propagation of linalg.pack/unpack
1913 /// ops.
1914 using ControlPropagationFn = std::function<bool(OpOperand *opOperand)>;
1915 
1916 /// Patterns to bubble up or down data layout ops across other operations.
1919  const ControlPropagationFn &controlPackUnPackPropagation);
1920 
1921 /// Patterns to sink extract slice across other operations.
1924  const ControlPropagationFn &controlPackUnPackPropagation);
1925 
1926 /// Pattern to remove dead operands and results of `linalg.generic` operations.
1927 /// This is a pattern wrapper for `deduplicateOperandsAndRemoveDeadResults`.
1929 
1930 /// Patterns to promote inputs to outputs and remove unused inputs of
1931 /// `linalg.generic` ops.
1933 
1934 /// Function type to control generic op dimension collapsing. It is expected
1935 /// to return an array of `ReassociationIndices` representing dimensions that
1936 /// should be merged.
1938  std::function<SmallVector<ReassociationIndices>(linalg::LinalgOp)>;
1939 
1940 /// Pattern to collapse dimensions in a linalg.generic op. This will collapse
1941 /// tensor operands when needed and expand back the result tensors.
1944  const GetCollapsableDimensionsFn &controlCollapseDimensions);
1945 
1946 /// Patterns to fold an expanding (collapsing) tensor_reshape operation with its
1947 /// producer (consumer) generic operation by expanding the dimensionality of the
1948 /// loop in the generic op.
1950  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1951 
1952 /// Patterns to fold an expanding tensor.expand_shape operation with its
1953 /// producer generic operation by collapsing the dimensions of the generic op.
1955  RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
1956 
1957 /// Patterns to constant fold Linalg operations.
1959  const ControlFusionFn &controlFn);
1960 
1961 /// Pattern to replace `linalg.add` when destination passing on a contraction op
1962 /// suffices for achieving the sum.
1964 
1965 /// Pattern to fuse a `tensor.pad` operation with the producer of its source,
1966 /// if the producer is a `linalg` operation with all parallel iterator types.
1969 
1970 /// Patterns to simplify depthwise convolutions.
1972 
1973 /// Patterns to fold unit-extent dimensions in operands/results of linalg ops on
1974 /// tensors via reassociative reshape ops.
1977 
1978 /// A pattern that converts init operands to input operands.
1980 
1981 /// Patterns that are used to inline constant operands into linalg generic ops.
1983 
1984 /// Patterns that are used to bubble up extract slice op above linalg op.
1986 
1987 /// Adds patterns that waps tensor.extract_slice(linalg.fill(%cst, %init)) into
1988 /// linalg.fill(%cst, tensor.extract_slice(%init)).
1990 
1991 /// Add patterns to make explicit broadcasts and transforms in the
1992 /// input operands of a genericOp.
1994 
1995 /// Patterns to apply `splitReduction` below.
1998  const ControlSplitReductionFn &controlSplitReductionFn,
1999  bool useAlloc = false);
2000 
2001 /// Patterns to convert Linalg matmul ops to transposed variants.
2003  bool transposeLHS = true);
2004 
2005 /// Patterns to block pack Linalg matmul ops.
2007  const ControlBlockPackMatmulFn &controlFn);
2008 
2009 /// Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
2011  WinogradConv2DFmr fmr);
2012 
2013 /// Patterns to decompose Winograd operators.
2015 
2016 /// Adds patterns that reduce the rank of named contraction ops that have
2017 /// unit dimensions in the operand(s) by converting to a sequence of
2018 /// `collapse_shape`,
2019 /// `<corresponding linalg named op>`, `expand_shape` (if on tensors). For
2020 /// example a `linalg.batch_matmul` with unit batch size will convert to
2021 /// `linalg.matmul` and a `linalg.matvec` with with unit spatial dim in lhs will
2022 /// convert to a `linalg.dot`.
2024 
2025 /// Function type which is used to control folding operations like `tensor.pad`
2026 /// and `tensor.extract_slice` into linalg.pack/unpack ops.
2027 using ControlFoldIntoPackUnpackFn = std::function<bool(OpOperand *opOperand)>;
2028 /// Populates `patterns` with patterns that fold operations like `tensor.pad`
2029 /// and `tensor.extract_slice` into `tensor.pack` and `tensor.unpack` operations
2030 /// respectively.
2033  const ControlFoldIntoPackUnpackFn &controlFn = nullptr);
2034 
2035 /// Populates `patterns` with patterns that fold operations like `linalg.pack`
2036 /// and `linalg.unpack` into `tensor.empty`.
2038 
2039 /// Populates `patterns` with patterns that simplify `tensor.pack` and
2040 /// `tensor.unpack` operations.
2042 
2043 } // namespace linalg
2044 } // namespace mlir
2045 
2046 #endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
static llvm::ManagedStatic< PassManagerOptions > options
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:46
Attributes are known-constant values of operations.
Definition: Attributes.h:25
The main mechanism for performing data layout queries.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:63
This class helps build Operations.
Definition: Builders.h:205
This class represents a single result from folding an operation.
Definition: OpDefinition.h:272
This class represents an operand of an operation.
Definition: Value.h:257
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
Definition: PatternMatch.h:34
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:783
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:358
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
FailureOr< PackingResult > buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, scf::ForOp outermostEnclosingForOp, ArrayRef< int64_t > transposeVector)
Build the packing loop nest required to hoist opToHoist above outermostEnclosingForOp.
void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns)
A pattern that converts init operands to input operands.
void populateTransposeMatmulPatterns(RewritePatternSet &patterns, bool transposeLHS=true)
Patterns to convert Linalg matmul ops to transposed variants.
void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns)
Adds patterns that reduce the rank of named contraction ops that have unit dimensions in the operand(...
LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad, const LinalgPaddingOptions &options, LinalgOp &paddedOp, SmallVector< Value > &replacements, SmallVector< tensor::PadOp > &padOps)
Pad the iterator dimensions options.paddingDimensions of all opToPad operands to a static bounding bo...
Definition: Padding.cpp:244
void populateSplitReductionPattern(RewritePatternSet &patterns, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Patterns to apply splitReduction below.
void populateFuseTensorPadWithProducerLinalgOpPatterns(RewritePatternSet &patterns)
Pattern to fuse a tensor.pad operation with the producer of its source, if the producer is a linalg o...
FailureOr< std::pair< Operation *, Operation * > > rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp)
Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing) and linalg....
bool areDimSequencesPreserved(ArrayRef< AffineMap > maps, ArrayRef< ReassociationIndices > dimSequences)
Return true if all sequences of dimensions specified in dimSequences are contiguous in all the ranges...
bool hasVectorizationImpl(Operation *)
Return true if there's dedicated logic in the Linalg Vectorizer to vectorize this Op,...
void populateExtractSliceSinkingPatterns(RewritePatternSet &patterns, const ControlPropagationFn &controlPackUnPackPropagation)
Patterns to sink extract slice across other operations.
void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns)
Patterns that are used to bubble up extract slice op above linalg op.
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
Definition: Tiling.cpp:73
std::function< std::optional< Value >(OpBuilder &b, memref::SubViewOp subView, ArrayRef< Value > boundingSubViewSize, DataLayout &layout)> AllocBufferCallbackFn
Callback function type used to perform the allocation for the promoted subView.
Definition: Transforms.h:381
void populateBlockPackMatmulPatterns(RewritePatternSet &patterns, const ControlBlockPackMatmulFn &controlFn)
Patterns to block pack Linalg matmul ops.
void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns)
Populates patterns to transform linalg.conv_2d_xxx operations into linalg.generic (for img2col packin...
FailureOr< Operation * > decomposeWinogradFilterTransformOp(RewriterBase &rewriter, linalg::WinogradFilterTransformOp op)
Rewrite linalg.winograd_filter_transform.
DenseMap< int, int > LoopIndexToRangeIndexMap
Creates a number of ranges equal to the number of non-zero in tileSizes.
Definition: Transforms.h:930
std::optional< Value > allocateWorkgroupMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU workgroup memory.
Definition: Promotion.cpp:471
FailureOr< PackTransposeResult > packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef< int64_t > outerPerm, ArrayRef< int64_t > innerPerm)
Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the transposed PackOp -> LinalgOp ...
Definition: Transforms.cpp:657
std::function< IndexingMapOpInterface(Location loc, OpBuilder &, IndexingMapOpInterface, ArrayRef< Value > newOperands, ArrayRef< AffineMap > newIndexingMaps, const llvm::SmallDenseSet< unsigned > &droppedDims)> DroppedUnitDimsBuilder
Definition: Transforms.h:548
Value bufferizeToAllocation(RewriterBase &rewriter, const BufferizeToAllocationOptions &options, tensor::PadOp padOp, Attribute memorySpace={}, Operation *insertionPoint=nullptr)
Materialize a buffer allocation for the given tensor.pad op and lower the op to linalg....
FailureOr< VectorizationResult > vectorize(RewriterBase &rewriter, Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false, bool assumeDynamicDimsMatchVecSizes=false, bool createNamedContraction=false)
Returns a VectorizationResult containing the results of the vectorized op, or failure if the transfor...
std::function< bool(OpOperand *fusedOperand)> ControlFusionFn
Function type which is used to control when to stop fusion.
Definition: Transforms.h:1902
bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence)
Return true if a given sequence of dimensions are contiguous in the range of the specified indexing m...
FailureOr< Value > hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops, ArrayRef< int64_t > transposeVector, tensor::PadOp &hoistedOp, SmallVectorImpl< TransposeOp > &transposeOps)
Mechanically hoist padding operations on tensors by numLoops into a new, generally larger tensor.
SmallVector< OpFoldResult > computePaddedShape(RewriterBase &rewriter, TypedValue< RankedTensorType > v, AffineMap indexingMap, ArrayRef< OpFoldResult > indexingSizes, const PadTilingInterfaceOptions &options)
Helper function to compute the padded shape of the given value v of RankedTensorType given:
void populateDecomposeProjectedPermutationPatterns(RewritePatternSet &patterns)
Add patterns to make explicit broadcasts and transforms in the input operands of a genericOp.
FailureOr< LinalgOp > specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp)
Create a namedOp from the given GenericOp and replace the GenericOp.
Definition: Specialize.cpp:247
void populateFoldReshapeOpsByCollapsingPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding tensor.expand_shape operation with its producer generic operation by co...
LinalgTilingLoopType
The type of loops to be generated during tiling.
Definition: Utils.h:118
FailureOr< LowerUnPackOpResult > lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice=true)
Rewrite pack as empty + transpose + reshape + extract_slice.
Definition: Transforms.cpp:346
std::function< LogicalResult(OpBuilder &b, Value buffer)> DeallocBufferCallbackFn
Callback function type used to deallocate the buffers used to hold the promoted subview.
Definition: Transforms.h:386
void populateDataLayoutPropagationPatterns(RewritePatternSet &patterns, const ControlPropagationFn &controlPackUnPackPropagation)
Patterns to bubble up or down data layout ops across other operations.
void populatePadOpVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit baseBenefit=1)
Populates patterns with patterns that vectorize tensor.pad.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
Definition: Tiling.cpp:857
void populateLinalgFoldIntoElementwisePatterns(RewritePatternSet &patterns)
Populates patterns with patterns that fold operations like linalg.transform into elementwise op map.
LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value)
In case of GPU private memory there is no need to deallocate since the memory is freed when going out...
Definition: Promotion.cpp:512
void populateSparseTensorRewriting(RewritePatternSet &patterns)
Populate patterns that are only useful in the context of sparse tensors.
FailureOr< Operation * > decomposeWinogradOutputTransformOp(RewriterBase &rewriter, linalg::WinogradOutputTransformOp op)
Rewrite linalg.winograd_output_transform.
void populateWinogradConv2DPatterns(RewritePatternSet &patterns, WinogradConv2DFmr fmr)
Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
FailureOr< ElementwiseOpFusionResult > fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand)
FailureOr< PromotionInfo > promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView, bool useOriginalSubviewSize, const AllocBufferCallbackFn &allocationFn, DataLayout &layout)
Definition: Promotion.cpp:237
llvm::SmallDenseSet< int > getPreservedProducerResults(GenericOp producer, GenericOp consumer, OpOperand *fusedOperand)
Returns a set of indices of the producer's results which would be preserved after the fusion.
std::optional< Value > allocateGPUPrivateMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU private memory.
Definition: Promotion.cpp:496
void populateSimplifyDepthwiseConvPatterns(RewritePatternSet &patterns)
Patterns to simplify depthwise convolutions.
FailureOr< Operation * > rewriteInDestinationPassingStyle(RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp)
Rewrite tensor.from_elements to linalg.generic.
FailureOr< PackResult > blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp, const ControlBlockPackMatmulFn &controlPackMatmul)
Pack a matmul operation into blocked 4D layout.
void peelLoops(RewriterBase &rewriter, ArrayRef< scf::ForOp > loops)
Peel 'loops' and applies affine_min/max bounds simplification on the fly where relevant.
Definition: Transforms.cpp:69
void populateConvertToDestinationStylePatterns(RewritePatternSet &patterns)
Populate patterns that convert non-destination-style ops to destination style ops.
FailureOr< Operation * > transposeConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op)
Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by materializing transpose.
void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns, ControlDropUnitDims &options)
Patterns to fold unit-extent dimensions in operands/results of linalg ops on tensors via reassociativ...
LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst)
Create Memref copy operations and add gpu barrier guards before and after the copy operation to ensur...
Definition: Promotion.cpp:487
std::function< SmallVector< Value, 4 >(OpBuilder &, Operation *)> TileSizeComputationFunction
Definition: Transforms.h:189
std::function< LogicalResult(RewriterBase &, tensor::PadOp, Value)> OptimizeCopyFn
Definition: Transforms.h:1634
void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns)
Populate patterns that convert ElementwiseMappable ops to linalg parallel loops.
LogicalResult linalgOpAnchoredEmptyTensorEliminationStep(RewriterBase &rewriter, Operation *op, bufferization::OneShotAnalysisState &state)
Try to eliminate tensor::EmptyOps inside op that are anchored on a LinalgOp.
FailureOr< LinalgLoops > linalgOpToLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.for with the proper body for linalgOp.
Definition: Loops.cpp:368
FailureOr< GenericOp > generalizeNamedOp(RewriterBase &rewriter, LinalgOp linalgOp)
Create a GenericOp from the given named operation linalgOp and replace the given linalgOp.
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
Definition: Tiling.cpp:44
FailureOr< Operation * > transposeBatchMatmul(RewriterBase &rewriter, linalg::BatchMatmulOp op, bool transposeLHS=true)
Pattern to replace.
LogicalResult promoteSubviewsPrecondition(Operation *op, LinalgPromotionOptions options)
Promote memref.subviews feeding linalg-on-buffers operations.
Definition: Promotion.cpp:400
LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst)
Normal copy to between src and dst.
Definition: Promotion.cpp:504
FailureOr< linalg::GenericOp > deduplicateOperandsAndRemoveDeadResults(RewriterBase &rewriter, linalg::GenericOp genericOp, bool removeOutputs)
Method to deduplicate operands and remove dead results of linalg.generic operations.
void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Linalg decompose convolutions patterns.
void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns)
Patterns to decompose Winograd operators.
void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate patterns for vectorizing low-D convolution ops.
std::function< bool(OpOperand *opOperand)> ControlFoldIntoPackUnpackFn
Function type which is used to control folding operations like tensor.pad and tensor....
Definition: Transforms.h:2027
FailureOr< Operation * > winogradConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op, WinogradConv2DFmr fmr)
Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm F(m x m, r x r).
LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp)
Emit a suitable vector form for a Copy op with fully static shape.
FailureOr< SmallVector< OpFoldResult > > computeIndexingMapOpInterfacePaddedShape(RewriterBase &rewriter, OpOperand &operandToPad, ArrayRef< Range > iterationDomain, const PadTilingInterfaceOptions &options)
Specific helper for Linalg ops.
LogicalResult vectorizeOpPrecondition(Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false)
Return success if the operation can be vectorized.
FailureOr< GenericOp > interchangeGenericOp(RewriterBase &rewriter, GenericOp genericOp, ArrayRef< unsigned > interchangeVector)
Interchange the iterator_types and iterator_maps dimensions and adapts the index accesses of op.
Definition: Interchange.cpp:45
void populateCollapseDimensions(RewritePatternSet &patterns, const GetCollapsableDimensionsFn &controlCollapseDimensions)
Pattern to collapse dimensions in a linalg.generic op.
bool areElementwiseOpsFusable(OpOperand *fusedOperand)
Return true if two linalg.generic operations with producer/consumer relationship through fusedOperand...
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
Definition: Tiling.cpp:236
FailureOr< LinalgLoops > linalgOpToAffineLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of affine.for with the proper body for linalgOp.
Definition: Loops.cpp:363
void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns)
Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g.
void populateEraseUnusedOperandsAndResultsPatterns(RewritePatternSet &patterns)
Pattern to remove dead operands and results of linalg.generic operations.
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
Definition: Tiling.cpp:156
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
Definition: Tiling.cpp:106
std::function< LogicalResult(OpBuilder &b, Value src, Value dst)> CopyCallbackFn
Callback function type used to insert copy from original subview to subview of the promoted region fo...
Definition: Transforms.h:393
FailureOr< SplitReductionResult > splitReduction(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns)
Populates patterns with patterns that simplify tensor.pack and tensor.unpack operations.
void populateFoldPackUnpackIntoTensorEmptyPatterns(RewritePatternSet &patterns)
Populates patterns with patterns that fold operations like linalg.pack and linalg....
FailureOr< LinalgOp > padAndHoistLinalgOp(RewriterBase &rewriter, LinalgOp linalgOp, const LinalgPaddingOptions &options)
Apply padding and hoisting to linalgOp according to the configuration specified in options.
Definition: Padding.cpp:355
void populateDecomposeLinalgOpsPattern(RewritePatternSet &patterns, bool removeDeadArgsAndResults=true)
Populate patterns for splitting a LinalgOp with multiple statements within its payload into multiple ...
std::function< bool(OpOperand *opOperand)> ControlPropagationFn
Function type which is used to control propagation of linalg.pack/unpack ops.
Definition: Transforms.h:1914
void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns, const ControlFoldIntoPackUnpackFn &controlFn=nullptr)
Populates patterns with patterns that fold operations like tensor.pad and tensor.extract_slice into t...
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
Definition: Tiling.cpp:589
FailureOr< PackResult > packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp, ArrayRef< OpFoldResult > mnkPackedSizes, ArrayRef< int64_t > mnkPaddedSizesNextMultipleOf, ArrayRef< int64_t > mnkOrder)
Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m and n are proper parallel d...
Definition: Transforms.cpp:748
FailureOr< PackResult > pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, ArrayRef< OpFoldResult > packedSizes)
Implement packing of a single LinalgOp by packedSizes.
Definition: Transforms.cpp:464
void populateEraseUnnecessaryInputsPatterns(RewritePatternSet &patterns)
Patterns to promote inputs to outputs and remove unused inputs of linalg.generic ops.
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
Definition: Tiling.cpp:817
std::function< SmallVector< ReassociationIndices >(linalg::LinalgOp)> GetCollapsableDimensionsFn
Function type to control generic op dimension collapsing.
Definition: Transforms.h:1938
std::function< FailureOr< SmallVector< OpFoldResult > >(RewriterBase &, OpOperand &, ArrayRef< Range >, const PadTilingInterfaceOptions &)> PadSizeComputationFunction
Definition: Transforms.h:631
void populateFoldReshapeOpsByExpansionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding (collapsing) tensor_reshape operation with its producer (consumer) gene...
void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns)
Adds patterns that waps tensor.extract_slice(linalg.fill(cst, init)) into linalg.fill(cst,...
FailureOr< DropUnitDimsResult > dropUnitDims(RewriterBase &rewriter, IndexingMapOpInterface op, const DroppedUnitDimsBuilder &droppedUnitDimsBuilder, const ControlDropUnitDims &options)
void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns)
Patterns that are used to inline constant operands into linalg generic ops.
FailureOr< LinalgOp > promoteSubViews(OpBuilder &b, LinalgOp op, const LinalgPromotionOptions &options)
Promote the subViews into a new buffer allocated at the insertion point b.
Definition: Promotion.cpp:422
void populateConstantFoldLinalgOperations(RewritePatternSet &patterns, const ControlFusionFn &controlFn)
Patterns to constant fold Linalg operations.
std::function< SplitReductionOptions(LinalgOp op)> ControlSplitReductionFn
Function signature to control reduction splitting.
Definition: Transforms.h:491
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value)
In case of GPU group memory there is no need to deallocate.
Definition: Promotion.cpp:480
FailureOr< Operation * > transposeMatmul(RewriterBase &rewriter, linalg::MatmulOp op, bool transposeLHS=true)
Convert Linalg matmul ops to transposed variants.
void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns)
Linalg generalization patterns.
void populateLinalgGenericOpsSpecializationPatterns(RewritePatternSet &patterns)
Populates patterns with patterns to convert linalg.generic ops to named ops where possible.
Definition: Specialize.cpp:344
std::function< std::optional< BlockPackMatmulOptions >(linalg::LinalgOp)> ControlBlockPackMatmulFn
Function type which is used to control matmul packing.
Definition: Transforms.h:1339
void populateLinalgNamedToElementwisePatterns(RewritePatternSet &patterns)
Populates patterns that convert linalg named ops e.g.
enum WinogradConv2DFmr uint32_t std::optional< vector::CombiningKind > getCombinerOpKind(Operation *combinerOp)
Return vector::CombiningKind for the given op.
SmallVector< Value > peelLoop(RewriterBase &rewriter, Operation *op)
Try to peel and canonicalize loop op and return the new result.
Definition: Transforms.cpp:53
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx)
Canonicalization patterns relevant to apply after tiling patterns.
Definition: Tiling.cpp:851
FailureOr< CollapseResult > collapseOpIterationDims(LinalgOp op, ArrayRef< ReassociationIndices > foldedIterationDims, RewriterBase &rewriter)
Collapses dimensions of linalg.generic/linalg.copy operation.
FailureOr< Operation * > decomposeWinogradInputTransformOp(RewriterBase &rewriter, linalg::WinogradInputTransformOp op)
Rewrite linalg.winograd_input_transform.
void populateDecomposePadPatterns(RewritePatternSet &patterns)
Populates patterns to decompose tensor.pad into e.g.
void populateFoldAddIntoDestPatterns(RewritePatternSet &patterns)
Pattern to replace linalg.add when destination passing on a contraction op suffices for achieving the...
std::pair< TilingInterface, TilingInterface > splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension, OpFoldResult splitPoint)
Split the given op into two parts along the given iteration space dimension at the specified splitPoi...
Definition: Split.cpp:67
void populateElementwiseOpsFusionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlElementwiseOpFusion)
Patterns for fusing linalg operation on tensors.
FailureOr< SplitReductionResult > splitReductionByScaling(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Scaling-based implementation of the split reduction transformation.
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
Definition: Tiling.cpp:262
FailureOr< LowerPackResult > lowerPack(RewriterBase &rewriter, linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice=true)
Rewrite pack as pad + reshape + transpose.
Definition: Transforms.cpp:217
FailureOr< LinalgLoops > linalgOpToParallelLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.parallel with the proper body for linalgOp.
Definition: Loops.cpp:375
Include the generated interface declarations.
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition: Value.h:488
ArrayRef< int64_t > ReassociationIndicesRef
const FrozenRewritePatternSet & patterns
OpInterfaceRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting a...
Definition: PatternMatch.h:330
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:314
SmallVector< int64_t, 3 > mnkOrder
Permutation of matmul (M, N, K) dimensions order.
Definition: Transforms.h:1319
SmallVector< int64_t, 3 > blockFactors
Minor block factors (mb, nb, kb) for packing relayout where mb, mn are the parallel dimensions and kb...
Definition: Transforms.h:1309
bool rhsTransposeOuterBlocks
Transpose RHS outer block layout [KB][NB] -> [NB][KB].
Definition: Transforms.h:1328
bool lhsTransposeInnerBlocks
Transpose LHS inner block layout [mb][kb] -> [kb][mb].
Definition: Transforms.h:1325
SmallVector< int64_t, 3 > mnkPaddedSizesNextMultipleOf
Next multiples of the packing sizes.
Definition: Transforms.h:1316
bool lhsTransposeOuterBlocks
Transpose LHS outer block layout [MB][KB] -> [KB][MB].
Definition: Transforms.h:1322
bool allowPadding
If true, allows packing of dimensions that only partially fit into the block factors.
Definition: Transforms.h:1313
bool rhsTransposeInnerBlocks
Transpose RHS inner block layout [kb][nb] -> [nb][kb].
Definition: Transforms.h:1331
SmallVector< Value > results
Definition: Transforms.h:1222
Transformation to drop unit-extent dimensions from linalg.generic operations.
Definition: Transforms.h:522
RankReductionStrategy rankReductionStrategy
Definition: Transforms.h:525
std::function< SmallVector< unsigned >(Operation *)> ControlFnTy
Definition: Transforms.h:528
Vectorization pattern for memref::CopyOp.
Definition: Transforms.h:1626
LogicalResult matchAndRewrite(memref::CopyOp copyOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:893
Rewrites a linalg::PackOp into a sequence of:
Definition: Transforms.h:1684
LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override
Rewrites a linalg::UnPackOp into a sequence of rank-reduced.
Definition: Transforms.h:1718
LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const override
Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and InsertSliceOp.
Definition: Transforms.h:1638
LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:921
Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp, Value dest, const SmallVector< Value > &dynSizes) const
Filling dest using FillOp constant padding value if possible.
Definition: Transforms.cpp:900
DecomposePadOpPattern(MLIRContext *context, PatternBenefit benefit=1)
Definition: Transforms.h:1639
LogicalResult matchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1580
FailureOr< Conv1DOp > returningMatchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const
DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit=1)
Definition: Transforms.h:1574
Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh) dimensions into 1-D depthwise c...
Definition: Transforms.h:1558
FailureOr< DepthwiseConv1DNwcWcOp > returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, PatternRewriter &rewriter) const
LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1567
DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context, PatternBenefit benefit=1)
Definition: Transforms.h:1559
Rewrites 2-D convolution ops with size-1 window dimensions into 1-D convolution ops.
Definition: Transforms.h:1538
LogicalResult matchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const override
Definition: Transforms.h:1544
FailureOr< Conv1DOp > returningMatchAndRewrite(Conv2DOp convOp, PatternRewriter &rewriter) const
IndexingMapOpInterface resultOp
Definition: Transforms.h:542
SmallVector< Value > replacements
Definition: Transforms.h:543
Fuse two linalg.generic operations that have a producer-consumer relationship captured through fusedO...
Definition: Transforms.h:561
llvm::DenseMap< Value, Value > replacements
Definition: Transforms.h:563
Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
Definition: Transforms.h:1785
std::function< std::optional< bool >(tensor::ExtractSliceOp)> ControlFn
A function to control pattern application and rewrite logic.
Definition: Transforms.h:1795
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const override
Definition: Transforms.cpp:972
ExtractSliceOfPadTensorSwapPattern(MLIRContext *context, ControlFn controlFn=nullptr, PatternBenefit benefit=1)
Definition: Transforms.h:1797
Transformation information returned after reduction tiling.
Definition: Transforms.h:1012
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
Definition: Transforms.h:1016
SmallVector< Value > initialValues
Initial values used for partial reductions.
Definition: Transforms.h:1018
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
Definition: Transforms.h:1020
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
Definition: Transforms.h:1014
Match and rewrite for the pattern:
Definition: Transforms.h:1748
LogicalResult matchAndRewrite(vector::TransferReadOp xferOp, PatternRewriter &rewriter) const override
TODO: use interfaces, side-effects and aliasing analysis as appropriate, when available.
Match and rewrite for the pattern:
Definition: Transforms.h:1776
LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp, PatternRewriter &rewriter) const override
TODO: use interfaces, side-effects and aliasing analysis as appropriate, when available.
Linalg generalization pattern.
Definition: Transforms.h:1595
LogicalResult matchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const override
Definition: Transforms.h:1605
FailureOr< GenericOp > returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const
matchAndRewrite implementation that returns the significant transformed pieces of IR.
Definition: Transforms.h:1601
Options that allow distribution of loops generated in Linalg transforms to processors while generatin...
Definition: Utils.h:319
SmallVector< Attribute > paddingValues
A padding value for every operand.
Definition: Transforms.h:283
LinalgPaddingOptions & setPadToMultipleOf(ArrayRef< int64_t > m)
Definition: Transforms.h:296
DenseMap< std::pair< unsigned, unsigned >, OpFoldResult > sizeToPadTo
A mapping between an operand and shape dim, and a size for a padding dimension.
Definition: Transforms.h:304
std::optional< SmallVector< int64_t > > padToMultipleOf
A list of multiples to which each padding dimension should be padded to.
Definition: Transforms.h:295
OpFoldResult getSizeToPadTo(unsigned operandIndex, unsigned dimIndex) const
Given the operand index and shape dim it returns the size to pad to.
Definition: Transforms.h:312
LinalgPaddingOptions & setNofoldFlags(ArrayRef< bool > pp)
Definition: Transforms.h:320
LinalgPaddingOptions & setPaddingDimensions(ArrayRef< int64_t > pd)
Definition: Transforms.h:290
LinalgPaddingOptions & setTransposePaddings(ArrayRef< SmallVector< int64_t >> tp)
Definition: Transforms.h:334
SmallVector< SmallVector< int64_t > > transposePaddings
A permutation vector for every operand used to transpose the packed PadOp results.
Definition: Transforms.h:332
LinalgPaddingOptions & setSizeToPadTo(unsigned operandIndex, unsigned dimIndex, OpFoldResult size)
Definition: Transforms.h:305
LinalgPaddingOptions & setPaddingValues(ArrayRef< Attribute > pv)
Definition: Transforms.h:284
SmallVector< bool > nofoldFlags
A flag for every operand to mark the PadOp as nofold which enables packing for statically shaped oper...
Definition: Transforms.h:319
LinalgPaddingOptions & setCopyBackOp(CopyBackOp op)
Definition: Transforms.h:346
LinalgPaddingOptions & setHoistPaddings(ArrayRef< int64_t > hp)
Definition: Transforms.h:326
SmallVector< int64_t > hoistPaddings
A number of loops to hoist the PadOp out for every operand.
Definition: Transforms.h:325
SmallVector< int64_t > paddingDimensions
A list of iterator dimensions to pad.
Definition: Transforms.h:289
CopyBackOp copyBackOp
The op to be used for copying the padded result to the original destination tensor.
Definition: Transforms.h:345
std::optional< unsigned > alignment
Alignment of promoted buffer. If std::nullopt do not specify alignment.
Definition: Transforms.h:433
LinalgPromotionOptions & setUseFullTileBuffersByDefault(bool use)
Definition: Transforms.h:421
bool useAlloca
Use alloca with the default allocation scheme.
Definition: Transforms.h:446
LinalgPromotionOptions & setAlignment(unsigned align)
Definition: Transforms.h:434
std::optional< Attribute > memorySpace
Memory space of promoted buffer.
Definition: Transforms.h:440
bool useOriginalSubviewSize
If true, buffers will be allocated with the original subview size.
Definition: Transforms.h:427
std::optional< CopyCallbackFn > copyOutFn
Definition: Transforms.h:466
std::optional< CopyCallbackFn > copyInFn
Callback function to do the copy of data to and from the promoted subview.
Definition: Transforms.h:465
LinalgPromotionOptions & setUseAlloca(bool use)
Definition: Transforms.h:447
std::optional< DenseSet< unsigned > > operandsToPromote
Indices of subViews to promote.
Definition: Transforms.h:398
LinalgPromotionOptions & setCopyInOutFns(CopyCallbackFn const &copyIn, CopyCallbackFn const &copyOut)
Definition: Transforms.h:467
LinalgPromotionOptions & setUseFullTileBuffers(ArrayRef< bool > useFullTiles)
Definition: Transforms.h:410
std::optional< AllocBufferCallbackFn > allocationFn
Callback function to do the allocation of the promoted buffer.
Definition: Transforms.h:454
bool useFullTileBuffersDefault
If true all operands unspecified by useFullTileBuffers will use the full view, otherwise the partial ...
Definition: Transforms.h:420
std::optional< DeallocBufferCallbackFn > deallocationFn
Definition: Transforms.h:455
LinalgPromotionOptions & setMemorySpace(Attribute memorySpc)
Definition: Transforms.h:441
LinalgPromotionOptions & setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn, DeallocBufferCallbackFn const &deallocFn)
Definition: Transforms.h:457
LinalgPromotionOptions & setUseOriginalSubviewSize(bool originalSize)
Definition: Transforms.h:428
std::optional< llvm::SmallBitVector > useFullTileBuffers
If ith element of useFullTiles is true the full view should be used for the promoted buffer of the it...
Definition: Transforms.h:409
LinalgPromotionOptions & setOperandsToPromote(ArrayRef< int64_t > operands)
Definition: Transforms.h:399
LogicalResult matchAndRewrite(GenericOp op, PatternRewriter &rewriter) const override
Definition: Transforms.h:1619
FailureOr< GenericOp > returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const
Definition: Transforms.h:1615
std::optional< LinalgLoopDistributionOptions > tileDistribution
When specified, specifies distribution of generated tile loops to processors.
Definition: Transforms.h:273
LinalgTilingAndFusionOptions & setTileSizes(ArrayRef< int64_t > ts)
Definition: Transforms.h:265
SmallVector< int64_t > tileInterchange
Tile interchange used to permute the tile loops.
Definition: Transforms.h:270
LinalgTilingAndFusionOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition: Transforms.h:275
SmallVector< int64_t > tileSizes
Tile sizes used to tile the root operation.
Definition: Transforms.h:264
LinalgTilingOptions & setLoopType(LinalgTilingLoopType lt)
Definition: Transforms.h:229
LinalgTilingOptions & setDistributionTypes(ArrayRef< StringRef > types)
Definition: Transforms.h:247
LinalgTilingOptions & setInterchange(ArrayRef< unsigned > interchange)
Definition: Transforms.h:221
LinalgTilingLoopType loopType
The type of tile loops to generate.
Definition: Transforms.h:227
LinalgTilingOptions & setTileSizeComputationFunction(TileSizeComputationFunction fun)
Definition: Transforms.h:198
LinalgTilingOptions & setTileSizes(const SmallVector< Value, 4 > &ts)
Set the tileSizeComputationFunction to return the values ts.
Definition: Transforms.h:205
LinalgTilingOptions & setPeeledLoops(ArrayRef< int64_t > loops)
Definition: Transforms.h:255
SmallVector< int64_t > peeledLoops
Peel the specified loops.
Definition: Transforms.h:253
LinalgTilingOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition: Transforms.h:239
SmallVector< unsigned, 4 > interchangeVector
The interchange vector to reorder the tiled loops.
Definition: Transforms.h:219
TileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes for each operation.
Definition: Transforms.h:195
LinalgTilingOptions & scalarizeDynamicDims()
Tile all dynamic dimensions by 1.
std::optional< LinalgLoopDistributionOptions > distribution
When specified, specifies distribution of generated tile loops to processors.
Definition: Transforms.h:236
SmallVector< StringRef, 2 > distributionTypes
Specification markers of how to distribute the linalg.tiled_loop.
Definition: Transforms.h:245
linalg::TransposeOp transposeOp
Definition: Transforms.h:1241
tensor::ExpandShapeOp expandShapeOp
Definition: Transforms.h:1240
tensor::ExtractSliceOp extractSliceOp
Definition: Transforms.h:1253
linalg::TransposeOp transposeOp
Definition: Transforms.h:1251
tensor::CollapseShapeOp collapseShapeOp
Definition: Transforms.h:1252
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Definition: Transforms.h:959
Struct to hold the result of a pack call.
Definition: Transforms.h:1262
SmallVector< linalg::UnPackOp > unPackOps
Definition: Transforms.h:1265
linalg::LinalgOp packedLinalgOp
Definition: Transforms.h:1264
SmallVector< linalg::PackOp > packOps
Definition: Transforms.h:1263
Struct to hold the result of a packTranspose call.
Definition: Transforms.h:1274
linalg::LinalgOp transposedLinalgOp
Definition: Transforms.h:1276
linalg::UnPackOp transposedUnPackOp
Definition: Transforms.h:1277
PadTilingInterfaceOptions & setPaddingSizes(ArrayRef< OpFoldResult > m)
Definition: Transforms.h:361
SmallVector< Attribute > paddingValues
A padding value for every operand.
Definition: Transforms.h:354
PadTilingInterfaceOptions & setPadToMultipleOf(bool b)
Definition: Transforms.h:368
bool padToMultipleOf
Pad iterator paddingDimension[i] to next multiple of paddingSizes[i] if true.
Definition: Transforms.h:367
PadTilingInterfaceOptions & setPaddingValues(ArrayRef< Attribute > pv)
Definition: Transforms.h:355
SmallVector< OpFoldResult > paddingSizes
A list of iterator dimensions sizes to pad to.
Definition: Transforms.h:360
Create a new buffer using the allocationFn provided.
Definition: Transforms.h:828
Split Reduction options.
Definition: Transforms.h:476
Apply transformation to split the single linalg op reduction into a parallel and reduction dimension.
Definition: Transforms.h:1150
Perform standalone tiling of a single LinalgOp by tileSizes.
Definition: Transforms.h:788
SmallVector< Operation *, 8 > loops
Definition: Transforms.h:790
SmallVector< Value, 4 > tensorResults
Definition: Transforms.h:791
Transformation information returned after vectorizing.
Definition: Transforms.h:885
SmallVector< Value > replacements
Results of the vectorization transform to replace the original operation.
Definition: Transforms.h:887
SmallVector< T > tripCounts
Number of tiles associated with each size.
Definition: Transforms.h:950
T lowTripCount
Number of tiles associated with each size.
Definition: Transforms.h:942
Helper struct to hold the results of building a packing loop nest.
Definition: Transforms.h:658
SmallVector< OpFoldResult > strides
Definition: Transforms.h:659
SmallVector< Value > leadingPackedTensorIndexings
Definition: Transforms.h:660
SmallVector< Value > clonedLoopIvs
Definition: Transforms.h:660
SmallVector< OpFoldResult > sizes
Definition: Transforms.h:659
SmallVector< OpFoldResult > offsets
Definition: Transforms.h:659