MLIR 23.0.0git
Transforms.h
Go to the documentation of this file.
1//===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
10#define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
11
12#include <utility>
13
27#include "llvm/ADT/SmallBitVector.h"
28
29namespace mlir {
30namespace bufferization {
31class AllocTensorOp;
33class BufferizationState;
34} // namespace bufferization
35
36namespace linalg {
37
38class LinalgOp;
39enum class WinogradConv2DFmr : uint32_t;
40
41//===----------------------------------------------------------------------===//
42// Utils.
43//===----------------------------------------------------------------------===//
44
45/// Return vector::CombiningKind for the given op.
46std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp);
47
48//===----------------------------------------------------------------------===//
49// Bufferization-related transforms.
50//===----------------------------------------------------------------------===//
51
53 enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 };
55
62
63 /// If set to "true", only the destination tensor operands are bufferized to
64 /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the
65 /// targeted op itself.
67
68 /// If set to "true", a `memref.dealloc` operation will be emitted for each
69 /// allocated buffer. Otherwise, the memory is leaked, which is useful if
70 /// the buffer deallocation pipeline should be run after bufferization is
71 /// done.
72 bool emitDealloc = false;
73};
74
75/// Materialize a buffer allocation for the given tensor.pad op and lower the
76/// op to linalg.fill/linalg.generic + bufferization.materialize_in_destination.
77/// E.g.:
78///
79/// %0 = tensor.pad low[%l] high[%h] %t ...
80///
81/// is lowered to:
82///
83/// %alloc = memref.alloc
84/// linalg.fill ... outs(%alloc)
85/// %subview = memref.subview %alloc [%l] [...] [1]
86/// bufferization.materialize_in_destination %t in %subview
87/// %0 = bufferization.to_tensor %alloc restrict writable
88///
89/// In addition to rewriting the IR as shown above, this function returns the
90/// newly allocated buffer. The `insertionPoint` parameter can be used to
91/// specify a custom insertion point for the buffer allocation.
94 tensor::PadOp padOp, Attribute memorySpace = {},
95 Operation *insertionPoint = nullptr);
96
97/// Materialize a buffer allocation for the given vector.mask op and bufferize
98/// the op, including its region. E.g.:
99///
100/// %0 = vector.mask {
101/// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32>
102/// } : vector<16xi1> -> tensor<?xf32>
103///
104/// is lowered to:
105///
106/// %alloc = memref.alloc
107/// bufferization.materialize_in_destination %t in %subview
108/// vector.mask {
109/// vector.transfer_write %arg0, %alloc : vector<16xf32>, memref<?xf32>
110/// } : vector<16xi1>
111/// %0 = bufferization.to_tensor %alloc restrict writable
112///
113/// In addition to rewriting the IR as shown above, this function returns the
114/// newly allocated buffer. The `insertionPoint` parameter can be used to
115/// specify a custom insertion point for the buffer allocation.
117 const BufferizeToAllocationOptions &options,
118 vector::MaskOp maskOp, Attribute memorySpace = {},
119 Operation *insertionPoint = nullptr);
120
121/// Materialize a buffer allocation for the given bufferization.alloc_tensor op
122/// and lower the op to memref.alloc + memref.tensor_store.
123///
124/// In addition to rewriting the IR, this function returns the newly allocated
125/// buffer. The `insertionPoint` parameter can be used to specify a custom
126/// insertion point for the buffer allocation.
127Value bufferizeToAllocation(RewriterBase &rewriter,
128 const BufferizeToAllocationOptions &options,
129 bufferization::AllocTensorOp allocTensorOp,
130 Attribute memorySpace = {},
131 Operation *insertionPoint = nullptr);
132
133/// Bufferize the given op with tensor semantics and materialize the result in
134/// a newly allocated buffer.
135///
136/// Only bufferizable ops that bufferize to a memory write or have an
137/// aliasing OpOperand (and do not themselves bufferize to an allocation) are
138/// supported. They are bufferized using their BufferizableOpInterface
139/// implementation.
140///
141/// Selected ops that bufferize to an allocation (or need special handling) are
142/// also supported:
143/// - tensor.pad
144/// - vector.mask
145///
146/// This function returns the newly allocated buffer. The `insertionPoint`
147/// parameter can be used to specify a custom insertion point for the buffer
148/// allocation.
149Value bufferizeToAllocation(RewriterBase &rewriter,
151 Operation *op, Attribute memorySpace = {},
152 Operation *insertionPoint = nullptr);
153
154/// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a
155 /// LinalgOp. This transform looks for LinalgOps that have an unused output
156/// operand and an input operand that is rooted in a tensor::EmptyOp. The
157/// tensor::EmptyOp uses are replaced with the output operand and the two
158/// operands of the LinalgOp are swapped.
159///
160/// Example:
161/// %0 = tensor.empty()
162/// %1 = linalg.matmul ins(...) outs(%0)
163/// %2 = linalg.generic ins(%1) outs(%dest) {
164/// ^bb0(%in: f32, %out: f32):
165/// // out not used
166/// }
167///
168/// The IR is transformed as follows:
169/// %0 = tensor.empty()
170/// %1 = linalg.matmul ins(...) outs(%dest)
171/// %2 = linalg.generic ins(%0) outs(%1) {
172/// ^bb0(%in: f32, %out: f32):
173/// // Use %out instead of %in
174/// }
175///
176/// The "ins" operand has no uses inside the body of the LinalgOp and can be
177/// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp
178/// can also fold away.
180 RewriterBase &rewriter, Operation *op,
181 bufferization::OneShotAnalysisState &state);
182
183//===----------------------------------------------------------------------===//
184// Structs that configure the behavior of various transformations.
185//===----------------------------------------------------------------------===//
186
188 std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>;
189
191 /// Computation function that returns the tile sizes for each operation.
192 /// Delayed construction of constant tile sizes should occur to interoperate
193 /// with folding.
195
201 /// Set the `tileSizeComputationFunction` to return the values `ts`. The
202 /// values must not fold away when tiling. Otherwise, use a more robust
203 /// `tileSizeComputationFunction`.
205 tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; };
206 return *this;
207 }
208 /// Convenience function to set the `tileSizeComputationFunction` to a
209 /// function that computes tile sizes at the point they are needed. Allows
210 /// proper interaction with folding.
212
213 /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions.
214 /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together.
216
217 /// The interchange vector to reorder the tiled loops.
219
221 interchangeVector.assign(interchange.begin(), interchange.end());
222 return *this;
223 }
224
225 /// The type of tile loops to generate.
227
229 loopType = lt;
230 return *this;
231 }
232
233 /// When specified, specifies distribution of generated tile loops to
234 /// processors.
235 std::optional<LinalgLoopDistributionOptions> distribution;
236
239 distribution = std::move(distributionOptions);
240 return *this;
241 }
242
243 /// Specification markers of how to distribute the `linalg.tiled_loop`.
245
247 distributionTypes.assign(types.begin(), types.end());
248 return *this;
249 }
250
251 /// Peel the specified loops.
253
255 peeledLoops.clear();
256 peeledLoops.append(loops.begin(), loops.end());
257 return *this;
258 }
259};
260
262 /// Tile sizes used to tile the root operation.
265 tileSizes.assign(ts.begin(), ts.end());
266 return *this;
267 }
268 /// Tile interchange used to permute the tile loops.
270 /// When specified, specifies distribution of generated tile loops to
271 /// processors.
272 std::optional<LinalgLoopDistributionOptions> tileDistribution;
275 tileDistribution = std::move(distributionOptions);
276 return *this;
277 }
278};
279
281 /// A padding value for every operand.
284 paddingValues.assign(pv.begin(), pv.end());
285 return *this;
286 }
287 /// A list of iterator dimensions to pad.
290 paddingDimensions.assign(pd.begin(), pd.end());
291 return *this;
292 }
293 /// A list of multiples to which each padding dimension should be padded to.
294 std::optional<SmallVector<int64_t>> padToMultipleOf;
296 padToMultipleOf.emplace(m.begin(), m.end());
297 return *this;
298 }
299 /// A mapping between an operand and shape dim, and a size for a padding
300 /// dimension. Each size is expected to be greater or equal than the
301 /// corresponding shape dim. If no value is provided then the constant upper
302 /// bound will be used.
304 LinalgPaddingOptions &setSizeToPadTo(unsigned operandIndex, unsigned dimIndex,
305 OpFoldResult size) {
306 assert(size && "expected non-null size");
307 sizeToPadTo[{operandIndex, dimIndex}] = size;
308 return *this;
309 }
310 /// Given the operand index and shape dim it returns the size to pad to.
311 OpFoldResult getSizeToPadTo(unsigned operandIndex, unsigned dimIndex) const {
312 return sizeToPadTo.lookup_or(
313 std::pair<unsigned, unsigned>(operandIndex, dimIndex), nullptr);
314 }
315
316 /// A flag for every operand to mark the PadOp as nofold which enables
317 /// packing for statically shaped operands.
320 nofoldFlags.assign(pp.begin(), pp.end());
321 return *this;
322 }
323 /// A number of loops to hoist the PadOp out for every operand.
326 hoistPaddings.assign(hp.begin(), hp.end());
327 return *this;
328 }
329 /// A permutation vector for every operand used to transpose the packed
330 /// PadOp results.
334 transposePaddings.assign(tp.begin(), tp.end());
335 return *this;
336 }
342 /// The op to be used for copying the padded result to the original
343 /// destination tensor.
346 copyBackOp = op;
347 return *this;
348 }
349};
350
352 /// A padding value for every operand.
355 paddingValues.assign(pv.begin(), pv.end());
356 return *this;
357 }
358 /// A list of iterator dimensions sizes to pad to.
361 paddingSizes.assign(m.begin(), m.end());
362 return *this;
363 }
364 /// Pad iterator `paddingDimension[i]` to next multiple of `paddingSizes[i]`
365 /// if true. Otherwise pad to `paddingSizes[i]`.
369 return *this;
370 }
371};
372
373/// Callback function type used to perform the allocation for the promoted
374/// `subView`. In `boundingSubViewsize` a best attempt is made to find the
375/// smallest constant value for the size of the buffer needed for each
376/// dimension. If that is not possible, contains the dynamic size of the
377 /// subview. The callback should return the buffer to use.
378using AllocBufferCallbackFn = std::function<std::optional<Value>(
379 OpBuilder &b, memref::SubViewOp subView,
380 ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>;
381
382/// Callback function type used to deallocate the buffers used to hold the
383/// promoted subview.
385 std::function<LogicalResult(OpBuilder &b, Value buffer)>;
386
387/// Callback function type used to insert copy from original subview to
388/// subview of the promoted region for the read operands/subview of promoted
389/// region to original subview for the results. The copy has to happen from
390/// `src` to `dst`.
392 std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
393
395 /// Indices of subViews to promote. If `std::nullopt`, try to promote all
396 /// operands.
397 std::optional<DenseSet<unsigned>> operandsToPromote;
400 operandsToPromote->insert_range(operands);
401 return *this;
402 }
403 /// If ith element of `useFullTiles` is true the full view should be used
404 /// for the promoted buffer of the ith operand in `operandsToPromote`.
405 /// Otherwise the partial view will be used. The decision is defaulted to
406 /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and
407 /// for operands missing from `useFullTileBuffers`.
408 std::optional<llvm::SmallBitVector> useFullTileBuffers;
410 unsigned size = useFullTiles.size();
411 llvm::SmallBitVector tmp(size, false);
412 for (unsigned i = 0; i < size; ++i)
413 tmp[i] = useFullTiles[i];
414 useFullTileBuffers = tmp;
415 return *this;
416 }
417 /// If true all operands unspecified by `useFullTileBuffers` will use the
418 /// full view, otherwise the partial view.
424 /// If true, buffers will be allocated with the original subview size. This
425 /// may result in more dynamic allocations, in case of dynamic sizes.
428 useOriginalSubviewSize = originalSize;
429 return *this;
430 }
431 /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment.
432 std::optional<unsigned> alignment;
434 alignment = align;
435 return *this;
436 }
437 /// Memory space of promoted buffer. If `std::nullopt` do not specify memory
438 /// space.
439 std::optional<Attribute> memorySpace;
441 memorySpace = memorySpc;
442 return *this;
443 }
444 /// Use alloca with the default allocation scheme.
445 bool useAlloca = false;
447 useAlloca = use;
448 return *this;
449 }
450 /// Callback function to do the allocation of the promoted buffer. If
451 /// std::nullopt, then the default allocation scheme of allocating a
452 /// memref<?xi8> buffer followed by a view operation is used.
453 std::optional<AllocBufferCallbackFn> allocationFn;
454 std::optional<DeallocBufferCallbackFn> deallocationFn;
457 DeallocBufferCallbackFn const &deallocFn) {
458 allocationFn = allocFn;
459 deallocationFn = deallocFn;
460 return *this;
461 }
462 /// Callback function to do the copy of data to and from the promoted
463 /// subview. If std::nullopt then a memref.copy is used.
464 std::optional<CopyCallbackFn> copyInFn;
465 std::optional<CopyCallbackFn> copyOutFn;
467 CopyCallbackFn const &copyOut) {
468 copyInFn = copyIn;
469 copyOutFn = copyOut;
470 return *this;
471 }
472};
473
474/// Split Reduction options.
476 // Ratio used to split the reduction dimension. If the ratio is <= 1,
477 // nothing will be done.
479 // Index where the extra dimension is added to the intermediate tensor
480 // shape.
481 unsigned index = 0;
482 // If the inner dimension after splitting is parallel or reduction.
483 bool innerParallel = false;
484};
485
486/// Function signature to control reduction splitting. This returns
487/// `SplitReductionOptions`.
488// TODO: don't use unsigned unless doing bit manipulation.
490 std::function<SplitReductionOptions(LinalgOp op)>;
491
492//===----------------------------------------------------------------------===//
493// Preconditions that ensure the corresponding transformation succeeds and can
494// be applied as a rewrite pattern.
495//===----------------------------------------------------------------------===//
496
497/// Return true if two `linalg.generic` operations with producer/consumer
498/// relationship through `fusedOperand` can be fused using elementwise op
499/// fusion.
500bool areElementwiseOpsFusable(OpOperand *fusedOperand);
501
502/// Promote memref.subviews feeding linalg-on-buffers operations.
503LogicalResult promoteSubviewsPrecondition(Operation *op,
505
506/// Return success if the operation can be vectorized.
508 ArrayRef<int64_t> inputVectorSizes = {},
509 ArrayRef<bool> inputScalableVecDims = {},
510 bool vectorizeNDExtract = false,
511 bool flatten1DDepthwiseConv = false);
512
513//===----------------------------------------------------------------------===//
514// Transformations exposed as functional-style API calls.
515//===----------------------------------------------------------------------===//
516
518
519/// Transformation to drop unit-extent dimensions from `linalg.generic`
520/// operations.
523
526
527 /// Instances of this type are used to control which dimensions of an operand
528 /// are considered for dropping unit extent dimensions. The parameter to the
529 /// function is the operation itself, the expected return is a list of
530 /// dimensions to consider for dropping unit extent dimensions. If the
531 /// operation should not have any dimensions dropped, implementations
532 /// should return an empty list.
533 using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>;
534
535 /// Function to control which dimensions, if any, are to be considered for
536 /// dropping unit extent dimensions. The default behavior is to consider all
537 /// dimensions of a \c linalg.generic or \c tensor.pad operation for dropping.
538 /// Users of the \ref dropUnitDims interface can override the default behavior
539 /// by setting this member to their own implementation.
541 if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
542 return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
543 }
544 if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) {
545 return llvm::to_vector(
546 llvm::seq<unsigned>(0, padOp.getSourceType().getRank()));
547 }
548 return SmallVector<unsigned>{};
549 };
550
551 /// Instances of this type are used to control how operand values are
552 /// collapsed after dropping unit extent dimensions. Next to the control
553 /// struct, rewriter and location, the function receives the operand value to
554 /// collapse, the new target shape and how old dimensions should be grouped.
555 /// The function needs to insert the necessary operations to collapse the
556 /// operand to the target shape and returns the new operand value.
557 /// If the operand should not be collapsed, the function should return
558 /// failure, causing the transformation to be aborted.
559 using CollapseFnTy = std::function<FailureOr<Value>(
562
563 /// Function to control how operands are collapsed into their new target shape
564 /// after dropping unit extent dimensions. For the default behavior
565 /// \see linalg::collapseValue.
566 /// Users of the \ref dropUnitDims interface can override the default behavior
567 /// by setting this member to their own implementation.
569 [](RewriterBase &rewriter, Location loc, Value operand,
570 ArrayRef<int64_t> targetShape,
571 ArrayRef<ReassociationIndices> reassociation,
572 const ControlDropUnitDims &control) -> FailureOr<Value> {
573 return collapseValue(rewriter, loc, operand, targetShape, reassociation,
574 control);
575 };
576
577 /// Instances of this type are used to control how result values are expanded
578 /// into their original shape after dropping unit extent dimensions. Next to
579 /// the control construct, rewriter and location, the function receives the
580 /// result value, the original value to replace and information on how the
581 /// new dimensions were grouped.
582 /// The function needs to insert the necessary operations to expand the
583 /// result to the original shape and returns the new result value.
584 /// If the result should not be expanded, the function should return
585 /// failure, causing the transformation to be aborted.
586 using ExpandFnTy = std::function<FailureOr<Value>(
588 const ControlDropUnitDims &)>;
589
590 /// Function to control how results are expanded into their original shape
591 /// after dropping unit extent dimensions. The default behavior
592 /// \see linalg::expandValue.
593 /// Users of the \ref dropUnitDims interface can override the default behavior
594 /// by setting this member to their own implementation.
596 [](RewriterBase &rewriter, Location loc, Value result, Value origDest,
597 ArrayRef<ReassociationIndices> reassociation,
598 const ControlDropUnitDims &control) -> FailureOr<Value> {
599 return expandValue(rewriter, loc, result, origDest, reassociation, control);
600 };
601
602private:
603 /// Collapse the given \p value to \p targetShape. The \p reassociation is
604 /// used when `rankReductionStrategy` of \p control is set to
605 /// `RankReductionStrategy::ReassociativeReshape`. Will return failure if the
606 /// operand has memref type with a non-identity layout or tensor type with an
607 /// encoding.
608 static FailureOr<Value>
609 collapseValue(RewriterBase &rewriter, Location loc, Value operand,
610 ArrayRef<int64_t> targetShape,
611 ArrayRef<ReassociationIndices> reassociation,
612 const ControlDropUnitDims &control);
613
614 /// Expand the given \p value so that the type matches the type of \p
615 /// origDest. The \p reassociation is used when `rankReductionStrategy` of \p
616 /// control is set to `RankReductionStrategy::ReassociativeReshape`. Will
617 /// return failure if the original destination has tensor type with an
618 /// encoding.
619 static FailureOr<Value>
620 expandValue(RewriterBase &rewriter, Location loc, Value result,
621 Value origDest, ArrayRef<ReassociationIndices> reassociation,
622 const ControlDropUnitDims &control);
623};
624
629using DroppedUnitDimsBuilder = std::function<IndexingMapOpInterface(
630 Location loc, OpBuilder &, IndexingMapOpInterface,
631 ArrayRef<Value> newOperands, ArrayRef<AffineMap> newIndexingMaps,
632 const llvm::SmallDenseSet<unsigned> &droppedDims)>;
633
634/// Drop unit extent dimensions from the \p op and its operands.
635/// The transformation is aborted if unit dimensions cannot be dropped from any
636/// of the operands. Note that this function may insert trivially dead
637/// operations if the transformation is aborted and should therefore not be
638/// called from greedy drivers.
639FailureOr<DropUnitDimsResult>
640dropUnitDims(RewriterBase &rewriter, IndexingMapOpInterface op,
641 const DroppedUnitDimsBuilder &droppedUnitDimsBuilder,
643
644/// Drop unit extent dimensions from the \p genericOp and its operands.
645/// The transformation is aborted if unit dimensions cannot be dropped from any
646/// of the operands. Note that this function may insert trivially dead
647/// operations if the transformation is aborted and should therefore not be
648/// called from greedy drivers.
649FailureOr<DropUnitDimsResult> dropUnitDims(RewriterBase &rewriter,
650 GenericOp genericOp,
652
653/// Fuse two `linalg.generic` operations that have a producer-consumer
654/// relationship captured through `fusedOperand`. The method expects
655/// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`.
660/// This transformation is intended to be used with a top-down traversal
661/// (from producer to consumer). In that way fusion logic can safely handle
662/// producers with multiple users.
663FailureOr<ElementwiseOpFusionResult>
664fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand);
665
666/// Returns a set of indices of the producer's results which would
667/// be preserved after the fusion.
668/// * There is a chance that the implementation of the transformation does not
669/// agree with the result of this method. This function gives a prediction based
670/// on an optimized fusion.
671llvm::SmallDenseSet<int> getPreservedProducerResults(GenericOp producer,
672 GenericOp consumer,
673 OpOperand *fusedOperand);
674
675/// Try to peel and canonicalize loop `op` and return the new result.
676/// Also applies affine_min/max bounds simplification on the fly where relevant.
677// TODO: Add support for scf.parallel and affine.for loops.
679
680/// Peel 'loops' and applies affine_min/max bounds simplification on the fly
681/// where relevant.
682void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
683
684/// Pad the iterator dimensions `options.paddingDimensions` of all `opToPad`
685/// operands to a static bounding box. The original `opToPad` is cloned and
686/// operates on the padded tensors.
687///
688/// * "options.padToMultipleOf" indicates that each padding dimension should be
689/// padded to the specified multiple.
690/// * Use "options.paddingValues" and "options.nofoldFlags" to set padding
691/// value and nofold attribute of the created tensor::PadOps, respectively.
692/// * The unpadded results (extracted slice of the cloned operation) are
693/// returned via `replacements`.
694/// * The tensor::PadOps are returned via `padOps`.
695/// * "options.copyBackOp" specifies the op type for copying back the unpadded
696/// result to the original destination tensor.
697LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
699 LinalgOp &paddedOp,
700 SmallVector<Value> &replacements,
702
703/// Helper function to compute the padded shape of the given value `v` of
704/// `RankedTensorType` given:
705/// - the `indexingSizes` as a list of OpFoldResult.
706/// - an `indexingMap` that encodes how the padded shape varies with
707/// increases in `indexingSizes`.
708/// The implementation iteratively combines increases from contributing using
709/// affine.apply operations.
710/// The `indexingMap` + `indexingSizes` encoding suits StructuredOps and
711/// provides a gentle portability path for Linalg-like ops with affine maps.
712/// The padded shape is computed by evaluating the maximum accessed index per
713/// dimension, which may involve multiplying by constant factors derived from
714/// the affine indexing expressions. Currently, only a limited set of projected
715 /// permutation indexing maps are supported, such as
716/// - affine_map<(d0, d1, d2) -> (d0, d1)>
717/// - affine_map<(d0, d1, d2) -> (d0, d1 + d2)>
718/// - affine_map<(d0, d1) -> (d0 * 3 + d1)>
719/// In the future, more general interfaces can be devised to encode similar
720/// shape evolutions and map between an op and its operands.
723 AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes,
725
727 std::function<FailureOr<SmallVector<OpFoldResult>>(
730
731/// Specific helper for Linalg ops.
732FailureOr<SmallVector<OpFoldResult>>
734 ArrayRef<Range> iterationDomain,
736
737/// Operations and values created in the process of padding a TilingInterface
738/// operation.
740 /// The operands of the padded op.
742 /// The padded op, a clone of `toPad` with padded operands.
743 TilingInterface paddedOp;
744 /// Slices of the padded op's results, same types as `toPad`.
746};
747
748/// Pad the iterator dimensions of `toPad`.
749/// * "options.paddingSizes" indicates that each padding dimension should be
750/// padded to the specified padding size.
751/// * "options.padToMultipleOf" indicates that the paddingSizes should be
752 /// interpreted as the bounding box (dynamic) value to pad to.
753/// * Use "options.paddingValues" to set the padding value of the created
754 /// tensor::PadOp.
755//
756// The transformation assumes that the insertion point is set after the
757// operation to pad.
758FailureOr<PadTilingInterfaceResult>
759rewriteAsPaddedOp(OpBuilder &, TilingInterface toPad,
763
764namespace detail {
765
766/// Helper struct to hold the results of building a packing loop nest.
773
774/// Build the packing loop nest required to hoist `opToHoist` above
775/// `outermostEnclosingForOp`.
776/// The loop nest is built just before `outermostEnclosingForOp`.
777FailureOr<PackingResult>
778buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
779 scf::ForOp outermostEnclosingForOp,
780 ArrayRef<int64_t> transposeVector);
781
782} // namespace detail
783
784/// Mechanically hoist padding operations on tensors by `numLoops` into a new,
785/// generally larger tensor. This achieves packing of multiple padding ops into
786/// a larger tensor. On success, `opToHoist` is replaced by the cloned version
787/// in the packing loop so the caller can continue reasoning about the padding
788/// operation. If `transposeVector` is non-empty, hoist padding introduces a
789/// TransposeOp to transpose the padded tensor before inserting it into the
790/// packed tensor. A `transposeVector` can change the storage order of the
791/// padded tensor but does not change the order of the pack or compute loops.
792///
793/// TODO: In the future, we should consider rewriting as a linalg.pack after
794/// hoisting since this abstraction is now available.
795///
796/// Example in pseudo-mlir:
797/// =======================
798///
799/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
800/// ```
801/// scf.for (%i, %j, %k)
802/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
803/// %0 = tensor.pad %st0 low[0, 0] high[...] {
804/// ^bb0( ... ):
805/// linalg.yield %pad
806/// } : tensor<?x?xf32> to tensor<4x8xf32>
807/// compute(%0)
808/// ```
809///
810/// IR resembling the following is produced:
811///
812/// ```
813/// scf.for (%i) {
814/// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32>
815/// %packed = scf.for (%k) iter_args(%p : %packed_init) {
816/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
817/// %0 = tensor.pad %st0 low[0, 0] high[...] {
818/// ^bb0( ... ):
819/// linalg.yield %pad
820/// } : tensor<?x?xf32> to tensor<4x8xf32>
821/// %1 = tensor.insert_slice %0 ...
822/// : tensor<4x8xf32> to tensor<?x4x8xf32>
823/// scf.yield %1: tensor<?x4x8xf32>
824/// } -> tensor<?x4x8xf32>
825/// scf.for (%j, %k) {
826/// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
827/// tensor<?x4x8xf32> to tensor<4x8xf32>
828/// compute(%st0)
829/// }
830/// }
831/// ```
832FailureOr<Value>
833hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
834 int64_t numLoops, ArrayRef<int64_t> transposeVector,
835 tensor::PadOp &hoistedOp,
836 SmallVectorImpl<TransposeOp> &transposeOps);
837/// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
838FailureOr<Value>
839hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
840 ArrayRef<int64_t> transposeVector,
841 tensor::PadOp &hoistedOp,
842 SmallVectorImpl<TransposeOp> &transposeOps);
843
844/// Apply padding and hoisting to `linalgOp` according to the configuration
845/// specified in `options`.
846FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter,
847 LinalgOp linalgOp,
849
850/// Split the given `op` into two parts along the given iteration space
851/// `dimension` at the specified `splitPoint`, and return the two parts.
852/// If the second part is statically known to be empty, do not create it
853/// and return nullptr instead. Error state is signalled by returning
854/// a pair of nullptrs.
855///
856/// For example, the following op:
857///
858/// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>)
859/// outs(%2 : tensor<128x64xf32>)
860///
861/// split along the first dimension at position 42 will result in:
862///
863/// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1]
864/// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1]
865/// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>)
866/// outs(%5 : tensor<42x64xf32>)
867/// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1]
868///
869/// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1]
870/// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1]
871/// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>)
872/// outs(%8 : tensor<86x64xf32>)
873/// tensor.insert_slice %5 into %6[42, 0][86, 64][1, 1]
874///
875/// Note that there is no simplification other than constant propagation applied
876/// to slice extraction and insertion.
877std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter,
878 TilingInterface op,
879 unsigned dimension,
880 OpFoldResult splitPoint);
881
882/// Perform standalone tiling of a single LinalgOp by `tileSizes`.
883/// and permute the loop nest according to `interchangeVector`
884/// The permutation is expressed as a list of integers that specify
885/// the new ordering of the loop nest. The length of `interchangeVector`
886/// must be equal to the length of `tileSizes`.
887/// An empty vector is interpreted as the identity permutation and the
888/// transformation returns early.
889///
890/// Return a struct containing the tiled loops in the specified order
891/// and the cloned op if successful, std::nullopt otherwise.
892///
893/// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by
894/// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
895/// integers, in the range 0..`tileSizes.size()` without duplications
896/// (i.e. `[1,1,2]` is an invalid permutation).
902FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op,
904
 905/// Interchange the `iterator_types` and `indexing_maps` dimensions and adapt
906/// the index accesses of `op`. This is an in-place transformation controlled
907/// by `interchangeVector`. An empty vector is interpreted as the identity
908/// permutation and the transformation returns early.
909///
910/// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with
911/// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
912/// integers, in the range 0..`op.rank` without duplications
913/// (i.e. `[1,1,2]` is an invalid permutation).
914///
915/// Return failure if the permutation is not valid.
916FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
917 GenericOp genericOp,
918 ArrayRef<unsigned> interchangeVector);
919
920/// Create a GenericOp from the given named operation `linalgOp` and replace
921/// the given `linalgOp`.
922/// Return failure if `linalgOp` is a GenericOp or misses a region builder.
923FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
924 LinalgOp linalgOp);
925
927 // Specialize generics to category ops (default: named ops).
928 bool emitCategoryOps = false;
929};
930
931/// Replace the given GenericOp with a namedOp or categoryOp.
932FailureOr<LinalgOp>
933specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
935
936/// Create a new buffer using the `allocationFn` provided. The size of this
937/// buffer is either the original subview size when 'useOriginalSubviewSize' is
938/// set to true or the smallest constant bounding size along each dimension that
939/// can be computed for the size of the result of `subView`. Returns the
940/// allocated buffer as `fullLocalView` and the view that matches the size of
941/// the result of subview operation as `partialLocalView`.
946FailureOr<PromotionInfo>
947promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
948 bool useOriginalSubviewSize,
949 const AllocBufferCallbackFn &allocationFn,
950 DataLayout &layout);
951
952/// Promote the `subViews` into a new buffer allocated at the insertion point
953/// `b`. Promotion occurs in 3 steps:
954/// 1. Create a new buffer for a full tile (i.e. not clipped at the
955/// boundary).
956/// 2. Take a full view on the buffer.
957/// 3. Take a partial slice of the full view in step 2. and copy into it.
958///
959/// Return the modified linalg op (the modification happens in place) as well
960/// as all the copy ops created.
961FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
963
964/// Allocate the subview in the GPU workgroup memory.
965std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
966 memref::SubViewOp subview,
967 ArrayRef<Value> sizeBounds,
968 DataLayout &);
969
970/// In case of GPU group memory there is no need to deallocate.
971LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
972
973/// Create Memref copy operations and add gpu barrier guards before and after
974/// the copy operation to ensure data integrity.
975LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
976
977/// Allocate the subview in the GPU private memory.
978std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
979 memref::SubViewOp subview,
980 ArrayRef<Value> sizeBounds,
981 DataLayout &);
982
 983/// Normal copy between src and dst.
984LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
985
986/// In case of GPU private memory there is no need to deallocate since the
987/// memory is freed when going outside of the scope.
988LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
989
990/// Return true if there's dedicated logic in the Linalg Vectorizer to
991/// vectorize this Op, false otherwise.
992///
993/// Note that this helper merely implements a very high level check and that the
994/// vectorizer also requires various additional pre-conditions to be met for it
995/// to work (these are checked by the vectorizer itself).
997
998/// Transformation information returned after vectorizing.
1000 /// Results of the vectorization transform to replace the original operation.
1002};
1003/// Returns a `VectorizationResult` containing the results of the vectorized op,
1004/// or failure if the transformation fails. If provided, `inputVectorSizes` are
1005/// used to vectorize this operation. `inputVectorSizes` must match the rank of
1006/// the iteration space of the operation and the input vector sizes must be
1007/// greater than or equal to their counterpart iteration space sizes, if static.
1008/// `inputVectorShapes` also allows the vectorization of operations with dynamic
1009/// shapes.
1010/// Optionally, `createNamedContraction` can force compatible contractions to be
1011/// vectorized directly to vector.contract operation.
1012FailureOr<VectorizationResult>
1014 ArrayRef<int64_t> inputVectorSizes = {},
1015 ArrayRef<bool> inputScalableVecDims = {},
1016 bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false,
1017 bool assumeDynamicDimsMatchVecSizes = false,
1018 bool createNamedContraction = false);
1019
1020/// Emit a suitable vector form for a Copy op with fully static shape.
1021LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
1022
1023/// Emit a loop nest of `scf.for` with the proper body for `linalgOp`.
1024FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter,
1025 LinalgOp linalgOp);
1026
1027/// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`.
1028FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter,
1029 LinalgOp linalgOp);
1030
1031/// Emit a loop nest of `affine.for` with the proper body for `linalgOp`.
1032FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter,
1033 LinalgOp linalgOp);
1034
 1035/// Creates a number of ranges equal to the number of non-zero entries in `tileSizes`.
1036/// One for each loop of the LinalgOp that is tiled. The `tileSizes` argument
1037/// has one entry per surrounding loop. It uses zero as the convention that a
1038/// particular loop is not tiled. This convention simplifies implementations
1039/// by avoiding affine map manipulations. The returned ranges correspond to
1040/// the loop ranges, in the proper order, that are tiled and for which new
1041/// loops will be created. Also the function returns a map from loop indices
1042/// of the LinalgOp to the corresponding non-empty range indices of newly
1043/// created loops.
1045std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
1047 ArrayRef<OpFoldResult> allShapeSizes,
1048 ArrayRef<OpFoldResult> allTileSizes);
1049
1050namespace detail {
1051template <typename T>
1053 /// Tile sizes.
1055 /// Number of tiles associated with each size.
1057};
1058
1059template <typename T>
1061 /// Tile sizes.
1063 /// Number of tiles associated with each size.
1065};
1066
1067} // namespace detail
1068
1069/// A description of a multi-size tiling comprising tile sizes and numbers of
1070/// tiles, expressed as Values which may or may not be constant. Multi-size
1071/// currently means two-size.
1076
1081
1082/// Emits the IR computing the multi-sized tiling specification with two tile
1083/// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such
1084/// that there exist numbers of tiles with these sizes that fully cover the
1085/// given iteration space `dimension` of the structured `op`.
1086///
1087/// The computation is as follows:
1088///
1089/// b = originalTripCount floordiv sizeDivisor
1090/// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor
1091/// d = (b + t - 1) floordiv t
1092/// s = (b floordiv d) * sizeDivisor
1093/// v = b % d
1094/// u = d - v
1095///
1096/// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of
1097/// the corresponding tiles are `u` and `v`, respectively. Alternatively,
1098///
1099/// s * u + (s + sizeDivisor) * v == original size,
1100/// where s mod sizeDivisor = 0.
1101///
1102/// Expects all values to be positive. In some cases with the target tile size
1103/// sufficiently close to the dimension shape and non-unit divisor, it is
1104/// impossible to compute such sizes. If `emitAssertion` is set, also emit the
1105/// assertion that size computation succeeded.
1106///
1107/// Returns the specification consisting of both tile values and the number of
1108/// tiles of each size.
1109FailureOr<MultiSizeSpecification>
1110computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension,
1111 OpFoldResult targetSize, OpFoldResult divisor,
1112 bool emitAssertions = true);
1113FailureOr<StaticMultiSizeSpecification>
1114computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize,
1115 int64_t divisor);
1116
1117FailureOr<StaticContinuousTileSizeSpecification>
1118computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension,
1119 unsigned targetSize);
1120FailureOr<ContinuousTileSizeSpecification>
1121computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
1122 unsigned dimension, OpFoldResult targetSize,
1123 bool emitAssertions);
1124
1125/// Transformation information returned after reduction tiling.
1127 /// The partial reduction tiled op generated.
1129 /// The final reduction operation merging all the partial reductions.
1131 /// Initial values used for partial reductions.
1133 /// The `scf.forall` operation that iterate over the tiles.
1134 scf::ForallOp loops;
1135};
1136
1137/// Method to tile a reduction to parallel iterations computing partial
1138/// reductions. After the loop all the partial reduction are merged into a final
1139/// reduction. For example for the following sequence
1140///
1141/// ```mlir
1142/// %0 = linalg.generic %in ["parallel", "reduction"]
1143/// : tensor<7x9xf32> -> tensor<7xf32>
1144/// ```
1145///
1146/// into:
1147///
1148/// ```mlir
1149/// %0 = linalg.fill ... : tensor<7x4xf32>
1150/// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0)
1151/// -> (tensor<7x4xf32>) {
 1152/// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> to tensor<7xf32>
 1153/// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
 1154/// %4 = linalg.generic %2, %3 ["parallel", "reduction"]
 1155/// : tensor<7x?xf32> -> tensor<7xf32>
 1156/// %5 = tensor.insert_slice %4, %arg0[0, %iv] : tensor<7x4xf32>
1157/// }
1158/// %6 = linalg.generic %1 ["parallel", "reduction"]
1159/// : tensor<7x4xf32> -> tensor<7xf32>
1160/// ```
1161FailureOr<ForallReductionTilingResult>
1162tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op,
1163 ArrayRef<OpFoldResult> numThreads,
1164 ArrayRef<OpFoldResult> tileSizes = {},
1165 std::optional<ArrayAttr> mapping = std::nullopt);
1166
1167/// All indices returned by IndexOp should be invariant with respect to
1168/// tiling. Therefore, if an operation is tiled, we have to transform the
1169/// indices accordingly, i.e. offset them by the values of the corresponding
1170/// induction variables that are captured implicitly in the body of the op.
1171///
1172/// Example. `linalg.generic` before tiling:
1173///
1174/// #id_2d = (i, j) -> (i, j)
1175/// #pointwise_2d_trait = {
1176/// indexing_maps = [#id_2d, #id_2d],
1177/// iterator_types = ["parallel", "parallel"]
1178/// }
1179/// linalg.generic #pointwise_2d_trait %operand, %result {
1180/// ^bb0(%operand_in: f32, %result_in: f32):
1181/// %i = linalg.index 0 : index
1182/// %j = linalg.index 1 : index
1183/// <some operations that use %i, %j>
1184/// }: memref<50x100xf32>, memref<50x100xf32>
1185///
1186/// After tiling pass with tiles sizes 10 and 25:
1187///
1188/// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2)
1189///
1190/// %c1 = arith.constant 1 : index
1191/// %c0 = arith.constant 0 : index
1192/// %c25 = arith.constant 25 : index
1193/// %c10 = arith.constant 10 : index
1194/// operand_dim_0 = dim %operand, 0 : memref<50x100xf32>
1195/// operand_dim_1 = dim %operand, 1 : memref<50x100xf32>
1196/// scf.for %k = %c0 to operand_dim_0 step %c10 {
1197/// scf.for %l = %c0 to operand_dim_1 step %c25 {
1198/// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1]
1199/// : memref<50x100xf32> to memref<?x?xf32, #strided>
1200/// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1]
1201/// : memref<50x100xf32> to memref<?x?xf32, #strided>
1202/// linalg.generic pointwise_2d_trait %4, %5 {
1203/// ^bb0(%operand_in: f32, %result_in: f32):
1204/// %i = linalg.index 0 : index
1205/// %j = linalg.index 1 : index
1206/// // Indices `k` and `l` are implicitly captured in the body.
 1207/// %transformed_i = arith.addi %i, %k : index // `i` offset by %k
 1208/// %transformed_j = arith.addi %j, %l : index // `j` offset by %l
 1209/// // Every use of %i, %j is replaced with %transformed_i and
 1210/// // %transformed_j
 1211/// <some operations that use %transformed_i,
 1212/// %transformed_j>
1213/// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided>
1214/// }
1215/// }
1216///
1217/// TODO: Investigate whether mixing implicit and explicit indices
1218/// does not lead to losing information.
1219void transformIndexOps(RewriterBase &b, LinalgOp op,
1221 const LoopIndexToRangeIndexMap &loopIndexToRangeIndex);
1222
1223/// Apply transformation to split the single linalg op reduction into a
1224/// parallel and reduction dimension. Then create a new linalg.generic op
1225/// doing the rest of the reduction. Return the new linalg op with an extra
1226/// parallel dimension or failure if the transformation didn't happen.
1227///
1228/// Example:
1229/// ```
1230/// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
1231/// affine_map<(d0) -> ()>],
1232/// iterator_types = ["reduction"]}
1233/// ins(%in : tensor<32xf32>)
1234/// outs(%out : tensor<f32>) {
1235/// ^bb0(%arg1: f32, %arg2: f32):
1236/// %y = arith.addf %arg1, %arg2 : f32
1237/// linalg.yield %y : f32
1238/// } -> tensor<f32>
1239/// ```
1240/// To:
1241/// ```
1242/// %cst = arith.constant 0.000000e+00 : f32
1243/// %0 = tensor.expand_shape %in [[0, 1]]: tensor<32xf32> into tensor<4x8xf32>
1244/// %1 = tensor.empty [4] : tensor<4xf32>
1245/// %2 = linalg.fill ins(%cst : f32)
1246/// outs(%1 : tensor<4xf32>) -> tensor<4xf32>
1247/// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
1248/// affine_map<(d0, d1) -> (d0)>],
1249/// iterator_types = ["parallel", "reduction"]}
1250/// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
 1251/// ^bb0(%arg3: f32, %arg4: f32):
1252/// %5 = arith.addf %arg3, %arg4 : f32
1253/// linalg.yield %5 : f32
1254/// } -> tensor<4xf32>
1255/// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
1256/// affine_map<(d0) -> ()>],
1257/// iterator_types = ["reduction"]}
1258/// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) {
1259/// ^bb0(%arg3: f32, %arg4: f32):
1260/// %5 = arith.addf %arg3, %arg4 : f32
1261/// linalg.yield %5 : f32
1262/// } -> tensor<f32>
1263/// ```
1270FailureOr<SplitReductionResult>
1271splitReduction(RewriterBase &b, LinalgOp op,
1272 const ControlSplitReductionFn &controlSplitReductionFn,
1273 bool useAlloc = false);
1274
1275/// Scaling-based implementation of the split reduction transformation.
1276/// Instead of introducing an ExpandShapeOp, this rewrites a reduction
1277/// dimension `k` into `k * scale + kk`.
1278///
1279/// Example:
1280/// ```
1281/// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>)
1282/// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
1283/// ```
1284///
1285/// Is transformed to:
1286///
1287/// ```
1288/// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)>
1289/// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)>
1290/// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
1291/// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
1292/// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
1293/// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
1294/// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32>
1295/// %cst = arith.constant 0.000000e+00 : f32
1296/// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) ->
1297/// tensor<16x32x64xf32>
1298/// %2 = tensor.empty [64, 4] : tensor<64x4xi1>
1299///
1300/// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
1301/// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
1302/// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>,
1303/// tensor<64x4xi1>)
1304/// outs(%1 : tensor<16x32x64xf32>) {
1305/// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
1306/// %5 = arith.mulf %arg3, %arg4 : f32
1307/// %6 = arith.addf %arg6, %5 : f32
1308/// linalg.yield %6 : f32
1309/// } -> tensor<16x32x64xf32>
1310///
1311/// %4 = linalg.generic {indexing_maps = [#map4, #map5],
1312/// iterator_types = ["parallel", "parallel", "reduction"]}
 1313/// ins(%3 : tensor<16x32x64xf32>)
1314/// outs(%C : tensor<16x32xf32>) {
1315/// ^bb0(%arg3: f32, %arg4: f32):
1316/// %5 = arith.addf %arg3, %arg4 : f32
1317/// linalg.yield %5 : f32
1318/// } -> tensor<16x32xf32>
1319///
1320/// return %4 : tensor<16x32xf32>
1321/// ```
1322FailureOr<SplitReductionResult>
1324 const ControlSplitReductionFn &controlSplitReductionFn,
1325 bool useAlloc = false);
1326
1327/// Return `true` if a given sequence of dimensions are contiguous in the
1328/// range of the specified indexing map.
1330/// Return `true` if all sequences of dimensions specified in `dimSequences` are
1331/// contiguous in all the ranges of the `maps`.
1333 ArrayRef<ReassociationIndices> dimSequences);
1334
1339
1340/// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition
1341/// to calling this method is that for each list in `foldedIterationDim`, the
1342/// sequence of dimensions is contiguous in domains of all `indexing_maps` of
1343/// the `linalgOp`. This can be checked using `areDimSequencePreserved` method.
1344/// When valid, the method also collapses the operands of the op. Returns
1345/// replacement values of the results of the original `linalgOp` by inserting
1346/// reshapes to get back values of compatible types.
1347FailureOr<CollapseResult>
1349 ArrayRef<ReassociationIndices> foldedIterationDims,
1350 RewriterBase &rewriter);
1351
1353 tensor::PadOp padOp;
1354 tensor::ExpandShapeOp expandShapeOp;
1355 linalg::TransposeOp transposeOp;
1356};
1357
1358/// Rewrite pack as pad + reshape + transpose.
1359FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter,
1360 linalg::PackOp packOp,
1361 bool lowerPadLikeWithInsertSlice = true);
1362
1364 tensor::EmptyOp emptyOp;
1365 linalg::TransposeOp transposeOp;
1366 tensor::CollapseShapeOp collapseShapeOp;
1367 tensor::ExtractSliceOp extractSliceOp;
1368 linalg::CopyOp copyOp;
1369};
1370
1371/// Rewrite pack as empty + transpose + reshape + extract_slice + copy.
1372FailureOr<LowerUnPackOpResult>
1373lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp,
1374 bool lowerUnpadLikeWithExtractSlice = true);
1375
1376/// Struct to hold the result of a `pack` call.
1382/// Implement packing of a single LinalgOp by `packedSizes`.
1383/// There must be one packedSizes entry per `linalgOp` iterator.
1384/// Return the packed Linalg op on success, failure otherwise.
1385FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1386 ArrayRef<OpFoldResult> packedSizes);
1387
1388/// Struct to hold the result of a `packTranspose` call.
1390 linalg::PackOp transposedPackOp;
1391 linalg::LinalgOp transposedLinalgOp;
1392 linalg::UnPackOp transposedUnPackOp;
1393};
1394/// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the
1395/// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements.
1396/// Return failure if either:
1397/// 1. the `packOp` does not have the `linalgOp` as its unique use.
1398/// 2. the `maybeUnPackOp`, if specified must be a consumer of the result tied
1399/// to the unique `packOp` use.
1400/// 3. `outerPerm` (resp. `innerPerm`) must be valid permutations of
1401/// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty.
1402FailureOr<PackTransposeResult>
1403packTranspose(RewriterBase &rewriter, linalg::PackOp packOp,
1404 linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp,
1405 ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm);
1406
1407/// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m
1408/// and n are proper parallel dimensions and k is a proper reduction
1409/// dimension. Packing occurs by rewriting the op as a linalg.generic and
1410/// calling linalg::pack by `mnkPackedSizes`. The order of the packed
1411/// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2}
 1412/// to reorder {m, n, k} into one of the 6 possible forms. The outer
1413/// dimensions of the operands are not permuted at this time, this is left for
1414/// future work.
1415FailureOr<PackResult>
1416packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp,
1417 ArrayRef<OpFoldResult> mnkPackedSizes,
1418 ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf,
1419 ArrayRef<int64_t> mnkOrder);
1420
 1422 /// Minor block factors (mb, nb, kb) for packing relayout where mb, nb are
1423 /// the parallel dimensions and kb is the reduction dimension.
1425
1426 /// If true, allows packing of dimensions that only partially fit into the
1427 /// block factors.
1428 bool allowPadding = true;
1429
1430 /// Next multiples of the packing sizes.
1432
1433 /// Permutation of matmul (M, N, K) dimensions order.
1435
1436 /// Transpose LHS outer block layout [MB][KB] -> [KB][MB].
1438
1439 /// Transpose LHS inner block layout [mb][kb] -> [kb][mb].
1441
1442 /// Transpose RHS outer block layout [KB][NB] -> [NB][KB].
1444
1445 /// Transpose RHS inner block layout [kb][nb] -> [nb][kb].
1447};
1448
1449/// Function type which is used to control matmul packing.
1450/// It is expected to return valid packing configuration for each operation.
1451/// Lack of packing options indicates that no valid configuration could be
1452/// assigned and the operation will not be packed.
1454 std::function<std::optional<BlockPackMatmulOptions>(linalg::LinalgOp)>;
1455
1456/// Pack a matmul operation into blocked 4D layout.
1457///
1458/// Relayout a matmul operation into blocked layout with two levels of
1459/// subdivision:
1460/// - major 2D blocks - outer dimensions, consist of minor blocks
1461/// - minor 2D blocks - inner dimensions, consist of scalar elements
1462///
1463/// A 2D matmul MxNxK gets reshaped into blocked 4D representation
1464/// as: [MB][NB][mb][nb] += [MB][KB][mb][kb] * [NB][KB][nb][kb]
1465/// where the (MB, NB, KB) dimensions represent the major blocks,
1466/// and the (mb, nb, kb) are the minor blocks of their respective
1467/// original 2D dimensions (M, N, K).
1468///
1469/// Depending on the initial operands' data layout and the specified
1470/// packing options, the major blocks dimensions might get transposed
1471/// e.g., [MB][KB] -> [KB][MB]. The minor blocks can also be transposed
1472/// e.g., [mb][kb] -> [kb][mb].
1473/// Any present batch dimensions remain unchanged.
1474/// The final result is unpacked back to the original shape.
1475///
1476/// Return failure if no valid packing options are provided.
1477FailureOr<PackResult>
1478blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
1479 const ControlBlockPackMatmulFn &controlPackMatmul);
1480
1481/// Rewrite tensor.from_elements to linalg.generic.
1482FailureOr<Operation *>
1484 tensor::FromElementsOp fromElementsOp);
1485
1486/// Rewrite tensor.generate to linalg.generic.
1487FailureOr<Operation *>
1489 tensor::GenerateOp generateOp);
1490
1491/// Rewrite tensor.pad to linalg.generic + tensor.insert_slice.
1492FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter,
1493 tensor::PadOp padOp);
1494
1495/// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing)
1496/// and linalg.matmul.
1497///
1498/// A convolution operation can be written as a matrix-matrix multiplication by
1499/// unfolding the cross-correlation between input and filter and explicitly copy
1500/// overlapped sliding window inputs.
1501///
1502/// Consider 2D input X with single channel input and output and 2x2 filter W:
1503/// [x(0, 0) , x(0, 1) , ..., x(0, n) ]
1504/// [x(1, 0) , x(1, 1) , ..., x(1, n) ]
1505/// [. , . ,. , . ] [w(0, 0), w(0, 1)]
1506/// [. , . , . , . ] (conv) [w(1, 0), w(1, 1)]
1507/// [. , . , ., . ]
1508/// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)]
1509///
1510/// The packed input data (img2col) is a matrix with |rows| = output spatial
1511/// size, |columns| = filter spatial size. To compute the output Y(i, j) we need
 1512/// to calculate the dot product between filter window at input X(x, y) and the
1513/// filter which will look like the following where r.h.s is the img2col matrix
1514/// and l.h.s is the flattened filter:
1515///
1516/// [x(0,0), x(0,1), x(1,0), x(1,1)]
1517/// [x(0,1), x(1,1), x(0,2), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)]
1518/// [x(0,1), x(1,1), x(0,2), x(1,2)]
1519/// [ . , . , . , . ]
1520///
1521/// In general for 2D case with (N, H, W, C) input and (Kh, Kw, C, D) filter
1522/// and output (N, Ho, Wo, D) the convolution is the following matrix-matrix
1523/// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each input in
 1524/// the N input. For the case where N > 1 it's a batched matrix-matrix
1525/// multiplication.
1526///
1527/// On success, return both the operation that produces the img2col tensor and
1528/// the final operation of the sequence that replaces the original convolution.
1529FailureOr<std::pair<Operation *, Operation *>>
1530rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp);
1531
1532/// Same as the above but for Fhwc channel orderings in the filter. In this case
1533/// the matrix multiplication is actually a row-wise dot-product rather than a
1534/// row-column dot-product. This is to avoid transposing the filter matrix which
1535/// would be required for a regular matrix multiplication to produce the correct
1536/// output dimensions.
1537FailureOr<std::pair<Operation *, Operation *>>
1538rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp);
1539
1540/// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except there is no
1541/// reduction among the input channels so each convolution can be a
1542/// matrix-vector product and by transposing both input filter so channels are
1543/// outer most the computation is a batched matrix-vector product.
1544FailureOr<std::pair<Operation *, Operation *>>
1546 linalg::DepthwiseConv2DNhwcHwcOp convOp);
1547
1548/// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp except because the
1549/// channels are to the left of the image shape dimensions, the position of the
1550/// contraction dimension in the resulting matmul is reversed. This swaps the
1551/// LHS and RHS of the matmul when compared with nhwc (i.e. (D, C x Kh x Kw) *
1552/// (C x Kh x Kw, Ho x Wo))
1553FailureOr<std::pair<Operation *, Operation *>>
1554rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp);
1555
1556/// Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by
1557/// materializing transpose.
1558FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1559 linalg::Conv2DNhwcFhwcOp op);
1560FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter,
1561 linalg::Conv2DNhwcFhwcQOp op);
1562
1563/// Convert Linalg matmul ops to transposed variants.
1564FailureOr<Operation *> transposeMatmul(RewriterBase &rewriter,
1565 linalg::MatmulOp op,
1566 bool transposeLHS = true);
1567FailureOr<Operation *> transposeBatchMatmul(RewriterBase &rewriter,
1568 linalg::BatchMatmulOp op,
1569 bool transposeLHS = true);
1570
1571/// Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm
1572/// F(m x m, r x r). m is the dimension size of output and r is the dimension
1573/// size of filter.
1574FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
1575 linalg::Conv2DNhwcFhwcOp op,
1576 WinogradConv2DFmr fmr);
1577
1578/// Rewrite linalg.winograd_filter_transform. The data layout of the filter is
1579/// FHWC. The transformation matrix is 2-dimension. We need to extract H x W
1580/// from FHWC first. We generate 2 levels of loops to iterate on F and C. After
1581/// the rewriting, we get
1582///
1583/// scf.for %f = lo_f to hi_f step 1
1584/// scf.for %c = lo_c to hi_c step 1
1585/// %extracted = extract filter<h x w> from filter<f x h x w x c>
1586/// %ret = linalg.matmul G, %extracted
1587/// %ret = linalg.matmul %ret, GT
1588/// %inserted = insert %ret into filter<h x w x c x f>
1589FailureOr<Operation *>
1591 linalg::WinogradFilterTransformOp op);
1592
1593/// Rewrite linalg.winograd_input_transform. The data layout of the input is
1594/// NHWC. The transformation matrix is 2-dimension. We need to extract H x W
1595/// from NHWC first. We generate 4 levels of loops to iterate on N, C, tileH,
1596/// and tileW. After the rewriting, we get
1597///
1598/// scf.for %h = 0 to tileH step 1
1599/// scf.for %w = 0 to tileW step 1
1600/// scf.for %n = 0 to N step 1
1601/// scf.for %c = 0 to C step 1
1602/// %extracted = extract %extracted<alphaH x alphaW> from
1603/// %input<N x H x W x C>
1604/// at [%n, (%h x m), (%w x m), %c]
1605/// %ret = linalg.matmul BT, %extracted
1606/// %ret = linalg.matmul %ret, B
1607/// %inserted = insert %ret<alphaH x alphaW> into
1608/// %output<alphaH x alphaW x tileH x tileW x N x C>
1609/// at [0, 0, %h, %w, %n, %c]
1610FailureOr<Operation *>
1612 linalg::WinogradInputTransformOp op);
1613
1614/// Rewrite linalg.winograd_output_transform. The data layout of the output is
1615/// HWNF. The transformation matrix is 2-dimension. We need to extract H x W
1616/// from HWNF first. We generate 4 levels of loops to iterate on N, F, tileH,
1617/// and tileW. After the transformation, we get
1618///
1619/// scf.for %h = 0 to tileH step 1
1620/// scf.for %w = 0 to tileW step 1
1621/// scf.for %n = 0 to N step 1
1622/// scf.for %f = 0 to F step 1
1623/// %extracted = extract %extracted<alphaH x alphaW> from
1624/// %input<alphaH x alphaW x tileH x tileW x N x F>
1625/// at [0, 0, %h, %w, %n, %f]
1626/// %ret = linalg.matmul AT, %extracted
1627/// %ret = linalg.matmul %ret, A
1628/// %inserted = insert %ret<alphaH x alphaW> into
1629/// output<N x H x W x F>
1630/// at [%n, (%h x m), (%w x m), %f]
1631FailureOr<Operation *>
1633 linalg::WinogradOutputTransformOp op);
1634
1635/// Method to deduplicate operands and remove dead results of `linalg.generic`
1636/// operations. This is effectively DCE for a linalg.generic op. If there is
 1637/// deduplication of operands or removal of results, replaces the `genericOp`
1638/// with a new op and returns it. Returns the same operation if there is no
1639/// deduplication/removal.
1640FailureOr<linalg::GenericOp> deduplicateOperandsAndRemoveDeadResults(
1641 RewriterBase &rewriter, linalg::GenericOp genericOp, bool removeOutputs);
1642
1643//===----------------------------------------------------------------------===//
1644// Rewrite patterns wrapping transformations.
1645// TODO: every single such pattern should be a close to noop wrapper around a
 1646// functional-style API call.
1647//===----------------------------------------------------------------------===//
1648
1649/// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D
1650/// convolution ops. Works with both named ops and equivalent generic ops.
1651template <typename Conv2DOp, typename Conv1DOp>
1653 : public OpInterfaceRewritePattern<LinalgOp> {
1655
1656 FailureOr<Conv1DOp> returningMatchAndRewrite(LinalgOp convOp,
1657 PatternRewriter &rewriter) const;
1658
1659 LogicalResult matchAndRewrite(LinalgOp convOp,
1660 PatternRewriter &rewriter) const override {
1661 return returningMatchAndRewrite(convOp, rewriter);
1662 }
1663};
1664
1665extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
1666 Conv1DNwcWcfOp>;
1667extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
1668 Conv1DNcwFcwOp>;
1669
1670/// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh)
1671/// dimensions into 1-D depthwise convolution ops.
1673 : public OpInterfaceRewritePattern<LinalgOp> {
1675 PatternBenefit benefit = 1)
1676 : OpInterfaceRewritePattern<LinalgOp>(context, benefit) {}
1677
1678 FailureOr<DepthwiseConv1DNwcWcOp>
1679 returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const;
1680
1681 LogicalResult matchAndRewrite(LinalgOp convOp,
1682 PatternRewriter &rewriter) const override {
1683 return returningMatchAndRewrite(convOp, rewriter);
1684 }
1685};
1686
1687struct DownscaleConv2DOp final : public OpInterfaceRewritePattern<LinalgOp> {
1689 : OpInterfaceRewritePattern<LinalgOp>(context, benefit) {}
1690
1691 FailureOr<Conv1DOp> returningMatchAndRewrite(LinalgOp convOp,
1692 PatternRewriter &rewriter) const;
1693
1694 LogicalResult matchAndRewrite(LinalgOp convOp,
1695 PatternRewriter &rewriter) const override {
1696 return returningMatchAndRewrite(convOp, rewriter);
1697 }
1698};
1699
1700///
1701/// Linalg generalization pattern.
1702///
1703/// Apply the `generalization` transformation as a pattern.
1704/// See `generalization` for more details.
1705//
1706// TODO: Automatic default pattern class that just unwraps a function
1707// returning FailureOr<GenericOp>.
1709 : public OpInterfaceRewritePattern<LinalgOp> {
1711
1712 /// `matchAndRewrite` implementation that returns the significant
1713 /// transformed pieces of IR.
1714 FailureOr<GenericOp>
1715 returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const {
1716 return generalizeNamedOp(rewriter, op);
1717 }
1718
1719 LogicalResult matchAndRewrite(LinalgOp op,
1720 PatternRewriter &rewriter) const override {
1721 return returningMatchAndRewrite(op, rewriter);
1722 }
1723};
1724
1726
1728 MLIRContext *context, const GenericOpSpecializationOptions &options = {},
1729 PatternBenefit benefit = 1)
1730 : OpRewritePattern<GenericOp>(context, benefit), options(options) {}
1731
1732 FailureOr<GenericOp>
1733 returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const {
1734 return specializeGenericOp(rewriter, op, options);
1735 }
1736
1737 LogicalResult matchAndRewrite(GenericOp op,
1738 PatternRewriter &rewriter) const override {
1739 return returningMatchAndRewrite(op, rewriter);
1740 }
1741
1742private:
1744};
1745
1746/// Vectorization pattern for memref::CopyOp.
1747struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> {
1748 using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
1749
1750 LogicalResult matchAndRewrite(memref::CopyOp copyOp,
1751 PatternRewriter &rewriter) const override;
1752};
1753
1755 std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>;
1756
1757/// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
1758/// InsertSliceOp. For now, only constant padding values are supported.
1759struct DecomposePadOpPattern : public OpRewritePattern<tensor::PadOp> {
1761 : OpRewritePattern<tensor::PadOp>(context, benefit) {}
1762 LogicalResult matchAndRewrite(tensor::PadOp padOp,
1763 PatternRewriter &rewriter) const override;
1764
1765protected:
1766 Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp,
1767 Value dest,
1768 const SmallVector<Value> &dynSizes) const;
1769};
1770
1771/// Rewrites a linalg::PackOp into a sequence of:
1772/// * tensor::PadOp + linalg::TransposeOp + tensor::EmptyOp +
1773/// tensor::InsertSliceOp ops.
1774/// (InsertSliceOp is rank-expanding).
1775///
1776/// Requires that all the tiled-outer-dims of the input linalg::PackOp are 1.
1777/// Note that this constraint means that effectively exactly one tile is packed.
1778///
1779/// In addition, assumes that the un-tiled-outer-dims are not permuted.
1780///
1781/// Before:
1782/// ```
1783/// %packed = linalg.pack %input
1784/// padding_value(%pad : f32)
1785/// inner_dims_pos = [1, 0]
1786/// inner_tiles = [2, %high]
1787/// into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32>
1788/// ```
1789///
1790/// After:
1791/// ```
1792/// // PadOp
1793/// %padded = tensor.pad %arg0 low[0, 0] high[%0, 1] {
1794/// ^bb0(...):
1795/// tensor.yield %arg2 : f32
1796/// } : tensor<5x1xf32> to tensor<?x2xf32>
1797/// // EmptyOp + TransposeOp
1798/// %empty = tensor.empty(%arg3) : tensor<2x?xf32>
1799/// %transposed = linalg.transpose
1800/// ins(%extracted_slice : tensor<?x2xf32>)
1801/// outs(%empty : tensor<2x?xf32>)
1802/// permutation = [1, 0]
1803/// // InsertSliceOp
1804/// %inserted_slice = tensor.insert_slice %transposed
1805/// into %arg1[0, 0, 0, 0] [1, 1, 2, %tile_dim_1] [1, 1, 1, 1]
1806/// : tensor<2x?xf32> into tensor<1x1x2x?xf32>
1807/// ```
1809 : public OpRewritePattern<linalg::PackOp> {
1810 using OpRewritePattern<linalg::PackOp>::OpRewritePattern;
1811 LogicalResult matchAndRewrite(linalg::PackOp packOp,
1812 PatternRewriter &rewriter) const override;
1813};
1814
1815/// Rewrites a linalg::UnPackOp into a sequence of:
1816/// * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp
1817/// (ExtractSliceOp is rank-reducing).
1818///
1819/// Requires that all the tiled-outer-dims of the input linalg::UnPackOp are 1.
1820/// Note that this constraint means that effectively exactly one tile is
1821/// unpacked.
1822///
1823/// Before:
1824/// ```
1825/// %packed = linalg.unpack %input
1826/// inner_dims_pos = [1, 0]
1827/// inner_tiles = [2, 8]
1828/// into %output : tensor<1x1x2x8xf32> -> tensor<5x1xf32>
1829/// ```
1830///
1831/// After:
1832/// ```
1833/// // Rank-reduced extract to obtain the tile
1834/// %slice = tensor.extract_slice %arg0[0, 0, 0, 0] [1, 1, 2, 8] [1, 1, 1, 1]
1835/// : tensor<1x1x2x8xf32> to tensor<2x8xf32>
1836/// // EmptyOp + TransposeOp
1837/// %init = tensor.empty() : tensor<8x2xf32>
1838/// %transposed = linalg.transpose
1839/// ins(%extracted_slice : tensor<2x8xf32>)
1840/// outs(%0 : tensor<8x2xf32>) permutation = [1, 0]
1841/// // Extract a slice matching the specified output size
1842/// %result = tensor.extract_slice %transposed[0, 0] [5, 1] [1, 1]
1843/// : tensor<8x2xf32> to tensor<5x1xf32>
1844/// ```
1846 : public OpRewritePattern<linalg::UnPackOp> {
1847 using OpRewritePattern<linalg::UnPackOp>::OpRewritePattern;
1848 LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp,
1849 PatternRewriter &rewriter) const override;
1850};
1851
1852/// Match and rewrite for the pattern:
1853/// ```
1854/// %alloc = ...
1855/// [optional] %view = memref.view %alloc ...
1856/// %subView = subview %allocOrView ...
1857/// [optional] linalg.fill(%allocOrView, %cst) ...
1858/// ...
1859/// memref.copy(%in, %subView) ...
1860/// vector.transfer_read %allocOrView[...], %cst ...
1861/// ```
1862/// into
1863/// ```
1864/// [unchanged] %alloc = ...
1865/// [unchanged] [optional] %view = memref.view %alloc ...
1866/// [unchanged] [unchanged] %subView = subview %allocOrView ...
1867/// ...
1868/// vector.transfer_read %in[...], %cst ...
1869/// ```
1870/// Where there is no interleaved use between memref.copy and transfer_read as
1871/// well as no interleaved use between linalg.fill and memref.copy (if
1872/// linalg.fill is specified).
1873/// This is a custom rewrite to forward partial reads (with optional fills) to
1874/// vector.transfer_read.
1876 : public OpRewritePattern<vector::TransferReadOp> {
1877 using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
1878
1879 LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
1880 PatternRewriter &rewriter) const override;
1881};
1882
1883/// Match and rewrite for the pattern:
1884/// ```
1885/// %alloc = ...
1886/// [optional] %view = memref.view %alloc ...
1887/// %subView = subview %allocOrView...
1888/// ...
1889/// vector.transfer_write %..., %allocOrView[...]
1890/// memref.copy(%subView, %out)
1891/// ```
1892/// into
1893/// ```
1894/// [unchanged] %alloc = ...
1895/// [unchanged] [optional] %view = memref.view %alloc ...
1896/// [unchanged] %subView = subview %allocOrView...
1897/// ...
1898/// vector.transfer_write %..., %out[...]
1899/// ```
1900/// Where there is no interleaved use between transfer_write and memref.copy.
1901/// This is a custom rewrite to forward partial writes to
1902/// vector.transfer_write.
1904 : public OpRewritePattern<vector::TransferWriteOp> {
1905 using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
1906
1907 LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
1908 PatternRewriter &rewriter) const override;
1909};
1910
1911/// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)).
1913 : public OpRewritePattern<tensor::ExtractSliceOp> {
1914 /// A function to control pattern application and rewrite logic.
1915 ///
1916 /// The function will be given the slice op and should return:
1917 /// - std::nullopt: to fail the match and not apply the pattern;
1918 /// - true: to apply the pattern with zero slice guard;
1919 /// - false: to apply the pattern without zero slice guard.
1920 ///
1921 /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice
1922 /// guard.
1923 using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>;
1924
1926 ControlFn controlFn = nullptr,
1927 PatternBenefit benefit = 1)
1928 : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {}
1929
1930 LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
1931 PatternRewriter &rewriter) const override;
1932
1933private:
1934 ControlFn controlFn;
1935};
1936
1937//===----------------------------------------------------------------------===//
1938// Populate functions.
1939//===----------------------------------------------------------------------===//
1940
1941/// Canonicalization patterns relevant to apply after tiling patterns. These
1942/// are applied automatically by the tiling pass but need to be applied
1943/// manually when tiling is called programmatically.
1945
1946/// Linalg generalization patterns
1947
1948/// Populates `patterns` with patterns to convert spec-generated named ops to
1949/// linalg.generic ops.
1951
1952/// Populates `patterns` with patterns to convert linalg.generic ops to named
1953/// or category ops where possible. A linalg.generic can represent wide range
1954/// and complex computations for which equivalent linalg named op may not exist
1955/// e.g. linalg.generic that takes a tensor and computes a polynomial such as:
1956/// p(x) = an*x^n + ... + a1x + a0
1957/// There is no equivalent named op to convert to. Many such cases exist.
1959 RewritePatternSet &patterns,
1960 const GenericOpSpecializationOptions &options = {});
1961
1962/// Populates `patterns` that convert linalg named ops e.g. `linalg.add`
1963/// to equivalent `linalg.elementwise`.
1964void populateLinalgNamedToElementwisePatterns(RewritePatternSet &patterns);
1965
1966/// Populates `patterns` with patterns that fold operations like
1967/// `linalg.transform` into elementwise op map.
1968void populateLinalgFoldIntoElementwisePatterns(RewritePatternSet &patterns);
1969
1970/// Linalg decompose convolutions patterns
1971
1972/// Populates patterns to decompose high-D convolution ops into low-D ones.
1973/// This is a step in progressive lowering for convolution ops, afterwards we
1974/// can vectorize the low-D convolution ops.
1975void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
1976 PatternBenefit benefit = 1);
1977
1978/// Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g.
1979/// tensor.pad, linalg.transpose, tensor.{insert|extract}_slice. Require all
1980/// outer dims to be unit.
1981void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns);
1982
1983/// Populates patterns to decompose tensor.pad into e.g.
1984/// tensor.empty, linalg.fill, tensor.insert_slice.
1985void populateDecomposePadPatterns(RewritePatternSet &patterns);
1986
1987/// Populates patterns to transform linalg.conv_2d_xxx operations into
1988/// linalg.generic (for img2col packing) and linalg.matmul.
1989/// Note: currently limited to Tensor semantics only.
1990/// \see rewriteInIm2Col for more details.
1991void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns);
1992
1993/// Populates `patterns` with patterns that vectorize tensor.pad.
1994/// These patterns are meant to apply in a complementary fashion. Benefits
1995/// are used to encode a certain ordering of pattern application. To avoid
1996/// scattering magic constants throughout the code base, the patterns must be
1997/// added with this function. `baseBenefit` can be used to offset the benefit
1998/// of all tensor::PadOp vectorization patterns by a certain value.
2000 PatternBenefit baseBenefit = 1);
2001
2002/// Populate patterns for splitting a `LinalgOp` with multiple statements within
2003/// its payload into multiple `GenericOp` that have a single statement.
2004/// The option `removeDeadArgsAndResults` adds patterns to remove dead arguments
2005/// and results from the generated decomposed ops. This is default `true` since
2006/// the core decomposition patterns relies on these clean up patterns. It is set
2007/// to false only for testing purposes.
2009 bool removeDeadArgsAndResults = true);
2010
2011/// Populate patterns that convert non-destination-style ops to destination
2012/// style ops.
2014
2015/// Populate patterns for vectorizing low-D convolution ops. This is a step in
2016/// progressive lowering for convolution ops, it assumes high-D convolution ops
2017/// were decomposed previously.
2019 PatternBenefit benefit = 1);
2020
2021/// Populate patterns that convert `ElementwiseMappable` ops to linalg
2022/// parallel loops.
2024
2025/// Populate patterns that are only useful in the context of sparse tensors.
2027
2028/// Function type which is used to control when to stop fusion. It is expected
2029/// that OpOperand is not modified in the callback. The OpOperand is not marked
2030/// as const to allow callers to use non-const methods.
2031using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>;
2032
2033/// Patterns for fusing linalg operation on tensors.
2034
2035/// Pattern to fuse `linalg.generic` -> `linalg.generic` operations
2036/// when both operations are fusable elementwise operations.
2038 RewritePatternSet &patterns,
2039 const ControlFusionFn &controlElementwiseOpFusion);
2040
2041/// Function type which is used to control propagation of linalg.pack/unpack
2042/// ops.
2043using ControlPropagationFn = std::function<bool(OpOperand *opOperand)>;
2044
2045/// Patterns to bubble up or down data layout ops across other operations.
2046/// The function also has an option to allow the patterns to propagate with
2047/// poison padding if requested by the caller.
2049 RewritePatternSet &patterns,
2050 const ControlPropagationFn &controlPackUnPackPropagation,
2051 bool PoisonPaddingOk = false);
2052
2053/// Patterns to sink extract slice across other operations.
2055 RewritePatternSet &patterns,
2056 const ControlPropagationFn &controlPackUnPackPropagation);
2057
2058/// Pattern to remove dead operands and results of `linalg.generic` operations.
2059/// This is a pattern wrapper for `deduplicateOperandsAndRemoveDeadResults`.
2061
2062/// Patterns to promote inputs to outputs and remove unused inputs of
2063/// `linalg.generic` ops.
2065
2066/// Function type to control generic op dimension collapsing. It is expected
2067/// to return an array of `ReassociationIndices` representing dimensions that
2068/// should be merged.
2070 std::function<SmallVector<ReassociationIndices>(linalg::LinalgOp)>;
2071
2072/// Pattern to collapse dimensions in a linalg.generic op. This will collapse
2073/// tensor operands when needed and expand back the result tensors.
2075 RewritePatternSet &patterns,
2076 const GetCollapsableDimensionsFn &controlCollapseDimensions);
2077
2078/// Patterns to fold an expanding (collapsing) tensor_reshape operation with its
2079/// producer (consumer) generic operation by expanding the dimensionality of the
2080/// loop in the generic op.
2082 RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
2083
2084/// Patterns to fold an expanding tensor.expand_shape operation with its
2085/// producer generic operation by collapsing the dimensions of the generic op.
2087 RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes);
2088
2089/// Patterns to constant fold Linalg operations.
2091 const ControlFusionFn &controlFn);
2092
2093/// Pattern to replace `linalg.add` when destination passing on a contraction op
2094/// suffices for achieving the sum.
2096
2097/// Pattern to fuse a `tensor.pad` operation with the producer of its source,
2098/// if the producer is a `linalg` operation with all parallel iterator types.
2100 RewritePatternSet &patterns);
2101
2102/// Patterns to simplify depthwise convolutions.
2104
2105/// Patterns to fold unit-extent dimensions in operands/results of linalg ops on
2106/// tensors and memref.
2107/// Note that these patterns should not be used with a greedy driver.
2110
2111/// Populates canonicalization patterns that simplify IR after folding
2112/// unit-extent dimensions.
2115
2116/// A pattern that converts init operands to input operands.
2118
2119/// Patterns that are used to inline constant operands into linalg generic ops.
2121
2122/// Patterns that are used to bubble up extract slice op above linalg op.
2124
2125/// Adds patterns that waps tensor.extract_slice(linalg.fill(%cst, %init)) into
2126/// linalg.fill(%cst, tensor.extract_slice(%init)).
2128
2129/// Add patterns to make explicit broadcasts and transforms in the
2130/// input operands of a genericOp.
2132
2133/// Patterns to apply `splitReduction` below.
2135 RewritePatternSet &patterns,
2136 const ControlSplitReductionFn &controlSplitReductionFn,
2137 bool useAlloc = false);
2138
2139/// Patterns to convert Linalg matmul ops to transposed variants.
2141 bool transposeLHS = true);
2142
2143/// Patterns to block pack Linalg matmul ops.
2145 const ControlBlockPackMatmulFn &controlFn);
2146
2147/// Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
2149 WinogradConv2DFmr fmr);
2150
2151/// Patterns to decompose Winograd operators.
2153
2154/// Adds patterns that reduce the rank of named contraction ops that have
2155/// unit dimensions in the operand(s) by converting to a sequence of
2156/// `collapse_shape`,
2157/// `<corresponding linalg named op>`, `expand_shape` (if on tensors). For
2158/// example a `linalg.batch_matmul` with unit batch size will convert to
2159/// `linalg.matmul` and a `linalg.matvec` with with unit spatial dim in lhs will
2160/// convert to a `linalg.dot`.
2162
2163/// Function type which is used to control folding operations like `tensor.pad`
2164/// and `tensor.extract_slice` into linalg.pack/unpack ops.
2165using ControlFoldIntoPackUnpackFn = std::function<bool(OpOperand *opOperand)>;
2166/// Populates `patterns` with patterns that fold operations like `tensor.pad`
2167/// and `tensor.extract_slice` into `tensor.pack` and `tensor.unpack` operations
2168/// respectively.
2170 RewritePatternSet &patterns,
2171 const ControlFoldIntoPackUnpackFn &controlFn = nullptr);
2172
2173/// Populates `patterns` with patterns that fold operations like `linalg.pack`
2174/// and `linalg.unpack` into `tensor.empty`.
2176
2177/// Populates `patterns` with patterns that simplify `tensor.pack` and
2178/// `tensor.unpack` operations.
2180
2181} // namespace linalg
2182} // namespace mlir
2183
2184#endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
static llvm::ManagedStatic< PassManagerOptions > options
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition AffineMap.h:46
Attributes are known-constant values of operations.
Definition Attributes.h:25
The main mechanism for performing data layout queries.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class helps build Operations.
Definition Builders.h:209
This class represents a single result from folding an operation.
This class represents an operand of an operation.
Definition Value.h:257
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
State for analysis-enabled bufferization.
FailureOr< PackingResult > buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, scf::ForOp outermostEnclosingForOp, ArrayRef< int64_t > transposeVector)
Build the packing loop nest required to hoist opToHoist above outermostEnclosingForOp.
void populateDataLayoutPropagationPatterns(RewritePatternSet &patterns, const ControlPropagationFn &controlPackUnPackPropagation, bool PoisonPaddingOk=false)
Patterns to bubble up or down data layout ops across other operations.
void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns)
A pattern that converts init operands to input operands.
std::function< IndexingMapOpInterface( Location loc, OpBuilder &, IndexingMapOpInterface, ArrayRef< Value > newOperands, ArrayRef< AffineMap > newIndexingMaps, const llvm::SmallDenseSet< unsigned > &droppedDims)> DroppedUnitDimsBuilder
Definition Transforms.h:629
void populateTransposeMatmulPatterns(RewritePatternSet &patterns, bool transposeLHS=true)
Patterns to convert Linalg matmul ops to transposed variants.
void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns)
Adds patterns that reduce the rank of named contraction ops that have unit dimensions in the operand(...
LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad, const LinalgPaddingOptions &options, LinalgOp &paddedOp, SmallVector< Value > &replacements, SmallVector< tensor::PadOp > &padOps)
Pad the iterator dimensions options.paddingDimensions of all opToPad operands to a static bounding bo...
Definition Padding.cpp:244
void populateSplitReductionPattern(RewritePatternSet &patterns, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Patterns to apply splitReduction below.
void populateFuseTensorPadWithProducerLinalgOpPatterns(RewritePatternSet &patterns)
Pattern to fuse a tensor.pad operation with the producer of its source, if the producer is a linalg o...
FailureOr< std::pair< Operation *, Operation * > > rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp)
Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing) and linalg....
bool areDimSequencesPreserved(ArrayRef< AffineMap > maps, ArrayRef< ReassociationIndices > dimSequences)
Return true if all sequences of dimensions specified in dimSequences are contiguous in all the ranges...
bool hasVectorizationImpl(Operation *)
Return true if there's dedicated logic in the Linalg Vectorizer to vectorize this Op,...
void populateExtractSliceSinkingPatterns(RewritePatternSet &patterns, const ControlPropagationFn &controlPackUnPackPropagation)
Patterns to sink extract slice across other operations.
void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns)
Patterns that are used to bubble up extract slice op above linalg op.
SmallVector< Operation *, 4 > LinalgLoops
Definition Transforms.h:517
void transformIndexOps(RewriterBase &b, LinalgOp op, SmallVectorImpl< Value > &ivs, const LoopIndexToRangeIndexMap &loopIndexToRangeIndex)
All indices returned by IndexOp should be invariant with respect to tiling.
Definition Tiling.cpp:73
void populateBlockPackMatmulPatterns(RewritePatternSet &patterns, const ControlBlockPackMatmulFn &controlFn)
Patterns to block pack Linalg matmul ops.
void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns)
Populates patterns to transform linalg.conv_2d_xxx operations into linalg.generic (for img2col packin...
std::function< FailureOr< SmallVector< OpFoldResult > >( OpBuilder &, OpOperand &, ArrayRef< Range >, const PadTilingInterfaceOptions &)> PadSizeComputationFunction
Definition Transforms.h:726
FailureOr< Operation * > decomposeWinogradFilterTransformOp(RewriterBase &rewriter, linalg::WinogradFilterTransformOp op)
Rewrite linalg.winograd_filter_transform.
std::optional< Value > allocateWorkgroupMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU workgroup memory.
FailureOr< PackTransposeResult > packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef< int64_t > outerPerm, ArrayRef< int64_t > innerPerm)
Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the transposed PackOp -> LinalgOp ...
Value bufferizeToAllocation(RewriterBase &rewriter, const BufferizeToAllocationOptions &options, tensor::PadOp padOp, Attribute memorySpace={}, Operation *insertionPoint=nullptr)
Materialize a buffer allocation for the given tensor.pad op and lower the op to linalg....
FailureOr< VectorizationResult > vectorize(RewriterBase &rewriter, Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false, bool assumeDynamicDimsMatchVecSizes=false, bool createNamedContraction=false)
Returns a VectorizationResult containing the results of the vectorized op, or failure if the transfor...
bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence)
Return true if a given sequence of dimensions are contiguous in the range of the specified indexing m...
FailureOr< Value > hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops, ArrayRef< int64_t > transposeVector, tensor::PadOp &hoistedOp, SmallVectorImpl< TransposeOp > &transposeOps)
Mechanically hoist padding operations on tensors by numLoops into a new, generally larger tensor.
void populateDecomposeProjectedPermutationPatterns(RewritePatternSet &patterns)
Add patterns to make explicit broadcasts and transforms in the input operands of a genericOp.
void populateFoldReshapeOpsByCollapsingPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding tensor.expand_shape operation with its producer generic operation by co...
LinalgTilingLoopType
The type of loops to be generated during tiling.
Definition Utils.h:135
FailureOr< LowerUnPackOpResult > lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice=true)
Rewrite pack as empty + transpose + reshape + extract_slice + copy.
std::function< std::optional< BlockPackMatmulOptions >(linalg::LinalgOp)> ControlBlockPackMatmulFn
Function type which is used to control matmul packing.
void populatePadOpVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit baseBenefit=1)
Populates patterns with patterns that vectorize tensor.pad.
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns)
Canonicalization patterns relevant to apply after tiling patterns.
Definition Tiling.cpp:866
void populateLinalgFoldIntoElementwisePatterns(RewritePatternSet &patterns)
Populates patterns with patterns that fold operations like linalg.transform into elementwise op map.
LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value)
In case of GPU private memory there is no need to deallocate since the memory is freed when going out...
void populateSparseTensorRewriting(RewritePatternSet &patterns)
Populate patterns that are only useful in the context of sparse tensors.
FailureOr< Operation * > decomposeWinogradOutputTransformOp(RewriterBase &rewriter, linalg::WinogradOutputTransformOp op)
Rewrite linalg.winograd_output_transform.
void populateWinogradConv2DPatterns(RewritePatternSet &patterns, WinogradConv2DFmr fmr)
Patterns to apply Winograd Conv2D algorithm F(m x m, r x r).
FailureOr< ElementwiseOpFusionResult > fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand)
This transformation is intended to be used with a top-down traversal (from producer to consumer).
FailureOr< PromotionInfo > promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView, bool useOriginalSubviewSize, const AllocBufferCallbackFn &allocationFn, DataLayout &layout)
llvm::SmallDenseSet< int > getPreservedProducerResults(GenericOp producer, GenericOp consumer, OpOperand *fusedOperand)
Returns a set of indices of the producer's results which would be preserved after the fusion.
std::function< SplitReductionOptions(LinalgOp op)> ControlSplitReductionFn
Function signature to control reduction splitting.
Definition Transforms.h:489
std::optional< Value > allocateGPUPrivateMemory(OpBuilder &builder, memref::SubViewOp subview, ArrayRef< Value > sizeBounds, DataLayout &)
Allocate the subview in the GPU private memory.
std::function< SmallVector< ReassociationIndices >(linalg::LinalgOp)> GetCollapsableDimensionsFn
Function type to control generic op dimension collapsing.
void populateSimplifyDepthwiseConvPatterns(RewritePatternSet &patterns)
Patterns to simplify depthwise convolutions.
FailureOr< Operation * > rewriteInDestinationPassingStyle(RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp)
Rewrite tensor.from_elements to linalg.generic.
FailureOr< PackResult > blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp, const ControlBlockPackMatmulFn &controlPackMatmul)
Pack a matmul operation into blocked 4D layout.
void peelLoops(RewriterBase &rewriter, ArrayRef< scf::ForOp > loops)
Peel 'loops' and applies affine_min/max bounds simplification on the fly where relevant.
FailureOr< LinalgOp > specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp, const GenericOpSpecializationOptions &options={})
Replace the given GenericOp with a namedOp or categoryOp.
SmallVector< OpFoldResult > computePaddedShape(OpBuilder &, TypedValue< RankedTensorType > v, AffineMap indexingMap, ArrayRef< OpFoldResult > indexingSizes, const PadTilingInterfaceOptions &options)
Helper function to compute the padded shape of the given value v of RankedTensorType given:
void populateConvertToDestinationStylePatterns(RewritePatternSet &patterns)
Populate patterns that convert non-destination-style ops to destination style ops.
FailureOr< Operation * > winogradConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op, WinogradConv2DFmr fmr)
Convert linalg.conv_2d_nhwc_fhwc to Winograd Conv2D algorithm F(m x m, r x r).
FailureOr< Operation * > transposeConv2D(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp op)
Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by materializing transpose.
void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns, ControlDropUnitDims &options)
Patterns to fold unit-extent dimensions in operands/results of linalg ops on tensors and memref.
std::function< SmallVector< Value, 4 >(OpBuilder &, Operation *)> TileSizeComputationFunction
Definition Transforms.h:187
LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst)
Create Memref copy operations and add gpu barrier guards before and after the copy operation to ensur...
void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns)
Populate patterns that convert ElementwiseMappable ops to linalg parallel loops.
LogicalResult linalgOpAnchoredEmptyTensorEliminationStep(RewriterBase &rewriter, Operation *op, bufferization::OneShotAnalysisState &state)
Try to eliminate tensor::EmptyOps inside op that are anchored on a LinalgOp.
FailureOr< LinalgLoops > linalgOpToLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.for with the proper body for linalgOp.
Definition Loops.cpp:368
FailureOr< GenericOp > generalizeNamedOp(RewriterBase &rewriter, LinalgOp linalgOp)
Create a GenericOp from the given named operation linalgOp and replace the given linalgOp.
std::tuple< SmallVector< Range, 4 >, LoopIndexToRangeIndexMap > makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > allShapeSizes, ArrayRef< OpFoldResult > allTileSizes)
Definition Tiling.cpp:44
FailureOr< Operation * > transposeBatchMatmul(RewriterBase &rewriter, linalg::BatchMatmulOp op, bool transposeLHS=true)
Pattern to replace.
LogicalResult promoteSubviewsPrecondition(Operation *op, LinalgPromotionOptions options)
Promote memref.subviews feeding linalg-on-buffers operations.
LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst)
Normal copy between src and dst.
FailureOr< linalg::GenericOp > deduplicateOperandsAndRemoveDeadResults(RewriterBase &rewriter, linalg::GenericOp genericOp, bool removeOutputs)
Method to deduplicate operands and remove dead results of linalg.generic operations.
void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Linalg decompose convolutions patterns.
void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns)
Patterns to decompose Winograd operators.
void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate patterns for vectorizing low-D convolution ops.
LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp)
Emit a suitable vector form for a Copy op with fully static shape.
LogicalResult vectorizeOpPrecondition(Operation *op, ArrayRef< int64_t > inputVectorSizes={}, ArrayRef< bool > inputScalableVecDims={}, bool vectorizeNDExtract=false, bool flatten1DDepthwiseConv=false)
Return success if the operation can be vectorized.
FailureOr< GenericOp > interchangeGenericOp(RewriterBase &rewriter, GenericOp genericOp, ArrayRef< unsigned > interchangeVector)
Interchange the iterator_types and iterator_maps dimensions and adapts the index accesses of op.
void populateCollapseDimensions(RewritePatternSet &patterns, const GetCollapsableDimensionsFn &controlCollapseDimensions)
Pattern to collapse dimensions in a linalg.generic op.
bool areElementwiseOpsFusable(OpOperand *fusedOperand)
Return true if two linalg.generic operations with producer/consumer relationship through fusedOperand...
FailureOr< StaticMultiSizeSpecification > computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor)
Definition Tiling.cpp:236
FailureOr< LinalgLoops > linalgOpToAffineLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of affine.for with the proper body for linalgOp.
Definition Loops.cpp:363
void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns)
Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g.
std::function< std::optional< Value >( OpBuilder &b, memref::SubViewOp subView, ArrayRef< Value > boundingSubViewSize, DataLayout &layout)> AllocBufferCallbackFn
Callback function type used to perform the allocation for the promoted subView.
Definition Transforms.h:378
void populateEraseUnusedOperandsAndResultsPatterns(RewritePatternSet &patterns)
Pattern to remove dead operands and results of linalg.generic operations.
std::function< bool(OpOperand *fusedOperand)> ControlFusionFn
Function type which is used to control when to stop fusion.
FailureOr< ContinuousTileSizeSpecification > computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions)
Definition Tiling.cpp:156
FailureOr< SmallVector< OpFoldResult > > computeIndexingMapOpInterfacePaddedShape(OpBuilder &, OpOperand &operandToPad, ArrayRef< Range > iterationDomain, const PadTilingInterfaceOptions &)
Specific helper for Linalg ops.
FailureOr< StaticContinuousTileSizeSpecification > computeStaticContinuousTileSizes(LinalgOp op, unsigned dimension, unsigned targetSize)
Definition Tiling.cpp:106
std::function< LogicalResult(RewriterBase &, tensor::PadOp, Value)> OptimizeCopyFn
FailureOr< SplitReductionResult > splitReduction(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns)
Populates patterns with patterns that simplify tensor.pack and tensor.unpack operations.
void populateFoldPackUnpackIntoTensorEmptyPatterns(RewritePatternSet &patterns)
Populates patterns with patterns that fold operations like linalg.pack and linalg....
FailureOr< LinalgOp > padAndHoistLinalgOp(RewriterBase &rewriter, LinalgOp linalgOp, const LinalgPaddingOptions &options)
Apply padding and hoisting to linalgOp according to the configuration specified in options.
Definition Padding.cpp:355
void populateDecomposeLinalgOpsPattern(RewritePatternSet &patterns, bool removeDeadArgsAndResults=true)
Populate patterns for splitting a LinalgOp with multiple statements within its payload into multiple ...
void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns, const ControlFoldIntoPackUnpackFn &controlFn=nullptr)
Populates patterns with patterns that fold operations like tensor.pad and tensor.extract_slice into t...
FailureOr< ForallReductionTilingResult > tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, ArrayRef< OpFoldResult > numThreads, ArrayRef< OpFoldResult > tileSizes={}, std::optional< ArrayAttr > mapping=std::nullopt)
Method to tile a reduction to parallel iterations computing partial reductions.
Definition Tiling.cpp:588
FailureOr< PackResult > packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp, ArrayRef< OpFoldResult > mnkPackedSizes, ArrayRef< int64_t > mnkPaddedSizesNextMultipleOf, ArrayRef< int64_t > mnkOrder)
Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m and n are proper parallel d...
std::function< LogicalResult(OpBuilder &b, Value buffer)> DeallocBufferCallbackFn
Callback function type used to deallocate the buffers used to hold the promoted subview.
Definition Transforms.h:384
FailureOr< PackResult > pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, ArrayRef< OpFoldResult > packedSizes)
Implement packing of a single LinalgOp by packedSizes.
void populateEraseUnnecessaryInputsPatterns(RewritePatternSet &patterns)
Patterns to promote inputs to outputs and remove unused inputs of linalg.generic ops.
std::function< bool(OpOperand *opOperand)> ControlFoldIntoPackUnpackFn
Function type which is used to control folding operations like tensor.pad and tensor....
FailureOr< TiledLinalgOp > tileLinalgOp(RewriterBase &b, LinalgOp op, const LinalgTilingOptions &options)
Definition Tiling.cpp:816
DenseMap< int, int > LoopIndexToRangeIndexMap
Creates a number of ranges equal to the number of non-zero in tileSizes.
void populateFoldReshapeOpsByExpansionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes)
Patterns to fold an expanding (collapsing) tensor_reshape operation with its producer (consumer) gene...
std::function< bool(OpOperand *opOperand)> ControlPropagationFn
Function type which is used to control propagation of linalg.pack/unpack ops.
void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns)
Adds patterns that swap tensor.extract_slice(linalg.fill(cst, init)) into linalg.fill(cst,...
FailureOr< DropUnitDimsResult > dropUnitDims(RewriterBase &rewriter, IndexingMapOpInterface op, const DroppedUnitDimsBuilder &droppedUnitDimsBuilder, const ControlDropUnitDims &options)
Drop unit extent dimensions from the op and its operands.
void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns)
Patterns that are used to inline constant operands into linalg generic ops.
FailureOr< LinalgOp > promoteSubViews(OpBuilder &b, LinalgOp op, const LinalgPromotionOptions &options)
Promote the subViews into a new buffer allocated at the insertion point b.
void populateConstantFoldLinalgOperations(RewritePatternSet &patterns, const ControlFusionFn &controlFn)
Patterns to constant fold Linalg operations.
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value)
In case of GPU group memory there is no need to deallocate.
FailureOr< Operation * > transposeMatmul(RewriterBase &rewriter, linalg::MatmulOp op, bool transposeLHS=true)
Convert Linalg matmul ops to transposed variants.
void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns)
Linalg generalization patterns.
void populateLinalgNamedToElementwisePatterns(RewritePatternSet &patterns)
Populates patterns that convert linalg named ops e.g.
std::optional< vector::CombiningKind > getCombinerOpKind(Operation *combinerOp)
Return vector::CombiningKind for the given op.
SmallVector< Value > peelLoop(RewriterBase &rewriter, Operation *op)
Try to peel and canonicalize loop op and return the new result.
std::function< LogicalResult(OpBuilder &b, Value src, Value dst)> CopyCallbackFn
Callback function type used to insert copy from original subview to subview of the promoted region fo...
Definition Transforms.h:391
FailureOr< CollapseResult > collapseOpIterationDims(LinalgOp op, ArrayRef< ReassociationIndices > foldedIterationDims, RewriterBase &rewriter)
Collapses dimensions of linalg.generic/linalg.copy operation.
FailureOr< Operation * > decomposeWinogradInputTransformOp(RewriterBase &rewriter, linalg::WinogradInputTransformOp op)
Rewrite linalg.winograd_input_transform.
void populateDecomposePadPatterns(RewritePatternSet &patterns)
Populates patterns to decompose tensor.pad into e.g.
void populateFoldAddIntoDestPatterns(RewritePatternSet &patterns)
Pattern to replace linalg.add when destination passing on a contraction op suffices for achieving the...
void populateLinalgGenericOpsSpecializationPatterns(RewritePatternSet &patterns, const GenericOpSpecializationOptions &options={})
Populates patterns with patterns to convert linalg.generic ops to named or category ops where possibl...
std::pair< TilingInterface, TilingInterface > splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension, OpFoldResult splitPoint)
Split the given op into two parts along the given iteration space dimension at the specified splitPoi...
Definition Split.cpp:68
void populateElementwiseOpsFusionPatterns(RewritePatternSet &patterns, const ControlFusionFn &controlElementwiseOpFusion)
Patterns for fusing linalg operation on tensors.
void populateFoldUnitExtentDimsCanonicalizationPatterns(RewritePatternSet &patterns, ControlDropUnitDims &options)
Populates canonicalization patterns that simplify IR after folding unit-extent dimensions.
FailureOr< SplitReductionResult > splitReductionByScaling(RewriterBase &b, LinalgOp op, const ControlSplitReductionFn &controlSplitReductionFn, bool useAlloc=false)
Scaling-based implementation of the split reduction transformation.
FailureOr< MultiSizeSpecification > computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, OpFoldResult targetSize, OpFoldResult divisor, bool emitAssertions=true)
Emits the IR computing the multi-sized tiling specification with two tile sizes not exceeding targetS...
Definition Tiling.cpp:262
FailureOr< LowerPackResult > lowerPack(RewriterBase &rewriter, linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice=true)
Rewrite pack as pad + reshape + transpose.
FailureOr< LinalgLoops > linalgOpToParallelLoops(RewriterBase &rewriter, LinalgOp linalgOp)
Emit a loop nest of scf.parallel with the proper body for linalgOp.
Definition Loops.cpp:375
Include the generated interface declarations.
ArrayRef< int64_t > ReassociationIndicesRef
llvm::DenseSet< ValueT, ValueInfoT > DenseSet
Definition LLVM.h:120
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:497
llvm::DenseMap< KeyT, ValueT, KeyInfoT, BucketT > DenseMap
Definition LLVM.h:118
OpInterfaceRewritePattern(MLIRContext *context, PatternBenefit benefit=1)
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
OpRewritePattern(MLIRContext *context, PatternBenefit benefit=1, ArrayRef< StringRef > generatedNames={})
SmallVector< int64_t, 3 > mnkOrder
Permutation of matmul (M, N, K) dimensions order.
SmallVector< int64_t, 3 > blockFactors
Minor block factors (mb, nb, kb) for packing relayout where mb, mn are the parallel dimensions and kb...
bool rhsTransposeOuterBlocks
Transpose RHS outer block layout [KB][NB] -> [NB][KB].
bool lhsTransposeInnerBlocks
Transpose LHS inner block layout [mb][kb] -> [kb][mb].
SmallVector< int64_t, 3 > mnkPaddedSizesNextMultipleOf
Next multiples of the packing sizes.
bool lhsTransposeOuterBlocks
Transpose LHS outer block layout [MB][KB] -> [KB][MB].
bool allowPadding
If true, allows packing of dimensions that only partially fit into the block factors.
bool rhsTransposeInnerBlocks
Transpose RHS inner block layout [kb][nb] -> [nb][kb].
bool bufferizeDestinationOnly
If set to "true", only the destination tensor operands are bufferized to a new allocation (and wrappe...
Definition Transforms.h:66
bool emitDealloc
If set to "true", a memref.dealloc operation will be emitted for each allocated buffer.
Definition Transforms.h:72
SmallVector< Value > results
Transformation to drop unit-extent dimensions from linalg.generic operations.
Definition Transforms.h:521
std::function< FailureOr< Value >( RewriterBase &, Location, Value, Value, ArrayRef< ReassociationIndices >, const ControlDropUnitDims &)> ExpandFnTy
Instances of this type are used to control how result values are expanded into their original shape a...
Definition Transforms.h:586
std::function< FailureOr< Value >( RewriterBase &, Location, Value, ArrayRef< int64_t >, ArrayRef< ReassociationIndices >, const ControlDropUnitDims &)> CollapseFnTy
Instances of this type are used to control how operand values are collapsed after dropping unit exten...
Definition Transforms.h:559
ControlFnTy controlFn
Function to control which dimensions, if any, are to be considered for dropping unit extent dimension...
Definition Transforms.h:540
RankReductionStrategy rankReductionStrategy
Definition Transforms.h:524
ExpandFnTy expandFn
Function to control how results are expanded into their original shape after dropping unit extent dim...
Definition Transforms.h:595
CollapseFnTy collapseFn
Function to control how operands are collapsed into their new target shape after dropping unit extent...
Definition Transforms.h:568
std::function< SmallVector< unsigned >(Operation *)> ControlFnTy
Instances of this type are used to control which dimensions of an operand are considered for dropping...
Definition Transforms.h:533
Vectorization pattern for memref::CopyOp.
LogicalResult matchAndRewrite(memref::CopyOp copyOp, PatternRewriter &rewriter) const override
Rewrites a linalg::PackOp into a sequence of:
LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override
Rewrites a linalg::UnPackOp into a sequence of:
LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override
Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp, Value dest, const SmallVector< Value > &dynSizes) const
Filling dest using FillOp constant padding value if possible.
DecomposePadOpPattern(MLIRContext *context, PatternBenefit benefit=1)
FailureOr< Conv1DOp > returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const
DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit=1)
LogicalResult matchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const override
DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context, PatternBenefit benefit=1)
FailureOr< DepthwiseConv1DNwcWcOp > returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const
Rewrites 2-D convolution ops with size-1 window dimensions into 1-D convolution ops.
LogicalResult matchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const override
FailureOr< Conv1DOp > returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const
IndexingMapOpInterface resultOp
Definition Transforms.h:626
SmallVector< Value > replacements
Definition Transforms.h:627
Fuse two linalg.generic operations that have a producer-consumer relationship captured through fusedO...
Definition Transforms.h:656
llvm::DenseMap< Value, Value > replacements
Definition Transforms.h:658
std::function< std::optional< bool >(tensor::ExtractSliceOp)> ControlFn
A function to control pattern application and rewrite logic.
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const override
ExtractSliceOfPadTensorSwapPattern(MLIRContext *context, ControlFn controlFn=nullptr, PatternBenefit benefit=1)
Transformation information returned after reduction tiling.
SmallVector< Operation * > mergeOps
The final reduction operation merging all the partial reductions.
SmallVector< Value > initialValues
Initial values used for partial reductions.
scf::ForallOp loops
The scf.forall operation that iterate over the tiles.
SmallVector< Operation * > parallelTiledOps
The partial reduction tiled op generated.
Match and rewrite for the pattern:
LogicalResult matchAndRewrite(vector::TransferReadOp xferOp, PatternRewriter &rewriter) const override
Match and rewrite for the pattern:
LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp, PatternRewriter &rewriter) const override
Linalg generalization pattern.
LogicalResult matchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const override
FailureOr< GenericOp > returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const
matchAndRewrite implementation that returns the significant transformed pieces of IR.
Options that allow distribution of loops generated in Linalg transforms to processors while generatin...
Definition Utils.h:336
SmallVector< Attribute > paddingValues
A padding value for every operand.
Definition Transforms.h:282
LinalgPaddingOptions & setSizeToPadTo(unsigned operandIndex, unsigned dimIndex, OpFoldResult size)
Definition Transforms.h:304
DenseMap< std::pair< unsigned, unsigned >, OpFoldResult > sizeToPadTo
A mapping between an operand and shape dim, and a size for a padding dimension.
Definition Transforms.h:303
std::optional< SmallVector< int64_t > > padToMultipleOf
A list of multiples to which each padding dimension should be padded to.
Definition Transforms.h:294
OpFoldResult getSizeToPadTo(unsigned operandIndex, unsigned dimIndex) const
Given the operand index and shape dim it returns the size to pad to.
Definition Transforms.h:311
SmallVector< SmallVector< int64_t > > transposePaddings
A permutation vector for every operand used to transpose the packed PadOp results.
Definition Transforms.h:331
LinalgPaddingOptions & setPaddingValues(ArrayRef< Attribute > pv)
Definition Transforms.h:283
LinalgPaddingOptions & setPadToMultipleOf(ArrayRef< int64_t > m)
Definition Transforms.h:295
LinalgPaddingOptions & setHoistPaddings(ArrayRef< int64_t > hp)
Definition Transforms.h:325
LinalgPaddingOptions & setCopyBackOp(CopyBackOp op)
Definition Transforms.h:345
SmallVector< bool > nofoldFlags
A flag for every operand to mark the PadOp as nofold which enables packing for statically shaped oper...
Definition Transforms.h:318
SmallVector< int64_t > hoistPaddings
A number of loops to hoist the PadOp out for every operand.
Definition Transforms.h:324
LinalgPaddingOptions & setTransposePaddings(ArrayRef< SmallVector< int64_t > > tp)
Definition Transforms.h:333
LinalgPaddingOptions & setPaddingDimensions(ArrayRef< int64_t > pd)
Definition Transforms.h:289
SmallVector< int64_t > paddingDimensions
A list of iterator dimensions to pad.
Definition Transforms.h:288
LinalgPaddingOptions & setNofoldFlags(ArrayRef< bool > pp)
Definition Transforms.h:319
CopyBackOp copyBackOp
The op to be used for copying the padded result to the original destination tensor.
Definition Transforms.h:344
std::optional< unsigned > alignment
Alignment of promoted buffer. If std::nullopt do not specify alignment.
Definition Transforms.h:432
bool useAlloca
Use alloca with the default allocation scheme.
Definition Transforms.h:445
LinalgPromotionOptions & setUseFullTileBuffers(ArrayRef< bool > useFullTiles)
Definition Transforms.h:409
LinalgPromotionOptions & setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn, DeallocBufferCallbackFn const &deallocFn)
Definition Transforms.h:456
LinalgPromotionOptions & setAlignment(unsigned align)
Definition Transforms.h:433
LinalgPromotionOptions & setMemorySpace(Attribute memorySpc)
Definition Transforms.h:440
std::optional< Attribute > memorySpace
Memory space of promoted buffer.
Definition Transforms.h:439
bool useOriginalSubviewSize
If true, buffers will be allocated with the original subview size.
Definition Transforms.h:426
std::optional< CopyCallbackFn > copyOutFn
Definition Transforms.h:465
std::optional< CopyCallbackFn > copyInFn
Callback function to do the copy of data to and from the promoted subview.
Definition Transforms.h:464
std::optional< DenseSet< unsigned > > operandsToPromote
Indices of subViews to promote.
Definition Transforms.h:397
LinalgPromotionOptions & setOperandsToPromote(ArrayRef< int64_t > operands)
Definition Transforms.h:398
LinalgPromotionOptions & setUseFullTileBuffersByDefault(bool use)
Definition Transforms.h:420
std::optional< AllocBufferCallbackFn > allocationFn
Callback function to do the allocation of the promoted buffer.
Definition Transforms.h:453
bool useFullTileBuffersDefault
If true all operands unspecified by useFullTileBuffers will use the full view, otherwise the partial ...
Definition Transforms.h:419
std::optional< DeallocBufferCallbackFn > deallocationFn
Definition Transforms.h:454
LinalgPromotionOptions & setUseOriginalSubviewSize(bool originalSize)
Definition Transforms.h:427
LinalgPromotionOptions & setUseAlloca(bool use)
Definition Transforms.h:446
LinalgPromotionOptions & setCopyInOutFns(CopyCallbackFn const &copyIn, CopyCallbackFn const &copyOut)
Definition Transforms.h:466
std::optional< llvm::SmallBitVector > useFullTileBuffers
If ith element of useFullTiles is true the full view should be used for the promoted buffer of the it...
Definition Transforms.h:408
LogicalResult matchAndRewrite(GenericOp op, PatternRewriter &rewriter) const override
FailureOr< GenericOp > returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const
LinalgSpecializationPattern(MLIRContext *context, const GenericOpSpecializationOptions &options={}, PatternBenefit benefit=1)
std::optional< LinalgLoopDistributionOptions > tileDistribution
When specified, specifies distribution of generated tile loops to processors.
Definition Transforms.h:272
LinalgTilingAndFusionOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition Transforms.h:274
LinalgTilingAndFusionOptions & setTileSizes(ArrayRef< int64_t > ts)
Definition Transforms.h:264
SmallVector< int64_t > tileInterchange
Tile interchange used to permute the tile loops.
Definition Transforms.h:269
SmallVector< int64_t > tileSizes
Tile sizes used to tile the root operation.
Definition Transforms.h:263
LinalgTilingOptions & setDistributionOptions(LinalgLoopDistributionOptions distributionOptions)
Definition Transforms.h:238
LinalgTilingOptions & setTileSizes(const SmallVector< Value, 4 > &ts)
Set the tileSizeComputationFunction to return the values ts.
Definition Transforms.h:204
LinalgTilingOptions & setTileSizeComputationFunction(TileSizeComputationFunction fun)
Definition Transforms.h:197
LinalgTilingLoopType loopType
The type of tile loops to generate.
Definition Transforms.h:226
LinalgTilingOptions & setInterchange(ArrayRef< unsigned > interchange)
Definition Transforms.h:220
SmallVector< int64_t > peeledLoops
Peel the specified loops.
Definition Transforms.h:252
LinalgTilingOptions & setLoopType(LinalgTilingLoopType lt)
Definition Transforms.h:228
SmallVector< unsigned, 4 > interchangeVector
The interchange vector to reorder the tiled loops.
Definition Transforms.h:218
LinalgTilingOptions & setDistributionTypes(ArrayRef< StringRef > types)
Definition Transforms.h:246
TileSizeComputationFunction tileSizeComputationFunction
Computation function that returns the tile sizes for each operation.
Definition Transforms.h:194
LinalgTilingOptions & scalarizeDynamicDims()
Tile all dynamic dimensions by 1.
std::optional< LinalgLoopDistributionOptions > distribution
When specified, specifies distribution of generated tile loops to processors.
Definition Transforms.h:235
SmallVector< StringRef, 2 > distributionTypes
Specification markers of how to distribute the linalg.tiled_loop.
Definition Transforms.h:244
LinalgTilingOptions & setPeeledLoops(ArrayRef< int64_t > loops)
Definition Transforms.h:254
linalg::TransposeOp transposeOp
tensor::ExpandShapeOp expandShapeOp
tensor::ExtractSliceOp extractSliceOp
linalg::TransposeOp transposeOp
tensor::CollapseShapeOp collapseShapeOp
A description of a multi-size tiling comprising tile sizes and numbers of tiles, expressed as Values ...
Struct to hold the result of a pack call.
SmallVector< linalg::UnPackOp > unPackOps
linalg::LinalgOp packedLinalgOp
SmallVector< linalg::PackOp > packOps
Struct to hold the result of a packTranspose call.
SmallVector< Attribute > paddingValues
A padding value for every operand.
Definition Transforms.h:353
PadTilingInterfaceOptions & setPaddingValues(ArrayRef< Attribute > pv)
Definition Transforms.h:354
PadTilingInterfaceOptions & setPadToMultipleOf(bool b)
Definition Transforms.h:367
PadTilingInterfaceOptions & setPaddingSizes(ArrayRef< OpFoldResult > m)
Definition Transforms.h:360
bool padToMultipleOf
Pad iterator paddingDimension[i] to next multiple of paddingSizes[i] if true.
Definition Transforms.h:366
SmallVector< OpFoldResult > paddingSizes
A list of iterator dimensions sizes to pad to.
Definition Transforms.h:359
Operations and values created in the process of padding a TilingInterface operation.
Definition Transforms.h:739
SmallVector< Value > replacements
Slices of the padded op's results, same types as toPad.
Definition Transforms.h:745
TilingInterface paddedOp
The padded op, a clone of toPad with padded operands.
Definition Transforms.h:743
SmallVector< tensor::PadOp > padOps
The operands of the padded op.
Definition Transforms.h:741
Create a new buffer using the allocationFn provided.
Definition Transforms.h:942
Split Reduction options.
Definition Transforms.h:475
Apply transformation to split the single linalg op reduction into a parallel and reduction dimension.
Perform standalone tiling of a single LinalgOp by tileSizes.
Definition Transforms.h:897
SmallVector< Operation *, 8 > loops
Definition Transforms.h:899
SmallVector< Value, 4 > tensorResults
Definition Transforms.h:900
Transformation information returned after vectorizing.
Definition Transforms.h:999
SmallVector< Value > replacements
Results of the vectorization transform to replace the original operation.
SmallVector< T > tripCounts
Number of tiles associated with each size.
T lowTripCount
Number of tiles associated with each size.
Helper struct to hold the results of building a packing loop nest.
Definition Transforms.h:767
SmallVector< OpFoldResult > strides
Definition Transforms.h:768
SmallVector< Value > leadingPackedTensorIndexings
Definition Transforms.h:769
SmallVector< Value > clonedLoopIvs
Definition Transforms.h:769
SmallVector< OpFoldResult > sizes
Definition Transforms.h:768
SmallVector< OpFoldResult > offsets
Definition Transforms.h:768