// MLIR 19.0.0git — LowerVectorContract.cpp (doxygen export header).
1 //===- LowerVectorContract.cpp - Lower 'vector.contract' operation --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements target-independent rewrites and utilities to lower the
10 // 'vector.contract' operation.
11 //
12 //===----------------------------------------------------------------------===//
13 
27 #include "mlir/IR/BuiltinTypes.h"
29 #include "mlir/IR/Location.h"
30 #include "mlir/IR/Matchers.h"
31 #include "mlir/IR/PatternMatch.h"
32 #include "mlir/IR/TypeUtilities.h"
35 
36 #define DEBUG_TYPE "vector-contract-lowering"
37 
38 using namespace mlir;
39 using namespace mlir::vector;
40 
41 //===----------------------------------------------------------------------===//
42 // Helper functions
43 //===----------------------------------------------------------------------===//
44 // Helper to find an index in an affine map.
45 static std::optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
46  for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
47  int64_t idx = map.getDimPosition(i);
48  if (idx == index)
49  return i;
50  }
51  return std::nullopt;
52 }
53 
54 // Helper to construct iterator types with one index removed.
55 static SmallVector<Attribute> adjustIter(ArrayAttr iteratorTypes,
56  int64_t index) {
57  SmallVector<Attribute> results;
58  for (const auto &it : llvm::enumerate(iteratorTypes)) {
59  int64_t idx = it.index();
60  if (idx == index)
61  continue;
62  results.push_back(it.value());
63  }
64  return results;
65 }
66 
67 // Helper to construct an affine map with one index removed.
68 static AffineMap adjustMap(AffineMap map, int64_t index,
69  PatternRewriter &rewriter) {
70  auto *ctx = rewriter.getContext();
72  for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
73  int64_t idx = map.getDimPosition(i);
74  if (idx == index)
75  continue;
76  // Re-insert remaining indices, but renamed when occurring
77  // after the removed index.
78  auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx);
79  results.push_back(targetExpr);
80  }
81  return AffineMap::get(map.getNumDims() - 1, 0, results, ctx);
82 }
83 
// Helper method to possibly drop a dimension in a load.
// Recursively extracts position `pos` of dimension `index` from `val`:
// for index == -1 the value is returned unchanged; for index == 0 a direct
// vector.extract suffices; otherwise the leading dimensions are unrolled
// and the extraction is applied one level deeper.
// TODO: adopt a dedicated reshape op once available.
static Value reshapeLoad(Location loc, Value val, VectorType type,
                         int64_t index, int64_t pos,
                         PatternRewriter &rewriter) {
  if (index == -1)
    return val;

  // At extraction dimension?
  if (index == 0)
    return rewriter.create<vector::ExtractOp>(loc, val, pos);

  // Unroll leading dimensions.
  // vType: element type one level down; resType: `type` with `index` dropped.
  VectorType vType = VectorType::Builder(type).dropDim(0);
  VectorType resType = VectorType::Builder(type).dropDim(index);
  // Start from an all-zero result and insert each recursively reshaped slice.
  Value result = rewriter.create<arith::ConstantOp>(
      loc, resType, rewriter.getZeroAttr(resType));
  for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) {
    Value ext = rewriter.create<vector::ExtractOp>(loc, val, d);
    Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter);
    result = rewriter.create<vector::InsertOp>(loc, load, result, d);
  }
  return result;
}
108 
// Helper method to possibly drop a dimension in a store.
// Dual of reshapeLoad: inserts `val` into `result` at position `pos` of
// dimension `index`, unrolling leading dimensions until the insertion
// dimension is reached.
// TODO: adopt a dedicated reshape op once available.
static Value reshapeStore(Location loc, Value val, Value result,
                          VectorType type, int64_t index, int64_t pos,
                          PatternRewriter &rewriter) {
  // Unmodified?
  if (index == -1)
    return val;
  // At insertion dimension?
  if (index == 0)
    return rewriter.create<vector::InsertOp>(loc, val, result, pos);

  // Unroll leading dimensions.
  VectorType vType = VectorType::Builder(type).dropDim(0);
  for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) {
    // Recurse on the d-th slices of both the destination and the value.
    Value ext = rewriter.create<vector::ExtractOp>(loc, result, d);
    Value ins = rewriter.create<vector::ExtractOp>(loc, val, d);
    Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter);
    result = rewriter.create<vector::InsertOp>(loc, sto, result, d);
  }
  return result;
}
131 
132 /// Helper to create arithmetic operation associated with a kind of contraction.
133 static std::optional<Value>
135  vector::CombiningKind kind, PatternRewriter &rewriter,
136  bool isInt, Value mask = Value()) {
137  using vector::CombiningKind;
138  Value mul;
139 
140  if (isInt) {
141  if (kind == CombiningKind::MINNUMF || kind == CombiningKind::MAXNUMF ||
142  kind == CombiningKind::MINIMUMF || kind == CombiningKind::MAXIMUMF)
143  // Only valid for floating point types.
144  return std::nullopt;
145  mul = rewriter.create<arith::MulIOp>(loc, x, y);
146  } else {
147  // Float case.
148  if (kind == CombiningKind::AND || kind == CombiningKind::MINUI ||
149  kind == CombiningKind::MINSI || kind == CombiningKind::MAXUI ||
150  kind == CombiningKind::MAXSI || kind == CombiningKind::OR ||
151  kind == CombiningKind::XOR)
152  // Only valid for integer types.
153  return std::nullopt;
154  // Special case for fused multiply-add.
155  if (acc && isa<VectorType>(acc.getType()) && kind == CombiningKind::ADD) {
156  Value fma = rewriter.create<vector::FMAOp>(loc, x, y, acc);
157  if (mask)
158  // The fma op doesn't need explicit masking. However, fma ops used in
159  // reductions must preserve previous 'acc' values for masked-out lanes.
160  fma = selectPassthru(rewriter, mask, fma, acc);
161  return fma;
162  }
163  mul = rewriter.create<arith::MulFOp>(loc, x, y);
164  }
165 
166  if (!acc)
167  return std::optional<Value>(mul);
168 
169  return makeArithReduction(rewriter, loc, kind, mul, acc,
170  /*fastmath=*/nullptr, mask);
171 }
172 
173 /// Return the positions of the reductions in the given map.
175  ArrayAttr iteratorTypes) {
176  SmallVector<int64_t> dimsIdx;
177  for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
178  if (isReductionIterator(iteratorTypes[map.getDimPosition(i)]))
179  dimsIdx.push_back(i);
180  }
181  return dimsIdx;
182 }
183 
184 /// Look for a given dimension in an affine map and return its position. Return
185 /// std::nullopt if the dimension is not in the map results.
186 static std::optional<unsigned> getDimPosition(AffineMap map, unsigned dim) {
187  for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
188  if (map.getDimPosition(i) == dim)
189  return i;
190  }
191  return std::nullopt;
192 }
193 
194 /// Creates an AddIOp if `isInt` is true otherwise create an arith::AddFOp using
195 /// operands `x` and `y`.
196 static Value createAdd(Location loc, Value x, Value y, bool isInt,
197  PatternRewriter &rewriter) {
198  if (isInt)
199  return rewriter.create<arith::AddIOp>(loc, x, y);
200  return rewriter.create<arith::AddFOp>(loc, x, y);
201 }
202 
203 /// Creates a MulIOp if `isInt` is true otherwise create an MulFOp using
204 /// operands `x and `y`.
205 static Value createMul(Location loc, Value x, Value y, bool isInt,
206  PatternRewriter &rewriter) {
207  if (isInt)
208  return rewriter.create<arith::MulIOp>(loc, x, y);
209  return rewriter.create<arith::MulFOp>(loc, x, y);
210 }
211 
212 namespace {
213 
214 /// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul
215 /// semantics to:
216 /// ```
217 /// %flattened_a = vector.shape_cast %a
218 /// %flattened_b = vector.shape_cast %b
219 /// %flattened_d = vector.matmul %flattened_a, %flattened_b
220 /// %d = vector.shape_cast %%flattened_d
221 /// %e = add %c, %d
222 /// ```
223 /// `vector.matmul` later lowers to `llvm.matrix.multiply`.
224 //
225 /// This only kicks in when VectorTransformsOptions is set to OuterProduct and
226 /// the vector.contract op is a row-major matrix multiply.
227 class ContractionOpToMatmulOpLowering
228  : public vector::MaskableOpRewritePattern<vector::ContractionOp> {
229 public:
230  using MaskableOpRewritePattern::MaskableOpRewritePattern;
231 
232  using FilterConstraintType =
233  std::function<LogicalResult(vector::ContractionOp op)>;
234 
235  static LogicalResult defaultFilter(vector::ContractionOp op) {
236  return success();
237  }
238 
239  ContractionOpToMatmulOpLowering(
240  vector::VectorTransformsOptions vectorTransformOptions,
241  MLIRContext *context, PatternBenefit benefit = 1,
242  FilterConstraintType constraint = defaultFilter)
243  : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
244  vectorTransformOptions(vectorTransformOptions),
245  filter(std::move(constraint)) {}
246 
248  matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
249  PatternRewriter &rewriter) const override;
250 
251 private:
252  /// Options to control the vector patterns.
253  vector::VectorTransformsOptions vectorTransformOptions;
254  FilterConstraintType filter;
255 };
256 
257 /// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul
258 /// semantics to a reduction_size-unrolled sequence:
259 /// ```
260 /// %at = vector.transpose %a, [1, 0]
261 /// %bRow0 = vector.extract %b[0]
262 /// %atRow0 = vector.extract %at[0]
263 /// %c0 = vector.outerproduct %atRow0, %bRow0, %c
264 /// ...
265 /// %bRowK = vector.extract %b[K]
266 /// %atRowK = vector.extract %at[K]
267 /// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1
268 /// ```
269 ///
270 /// This only kicks in when VectorTransformsOptions is set to OuterProduct and
271 /// the vector.contract op is a row-major matrix multiply.
272 class ContractionOpToOuterProductOpLowering
273  : public MaskableOpRewritePattern<vector::ContractionOp> {
274 public:
275  using MaskableOpRewritePattern::MaskableOpRewritePattern;
276 
277  using FilterConstraintType =
278  std::function<LogicalResult(vector::ContractionOp op)>;
279 
280  static LogicalResult defaultFilter(vector::ContractionOp op) {
281  return success();
282  }
283 
284  ContractionOpToOuterProductOpLowering(
285  vector::VectorTransformsOptions vectorTransformOptions,
286  MLIRContext *context, PatternBenefit benefit = 1,
287  FilterConstraintType constraint = defaultFilter)
288  : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
289  vectorTransformOptions(vectorTransformOptions),
290  filter(std::move(constraint)) {}
291 
293  matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
294  PatternRewriter &rewriter) const override;
295 
296 private:
297  /// Options to control the vector patterns.
298  vector::VectorTransformsOptions vectorTransformOptions;
299  FilterConstraintType filter;
300 };
301 
302 /// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul
303 /// semantics to an output-size-unrolled sequence:
304 /// ```
305 /// %out = arith.constant ... : vector<MxNxelt_type>
306 /// %bt = vector.transpose %b, [1, 0]
307 /// %aRow0 = vector.extract %a[0]
308 /// %btRow0 = vector.extract %bt[0]
309 /// %c00 = vector.reduce %atRow0, %bRow0
310 /// %out00 = vector.insert %c00, %out[0, 0]
311 /// ...
312 /// %aRowLast = vector.extract %at[M-1]
313 /// %btRowLast = vector.extract %b[N-1]
314 /// %cLastLast = vector.reduce %atRowLast, %bRowLast
315 /// %outcLastLast = vector.insert %cLastLast, %out[M-1, N-1]
316 /// ```
317 ///
318 /// This only kicks in when VectorTransformsOptions is set to Dot and
319 /// the vector.contract op is a row-major matmul or matvec.
320 class ContractionOpToDotLowering
321  : public MaskableOpRewritePattern<vector::ContractionOp> {
322 public:
323  using MaskableOpRewritePattern::MaskableOpRewritePattern;
324 
325  using FilterConstraintType =
326  std::function<LogicalResult(vector::ContractionOp op)>;
327 
328  static LogicalResult defaultFilter(vector::ContractionOp op) {
329  return success();
330  }
331 
332  ContractionOpToDotLowering(
333  vector::VectorTransformsOptions vectorTransformOptions,
334  MLIRContext *context, PatternBenefit benefit = 1,
335  const FilterConstraintType &constraint = defaultFilter)
336  : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
337  vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {}
338 
340  matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
341  PatternRewriter &rewriter) const override;
342 
343 private:
344  /// Options to control the vector patterns.
345  vector::VectorTransformsOptions vectorTransformOptions;
346  FilterConstraintType filter;
347 };
348 
349 /// Progressive lowering of ContractionOp.
350 ///
351 /// One:
352 /// %x = vector.contract with at least one free/batch dimension
353 /// is replaced by:
354 /// %a = vector.contract with one less free/batch dimension
355 /// %b = vector.contract with one less free/batch dimension
356 /// ..
357 /// %x = combine %a %b ..
358 /// until a pure contraction is reached (no free/batch dimensions),
359 /// which is replaced by a dot-product.
360 ///
361 /// This only kicks in when either VectorTransformsOptions is set
362 /// to Dot or when other contraction patterns fail.
363 class ContractionOpLowering
364  : public MaskableOpRewritePattern<vector::ContractionOp> {
365 public:
366  using MaskableOpRewritePattern::MaskableOpRewritePattern;
367  using FilterConstraintType =
368  std::function<LogicalResult(vector::ContractionOp op)>;
369 
370  static LogicalResult defaultFilter(vector::ContractionOp op) {
371  return success();
372  }
373 
374  ContractionOpLowering(vector::VectorTransformsOptions vectorTransformOptions,
375  MLIRContext *context, PatternBenefit benefit = 1,
376  FilterConstraintType constraint = defaultFilter)
377  : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
378  vectorTransformOptions(vectorTransformOptions),
379  filter(std::move(constraint)) {}
380 
382  matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
383  PatternRewriter &rewriter) const override;
384 
385 private:
386  /// Options to control the vector patterns.
387  vector::VectorTransformsOptions vectorTransformOptions;
388  FilterConstraintType filter;
389  // Lower one parallel dimension.
390  FailureOr<Value> lowerParallel(PatternRewriter &rewriter,
391  vector::ContractionOp op, int64_t lhsIndex,
392  int64_t rhsIndex, Value mask) const;
393  // Lower one reduction dimension.
394  FailureOr<Value> lowerReduction(PatternRewriter &rewriter,
395  vector::ContractionOp op, Value mask) const;
396 };
397 
/// Generate a vector implementation for matmat, matvec and tmatvec.
/// This unrolls outer-products along the reduction dimension.
struct UnrolledOuterProductGenerator
    : public StructuredGenerator<vector::ContractionOp, vector::IteratorType> {
  /// Captures the contraction's operands/kind and, when the op is wrapped in
  /// a vector.mask, the mask value.
  UnrolledOuterProductGenerator(RewriterBase &b, vector::ContractionOp op)
      : StructuredGenerator<vector::ContractionOp, vector::IteratorType>(b, op),
        kind(op.getKind()), lhs(op.getLhs()), rhs(op.getRhs()),
        res(op.getAcc()), lhsType(op.getLhsType()) {
    auto maskableOp = cast<MaskableOpInterface>(op.getOperation());
    if (maskableOp.isMasked())
      mask = maskableOp.getMaskingOp().getMask();
  }

  /// Transpose `v` with permutation `perm` (2-D swap by default).
  /// A null value is passed through unchanged, so `t(mask)` is safe when
  /// there is no mask.
  Value t(Value v, ArrayRef<int64_t> perm = {1, 0}) {
    if (!v)
      return v;
    return rewriter.create<vector::TransposeOp>(loc, v, perm);
  }

  /// Promote `v` (scalar or vector) to `dstElementType` via extf/extsi.
  /// Returns `v` unchanged when the element types already match.
  Value promote(Value v, Type dstElementType) {
    Type elementType = v.getType();
    auto vecType = dyn_cast<VectorType>(elementType);
    if (vecType)
      elementType = vecType.getElementType();
    if (elementType == dstElementType)
      return v;
    Type promotedType = dstElementType;
    if (vecType)
      promotedType = vecType.clone(promotedType);
    if (isa<FloatType>(dstElementType))
      return rewriter.create<arith::ExtFOp>(loc, promotedType, v);
    return rewriter.create<arith::ExtSIOp>(loc, promotedType, v);
  }

  /// Emit `reductionSize` chained vector.outerproduct ops accumulating into
  /// `res`. Operands are assumed to have the reduction dimension outermost;
  /// each step extracts slice `k`, promotes it to the result element type,
  /// and (when masked) masks the outer product with the k-th mask slice.
  FailureOr<Value> outerProd(Value lhs, Value rhs, Value res,
                             VectorType lhsType, int reductionSize,
                             std::optional<Value> maybeMask = std::nullopt) {
    // Incremental support for masking.
    if (mask && !maybeMask.has_value())
      return failure();

    Type resElementType = cast<VectorType>(res.getType()).getElementType();
    for (int64_t k = 0; k < reductionSize; ++k) {
      Value extractA = rewriter.create<vector::ExtractOp>(loc, lhs, k);
      Value extractB = rewriter.create<vector::ExtractOp>(loc, rhs, k);
      extractA = promote(extractA, resElementType);
      extractB = promote(extractB, resElementType);
      Value extractMask;
      if (maybeMask.has_value() && maybeMask.value())
        extractMask =
            rewriter.create<vector::ExtractOp>(loc, maybeMask.value(), k);

      Operation *outerProdOp = rewriter.create<vector::OuterProductOp>(
          loc, res.getType(), extractA, extractB, res, kind);
      res = maskOperation(rewriter, outerProdOp, extractMask)->getResult(0);
    }
    return res;
  }

  /// Helper function for `matmat`, `matvec`, `tmatvec`. Returns the size of
  /// dimension `reductionDim`. If the dimension is a scalable dimension,
  /// returns "nullopt".
  std::optional<int64_t> getReductionSize(VectorType vecType,
                                          int64_t reductionDim) {
    // Cannot unroll scalable dimension.
    if (vecType.getScalableDims()[reductionDim])
      return std::nullopt;
    int64_t reductionSize = vecType.getDimSize(reductionDim);
    assert(reductionSize > 0 &&
           "Reduction dim must be a known static size to allow unrolling");
    return reductionSize;
  }

  /// Two outer parallel, one inner reduction (matmat flavor).
  /// Dispatches on the contraction's indexing-map layout; each case
  /// transposes operands (and the 3-D mask, reduction dim to the front) as
  /// needed so outerProd sees the reduction dimension outermost.
  FailureOr<Value> matmat() {
    if (!iters({Par(), Par(), Red()}))
      return failure();
    // Set up the parallel/reduction structure in the right form.
    AffineExpr m, n, k;
    bindDims(rewriter.getContext(), m, n, k);

    // Classical row-major matmul: Just permute the lhs.
    if (layout({{m, k}, {k, n}, {m, n}})) {
      if (auto reductionSize = getReductionSize(lhsType, 1)) {
        // Note: `t` creates new IR. It must be nested within this `if` check
        // so that no IR is created when then pattern returns "failure".
        Value tLhs = t(lhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(tLhs, rhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // TODO: may be better to fail and use some vector<k> -> scalar reduction.
    if (layout({{m, k}, {n, k}, {m, n}})) {
      if (auto reductionSize = getReductionSize(lhsType, 1)) {
        Value tLhs = t(lhs);
        Value tRhs = t(rhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(tLhs, tRhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // No need to permute anything.
    if (layout({{k, m}, {k, n}, {m, n}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(lhs, rhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // Just permute the rhs.
    if (layout({{k, m}, {n, k}, {m, n}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tRhs = t(rhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(lhs, tRhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // Transposed output: swap RHS and LHS.
    // Classical row-major matmul: permute the lhs.
    if (layout({{m, k}, {k, n}, {n, m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 1)) {
        Value tLhs = t(lhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(rhs, tLhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // TODO: may be better to fail and use some vector<k> -> scalar reduction.
    if (layout({{m, k}, {n, k}, {n, m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 1)) {
        Value tRhs = t(rhs);
        Value tLhs = t(lhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(tRhs, tLhs, res, lhsType, *reductionSize, tMask);
      }
    }
    if (layout({{k, m}, {k, n}, {n, m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(rhs, lhs, res, lhsType, *reductionSize, tMask);
      }
    }
    if (layout({{k, m}, {n, k}, {n, m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tRhs = t(rhs);
        Value tMask = t(mask, {2, 0, 1});
        return outerProd(tRhs, lhs, res, lhsType, *reductionSize, tMask);
      }
    }
    return failure();
  }

  //
  // One outer parallel, one inner reduction (matvec flavor).
  // Mask needs to be transposed everywhere to turn the reduction dimension
  // outermost as required by outerproduct.
  //
  FailureOr<Value> matvec() {
    if (!iters({Par(), Red()}))
      return failure();
    AffineExpr m, k;
    bindDims(rewriter.getContext(), m, k);

    // Case mat-vec: transpose.
    if (layout({{m, k}, {k}, {m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 1)) {
        Value tLhs = t(lhs);
        Value tMask = t(mask);
        return outerProd(tLhs, rhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // Case mat-trans-vec: ready to go.
    if (layout({{k, m}, {k}, {m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tMask = t(mask);
        return outerProd(lhs, rhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // Case vec-mat: swap and transpose.
    if (layout({{k}, {m, k}, {m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tRhs = t(rhs);
        Value tMask = t(mask);
        return outerProd(tRhs, lhs, res, lhsType, *reductionSize, tMask);
      }
    }
    // Case vec-mat-trans: swap and ready to go.
    if (layout({{k}, {k, m}, {m}})) {
      if (auto reductionSize = getReductionSize(lhsType, 0)) {
        Value tMask = t(mask);
        return outerProd(rhs, lhs, res, lhsType, *reductionSize, tMask);
      }
    }
    return failure();
  }

  //
  // One outer reduction, one inner parallel (tmatvec flavor).
  // Mask already has the shape of the outer product.
  //
  FailureOr<Value> tmatvec() {
    if (!iters({Red(), Par()}))
      return failure();
    AffineExpr k, m;
    bindDims(rewriter.getContext(), k, m);

    // Case mat-vec: transpose.
    if (layout({{m, k}, {k}, {m}}))
      if (auto reductionSize = getReductionSize(lhsType, 1))
        return outerProd(t(lhs), rhs, res, lhsType, *reductionSize, mask);
    // Case mat-trans-vec: ready to go.
    if (layout({{k, m}, {k}, {m}}))
      if (auto reductionSize = getReductionSize(lhsType, 0))
        return outerProd(lhs, rhs, res, lhsType, *reductionSize, mask);
    // Case vec-mat: swap and transpose.
    if (layout({{k}, {m, k}, {m}}))
      if (auto reductionSize = getReductionSize(lhsType, 0))
        return outerProd(t(rhs), lhs, res, lhsType, *reductionSize, mask);
    // Case vec-mat-trans: swap and ready to go.
    if (layout({{k}, {k, m}, {m}}))
      if (auto reductionSize = getReductionSize(lhsType, 0))
        return outerProd(rhs, lhs, res, lhsType, *reductionSize, mask);
    return failure();
  }

private:
  // Combining kind and operands captured from the contraction op; `mask` is
  // null unless the op was wrapped in a vector.mask.
  vector::CombiningKind kind;
  Value lhs, rhs, res, mask;
  VectorType lhsType;
};
625 
626 /// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul
627 /// semantics to a reduction_size-unrolled sequence:
628 /// ```
629 /// %at = vector.transpose %a, [1, 0]
630 /// %bRow0 = vector.extract %b[0]
631 /// %atRow0 = vector.extract %at[0]
632 /// %c0 = vector.outerproduct %atRow0, %bRow0, %c
633 /// ...
634 /// %bRowK = vector.extract %b[K]
635 /// %atRowK = vector.extract %at[K]
636 /// %cK = vector.outerproduct %atRowK, %bRowK, %cK-1
637 /// ```
638 ///
639 /// This only kicks in when VectorTransformsOptions is set to OuterProduct but
640 /// otherwise supports any layout permutation of the matrix-multiply.
642 ContractionOpToOuterProductOpLowering::matchAndRewriteMaskableOp(
643  vector::ContractionOp op, MaskingOpInterface maskOp,
644  PatternRewriter &rewriter) const {
645  if (vectorTransformOptions.vectorContractLowering !=
646  vector::VectorContractLowering::OuterProduct)
647  return failure();
648 
649  if (failed(filter(op)))
650  return failure();
651 
652  UnrolledOuterProductGenerator e(rewriter, op);
653  FailureOr<Value> matmatRes = e.matmat();
654  if (succeeded(matmatRes)) {
655  return matmatRes;
656  }
657  FailureOr<Value> matvecRes = e.matvec();
658  if (succeeded(matvecRes)) {
659  return matvecRes;
660  }
661 
662  FailureOr<Value> tmatvecRes = e.tmatvec();
663  return tmatvecRes;
664 }
665 
666 FailureOr<Value> ContractionOpToDotLowering::matchAndRewriteMaskableOp(
667  vector::ContractionOp op, MaskingOpInterface maskOp,
668  PatternRewriter &rewriter) const {
669  // TODO: Support vector.mask.
670  if (maskOp)
671  return failure();
672 
673  if (failed(filter(op)))
674  return failure();
675 
676  if (vectorTransformOptions.vectorContractLowering !=
677  vector::VectorContractLowering::Dot)
678  return failure();
679 
680  auto iteratorTypes = op.getIteratorTypes().getValue();
681  static constexpr std::array<int64_t, 2> perm = {1, 0};
682  Location loc = op.getLoc();
683  Value lhs = op.getLhs(), rhs = op.getRhs();
684 
685  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
686  auto infer = [&](MapList m) {
688  };
689  AffineExpr m, n, k;
690  bindDims(rewriter.getContext(), m, n, k);
691  SmallVector<AffineMap> maps = op.getIndexingMapsArray();
692  //
693  // In the following we wish to make the reduction dimension innermost so we
694  // can load vectors and just fmul + reduce into a scalar.
695  //
696  if (isParallelIterator(iteratorTypes[0]) &&
697  isParallelIterator(iteratorTypes[1]) &&
698  isReductionIterator(iteratorTypes[2])) {
699  //
700  // Two outer parallel, one inner reduction (matmat flavor).
701  //
702  if (maps == infer({{m, k}, {k, n}, {m, n}})) {
703  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
704  } else if (maps == infer({{m, k}, {n, k}, {m, n}})) {
705  // No need to permute anything.
706  } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
707  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
708  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
709  } else if (maps == infer({{k, m}, {n, k}, {m, n}})) {
710  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
711  } else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
712  // This is the classical row-major matmul. Just permute the lhs.
713  Value tmp = lhs;
714  lhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
715  rhs = tmp;
716  } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
717  std::swap(lhs, rhs);
718  } else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
719  Value tmp = lhs;
720  lhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
721  rhs = rewriter.create<vector::TransposeOp>(loc, tmp, perm);
722  } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
723  Value tmp = rhs;
724  rhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
725  lhs = tmp;
726  } else {
727  return failure();
728  }
729  } else if (isParallelIterator(iteratorTypes[0]) &&
730  isReductionIterator(iteratorTypes[1])) {
731  //
732  // One outer parallel, one inner reduction (matvec flavor)
733  //
734  if (maps == infer({{m, n}, {n}, {m}})) {
735  // No need to permute anything.
736  } else if (maps == infer({{n, m}, {n}, {m}})) {
737  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
738  } else if (maps == infer({{n}, {m, n}, {m}})) {
739  std::swap(lhs, rhs);
740  } else if (maps == infer({{n}, {n, m}, {m}})) {
741  std::swap(lhs, rhs);
742  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
743  } else {
744  return failure();
745  }
746  } else {
747  return failure();
748  }
749 
750  VectorType dstType = cast<VectorType>(op.getResultType());
751  assert(dstType.getRank() >= 1 && dstType.getRank() <= 2 &&
752  "Expected dst type of rank 1 or 2");
753 
754  unsigned rank = dstType.getRank();
755  unsigned dstRows = dstType.getShape()[0];
756  unsigned dstColumns = rank == 1 ? 1 : dstType.getShape()[1];
757 
758  // ExtractOp does not allow dynamic indexing, we must unroll explicitly.
759  Value res = rewriter.create<arith::ConstantOp>(loc, dstType,
760  rewriter.getZeroAttr(dstType));
761  bool isInt = isa<IntegerType>(dstType.getElementType());
762  for (unsigned r = 0; r < dstRows; ++r) {
763  Value a = rewriter.create<vector::ExtractOp>(op.getLoc(), lhs, r);
764  for (unsigned c = 0; c < dstColumns; ++c) {
765  Value b = rank == 1
766  ? rhs
767  : rewriter.create<vector::ExtractOp>(op.getLoc(), rhs, c);
768  Value m = createMul(op.getLoc(), a, b, isInt, rewriter);
769  Value reduced = rewriter.create<vector::ReductionOp>(
770  op.getLoc(), vector::CombiningKind::ADD, m);
771 
773  : SmallVector<int64_t, 2>{r, c};
774  res = rewriter.create<vector::InsertOp>(op.getLoc(), reduced, res, pos);
775  }
776  }
777  if (auto acc = op.getAcc())
778  res = createAdd(op.getLoc(), res, acc, isInt, rewriter);
779  return res;
780 }
781 
782 /// Lower vector.contract with all size one reduction dimensions to
783 /// elementwise ops when possible.
784 struct ContractOpToElementwise
785  : public MaskableOpRewritePattern<vector::ContractionOp> {
786  using MaskableOpRewritePattern::MaskableOpRewritePattern;
787  using FilterConstraintType =
788  std::function<LogicalResult(vector::ContractionOp op)>;
789  static LogicalResult defaultFilter(vector::ContractionOp op) {
790  return success();
791  }
792  ContractOpToElementwise(
793  vector::VectorTransformsOptions vectorTransformOptions,
794  MLIRContext *context, PatternBenefit benefit = 1,
795  const FilterConstraintType &constraint = defaultFilter)
796  : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
797  vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {}
798 
800  matchAndRewriteMaskableOp(vector::ContractionOp contractOp,
801  MaskingOpInterface maskOp,
802  PatternRewriter &rewriter) const override {
803  // TODO: Support vector.mask.
804  if (maskOp)
805  return failure();
806 
807  if (failed(filter(contractOp)))
808  return failure();
809 
810  if (vectorTransformOptions.vectorContractLowering !=
811  vector::VectorContractLowering::ParallelArith)
812  return failure();
813 
814  ArrayRef<int64_t> lhsShape = contractOp.getLhsType().getShape();
815  ArrayRef<int64_t> rhsShape = contractOp.getRhsType().getShape();
816  AffineMap lhsMap = contractOp.getIndexingMapsArray()[0];
817  AffineMap rhsMap = contractOp.getIndexingMapsArray()[1];
818  SmallVector<int64_t> lhsReductionDims =
819  getReductionIndex(lhsMap, contractOp.getIteratorTypes());
820  SmallVector<int64_t> rhsReductionDims =
821  getReductionIndex(rhsMap, contractOp.getIteratorTypes());
822  // All the reduction dimensions must be a size 1.
823  for (int64_t dim : lhsReductionDims) {
824  if (lhsShape[dim] != 1)
825  return failure();
826  }
827  for (int64_t dim : rhsReductionDims) {
828  if (rhsShape[dim] != 1)
829  return failure();
830  }
831  AffineMap accMap = contractOp.getIndexingMapsArray()[2];
832  unsigned numParallelDims = accMap.getNumResults();
833  unsigned numLhsDimToBroadcast =
834  numParallelDims - (lhsMap.getNumResults() - lhsReductionDims.size());
835  unsigned numRhsDimToBroadcast =
836  numParallelDims - (rhsMap.getNumResults() - rhsReductionDims.size());
837  SmallVector<int64_t> lhsDims;
838  SmallVector<int64_t> lhsTranspose;
839  SmallVector<int64_t> rhsDims;
840  SmallVector<int64_t> rhsTranspose;
841  for (int64_t dim : lhsReductionDims)
842  lhsTranspose.push_back(numLhsDimToBroadcast + dim);
843  for (int64_t dim : rhsReductionDims)
844  rhsTranspose.push_back(numRhsDimToBroadcast + dim);
845  // Loop through the parallel dimensions to calculate the dimensions to
846  // broadcast and to permute in order to extract only parallel dimensions.
847  for (unsigned i = 0; i < numParallelDims; i++) {
848  std::optional<unsigned> lhsDim =
849  getDimPosition(lhsMap, accMap.getDimPosition(i));
850  if (lhsDim) {
851  lhsTranspose.push_back(numLhsDimToBroadcast + *lhsDim);
852  } else {
853  // If the parallel dimension doesn't exist we will have to broadcast it.
854  lhsDims.push_back(
855  cast<VectorType>(contractOp.getResultType()).getDimSize(i));
856  lhsTranspose.push_back(lhsDims.size() - 1);
857  }
858  std::optional<unsigned> rhsDim =
859  getDimPosition(rhsMap, accMap.getDimPosition(i));
860  if (rhsDim) {
861  rhsTranspose.push_back(numRhsDimToBroadcast + *rhsDim);
862  } else {
863  // If the parallel dimension doesn't exist we will have to broadcast it.
864  rhsDims.push_back(
865  cast<VectorType>(contractOp.getResultType()).getDimSize(i));
866  rhsTranspose.push_back(rhsDims.size() - 1);
867  }
868  }
869  Value newLhs = contractOp.getLhs();
870  Value newRhs = contractOp.getRhs();
871  Location loc = contractOp.getLoc();
872  if (!lhsDims.empty()) {
873  lhsDims.append(lhsShape.begin(), lhsShape.end());
874  auto expandedType =
875  VectorType::get(lhsDims, contractOp.getLhsType().getElementType());
876  newLhs = rewriter.create<vector::BroadcastOp>(loc, expandedType, newLhs);
877  }
878  if (!rhsDims.empty()) {
879  rhsDims.append(rhsShape.begin(), rhsShape.end());
880  auto expandedType =
881  VectorType::get(rhsDims, contractOp.getRhsType().getElementType());
882  newRhs = rewriter.create<vector::BroadcastOp>(loc, expandedType, newRhs);
883  }
884  bool isInt = contractOp.getLhsType().getElementType().isIntOrIndex();
885  newLhs = rewriter.create<vector::TransposeOp>(loc, newLhs, lhsTranspose);
886  newRhs = rewriter.create<vector::TransposeOp>(loc, newRhs, rhsTranspose);
887  SmallVector<int64_t> lhsOffsets(lhsReductionDims.size(), 0);
888  SmallVector<int64_t> rhsOffsets(rhsReductionDims.size(), 0);
889  newLhs = rewriter.create<vector::ExtractOp>(loc, newLhs, lhsOffsets);
890  newRhs = rewriter.create<vector::ExtractOp>(loc, newRhs, rhsOffsets);
891  std::optional<Value> result =
892  createContractArithOp(loc, newLhs, newRhs, contractOp.getAcc(),
893  contractOp.getKind(), rewriter, isInt);
894  if (result)
895  return *result;
896 
897  return failure();
898  }
899 
900 private:
901  /// Options to control the vector patterns.
902  vector::VectorTransformsOptions vectorTransformOptions;
903  FilterConstraintType filter;
904 };
905 
906 /// Progressive lowering of ContractionOp.
907 /// One:
908 /// %x = vector.contract with at least one free/batch dimension
909 /// is replaced by:
910 /// %a = vector.contract with one less free/batch dimension
911 /// %b = vector.contract with one less free/batch dimension
912 /// ..
913 /// %x = combine %a %b ..
914 /// until a pure contraction is reached (no free/batch dimensions),
915 /// which is replaced by a dot-product.
916 ///
917 /// This only kicks in when either VectorTransformsOptions is set
918 /// to DOT or when other contraction patterns fail.
919 //
920 // TODO: break down into transpose/reshape/cast ops
921 // when they become available to avoid code dup
922 // TODO: investigate lowering order impact on performance
923 FailureOr<Value> ContractionOpLowering::matchAndRewriteMaskableOp(
924  vector::ContractionOp op, MaskingOpInterface maskOp,
925  PatternRewriter &rewriter) const {
926  if (failed(filter(op)))
927  return failure();
928 
929  // TODO: support mixed mode contract lowering.
930  if (op.getLhsType().getElementType() !=
931  getElementTypeOrSelf(op.getAccType()) ||
932  op.getRhsType().getElementType() != getElementTypeOrSelf(op.getAccType()))
933  return failure();
934 
935  // TODO: the code below assumes the default contraction, make sure it supports
936  // other kinds before enabling this lowering.
937  if (op.getKind() != vector::CombiningKind::ADD) {
938  return rewriter.notifyMatchFailure(
939  op, "contractions other than 'add' not supported");
940  }
941 
942  // TODO: implement benefits, cost models.
943  MLIRContext *ctx = op.getContext();
944 
945  ContractionOpToMatmulOpLowering pat1(vectorTransformOptions, ctx);
946  FailureOr<Value> newVal1 =
947  pat1.matchAndRewriteMaskableOp(op, maskOp, rewriter);
948  if (!failed(newVal1))
949  return newVal1;
950 
951  ContractionOpToOuterProductOpLowering pat2(vectorTransformOptions, ctx);
952  FailureOr<Value> newVal2 =
953  pat2.matchAndRewriteMaskableOp(op, maskOp, rewriter);
954  if (!failed(newVal2))
955  return newVal2;
956 
957  ContractionOpToDotLowering pat3(vectorTransformOptions, ctx);
958  FailureOr<Value> newVal3 =
959  pat3.matchAndRewriteMaskableOp(op, maskOp, rewriter);
960  if (!failed(newVal3))
961  return newVal3;
962 
963  ContractOpToElementwise pat4(vectorTransformOptions, ctx);
964  FailureOr<Value> newVal4 =
965  pat4.matchAndRewriteMaskableOp(op, maskOp, rewriter);
966  if (!failed(newVal4))
967  return newVal4;
968 
969  // Vector mask setup.
970 
971  Value mask;
972  if (maskOp)
973  mask = maskOp.getMask();
974  // Find first batch dimension in LHS/RHS, and lower when found.
975  std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
976  if (!batchDimMap.empty()) {
977  int64_t lhsIndex = batchDimMap[0].first;
978  int64_t rhsIndex = batchDimMap[0].second;
979  auto newOp = lowerParallel(rewriter, op, lhsIndex, rhsIndex, mask);
980  if (failed(newOp))
981  return failure();
982  return newOp;
983  }
984 
985  // Collect contracting dimensions.
986  std::vector<std::pair<int64_t, int64_t>> contractingDimMap =
987  op.getContractingDimMap();
988  DenseSet<int64_t> lhsContractingDimSet;
989  DenseSet<int64_t> rhsContractingDimSet;
990  for (auto &dimPair : contractingDimMap) {
991  lhsContractingDimSet.insert(dimPair.first);
992  rhsContractingDimSet.insert(dimPair.second);
993  }
994 
995  // Find first free dimension in LHS, and lower when found.
996  VectorType lhsType = op.getLhsType();
997  for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e; ++lhsIndex) {
998  if (lhsContractingDimSet.count(lhsIndex) == 0) {
999  auto newOp = lowerParallel(rewriter, op, lhsIndex, /*rhsIndex=*/-1, mask);
1000  if (failed(newOp))
1001  return failure();
1002  return newOp;
1003  }
1004  }
1005 
1006  // Find first free dimension in RHS, and lower when found.
1007  VectorType rhsType = op.getRhsType();
1008  for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e; ++rhsIndex) {
1009  if (rhsContractingDimSet.count(rhsIndex) == 0) {
1010  auto newOp = lowerParallel(rewriter, op, /*lhsIndex=*/-1, rhsIndex, mask);
1011  if (failed(newOp))
1012  return failure();
1013  return newOp;
1014  }
1015  }
1016 
1017  // Lower the first remaining reduction dimension.
1018  if (!contractingDimMap.empty()) {
1019  auto newOp = lowerReduction(rewriter, op, mask);
1020  if (failed(newOp))
1021  return failure();
1022  return newOp;
1023  }
1024 
1025  return failure();
1026 }
1027 
// Lower one parallel dimension.
// Incidentally also tolerates unit-size (hence trivial) reduction dimensions.
// TODO: consider reusing existing contract unrolling
//
// Peels the parallel dimension identified by `lhsIndex`/`rhsIndex` (at least
// one must be nonnegative) off the contraction: for each position `d` along
// that dimension, slices of the operands (and of `mask`, when non-null) are
// extracted via reshapeLoad, a lower-rank vector.contract is created for the
// slice, and its result is written back into a zero-initialized accumulator
// via reshapeStore. Returns the fully assembled result vector.
FailureOr<Value> ContractionOpLowering::lowerParallel(PatternRewriter &rewriter,
                                                      vector::ContractionOp op,
                                                      int64_t lhsIndex,
                                                      int64_t rhsIndex,
                                                      Value mask) const {
  VectorType lhsType = op.getLhsType();
  VectorType rhsType = op.getRhsType();
  VectorType resType = cast<VectorType>(op.getResultType());
  // Find the iterator type index and result index.
  SmallVector<AffineMap> iMap = op.getIndexingMapsArray();
  int64_t iterIndex = -1;
  int64_t dimSize = -1;
  if (lhsIndex >= 0) {
    iterIndex = iMap[0].getDimPosition(lhsIndex);
    // When the dimension appears on both sides, both operands must map it to
    // the same iterator.
    if (rhsIndex >= 0 && iterIndex != iMap[1].getDimPosition(rhsIndex))
      return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
        diag << "expected lhsIndex=" << lhsIndex << " and rhsIndex=" << rhsIndex
             << " to map to the same dimension";
      });
    // Unrolling requires a static trip count, which scalable dims lack.
    if (lhsType.getScalableDims()[lhsIndex])
      return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
        diag << "Unrolling scalable dimension (lhsIndex=" << lhsIndex
             << ") is not supported yet";
      });
    dimSize = lhsType.getDimSize(lhsIndex);
  } else if (rhsIndex >= 0) {
    iterIndex = iMap[1].getDimPosition(rhsIndex);
    if (rhsType.getScalableDims()[rhsIndex])
      return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
        diag << "Unrolling scalable dimension (rhsIndex=" << rhsIndex
             << ") is not supported yet";
      });
    dimSize = rhsType.getDimSize(rhsIndex);
  }
  if (iterIndex < 0)
    return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
      diag << "expected either lhsIndex=" << lhsIndex
           << " or rhsIndex=" << rhsIndex << " to be nonnegative";
    });
  // value_or(-1) means that we tolerate a dimension not appearing
  // in the result map. That can't happen for actual parallel iterators, but
  // the caller ContractionOpLowering::matchAndRewrite is currently calling
  // lowerParallel also for the case of unit-size reduction dims appearing only
  // on one of LHS or RHS, not both. At the moment, such cases are created by
  // CastAwayContractionLeadingOneDim, so we need to either support that or
  // modify that pattern.
  int64_t resIndex = getResultIndex(iMap[2], iterIndex).value_or(-1);
  if (resIndex == -1 && dimSize != 1)
    return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
      diag << "expected the dimension for iterIndex=" << iterIndex
           << " to either appear in the result map, or to be a unit dimension";
    });

  // Construct new iterator types and affine map array attribute.
  // adjustMap/adjustIter drop the peeled iterator from each map and from the
  // iterator-type list so the per-slice contraction has one fewer dimension.
  std::array<AffineMap, 3> lowIndexingMaps = {
      adjustMap(iMap[0], iterIndex, rewriter),
      adjustMap(iMap[1], iterIndex, rewriter),
      adjustMap(iMap[2], iterIndex, rewriter)};
  auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
  auto lowIter =
      rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex));
  // Unroll into a series of lower dimensional vector.contract ops.
  Location loc = op.getLoc();
  // Start from an all-zero result and fill in one slice per iteration.
  Value result = rewriter.create<arith::ConstantOp>(
      loc, resType, rewriter.getZeroAttr(resType));

  for (int64_t d = 0; d < dimSize; ++d) {
    auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter);
    auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter);
    auto acc = reshapeLoad(loc, op.getAcc(), resType, resIndex, d, rewriter);

    // Slice the mask along the same iterator so masking stays aligned with
    // the operand slices.
    Value lowMask;
    if (mask)
      lowMask = reshapeLoad(loc, mask, cast<VectorType>(mask.getType()),
                            iterIndex, d, rewriter);

    Operation *lowContract = rewriter.create<vector::ContractionOp>(
        loc, lhs, rhs, acc, lowAffine, lowIter);
    // Re-wrap the smaller contraction in a vector.mask when a mask is present.
    lowContract = maskOperation(rewriter, lowContract, lowMask);
    result = reshapeStore(loc, lowContract->getResult(0), result, resType,
                          resIndex, d, rewriter);
  }
  return result;
}
1115 
1116 // Lower one reduction dimension.
1117 FailureOr<Value> ContractionOpLowering::lowerReduction(
1118  PatternRewriter &rewriter, vector::ContractionOp op, Value mask) const {
1119  auto loc = op.getLoc();
1120  VectorType lhsType = op.getLhsType();
1121  VectorType rhsType = op.getRhsType();
1122  Type resType = op.getResultType();
1123  if (isa<VectorType>(resType))
1124  return rewriter.notifyMatchFailure(op,
1125  "did not expect a VectorType result");
1126  bool isInt = isa<IntegerType>(resType);
1127  // Use iterator index 0.
1128  int64_t iterIndex = 0;
1129  SmallVector<AffineMap> iMap = op.getIndexingMapsArray();
1130  std::optional<int64_t> lookupLhs = getResultIndex(iMap[0], iterIndex);
1131  std::optional<int64_t> lookupRhs = getResultIndex(iMap[1], iterIndex);
1132  if (!lookupLhs.has_value())
1133  return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
1134  diag << "expected iterIndex=" << iterIndex << "to map to a LHS dimension";
1135  });
1136  if (!lookupRhs.has_value())
1137  return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
1138  diag << "expected iterIndex=" << iterIndex << "to map to a RHS dimension";
1139  });
1140  int64_t lhsIndex = *lookupLhs;
1141  int64_t rhsIndex = *lookupRhs;
1142  int64_t dimSize = lhsType.getDimSize(lhsIndex);
1143  if (dimSize != rhsType.getDimSize(rhsIndex))
1144  return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
1145  diag << "expect LHS dimension " << lhsIndex
1146  << " to have the same size as RHS dimension " << rhsIndex;
1147  });
1148  // Base case.
1149  if (lhsType.getRank() == 1) {
1150  if (rhsType.getRank() != 1)
1151  return rewriter.notifyMatchFailure(
1152  op, "When LHS has rank 1, expected also RHS to have rank 1");
1153  Value m = createMul(loc, op.getLhs(), op.getRhs(), isInt, rewriter);
1154  auto kind = vector::CombiningKind::ADD;
1155 
1156  Value acc = op.getAcc();
1157  Operation *reductionOp =
1158  acc ? rewriter.create<vector::ReductionOp>(loc, kind, m, acc)
1159  : rewriter.create<vector::ReductionOp>(loc, kind, m);
1160  return maskOperation(rewriter, reductionOp, mask)->getResult(0);
1161  }
1162  // Construct new iterator types and affine map array attribute.
1163  std::array<AffineMap, 3> lowIndexingMaps = {
1164  adjustMap(iMap[0], iterIndex, rewriter),
1165  adjustMap(iMap[1], iterIndex, rewriter),
1166  adjustMap(iMap[2], iterIndex, rewriter)};
1167  auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
1168  auto lowIter =
1169  rewriter.getArrayAttr(adjustIter(op.getIteratorTypes(), iterIndex));
1170  // Unroll into a series of lower dimensional vector.contract ops.
1171  // By feeding the initial accumulator into the first contraction,
1172  // and the result of each contraction into the next, eventually
1173  // the sum of all reductions is computed.
1174  Value result = op.getAcc();
1175  for (int64_t d = 0; d < dimSize; ++d) {
1176  auto lhs = reshapeLoad(loc, op.getLhs(), lhsType, lhsIndex, d, rewriter);
1177  auto rhs = reshapeLoad(loc, op.getRhs(), rhsType, rhsIndex, d, rewriter);
1178  Value newMask;
1179  if (mask)
1180  newMask = reshapeLoad(loc, mask, cast<VectorType>(mask.getType()),
1181  iterIndex, d, rewriter);
1182 
1183  Operation *newContract = rewriter.create<vector::ContractionOp>(
1184  loc, lhs, rhs, result, lowAffine, lowIter);
1185  result = maskOperation(rewriter, newContract, newMask)->getResult(0);
1186  }
1187  return result;
1188 }
1189 
1190 /// Progressive lowering of OuterProductOp.
1191 /// One:
1192 /// %x = vector.outerproduct %lhs, %rhs, %acc
1193 /// is replaced by:
1194 /// %z = zero-result
1195 /// %0 = vector.extract %lhs[0]
1196 /// %1 = vector.broadcast %0
1197 /// %2 = vector.extract %acc[0]
1198 /// %3 = vector.fma %1, %rhs, %2
1199 /// %4 = vector.insert %3, %z[0]
1200 /// ..
1201 /// %x = vector.insert %.., %..[N-1]
1202 ///
class OuterProductOpLowering : public OpRewritePattern<vector::OuterProductOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::OuterProductOp op,
                                PatternRewriter &rewriter) const override {
    VectorType resType = op.getResultVectorType();
    // Unrolling the outer dimension below requires a static extent; bail out
    // when a >=2-D result is scalable in every dimension.
    if ((resType.getShape().size() >= 2) && resType.allDimsScalable())
      return failure();

    auto loc = op.getLoc();

    VectorType lhsType = op.getOperandVectorTypeLHS();
    // RHS may be a scalar (AXPY case below); dyn_cast yields null then.
    VectorType rhsType = dyn_cast<VectorType>(op.getOperandTypeRHS());
    Type eltType = resType.getElementType();
    bool isInt = isa<IntegerType, IndexType>(eltType);
    Value acc = op.getAcc();
    vector::CombiningKind kind = op.getKind();

    // Vector mask setup: if the op is wrapped in a vector.mask, rewrite (and
    // later replace) the masking op itself, inserting new IR before it.
    OpBuilder::InsertionGuard guard(rewriter);
    auto maskableOp = cast<vector::MaskableOpInterface>(op.getOperation());
    Operation *rootOp;
    Value mask;
    if (maskableOp.isMasked()) {
      rewriter.setInsertionPoint(maskableOp.getMaskingOp());
      rootOp = maskableOp.getMaskingOp();
      mask = maskableOp.getMaskingOp().getMask();
    } else {
      rootOp = op;
    }

    if (!rhsType) {
      // Special case: AXPY operation. Broadcast the scalar RHS to the LHS
      // vector type and combine in a single arithmetic op.
      Value b = rewriter.create<vector::BroadcastOp>(loc, lhsType, op.getRhs());
      std::optional<Value> mult = createContractArithOp(
          loc, op.getLhs(), b, acc, kind, rewriter, isInt, mask);
      if (!mult.has_value())
        return failure();
      rewriter.replaceOp(rootOp, *mult);
      return success();
    }

    // General case: unroll along the outer result dimension. Each LHS element
    // is broadcast against the whole RHS vector, combined with the matching
    // accumulator row, and inserted into a zero-initialized result.
    Value result = rewriter.create<arith::ConstantOp>(
        loc, resType, rewriter.getZeroAttr(resType));
    for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) {
      Value x = rewriter.create<vector::ExtractOp>(loc, op.getLhs(), d);
      Value a = rewriter.create<vector::BroadcastOp>(loc, rhsType, x);
      Value r = nullptr;
      if (acc)
        r = rewriter.create<vector::ExtractOp>(loc, acc, d);
      // Extract the matching row of the mask, if any.
      Value extrMask;
      if (mask)
        extrMask = rewriter.create<vector::ExtractOp>(loc, mask, d);

      std::optional<Value> m = createContractArithOp(
          loc, a, op.getRhs(), r, kind, rewriter, isInt, extrMask);
      if (!m.has_value())
        return failure();
      result = rewriter.create<vector::InsertOp>(loc, *m, result, d);
    }

    rewriter.replaceOp(rootOp, result);
    return success();
  }
};
1269 
1270 /// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul
1271 /// semantics to:
1272 /// ```
1273 /// %mta = maybe_transpose
1274 /// %mtb = maybe_transpose
1275 /// %flattened_a = vector.shape_cast %mta
1276 /// %flattened_b = vector.shape_cast %mtb
1277 /// %flattened_d = vector.matmul %flattened_a, %flattened_b
1278 /// %mtd = vector.shape_cast %flattened_d
1279 /// %d = maybe_untranspose %mtd
1280 /// %e = add %c, %d
1281 /// ```
1282 /// `vector.matmul` later lowers to `llvm.matrix.multiply`.
1283 //
1284 /// This only kicks in when VectorTransformsOptions is set to `Matmul`.
1285 /// vector.transpose operations are inserted if the vector.contract op is not a
1286 /// row-major matrix multiply.
1287 FailureOr<Value> ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp(
1288  vector::ContractionOp op, MaskingOpInterface maskOp,
1289  PatternRewriter &rew) const {
1290  // TODO: Support vector.mask.
1291  if (maskOp)
1292  return failure();
1293 
1294  if (vectorTransformOptions.vectorContractLowering !=
1295  vector::VectorContractLowering::Matmul)
1296  return failure();
1297  if (failed(filter(op)))
1298  return failure();
1299 
1300  auto iteratorTypes = op.getIteratorTypes().getValue();
1301  if (!isParallelIterator(iteratorTypes[0]) ||
1302  !isParallelIterator(iteratorTypes[1]) ||
1303  !isReductionIterator(iteratorTypes[2]))
1304  return failure();
1305 
1306  Type elementType = op.getLhsType().getElementType();
1307  if (!elementType.isIntOrFloat())
1308  return failure();
1309 
1310  Type dstElementType = op.getType();
1311  if (auto vecType = dyn_cast<VectorType>(dstElementType))
1312  dstElementType = vecType.getElementType();
1313  if (elementType != dstElementType)
1314  return failure();
1315 
1316  // Perform lhs + rhs transpositions to conform to matmul row-major semantics.
1317  // Bail out if the contraction cannot be put in this form.
1318  MLIRContext *ctx = op.getContext();
1319  Location loc = op.getLoc();
1320  AffineExpr m, n, k;
1321  bindDims(rew.getContext(), m, n, k);
1322  // LHS must be A(m, k) or A(k, m).
1323  Value lhs = op.getLhs();
1324  auto lhsMap = op.getIndexingMapsArray()[0];
1325  if (lhsMap == AffineMap::get(3, 0, {k, m}, ctx))
1326  lhs = rew.create<vector::TransposeOp>(loc, lhs, ArrayRef<int64_t>{1, 0});
1327  else if (lhsMap != AffineMap::get(3, 0, {m, k}, ctx))
1328  return failure();
1329 
1330  // RHS must be B(k, n) or B(n, k).
1331  Value rhs = op.getRhs();
1332  auto rhsMap = op.getIndexingMapsArray()[1];
1333  if (rhsMap == AffineMap::get(3, 0, {n, k}, ctx))
1334  rhs = rew.create<vector::TransposeOp>(loc, rhs, ArrayRef<int64_t>{1, 0});
1335  else if (rhsMap != AffineMap::get(3, 0, {k, n}, ctx))
1336  return failure();
1337 
1338  // At this point lhs and rhs are in row-major.
1339  VectorType lhsType = cast<VectorType>(lhs.getType());
1340  VectorType rhsType = cast<VectorType>(rhs.getType());
1341  int64_t lhsRows = lhsType.getDimSize(0);
1342  int64_t lhsColumns = lhsType.getDimSize(1);
1343  int64_t rhsColumns = rhsType.getDimSize(1);
1344 
1345  Type flattenedLHSType =
1346  VectorType::get(lhsType.getNumElements(), lhsType.getElementType());
1347  lhs = rew.create<vector::ShapeCastOp>(loc, flattenedLHSType, lhs);
1348 
1349  Type flattenedRHSType =
1350  VectorType::get(rhsType.getNumElements(), rhsType.getElementType());
1351  rhs = rew.create<vector::ShapeCastOp>(loc, flattenedRHSType, rhs);
1352 
1353  Value mul = rew.create<vector::MatmulOp>(loc, lhs, rhs, lhsRows, lhsColumns,
1354  rhsColumns);
1355  mul = rew.create<vector::ShapeCastOp>(
1356  loc,
1357  VectorType::get({lhsRows, rhsColumns},
1358  getElementTypeOrSelf(op.getAcc().getType())),
1359  mul);
1360 
1361  // ACC must be C(m, n) or C(n, m).
1362  auto accMap = op.getIndexingMapsArray()[2];
1363  if (accMap == AffineMap::get(3, 0, {n, m}, ctx))
1364  mul = rew.create<vector::TransposeOp>(loc, mul, ArrayRef<int64_t>{1, 0});
1365  else if (accMap != AffineMap::get(3, 0, {m, n}, ctx))
1366  llvm_unreachable("invalid contraction semantics");
1367 
1368  Value res =
1369  isa<IntegerType>(elementType)
1370  ? static_cast<Value>(rew.create<arith::AddIOp>(loc, op.getAcc(), mul))
1371  : static_cast<Value>(
1372  rew.create<arith::AddFOp>(loc, op.getAcc(), mul));
1373 
1374  return res;
1375 }
1376 } // namespace
1377 
1380  PatternBenefit benefit, bool disableOuterProductLowering) {
1381  if (!disableOuterProductLowering)
1382  patterns.add<OuterProductOpLowering>(patterns.getContext(), benefit);
1383  patterns.add<ContractionOpLowering, ContractionOpToMatmulOpLowering,
1384  ContractionOpToOuterProductOpLowering>(
1385  options, patterns.getContext(), benefit);
1386 }
1387 
1389  RewritePatternSet &patterns, PatternBenefit benefit) {
1390  patterns.add<OuterProductOpLowering>(patterns.getContext(), benefit);
1391 }
static SmallVector< int64_t > getReductionIndex(AffineMap map, ArrayAttr iteratorTypes)
Return the positions of the reductions in the given map.
static Value createMul(Location loc, Value x, Value y, bool isInt, PatternRewriter &rewriter)
Creates a MulIOp if isInt is true otherwise creates a MulFOp using operands x and y.
static Value createAdd(Location loc, Value x, Value y, bool isInt, PatternRewriter &rewriter)
Creates an AddIOp if isInt is true otherwise create an arith::AddFOp using operands x and y.
static std::optional< unsigned > getDimPosition(AffineMap map, unsigned dim)
Look for a given dimension in an affine map and return its position.
static std::optional< Value > createContractArithOp(Location loc, Value x, Value y, Value acc, vector::CombiningKind kind, PatternRewriter &rewriter, bool isInt, Value mask=Value())
Helper to create arithmetic operation associated with a kind of contraction.
static Value reshapeStore(Location loc, Value val, Value result, VectorType type, int64_t index, int64_t pos, PatternRewriter &rewriter)
static AffineMap adjustMap(AffineMap map, int64_t index, PatternRewriter &rewriter)
static Value reshapeLoad(Location loc, Value val, VectorType type, int64_t index, int64_t pos, PatternRewriter &rewriter)
static std::optional< int64_t > getResultIndex(AffineMap map, int64_t index)
static SmallVector< Attribute > adjustIter(ArrayAttr iteratorTypes, int64_t index)
static std::string diag(const llvm::Value &value)
static llvm::ManagedStatic< PassManagerOptions > options
#define MINUI(lhs, rhs)
Base type for affine expression.
Definition: AffineExpr.h:69
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition: AffineMap.h:47
unsigned getDimPosition(unsigned idx) const
Extracts the position of the dimensional expression at the given result, when the caller knows it is ...
Definition: AffineMap.cpp:399
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumDims() const
Definition: AffineMap.cpp:378
unsigned getNumResults() const
Definition: AffineMap.cpp:386
static SmallVector< AffineMap, 4 > inferFromExprList(ArrayRef< ArrayRef< AffineExpr >> exprsList, MLIRContext *context)
Returns a vector of AffineMaps; each with as many results as exprs.size(), as many dims as the larges...
Definition: AffineMap.cpp:296
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:331
MLIRContext * getContext() const
Definition: Builders.h:55
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
Definition: Builders.cpp:273
ArrayAttr getAffineMapArrayAttr(ArrayRef< AffineMap > values)
Definition: Builders.cpp:325
This class contains all of the information necessary to report a diagnostic to the DiagnosticEngine.
Definition: Diagnostics.h:156
This class provides support for representing a failure result, or a valid value of type T.
Definition: LogicalResult.h:78
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:350
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:400
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:464
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
MLIRContext * getContext()
Return the context this operation is associated with.
Definition: Operation.h:216
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
This class represents the benefit of a pattern match in a unitless scheme that ranges from 0 (very li...
Definition: PatternMatch.h:34
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:785
MLIRContext * getContext() const
Definition: PatternMatch.h:822
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Definition: PatternMatch.h:846
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Definition: PatternMatch.h:718
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
Helper StructuredGenerator class to manipulate and rewrite ops with StructuredOpInterface.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition: Types.cpp:119
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:129
This is a builder type that keeps local references to arguments.
Definition: BuiltinTypes.h:305
Builder & dropDim(unsigned pos)
Erase a dim from shape @pos.
Definition: BuiltinTypes.h:330
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:285
void promote(RewriterBase &rewriter, scf::ForallOp forallOp)
Promotes the loop body of a scf::ForallOp to its containing block.
Definition: SCF.cpp:644
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
bool isReductionIterator(Attribute attr)
Returns true if attr has "reduction" iterator type semantics.
Definition: VectorOps.h:140
Operation * maskOperation(OpBuilder &builder, Operation *maskableOp, Value mask, Value passthru=Value())
Creates a vector.mask operation around a maskable operation.
Value selectPassthru(OpBuilder &builder, Value mask, Value newValue, Value passthru)
Creates a vector select operation that picks values from newValue or passthru for each result vector ...
bool isParallelIterator(Attribute attr)
Returns true if attr has "parallel" iterator type semantics.
Definition: VectorOps.h:135
void populateVectorOuterProductLoweringPatterns(RewritePatternSet &patterns, PatternBenefit benefit=1)
Populate the pattern set with the following patterns:
void populateVectorContractLoweringPatterns(RewritePatternSet &patterns, VectorTransformsOptions options, PatternBenefit benefit=1, bool disableOuterProductLowering=false)
Populate the pattern set with the following patterns:
Include the generated interface declarations.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
Definition: AffineExpr.h:349
bool succeeded(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
Definition: LogicalResult.h:68
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
Type getElementTypeOrSelf(Type type)
Return the element type or return the type itself.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
Definition: AffineExpr.cpp:599
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:358
OpRewritePattern(MLIRContext *context, PatternBenefit benefit=1, ArrayRef< StringRef > generatedNames={})
Patterns must specify the root operation name they match against, and can also specify the benefit of...
Definition: PatternMatch.h:362
A pattern for ops that implement MaskableOpInterface and that might be masked (i.e.
Definition: VectorUtils.h:133
Structure to control the behavior of vector transform patterns.