doxygen/LowerContractionToSMMLAPattern_8cpp_source.html

 //===- LowerContractionToSMMLAPattern.cpp - Contract to SMMLA ---*- C++ -*-===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements lowering patterns from vector.contract to

 // arm_neon.intr.smmla

 //

 //===----------------------------------------------------------------------===//


 #include "mlir/Dialect/Arith/IR/Arith.h"

 #include "mlir/Dialect/ArmNeon/ArmNeonDialect.h"

 #include "mlir/Dialect/ArmNeon/Transforms.h"

 #include "mlir/Dialect/Func/IR/FuncOps.h"

 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"

 #include "mlir/Dialect/Utils/IndexingUtils.h"

 #include "mlir/Dialect/Vector/IR/VectorOps.h"

 #include "mlir/IR/AffineMap.h"

 #include "mlir/IR/PatternMatch.h"

 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"


 #define DEBUG_TYPE "lower-contract-to-arm-neon"


 using namespace mlir;

 using namespace mlir::arm_neon;


 namespace {


 /// Produce a type that carries `element` as its element type. If `container`
 /// is a ShapedType, the result is `container` cloned with `element`
 /// substituted as the element type; otherwise `element` is returned as-is.
 static Type matchContainerType(Type element, Type container) {
   auto shapedContainer = dyn_cast<ShapedType>(container);
   if (!shapedContainer)
     return element;
   return shapedContainer.clone(element);
 }


 /// Lowering from a vector::ContractionOp to the arm neon smmla intrinsic. This
 /// will tile any vector.contract into multiple smmla instructions with
 /// unrolling so long as [2,2,8] is a divisor of its shape. It can also process
 /// vecmats with dimM = 1 (either explicitly or inferred if LHS has only dimK).
 /// If no unrolling is necessary, a single smmla instruction is emitted.
 ///
 /// The pattern only matches when:
 ///   * the RHS has rank >= 2 and the trailing (K) dimensions of LHS and RHS
 ///     agree;
 ///   * there are at most three iterators, all parallel except the last one,
 ///     which must be a reduction;
 ///   * both LHS and RHS are produced by arith.extsi from vectors whose element
 ///     bit width is <= 8.
 /// All of these conditions are verified before any new operation is created,
 /// so a failed match leaves the IR untouched.
 class LowerContractionToSMMLAPattern
     : public OpRewritePattern<vector::ContractionOp> {
 public:
   using OpRewritePattern::OpRewritePattern;

   /// Match `op` against the constraints listed on the class and, on success,
   /// replace it with a sequence of arm_neon smmla ops (one per [M, N, K] tile)
   /// whose results are inserted into a zero-initialised vector of the
   /// original result type.
   LogicalResult matchAndRewrite(vector::ContractionOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();

     // Infer tile sizes from operands. For vecmat, LHS may only have 1 dim.
     // Note: RHS is not transposed.
     mlir::VectorType lhsType = op.getLhsType();
     mlir::VectorType rhsType = op.getRhsType();
     // Avoid 0-D vectors and 1-D rhs:
     if (!lhsType.hasRank() || !rhsType.hasRank() || rhsType.getRank() < 2)
       return failure();
     auto dimM = lhsType.getRank() == 1 ? 1 : lhsType.getDimSize(0);
     auto dimN = rhsType.getDimSize(0);
     auto dimK = rhsType.getDimSize(1);
     const bool isVecmat = dimM == 1;
     if (lhsType.getDimSize(lhsType.getRank() - 1) !=
         rhsType.getDimSize(rhsType.getRank() - 1)) {
       return failure(); // dimK mismatch
     }
     // Unrolling patterns can handle any [2, 2, 8] shaped multiple of inputs
     // for tiling.
     if ((dimM % 2 != 0 && !isVecmat) || dimN % 2 != 0 || dimK % 8 != 0) {
       return failure();
     }

     // Check iterator types for contract. All iterators except inner-most
     // dimension must be parallel.
     auto iteratorTypes = op.getIteratorTypesArray();
     if (iteratorTypes.size() > 3 ||
         iteratorTypes.back() != vector::IteratorType::reduction) {
       return failure();
     }
     if (llvm::any_of(ArrayRef<vector::IteratorType>(iteratorTypes).drop_back(1),
                      [](vector::IteratorType iteratorType) {
                        return iteratorType != vector::IteratorType::parallel;
                      })) {
       return failure();
     }

     // Check two extsi inputs Rhs Lhs for contract.
     arith::ExtSIOp origLhsExtOp =
         dyn_cast_or_null<arith::ExtSIOp>(op.getLhs().getDefiningOp());
     arith::ExtSIOp origRhsExtOp =
         dyn_cast_or_null<arith::ExtSIOp>(op.getRhs().getDefiningOp());
     if (!origLhsExtOp || !origRhsExtOp) {
       return failure();
     }

     // Match any iX to i32 for X<8 then turn into an i8 output. Feed into
     // following neon instruction. Check inputs for extsi are <=i8.
     // Both inputs are validated *before* any op is created: a pattern must not
     // modify the IR when it ends up returning failure.
     auto lhsExtInType =
         dyn_cast<mlir::VectorType>(origLhsExtOp.getIn().getType());
     auto rhsExtInType =
         dyn_cast<mlir::VectorType>(origRhsExtOp.getIn().getType());
     if (!lhsExtInType || !rhsExtInType ||
         lhsExtInType.getElementTypeBitWidth() > 8 ||
         rhsExtInType.getElementTypeBitWidth() > 8) {
       return failure();
     }

     // From here on the match has succeeded and the IR may be mutated.
     Type targetLhsExtTy =
         matchContainerType(rewriter.getI8Type(), lhsExtInType);
     Value extsiLhs = rewriter.createOrFold<arith::ExtSIOp>(
         loc, targetLhsExtTy, origLhsExtOp.getIn());
     Type targetRhsExtTy =
         matchContainerType(rewriter.getI8Type(), rhsExtInType);
     Value extsiRhs = rewriter.createOrFold<arith::ExtSIOp>(
         loc, targetRhsExtTy, origRhsExtOp.getIn());

     // Initial accumulator for the final result. This is the un-tiled result if
     // tiling is done.
     Value result = rewriter.create<arith::ConstantOp>(
         loc, op.getResultType(), rewriter.getZeroAttr(op.getResultType()));

     SmallVector<int64_t> unrolledSize = *op.getShapeForUnroll();
     SmallVector<int64_t> smmlaShape{2, 8};
     SmallVector<int64_t> loopOrder{0, 1};
     if (unrolledSize.size() == 3) {
       smmlaShape.insert(smmlaShape.begin(), isVecmat ? 1 : 2);
       loopOrder.push_back(2);
     }

     // The indexing maps do not change while tiling; fetch them once instead of
     // rebuilding the array for every operand of every tile.
     const auto indexingMaps = op.getIndexingMapsArray();
     AffineMap lhsPermutationMap = indexingMaps[0];
     AffineMap rhsPermutationMap = indexingMaps[1];
     AffineMap accPermutationMap = indexingMaps[2];

     // Helper to compute the new shape of each operand and extract the slice.
     auto extractOperand = [&](Value operand, AffineMap permutationMap,
                               ArrayRef<int64_t> operandOffsets) {
       SmallVector<int64_t> operandShape =
           applyPermutationMap(permutationMap, ArrayRef<int64_t>(smmlaShape));
       SmallVector<int64_t> operandStrides(operandOffsets.size(), 1);
       return rewriter.createOrFold<vector::ExtractStridedSliceOp>(
           loc, operand, operandOffsets, operandShape, operandStrides);
     };

     // Helper for vecmat: place `tiledOperand` at offset zero of a zero-filled
     // vector of type `expandedType`.
     auto expandForSMMLA = [&](Value tiledOperand, VectorType expandedType) {
       auto emptyOperand = rewriter.create<arith::ConstantOp>(
           loc, expandedType, rewriter.getZeroAttr(expandedType));
       SmallVector<int64_t> insertOffsets(
           cast<ShapedType>(emptyOperand.getType()).getRank(), 0);
       SmallVector<int64_t> insertStrides(
           cast<ShapedType>(tiledOperand.getType()).getRank(), 1);
       return rewriter.createOrFold<vector::InsertStridedSliceOp>(
           loc, tiledOperand, emptyOperand, insertOffsets, insertStrides);
     };

     // Keep track of the previous accumulator when tiling over K.
     Value kAcc;
     for (SmallVector<int64_t> offsets :
          StaticTileOffsetRange(unrolledSize, smmlaShape, loopOrder)) {
       // Extract tiled lhs, rhs, and acc
       SmallVector<int64_t> lhsOffsets =
           applyPermutationMap(lhsPermutationMap, ArrayRef<int64_t>(offsets));
       Value tiledLhs = extractOperand(extsiLhs, lhsPermutationMap, lhsOffsets);
       SmallVector<int64_t> rhsOffsets =
           applyPermutationMap(rhsPermutationMap, ArrayRef<int64_t>(offsets));
       Value tiledRhs = extractOperand(extsiRhs, rhsPermutationMap, rhsOffsets);
       SmallVector<int64_t> accOffsets =
           applyPermutationMap(accPermutationMap, ArrayRef<int64_t>(offsets));
       Value tiledAcc =
           extractOperand(op.getAcc(), accPermutationMap, accOffsets);

       auto inputElementType =
           cast<ShapedType>(tiledLhs.getType()).getElementType();
       auto accElementType =
           cast<ShapedType>(tiledAcc.getType()).getElementType();
       auto inputExpandedType = VectorType::get({2, 8}, inputElementType);
       auto outputExpandedType = VectorType::get({2, 2}, accElementType);

       // With vecmat, tiled LHS and ACC will contain only one of 2 necessary
       // rows along dimM. Expand their shapes to match the smmla op.
       if (isVecmat) {
         tiledLhs = expandForSMMLA(tiledLhs, inputExpandedType);
         tiledAcc = expandForSMMLA(tiledAcc, outputExpandedType);
       }

       // Collapse tiled operands to 1D vectors required by smmla intrinsic
       auto collapsedInputType =
           VectorType::get(inputExpandedType.getNumElements(), inputElementType);
       auto collapsedLhs = rewriter.createOrFold<vector::ShapeCastOp>(
           tiledLhs.getLoc(), collapsedInputType, tiledLhs);
       auto collapsedRhs = rewriter.createOrFold<vector::ShapeCastOp>(
           tiledRhs.getLoc(), collapsedInputType, tiledRhs);
       auto collapsedOutputType =
           VectorType::get(outputExpandedType.getNumElements(), accElementType);

       // The first K tile starts from the accumulator operand; subsequent K
       // tiles chain on the previous smmla result.
       bool initialKAcc = offsets.back() == 0;
       Value collapsedRes;
       if (!initialKAcc) {
         collapsedRes = kAcc;
       } else {
         collapsedRes = rewriter.createOrFold<vector::ShapeCastOp>(
             tiledAcc.getLoc(), collapsedOutputType, tiledAcc);
       }

       // Insert contract op
       kAcc = rewriter.createOrFold<arm_neon::SmmlaOp>(
           op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
           collapsedRhs);

       // Reshape output back to 2D
       Value tiledRes = rewriter.createOrFold<vector::ShapeCastOp>(
           kAcc.getLoc(), tiledAcc.getType(), kAcc);

       // With vecmat, only one row of tiled ACC can be inserted into the final
       // result
       if (isVecmat) {
         tiledRes = rewriter.createOrFold<vector::ExtractOp>(loc, tiledRes, 0);
       }

       // Insert the tiled result back into the non tiled result of the
       // contract op.
       SmallVector<int64_t> strides(
           cast<ShapedType>(tiledRes.getType()).getRank(), 1);
       result = rewriter.createOrFold<vector::InsertStridedSliceOp>(
           loc, tiledRes, result, accOffsets, strides);
     }

     rewriter.replaceOp(op, result);
     return success();
   }
 };


 } // namespace


 /// Add LowerContractionToSMMLAPattern to `patterns`, constructed with the
 /// set's own MLIRContext and a pattern benefit of 1.
 void mlir::arm_neon::populateLowerContractionToSMMLAPatternPatterns(
     RewritePatternSet &patterns) {
   patterns.add<LowerContractionToSMMLAPattern>(patterns.getContext(),
                                                /*benefit=*/1);
 }

ArmNeonDialect.h

FuncOps.h

GreedyPatternRewriteDriver.h

IndexingUtils.h

LLVMDialect.h

PatternMatch.h

VectorOps.h

llvm::ArrayRef
Definition: LLVM.h:48

llvm::SmallVector
Definition: LLVM.h:72

mlir::AffineMap
A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.
Definition: AffineMap.h:46

mlir::Builder::getZeroAttr
TypedAttr getZeroAttr(Type type)
Definition: Builders.cpp:364

mlir::Builder::getI8Type
IntegerType getI8Type()
Definition: Builders.cpp:103

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:66

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir::OpBuilder::createOrFold
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:529

mlir::OpBuilder::create
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:497

mlir::PatternRewriter
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Definition: PatternMatch.h:791

mlir::RewritePatternSet
Definition: PatternMatch.h:814

mlir::RewriterBase::replaceOp
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
Definition: PatternMatch.cpp:133

mlir::StaticTileOffsetRange
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
Definition: IndexingUtils.h:376

mlir::Type
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74

mlir::Value
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96

mlir::Value::getType
Type getType() const
Return the type of this value.
Definition: Value.h:129

mlir::Value::getLoc
Location getLoc() const
Return the location of this value.
Definition: Value.cpp:26

Arith.h

Transforms.h

AffineMap.h

mlir::arm_neon
Definition: Transforms.h:15

mlir::arm_neon::populateLowerContractionToSMMLAPatternPatterns
void populateLowerContractionToSMMLAPatternPatterns(RewritePatternSet &patterns)
Definition: LowerContractionToSMMLAPattern.cpp:238

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::applyPermutationMap
SmallVector< T > applyPermutationMap(AffineMap map, llvm::ArrayRef< T > source)
Apply a permutation from map to source and return the result.
Definition: AffineMap.h:675

mlir::patterns
const FrozenRewritePatternSet & patterns
Definition: GreedyPatternRewriteDriver.h:233

mlir::get
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Definition: BytecodeImplementation.h:509

mlir::isVecmat
bool isVecmat(ArrayAttr indexingMaps)
Tests whether the given maps describe a vector matrix multiplication.
Definition: StructuredOpsUtils.cpp:99

mlir::OpRewritePattern
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...
Definition: PatternMatch.h:358

mlir::OpRewritePattern::OpRewritePattern
OpRewritePattern(MLIRContext *context, PatternBenefit benefit=1, ArrayRef< StringRef > generatedNames={})
Patterns must specify the root operation name they match against, and can also specify the benefit of...
Definition: PatternMatch.h:362