//===- AVXTranspose.cpp - Lower Vector transpose to AVX ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements vector.transpose rewrites as AVX patterns for particular
// sizes of interest.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;
using namespace mlir::vector;
using namespace mlir::x86vector;
using namespace mlir::x86vector::avx2;
using namespace mlir::x86vector::avx2::inline_asm;
using namespace mlir::x86vector::avx2::intrin;

Value mlir::x86vector::avx2::inline_asm::mm256BlendPsAsm(
    ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask) {
  auto asmDialectAttr =
      LLVM::AsmDialectAttr::get(b.getContext(), LLVM::AsmDialect::AD_Intel);
  const auto *asmTp = "vblendps $0, $1, $2, {0}";
  const auto *asmCstr =
      "=x,x,x"; // Careful: constraint parser is very brittle: no ws!
  SmallVector<Value> asmVals{v1, v2};
  auto asmStr = llvm::formatv(asmTp, llvm::format_hex(mask, /*width=*/2)).str();
  auto asmOp = b.create<LLVM::InlineAsmOp>(
      v1.getType(), /*operands=*/asmVals, /*asm_string=*/asmStr,
      /*constraints=*/asmCstr, /*has_side_effects=*/false,
      /*is_align_stack=*/false, LLVM::TailCallKind::None,
      /*asm_dialect=*/asmDialectAttr,
      /*operand_attrs=*/ArrayAttr());
  return asmOp.getResult(0);
}
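// For illustration only: with mask = 0xcc, the builder above is expected to
// emit an op roughly of the form (a sketch; the exact llvm.inline_asm printed
// syntax varies across MLIR versions):
//   %r = llvm.inline_asm asm_dialect = intel
//            "vblendps $0, $1, $2, 0xcc", "=x,x,x" %v1, %v2
//          : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>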

Value mlir::x86vector::avx2::intrin::mm256UnpackLoPs(ImplicitLocOpBuilder &b,
                                                     Value v1, Value v2) {
  return b.create<vector::ShuffleOp>(
      v1, v2, ArrayRef<int64_t>{0, 8, 1, 9, 4, 12, 5, 13});
}

Value mlir::x86vector::avx2::intrin::mm256UnpackHiPs(ImplicitLocOpBuilder &b,
                                                     Value v1, Value v2) {
  return b.create<vector::ShuffleOp>(
      v1, v2, ArrayRef<int64_t>{2, 10, 3, 11, 6, 14, 7, 15});
}
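// Note: the two intrin:: helpers above emulate _mm256_unpacklo_ps and
// _mm256_unpackhi_ps with plain vector.shuffle ops: they interleave the low
// (resp. high) halves of each 128-bit lane of v1 and v2, and the LLVM backend
// is expected to select vunpcklps / vunpckhps for them.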
///                            a  a   b   b  a  a   b   b
/// Takes an 8-bit mask, 2 bits for each position of a[0, 3) **and** b[0, 4):
///                                 0:127    |         128:255
///                            b01  b23  C8  D8  |  b01+4 b23+4 C8+4 D8+4
Value mlir::x86vector::avx2::intrin::mm256ShufflePs(ImplicitLocOpBuilder &b,
                                                    Value v1, Value v2,
                                                    uint8_t mask) {
  uint8_t b01, b23, b45, b67;
  MaskHelper::extractShuffle(mask, b01, b23, b45, b67);
  SmallVector<int64_t> shuffleMask = {
      b01, b23, b45 + 8, b67 + 8, b01 + 4, b23 + 4, b45 + 8 + 4, b67 + 8 + 4};
  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
}
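// Note: this mirrors _mm256_shuffle_ps. Within each 128-bit lane, the first
// two result elements come from v1 and the last two from v2, each selected by
// a 2-bit field of `mask`.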

// imm[0:1] out of imm[0:3] is:
//    0             1           2             3
// a[0:127] or a[128:255] or b[0:127] or b[128:255]    |
//           a[0:127] or a[128:255] or b[0:127] or b[128:255]
//              0             1           2             3
// imm[0:1] out of imm[4:7].
Value mlir::x86vector::avx2::intrin::mm256Permute2f128Ps(
    ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask) {
  SmallVector<int64_t> shuffleMask;
  auto appendToMask = [&](uint8_t control) {
    if (control == 0)
      llvm::append_range(shuffleMask, ArrayRef<int64_t>{0, 1, 2, 3});
    else if (control == 1)
      llvm::append_range(shuffleMask, ArrayRef<int64_t>{4, 5, 6, 7});
    else if (control == 2)
      llvm::append_range(shuffleMask, ArrayRef<int64_t>{8, 9, 10, 11});
    else if (control == 3)
      llvm::append_range(shuffleMask, ArrayRef<int64_t>{12, 13, 14, 15});
    else
      llvm_unreachable("control > 3 : overflow");
  };
  uint8_t b03, b47;
  MaskHelper::extractPermute(mask, b03, b47);
  appendToMask(b03);
  appendToMask(b47);
  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
}
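// Note: this emulates _mm256_permute2f128_ps: each 128-bit half of the result
// is one of the four 128-bit halves of the (v1, v2) pair, chosen by the
// corresponding control field of `mask`.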

/// If bit i of `mask` is zero, take f32@i from v1 else take it from v2.
Value mlir::x86vector::avx2::intrin::mm256BlendPs(ImplicitLocOpBuilder &b,
                                                  Value v1, Value v2,
                                                  uint8_t mask) {
  SmallVector<int64_t, 8> shuffleMask;
  for (int i = 0; i < 8; ++i) {
    bool isSet = mask & (1 << i);
    shuffleMask.push_back(!isSet ? i : i + 8);
  }
  return b.create<vector::ShuffleOp>(v1, v2, shuffleMask);
}
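// Example: mask = 0xcc (0b11001100) keeps elements 0, 1, 4, 5 from v1 and
// takes elements 2, 3, 6, 7 from v2, i.e. it builds the shuffle mask
// [0, 1, 10, 11, 4, 5, 14, 15].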

/// AVX2 4x8xf32-specific transpose lowering using a "C intrinsics" model.
void mlir::x86vector::avx2::transpose4x8xf32(ImplicitLocOpBuilder &ib,
                                             MutableArrayRef<Value> vs) {
#ifndef NDEBUG
  auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
  assert(vs.size() == 4 && "expects 4 vectors");
  assert(llvm::all_of(ValueRange{vs}.getTypes(),
                      [&](Type t) { return t == vt; }) &&
         "expects all types to be vector<8xf32>");
#endif

  Value t0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
  Value t1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
  Value t2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
  Value t3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
  Value s0 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<1, 0, 1, 0>());
  Value s1 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<3, 2, 3, 2>());
  Value s2 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<1, 0, 1, 0>());
  Value s3 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<3, 2, 3, 2>());
  vs[0] = mm256Permute2f128Ps(ib, s0, s1, MaskHelper::permute<2, 0>());
  vs[1] = mm256Permute2f128Ps(ib, s2, s3, MaskHelper::permute<2, 0>());
  vs[2] = mm256Permute2f128Ps(ib, s0, s1, MaskHelper::permute<3, 1>());
  vs[3] = mm256Permute2f128Ps(ib, s2, s3, MaskHelper::permute<3, 1>());
}
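// Illustration: if the inputs are the rows A, B, C, D of a 4x8 matrix, then on
// return vs[i] packs rows 2*i and 2*i+1 of the transposed 8x4 matrix, e.g.
// vs[0] = [a0 b0 c0 d0 a1 b1 c1 d1]; the caller (TransposeOpLowering below)
// reassembles the 8x4 shape with shape casts.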

/// AVX2 8x8xf32-specific transpose lowering using a "C intrinsics" model.
void mlir::x86vector::avx2::transpose8x8xf32(ImplicitLocOpBuilder &ib,
                                             MutableArrayRef<Value> vs) {
  auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
  (void)vt;
  assert(vs.size() == 8 && "expects 8 vectors");
  assert(llvm::all_of(ValueRange{vs}.getTypes(),
                      [&](Type t) { return t == vt; }) &&
         "expects all types to be vector<8xf32>");

  Value t0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
  Value t1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
  Value t2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
  Value t3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
  Value t4 = mm256UnpackLoPs(ib, vs[4], vs[5]);
  Value t5 = mm256UnpackHiPs(ib, vs[4], vs[5]);
  Value t6 = mm256UnpackLoPs(ib, vs[6], vs[7]);
  Value t7 = mm256UnpackHiPs(ib, vs[6], vs[7]);

  using inline_asm::mm256BlendPsAsm;
  Value sh0 = mm256ShufflePs(ib, t0, t2, MaskHelper::shuffle<1, 0, 3, 2>());
  Value sh2 = mm256ShufflePs(ib, t1, t3, MaskHelper::shuffle<1, 0, 3, 2>());
  Value sh4 = mm256ShufflePs(ib, t4, t6, MaskHelper::shuffle<1, 0, 3, 2>());
  Value sh6 = mm256ShufflePs(ib, t5, t7, MaskHelper::shuffle<1, 0, 3, 2>());

  Value s0 =
      mm256BlendPsAsm(ib, t0, sh0, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
  Value s1 =
      mm256BlendPsAsm(ib, t2, sh0, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
  Value s2 =
      mm256BlendPsAsm(ib, t1, sh2, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
  Value s3 =
      mm256BlendPsAsm(ib, t3, sh2, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
  Value s4 =
      mm256BlendPsAsm(ib, t4, sh4, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
  Value s5 =
      mm256BlendPsAsm(ib, t6, sh4, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());
  Value s6 =
      mm256BlendPsAsm(ib, t5, sh6, MaskHelper::blend<0, 0, 1, 1, 0, 0, 1, 1>());
  Value s7 =
      mm256BlendPsAsm(ib, t7, sh6, MaskHelper::blend<1, 1, 0, 0, 1, 1, 0, 0>());

  vs[0] = mm256Permute2f128Ps(ib, s0, s4, MaskHelper::permute<2, 0>());
  vs[1] = mm256Permute2f128Ps(ib, s1, s5, MaskHelper::permute<2, 0>());
  vs[2] = mm256Permute2f128Ps(ib, s2, s6, MaskHelper::permute<2, 0>());
  vs[3] = mm256Permute2f128Ps(ib, s3, s7, MaskHelper::permute<2, 0>());
  vs[4] = mm256Permute2f128Ps(ib, s0, s4, MaskHelper::permute<3, 1>());
  vs[5] = mm256Permute2f128Ps(ib, s1, s5, MaskHelper::permute<3, 1>());
  vs[6] = mm256Permute2f128Ps(ib, s2, s6, MaskHelper::permute<3, 1>());
  vs[7] = mm256Permute2f128Ps(ib, s3, s7, MaskHelper::permute<3, 1>());
}
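// Note: the blends in transpose8x8xf32 go through the inline_asm helper
// (mm256BlendPsAsm) rather than the shuffle-based mm256BlendPs, presumably so
// that instruction selection reliably emits vblendps for these steps; the
// intrin:: form remains available as an alternative.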

/// Rewrite AVX2-specific vector.transpose, for the supported cases and
/// depending on the `TransposeLoweringOptions`. The lowering supports 2-D
/// transpose cases and n-D cases that have been decomposed into 2-D
/// transposition slices. For example, a 3-D transpose:
///
///   %0 = vector.transpose %arg0, [2, 0, 1]
///      : vector<1024x2048x4096xf32> to vector<4096x1024x2048xf32>
///
/// could be sliced into 2-D transposes by tiling two of its dimensions to one
/// of the vector lengths supported by the AVX2 patterns (e.g., 4x8):
///
///   %0 = vector.transpose %arg0, [2, 0, 1]
///      : vector<1x4x8xf32> to vector<8x1x4xf32>
///
/// This lowering will analyze the n-D vector.transpose and determine if it's a
/// supported 2-D transposition slice where any of the AVX2 patterns can be
/// applied.
class TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> {
public:
  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;

  TransposeOpLowering(LoweringOptions loweringOptions, MLIRContext *context,
                      int benefit)
      : OpRewritePattern<vector::TransposeOp>(context, benefit),
        loweringOptions(loweringOptions) {}

  LogicalResult matchAndRewrite(vector::TransposeOp op,
                                PatternRewriter &rewriter) const override {
    auto loc = op.getLoc();

    // Check if the source vector type is supported. AVX2 patterns can only be
    // applied to f32 vector types with two dimensions greater than one.
    VectorType srcType = op.getSourceVectorType();
    if (!srcType.getElementType().isF32())
      return rewriter.notifyMatchFailure(op, "Unsupported vector element type");

    auto srcGtOneDims = mlir::vector::isTranspose2DSlice(op);
    if (failed(srcGtOneDims))
      return rewriter.notifyMatchFailure(
          op, "expected transposition on a 2D slice");

    // Retrieve the sizes of the two dimensions greater than one to be
    // transposed.
    int64_t m = srcType.getDimSize(std::get<0>(srcGtOneDims.value()));
    int64_t n = srcType.getDimSize(std::get<1>(srcGtOneDims.value()));

    auto applyRewrite = [&]() {
      ImplicitLocOpBuilder ib(loc, rewriter);
      SmallVector<Value> vs;

      // Reshape the n-D input vector with only two dimensions greater than one
      // to a 2-D vector.
      auto flattenedType =
          VectorType::get({n * m}, op.getSourceVectorType().getElementType());
      auto reshInputType = VectorType::get({m, n}, srcType.getElementType());
      auto reshInput =
          ib.create<vector::ShapeCastOp>(flattenedType, op.getVector());
      reshInput = ib.create<vector::ShapeCastOp>(reshInputType, reshInput);

      // Extract 1-D vectors from the higher-order dimension of the input
      // vector.
      for (int64_t i = 0; i < m; ++i)
        vs.push_back(ib.create<vector::ExtractOp>(reshInput, i));

      // Transpose the set of 1-D vectors.
      if (m == 4)
        transpose4x8xf32(ib, vs);
      if (m == 8)
        transpose8x8xf32(ib, vs);

      // Insert the transposed 1-D vectors into the higher-order dimension of
      // the output vector.
      Value res = ib.create<arith::ConstantOp>(reshInputType,
                                               ib.getZeroAttr(reshInputType));
      for (int64_t i = 0; i < m; ++i)
        res = ib.create<vector::InsertOp>(vs[i], res, i);

      // The output vector still has the shape of the input vector (e.g., 4x8).
      // We still have to transpose its dimensions and restore its original
      // rank (e.g., 1x8x1x4x1).
      res = ib.create<vector::ShapeCastOp>(flattenedType, res);
      res = ib.create<vector::ShapeCastOp>(op.getResultVectorType(), res);
      rewriter.replaceOp(op, res);
      return success();
    };

    if (loweringOptions.transposeOptions.lower4x8xf32_ && m == 4 && n == 8)
      return applyRewrite();
    if (loweringOptions.transposeOptions.lower8x8xf32_ && m == 8 && n == 8)
      return applyRewrite();
    return failure();
  }

private:
  LoweringOptions loweringOptions;
};
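// For illustration, assuming the 4x8 lowering is enabled: a transpose such as
//   %0 = vector.transpose %arg0, [1, 0] : vector<4x8xf32> to vector<8x4xf32>
// is rewritten into shape casts and extracts feeding the transpose4x8xf32
// sequence above, followed by inserts and shape casts back to vector<8x4xf32>.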

void mlir::x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
    RewritePatternSet &patterns, LoweringOptions options, int benefit) {
  patterns.add<TransposeOpLowering>(options, patterns.getContext(), benefit);
}
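// Usage sketch (not part of this file; the pass boilerplate, the option
// builder calls, and the greedy driver call are assumptions about the
// surrounding code, not something this file defines):
//   RewritePatternSet patterns(&getContext());
//   x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
//       patterns, x86vector::avx2::LoweringOptions().setTransposeOptions(
//                     x86vector::avx2::TransposeLoweringOptions()
//                         .lower4x8xf32()
//                         .lower8x8xf32()),
//       /*benefit=*/10);
//   (void)applyPatternsGreedily(getOperation(), std::move(patterns));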