9 #ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMS_H
10 #define MLIR_DIALECT_X86VECTOR_TRANSFORMS_H
16 class ImplicitLocOpBuilder;
17 class LLVMConversionTarget;
18 class LLVMTypeConverter;
19 class RewritePatternSet;
27 template <uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4,
28 uint8_t b5, uint8_t b6, uint8_t b7>
30 static_assert(b0 <= 1 && b1 <= 1 && b2 <= 1 && b3 <= 1,
"overflow");
31 static_assert(b4 <= 1 && b5 <= 1 && b6 <= 1 && b7 <= 1,
"overflow");
32 return static_cast<uint8_t
>((b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) |
33 (b3 << 3) | (b2 << 2) | (b1 << 1) | b0);
37 static void extractBlend(uint8_t mask, uint8_t &b0, uint8_t &b1, uint8_t &b2,
38 uint8_t &b3, uint8_t &b4, uint8_t &b5, uint8_t &b6,
51 template <
unsigned b67,
unsigned b45,
unsigned b23,
unsigned b01>
53 static_assert(b01 <= 0x03,
"overflow");
54 static_assert(b23 <= 0x03,
"overflow");
55 static_assert(b45 <= 0x03,
"overflow");
56 static_assert(b67 <= 0x03,
"overflow");
57 return static_cast<uint8_t
>((b67 << 6) | (b45 << 4) | (b23 << 2) | b01);
61 uint8_t &b45, uint8_t &b67) {
62 b67 = (mask & (0x03 << 6)) >> 6;
63 b45 = (mask & (0x03 << 4)) >> 4;
64 b23 = (mask & (0x03 << 2)) >> 2;
69 template <
unsigned b47,
unsigned b03>
71 static_assert(b03 <= 0x0f,
"overflow");
72 static_assert(b47 <= 0x0f,
"overflow");
73 return static_cast<uint8_t
>((b47 << 4) + b03);
77 b47 = (mask & (0x0f << 4)) >> 4;
96 namespace inline_asm {
179 const LLVMTypeConverter &converter, RewritePatternSet &
patterns);
static llvm::ManagedStatic< PassManagerOptions > options
ImplicitLocOpBuilder maintains a 'current location', allowing use of the create<> method without spec...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Value mm256BlendPsAsm(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask)
Methods in the inline_asm namespace emit calls to LLVM::InlineAsmOp.
Value mm256UnpackHiPs(ImplicitLocOpBuilder &b, Value v1, Value v2)
Lower to vector.shuffle v1, v2, [2, 10, 3, 11, 6, 14, 7, 15].
Value mm256BlendPs(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask)
If bit i of mask is zero, take f32@i from v1 else take it from v2.
Value mm256Permute2f128Ps(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask)
Value mm256ShufflePs(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask)
Take an 8-bit mask, 2 bits for each position of a[0, 4) and b[0, 4). Result layout over bits 0:127 | 128:255 is a a b b | a a b b ...
Value mm256UnpackLoPs(ImplicitLocOpBuilder &b, Value v1, Value v2)
Methods in the intrin namespace emulate clang's impl. of X86 intrinsics.
void transpose8x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef< Value > vs)
8x8xf32-specific AVX2 transpose lowering.
void populateSpecializedTransposeLoweringPatterns(RewritePatternSet &patterns, LoweringOptions options=LoweringOptions(), int benefit=10)
Insert specialized transpose lowering patterns.
void transpose4x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef< Value > vs)
Generic lowerings may either use intrin or inline_asm depending on needs.
Include the generated interface declarations.
void populateX86VectorLegalizeForLLVMExportPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Collect a set of patterns to lower X86Vector ops to ops that map to LLVM intrinsics.
const FrozenRewritePatternSet & patterns
void configureX86VectorLegalizeForExportTarget(LLVMConversionTarget &target)
Configure the target to support lowering X86Vector ops to ops that map to LLVM intrinsics.
Helper class to factor out the creation and extraction of masks from nibs.
static void extractBlend(uint8_t mask, uint8_t &b0, uint8_t &b1, uint8_t &b2, uint8_t &b3, uint8_t &b4, uint8_t &b5, uint8_t &b6, uint8_t &b7)
b0 captures the lowest bit, b7 captures the highest bit.
static void extractPermute(uint8_t mask, uint8_t &b03, uint8_t &b47)
b03 captures the lower 4 bits, b47 captures the higher 4 bits.
static void extractShuffle(uint8_t mask, uint8_t &b01, uint8_t &b23, uint8_t &b45, uint8_t &b67)
b01 captures the lower 2 bits, b67 captures the higher 2 bits.
static uint8_t shuffle()
b01 captures the lower 2 bits, b67 captures the higher 2 bits.
static uint8_t blend()
b0 captures the lowest bit, b7 captures the highest bit.
static uint8_t permute()
b03 captures the lower 4 bits, b47 captures the higher 4 bits.
Options for controlling specialized AVX2 lowerings.
LoweringOptions & setTransposeOptions(TransposeLoweringOptions options)
TransposeLoweringOptions transposeOptions
Configure specialized vector lowerings.
Structure to control the behavior of specialized AVX2 transpose lowering.
TransposeLoweringOptions & lower8x8xf32(bool lower=true)
TransposeLoweringOptions & lower4x8xf32(bool lower=true)