MLIR  21.0.0git
Macros | Typedefs | Functions
VectorEmulateNarrowType.cpp File Reference
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Transforms/NarrowTypeEmulationConverter.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>

Go to the source code of this file.

Macros

#define DEBUG_TYPE   "vector-narrow-type-emulation"
 
#define DBGS()   (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
#define DBGSNL()   (llvm::dbgs() << "\n")
 
#define LDBG(X)   LLVM_DEBUG(DBGS() << X << "\n")
 

Typedefs

using VectorValue = TypedValue< VectorType >
 
using MemRefValue = TypedValue< MemRefType >
 
using ExtractNBitsFn = std::function< Value(PatternRewriter &, Location, Value, int, int)>
 

Functions

static FailureOr< Operation * > getCompressedMaskOp (OpBuilder &rewriter, Location loc, Value mask, int numSrcElems, int numSrcElemsPerDest, int numFrontPadElems=0)
 Returns a compressed mask for the emulated vector. More...
 
static Value staticallyExtractSubvector (OpBuilder &rewriter, Location loc, Value src, int64_t offset, int64_t numElemsToExtract)
 Extracts 1-D subvector from a 1-D vector. More...
 
static Value staticallyInsertSubvector (OpBuilder &rewriter, Location loc, Value src, Value dest, int64_t offset)
 Inserts 1-D subvector into a 1-D vector. More...
 
static Value dynamicallyExtractSubVector (OpBuilder &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToExtract)
 Extracts 1-D subvector from a 1-D vector. More...
 
static Value dynamicallyInsertSubVector (RewriterBase &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToInsert)
 Inserts 1-D subvector into a 1-D vector. More...
 
static VectorValue emulatedVectorLoad (OpBuilder &rewriter, Location loc, Value base, OpFoldResult linearizedIndices, int64_t numContainerElemsToLoad, Type emulatedElemTy, Type containerElemTy)
 Emulate a vector load for emulatedElemTy using containerElemTy More...
 
static Value downcastSelectAndUpcast (OpBuilder &builder, Location loc, VectorType downcastType, VectorType upcastType, Value mask, Value trueValue, Value falseValue)
 Downcast two values to downcastType, then select values based on mask, and casts the result to upcastType. More...
 
static void atomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value storeIdx, VectorValue valueToStore, Value mask)
 Emits memref.generic_atomic_rmw op to store a subbyte-sized value to a byte in linearizedMemref, with a mask. More...
 
static void nonAtomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value linearizedIndex, VectorValue valueToStore, Value mask)
 Generate a non-atomic read-modify-write sequence for storing to the emulated type. More...
 
static Value extractSliceIntoByte (ConversionPatternRewriter &rewriter, Location loc, VectorValue vector, int64_t extractOffset, int64_t sliceNumElements, int64_t insertOffset)
 Extract sliceNumElements from source vector at extractOffset, and insert it into an empty vector at insertOffset. More...
 
static raw_ostream & operator<< (raw_ostream &os, const SmallVector< SourceElementRangeList > &vec)
 
static LogicalResult commonConversionPrecondition (PatternRewriter &rewriter, VectorType preconditionType, Operation *op)
 Verify that the precondition type meets the common preconditions for any conversion. More...
 
static LogicalResult alignedConversionPrecondition (PatternRewriter &rewriter, VectorType subByteVecTy, Type containerTy, Operation *op)
 Verify that subByteVecTy (vector) and containerTy (scalar) are aligned. More...
 
static Value bitcastSubByteVectorToI8 (PatternRewriter &rewriter, Location loc, Value subByteVec)
 Bitcasts the aligned subByteVec vector to a vector of i8. More...
 
static Value extractNBitsPerByteAndSignExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits)
 Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More...
 
static Value extractNBitsPerByteAndExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits)
 Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More...
 
static Value rewriteI4ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn)
 Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...
 
static Value rewriteI2ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn)
 Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...
 
static Value rewriteI8ToI4Trunc (PatternRewriter &rewriter, Location loc, Value srcValue)
 Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...
 

Macro Definition Documentation

◆ DBGS

#define DBGS ( )    (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

Definition at line 44 of file VectorEmulateNarrowType.cpp.

◆ DBGSNL

#define DBGSNL ( )    (llvm::dbgs() << "\n")

Definition at line 45 of file VectorEmulateNarrowType.cpp.

◆ DEBUG_TYPE

#define DEBUG_TYPE   "vector-narrow-type-emulation"

Definition at line 43 of file VectorEmulateNarrowType.cpp.

◆ LDBG

#define LDBG (   X)    LLVM_DEBUG(DBGS() << X << "\n")

Definition at line 46 of file VectorEmulateNarrowType.cpp.

Typedef Documentation

◆ ExtractNBitsFn

using ExtractNBitsFn = std::function<Value(PatternRewriter &, Location, Value, int, int)>

Definition at line 1813 of file VectorEmulateNarrowType.cpp.

◆ MemRefValue

using MemRefValue = TypedValue<MemRefType>

Definition at line 49 of file VectorEmulateNarrowType.cpp.

◆ VectorValue

using VectorValue = TypedValue<VectorType>

Definition at line 48 of file VectorEmulateNarrowType.cpp.

Function Documentation

◆ alignedConversionPrecondition()

static LogicalResult alignedConversionPrecondition ( PatternRewriter rewriter,
VectorType  subByteVecTy,
Type  containerTy,
Operation op 
)
static

Verify that subByteVecTy (vector) and containerTy (scalar) are aligned.

Alignment means that subByteVecTy can be packed into a vector of containerTy elements. More specifically:

  1. The bit-width of containerTy is a multiple of the bit-width of subByteVecTy elements. For example, for i4 and i16 this multiple is 4.
  2. The multiple from 1. above divides evenly the number of the (trailing) elements in subByteVecTy.

EXAMPLE 1: subByteVecTy = vector<2xi4>, and containerTy = i16

2 divides evenly 4 ( = 16 / 4), hence both conditions are met.

EXAMPLE 2: subByteVecTy = vector<3xi4>, and containerTy = i16

3 does not divide evenly 4 (= 16/4), hence the conditions are not met.

EXAMPLE 3: subByteVecTy = vector<3xi3>, and containerTy = i16

16 is not a multiple of 3, hence the conditions are not met.

NOTE: This method assumes that common conversion preconditions are met. In particular, containerTy is assumed to be a multi-byte scalar type (e.g., i8, i16, i32).

Definition at line 1601 of file VectorEmulateNarrowType.cpp.

References mlir::Type::getIntOrFloatBitWidth(), mlir::Type::isIntOrFloat(), and mlir::RewriterBase::notifyMatchFailure().

◆ atomicRMW()

static void atomicRMW ( OpBuilder builder,
Location  loc,
MemRefValue  linearizedMemref,
Value  storeIdx,
VectorValue  valueToStore,
Value  mask 
)
static

Emits memref.generic_atomic_rmw op to store a subbyte-sized value to a byte in linearizedMemref, with a mask.

The valueToStore is a vector of subbyte-sized elements, with size of 8 bits, and the mask is used to select which elements to store.

Inputs: linearizedMemref = |2|2|2|2| : <4xi2> (<1xi8>) storeIdx = 2 valueToStore = |3|3|3|3| : vector<4xi2> mask = |0|0|1|1| : vector<4xi1>

Result: linearizedMemref = |2|2|3|3| : <4xi2> (<1xi8>)

Definition at line 418 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and mlir::OpBuilder::setInsertionPointToStart().

◆ bitcastSubByteVectorToI8()

static Value bitcastSubByteVectorToI8 ( PatternRewriter rewriter,
Location  loc,
Value  subByteVec 
)
static

Bitcasts the aligned subByteVec vector to a vector of i8.

Here, "aligned" means that subByteVec satisfies the conditions checked by alignedConversionPrecondition().

Example: vector<16x16xi2> -> vector<16x4xi8> vector<16x16xi4> -> vector<16x8xi8>

Definition at line 1719 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI8Type(), and mlir::Value::getType().

Referenced by rewriteI2ToI8Ext(), and rewriteI4ToI8Ext().

◆ commonConversionPrecondition()

static LogicalResult commonConversionPrecondition ( PatternRewriter rewriter,
VectorType  preconditionType,
Operation op 
)
static

Verify that the precondition type meets the common preconditions for any conversion.

Definition at line 1543 of file VectorEmulateNarrowType.cpp.

References mlir::RewriterBase::notifyMatchFailure().

◆ downcastSelectAndUpcast()

static Value downcastSelectAndUpcast ( OpBuilder builder,
Location  loc,
VectorType  downcastType,
VectorType  upcastType,
Value  mask,
Value  trueValue,
Value  falseValue 
)
static

Downcast two values to downcastType, then select values based on mask, and casts the result to upcastType.

Definition at line 384 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), and mlir::Value::getType().

Referenced by atomicRMW(), and nonAtomicRMW().

◆ dynamicallyExtractSubVector()

static Value dynamicallyExtractSubVector ( OpBuilder rewriter,
Location  loc,
Value  src,
Value  dest,
OpFoldResult  offset,
int64_t  numElemsToExtract 
)
static

Extracts 1-D subvector from a 1-D vector.

Given the input rank-1 source vector, extracts numElemsToExtract elements from src, starting at offset. The result is also a rank-1 vector:

vector<numElemsToExtract x !elType>

(!elType is the element type of the source vector). As offset is assumed to be a dynamic SSA value, this helper method generates a sequence of vector.extract + vector.insert pairs.

EXAMPLE: v1 = vector.extract src[offset] : i2 from vector<8xi2> r1 = vector.insert v1, dest[0] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[idx2] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [1] : i2 into vector<3xi2> (...)

Definition at line 284 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), and mlir::Value::getType().

◆ dynamicallyInsertSubVector()

static Value dynamicallyInsertSubVector ( RewriterBase rewriter,
Location  loc,
Value  src,
Value  dest,
OpFoldResult  offset,
int64_t  numElemsToInsert 
)
static

Inserts 1-D subvector into a 1-D vector.

Inserts the input rank-1 source vector into the destination vector starting at offset. As offset is assumed to be a dynamic SSA value, this hook uses a sequence of vector.extract + vector.insert pairs.

EXAMPLE: v1 = vector.extract src[0] : i2 from vector<8xi2> r1 = vector.insert v1, dest[offset] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[1] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [idx2] : i2 into vector<3xi2> (...)

Definition at line 327 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), mlir::Value::getType(), and mlir::getValueOrCreateConstantIndexOp().

◆ emulatedVectorLoad()

static VectorValue emulatedVectorLoad ( OpBuilder rewriter,
Location  loc,
Value  base,
OpFoldResult  linearizedIndices,
int64_t  numContainerElemsToLoad,
Type  emulatedElemTy,
Type  containerElemTy 
)
static

Emulate a vector load for emulatedElemTy using containerElemTy

Specifically, use containerElemTy for loading a vector of emulatedElemTy. The load location is given by base and linearizedIndices, and the load size is given by numContainerElemsToLoad.

Definition at line 364 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Type::getIntOrFloatBitWidth(), and mlir::getValueOrCreateConstantIndexOp().

◆ extractNBitsPerByteAndExtendToI8()

static Value extractNBitsPerByteAndExtendToI8 ( PatternRewriter rewriter,
Location  loc,
Value  src,
int  bitIdx,
int  numBits 
)
static

Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index.

The bitIdx starts at 0 from the LSB and moves to the left.

Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 0 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]

The target sequence is [10](decimal=2) as unsigned 2-bit integer. So the result should be [00 00 00 10](decimal=2) as unsigned 8-bit integer.

src = [01 01 10 10] mask = [00 00 00 11] shr = arith.shrui(src, 2) = [00 01 01 10] result = arith.andi(shr, mask) = [00 00 00 10] NOTE: Similarly to extractNBitsPerByteAndSignExtendToI8, this could be achieved by using arith::ShLIOp + arith::ShRUIOp instead of the masking. However, by using arith::ShRUIOp + arith::AndIOp, we are eliminating shift left when the index is 0.

Definition at line 1791 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().

◆ extractNBitsPerByteAndSignExtendToI8()

static Value extractNBitsPerByteAndSignExtendToI8 ( PatternRewriter rewriter,
Location  loc,
Value  src,
int  bitIdx,
int  numBits 
)
static

Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index.

The bitIdx starts at 0 from the LSB and moves to the left.

Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 1 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]

The target sequence is [11](decimal=-1) as signed 2-bit integer. So the result should be [11 11 11 11](decimal=-1) as signed 8-bit integer.

src = [01 01 11 10] shl = arith.shli(src, 4) -> [11 10 00 00] result = arith.shrsi(shl, 6) -> [11 11 11 11]

Definition at line 1749 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().

◆ extractSliceIntoByte()

static Value extractSliceIntoByte ( ConversionPatternRewriter rewriter,
Location  loc,
VectorValue  vector,
int64_t  extractOffset,
int64_t  sliceNumElements,
int64_t  insertOffset 
)
static

Extract sliceNumElements from source vector at extractOffset, and insert it into an empty vector at insertOffset.

Inputs: vec_in = |0|1|2|3| : vector<4xi2> extractOffset = 1 sliceNumElements = 2 insertOffset = 2 Output: vec_out = |0|0|1|2| : vector<4xi2>

Definition at line 477 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getZeroAttr(), staticallyExtractSubvector(), and staticallyInsertSubvector().

◆ getCompressedMaskOp()

static FailureOr<Operation *> getCompressedMaskOp ( OpBuilder rewriter,
Location  loc,
Value  mask,
int  numSrcElems,
int  numSrcElemsPerDest,
int  numFrontPadElems = 0 
)
static

Returns a compressed mask for the emulated vector.

For example, when emulating an eight-element i8 vector with i32 (i.e. when the source elements span two dest elements), this method compresses vector<8xi1> into vector<2xi1>.

The compressed/output mask value is set iff any mask in the corresponding numSrcElemsPerDest range of uncompressed/input masks is set. E.g., if numSrcElemsPerDest equals to 2, and numFrontPadElems equals to 1, the following mask:

mask = [1, 1, 0, 0, 0, 0]

will first be padded in the front with numFrontPadElems zeros, and zeros will be added in the back to make the number of elements a multiple of numSrcElemsPerDest (for easier computation). The resulting mask will be:

mask = [0, 1, 1, 0, 0, 0, 0, 0]

then it will return the following new compressed mask:

mask = [1, 1, 0, 0]

NOTE: numFrontPadElems is assumed to be strictly smaller than numSrcElemsPerDest.

Definition at line 79 of file VectorEmulateNarrowType.cpp.

References mlir::bindSymbols(), mlir::OpBuilder::create(), mlir::detail::divideCeil(), mlir::get(), mlir::DenseElementsAttr::get(), mlir::getAsOpFoldResult(), mlir::Builder::getContext(), mlir::Value::getDefiningOp(), mlir::Builder::getI1Type(), mlir::Operation::getResultTypes(), mlir::getValueOrCreateConstantIndexOp(), and mlir::affine::makeComposedFoldedAffineApply().

◆ nonAtomicRMW()

static void nonAtomicRMW ( OpBuilder builder,
Location  loc,
MemRefValue  linearizedMemref,
Value  linearizedIndex,
VectorValue  valueToStore,
Value  mask 
)
static

Generate a non-atomic read-modify-write sequence for storing to the emulated type.

It has similar logic to atomicRMW, but without atomicity.

Definition at line 449 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and getElementType().

◆ operator<<()

static raw_ostream& operator<< ( raw_ostream &  os,
const SmallVector< SourceElementRangeList > &  vec 
)
static

Definition at line 1486 of file VectorEmulateNarrowType.cpp.

References mlir::detail::enumerate().

◆ rewriteI2ToI8Ext()

static Value rewriteI2ToI8Ext ( PatternRewriter rewriter,
Location  loc,
Value  srcValue,
const ExtractNBitsFn extFn 
)
static

Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1838 of file VectorEmulateNarrowType.cpp.

References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().

◆ rewriteI4ToI8Ext()

static Value rewriteI4ToI8Ext ( PatternRewriter rewriter,
Location  loc,
Value  srcValue,
const ExtractNBitsFn extFn 
)
static

Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1818 of file VectorEmulateNarrowType.cpp.

References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().

◆ rewriteI8ToI4Trunc()

static Value rewriteI8ToI4Trunc ( PatternRewriter rewriter,
Location  loc,
Value  srcValue 
)
static

Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1873 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI4Type(), and mlir::Value::getType().

◆ staticallyExtractSubvector()

static Value staticallyExtractSubvector ( OpBuilder rewriter,
Location  loc,
Value  src,
int64_t  offset,
int64_t  numElemsToExtract 
)
static

Extracts 1-D subvector from a 1-D vector.

Given the input rank-1 source vector, extracts numElemsToExtract elements from src, starting at offset. The result is also a rank-1 vector:

vector<numElemsToExtract x !elType>

(!elType is the element type of the source vector). As offset is a known static value, this helper hook emits vector.extract_strided_slice.

EXAMPLE: res = vector.extract_strided_slice src { offsets = [offset], sizes = [numElemsToExtract], strides = [1] }

Definition at line 214 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().

Referenced by extractSliceIntoByte().

◆ staticallyInsertSubvector()

static Value staticallyInsertSubvector ( OpBuilder rewriter,
Location  loc,
Value  src,
Value  dest,
int64_t  offset 
)
static

Inserts 1-D subvector into a 1-D vector.

Inserts the input rank-1 source vector into the destination vector starting at offset. As offset is a known static value, this helper hook emits vector.insert_strided_slice.

EXAMPLE: res = vector.insert_strided_slice src, dest {offsets = [offset], strides = [1]}

Definition at line 248 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().

Referenced by extractSliceIntoByte().