#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Transforms/NarrowTypeEmulationConverter.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>

Macros
#define	DEBUG_TYPE "vector-narrow-type-emulation"

#define	DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

#define	DBGSNL() (llvm::dbgs() << "\n")

#define	LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

Typedefs
using	VectorValue = TypedValue< VectorType >

using	MemRefValue = TypedValue< MemRefType >

using	ExtractNBitsFn = std::function< Value(PatternRewriter &, Location, Value, int, int)>

Functions
static FailureOr< Operation * >	getCompressedMaskOp (OpBuilder &rewriter, Location loc, Value mask, int numSrcElems, int numSrcElemsPerDest, int numFrontPadElems=0)
	Returns a compressed mask for the emulated vector. More...

static Value	staticallyExtractSubvector (OpBuilder &rewriter, Location loc, Value src, int64_t offset, int64_t numElemsToExtract)
	Extracts 1-D subvector from a 1-D vector. More...

static Value	staticallyInsertSubvector (OpBuilder &rewriter, Location loc, Value src, Value dest, int64_t offset)
	Inserts 1-D subvector into a 1-D vector. More...

static Value	dynamicallyExtractSubVector (OpBuilder &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToExtract)
	Extracts 1-D subvector from a 1-D vector. More...

static Value	dynamicallyInsertSubVector (RewriterBase &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToInsert)
	Inserts 1-D subvector into a 1-D vector. More...

static VectorValue	emulatedVectorLoad (OpBuilder &rewriter, Location loc, Value base, OpFoldResult linearizedIndices, int64_t numContainerElemsToLoad, Type emulatedElemTy, Type containerElemTy)
	Emulate a vector load for `emulatedElemTy` using `containerElemTy` More...

static Value	downcastSelectAndUpcast (OpBuilder &builder, Location loc, VectorType downcastType, VectorType upcastType, Value mask, Value trueValue, Value falseValue)
	Downcast two values to `downcastType`, then select values based on `mask`, and casts the result to `upcastType`. More...

static void	atomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value storeIdx, VectorValue valueToStore, Value mask)
	Emits `memref.generic_atomic_rmw` op to store a subbyte-sized value to a byte in `linearizedMemref`, with a mask. More...

static void	nonAtomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value linearizedIndex, VectorValue valueToStore, Value mask)
	Generate a non-atomic read-modify-write sequence for storing to the emulated type. More...

static Value	extractSliceIntoByte (ConversionPatternRewriter &rewriter, Location loc, VectorValue vector, int64_t extractOffset, int64_t sliceNumElements, int64_t insertOffset)
	Extract `sliceNumElements` from source `vector` at `extractOffset`, and insert it into an empty vector at `insertOffset`. More...

static raw_ostream &	operator<< (raw_ostream &os, const SmallVector< SourceElementRangeList > &vec)

static LogicalResult	commonConversionPrecondition (PatternRewriter &rewriter, VectorType preconditionType, Operation *op)
	Verify that the precondition type meets the common preconditions for any conversion. More...

static LogicalResult	alignedConversionPrecondition (PatternRewriter &rewriter, VectorType subByteVecTy, Type containerTy, Operation *op)
	Verify that `subByteVecTy` (vector) and `containerTy` (scalar) are aligned. More...

static Value	bitcastSubByteVectorToI8 (PatternRewriter &rewriter, Location loc, Value subByteVec)
	Bitcasts the aligned `subByteVec` vector to a vector of i8. More...

static Value	extractNBitsPerByteAndSignExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits)
	Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More...

static Value	extractNBitsPerByteAndExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits)
	Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More...

static Value	rewriteI4ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn)
	Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...

static Value	rewriteI2ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn)
	Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...

static Value	rewriteI8ToI4Trunc (PatternRewriter &rewriter, Location loc, Value srcValue)
	Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More...

Macro Definition Documentation

◆ DBGS

#define DBGS ( ) (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

Definition at line 44 of file VectorEmulateNarrowType.cpp.

◆ DBGSNL

#define DBGSNL ( ) (llvm::dbgs() << "\n")

Definition at line 45 of file VectorEmulateNarrowType.cpp.

◆ DEBUG_TYPE

#define DEBUG_TYPE "vector-narrow-type-emulation"

Definition at line 43 of file VectorEmulateNarrowType.cpp.

◆ LDBG

#define LDBG ( X ) LLVM_DEBUG(DBGS() << X << "\n")

Definition at line 46 of file VectorEmulateNarrowType.cpp.

Typedef Documentation

◆ ExtractNBitsFn

using ExtractNBitsFn = std::function<Value(PatternRewriter &, Location, Value, int, int)>

Definition at line 1813 of file VectorEmulateNarrowType.cpp.

◆ MemRefValue

using MemRefValue = TypedValue<MemRefType>

Definition at line 49 of file VectorEmulateNarrowType.cpp.

◆ VectorValue

using VectorValue = TypedValue<VectorType>

Definition at line 48 of file VectorEmulateNarrowType.cpp.

Function Documentation

◆ alignedConversionPrecondition()

static LogicalResult alignedConversionPrecondition	(	PatternRewriter &	rewriter,
		VectorType	subByteVecTy,
		Type	containerTy,
		Operation *	op
	)

static

Verify that subByteVecTy (vector) and containerTy (scalar) are aligned.

Alignment means that subByteVecTy can be packed into a vector of containerTy elements. More specifically:

1. The bit-width of containerTy is a multiple of the bit-width of subByteVecTy elements. For example, for i4 and i16 this multiple is 4.
2. The multiple from 1. above divides evenly the number of the (trailing) elements in subByteVecTy.

EXAMPLE 1: subByteVecTy = vector<2xi4>, and containerTy = i16

2 divides evenly 4 ( = 16 / 4), hence both conditions are met.

EXAMPLE 2: subByteVecTy = vector<3xi4>, and containerTy = i16

3 does not divide evenly 4 (= 16/4), hence the conditions are not met.

EXAMPLE 3: subByteVecTy = vector<3xi3>, and containerTy = i16

16 is not a multiple of 3, hence the conditions are not met.

NOTE: This method assumes that common conversion preconditions are met. In particular, containerTy is assumed to be a multi-byte scalar type (e.g., i8, i16, i32).

Definition at line 1601 of file VectorEmulateNarrowType.cpp.

References mlir::Type::getIntOrFloatBitWidth(), mlir::Type::isIntOrFloat(), and mlir::RewriterBase::notifyMatchFailure().

◆ atomicRMW()

static void atomicRMW	(	OpBuilder &	builder,
		Location	loc,
		MemRefValue	linearizedMemref,
		Value	storeIdx,
		VectorValue	valueToStore,
		Value	mask
	)

static

Emits memref.generic_atomic_rmw op to store a subbyte-sized value to a byte in linearizedMemref, with a mask.

The valueToStore is a vector of subbyte-sized elements with a total size of 8 bits, and the mask is used to select which elements to store.

Inputs: linearizedMemref = |2|2|2|2| : <4xi2> (<1xi8>) storeIdx = 2 valueToStore = |3|3|3|3| : vector<4xi2> mask = |0|0|1|1| : vector<4xi1>

Result: linearizedMemref = |2|2|3|3| : <4xi2> (<1xi8>)

Definition at line 418 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and mlir::OpBuilder::setInsertionPointToStart().

◆ bitcastSubByteVectorToI8()

static Value bitcastSubByteVectorToI8	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	subByteVec
	)

static

Bitcasts the aligned subByteVec vector to a vector of i8.

Where aligned means it satisfies alignedConversionPrecondition().

Example: vector<16x16xi2> -> vector<16x4xi8> vector<16x16xi4> -> vector<16x8xi8>

Definition at line 1719 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI8Type(), and mlir::Value::getType().

Referenced by rewriteI2ToI8Ext(), and rewriteI4ToI8Ext().

◆ commonConversionPrecondition()

static LogicalResult commonConversionPrecondition	(	PatternRewriter &	rewriter,
		VectorType	preconditionType,
		Operation *	op
	)

static

Verify that the precondition type meets the common preconditions for any conversion.

Definition at line 1543 of file VectorEmulateNarrowType.cpp.

References mlir::RewriterBase::notifyMatchFailure().

◆ downcastSelectAndUpcast()

static Value downcastSelectAndUpcast	(	OpBuilder &	builder,
		Location	loc,
		VectorType	downcastType,
		VectorType	upcastType,
		Value	mask,
		Value	trueValue,
		Value	falseValue
	)

static

Downcast two values to downcastType, then select values based on mask, and casts the result to upcastType.

Definition at line 384 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), and mlir::Value::getType().

Referenced by atomicRMW(), and nonAtomicRMW().

◆ dynamicallyExtractSubVector()

static Value dynamicallyExtractSubVector	(	OpBuilder &	rewriter,
		Location	loc,
		Value	src,
		Value	dest,
		OpFoldResult	offset,
		int64_t	numElemsToExtract
	)

static

Extracts 1-D subvector from a 1-D vector.

Given the input rank-1 source vector, extracts numElemsToExtract elements from src, starting at offset. The result is also a rank-1 vector:

vector<numElemsToExtract x !elType>

(!elType is the element type of the source vector). As offset is assumed to be a dynamic SSA value, this helper method generates a sequence of vector.extract + vector.insert pairs.

EXAMPLE: v1 = vector.extract src[offset] : i2 from vector<8xi2> r1 = vector.insert v1, dest[0] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[idx2] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [1] : i2 into vector<3xi2> (...)

Definition at line 284 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), and mlir::Value::getType().

◆ dynamicallyInsertSubVector()

static Value dynamicallyInsertSubVector	(	RewriterBase &	rewriter,
		Location	loc,
		Value	src,
		Value	dest,
		OpFoldResult	offset,
		int64_t	numElemsToInsert
	)

static

Inserts 1-D subvector into a 1-D vector.

Inserts the input rank-1 source vector into the destination vector starting at offset. As offset is assumed to be a dynamic SSA value, this hook uses a sequence of vector.extract + vector.insert pairs.

EXAMPLE: v1 = vector.extract src[0] : i2 from vector<8xi2> r1 = vector.insert v1, dest[offset] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[1] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [idx2] : i2 into vector<3xi2> (...)

Definition at line 327 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), mlir::Value::getType(), and mlir::getValueOrCreateConstantIndexOp().

◆ emulatedVectorLoad()

static VectorValue emulatedVectorLoad	(	OpBuilder &	rewriter,
		Location	loc,
		Value	base,
		OpFoldResult	linearizedIndices,
		int64_t	numContainerElemsToLoad,
		Type	emulatedElemTy,
		Type	containerElemTy
	)

static

Emulate a vector load for emulatedElemTy using containerElemTy

Specifically, use containerElemTy for loading a vector of emulatedElemTy. The load location is given by base and linearizedIndices, and the load size is given by numContainerElemsToLoad.

Definition at line 364 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Type::getIntOrFloatBitWidth(), and mlir::getValueOrCreateConstantIndexOp().

◆ extractNBitsPerByteAndExtendToI8()

static Value extractNBitsPerByteAndExtendToI8	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	src,
		int	bitIdx,
		int	numBits
	)

static

Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index.

The bitIdx starts at 0 from the LSB and moves to the left.

Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 0 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]

The target sequence is [10](decimal=2) as unsigned 2-bit integer. So the result should be [00 00 00 10](decimal=2) as unsigned 8-bit integer.

src = [01 01 10 10] mask = [00 00 00 11] shr = arith.shrui(src, 2) = [00 01 01 10] result = arith.andi(shr, mask) = [00 00 00 10] NOTE: Similarly to extractNBitsPerByteAndSignExtendToI8, this could be achieved by using arith::ShLIOp + arith::ShRUIOp instead of the masking. However, by using arith::ShRUIOp + arith::AndIOp, we are eliminating shift left when the index is 0.

Definition at line 1791 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().

◆ extractNBitsPerByteAndSignExtendToI8()

static Value extractNBitsPerByteAndSignExtendToI8	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	src,
		int	bitIdx,
		int	numBits
	)

static

Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index.

The bitIdx starts at 0 from the LSB and moves to the left.

Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 1 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]

The target sequence is [11](decimal=-1) as signed 2-bit integer. So the result should be [11 11 11 11](decimal=-1) as signed 8-bit integer.

src = [01 01 11 10] shl = arith.shli(src, 4) -> [11 10 00 00] result = arith.shrsi(shl, 6) -> [11 11 11 11]

Definition at line 1749 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().

◆ extractSliceIntoByte()

static Value extractSliceIntoByte	(	ConversionPatternRewriter &	rewriter,
		Location	loc,
		VectorValue	vector,
		int64_t	extractOffset,
		int64_t	sliceNumElements,
		int64_t	insertOffset
	)

static

Extract sliceNumElements from source vector at extractOffset, and insert it into an empty vector at insertOffset.

Inputs: vec_in = |0|1|2|3| : vector<4xi2> extractOffset = 1 sliceNumElements = 2 insertOffset = 2 Output: vec_out = |0|0|1|2| : vector<4xi2>

Definition at line 477 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getZeroAttr(), staticallyExtractSubvector(), and staticallyInsertSubvector().

◆ getCompressedMaskOp()

static FailureOr<Operation *> getCompressedMaskOp	(	OpBuilder &	rewriter,
		Location	loc,
		Value	mask,
		int	numSrcElems,
		int	numSrcElemsPerDest,
		int	numFrontPadElems = `0`
	)

static

Returns a compressed mask for the emulated vector.

For example, when emulating an eight-element i8 vector with i32 (i.e. when the source elements span two dest elements), this method compresses vector<8xi1> into vector<2xi1>.

The compressed/output mask value is set iff any mask in the corresponding numSrcElemsPerDest range of uncompressed/input masks is set. E.g., if numSrcElemsPerDest equals 2, and numFrontPadElems equals 1, the following mask:

mask = [1, 1, 0, 0, 0, 0]

will first be padded in the front with numFrontPadElems zeros, and zeros will be added in the back to make the number of elements a multiple of numSrcElemsPerDest (for easier computation). The resulting mask will be:

mask = [0, 1, 1, 0, 0, 0, 0, 0]

then it will return the following new compressed mask:

mask = [1, 1, 0, 0]

NOTE: numFrontPadElems is assumed to be strictly smaller than numSrcElemsPerDest.

Definition at line 79 of file VectorEmulateNarrowType.cpp.

References mlir::bindSymbols(), mlir::OpBuilder::create(), mlir::detail::divideCeil(), mlir::get(), mlir::DenseElementsAttr::get(), mlir::getAsOpFoldResult(), mlir::Builder::getContext(), mlir::Value::getDefiningOp(), mlir::Builder::getI1Type(), mlir::Operation::getResultTypes(), mlir::getValueOrCreateConstantIndexOp(), and mlir::affine::makeComposedFoldedAffineApply().

◆ nonAtomicRMW()

static void nonAtomicRMW	(	OpBuilder &	builder,
		Location	loc,
		MemRefValue	linearizedMemref,
		Value	linearizedIndex,
		VectorValue	valueToStore,
		Value	mask
	)

static

Generate a non-atomic read-modify-write sequence for storing to the emulated type.

It has similar logic to atomicRMW(), but without atomicity.

Definition at line 449 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and getElementType().

◆ operator<<()

static raw_ostream& operator<<	(	raw_ostream &	os,
		const SmallVector< SourceElementRangeList > &	vec
	)

static

Definition at line 1486 of file VectorEmulateNarrowType.cpp.

References mlir::detail::enumerate().

◆ rewriteI2ToI8Ext()

static Value rewriteI2ToI8Ext	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	srcValue,
		const ExtractNBitsFn &	extFn
	)

static

Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1838 of file VectorEmulateNarrowType.cpp.

References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().

◆ rewriteI4ToI8Ext()

static Value rewriteI4ToI8Ext	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	srcValue,
		const ExtractNBitsFn &	extFn
	)

static

Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1818 of file VectorEmulateNarrowType.cpp.

References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().

◆ rewriteI8ToI4Trunc()

static Value rewriteI8ToI4Trunc	(	PatternRewriter &	rewriter,
		Location	loc,
		Value	srcValue
	)

static

Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.

Definition at line 1873 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI4Type(), and mlir::Value::getType().

◆ staticallyExtractSubvector()

static Value staticallyExtractSubvector	(	OpBuilder &	rewriter,
		Location	loc,
		Value	src,
		int64_t	offset,
		int64_t	numElemsToExtract
	)

static

Extracts 1-D subvector from a 1-D vector.

Given the input rank-1 source vector, extracts numElemsToExtract elements from src, starting at offset. The result is also a rank-1 vector:

vector<numElemsToExtract x !elType>

(!elType is the element type of the source vector). As offset is a known static value, this helper hook emits vector.extract_strided_slice.

EXAMPLE: res = vector.extract_strided_slice src { offsets = [offset], sizes = [numElemsToExtract], strides = [1] }

Definition at line 214 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().

Referenced by extractSliceIntoByte().

◆ staticallyInsertSubvector()

static Value staticallyInsertSubvector	(	OpBuilder &	rewriter,
		Location	loc,
		Value	src,
		Value	dest,
		int64_t	offset
	)

static

Inserts 1-D subvector into a 1-D vector.

Inserts the input rank-1 source vector into the destination vector starting at offset. As offset is a known static value, this helper hook emits vector.insert_strided_slice.

EXAMPLE: res = vector.insert_strided_slice src, dest {offsets = [offset], strides = [1]}

Definition at line 248 of file VectorEmulateNarrowType.cpp.

References mlir::OpBuilder::create(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().

Referenced by extractSliceIntoByte().

Macros

Typedefs

Functions

Macro Definition Documentation

◆ DBGS

◆ DBGSNL

◆ DEBUG_TYPE

◆ LDBG

Typedef Documentation

◆ ExtractNBitsFn

◆ MemRefValue

◆ VectorValue

Function Documentation

◆ alignedConversionPrecondition()

◆ atomicRMW()

◆ bitcastSubByteVectorToI8()

◆ commonConversionPrecondition()

◆ downcastSelectAndUpcast()

◆ dynamicallyExtractSubVector()

◆ dynamicallyInsertSubVector()

◆ emulatedVectorLoad()

◆ extractNBitsPerByteAndExtendToI8()

◆ extractNBitsPerByteAndSignExtendToI8()

◆ extractSliceIntoByte()

◆ getCompressedMaskOp()

◆ nonAtomicRMW()

◆ operator<<()

◆ rewriteI2ToI8Ext()

◆ rewriteI4ToI8Ext()

◆ rewriteI8ToI4Trunc()

◆ staticallyExtractSubvector()

◆ staticallyInsertSubvector()