MLIR
21.0.0git
|
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Transforms/NarrowTypeEmulationConverter.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
Go to the source code of this file.
Macros | |
#define | DEBUG_TYPE "vector-narrow-type-emulation" |
#define | DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
#define | DBGSNL() (llvm::dbgs() << "\n") |
#define | LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
Typedefs | |
using | VectorValue = TypedValue< VectorType > |
using | MemRefValue = TypedValue< MemRefType > |
using | ExtractNBitsFn = std::function< Value(PatternRewriter &, Location, Value, int, int)> |
Functions | |
static FailureOr< Operation * > | getCompressedMaskOp (OpBuilder &rewriter, Location loc, Value mask, int numSrcElems, int numSrcElemsPerDest, int numFrontPadElems=0) |
Returns a compressed mask for the emulated vector. More... | |
static Value | staticallyExtractSubvector (OpBuilder &rewriter, Location loc, Value src, int64_t offset, int64_t numElemsToExtract) |
Extracts 1-D subvector from a 1-D vector. More... | |
static Value | staticallyInsertSubvector (OpBuilder &rewriter, Location loc, Value src, Value dest, int64_t offset) |
Inserts 1-D subvector into a 1-D vector. More... | |
static Value | dynamicallyExtractSubVector (OpBuilder &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToExtract) |
Extracts 1-D subvector from a 1-D vector. More... | |
static Value | dynamicallyInsertSubVector (RewriterBase &rewriter, Location loc, Value src, Value dest, OpFoldResult offset, int64_t numElemsToInsert) |
Inserts 1-D subvector into a 1-D vector. More... | |
static VectorValue | emulatedVectorLoad (OpBuilder &rewriter, Location loc, Value base, OpFoldResult linearizedIndices, int64_t numContainerElemsToLoad, Type emulatedElemTy, Type containerElemTy) |
Emulate a vector load for emulatedElemTy using containerElemTy More... | |
static Value | downcastSelectAndUpcast (OpBuilder &builder, Location loc, VectorType downcastType, VectorType upcastType, Value mask, Value trueValue, Value falseValue) |
Downcast two values to downcastType , then select values based on mask , and casts the result to upcastType . More... | |
static void | atomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value storeIdx, VectorValue valueToStore, Value mask) |
Emits memref.generic_atomic_rmw op to store a subbyte-sized value to a byte in linearizedMemref , with a mask. More... | |
static void | nonAtomicRMW (OpBuilder &builder, Location loc, MemRefValue linearizedMemref, Value linearizedIndex, VectorValue valueToStore, Value mask) |
Generate a non-atomic read-modify-write sequence for storing to the emulated type. More... | |
static Value | extractSliceIntoByte (ConversionPatternRewriter &rewriter, Location loc, VectorValue vector, int64_t extractOffset, int64_t sliceNumElements, int64_t insertOffset) |
Extract sliceNumElements from source vector at extractOffset , and insert it into an empty vector at insertOffset . More... | |
static raw_ostream & | operator<< (raw_ostream &os, const SmallVector< SourceElementRangeList > &vec) |
static LogicalResult | commonConversionPrecondition (PatternRewriter &rewriter, VectorType preconditionType, Operation *op) |
Verify that the precondition type meets the common preconditions for any conversion. More... | |
static LogicalResult | alignedConversionPrecondition (PatternRewriter &rewriter, VectorType subByteVecTy, Type containerTy, Operation *op) |
Verify that subByteVecTy (vector) and containerTy (scalar) are aligned. More... | |
static Value | bitcastSubByteVectorToI8 (PatternRewriter &rewriter, Location loc, Value subByteVec) |
Bitcasts the aligned subByteVec vector to a vector of i8. More... | |
static Value | extractNBitsPerByteAndSignExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits) |
Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More... | |
static Value | extractNBitsPerByteAndExtendToI8 (PatternRewriter &rewriter, Location loc, Value src, int bitIdx, int numBits) |
Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index. More... | |
static Value | rewriteI4ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn) |
Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More... | |
static Value | rewriteI2ToI8Ext (PatternRewriter &rewriter, Location loc, Value srcValue, const ExtractNBitsFn &extFn) |
Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More... | |
static Value | rewriteI8ToI4Trunc (PatternRewriter &rewriter, Location loc, Value srcValue) |
Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. More... | |
#define DBGS | ( | ) | (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
Definition at line 44 of file VectorEmulateNarrowType.cpp.
#define DBGSNL | ( | ) | (llvm::dbgs() << "\n") |
Definition at line 45 of file VectorEmulateNarrowType.cpp.
#define DEBUG_TYPE "vector-narrow-type-emulation" |
Definition at line 43 of file VectorEmulateNarrowType.cpp.
#define LDBG | ( | X | ) | LLVM_DEBUG(DBGS() << X << "\n") |
Definition at line 46 of file VectorEmulateNarrowType.cpp.
using ExtractNBitsFn = std::function<Value(PatternRewriter &, Location, Value, int, int)> |
Definition at line 1813 of file VectorEmulateNarrowType.cpp.
using MemRefValue = TypedValue<MemRefType> |
Definition at line 49 of file VectorEmulateNarrowType.cpp.
using VectorValue = TypedValue<VectorType> |
Definition at line 48 of file VectorEmulateNarrowType.cpp.
|
static |
Verify that subByteVecTy
(vector) and containerTy
(scalar) are aligned.
Alignment means that subByteVecTy
can be packed into a vector of containerTy
elements. More specifically:
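1. The bit-width of containerTy
is a multiple of the bit-width of subByteVecTy
elements. For example, for i4
and i16
this multiple is 4.
2. The multiple from 1. above divides evenly the number of (trailing) elements in subByteVecTy.
EXAMPLE 1: subByteVecTy = vector<2xi4>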
, and containerTy = i16
2 divides evenly 4 ( = 16 / 4), hence both conditions are met.
EXAMPLE 2: subByteVecTy = vector<3xi4>
, and containerTy = i16
3 does not divide evenly 4 (= 16/4), hence the conditions are not met.
EXAMPLE 3: subByteVecTy = vector<3xi3>
, and containerTy = i16
16 is not a multiple of 3, hence the conditions are not met.
NOTE: This method assumes that common conversion preconditions are met. In particular, containerTy
is assumed to be a multi-byte scalar type (e.g., i8, i16, i32).
Definition at line 1601 of file VectorEmulateNarrowType.cpp.
References mlir::Type::getIntOrFloatBitWidth(), mlir::Type::isIntOrFloat(), and mlir::RewriterBase::notifyMatchFailure().
|
static |
Emits memref.generic_atomic_rmw
op to store a subbyte-sized value to a byte in linearizedMemref
, with a mask.
The valueToStore
is a vector of subbyte-sized elements with a total size of 8 bits, and the mask is used to select which elements to store.
Inputs: linearizedMemref = |2|2|2|2| : <4xi2> (<1xi8>) storeIdx = 2 valueToStore = |3|3|3|3| : vector<4xi2> mask = |0|0|1|1| : vector<4xi1>
Result: linearizedMemref = |2|2|3|3| : <4xi2> (<1xi8>)
Definition at line 418 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and mlir::OpBuilder::setInsertionPointToStart().
|
static |
Bitcasts the aligned subByteVec
vector to a vector of i8.
Here, "aligned" means that it satisfies the preconditions checked by alignedConversionPrecondition().
Example: vector<16x16xi2> -> vector<16x4xi8> vector<16x16xi4> -> vector<16x8xi8>
Definition at line 1719 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI8Type(), and mlir::Value::getType().
Referenced by rewriteI2ToI8Ext(), and rewriteI4ToI8Ext().
|
static |
Verify that the precondition type meets the common preconditions for any conversion.
Definition at line 1543 of file VectorEmulateNarrowType.cpp.
References mlir::RewriterBase::notifyMatchFailure().
|
static |
Downcast two values to downcastType
, then select values based on mask
, and casts the result to upcastType
.
Definition at line 384 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), and mlir::Value::getType().
Referenced by atomicRMW(), and nonAtomicRMW().
|
static |
Extracts 1-D subvector from a 1-D vector.
Given the input rank-1 source vector, extracts numElemsToExtract
elements from src
, starting at offset
. The result is also a rank-1 vector:
vector<numElemsToExtract x !elType>
(!elType
is the element type of the source vector). As offset
is assumed to be a dynamic SSA value, this helper method generates a sequence of vector.extract
+ vector.insert
pairs.
EXAMPLE: v1 = vector.extract src[offset] : i2 from vector<8xi2> r1 = vector.insert v1, dest[0] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[idx2] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [1] : i2 into vector<3xi2> (...)
Definition at line 284 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), and mlir::Value::getType().
|
static |
Inserts 1-D subvector into a 1-D vector.
Inserts the input rank-1 source vector into the destination vector starting at offset
. As offset
is assumed to be a dynamic SSA value, this hook uses a sequence of vector.extract
+ vector.insert
pairs.
EXAMPLE: v1 = vector.extract src[0] : i2 from vector<8xi2> r1 = vector.insert v1, dest[offset] : i2 into vector<3xi2> c1 = arith.constant 1 : index idx2 = arith.addi offset, c1 : index v2 = vector.extract src[1] : i2 from vector<8xi2> r2 = vector.insert v2, r1 [idx2] : i2 into vector<3xi2> (...)
Definition at line 327 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::Builder::getIndexType(), mlir::Value::getType(), and mlir::getValueOrCreateConstantIndexOp().
|
static |
Emulate a vector load for emulatedElemTy
using containerElemTy
Specifically, use containerElemTy
for loading a vector of emulatedElemTy
. The load location is given by base
and linearizedIndices
, and the load size (in container elements) is given by numContainerElemsToLoad
.
Definition at line 364 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::Type::getIntOrFloatBitWidth(), and mlir::getValueOrCreateConstantIndexOp().
|
static |
Extracts an unsigned N-bit sequence from each element of a vector of bytes, starting at the specified bit index.
The bitIdx
starts at 0 from the LSB and moves to the left.
Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 0 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]
The target sequence is [10](decimal=2) as unsigned 2-bit integer. So the result should be [00 00 00 10](decimal=2) as unsigned 8-bit integer.
src = [01 01 10 10] mask = [00 00 00 11] shr = arith.shrui(src, 2) = [00 01 01 10] result = arith.andi(shr, mask) = [00 00 00 10] NOTE: Similarly to extractNBitsPerByteAndSignExtendToI8, this could be achieved by using arith::ShLIOp + arith::ShRUIOp instead of the masking. However, by using arith::ShRUIOp + arith::AndIOp, we are eliminating shift left when the index is 0.
Definition at line 1791 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().
|
static |
Extracts a signed N-bit sequence from each element of a vector of bytes, starting at the specified bit index.
The bitIdx
starts at 0 from the LSB and moves to the left.
Example for a single element: Extract numBits=2 starting at bitIdx=2 src = [0 | 1 | 0 | 1 | 1 | 1 | 1 | 0] indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] target = [. . . . ^ ^ . .]
The target sequence is [11](decimal=-1) as signed 2-bit integer. So the result should be [11 11 11 11](decimal=-1) as signed 8-bit integer.
src = [01 01 11 10] shl = arith.shli(src, 4) -> [11 10 00 00] result = arith.shrsi(shl, 6) -> [11 11 11 11]
Definition at line 1749 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), and mlir::Value::getType().
|
static |
Extract sliceNumElements
from source vector
at extractOffset
, and insert it into an empty vector at insertOffset
.
Inputs: vec_in = |0|1|2|3| : vector<4xi2> extractOffset = 1 sliceNumElements = 2 insertOffset = 2 Output: vec_out = |0|0|1|2| : vector<4xi2>
Definition at line 477 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getZeroAttr(), staticallyExtractSubvector(), and staticallyInsertSubvector().
|
static |
Returns a compressed mask for the emulated vector.
For example, when emulating an eight-element i8
vector with i32
(i.e. when the source elements span two dest elements), this method compresses vector<8xi1>
into vector<2xi1>
.
The compressed/output mask value is set iff any mask in the corresponding numSrcElemsPerDest
range of uncompressed/input masks is set. E.g., if numSrcElemsPerDest
equals 2, and numFrontPadElems
equals 1, the following mask:
mask = [1, 1, 0, 0, 0, 0]
will first be padded in the front with numFrontPadElems
zeros, and zeros will be added in the back to make the number of elements a multiple of numSrcElemsPerDest
(for easier computation). The resulting mask will be:
mask = [0, 1, 1, 0, 0, 0, 0, 0]
then it will return the following new compressed mask:
mask = [1, 1, 0, 0]
NOTE: numFrontPadElems
is assumed to be strictly smaller than numSrcElemsPerDest
.
Definition at line 79 of file VectorEmulateNarrowType.cpp.
References mlir::bindSymbols(), mlir::OpBuilder::create(), mlir::detail::divideCeil(), mlir::get(), mlir::DenseElementsAttr::get(), mlir::getAsOpFoldResult(), mlir::Builder::getContext(), mlir::Value::getDefiningOp(), mlir::Builder::getI1Type(), mlir::Operation::getResultTypes(), mlir::getValueOrCreateConstantIndexOp(), and mlir::affine::makeComposedFoldedAffineApply().
|
static |
Generate a non-atomic read-modify-write sequence for storing to the emulated type.
It has similar logic to atomicRMW
, but without atomicity.
Definition at line 449 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), downcastSelectAndUpcast(), mlir::get(), and getElementType().
|
static |
Definition at line 1486 of file VectorEmulateNarrowType.cpp.
References mlir::detail::enumerate().
|
static |
Rewrite the i2 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
Definition at line 1838 of file VectorEmulateNarrowType.cpp.
References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().
|
static |
Rewrite the i4 -> i8 extension into a sequence of shuffles and bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
Definition at line 1818 of file VectorEmulateNarrowType.cpp.
References bitcastSubByteVectorToI8(), mlir::OpBuilder::create(), and mlir::Value::getType().
|
static |
Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
Definition at line 1873 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI4Type(), and mlir::Value::getType().
|
static |
Extracts 1-D subvector from a 1-D vector.
Given the input rank-1 source vector, extracts numElemsToExtract
elements from src
, starting at offset
. The result is also a rank-1 vector:
vector<numElemsToExtract x !elType>
(!elType
is the element type of the source vector). As offset
is a known static value, this helper hook emits vector.extract_strided_slice
.
EXAMPLE: res = vector.extract_strided_slice src { offsets = [offset], sizes = [numElemsToExtract], strides = [1] }
Definition at line 214 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::get(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().
Referenced by extractSliceIntoByte().
|
static |
Inserts 1-D subvector into a 1-D vector.
Inserts the input rank-1 source vector into the destination vector starting at offset
. As offset
is a known static value, this helper hook emits vector.insert_strided_slice
.
EXAMPLE: res = vector.insert_strided_slice src, dest { offsets = [offset], strides = [1] }
Definition at line 248 of file VectorEmulateNarrowType.cpp.
References mlir::OpBuilder::create(), mlir::Builder::getI64ArrayAttr(), and mlir::Value::getType().
Referenced by extractSliceIntoByte().