MLIR 23.0.0git
VectorToXeGPU.cpp
Go to the documentation of this file.
1//===- VectorToXeGPU.cpp - Convert vector to XeGPU dialect ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements lowering of vector operations to XeGPU dialect ops.
10//
11//===----------------------------------------------------------------------===//
12
#include "mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/TypeSwitch.h"

#include <algorithm>
#include <optional>
29
30namespace mlir {
31#define GEN_PASS_DEF_CONVERTVECTORTOXEGPU
32#include "mlir/Conversion/Passes.h.inc"
33} // namespace mlir
34
35using namespace mlir;
36
37namespace {
38
39// Return true if value represents a zero constant.
40static bool isZeroConstant(Value val) {
41 auto constant = val.getDefiningOp<arith::ConstantOp>();
42 if (!constant)
43 return false;
44
45 return TypeSwitch<Attribute, bool>(constant.getValue())
46 .Case([](FloatAttr floatAttr) { return floatAttr.getValue().isZero(); })
47 .Case([](IntegerAttr intAttr) { return intAttr.getValue().isZero(); })
48 .Default(false);
49}
50
51static LogicalResult storeLoadPreconditions(PatternRewriter &rewriter,
52 Operation *op, VectorType vecTy,
53 MemRefType memTy) {
54 // Validate only vector as the basic vector store and load ops guarantee
55 // XeGPU-compatible memref source.
56 unsigned vecRank = vecTy.getRank();
57 if (!(vecRank == 1 || vecRank == 2))
58 return rewriter.notifyMatchFailure(op, "Expects 1D or 2D vector");
59
60 if (!vecTy.getElementType().isIntOrFloat())
61 return rewriter.notifyMatchFailure(
62 op, "Expected scalar type with known bitwidth");
63
64 // XeGPU requires the memref to have a scalar integer or float element type.
65 // Memrefs with vector element types (e.g. memref<?xvector<4xf32>>) are not
66 // supported because createNdDescriptor computes byte offsets using
67 // getElementTypeBitWidth(), which asserts on non-integer/float types.
68 if (!memTy.getElementType().isIntOrFloat())
69 return rewriter.notifyMatchFailure(
70 op, "Unsupported memref element type: expected integer or float");
71
72 return success();
73}
74
75static LogicalResult transferPreconditions(PatternRewriter &rewriter,
76 VectorTransferOpInterface xferOp) {
77 if (xferOp.getMask())
78 return rewriter.notifyMatchFailure(xferOp,
79 "Masked transfer is not supported");
80
81 auto srcTy = dyn_cast<MemRefType>(xferOp.getShapedType());
82 if (!srcTy)
83 return rewriter.notifyMatchFailure(xferOp, "Expects memref source");
84
85 // Validate further transfer op semantics.
87 int64_t offset;
88 if (failed(srcTy.getStridesAndOffset(strides, offset)) || strides.back() != 1)
89 return rewriter.notifyMatchFailure(
90 xferOp, "Buffer must be contiguous in the innermost dimension");
91
92 VectorType vecTy = xferOp.getVectorType();
93 unsigned vecRank = vecTy.getRank();
94 if (xferOp.hasOutOfBoundsDim() && vecRank < 2)
95 return rewriter.notifyMatchFailure(
96 xferOp, "Boundary check is available only for block instructions.");
97
98 AffineMap map = xferOp.getPermutationMap();
99 if (!map.isProjectedPermutation(/*allowZeroInResults=*/false))
100 return rewriter.notifyMatchFailure(xferOp, "Unsupported permutation map");
101 unsigned numInputDims = map.getNumInputs();
102 for (AffineExpr expr : map.getResults().take_back(vecRank)) {
103 auto dim = dyn_cast<AffineDimExpr>(expr);
104 if (dim.getPosition() < (numInputDims - vecRank))
105 return rewriter.notifyMatchFailure(
106 xferOp, "Only the innermost dimensions can be accessed");
107 }
108
109 return success();
110}
111
112static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter,
113 Location loc,
114 xegpu::TensorDescType descType,
116 MemRefType srcTy = src.getType();
117 assert(srcTy.isStrided() && "Expected strided memref type");
118 auto [strides, offset] = srcTy.getStridesAndOffset();
119 bool isStatic = true;
120
121 // Memref is dynamic if any of its shape, offset or strides is dynamic.
122 if (!srcTy.hasStaticShape())
123 isStatic = false;
124
125 if (!ShapedType::isStatic(offset))
126 isStatic = false;
127
128 for (auto stride : strides) {
129 if (!ShapedType::isStatic(stride)) {
130 isStatic = false;
131 break;
132 }
133 }
134
135 xegpu::CreateNdDescOp ndDesc;
136 if (isStatic) {
137 ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src);
138 } else {
139 // In case of ranked dynamic memref, instead of passing on the memref,
140 // i64 base address, source's offset, shape and strides have to be
141 // explicitly provided.
142 auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src);
143 auto baseAddrIndex = memref::ExtractAlignedPointerAsIndexOp::create(
144 rewriter, loc, meta.getBaseBuffer());
145 auto offset = meta.getOffset();
146 auto elemByteSize = srcTy.getElementTypeBitWidth() / 8;
147 auto offsetInBytes = arith::MulIOp::create(
148 rewriter, loc, offset,
149 arith::ConstantIndexOp::create(rewriter, loc, elemByteSize));
150 auto adjustedBaseAddr = arith::AddIOp::create(
151 rewriter, loc, baseAddrIndex.getResult(), offsetInBytes);
152 auto adjustedAddrI64 = arith::IndexCastOp::create(
153 rewriter, loc, rewriter.getI64Type(), adjustedBaseAddr);
154 ndDesc = xegpu::CreateNdDescOp::create(
155 rewriter, loc, descType, adjustedAddrI64,
156 meta.getConstifiedMixedSizes(), meta.getConstifiedMixedStrides());
157 }
158
159 return ndDesc;
160}
161
162// Adjusts the strides of a memref according to a given permutation map for
163// vector operations.
164//
165// This function updates the innermost strides in the `strides` array to
166// reflect the permutation specified by `permMap`. The permutation is computed
167// using the inverse and broadcasting-aware version of the permutation map,
168// and is applied to the relevant strides. This ensures that memory accesses
169// are consistent with the logical permutation of vector elements.
170//
171// Example:
172// Suppose we have a memref of rank 4 with strides `[s0, s1, s2, s3]`.
173// If the permutation map swaps the last two dimensions (e.g., [0, 1] -> [1,
174// 0]), then after calling this function, the last two strides will be
175// swapped:
176// Original strides: [s0, s1, s2, s3]
177// After permutation: [s0, s1, s3, s2]
178//
179static void adjustStridesForPermutation(AffineMap permMap,
180 SmallVectorImpl<Value> &strides) {
181
185 SmallVector<int64_t> perms64(perms.begin(), perms.end());
186 strides = applyPermutation(strides, perms64);
187}
188
189// Computes memory strides and a memref offset for vector transfer operations,
190// handling both static and dynamic memrefs while applying permutation
191// transformations for XeGPU lowering.
template <
    typename OpType,
    typename = std::enable_if_t<llvm::is_one_of<
        std::decay_t<OpType>, vector::TransferReadOp, vector::TransferWriteOp,
        vector::GatherOp, vector::ScatterOp>::value>>
static std::pair<SmallVector<Value>, Value>
computeMemrefMeta(OpType xferOp, PatternRewriter &rewriter) {
  SmallVector<Value> strides;
  Value baseMemref = xferOp.getBase();
  MemRefType memrefType = dyn_cast<MemRefType>(baseMemref.getType());

  Location loc = xferOp.getLoc();
  Value offsetVal = nullptr;
  // Fast path: a fully static layout is materialized as index constants
  // without emitting extract_strided_metadata.
  if (memrefType.hasStaticShape()) {
    int64_t offset;
    SmallVector<int64_t> intStrides;
    if (failed(memrefType.getStridesAndOffset(intStrides, offset)))
      return {{}, offsetVal}; // Empty strides signal failure to the caller.
    bool hasDynamicStrides = llvm::any_of(intStrides, [](int64_t strideVal) {
      return ShapedType::isDynamic(strideVal);
    });

    if (!hasDynamicStrides)
      for (int64_t s : intStrides)
        strides.push_back(arith::ConstantIndexOp::create(rewriter, loc, s));

    if (!ShapedType::isDynamic(offset))
      offsetVal = arith::ConstantIndexOp::create(rewriter, loc, offset);
  }

  if (strides.empty() || !offsetVal) {
    // For dynamic shape memref, use memref.extract_strided_metadata to get
    // stride values
    unsigned rank = memrefType.getRank();
    Type indexType = rewriter.getIndexType();

    // Result types: [base_memref, offset, stride0, stride1, ..., strideN-1,
    // size0, size1, ..., sizeN-1]
    SmallVector<Type> resultTypes;
    resultTypes.push_back(MemRefType::get(
        {}, memrefType.getElementType())); // base memref (unranked)
    resultTypes.push_back(indexType);      // offset

    for (unsigned i = 0; i < rank; ++i)
      resultTypes.push_back(indexType); // strides

    for (unsigned i = 0; i < rank; ++i)
      resultTypes.push_back(indexType); // sizes

    auto meta = memref::ExtractStridedMetadataOp::create(
        rewriter, loc, resultTypes, baseMemref);

    // Only fill in the pieces that were not already produced as constants.
    if (strides.empty())
      strides.append(meta.getStrides().begin(), meta.getStrides().end());

    if (!offsetVal)
      offsetVal = meta.getOffset();
  }

  // Transfer ops may permute dimensions (e.g. transposed access); reorder
  // the strides accordingly. Gather/scatter ops have no permutation map.
  if constexpr (llvm::is_one_of<std::decay_t<OpType>, vector::TransferReadOp,
                                vector::TransferWriteOp>::value) {
    AffineMap permMap = xferOp.getPermutationMap();
    // Adjust strides according to the permutation map (e.g., for transpose)
    adjustStridesForPermutation(permMap, strides);
  }

  return {strides, offsetVal};
}
260
261// This function compute the vectors of localOffsets for scattered load/stores.
262// It is used in the lowering of vector.transfer_read/write to
263// load_gather/store_scatter Example:
264// %0 = vector.transfer_read %expand_shape[%block_id_y, %c0, %c0, %c0, %c0],
265// %cst {in_bounds = [true, true, true, true]}>} :
266// memref<8x4x2x6x32xbf16>, vector<4x2x6x32xbf16>
267//
268// %6 = vector.step: vector<4xindex>
269// %7 = vector.step: vector<2xindex>
270// %8 = vector.step: vector<6xindex>
271// %9 = vector.step: vector<32xindex>
272// %10 = arith.mul %6, 384
273// %11 = arith.mul %7, 192
274// %12 = arith.mul %8, 32
275// %13 = arith.mul %9, 1
276// %14 = vector.shape_cast %10: vector<4xindex> -> vector<4x1x1x1xbf16>
277// %15 = vector.shape_cast %11: vector<2xindex> -> vector<1x2x1x1xbf16>
278// %16 = vector.shape_cast %12: vector<6xindex> -> vector<1x1x6x1xbf16>
279// %17 = vector.shape_cast %13: vector<32xindex> -> vector<1x1x1x32xbf16>
280// %18 = vector.broadcast %14: vector<4x1x1x1xbf16> -> vector<4x2x6x32xindex>
281// %19 = vector.broadcast %15: vector<1x2x1x1xbf16> -> vector<4x2x6x32xindex>
282// %20 = vector.broadcast %16: vector<1x1x6x1xbf16> -> vector<4x2x6x32xindex>
283// %21 = vector.broadcast %17: vector<1x1x1x32xbf16> -> vector<4x2x6x32xindex>
284// %22 = arith.add %18, %19
285// %23 = arith.add %20, %21
286// %local_offsets = arith.add %22, %23
287// %orig_offset = %block_id_y * 4x2x6x32 // consider using affine map
288// %offsets = memref_offset + orig_offset + local_offsets
289static Value computeOffsets(VectorTransferOpInterface xferOp,
290 PatternRewriter &rewriter, ArrayRef<Value> strides,
291 Value baseOffset) {
292 Location loc = xferOp.getLoc();
293 VectorType vectorType = xferOp.getVectorType();
294 SmallVector<Value> indices(xferOp.getIndices().begin(),
295 xferOp.getIndices().end());
296 ArrayRef<int64_t> vectorShape = vectorType.getShape();
297
298 // Create vector.step operations for each dimension
299 SmallVector<Value> stepVectors;
300 llvm::map_to_vector(vectorShape, [&](int64_t dim) {
301 auto stepType = VectorType::get({dim}, rewriter.getIndexType());
302 auto stepOp = vector::StepOp::create(rewriter, loc, stepType);
303 stepVectors.push_back(stepOp);
304 return stepOp;
305 });
306
307 // Multiply step vectors by corresponding strides
308 size_t memrefRank = strides.size();
309 size_t vectorRank = vectorShape.size();
310 SmallVector<Value> strideMultiplied;
311 for (size_t i = 0; i < vectorRank; ++i) {
312 size_t memrefDim = memrefRank - vectorRank + i;
313 Value strideValue = strides[memrefDim];
314 auto mulType = dyn_cast<VectorType>(stepVectors[i].getType());
315 auto bcastOp =
316 vector::BroadcastOp::create(rewriter, loc, mulType, strideValue);
317 auto mulOp = arith::MulIOp::create(rewriter, loc, stepVectors[i], bcastOp);
318 strideMultiplied.push_back(mulOp);
319 }
320
321 // Shape cast each multiplied vector to add singleton dimensions
322 SmallVector<Value> shapeCasted;
323 for (size_t i = 0; i < vectorRank; ++i) {
324 SmallVector<int64_t> newShape(vectorRank, 1);
325 newShape[i] = vectorShape[i];
326 auto newType = VectorType::get(newShape, rewriter.getIndexType());
327 auto castOp = vector::ShapeCastOp::create(rewriter, loc, newType,
328 strideMultiplied[i]);
329 shapeCasted.push_back(castOp);
330 }
331
332 // Broadcast each shape-casted vector to full vector shape
333 SmallVector<Value> broadcasted;
334 auto fullIndexVectorType =
335 VectorType::get(vectorShape, rewriter.getIndexType());
336 for (Value shapeCastVal : shapeCasted) {
337 auto broadcastOp = vector::BroadcastOp::create(
338 rewriter, loc, fullIndexVectorType, shapeCastVal);
339 broadcasted.push_back(broadcastOp);
340 }
341
342 // Add all broadcasted vectors together to compute local offsets
343 Value localOffsets = broadcasted[0];
344 for (size_t i = 1; i < broadcasted.size(); ++i)
345 localOffsets =
346 arith::AddIOp::create(rewriter, loc, localOffsets, broadcasted[i]);
347
348 // Compute base offset from transfer read indices
349 for (size_t i = 0; i < indices.size(); ++i) {
350 Value strideVal = strides[i];
351 Value offsetContrib =
352 arith::MulIOp::create(rewriter, loc, indices[i], strideVal);
353 baseOffset =
354 arith::AddIOp::create(rewriter, loc, baseOffset, offsetContrib);
355 }
356 // Broadcast base offset to match vector shape
357 Value bcastBase = vector::BroadcastOp::create(
358 rewriter, loc, fullIndexVectorType, baseOffset);
359 localOffsets = arith::AddIOp::create(rewriter, loc, bcastBase, localOffsets);
360 return localOffsets;
361}
362
363// Compute the element-wise offsets for vector.gather or vector.scatter ops.
364//
365// This function linearizes the base offsets of the gather/scatter operation
366// and combines them with the per-element indices to produce a final vector of
367// memory offsets.
368template <
369 typename OpType,
370 typename = std::enable_if_t<llvm::is_one_of<
371 std::decay_t<OpType>, vector::GatherOp, vector::ScatterOp>::value>>
372static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp,
373 ArrayRef<Value> strides, Value baseOffset) {
374 Location loc = gatScatOp.getLoc();
375 SmallVector<Value> offsets = gatScatOp.getOffsets();
376 for (size_t i = 0; i < offsets.size(); ++i) {
377 Value offsetContrib =
378 arith::MulIOp::create(rewriter, loc, offsets[i], strides[i]);
379 baseOffset =
380 arith::AddIOp::create(rewriter, loc, baseOffset, offsetContrib);
381 }
382 Value indices = gatScatOp.getIndices();
383 VectorType vecType = cast<VectorType>(indices.getType());
384
385 Value strideVector =
386 vector::BroadcastOp::create(rewriter, loc, vecType, strides.back())
387 .getResult();
388 Value stridedIndices =
389 arith::MulIOp::create(rewriter, loc, strideVector, indices).getResult();
390
391 Value baseVector =
392 vector::BroadcastOp::create(
393 rewriter, loc,
394 VectorType::get(vecType.getShape(), rewriter.getIndexType()),
395 baseOffset)
396 .getResult();
397 return arith::AddIOp::create(rewriter, loc, baseVector, stridedIndices)
398 .getResult();
399}
400
401// Collapses shapes of a nD memref to the target rank while applying offsets for
402// the collapsed dimensions. Returns the new memref value and the remaining
403// offsets for the last targetRank dimensions. For example:
404// input: %memref = memref<2x4x8x32xf32>, offsets=[%i0, %i1, %i2, %i3],
405// output: %memref[%i0, %i1, 0, 0] -> memref<8x32xf32>, offsets: [%i2, %i3]
406static std::pair<Value, SmallVector<OpFoldResult>>
407convertMemrefAndOffsetsToTargetRank(PatternRewriter &rewriter, Location loc,
410 int64_t targetRank) {
411 auto memrefType = cast<MemRefType>(memref.getType());
412 unsigned rank = memrefType.getRank();
413
414 if (rank <= targetRank)
415 return {memref, offsets};
416
417 int64_t numCombinedDims = rank - targetRank;
418 SmallVector<OpFoldResult> subviewOffsets;
419 SmallVector<OpFoldResult> subviewSizes;
420 SmallVector<OpFoldResult> subviewStrides;
421
422 // For the combined dimensions: use the provided offsets, size=1, stride=1
423 for (unsigned i = 0; i < numCombinedDims; ++i) {
424 subviewOffsets.push_back(offsets[i]);
425 subviewSizes.push_back(rewriter.getI64IntegerAttr(1));
426 subviewStrides.push_back(rewriter.getI64IntegerAttr(1));
427 }
428
429 // For the last targetRank dimensions: offset=0, use full size, stride=1
430 SmallVector<int64_t> resultShape;
431 auto originalShape = memrefType.getShape();
432 auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, memref);
433 for (unsigned i = numCombinedDims; i < rank; ++i) {
434 subviewOffsets.push_back(rewriter.getI64IntegerAttr(0));
435 if (ShapedType::isDynamic(originalShape[i])) {
436 subviewSizes.push_back(meta.getSizes()[i]);
437 resultShape.push_back(ShapedType::kDynamic);
438 } else {
439 subviewSizes.push_back(rewriter.getI64IntegerAttr(originalShape[i]));
440 resultShape.push_back(originalShape[i]);
441 }
442 subviewStrides.push_back(rewriter.getI64IntegerAttr(1));
443 }
444
445 auto resultType = memref::SubViewOp::inferRankReducedResultType(
446 resultShape, memrefType, subviewOffsets, subviewSizes, subviewStrides);
447 auto subviewOp =
448 memref::SubViewOp::create(rewriter, loc, resultType, memref,
449 subviewOffsets, subviewSizes, subviewStrides);
450
451 // Return the remaining offsets for the last targetRank dimensions
452 SmallVector<OpFoldResult> newOffsets(offsets.begin() + numCombinedDims,
453 offsets.end());
454 return {subviewOp.getResult(), newOffsets};
455}
456
457template <
458 typename OpType,
459 typename = std::enable_if_t<llvm::is_one_of<
460 std::decay_t<OpType>, vector::TransferReadOp, vector::TransferWriteOp,
461 vector::GatherOp, vector::ScatterOp>::value>>
462// Convert memref to i64 base pointer
463static Value memrefToIndexPtr(OpType xferOp, PatternRewriter &rewriter) {
464 Location loc = xferOp.getLoc();
465 auto indexPtr = memref::ExtractAlignedPointerAsIndexOp::create(
466 rewriter, loc, xferOp.getBase())
467 .getResult();
468 return arith::IndexCastOp::create(rewriter, loc, rewriter.getI64Type(),
469 indexPtr)
470 .getResult();
471}
472
473static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp,
474 PatternRewriter &rewriter) {
475
476 Location loc = readOp.getLoc();
477 VectorType vectorType = readOp.getVectorType();
478 ArrayRef<int64_t> vectorShape = vectorType.getShape();
479 auto memrefType = dyn_cast<MemRefType>(readOp.getShapedType());
480 if (!memrefType)
481 return rewriter.notifyMatchFailure(readOp, "Expected memref source");
482
483 auto meta = computeMemrefMeta(readOp, rewriter);
484 if (meta.first.empty())
485 return rewriter.notifyMatchFailure(readOp, "Failed to compute strides");
486
487 Value localOffsets =
488 computeOffsets(readOp, rewriter, meta.first, meta.second);
489
490 Value flatMemref = memrefToIndexPtr(readOp, rewriter);
491
492 Value mask = vector::ConstantMaskOp::create(
493 rewriter, loc, VectorType::get(vectorShape, rewriter.getI1Type()),
495 auto gatherOp = xegpu::LoadGatherOp::create(
496 rewriter, loc, vectorType, flatMemref, localOffsets, mask,
497 /*chunk_size=*/IntegerAttr{},
498 /*l1_hint=*/xegpu::CachePolicyAttr{},
499 /*l2_hint=*/xegpu::CachePolicyAttr{},
500 /*l3_hint=*/xegpu::CachePolicyAttr{},
501 /*layout=*/nullptr);
502
503 rewriter.replaceOp(readOp, gatherOp.getResult());
504 return success();
505}
506
507static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp,
508 PatternRewriter &rewriter) {
509
510 Location loc = writeOp.getLoc();
511 VectorType vectorType = writeOp.getVectorType();
512 ArrayRef<int64_t> vectorShape = vectorType.getShape();
513
514 auto memrefType = dyn_cast<MemRefType>(writeOp.getShapedType());
515 if (!memrefType)
516 return rewriter.notifyMatchFailure(writeOp, "Expected memref source");
517
518 auto meta = computeMemrefMeta(writeOp, rewriter);
519 if (meta.first.empty())
520 return rewriter.notifyMatchFailure(writeOp, "Failed to compute strides");
521
522 Value localOffsets =
523 computeOffsets(writeOp, rewriter, meta.first, meta.second);
524
525 Value flatMemref = memrefToIndexPtr(writeOp, rewriter);
526
527 Value mask = vector::ConstantMaskOp::create(
528 rewriter, loc, VectorType::get(vectorShape, rewriter.getI1Type()),
530 xegpu::StoreScatterOp::create(rewriter, loc, writeOp.getVector(), flatMemref,
531 localOffsets, mask,
532 /*chunk_size=*/IntegerAttr{},
533 /*l1_hint=*/xegpu::CachePolicyAttr{},
534 /*l2_hint=*/xegpu::CachePolicyAttr{},
535 /*l3_hint=*/xegpu::CachePolicyAttr{},
536 /*layout=*/nullptr);
537 rewriter.eraseOp(writeOp);
538 return success();
539}
540
// Lowers vector.transfer_read to XeGPU. Three paths are taken depending on
// the source and target: shared local memory uses create_mem_desc +
// load_matrix; hardware without 2D block-load support (or rank > 2 vectors)
// and the 1D in-bounds case use a scattered load; the remaining cases use an
// nd-descriptor block load, optionally followed by a vector.transpose.
struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
  using Base::Base;

  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
                                PatternRewriter &rewriter) const override {
    Location loc = readOp.getLoc();

    if (failed(transferPreconditions(rewriter, readOp)))
      return failure();
    auto readMemTy = cast<MemRefType>(readOp.getShapedType());
    VectorType loadedVecTy = readOp.getVectorType();
    bool isOutOfBounds = readOp.hasOutOfBoundsDim();
    // Check if the memref has address space 3 (shared local memory)
    bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(readMemTy);
    // Handle the SLM case.
    if (isSharedMemory) {
      // If the memref is SLM only support 2D case for now.
      if (loadedVecTy.getRank() != 2)
        return rewriter.notifyMatchFailure(
            readOp, "Only 2D vector loads are supported for SLM");
      AffineMap readMap = readOp.getPermutationMap();
      if (!readMap.isMinorIdentity())
        return rewriter.notifyMatchFailure(
            readOp,
            "Non identity transposition is not supported for SLM loads.");
      // Out of bounds case is not supported for SLM loads.
      if (isOutOfBounds)
        return rewriter.notifyMatchFailure(
            readOp, "Out-of-bounds access is not supported for SLM loads");

      // Create mem_desc for SLM
      auto memDescType =
          xegpu::MemDescType::get(rewriter.getContext(), readMemTy.getShape(),
                                  readMemTy.getElementType(),
                                  /*mem_layout=*/nullptr);
      auto createMemDescOp = xegpu::CreateMemDescOp::create(
          rewriter, loc, memDescType, readOp.getBase());
      // Convert indices to OpFoldResult for LoadMatrixOp
      SmallVector<OpFoldResult> indices =
          getAsOpFoldResult(readOp.getIndices());
      auto loadMatrixOp = xegpu::LoadMatrixOp::create(
          rewriter, loc, loadedVecTy, createMemDescOp.getResult(), indices,
          /*layout=*/nullptr);

      rewriter.replaceOp(readOp, loadMatrixOp.getResult());
      return success();
    }

    // TODO:This check needs to be replaced with proper uArch capability check
    auto chip = xegpu::getChipStr(readOp);
    // Lower to scattered load Op if the target HW doesn't have 2d block load
    // support and the load is not from shared memory.
    if ((chip != "pvc" && chip != "bmg" && chip != "cri") ||
        readOp.getVectorType().getRank() > 2) {

      // TODO: add support for OutOfBound access
      if (isOutOfBounds)
        return failure();
      return lowerToScatteredLoadOp(readOp, rewriter);
    }

    // Handle the 1D non-SLM case using load.gather.
    if (loadedVecTy.getRank() == 1 && !isOutOfBounds)
      return lowerToScatteredLoadOp(readOp, rewriter);

    // Perform common data transfer checks.
    // TODO: Maybe too strict for SLM case.
    if (failed(
            storeLoadPreconditions(rewriter, readOp, loadedVecTy, readMemTy)))
      return failure();

    // Out-of-bounds reads fill with the padding value; only zero padding can
    // be expressed through the descriptor's boundary check.
    if (isOutOfBounds && !isZeroConstant(readOp.getPadding()))
      return rewriter.notifyMatchFailure(
          readOp, "Unsupported non-zero padded out-of-bounds read");

    AffineMap readMap = readOp.getPermutationMap();
    // Check if this is a transpose: the map must have exactly 2 results,
    // and those 2 results must be the last 2 input dimensions interchanged.
    // Examples:
    //   (d0, d1) -> (d1, d0)     // transpose
    //   (d0, d1) -> (d0, d1)     // not a transpose
    //   (d0, d1, d2) -> (d2, d1) // transpose (last 2 dims swapped)
    bool isTransposeLoad = false;
    if (readMap.getNumResults() == 2) {
      auto results = readMap.getResults();
      unsigned numInputs = readMap.getNumInputs();
      if (numInputs >= 2) {
        auto lastDim = getAffineDimExpr(numInputs - 1, readMap.getContext());
        auto secondLastDim =
            getAffineDimExpr(numInputs - 2, readMap.getContext());
        isTransposeLoad =
            (results[0] == lastDim && results[1] == secondLastDim);
      }
    }
    auto elementType = loadedVecTy.getElementType();

    SmallVector<int64_t> descShape(loadedVecTy.getShape());
    if (isTransposeLoad) {
      // If load is transposed, simply swap the last two dimensions of the
      // loaded vector type to get the descriptor shape.
      size_t rank = descShape.size();
      assert(rank >= 2 && "Transpose requires at least 2 dimensions");
      std::swap(descShape[rank - 1], descShape[rank - 2]);
      loadedVecTy = VectorType::get(descShape, elementType);
    }
    auto descType = xegpu::TensorDescType::get(
        descShape, elementType, /*array_length=*/1,
        /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global);
    // Collapse higher-rank memrefs to the vector rank, folding the leading
    // indices into a subview.
    auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
        rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()),
        loadedVecTy.getRank());
    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
        rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src));

    Operation *loadedOp =
        xegpu::LoadNdOp::create(rewriter, loc, loadedVecTy, ndDesc, indices,
                                /*packed=*/nullptr, /*transpose=*/nullptr,
                                /*l1_hint=*/hint,
                                /*l2_hint=*/hint, /*l3_hint=*/hint,
                                /*layout=*/nullptr);
    if (isTransposeLoad) {
      // Transposing the loaded vector with a separate vector.transpose
      // operation
      auto range = llvm::seq<int64_t>(0, readMap.getResults().size());
      SmallVector<int64_t> perm(
          range.rbegin(), range.rend()); // reverse the range for transpose
      loadedOp = vector::TransposeOp::create(rewriter, loc,
                                             loadedOp->getResult(0), perm);
    }
    rewriter.replaceOp(readOp, loadedOp);

    return success();
  }
};
677
// Lowers vector.transfer_write to XeGPU, mirroring TransferReadLowering:
// shared local memory uses create_mem_desc + store_matrix; hardware without
// 2D block-store support (or rank > 2 vectors) uses a scattered store; the
// remaining cases use an nd-descriptor block store.
struct TransferWriteLowering
    : public OpRewritePattern<vector::TransferWriteOp> {
  using Base::Base;

  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
                                PatternRewriter &rewriter) const override {
    Location loc = writeOp.getLoc();

    if (failed(transferPreconditions(rewriter, writeOp)))
      return failure();
    // Perform common data transfer checks.
    VectorType vecTy = writeOp.getVectorType();
    auto writeMemTy = cast<MemRefType>(writeOp.getShapedType());
    // Check if the memref has address space 3 (shared local memory)
    bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(writeMemTy);

    // For shared local memory (address space 3), use create_mem_desc +
    // store_matrix
    if (isSharedMemory) {
      // Only support 2D case for now.
      if (vecTy.getRank() != 2)
        return rewriter.notifyMatchFailure(
            writeOp, "Only 2D vector stores are supported for SLM");
      // Create mem_desc for SLM
      auto memDescType =
          xegpu::MemDescType::get(rewriter.getContext(), writeMemTy.getShape(),
                                  writeMemTy.getElementType(),
                                  /*mem_layout=*/nullptr);

      auto createMemDescOp = xegpu::CreateMemDescOp::create(
          rewriter, loc, memDescType, writeOp.getBase());

      // Convert indices to OpFoldResult for StoreMatrixOp
      SmallVector<OpFoldResult> indices =
          getAsOpFoldResult(writeOp.getIndices());

      xegpu::StoreMatrixOp::create(rewriter, loc, writeOp.getVector(),
                                   createMemDescOp.getResult(), indices,
                                   /*layout=*/nullptr);

      rewriter.eraseOp(writeOp);
      return success();
    }

    // TODO:This check needs to be replaced with proper uArch capability check
    auto chip = xegpu::getChipStr(writeOp);
    // Lower to scattered store Op if the target HW doesn't have 2d block
    // store support and the memref is not SLM.
    if ((chip != "pvc" && chip != "bmg" && chip != "cri") ||
        writeOp.getVectorType().getRank() > 2) {

      // TODO: add support for OutOfBound access
      if (writeOp.hasOutOfBoundsDim())
        return failure();
      return lowerToScatteredStoreOp(writeOp, rewriter);
    }

    if (failed(storeLoadPreconditions(rewriter, writeOp, vecTy, writeMemTy)))
      return failure();

    // Transposed stores are not handled on this path; only minor-identity
    // permutation maps are accepted.
    AffineMap map = writeOp.getPermutationMap();
    if (!map.isMinorIdentity())
      return rewriter.notifyMatchFailure(writeOp, "Expects identity map");

    // Collapse higher-rank memrefs to the vector rank, folding the leading
    // indices into a subview.
    auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
        rewriter, loc, writeOp.getBase(),
        getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank());

    auto descType = xegpu::TensorDescType::get(
        vecTy.getShape(), vecTy.getElementType(),
        /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(),
        xegpu::MemorySpace::Global);
    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
        rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src));

    auto storeOp = xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(),
                                            ndDesc, indices,
                                            /*l1_hint=*/hint,
                                            /*l2_hint=*/hint, /*l3_hint=*/hint,
                                            /*layout=*/nullptr);
    rewriter.replaceOp(writeOp, storeOp);

    return success();
  }
};
765
// Lowers vector.gather to xegpu.load (gather form) over a flat i64 base
// pointer plus linearized per-element offsets; masked-off lanes take the
// pass-through values via arith.select.
struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
  using Base::Base;

  LogicalResult matchAndRewrite(vector::GatherOp gatherOp,
                                PatternRewriter &rewriter) const override {
    auto srcTy = dyn_cast<MemRefType>(gatherOp.getBase().getType());
    if (!srcTy)
      return rewriter.notifyMatchFailure(gatherOp, "Expects memref source");

    Location loc = gatherOp.getLoc();
    VectorType vectorType = gatherOp.getVectorType();

    // Strides and base offset of the source memref (static or dynamic).
    auto meta = computeMemrefMeta(gatherOp, rewriter);
    if (meta.first.empty())
      return rewriter.notifyMatchFailure(gatherOp, "Failed to compute strides");

    Value localOffsets =
        computeOffsets(rewriter, gatherOp, meta.first, meta.second);
    Value flatMemref = memrefToIndexPtr(gatherOp, rewriter);

    auto xeGatherOp = xegpu::LoadGatherOp::create(
        rewriter, loc, vectorType, flatMemref, localOffsets, gatherOp.getMask(),
        /*chunk_size=*/IntegerAttr{},
        /*l1_hint=*/xegpu::CachePolicyAttr{},
        /*l2_hint=*/xegpu::CachePolicyAttr{},
        /*l3_hint=*/xegpu::CachePolicyAttr{},
        /*layout=*/nullptr);

    // vector.gather semantics: masked-off lanes must yield the pass-through
    // values, so select between the loaded result and pass-through.
    auto selectOp =
        arith::SelectOp::create(rewriter, loc, gatherOp.getMask(),
                                xeGatherOp.getResult(), gatherOp.getPassThru());
    rewriter.replaceOp(gatherOp, selectOp.getResult());
    return success();
  }
};
801
// Lowers vector.scatter to xegpu.store (scatter form) over a flat i64 base
// pointer plus linearized per-element offsets; the mask is forwarded so
// masked-off lanes are not written.
struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
  using Base::Base;

  LogicalResult matchAndRewrite(vector::ScatterOp scatterOp,
                                PatternRewriter &rewriter) const override {
    auto srcTy = dyn_cast<MemRefType>(scatterOp.getBase().getType());
    if (!srcTy)
      return rewriter.notifyMatchFailure(scatterOp, "Expects memref source");

    Location loc = scatterOp.getLoc();
    // Strides and base offset of the destination memref (static or dynamic).
    auto meta = computeMemrefMeta(scatterOp, rewriter);
    if (meta.first.empty())
      return rewriter.notifyMatchFailure(scatterOp,
                                         "Failed to compute strides");

    Value localOffsets =
        computeOffsets(rewriter, scatterOp, meta.first, meta.second);
    Value flatMemref = memrefToIndexPtr(scatterOp, rewriter);

    xegpu::StoreScatterOp::create(rewriter, loc, scatterOp.getValueToStore(),
                                  flatMemref, localOffsets, scatterOp.getMask(),
                                  /*chunk_size=*/IntegerAttr{},
                                  /*l1_hint=*/xegpu::CachePolicyAttr{},
                                  /*l2_hint=*/xegpu::CachePolicyAttr{},
                                  /*l3_hint=*/xegpu::CachePolicyAttr{},
                                  /*layout=*/nullptr);
    rewriter.eraseOp(scatterOp);
    return success();
  }
};
832
833struct LoadLowering : public OpRewritePattern<vector::LoadOp> {
834 using Base::Base;
835
836 LogicalResult matchAndRewrite(vector::LoadOp loadOp,
837 PatternRewriter &rewriter) const override {
838 Location loc = loadOp.getLoc();
839
840 VectorType vecTy = loadOp.getResult().getType();
841 MemRefType memTy = loadOp.getBase().getType();
842 if (failed(storeLoadPreconditions(rewriter, loadOp, vecTy, memTy)))
843 return failure();
844
845 // Boundary check is available only for block instructions.
846 bool boundaryCheck = vecTy.getRank() > 1;
847 // By default, no specific caching policy is assigned.
848 xegpu::CachePolicyAttr hint = nullptr;
849
850 auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
851 rewriter, loc, loadOp.getBase(), getAsOpFoldResult(loadOp.getIndices()),
852 vecTy.getRank());
853
854 auto descType = xegpu::TensorDescType::get(
855 vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1,
856 boundaryCheck, xegpu::MemorySpace::Global);
857
858 xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
859 rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src));
860 auto loadNdOp =
861 xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices,
862 /*packed=*/nullptr, /*transpose=*/nullptr,
863 /*l1_hint=*/hint,
864 /*l2_hint=*/hint, /*l3_hint=*/hint,
865 /*layout=*/nullptr);
866 rewriter.replaceOp(loadOp, loadNdOp);
867
868 return success();
869 }
870};
871
872struct StoreLowering : public OpRewritePattern<vector::StoreOp> {
873 using Base::Base;
874
875 LogicalResult matchAndRewrite(vector::StoreOp storeOp,
876 PatternRewriter &rewriter) const override {
877 Location loc = storeOp.getLoc();
878
879 TypedValue<VectorType> vector = storeOp.getValueToStore();
880 VectorType vecTy = vector.getType();
881 MemRefType memTy = storeOp.getBase().getType();
882 if (failed(storeLoadPreconditions(rewriter, storeOp, vecTy, memTy)))
883 return failure();
884
885 // Boundary check is available only for block instructions.
886 bool boundaryCheck = vecTy.getRank() > 1;
887
888 auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
889 rewriter, loc, storeOp.getBase(),
890 getAsOpFoldResult(storeOp.getIndices()), vecTy.getRank());
891
892 auto descType = xegpu::TensorDescType::get(
893 vecTy.getShape(), vecTy.getElementType(),
894 /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global);
895
896 // By default, no specific caching policy is assigned.
897 xegpu::CachePolicyAttr hint = nullptr;
898 xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
899 rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src));
900
901 auto storeNdOp =
902 xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, indices,
903 /*l1_hint=*/hint,
904 /*l2_hint=*/hint, /*l3_hint=*/hint,
905 /*layout=*/nullptr);
906
907 rewriter.replaceOp(storeOp, storeNdOp);
908
909 return success();
910 }
911};
912
913struct ContractionLowering : public OpRewritePattern<vector::ContractionOp> {
914 using Base::Base;
915
916 LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
917 PatternRewriter &rewriter) const override {
918 Location loc = contractOp.getLoc();
919
920 if (contractOp.getKind() != vector::CombiningKind::ADD)
921 return rewriter.notifyMatchFailure(contractOp,
922 "Expects add combining kind");
923
924 TypedValue<Type> acc = contractOp.getAcc();
925 VectorType accType = dyn_cast<VectorType>(acc.getType());
926 if (!accType || accType.getRank() != 2)
927 return rewriter.notifyMatchFailure(contractOp, "Expects acc 2D vector");
928
929 // Accept only plain 2D data layout.
930 // VNNI packing is applied to DPAS as a separate lowering step.
931 TypedValue<VectorType> lhs = contractOp.getLhs();
932 TypedValue<VectorType> rhs = contractOp.getRhs();
933 if (lhs.getType().getRank() != 2 || rhs.getType().getRank() != 2)
934 return rewriter.notifyMatchFailure(contractOp,
935 "Expects lhs and rhs 2D vectors");
936
937 if (!isRowMajorMatmul(contractOp.getIndexingMapsAttr()))
938 return rewriter.notifyMatchFailure(contractOp, "Invalid indexing maps");
939
940 auto dpasOp = xegpu::DpasOp::create(rewriter, loc,
941 TypeRange{contractOp.getResultType()},
942 ValueRange{lhs, rhs, acc});
943 rewriter.replaceOp(contractOp, dpasOp);
944
945 return success();
946 }
947};
948
949struct ConvertVectorToXeGPUPass
950 : public impl::ConvertVectorToXeGPUBase<ConvertVectorToXeGPUPass> {
951 void runOnOperation() override {
952 RewritePatternSet patterns(&getContext());
955 if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
956 return signalPassFailure();
957 }
958};
959
960} // namespace
961
963 RewritePatternSet &patterns) {
964 patterns
965 .add<TransferReadLowering, TransferWriteLowering, LoadLowering,
966 ScatterLowering, GatherLowering, StoreLowering, ContractionLowering>(
967 patterns.getContext());
968}
return success()
lhs
b getContext())
static std::optional< VectorShape > vectorShape(Type type)
static bool isSharedMemory(MemRefType type)
Return true if this is a shared memory memref type.
Base type for affine expression.
Definition AffineExpr.h:68
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
Definition AffineMap.h:46
MLIRContext * getContext() const
bool isMinorIdentity() const
Returns true if this affine map is a minor identity, i.e.
bool isProjectedPermutation(bool allowZeroInResults=false) const
Returns true if the AffineMap represents a subset (i.e.
ArrayRef< AffineExpr > getResults() const
bool isPermutationOfMinorIdentityWithBroadcasting(SmallVectorImpl< unsigned > &permutedDims) const
Return true if this affine map can be converted to a minor identity with broadcast by doing a permute...
unsigned getNumResults() const
unsigned getNumInputs() const
static AffineMap getPermutationMap(ArrayRef< unsigned > permutation, MLIRContext *context)
Returns an AffineMap representing a permutation.
IntegerType getI64Type()
Definition Builders.cpp:69
IntegerAttr getI64IntegerAttr(int64_t value)
Definition Builders.cpp:116
IntegerType getI1Type()
Definition Builders.cpp:57
MLIRContext * getContext() const
Definition Builders.h:56
IndexType getIndexType()
Definition Builders.cpp:55
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:433
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
virtual void replaceOp(Operation *op, ValueRange newValues)
Replace the results of the given (original) operation with the specified list of values (replacements...
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure,...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition Value.cpp:18
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:369
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
Include the generated interface declarations.
void populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns, bool useNvGpu=false)
Patterns to transform vector ops into a canonical form to convert to MMA matrix operations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition Utils.cpp:307
AffineMap inverseAndBroadcastProjectedPermutation(AffineMap map)
Return the reverse map of a projected permutation where the projected dimensions are transformed into...
SmallVector< T > applyPermutation(ArrayRef< T > input, ArrayRef< int64_t > permutation)
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
void populateVectorToXeGPUConversionPatterns(RewritePatternSet &patterns)
Collect a set of patterns to convert from the vector to XeGPU ops.
llvm::TypeSwitch< T, ResultT > TypeSwitch
Definition LLVM.h:139
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
bool isRowMajorMatmul(ArrayAttr indexingMaps)
Tests whether the given maps describe a row major matmul.
OpRewritePattern is a wrapper around RewritePattern that allows for matching and rewriting against an...