#include "llvm/Support/Casting.h"
#include "llvm/Support/FormatVariadic.h"
// Flatten a set of ValueRange into a single SmallVector<Value>.
SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
  SmallVector<Value> result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}
// If the tensor descriptor carries a subgroup-level layout, compute the
// per-lane (SIMT) vector type it distributes to.
FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
  // Only subgroup-level layouts (lane_layout/lane_data) are distributable.
  if (!layout || !layout.isForSubgroup())
    return failure();

  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  auto laneData = layout.getEffectiveLaneDataAsInt();
  // The subgroup size is the product of the lane_layout entries.
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: scattered tensor descriptors distribute one chunk per lane.
  auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    return VectorType::get({chunkSize}, elementType);
  }

  // Case 2: block tensor descriptors split the flattened tensor evenly.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  tensorSize *= tdescTy.getArrayLength();
  return VectorType::get({tensorSize / sgSize}, elementType);
}
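// Worked example (illustrative shapes, not from the source): a block tdesc
// of shape 8x16 (f16) with lane_layout = [1, 16], lane_data = [1, 1] and
// array_length = 2 gives tensorSize = 8 * 16 * 2 = 256 and sgSize = 16,
// so each lane receives vector<16xf16> (256 / 16 elements).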
// Overload for plain vector types: build a helper TensorDescType that
// carries the layout, then reuse the tdesc-based computation above.
FailureOr<VectorType> mlir::xegpu::getDistributedVectorType(
    VectorType originalType, xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  if (rank < 1 || rank > 3)
    return failure();
  // For 3D vectors the leading dim maps to array_length.
  ArrayRef<int64_t> shape = originalType.getShape();
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true, xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}
// Compute the SIMT-distributed vector type for originalType according to
// the effective lane layout; only the trailing dims are distributed.
FailureOr<VectorType>
xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                       VectorType originalType) {
  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
         "Expecting a valid layout.");
  SmallVector<int64_t> effectiveLaneLayout =
      layout.getEffectiveLaneLayoutAsInt();
  assert(static_cast<size_t>(originalType.getRank()) >=
             effectiveLaneLayout.size() &&
         "Rank of the original vector type should be greater or equal to the "
         "size of the lane layout to distribute the vector type.");
  // Only the trailing effectiveLaneLayout.size() dims get distributed.
  SmallVector<int64_t> distributedShape(originalType.getShape());
  unsigned distributionStart =
      originalType.getRank() - effectiveLaneLayout.size();
  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
    if (i < distributionStart)
      continue;
    // Each distributed dim must divide evenly by its lane-layout entry.
    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
      return failure();
    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
  }
  return VectorType::get(distributedShape, originalType.getElementType());
}
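// Worked example (hypothetical shapes): distributing vector<2x32x16xf32>
// with effective lane layout [2, 8] leaves the leading dim untouched
// (distributionStart = 3 - 2 = 1) and divides the trailing dims:
// [2, 32/2, 16/8] => vector<2x16x2xf32>. A dim that is not a multiple of
// its lane-layout entry makes the helper return failure().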
// Return the attribute name used to attach a temporary DistributeLayoutAttr
// to an OpOperand.
std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

// OpResult overload.
std::string xegpu::getTemporaryLayoutName(const OpResult &result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}
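// For example, assuming an op with at least two operands and one result:
//   getTemporaryLayoutName(op->getOpOperand(1)); // "layout_operand_1"
//   getTemporaryLayoutName(op->getResult(0));    // "layout_result_0"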
// Retrieve the DistributeLayoutAttr associated with a given Value.
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  // Tensor descriptor values carry the layout in their type.
  if (auto tdescTy =
          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast<OpResult>(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");
    // Anchor ops own their layout directly.
    if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
      auto layout = anchorOp.getAnchorLayout();
      if (layout)
        return layout;
    }
    // Otherwise fall back to the temporary layout attribute, if attached.
    std::string layoutName = getTemporaryLayoutName(result);
    if (defOp->hasAttr(layoutName))
      return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
  }

  // For block arguments of loop-like ops, follow the tied loop init value.
  if (auto arg = dyn_cast<BlockArgument>(value)) {
    auto *parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }
  return {};
}
// OpOperand overload: anchor ops (dpas, convert_layout, stores) own layouts
// for specific operands; everything else uses the temporary attribute.
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();
  unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
    // dpas carries one layout per operand: A, B, and C/D.
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
      if (idx == 0)
        return dpasOp.getLayoutAAttr();
      if (idx == 1)
        return dpasOp.getLayoutBAttr();
      if (idx == 2)
        return dpasOp.getLayoutCdAttr();
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op))
      return convertOp.getInputLayoutAttr();
    auto layout = anchorOp.getAnchorLayout();
    // ...
  }
  if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(op)) {
    // ...
  }
  std::string layoutName = getTemporaryLayoutName(opr);
  auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
  // ...
}
// Prefer a permanent layout from the owning op, if it provides one, over
// the inferred candidate.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpResult &result, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;
  if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
    if (auto perm = loadOp.getLayoutAttr())
      candidate = perm;
  }
  // ...
  return candidate;
}
// OpOperand overload of maybePickPermanentLayout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpOperand &operand, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
    if (auto perm = storeOp.getLayoutAttr())
      candidate = perm;
  }
  // ...
  return candidate;
}
// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult;
// users should migrate to setAnchorLayout.
void xegpu::setDistributeLayoutAttr(
    const OpResult &result, const mlir::xegpu::DistributeLayoutAttr layout) {
  Operation *owner = result.getOwner();
  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (anchorOp.getAnchorLayout() == layout)
      return;
    anchorOp.setAnchorLayout(layout);
  }
  // ...
// OpOperand overload: route dpas and convert_layout operands to their
// dedicated layout attributes; everything else goes through the anchor.
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operand.getOwner();
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
      if (idx == 0)
        return dpasOp.setLayoutAAttr(layout);
      if (idx == 1)
        return dpasOp.setLayoutBAttr(layout);
      if (idx == 2)
        return dpasOp.setLayoutCdAttr(layout);
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner))
      return convertOp.setInputLayoutAttr(layout);
    // Stores anchor the layout of their value operand.
    if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
            owner)) {
      if (idx == 0)
        return anchorOp.setAnchorLayout(layout);
      // ...
    }
    anchorOp.setAnchorLayout(layout);
  }
  // ...
}
// Get the temporary distribute layout attribute for non-anchor operands
// and results.
template <typename T, typename>
xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const T &operandOrResult) {
  Operation *op = operandOrResult.getOwner();
  std::string layoutName = getTemporaryLayoutName(operandOrResult);
  auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
  // ...
}

template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<OpOperand>(const OpOperand &operandOrResult);
template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<OpResult>(const OpResult &operandOrResult);
// Set a temporary distribute layout attribute, unless one is already
// present.
template <typename T, typename>
void xegpu::setTemporaryLayout(const T &operandOrResult,
                               const xegpu::DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = getTemporaryLayoutName(operandOrResult);
  if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
    // ...
  }
  // ...
}

template void xegpu::setTemporaryLayout<OpOperand>(
    const OpOperand &operandOrResult,
    const mlir::xegpu::DistributeLayoutAttr layout);
template void xegpu::setTemporaryLayout<OpResult>(
    const OpResult &operandOrResult,
    const mlir::xegpu::DistributeLayoutAttr layout);
// Extract a set of smaller vectors of `shape` from `value` using
// vector.extract_strided_slice ops.
SmallVector<Value> xegpu::extractVectorsWithShapeFromValue(
    OpBuilder &builder, Location loc, Value value, ArrayRef<int64_t> shape) {
  auto vecTy = dyn_cast<VectorType>(value.getType());
  // ...
  ArrayRef<int64_t> srcShape = vecTy.getShape();
  int64_t srcShapeRank = srcShape.size();
  int64_t targetShapeRank = shape.size();
  // Pad the target shape with leading unit dims so the ranks match.
  SmallVector<int64_t> adjustedTargetShape(srcShapeRank);
  int64_t rankDiff = srcShapeRank - targetShapeRank;
  std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
            1);
  llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);
  // ... (loop over tile offsets, elided)
  Value slice = vector::ExtractStridedSliceOp::create(
      builder, loc, value, offsets, adjustedTargetShape, staticStrides);
  // Drop the leading unit dims again if the source rank was larger.
  if (srcShapeRank > targetShapeRank) {
    auto targetTy = VectorType::get(shape, vecTy.getElementType());
    slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
  }
  // ...
}
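// Usage sketch (names hypothetical): splitting a vector<32x64xf32> value
// into vector<16x32xf32> tiles yields four slices, one per tile offset:
//   SmallVector<Value> tiles =
//       xegpu::extractVectorsWithShapeFromValue(builder, loc, val, {16, 32});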
// Assemble a vector of `shape` from equally shaped values using
// vector.insert_strided_slice ops.
Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef<int64_t> shape) {
  VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");
  Type elemTy = inputTy.getElementType();
  // ...
  VectorType resultTy = VectorType::get(shape, elemTy);
  // ...
  for (auto [src, offsets] :
       /* values zipped with their per-tile offsets (elided) */) {
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}
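// Inverse usage sketch (hypothetical shapes): reassembling those four
// vector<16x32xf32> tiles into a single vector<32x64xf32>, with each tile
// inserted at its offset via vector.insert_strided_slice:
//   Value full =
//       xegpu::createVectorWithShapeFromValues(builder, loc, tiles, {32, 64});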
static Value materializeCast(OpBuilder &builder, Type type, ValueRange inputs,
                             Location loc) {
  return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
      .getResult(0);
}

// Do type conversion for SCF structural ops (e.g. scf.for) by mapping every
// VectorType to an equivalent RankedTensorType.
void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  converter.addConversion([](Type type) -> Type { return type; });
  converter.addConversion([](VectorType type) -> Type {
    return RankedTensorType::get(type.getShape(), type.getElementType());
  });
  converter.addSourceMaterialization(materializeCast);
  converter.addTargetMaterialization(materializeCast);

  mlir::ConversionTarget target(*context);
  target.addLegalOp<UnrealizedConversionCastOp>();
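// With this converter only vector values change type, e.g. a loop-carried
// vector<8x16xf32> becomes tensor<8x16xf32>, while index or memref values
// pass through the identity conversion unchanged.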
  // Attach each known vector layout to the corresponding tensor type: every
  // 1:1 cast from a vector to a ranked tensor gets the layout as encoding.
  op->walk([](UnrealizedConversionCastOp castOp) {
    if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
      return WalkResult::advance();

    Value input = castOp.getInputs()[0];
    Value result = castOp.getResults()[0];
    auto inputTy = dyn_cast<VectorType>(input.getType());
    auto resultTy = dyn_cast<RankedTensorType>(result.getType());

    // Only look at ops casting from VectorType to RankedTensorType.
    if (!inputTy || !resultTy)
      return WalkResult::advance();

    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(input);
    // ...
    RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
    result.setType(newTy);
    // ... (for each use of the result, elided)
    if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) { /* ... */ }
    if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
      unsigned idx = use.getOperandNumber();
      // ...
    }
    return WalkResult::advance();
  });

  // Repair scf.yield types that became inconsistent with loop result types.
  op->walk([](scf::YieldOp yieldOp) {
    // ... (for each enclosing-op result r, elided)
    unsigned idx = r.getResultNumber();
    Type resultTy = r.getType();
    Type yieldTy = yieldOp.getResults()[idx].getType();
    if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy) { /* ... */ }
  });
class UnrealizedConversionCastOpPattern
    : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
  using OpConversionPattern<
      mlir::UnrealizedConversionCastOp>::OpConversionPattern;

  mlir::LogicalResult
  matchAndRewrite(mlir::UnrealizedConversionCastOp op, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto inputs = op.getOperands();
    auto outputs = op.getOutputs();

    if (inputs.size() != 1 || outputs.size() != 1)
      return failure();

    auto inputTy = inputs[0].getType();
    auto outputTy = outputs[0].getType();

    // vector -> tensor casts fold away: forward the remapped inputs.
    if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
      rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
      return success();
    }

    // tensor -> vector casts are rebuilt on the remapped inputs.
    if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
      auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                      /* ... */);
      rewriter.replaceOp(op, newOp);
      return success();
    }
    return failure();
  }
};
  // Second stage: legalize the casts themselves.
  converter.addSourceMaterialization(materializeCast);
  converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                         ValueRange inputs, Location loc) {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResults();
  });

  mlir::ConversionTarget target(*context);
  // A cast is legal once no RankedTensorType remains on either side.
  target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
      [](UnrealizedConversionCastOp op) {
        auto isTensorTy = [](Type type) {
          return isa<RankedTensorType>(type);
        };
        return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
               llvm::none_of(op->getResultTypes(), isTensorTy);
      });
  // ...
  patterns.insert<UnrealizedConversionCastOpPattern>(context);
std::optional<std::string> xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();
  // ...
  auto targetAttrs = gpuModuleOp.getTargets();
  for (auto &attr : *targetAttrs) {
    auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
    if (xevmAttr)
      return xevmAttr.getChip().str();
  }
  return std::nullopt;
}
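// Usage sketch (the chip name is hypothetical): given a gpu.module with an
// #xevm.target attribute, the helper yields its chip string:
//   std::optional<std::string> chip = xegpu::getChipStr(op);
//   if (chip && *chip == "pvc") { /* chip-specific lowering */ }
// It returns std::nullopt when no XeVM target attribute is found.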
// Element-wise addition of two equally sized OpFoldResult arrays.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                Location loc,
                                                ArrayRef<OpFoldResult> lhs,
                                                ArrayRef<OpFoldResult> rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  SmallVector<OpFoldResult> results;
  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
  }
  return results;
}

// From addWithRightAligned: right-align the shorter array `b` against the
// tail of the longer array `a` before delegating to addElementwise.
a = a.slice(a.size() - b.size());
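// Worked example (hypothetical operands): with lhs = [a, b, c, d] and
// rhs = [x, y], the shorter array is right-aligned against the longer one,
// producing [a, b, c + x, d + y]; the untouched leading entries of lhs pass
// through unchanged.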
// Find the largest candidate (scaled by each allowed multiple) that evenly
// divides dim; returns -1 when none divides.
template <typename T>
int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates,
                             ArrayRef<T> candidateMultiples) {
  static_assert(std::is_integral<T>::value, "T must be an integer type");
  int largest = -1;
  SmallVector<T> multiples = {1};
  if (!candidateMultiples.empty())
    multiples =
        SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
  for (T candidate : candidates) {
    for (T multiple : multiples) {
      int value = static_cast<int>(candidate * multiple);
      if (value != 0 && dim % value == 0 && value > largest)
        largest = value;
    }
  }
  return largest;
}
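// Worked example: getLargestDivisor(24, {16, 8, 5}) tests each candidate
// (times each multiple, default 1) for divisibility: 24 % 16 != 0 (skip),
// 24 % 8 == 0 (keep), 24 % 5 != 0 (skip), and returns 8. If nothing
// divides, the result stays -1 (per the initializer reconstructed above).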
Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
                               vector::CombiningKind kind, uint32_t size) {
  // Reduce within the lane's own vector, then across lanes via XOR shuffles.
  Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
  for (uint64_t i = 1; i < size; i <<= 1) {
    Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i, size,
                                            gpu::ShuffleMode::XOR)
                         .getShuffleResult();
    laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
  }
  return laneVal;
}
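// The XOR-shuffle loop is a butterfly reduction: for size = 16 the offsets
// are 1, 2, 4, 8, and after log2(size) rounds every lane holds the full
// subgroup-wide result. A sketch for an f32 add-reduction over 16 lanes:
//   Value r = xegpu::subgroupReduction(loc, builder, perLaneVec,
//                                      vector::CombiningKind::ADD, 16);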
// Lower a 2D multi-dim reduction (src, acc) to a sequence of per-slice
// vector.reduction ops.
Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  // Expecting a 2D source vector.
  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
  VectorType sourceType = src.getType();
  int64_t sourceH = sourceType.getShape()[0];
  int64_t sourceW = sourceType.getShape()[1];
  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
  // Create a zero-initialized vector to hold the reduction results.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  // Extract each slice, flatten it to 1D, reduce it together with the
  // matching accumulator element, and insert the scalar into the result.
  for (int i = 0; i < nSlices; ++i) {
    SmallVector<int64_t> sliceOffsets, sliceSizes;
    if (reductionDim == 1) {
      sliceOffsets = {i, 0};
      sliceSizes = {1, sourceW};
    } else {
      sliceOffsets = {0, i};
      sliceSizes = {sourceH, 1};
    }
    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, /*strides=*/{1, 1});
    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult =
        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
  }
  return reductionResult;
}
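// Shape sketch (hypothetical types): reducing a vector<8x16xf32> src along
// dim 1 with a vector<8xf32> acc extracts eight 1x16 row slices, shape-casts
// each to vector<16xf32>, reduces it against the matching acc element, and
// inserts the resulting scalar into the vector<8xf32> result.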
bool xegpu::requirePacked(const LayoutAttr layout) {
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return false;
  return laneData[0] != 1;
}
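// Example: lane_data = [2, 1] (each lane owning two elements along dim 0)
// reports packed; lane_data = [1, 2] or any non-2D lane_data does not.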
// From requireTranspose: only 2D lane layouts can require a transpose.
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return false;
bool xegpu::matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
                                  SmallVector<int64_t> &expandedUnitDims) {
  size_t srcIdx = 0;
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
    if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
      ++srcIdx;
    else if (dst[dstIdx] == 1)
      expandedUnitDims.push_back(dstIdx);
    else
      return false;
  }
  return srcIdx == src.size();
}
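// Worked example: src = [4, 8], dst = [4, 1, 8] matches with
// expandedUnitDims = {1}; src = [4, 8], dst = [4, 2, 8] fails because 2 is
// neither the next source dim nor a unit dim.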
bool xegpu::matchSplitDimExpansion(
    ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
    SmallVector<SmallVector<int64_t>> &splitDimGroups) {
  splitDimGroups.clear();
  size_t srcIdx = 0;
  int64_t accumulatedSize = 1;
  SmallVector<int64_t> currentDstDims;
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
    if (srcIdx >= src.size())
      return false;
    accumulatedSize *= dst[dstIdx];
    currentDstDims.push_back(dstIdx);
    // A group is complete once the product matches the next source dim.
    if (accumulatedSize == src[srcIdx]) {
      splitDimGroups.push_back(currentDstDims);
      ++srcIdx;
      accumulatedSize = 1;
      currentDstDims.clear();
    } else if (accumulatedSize > src[srcIdx]) {
      return false;
    }
  }
  return srcIdx == src.size();
}
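// Worked example: src = [32, 16], dst = [4, 8, 16] matches with
// splitDimGroups = {{0, 1}, {2}} since 4 * 8 = 32 and 16 == 16. A dst such
// as [4, 16, 8] fails as soon as the running product 4 * 16 overshoots 32.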