25#include "llvm/Support/Casting.h"
26#include "llvm/Support/FormatVariadic.h"
35 for (
const auto &vals : values)
36 llvm::append_range(
result, vals);
42 auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
45 if (!layout || !layout.isForSubgroup())
50 auto tdescShape = tdescTy.getShape();
51 auto elementType = tdescTy.getElementType();
56 int64_t sgSize = llvm::product_of(laneLayout);
60 for (
auto [tdescDim, laneDim, laneDataDim] :
61 llvm::zip_equal(tdescShape, laneLayout, laneData)) {
62 assert((tdescDim % (laneDim * laneDataDim) == 0) &&
63 "tensor descriptor shape is not distributable");
64 tensorSize *= tdescDim;
67 tensorSize *= tdescTy.getArrayLength();
69 return VectorType::get({tensorSize / sgSize}, elementType);
74 xegpu::LayoutAttr layout) {
75 int64_t rank = originalType.getRank();
82 while (
shape.size() > 2) {
83 arrayLength *=
shape[0];
88 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
89 auto laneData = layout.getEffectiveLaneDataAsInt();
90 while (!laneLayout.empty() && laneLayout.size() >
shape.size()) {
91 laneLayout.erase(laneLayout.begin());
92 laneData.erase(laneData.begin());
94 auto trimmedLayout = xegpu::LayoutAttr::get(
98 auto helperTdescTy = xegpu::TensorDescType::get(
99 shape, originalType.getElementType(), arrayLength,
101 xegpu::MemorySpace::Global, trimmedLayout);
107 VectorType originalType) {
110 assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
111 "Expecting a valid layout.");
113 int64_t vectorRank = originalType.getRank();
114 int64_t layoutRank = layout.getRank();
115 assert(vectorRank >= layoutRank &&
"Vector rank must be >= layout rank.");
119 int64_t offset = vectorRank - layoutRank;
123 auto distributedShapeOrFailure =
124 layout.computeDistributedShape(trailingShape);
125 if (
failed(distributedShapeOrFailure))
129 fullShape.begin() + offset);
130 resultShape.append(distributedShapeOrFailure->begin(),
131 distributedShapeOrFailure->end());
132 return VectorType::get(resultShape, originalType.getElementType());
136 const StringRef prefix(
"layout_operand_");
137 unsigned idx =
const_cast<OpOperand &
>(operand).getOperandNumber();
138 return llvm::formatv(
"{0}{1}", prefix, idx).str();
142 const StringRef prefix =
"layout_result_";
143 return llvm::formatv(
"{0}{1}", prefix,
result.getResultNumber()).str();
150 if (
auto result = dyn_cast<OpResult>(value)) {
152 assert(defOp &&
"result must have a defining op");
154 if (
auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
155 auto layout = anchorOp.getAnchorLayout();
160 if (defOp->
hasAttr(layoutName)) {
162 defOp->
getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
167 if (
auto arg = dyn_cast<BlockArgument>(value)) {
168 auto *parentOp = arg.getOwner()->getParentOp();
169 if (
auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
170 OpOperand *tiedInit = loop.getTiedLoopInit(arg);
177 dyn_cast_if_present<xegpu::TensorDescType>(value.
getType()))
178 return tdescTy.getLayoutAttr();
182xegpu::DistributeLayoutAttr
185 unsigned idx =
const_cast<OpOperand &
>(opr).getOperandNumber();
187 if (
auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
188 if (
auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
190 return dpasOp.getLayoutAAttr();
191 }
else if (idx == 1) {
192 return dpasOp.getLayoutBAttr();
193 }
else if (idx == 2) {
194 return dpasOp.getLayoutCdAttr();
197 if (
auto dpasMxOp = dyn_cast<xegpu::DpasMxOp>(op)) {
200 unsigned currentIdx = 0;
202 if (idx == currentIdx++)
203 return dpasMxOp.getLayoutAAttr();
205 if (idx == currentIdx++)
206 return dpasMxOp.getLayoutBAttr();
208 if (dpasMxOp.getAcc())
209 if (idx == currentIdx++)
210 return dpasMxOp.getLayoutCdAttr();
212 if (dpasMxOp.getScaleA())
213 if (idx == currentIdx++)
214 return dpasMxOp.getLayoutAScaleAttr();
216 if (dpasMxOp.getScaleB())
217 if (idx == currentIdx++)
218 return dpasMxOp.getLayoutBScaleAttr();
222 if (
auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
223 return convertOp.getInputLayoutAttr();
225 auto layout = anchorOp.getAnchorLayout();
232 if (isa<xegpu::StoreNdOp, xegpu::StoreMatrixOp>(op) && (idx < 2))
235 if (isa<xegpu::StoreScatterOp>(op)) {
236 xegpu::StoreScatterOp store(op);
237 int chunkSize = store.getChunkSize().value_or(1);
238 if (layout && idx >= 2 && chunkSize > 1)
239 return layout.dropDims(llvm::to_vector(
240 llvm::seq<int64_t>(layout.getRank() - 1, layout.getRank())));
243 if (isa<xegpu::LoadGatherOp>(op)) {
244 xegpu::LoadGatherOp
load(op);
245 int chunkSize =
load.getChunkSize().value_or(1);
246 if (layout && idx >= 1 && chunkSize > 1)
247 return layout.dropDims(llvm::to_vector(
248 llvm::seq<int64_t>(layout.getRank() - 1, layout.getRank())));
255 auto layout = op->
getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
264xegpu::DistributeLayoutAttr
267 const std::string &name) {
268 xegpu::DistributeLayoutAttr candidate = layout;
270 if (
auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
271 if (
auto perm = loadOp.getLayoutAttr())
280xegpu::DistributeLayoutAttr
283 const std::string &name) {
284 xegpu::DistributeLayoutAttr candidate = layout;
285 unsigned idx =
const_cast<OpOperand &
>(operand).getOperandNumber();
287 if (
auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
289 if (
auto perm = storeOp.getLayoutAttr())
301 const mlir::xegpu::DistributeLayoutAttr layout) {
304 if (
auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
305 if (anchorOp.getAnchorLayout() == layout)
307 anchorOp.setAnchorLayout(layout);
323 const DistributeLayoutAttr layout) {
325 unsigned idx =
const_cast<OpOperand &
>(operand).getOperandNumber();
330 if (
auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
331 if (
auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
333 return dpasOp.setLayoutAAttr(layout);
334 }
else if (idx == 1) {
335 return dpasOp.setLayoutBAttr(layout);
336 }
else if (idx == 2) {
337 return dpasOp.setLayoutCdAttr(layout);
340 if (
auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
341 return convertOp.setInputLayoutAttr(layout);
347 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
350 anchorOp.setAnchorLayout(layout);
354 anchorOp.setAnchorLayout(layout);
368template <
typename T,
typename>
369xegpu::DistributeLayoutAttr
371 Operation *op = operandOrResult.getOwner();
375 auto layout = op->
getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
382template xegpu::DistributeLayoutAttr
384template xegpu::DistributeLayoutAttr
387template <
typename T,
typename>
389 const xegpu::DistributeLayoutAttr layout) {
390 Operation *owner = operandOrResult.getOwner();
392 if (owner->
hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
402 const mlir::xegpu::DistributeLayoutAttr layout);
406 const mlir::xegpu::DistributeLayoutAttr layout);
411 auto vecTy = dyn_cast<VectorType>(value.
getType());
419 int64_t srcShapeRank = srcShape.size();
423 int64_t rankDiff = srcShapeRank - targetShapeRank;
424 std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
426 llvm::copy(
shape, adjustedTargetShape.begin() + rankDiff);
432 Value slice = vector::ExtractStridedSliceOp::create(
433 builder, loc, value, offsets, adjustedTargetShape, staticStrides);
436 if (srcShapeRank > targetShapeRank) {
437 auto targetTy = VectorType::get(
shape, vecTy.getElementType());
438 slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
449 VectorType inputTy = dyn_cast<VectorType>(values[0].
getType());
450 assert(llvm::all_of(values.
getTypes(),
451 [&](
Type type) { return type == inputTy; }) &&
452 "values must be of the same VectorType");
454 Type elemTy = inputTy.getElementType();
457 VectorType resultTy = VectorType::get(
shape, elemTy);
462 for (
auto [src, offsets] :
465 result = vector::InsertStridedSliceOp::create(builder, loc, src,
result,
466 offsets, staticStrides);
477 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
483 converter.addConversion([](
Type type) ->
Type {
return type; });
484 converter.addConversion([](VectorType type) ->
Type {
485 return RankedTensorType::get(type.getShape(), type.getElementType());
487 converter.addSourceMaterialization(materializeCast);
488 converter.addTargetMaterialization(materializeCast);
490 mlir::ConversionTarget
target(*context);
491 target.addLegalOp<UnrealizedConversionCastOp>();
496 (
void)mlir::applyPartialConversion(op,
target, std::move(patterns));
502 op->
walk([](UnrealizedConversionCastOp castOp) {
503 if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
506 Value input = castOp.getInputs()[0];
508 auto inputTy = dyn_cast<VectorType>(input.
getType());
509 auto resultTy = dyn_cast<RankedTensorType>(
result.getType());
512 if (!inputTy || !resultTy)
515 xegpu::DistributeLayoutAttr layout =
520 RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
525 if (
auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
531 if (
auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
532 unsigned idx = use.getOperandNumber();
541 op->
walk([](scf::YieldOp yieldOp) {
544 unsigned idx = r.getResultNumber();
545 Type resultTy = r.getType();
546 Type yieldTy = yieldOp.getResults()[idx].getType();
547 if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
560 class UnrealizedConversionCastOpPattern
561 :
public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
562 using OpConversionPattern<
563 mlir::UnrealizedConversionCastOp>::OpConversionPattern;
566 matchAndRewrite(mlir::UnrealizedConversionCastOp op,
568 ConversionPatternRewriter &rewriter)
const override {
569 auto inputs = op.getOperands();
570 auto outputs = op.getOutputs();
572 if (inputs.size() != 1 || outputs.size() != 1)
575 auto inputTy = inputs[0].getType();
576 auto outputTy = outputs[0].getType();
578 if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
579 rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
583 if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
585 auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
587 rewriter.replaceOp(op, newOp);
594 converter.addSourceMaterialization(materializeCast);
597 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
601 mlir::ConversionTarget
target(*context);
602 target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
603 [](UnrealizedConversionCastOp op) {
604 auto isTensorTy = [](
Type type) {
605 return isa<RankedTensorType>(type);
611 patterns.insert<UnrealizedConversionCastOpPattern>(context);
614 (
void)mlir::applyPartialConversion(op,
target, std::move(patterns));
624 auto targetAttrs = gpuModuleOp.getTargets();
626 for (
auto &attr : *targetAttrs) {
627 auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
629 return xevmAttr.getChip().str();
641 assert(
lhs.size() ==
rhs.size() &&
"lhs and rhs must have the same size");
643 for (
auto [l, r] : llvm::zip_equal(
lhs,
rhs)) {
646 results.push_back(builder.
createOrFold<arith::AddIOp>(loc, lval, rval));
669 a = a.slice(a.size() -
b.size());
677 static_assert(std::is_integral<T>::value,
"T must be an integer type");
680 if (!candidateMultiples.empty())
682 SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
683 for (T candidate : candidates) {
684 for (T multiple : multiples) {
685 int value =
static_cast<int>(candidate * multiple);
686 if (value != 0 && dim % value == 0 && value > largest)
694 vector::CombiningKind kind, uint32_t size) {
696 Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
698 for (uint64_t i = 1; i < size; i <<= 1) {
700 gpu::ShuffleOp::create(builder, loc, laneVal, i, size,
701 gpu::ShuffleMode::XOR)
703 laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
710 vector::CombiningKind kind,
713 VectorType sourceType = src.
getType();
714 int64_t sourceRank = sourceType.getRank();
717 assert(sourceRank >= 2 &&
"expected at least a 2D source vector");
718 for (
int64_t i = 0; i < sourceRank - 2; ++i)
719 assert(sourceType.getShape()[i] == 1 &&
720 "expected leading dimensions to be unit");
721 int64_t rowIdx = sourceRank - 2;
722 int64_t columnIdx = sourceRank - 1;
723 int64_t sourceH = sourceType.getShape()[rowIdx];
724 int64_t sourceW = sourceType.getShape()[columnIdx];
725 int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
727 TypedAttr zeroAttr = rewriter.
getZeroAttr(sourceType.getElementType());
728 Value reductionResult = arith::ConstantOp::create(
729 rewriter, loc,
acc.getType(),
738 for (
int i = 0; i < nSlices; ++i) {
744 if (reductionDim == columnIdx) {
745 sliceOffsets[rowIdx] = i;
746 sliceSizes[columnIdx] = sourceW;
748 sliceOffsets[columnIdx] = i;
749 sliceSizes[rowIdx] = sourceH;
752 vector::ExtractStridedSliceOp extractOp =
753 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
754 sliceSizes, strides);
758 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
760 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
762 VectorType::get({nSliceElements}, sourceType.getElementType()),
763 extractOp.getResult());
773 accIdx[accRank - 1] = i;
774 Value accExtract = vector::ExtractOp::create(rewriter, loc,
acc, accIdx);
775 Value reduction = vector::ReductionOp::create(
776 rewriter, loc, kind, slice.getResult(), accExtract);
777 reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
778 reductionResult, accIdx);
782 return reductionResult;
787 vector::CombiningKind kind,
int64_t reductionDim,
int64_t reductionSize,
789 VectorType sourceType = src.
getType();
790 int64_t sourceRank = sourceType.getRank();
793 assert(sourceRank >= 2 &&
"expected at least a 2D source vector");
794 for (
int64_t i = 0; i < sourceRank - 2; ++i)
795 assert(sourceType.getShape()[i] == 1 &&
796 "expected leading dimensions to be unit");
797 int64_t rowIdx = sourceRank - 2;
798 int64_t columnIdx = sourceRank - 1;
799 int64_t sourceH = sourceType.getShape()[rowIdx];
800 int64_t sourceW = sourceType.getShape()[columnIdx];
803 TypedAttr zeroAttr = rewriter.
getZeroAttr(sourceType.getElementType());
804 Value reductionResult = arith::ConstantOp::create(
805 rewriter, loc,
acc.getType(),
812 int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
817 for (
int i = 0; i < nSlices; ++i) {
823 if (reductionDim == columnIdx) {
824 sliceOffsets[rowIdx] = i;
825 sliceSizes[columnIdx] = sourceW;
827 sliceOffsets[columnIdx] = i;
828 sliceSizes[rowIdx] = sourceH;
831 vector::ExtractStridedSliceOp extractOp =
832 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
833 sliceSizes, strides);
834 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
835 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
837 VectorType::get({nSliceElements}, sourceType.getElementType()),
838 extractOp.getResult());
841 accIdx[accRank - 1] = i;
842 Value accExtract = vector::ExtractOp::create(rewriter, loc,
acc, accIdx);
847 reductionResult = vector::InsertOp::create(rewriter, loc, fullReduce,
848 reductionResult, accIdx);
850 return reductionResult;
855 vector::CombiningKind kind) {
856 auto vecTy = dyn_cast<VectorType>(type);
857 Type elemTy = vecTy ? vecTy.getElementType() : type;
862 return arith::ConstantOp::create(
864 return arith::ConstantOp::create(builder, loc, cast<TypedAttr>(scalarAttr));
868 case vector::CombiningKind::ADD:
869 case vector::CombiningKind::XOR:
870 case vector::CombiningKind::OR:
871 case vector::CombiningKind::MAXUI:
874 case vector::CombiningKind::MUL:
875 case vector::CombiningKind::AND:
878 case vector::CombiningKind::MINSI:
879 if (
auto intTy = dyn_cast<IntegerType>(elemTy))
881 elemTy, APInt::getSignedMaxValue(intTy.getWidth())));
884 case vector::CombiningKind::MINUI:
885 if (
auto intTy = dyn_cast<IntegerType>(elemTy))
887 builder.
getIntegerAttr(elemTy, APInt::getMaxValue(intTy.getWidth())));
890 case vector::CombiningKind::MAXSI:
891 if (
auto intTy = dyn_cast<IntegerType>(elemTy))
893 elemTy, APInt::getSignedMinValue(intTy.getWidth())));
896 case vector::CombiningKind::MINNUMF:
897 case vector::CombiningKind::MINIMUMF:
898 if (
auto floatTy = dyn_cast<FloatType>(elemTy))
900 elemTy, APFloat::getInf(floatTy.getFloatSemantics())));
903 case vector::CombiningKind::MAXNUMF:
904 case vector::CombiningKind::MAXIMUMF:
905 if (
auto floatTy = dyn_cast<FloatType>(elemTy))
907 elemTy, APFloat::getInf(floatTy.getFloatSemantics(),
true)));
923 auto laneData = layout.getEffectiveLaneDataAsInt();
924 if (laneData.size() != 2)
926 return laneData[0] != 1;
939 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
940 if (laneLayout.size() != 2)
955 for (
size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
956 if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
958 else if (dst[dstIdx] == 1)
959 expandedUnitDims.push_back(dstIdx);
962 return srcIdx == src.size();
979 splitDimGroups.clear();
980 for (
size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
981 if (srcIdx >= src.size())
983 accumulatedSize *= dst[dstIdx];
984 currentDstDims.push_back(dstIdx);
986 if (accumulatedSize == src[srcIdx]) {
989 if (srcIdx == src.size() - 1) {
990 while (++dstIdx < dst.size() && dst[dstIdx] == 1)
991 currentDstDims.push_back(dstIdx);
994 splitDimGroups.push_back(currentDstDims);
998 currentDstDims.clear();
999 }
else if (accumulatedSize > src[srcIdx]) {
1003 return srcIdx == src.size();
xegpu::DistributeLayoutAttr maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, const OpResult &result, mlir::Operation *owner, const std::string &name)
Attributes are known-constant values of operations.
This class represents an argument of a Block.
IntegerAttr getIntegerAttr(Type type, int64_t value)
FloatAttr getFloatAttr(Type type, double value)
TypedAttr getZeroAttr(Type type)
TypedAttr getOneAttr(Type type)
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
This class represents an operand of an operation.
This is a value defined by a result of an operation.
Operation is the basic unit of execution within MLIR.
AttrClass getAttrOfType(StringAttr name)
bool hasAttrOfType(NameT &&name)
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
operand_type_range getOperandTypes()
result_type_range getResultTypes()
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getOpResults()
MLIRContext * getContext()
Return the context this operation is associated with.
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
This class provides an abstraction over the various different ranges of value types.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class provides an abstraction over the different types of ranges over Values.
type_range getTypes() const
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
void setType(Type newType)
Mutate the type of this Value to be of the specified type.
Type getType() const
Return the type of this value.
static WalkResult advance()
Operation * getOwner() const
Return the owner of this operand.
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of the given shape from a set of values using vector.insert_strided_slice.
bool requirePacked(const DistributeLayoutAttr layout)
Helper function to check if the layout is packed.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given the src and acc arguments from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
bool requireTranspose(const DistributeLayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter)
Do type conversion for SCF structural ops, e.g., scf.for using SCF structure type conversion patterns...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
SmallVector< Value > flattenValues(ArrayRef< ValueRange > values)
Flatten a set of ValueRange into a single SmallVector<Value>
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
SmallVector< OpFoldResult > addElementwise(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with same length.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
StringRef getName() const