27#include "llvm/Support/FormatVariadic.h"
50 out.reserve(attrs.size());
52 for (
auto attr : attrs) {
53 if (
auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
54 auto newLayout = dist.dropSgLayoutAndData();
56 out.emplace_back(attr.getName(), newLayout);
68 out.reserve(attrs.size());
70 for (
auto attr : attrs) {
71 if (
auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
72 auto newLayout = dist.dropInstData();
74 out.emplace_back(attr.getName(), newLayout);
90 if (!isa<VectorType>(operand.get().getType()))
94 if (isa<BlockArgument>(operand.get()))
98 op->
emitWarning(
"Could not find layout attribute for operand ")
99 << operand.getOperandNumber() <<
" of operation " << op->
getName();
106 return !
result.wasInterrupted();
109template <
typename T,
typename>
111 Operation *owner = operandOrResult.getOwner();
129 for (
auto namedAttr : nestOp->
getAttrs()) {
130 if (isa<DistributeLayoutAttr>(namedAttr.getValue()))
131 attrsToRemove.push_back(namedAttr.getName());
133 for (
auto attrName : attrsToRemove)
140xegpu::DistributeLayoutAttr
146 size_t dimDiff = resShape.size() - srcShape.size();
147 auto bcastSourceLayout = resLayout;
148 for (
size_t i = dimDiff; i < resShape.size(); i++) {
149 if ((srcShape[i - dimDiff] == 1) && (resShape[i] != 1))
150 bcastDims.push_back(i);
155 if (!bcastDims.empty())
156 bcastSourceLayout = bcastSourceLayout.setUnitDimData(bcastDims);
160 for (
size_t i = 0; i < dimDiff; i++)
161 sliceDims.push_back(i);
162 bcastSourceLayout = xegpu::SliceAttr::get(
163 resLayout.getContext(), bcastSourceLayout,
166 return bcastSourceLayout;
171xegpu::DistributeLayoutAttr
175 assert(isa<xegpu::SliceAttr>(resLayout) &&
176 "reduction result layout must be slice layout");
178 xegpu::SliceAttr sliceLayout = dyn_cast<xegpu::SliceAttr>(resLayout);
180 assert((reduceDims == sliceLayout.getDims().asArrayRef()) &&
181 "reduction dims must match with slice dims");
183 return sliceLayout.getParent();
188xegpu::DistributeLayoutAttr
191 return resLayout.transposeDims(permutation);
197xegpu::DistributeLayoutAttr
199 int resElemTyBitWidth,
int srcElemTyBitWidth) {
204 size_t sgDataSize = sgData.size();
205 size_t instDataSize = instData.size();
206 size_t laneDataSize = laneData.size();
210 int64_t dim = resLayout.getRank() - 1;
212 if (srcElemTyBitWidth <= resElemTyBitWidth) {
213 int bitWidthRatio = resElemTyBitWidth / srcElemTyBitWidth;
215 sgDataValue = sgData.back() * bitWidthRatio;
217 instDataValue = instData.back() * bitWidthRatio;
219 laneDataValue = laneData.back() * bitWidthRatio;
221 int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
223 assert((sgData.back() % bitWidthRatio) == 0 &&
224 "sgData not divisible by bitWidthRatio");
225 sgDataValue = sgData.back() / bitWidthRatio;
228 assert((instData.back() % bitWidthRatio) == 0 &&
229 "instData not divisible by bitWidthRatio");
230 instDataValue = instData.back() / bitWidthRatio;
233 assert((laneData.back() % bitWidthRatio) == 0 &&
234 "laneData not divisible by bitWidthRatio");
235 laneDataValue = laneData.back() / bitWidthRatio;
239 xegpu::DistributeLayoutAttr finalSrcLayout;
241 resLayout.setDimData(dim, sgDataValue, instDataValue, laneDataValue);
243 return finalSrcLayout;
253 int srcShapeSize = srcShape.size();
254 int resShapeSize = resShape.size();
255 int dimDiff = resShapeSize - srcShapeSize;
260 auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
261 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
262 for (
int i = 0; i < dimDiff; i++) {
263 assert((resSgLayout.size() == 0 || resSgLayout[i] == 1) &&
264 (resLaneLayout.size() == 0 || resLaneLayout[i] == 1) &&
265 "Leading dimensions being sliced off must not be distributed");
267 return resLayout.dropDims(llvm::to_vector(llvm::seq<int64_t>(0, dimDiff)));
274xegpu::DistributeLayoutAttr
298 xegpu::SliceAttr::get(resLayout.getContext(), resLayout, sliceDimsAttr);
305 auto srcLayout = resLayout;
306 for (
const auto &dimGroup : splitDimGroups)
307 srcLayout = srcLayout.collapseDims(dimGroup);
316 if ((dst.size() != 2) && (dst.size() != 1))
318 int64_t srcSize = std::accumulate(src.begin(), src.end(), 1LL,
319 std::multiplies<int64_t>());
321 return (dst[0] == srcSize);
322 return (dst[0] == 1) && (dst[1] == srcSize);
325 if (matchCollapseToInnermostDim(srcShape, resShape)) {
326 int srcShapeSize = srcShape.size();
327 int resShapeSize = resShape.size();
328 auto context = resLayout.getContext();
329 auto resInstData = resLayout.getEffectiveInstDataAsInt();
330 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
331 auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
347 if (resInstData.size() != 0) {
349 for (
int i = 0; i < resShapeSize - 1; i++) {
350 assert(resInstData[i] == 1 &&
351 "only innermost dim can have non-unit instData");
354 inferredInstData[srcShapeSize - 1] =
355 std::min(resInstData[resShapeSize - 1], srcShape[srcShapeSize - 1]);
356 return xegpu::LayoutAttr::get(context, inferredInstData);
359 if (resLaneLayout.size() != 0) {
360 for (
int i = 0; i < resShapeSize - 1; i++) {
361 assert(resLaneData[i] == 1 &&
362 "only innermost dim can have non-unit instData");
364 assert(srcShape.back() % resLaneLayout.back() == 0 &&
365 "source innermost dim must be >= result lane layout");
368 inferredLaneLayout.back() = resLaneLayout.back();
369 inferredLaneData.back() = std::min(
370 resLaneData.back(), srcShape.back() / inferredLaneLayout.back());
371 return xegpu::LayoutAttr::get(context, inferredLaneLayout,
375 llvm_unreachable(
"running into unsupported shape cast scenarios");
445 auto srcShape = srcVecTy.getShape();
446 int srcRank = srcShape.size();
447 auto context = consumerLayout.getContext();
459 const int workgroupSize = consumerLayout.getNumSubgroups();
460 const int subgroupSize =
uArch->getSubgroupSize();
461 int64_t maxReduceVectorSize = 1;
464 consumerLayout.getEffectiveSgLayoutAsInt();
466 consumerLayout.getEffectiveLaneLayoutAsInt();
470 xegpu::DistributeLayoutAttr srcLayout;
472 xegpu::SliceAttr consumerSliceLayout =
473 dyn_cast<xegpu::SliceAttr>(consumerLayout);
474 if (consumerSliceLayout &&
475 consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
476 srcLayout = consumerSliceLayout.getParent();
478 srcLayout.getEffectiveSgLayoutAsInt();
481 for (
int dim = 0; dim < srcRank; dim++) {
482 srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
487 int remainingSgCount = workgroupSize;
491 for (
int i = 0; i < srcRank; i++) {
492 if (!llvm::is_contained(reductionDims, i) &&
493 consumerIdx <
static_cast<int>(consumerSgLayout.size())) {
494 sgLayout[i] = consumerSgLayout[consumerIdx];
495 assert((srcShape[i] % sgLayout[i] == 0) &&
496 "source shape not divisible by consumer sg_layout");
497 sgData[i] = srcShape[i] / sgLayout[i];
498 remainingSgCount /= sgLayout[i];
499 order[i] = consumerOrder[consumerIdx];
505 int64_t remainOrder = consumerSgLayout.size();
506 for (
int i = 0; i < srcRank; i++) {
507 if (llvm::is_contained(reductionDims, i)) {
509 std::min(srcShape[i],
static_cast<int64_t>(remainingSgCount));
510 assert((srcShape[i] % sgLayout[i] == 0) &&
511 "source shape not divisible by sg_layout");
512 sgData[i] = srcShape[i] / sgLayout[i];
513 remainingSgCount /= sgLayout[i];
514 order[i] = remainOrder++;
518 assert(remainingSgCount == 1 &&
"not all subgroups distributed");
519 srcLayout = xegpu::LayoutAttr::get(
520 context, toInt32Attr(sgLayout), toInt32Attr(sgData),
523 (!orderAttr || orderAttr.empty()) ?
nullptr : toInt32Attr(order));
528 instData[srcRank - 2] =
529 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
530 instData[srcRank - 1] =
531 std::min(
static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
532 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
537 laneLayout[srcRank - 1] =
538 std::min(
static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
539 laneData[srcRank - 2] =
540 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
541 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
542 toInt32Attr(laneData));
545 return xegpu::SliceAttr::get(context, srcLayout,
575 int srcElemTyBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
576 int resElemTyBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
582 assert(consumerLayout.getRank() ==
static_cast<int64_t>(srcShape.size()) &&
583 "laneData must be available for all dimensions");
584 size_t dim = srcShape.size() - 1;
588 const int subgroupSize =
uArch->getSubgroupSize();
590 if (srcElemTyBitWidth > resElemTyBitWidth) {
594 int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
595 int innermostDimLaneLayout = subgroupSize;
597 sgDataValue = sgData[dim];
599 instDataValue = instData[dim];
602 while ((instDataValue <= srcShape[dim]) &&
603 (instDataValue % (innermostDimLaneLayout * bitWidthRatio) != 0))
605 assert((srcShape[dim] % instDataValue) == 0 &&
606 "srcShape, instData, and lanelayout for innermost must be 2^n !");
608 laneDataValue = laneData[dim];
609 while ((laneDataValue <= srcShape[dim]) &&
610 (laneDataValue % bitWidthRatio != 0))
614 xegpu::DistributeLayoutAttr resLayout;
615 resLayout = consumerLayout.setDimData(dim, sgDataValue, instDataValue,
619 return consumerLayout;
627 VectorType resVectorTy, xegpu::DistributeLayoutAttr consumerLayout,
630 xegpu::DistributeLayoutAttr requiredResLayout;
632 consumerLayout.getEffectiveInstDataAsInt();
634 consumerLayout.getEffectiveLaneDataAsInt();
636 consumerLayout.getEffectiveLaneLayoutAsInt();
641 requiredResLayout = consumerLayout;
642 int srcRank = srcShape.size();
646 "subgroup layout assignment not supported for insertStridedSlice.");
648 for (
int dim = 0; dim < srcRank; dim++) {
649 instDataValue = std::min(srcShape[dim], consumerInstData[dim]);
651 requiredResLayout.setDimData(dim, -1, instDataValue, -1);
654 for (
int dim = 0; dim < srcRank; dim++) {
655 assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
656 "srcShape must be divisible by laneLayout for all dimensions");
657 laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
658 consumerLaneData[dim]);
661 requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
664 return requiredResLayout;
681 xegpu::DistributeLayoutAttr consumerLayout,
bool isChunkedLoad,
685 return consumerLayout;
688 consumerLayout.getEffectiveInstDataAsInt();
690 consumerLayout.getEffectiveLaneDataAsInt();
696 if (!isChunkedLoad) {
698 instData.back() = std::min(
static_cast<int>(consumerInstData.back()),
699 maxChunkSize * subgroupSize);
700 return xegpu::LayoutAttr::get(context, instData);
703 std::min(
static_cast<int>(consumerLaneData.back()), maxChunkSize);
704 laneLayout.back() = std::min(
static_cast<int64_t>(subgroupSize),
705 resShape.back() / laneData.back());
706 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
709 assert(resShape.size() == 2 &&
"Chunked Store must access 2D tensor tile.");
711 instData[0] = subgroupSize;
713 std::min(
static_cast<int>(consumerInstData[1]), maxChunkSize);
714 return xegpu::LayoutAttr::get(context, instData);
716 laneLayout[0] = subgroupSize;
718 std::min(
static_cast<int>(consumerLaneData[1]), maxChunkSize);
719 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
732 auto context = resVecTy.getContext();
733 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
735 const auto *uArchInstruction =
736 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
738 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
741 (chunkSize > 1), maxChunkSize, resShape,
747xegpu::DistributeLayoutAttr
750 xegpu::DistributeLayoutAttr consumerLayout,
755 auto context = resVecTy.getContext();
756 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
758 const auto *uArchInstruction =
759 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
761 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
763 false, maxChunkSize, resShape,
778static xegpu::DistributeLayoutAttr
784 int srcShapeSize = srcShape.size();
791 "subgroup layout assignment not supported for storeScatter.");
795 if (!isChunkedStore) {
797 instData[srcShapeSize - 1] =
798 std::min(subgroupSize,
static_cast<int>(srcShape.back()));
799 return xegpu::LayoutAttr::get(context, instData);
801 laneLayout[srcShapeSize - 1] =
802 std::min(subgroupSize,
static_cast<int>(srcShape.back()));
803 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
806 assert(srcShapeSize == 2 &&
"Chunked Store must access 2D tensor tile.");
808 instData[0] = subgroupSize;
809 instData[1] = std::min(
static_cast<int>(srcShape[1]), maxChunkSize);
810 return xegpu::LayoutAttr::get(context, instData);
812 laneLayout[0] = subgroupSize;
813 laneData[1] = std::min(
static_cast<int>(srcShape[1]), maxChunkSize);
814 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
821xegpu::DistributeLayoutAttr
823 VectorType srcVecTy,
int chunkSize,
826 const int subgroupSize =
uArch->getSubgroupSize();
828 auto context = srcVecTy.getContext();
829 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
831 const auto *uArchInstruction =
832 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
834 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
836 maxChunkSize, srcShape, subgroupSize);
840xegpu::DistributeLayoutAttr
845 const int subgroupSize =
uArch->getSubgroupSize();
847 auto context = srcVecTy.getContext();
848 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
850 const auto *uArchInstruction =
851 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
853 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
856 srcShape, subgroupSize);
864template <
typename RankedTy>
867 std::optional<unsigned> packingSize = std::nullopt,
bool vnni =
false) {
869 assert(((ty.getRank() == 1 && !vnni) || ty.getRank() == 2) &&
870 "Expected 1D non-vnni or 2D vector.");
872 assert(ty.getElementType().isIntOrFloat() &&
873 "Expected int or float element type.");
875 auto context = ty.getContext();
876 auto rank = ty.getRank();
879 if (packingSize.has_value()) {
880 unsigned bitwidth = ty.getElementType().getIntOrFloatBitWidth();
881 int &laneDataPos = vnni ? laneData[rank - 2] : laneData.back();
882 laneDataPos = bitwidth < *packingSize ? *packingSize / bitwidth : 1;
885 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
900 for (
int sgLayout0 = 1; sgLayout0 <= sgCount; ++sgLayout0) {
901 if (sgCount % sgLayout0)
903 int64_t sgLayout1 = sgCount / sgLayout0;
904 int64_t sgData0 = wgShape[0] / sgLayout0;
905 int64_t sgData1 = wgShape[1] / sgLayout1;
906 if ((wgShape[0] % sgLayout0 || wgShape[1] % sgLayout1) ||
907 (sgData0 % instData[0] || sgData1 % instData[1]))
909 candidates.emplace_back(sgLayout0, sgLayout1);
916 int diffLhs = std::abs(
lhs.first -
lhs.second);
917 int diffRhs = std::abs(
rhs.first -
rhs.second);
918 if (diffLhs != diffRhs)
919 return diffLhs < diffRhs;
920 return lhs.first <
rhs.first;
929 std::tuple<xegpu::DistributeLayoutAttr, xegpu::DistributeLayoutAttr,
930 xegpu::DistributeLayoutAttr>>
932 VectorType bTy, VectorType cdTy,
933 xegpu::DistributeLayoutAttr consumerLayout,
935 auto context = aTy.getContext();
936 const auto *uArchInstruction =
940 auto getInstDataVectors = [&]()
944 const unsigned dataALen = aTy.getShape().front();
945 auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
949 const unsigned dataBLen = bTy.getShape().back();
950 auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType());
954 auto supportedCLen = uArchInstruction->getSupportedN(cdTy.getElementType());
957 if (maxALen == -1 || maxBLen == -1 || maxCLen == -1)
961 instDataA[aTy.getRank() - 2] = maxALen;
962 instDataA[aTy.getRank() - 1] = subgroupSize;
964 instDataB[bTy.getRank() - 2] = subgroupSize;
965 instDataB[bTy.getRank() - 1] = maxBLen;
967 instDataCD[cdTy.getRank() - 2] = maxALen;
968 instDataCD[cdTy.getRank() - 1] = maxCLen;
969 return std::make_tuple(instDataA, instDataB, instDataCD);
974 "Number of subgroups must be provided for sg layout creation.");
975 auto instDataVecs = getInstDataVectors();
978 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
979 assert(instDataA.size() == 2 && instDataB.size() == 2 &&
980 instDataCD.size() == 2 &&
981 "Sg layout creation expects valid 2D inst data");
983 std::optional<LayoutRepresentation> consumerSgLayout = std::nullopt;
984 if (consumerLayout && consumerLayout.isForWorkgroup()) {
986 consumerLayout.getEffectiveSgLayoutAsInt();
987 consumerSgLayout = std::make_pair(sgLayoutD[0], sgLayoutD[1]);
995 if (layoutsA.empty() || layoutsB.empty() || layoutsCD.empty())
1004 std::optional<LayoutRepresentation> bestPick;
1005 for (
auto &sgLayout : layoutsB) {
1006 if (setA.contains(sgLayout) && setCD.contains(sgLayout)) {
1008 if (consumerSgLayout.has_value() && sgLayout == *consumerSgLayout) {
1009 bestPick = sgLayout;
1017 bestPick = sgLayout;
1023 return std::nullopt;
1025 static_cast<int>(bestPick->second)};
1027 static_cast<int>(aTy.getShape()[0] / sgLayout[0]),
1028 static_cast<int>(aTy.getShape()[1] / sgLayout[1])};
1030 static_cast<int>(bTy.getShape()[0] / sgLayout[0]),
1031 static_cast<int>(bTy.getShape()[1] / sgLayout[1])};
1033 static_cast<int>(cdTy.getShape()[0] / sgLayout[0]),
1034 static_cast<int>(cdTy.getShape()[1] / sgLayout[1])};
1036 auto dpasALayout = xegpu::LayoutAttr::get(
1042 auto dpasBLayout = xegpu::LayoutAttr::get(
1048 auto dpasCDLayout = xegpu::LayoutAttr::get(
1053 return std::make_tuple(dpasALayout, dpasBLayout, dpasCDLayout);
1055 auto instDataVecs = getInstDataVectors();
1057 return std::nullopt;
1058 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
1059 return std::make_tuple(
1060 xegpu::LayoutAttr::get(
1062 xegpu::LayoutAttr::get(
1064 xegpu::LayoutAttr::get(
1068 aTy,
uArch, uArchInstruction->getPackedFormatBitSizeA());
1070 bTy,
uArch, uArchInstruction->getPackedFormatBitSizeB(),
true);
1073 return std::make_tuple(aLayout, bLayout, cdLayout);
1075 return std::nullopt;
1081 xegpu::DistributeLayoutAttr resLayout;
1086 if (
auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
1088 return xegpu::DistributeLayoutAttr();
1089 auto srcTy = dyn_cast<VectorType>(
broadcast.getSourceType());
1091 return xegpu::DistributeLayoutAttr();
1093 resLayout,
broadcast.getResultVectorType().getShape(),
1100 if (
auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
1102 return xegpu::DistributeLayoutAttr();
1113 if (
auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
1115 return xegpu::DistributeLayoutAttr();
1116 int resElemBitWidth =
1117 bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
1118 int srcElemBitWidth =
1119 bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
1126 if (
auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
1128 return xegpu::DistributeLayoutAttr();
1130 resLayout, shapeCast.getResultVectorType().getShape(),
1131 shapeCast.getSourceVectorType().getShape());
1136 if (
auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
1138 return xegpu::DistributeLayoutAttr();
1141 resLayout, insertSlice.getDestVectorType().getShape(),
1142 insertSlice.getSourceVectorType().getShape());
1149 if (
auto transpose = dyn_cast<vector::TransposeOp>(op)) {
1151 return xegpu::DistributeLayoutAttr();
1153 transpose.getPermutation());
1160 return xegpu::DistributeLayoutAttr();
static Value broadcast(Location loc, Value toBroadcast, unsigned numElements, const TypeConverter &typeConverter, ConversionPatternRewriter &rewriter)
Broadcasts the value to a vector with numElements elements.
std::pair< int64_t, int64_t > LayoutRepresentation
static xegpu::DistributeLayoutAttr setupGenericStoreAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, bool isChunkedStore, int maxChunkSize, ArrayRef< int64_t > srcShape, int subgroupSize)
Sets up the anchor layout for store scatter and store matrix operation.
static SmallVector< LayoutRepresentation > getValidLayouts(ArrayRef< int64_t > wgShape, ArrayRef< int64_t > instData, int64_t sgCount)
static xegpu::LayoutAttr getDefaultLaneLayout2DBlockIo(RankedTy ty, const xegpu::uArch::uArch *uArch, std::optional< unsigned > packingSize=std::nullopt, bool vnni=false)
static xegpu::DistributeLayoutAttr setupGenericLoadAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, xegpu::DistributeLayoutAttr consumerLayout, bool isChunkedLoad, int maxChunkSize, ArrayRef< int64_t > resShape, int subgroupSize)
Sets up the anchor layout for load gather and load matrix operation.
IRValueT get() const
Return the current value being used by this operand.
MLIRContext is the top-level object for a collection of MLIR operations.
This class represents an operand of an operation.
unsigned getOperandNumber()
Return which operand this is in the OpOperand list of the Operation.
This is a value defined by a result of an operation.
Operation is the basic unit of execution within MLIR.
bool hasAttrOfType(NameT &&name)
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
MutableArrayRef< OpOperand > getOpOperands()
OperationName getName()
The name of an operation is the key identifier for it.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getOpResults()
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
unsigned getNumResults()
Return the number of results held by this operation.
Type getType() const
Return the type of this value.
static WalkResult advance()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int64_t > content)
Operation * getOwner() const
Return the owner of this operand.
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
@ SubgroupMatrixMultiplyAcc
DistributeLayoutAttr inferShapeCastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a shape cast operation given the result layout attribute,...
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > permutation)
Infers the source layout attribute for a transpose operation given the result layout attribute and pe...
SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, DistributeLayoutAttr consumerLayout, SmallVector< int64_t > reductionDims, const uArch::uArch *uArch)
Sets up layout for reduction operations by creating a SliceAttr for the result.
DistributeLayoutAttr inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for an insert strided slice operation given the result layout attr...
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
std::optional< std::tuple< DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr > > setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch, int numSg)
Sets up the anchor layouts for the dpas operands (A, B, and C/D).
LayoutKind
Specifies the level of a layout hierarchy for comparison or propagation.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult; users should use setAnchorLayout...
SmallVector< NamedAttribute > dropInstDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping inst-data information from any DistributeLayoutAttr f...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
DistributeLayoutAttr setupLoadMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for load matrix operation.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
DistributeLayoutAttr inferBroadcastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a broadcast operation given the result layout attribute,...
DistributeLayoutAttr setupStoreScatterAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, const uArch::uArch *uArch)
Sets up the anchor layout for a store scatter operation.
void recoverTemporaryLayoutsDeprecated(Operation *op)
[to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and OpResult of the given opera...
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
DistributeLayoutAttr setupBitCastResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Setup the result layout attribute for a bitcast operation based on element type bitwidths.
void removeLayoutAttr(const T &operandOrResult)
Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
SmallVector< NamedAttribute > dropSgLayoutAndDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping sg-layout and sg-data information from any Distribute...
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
DistributeLayoutAttr inferBitCastSourceLayout(DistributeLayoutAttr resLayout, int resElemTyBitWidth, int srcElemTyBitWidth)
Infers the source layout attribute for a bitcast operation given the result layout attribute,...
DistributeLayoutAttr setupInsertStridedSliceResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the result layout for an insert strided slice operation.
xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand)
Gets the expected layout for a given consumer operand.
void removeLayoutAttrs(Operation *op)
Removes the DistributeLayoutAttr for each OpOperand and OpResult of the given operation if they exist...
DistributeLayoutAttr inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout, SmallVector< int64_t > reduceDims)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
DistributeLayoutAttr setupLoadGatherAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for a load gather operation.
DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, const uArch::uArch *uArch)
Sets up the anchor layout for a store matrix operation.
Include the generated interface declarations.
detail::DenseArrayAttrImpl< int32_t > DenseI32ArrayAttr
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
uArch(StringRef name, StringRef description, llvm::ArrayRef< const Instruction * > instructionRegistry)
const Instruction * getInstruction(InstructionKind instKind) const