27#include "llvm/Support/FormatVariadic.h"
50 out.reserve(attrs.size());
52 for (
auto attr : attrs) {
53 if (
auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
54 auto newLayout = dist.dropSgLayoutAndData();
56 out.emplace_back(attr.getName(), newLayout);
68 out.reserve(attrs.size());
70 for (
auto attr : attrs) {
71 if (
auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
72 auto newLayout = dist.dropInstData();
74 out.emplace_back(attr.getName(), newLayout);
90 if (!isa<VectorType>(operand.get().getType()))
94 if (isa<BlockArgument>(operand.get()))
98 op->
emitWarning(
"Could not find layout attribute for operand ")
99 << operand.getOperandNumber() <<
" of operation " << op->
getName();
106 return !
result.wasInterrupted();
109template <
typename T,
typename>
111 Operation *owner = operandOrResult.getOwner();
129 for (
auto namedAttr : nestOp->
getAttrs()) {
130 if (isa<DistributeLayoutAttr>(namedAttr.getValue()))
131 attrsToRemove.push_back(namedAttr.getName());
133 for (
auto attrName : attrsToRemove)
140xegpu::DistributeLayoutAttr
146 size_t dimDiff = resShape.size() - srcShape.size();
147 auto bcastSourceLayout = resLayout;
148 for (
size_t i = dimDiff; i < resShape.size(); i++) {
149 if ((srcShape[i - dimDiff] == 1) && (resShape[i] != 1))
150 bcastDims.push_back(i);
155 if (!bcastDims.empty())
156 bcastSourceLayout = bcastSourceLayout.setUnitDimData(bcastDims);
160 for (
size_t i = 0; i < dimDiff; i++)
161 sliceDims.push_back(i);
162 bcastSourceLayout = xegpu::SliceAttr::get(
163 resLayout.getContext(), bcastSourceLayout,
166 return bcastSourceLayout;
171xegpu::DistributeLayoutAttr
175 assert(isa<xegpu::SliceAttr>(resLayout) &&
176 "reduction result layout must be slice layout");
178 xegpu::SliceAttr sliceLayout = dyn_cast<xegpu::SliceAttr>(resLayout);
180 assert((reduceDims == sliceLayout.getDims().asArrayRef()) &&
181 "reduction dims must match with slice dims");
183 return sliceLayout.getParent();
186xegpu::DistributeLayoutAttr
193xegpu::DistributeLayoutAttr
196 return resLayout.transposeDims(permutation);
202xegpu::DistributeLayoutAttr
204 int resElemTyBitWidth,
int srcElemTyBitWidth) {
209 size_t sgDataSize = sgData.size();
210 size_t instDataSize = instData.size();
211 size_t laneDataSize = laneData.size();
215 int64_t dim = resLayout.getRank() - 1;
217 if (srcElemTyBitWidth <= resElemTyBitWidth) {
218 int bitWidthRatio = resElemTyBitWidth / srcElemTyBitWidth;
220 sgDataValue = sgData.back() * bitWidthRatio;
222 instDataValue = instData.back() * bitWidthRatio;
224 laneDataValue = laneData.back() * bitWidthRatio;
226 int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
228 assert((sgData.back() % bitWidthRatio) == 0 &&
229 "sgData not divisible by bitWidthRatio");
230 sgDataValue = sgData.back() / bitWidthRatio;
233 assert((instData.back() % bitWidthRatio) == 0 &&
234 "instData not divisible by bitWidthRatio");
235 instDataValue = instData.back() / bitWidthRatio;
238 assert((laneData.back() % bitWidthRatio) == 0 &&
239 "laneData not divisible by bitWidthRatio");
240 laneDataValue = laneData.back() / bitWidthRatio;
244 xegpu::DistributeLayoutAttr finalSrcLayout;
246 resLayout.setDimData(dim, sgDataValue, instDataValue, laneDataValue);
248 return finalSrcLayout;
258 int srcShapeSize = srcShape.size();
259 int resShapeSize = resShape.size();
260 int dimDiff = resShapeSize - srcShapeSize;
265 auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
266 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
267 for (
int i = 0; i < dimDiff; i++) {
268 assert((resSgLayout.size() == 0 || resSgLayout[i] == 1) &&
269 (resLaneLayout.size() == 0 || resLaneLayout[i] == 1) &&
270 "Leading dimensions being sliced off must not be distributed");
272 return resLayout.dropDims(llvm::to_vector(llvm::seq<int64_t>(0, dimDiff)));
279xegpu::DistributeLayoutAttr
303 xegpu::SliceAttr::get(resLayout.getContext(), resLayout, sliceDimsAttr);
310 auto srcLayout = resLayout;
311 for (
const auto &dimGroup : splitDimGroups)
312 srcLayout = srcLayout.collapseDims(dimGroup);
321 if ((dst.size() != 2) && (dst.size() != 1))
323 int64_t srcSize = std::accumulate(src.begin(), src.end(), 1LL,
324 std::multiplies<int64_t>());
326 return (dst[0] == srcSize);
327 return (dst[0] == 1) && (dst[1] == srcSize);
330 if (matchCollapseToInnermostDim(srcShape, resShape)) {
331 int srcShapeSize = srcShape.size();
332 int resShapeSize = resShape.size();
333 auto context = resLayout.getContext();
334 auto resInstData = resLayout.getEffectiveInstDataAsInt();
335 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
336 auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
352 if (resInstData.size() != 0) {
354 for (
int i = 0; i < resShapeSize - 1; i++) {
355 assert(resInstData[i] == 1 &&
356 "only innermost dim can have non-unit instData");
359 inferredInstData[srcShapeSize - 1] =
360 std::min(resInstData[resShapeSize - 1], srcShape[srcShapeSize - 1]);
361 return xegpu::LayoutAttr::get(context, inferredInstData);
364 if (resLaneLayout.size() != 0) {
365 for (
int i = 0; i < resShapeSize - 1; i++) {
366 assert(resLaneData[i] == 1 &&
367 "only innermost dim can have non-unit instData");
369 assert(srcShape.back() % resLaneLayout.back() == 0 &&
370 "source innermost dim must be >= result lane layout");
373 inferredLaneLayout.back() = resLaneLayout.back();
374 inferredLaneData.back() = std::min(
375 resLaneData.back(), srcShape.back() / inferredLaneLayout.back());
376 return xegpu::LayoutAttr::get(context, inferredLaneLayout,
380 llvm_unreachable(
"running into unsupported shape cast scenarios");
387 xegpu::DistributeLayoutAttr payloadLayout,
int chunkSize) {
388 auto rank = payloadLayout.getRank();
390 return payloadLayout.dropDims(
391 llvm::to_vector(llvm::seq<int64_t>(rank - 1, rank)));
392 return payloadLayout;
461 auto srcShape = srcVecTy.getShape();
462 int srcRank = srcShape.size();
463 auto context = srcVecTy.getContext();
471 const int subgroupSize =
uArch->getSubgroupSize();
472 int64_t maxReduceVectorSize = 1;
473 xegpu::DistributeLayoutAttr srcLayout;
475 xegpu::SliceAttr consumerSliceLayout =
476 dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
477 if (consumerSliceLayout &&
478 consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
479 srcLayout = consumerSliceLayout.getParent();
481 srcLayout.getEffectiveSgLayoutAsInt();
484 for (
int dim = 0; dim < srcRank; dim++) {
485 if (llvm::is_contained(reductionDims, dim))
487 srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
491 consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
494 consumerLayout ? consumerLayout.getEffectiveSgDataAsInt()
497 consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
500 consumerLayout ? consumerLayout.getOrder() :
nullptr;
502 int remainingSgCount =
503 consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
507 for (
int i = 0; i < srcRank; i++) {
508 if (!llvm::is_contained(reductionDims, i) &&
509 consumerIdx <
static_cast<int>(consumerSgLayout.size())) {
510 sgLayout[i] = consumerSgLayout[consumerIdx];
511 sgData[i] = consumerSgData[consumerIdx];
512 remainingSgCount /= sgLayout[i];
513 order[i] = consumerOrder[consumerIdx];
520 int64_t remainOrder = consumerSgLayout.size();
521 for (
int i = 0; i < srcRank; i++) {
522 if (llvm::is_contained(reductionDims, i)) {
524 std::min(srcShape[i],
static_cast<int64_t>(remainingSgCount));
525 assert((srcShape[i] % sgLayout[i] == 0) &&
526 "source shape not divisible by sg_layout");
527 sgData[i] = srcShape[i] / sgLayout[i];
528 remainingSgCount /= sgLayout[i];
529 order[i] = remainOrder++;
533 assert(remainingSgCount == 1 &&
"not all subgroups distributed");
534 srcLayout = xegpu::LayoutAttr::get(
535 context, toInt32Attr(sgLayout), toInt32Attr(sgData),
538 (!orderAttr || orderAttr.empty()) ?
nullptr : toInt32Attr(order));
544 instData[srcRank - 2] =
545 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
546 instData[srcRank - 1] =
547 std::min(
static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
548 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
552 laneLayout[srcRank - 1] =
553 std::min(
static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
555 laneData[srcRank - 2] =
556 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
557 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
558 toInt32Attr(laneData));
561 return xegpu::SliceAttr::get(context, srcLayout,
572 auto srcShape = srcVecTy.getShape();
573 auto context = srcVecTy.getContext();
574 auto subgroupSize =
uArch->getSubgroupSize();
575 xegpu::LayoutAttr srcLayout;
578 assert(
true &&
"subgroup layout assignment not supported for reduction (op "
579 "is not expected at this level).");
581 assert(
true &&
"instData layout assignment not supported for reduction (op "
582 "is not expected at this level).");
585 laneLayout[0] = std::min(subgroupSize,
static_cast<int32_t
>(srcShape[0]));
587 srcLayout = xegpu::LayoutAttr::get(
592 auto result = xegpu::SliceAttr::get(context, srcLayout,
623 int srcElemTyBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
624 int resElemTyBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
630 assert(consumerLayout.getRank() ==
static_cast<int64_t>(srcShape.size()) &&
631 "laneData must be available for all dimensions");
632 size_t dim = srcShape.size() - 1;
636 const int subgroupSize =
uArch->getSubgroupSize();
638 if (srcElemTyBitWidth > resElemTyBitWidth) {
642 int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
643 int innermostDimLaneLayout = subgroupSize;
645 sgDataValue = sgData[dim];
647 instDataValue = instData[dim];
650 while ((instDataValue <= srcShape[dim]) &&
651 (instDataValue % (innermostDimLaneLayout * bitWidthRatio) != 0))
653 assert((srcShape[dim] % instDataValue) == 0 &&
654 "srcShape, instData, and lanelayout for innermost must be 2^n !");
656 laneDataValue = laneData[dim];
657 while ((laneDataValue <= srcShape[dim]) &&
658 (laneDataValue % bitWidthRatio != 0))
662 xegpu::DistributeLayoutAttr resLayout;
663 resLayout = consumerLayout.setDimData(dim, sgDataValue, instDataValue,
667 return consumerLayout;
675 VectorType resVectorTy, xegpu::DistributeLayoutAttr consumerLayout,
678 xegpu::DistributeLayoutAttr requiredResLayout;
680 consumerLayout.getEffectiveInstDataAsInt();
682 consumerLayout.getEffectiveLaneDataAsInt();
684 consumerLayout.getEffectiveLaneLayoutAsInt();
689 requiredResLayout = consumerLayout;
690 int srcRank = srcShape.size();
694 "subgroup layout assignment not supported for insertStridedSlice.");
696 for (
int dim = 0; dim < srcRank; dim++) {
697 instDataValue = std::min(srcShape[dim], consumerInstData[dim]);
699 requiredResLayout.setDimData(dim, -1, instDataValue, -1);
702 for (
int dim = 0; dim < srcRank; dim++) {
703 assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
704 "srcShape must be divisible by laneLayout for all dimensions");
705 laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
706 consumerLaneData[dim]);
708 requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
711 return requiredResLayout;
728 xegpu::DistributeLayoutAttr consumerLayout,
bool isChunkedLoad,
732 return consumerLayout;
735 consumerLayout.getEffectiveInstDataAsInt();
737 consumerLayout.getEffectiveLaneDataAsInt();
743 if (!isChunkedLoad) {
745 instData.back() = std::min(
static_cast<int>(consumerInstData.back()),
746 maxChunkSize * subgroupSize);
747 return xegpu::LayoutAttr::get(context, instData);
750 std::min(
static_cast<int>(consumerLaneData.back()), maxChunkSize);
751 laneLayout.back() = std::min(
static_cast<int64_t>(subgroupSize),
752 resShape.back() / laneData.back());
753 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
756 assert(resShape.size() == 2 &&
"Chunked Store must access 2D tensor tile.");
758 instData[0] = subgroupSize;
760 std::min(
static_cast<int>(consumerInstData[1]), maxChunkSize);
761 return xegpu::LayoutAttr::get(context, instData);
763 laneLayout[0] = subgroupSize;
765 std::min(
static_cast<int>(consumerLaneData[1]), maxChunkSize);
766 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
779 auto context = resVecTy.getContext();
780 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
782 const auto *uArchInstruction =
783 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
785 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
788 (chunkSize > 1), maxChunkSize, resShape,
794xegpu::DistributeLayoutAttr
797 xegpu::DistributeLayoutAttr consumerLayout,
802 auto context = resVecTy.getContext();
803 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
805 const auto *uArchInstruction =
806 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
808 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
810 false, maxChunkSize, resShape,
825static xegpu::DistributeLayoutAttr
831 int srcShapeSize = srcShape.size();
838 "subgroup layout assignment not supported for storeScatter.");
842 if (!isChunkedStore) {
844 instData[srcShapeSize - 1] =
845 std::min(subgroupSize,
static_cast<int>(srcShape.back()));
846 return xegpu::LayoutAttr::get(context, instData);
848 laneLayout[srcShapeSize - 1] =
849 std::min(subgroupSize,
static_cast<int>(srcShape.back()));
850 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
853 assert(srcShapeSize == 2 &&
"Chunked Store must access 2D tensor tile.");
855 instData[0] = subgroupSize;
856 instData[1] = std::min(
static_cast<int>(srcShape[1]), maxChunkSize);
857 return xegpu::LayoutAttr::get(context, instData);
859 laneLayout[0] = subgroupSize;
860 laneData[1] = std::min(
static_cast<int>(srcShape[1]), maxChunkSize);
861 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
868xegpu::DistributeLayoutAttr
870 VectorType srcVecTy,
int chunkSize,
873 const int subgroupSize =
uArch->getSubgroupSize();
875 auto context = srcVecTy.getContext();
876 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
878 const auto *uArchInstruction =
879 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
881 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
883 maxChunkSize, srcShape, subgroupSize);
887xegpu::DistributeLayoutAttr
892 const int subgroupSize =
uArch->getSubgroupSize();
894 auto context = srcVecTy.getContext();
895 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
897 const auto *uArchInstruction =
898 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
900 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
903 srcShape, subgroupSize);
911template <
typename RankedTy>
914 std::optional<unsigned> packingSize = std::nullopt,
bool vnni =
false) {
916 assert(((ty.getRank() == 1 && !vnni) || ty.getRank() == 2) &&
917 "Expected 1D non-vnni or 2D vector.");
919 assert(ty.getElementType().isIntOrFloat() &&
920 "Expected int or float element type.");
922 auto context = ty.getContext();
923 auto rank = ty.getRank();
926 if (packingSize.has_value()) {
927 unsigned bitwidth = ty.getElementType().getIntOrFloatBitWidth();
928 int &laneDataPos = vnni ? laneData[rank - 2] : laneData.back();
929 laneDataPos = bitwidth < *packingSize ? *packingSize / bitwidth : 1;
932 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
947 for (
int sgLayout0 = 1; sgLayout0 <= sgCount; ++sgLayout0) {
948 if (sgCount % sgLayout0)
950 int64_t sgLayout1 = sgCount / sgLayout0;
951 int64_t sgData0 = wgShape[0] / sgLayout0;
952 int64_t sgData1 = wgShape[1] / sgLayout1;
953 if ((wgShape[0] % sgLayout0 || wgShape[1] % sgLayout1) ||
954 (sgData0 % instData[0] || sgData1 % instData[1]))
956 candidates.emplace_back(sgLayout0, sgLayout1);
963 int diffLhs = std::abs(
lhs.first -
lhs.second);
964 int diffRhs = std::abs(
rhs.first -
rhs.second);
965 if (diffLhs != diffRhs)
966 return diffLhs < diffRhs;
967 return lhs.first <
rhs.first;
976 std::tuple<xegpu::DistributeLayoutAttr, xegpu::DistributeLayoutAttr,
977 xegpu::DistributeLayoutAttr>>
979 VectorType bTy, VectorType cdTy,
980 xegpu::DistributeLayoutAttr consumerLayout,
int numSg,
982 auto context = aTy.getContext();
983 const auto *uArchInstruction =
987 auto getInstDataVectors = [&]()
991 const unsigned dataALen = aTy.getShape().front();
992 auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
996 const unsigned dataBLen = bTy.getShape().back();
997 auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType());
1001 auto supportedCLen = uArchInstruction->getSupportedN(cdTy.getElementType());
1004 if (maxALen == -1 || maxBLen == -1 || maxCLen == -1)
1005 return std::nullopt;
1008 instDataA[aTy.getRank() - 2] = maxALen;
1009 instDataA[aTy.getRank() - 1] = subgroupSize;
1011 instDataB[bTy.getRank() - 2] = subgroupSize;
1012 instDataB[bTy.getRank() - 1] = maxBLen;
1014 instDataCD[cdTy.getRank() - 2] = maxALen;
1015 instDataCD[cdTy.getRank() - 1] = maxCLen;
1016 return std::make_tuple(instDataA, instDataB, instDataCD);
1021 "Number of subgroups must be provided for sg layout creation.");
1022 auto instDataVecs = getInstDataVectors();
1024 return std::nullopt;
1025 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
1026 assert(instDataA.size() == 2 && instDataB.size() == 2 &&
1027 instDataCD.size() == 2 &&
1028 "Sg layout creation expects valid 2D inst data");
1030 std::optional<LayoutRepresentation> consumerSgLayout = std::nullopt;
1031 if (consumerLayout && consumerLayout.isForWorkgroup()) {
1033 consumerLayout.getEffectiveSgLayoutAsInt();
1034 consumerSgLayout = std::make_pair(sgLayoutD[0], sgLayoutD[1]);
1042 if (layoutsA.empty() || layoutsB.empty() || layoutsCD.empty())
1043 return std::nullopt;
1051 std::optional<LayoutRepresentation> bestPick;
1052 for (
auto &sgLayout : layoutsB) {
1053 if (setA.contains(sgLayout) && setCD.contains(sgLayout)) {
1055 if (consumerSgLayout.has_value() && sgLayout == *consumerSgLayout) {
1056 bestPick = sgLayout;
1064 bestPick = sgLayout;
1070 return std::nullopt;
1072 static_cast<int>(bestPick->second)};
1074 static_cast<int>(aTy.getShape()[0] / sgLayout[0]),
1075 static_cast<int>(aTy.getShape()[1] / sgLayout[1])};
1077 static_cast<int>(bTy.getShape()[0] / sgLayout[0]),
1078 static_cast<int>(bTy.getShape()[1] / sgLayout[1])};
1080 static_cast<int>(cdTy.getShape()[0] / sgLayout[0]),
1081 static_cast<int>(cdTy.getShape()[1] / sgLayout[1])};
1083 auto dpasALayout = xegpu::LayoutAttr::get(
1089 auto dpasBLayout = xegpu::LayoutAttr::get(
1095 auto dpasCDLayout = xegpu::LayoutAttr::get(
1100 return std::make_tuple(dpasALayout, dpasBLayout, dpasCDLayout);
1102 auto instDataVecs = getInstDataVectors();
1104 return std::nullopt;
1105 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
1106 return std::make_tuple(
1107 xegpu::LayoutAttr::get(
1109 xegpu::LayoutAttr::get(
1111 xegpu::LayoutAttr::get(
1115 aTy,
uArch, uArchInstruction->getPackedFormatBitSizeA());
1117 bTy,
uArch, uArchInstruction->getPackedFormatBitSizeB(),
true);
1120 return std::make_tuple(aLayout, bLayout, cdLayout);
1122 return std::nullopt;
1128 xegpu::DistributeLayoutAttr resLayout;
1133 if (
auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
1135 return xegpu::DistributeLayoutAttr();
1136 auto srcTy = dyn_cast<VectorType>(
broadcast.getSourceType());
1138 return xegpu::DistributeLayoutAttr();
1140 resLayout,
broadcast.getResultVectorType().getShape(),
1147 if (
auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
1149 return xegpu::DistributeLayoutAttr();
1158 if (
auto reduction = dyn_cast<vector::ReductionOp>(op)) {
1160 return xegpu::DistributeLayoutAttr();
1166 if (
auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
1168 return xegpu::DistributeLayoutAttr();
1169 int resElemBitWidth =
1170 bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
1171 int srcElemBitWidth =
1172 bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
1179 if (
auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
1181 return xegpu::DistributeLayoutAttr();
1183 resLayout, shapeCast.getResultVectorType().getShape(),
1184 shapeCast.getSourceVectorType().getShape());
1189 if (
auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
1191 return xegpu::DistributeLayoutAttr();
1194 resLayout, insertSlice.getDestVectorType().getShape(),
1195 insertSlice.getSourceVectorType().getShape());
1202 if (
auto transpose = dyn_cast<vector::TransposeOp>(op)) {
1204 return xegpu::DistributeLayoutAttr();
1206 transpose.getPermutation());
1213 return xegpu::DistributeLayoutAttr();
static Value broadcast(Location loc, Value toBroadcast, unsigned numElements, const TypeConverter &typeConverter, ConversionPatternRewriter &rewriter)
Broadcasts the value to a vector with numElements elements.
std::pair< int64_t, int64_t > LayoutRepresentation
static xegpu::DistributeLayoutAttr setupGenericStoreAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, bool isChunkedStore, int maxChunkSize, ArrayRef< int64_t > srcShape, int subgroupSize)
Sets up the anchor layout for store scatter and store matrix operation.
static SmallVector< LayoutRepresentation > getValidLayouts(ArrayRef< int64_t > wgShape, ArrayRef< int64_t > instData, int64_t sgCount)
static xegpu::LayoutAttr getDefaultLaneLayout2DBlockIo(RankedTy ty, const xegpu::uArch::uArch *uArch, std::optional< unsigned > packingSize=std::nullopt, bool vnni=false)
static xegpu::DistributeLayoutAttr setupGenericLoadAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, xegpu::DistributeLayoutAttr consumerLayout, bool isChunkedLoad, int maxChunkSize, ArrayRef< int64_t > resShape, int subgroupSize)
Sets up the anchor layout for load gather and load matrix operation.
IRValueT get() const
Return the current value being used by this operand.
MLIRContext is the top-level object for a collection of MLIR operations.
This class represents an operand of an operation.
unsigned getOperandNumber() const
Return which operand this is in the OpOperand list of the Operation.
This is a value defined by a result of an operation.
Operation is the basic unit of execution within MLIR.
bool hasAttrOfType(NameT &&name)
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
MutableArrayRef< OpOperand > getOpOperands()
OperationName getName()
The name of an operation is the key identifier for it.
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getOpResults()
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
unsigned getNumResults()
Return the number of results held by this operation.
static WalkResult advance()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int64_t > content)
Operation * getOwner() const
Return the owner of this operand.
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
@ SubgroupMatrixMultiplyAcc
DistributeLayoutAttr inferShapeCastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a shape cast operation given the result layout attribute,...
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > permutation)
Infers the source layout attribute for a transpose operation given the result layout attribute and pe...
DistributeLayoutAttr inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for an insert strided slice operation given the result layout attr...
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
LayoutKind
Specifies the level of a layout hierarchy for comparison or propagation.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult; users should use setAnchorLayout...
SmallVector< NamedAttribute > dropInstDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping inst-data information from any DistributeLayoutAttr f...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
DistributeLayoutAttr setupLoadMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for load matrix operation.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
DistributeLayoutAttr inferBroadcastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a broadcast operation given the result layout attribute,...
DistributeLayoutAttr setupStoreScatterAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, const uArch::uArch *uArch)
Sets up the anchor layout for a store scatter operation.
void recoverTemporaryLayoutsDeprecated(Operation *op)
[to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and OpResult of the given opera...
SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, DistributeLayoutAttr consumerLayout, SmallVector< int64_t > reductionDims, int numSg, const uArch::uArch *uArch)
Sets up layout for Multi-Reduction operations by creating a SliceAttr for the result.
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
DistributeLayoutAttr setupBitCastResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Setup the result layout attribute for a bitcast operation based on element type bitwidths.
void removeLayoutAttr(const T &operandOrResult)
Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
DistributeLayoutAttr inferMaskOffsetLayoutForScatterIO(DistributeLayoutAttr payloadLayout, int chunkSize)
Infers the layout attribute for mask and offset operand for Chunked load and store,...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
SmallVector< NamedAttribute > dropSgLayoutAndDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping sg-layout and sg-data information from any Distribute...
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
DistributeLayoutAttr inferBitCastSourceLayout(DistributeLayoutAttr resLayout, int resElemTyBitWidth, int srcElemTyBitWidth)
Infers the source layout attribute for a bitcast operation given the result layout attribute,...
DistributeLayoutAttr setupInsertStridedSliceResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the result layout for an insert strided slice operation.
DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand)
Gets the expected layout for a given consumer operand.
void removeLayoutAttrs(Operation *op)
Removes the DistributeLayoutAttr for each OpOperand and OpResult of the given operation if they exist...
DistributeLayoutAttr inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout, SmallVector< int64_t > reduceDims)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
DistributeLayoutAttr setupLoadGatherAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for a load gather operation.
std::optional< std::tuple< DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr > > setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg, const uArch::uArch *uArch)
Sets up the anchor layouts for the operands of a dpas operation (A, B, and C/D).
SliceAttr setupReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, const uArch::uArch *uArch)
Sets up layout for Reduction operations by creating a SliceAttr for the result.
DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, const uArch::uArch *uArch)
Sets up the anchor layout for a store matrix operation.
Include the generated interface declarations.
detail::DenseArrayAttrImpl< int32_t > DenseI32ArrayAttr
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
uArch(StringRef name, StringRef description, llvm::ArrayRef< const Instruction * > instructionRegistry)
const Instruction * getInstruction(InstructionKind instKind) const