MLIR 23.0.0git
XeGPULayoutImpl.cpp
Go to the documentation of this file.
1//===---- XeGPULayoutImpl.cpp - MLIR Utilities for XeGPUOps
2//------------------===//
3//
4// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements layout utility functions for XeGPU dialect
11// transformation.
12//
13//===----------------------------------------------------------------------===//
14
22#include "mlir/IR/Builders.h"
23#include "mlir/IR/Operation.h"
24#include "mlir/IR/ValueRange.h"
27#include "llvm/Support/FormatVariadic.h"
28#include <cstdint>
29#include <numeric>
30
31using namespace mlir;
32
34 op->walk([&](Operation *nestOp) {
35 for (OpOperand &opr : nestOp->getOpOperands()) {
36 auto layout = getDistributeLayoutAttr(opr.get());
37 setDistributeLayoutAttr(opr, layout);
38 }
39
40 for (OpResult result : nestOp->getOpResults()) {
41 auto layout = getDistributeLayoutAttr(result);
43 }
44 });
45}
46
50 out.reserve(attrs.size());
51
52 for (auto attr : attrs) {
53 if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
54 auto newLayout = dist.dropSgLayoutAndData();
55 if (newLayout)
56 out.emplace_back(attr.getName(), newLayout);
57 } else {
58 out.push_back(attr);
59 }
60 }
61
62 return out;
63}
64
68 out.reserve(attrs.size());
69
70 for (auto attr : attrs) {
71 if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
72 auto newLayout = dist.dropInstData();
73 if (newLayout)
74 out.emplace_back(attr.getName(), newLayout);
75 } else {
76 out.push_back(attr);
77 }
78 }
79
80 return out;
81}
82
83// Attach layout attributes to all vector-type operands of operations within
84// the given operation's region. Reports an error if any vector operand lacks
85// a layout attribute.
87 auto result = rootOp->walk([&](Operation *op) {
88 for (OpOperand &operand : op->getOpOperands()) {
89 // Layouts are needed for vector type only.
90 if (!isa<VectorType>(operand.get().getType()))
91 continue;
92 // Skip block arguments since they don't have defining ops to attach
93 // layout attributes to.
94 if (isa<BlockArgument>(operand.get()))
95 continue;
96 auto layout = xegpu::getDistributeLayoutAttr(operand.get());
97 if (!layout) {
98 op->emitWarning("Could not find layout attribute for operand ")
99 << operand.getOperandNumber() << " of operation " << op->getName();
100 continue;
101 }
102 xegpu::setTemporaryLayout(operand, layout);
103 }
104 return WalkResult::advance();
105 });
106 return !result.wasInterrupted();
107}
108
109template <typename T, typename>
110void xegpu::removeLayoutAttr(const T &operandOrResult) {
111 Operation *owner = operandOrResult.getOwner();
112 std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
113 if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
114 owner->removeAttr(name);
115}
116
117// Explicit instantiation for OpResult
118template void
120
121// Explicit instantiation for OpOperand
122template void
124
126 op->walk([&](Operation *nestOp) {
127 // Remove all attributes of DistributeLayoutAttr type
128 SmallVector<StringAttr> attrsToRemove;
129 for (auto namedAttr : nestOp->getAttrs()) {
130 if (isa<DistributeLayoutAttr>(namedAttr.getValue()))
131 attrsToRemove.push_back(namedAttr.getName());
132 }
133 for (auto attrName : attrsToRemove)
134 nestOp->removeAttr(attrName);
135 });
136}
137
138/// Infers the source layout attribute for a broadcast operation given the
139/// result layout attribute, result shape, source shape.
140xegpu::DistributeLayoutAttr
141xegpu::inferBroadcastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
142 ArrayRef<int64_t> resShape,
143 ArrayRef<int64_t> srcShape) {
144
145 SmallVector<int64_t> bcastDims;
146 auto returnLayout = resLayout;
147
148 // Handling broadcast from low-rank to high-rank (e.g., 1D to 2D) case.
149 int dimDiff = resShape.size() - srcShape.size();
150
151 if (dimDiff > 0) {
152 // Adding the missing leading dims
153 for (int i = 0; i < dimDiff; i++)
154 bcastDims.push_back(i);
155
156 // Create a slice layout for the source
157 returnLayout = xegpu::SliceAttr::get(
158 resLayout.getContext(), resLayout,
159 DenseI64ArrayAttr::get(resLayout.getContext(), bcastDims));
160 }
161 return returnLayout;
162}
163
164/// Infers the source layout attribute for a reduction operation given the
165/// result layout attribute and reduced dims.
166xegpu::DistributeLayoutAttr
167xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
168 SmallVector<int64_t> reduceDims) {
169
170 assert(isa<xegpu::SliceAttr>(resLayout) &&
171 "reduction result layout must be slice layout");
172
173 xegpu::SliceAttr sliceLayout = dyn_cast<xegpu::SliceAttr>(resLayout);
174
175 assert((reduceDims == sliceLayout.getDims().asArrayRef()) &&
176 "reduction dims must match with slice dims");
177
178 return sliceLayout.getParent();
179}
180
181/// Infers the source layout attribute for a bitcast operation given the
182/// result layout attribute, result element type bitwidth, and source element
183/// type bitwidth.
184xegpu::DistributeLayoutAttr
185xegpu::inferBitCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
186 int resElemTyBitWidth, int srcElemTyBitWidth) {
187
188 SmallVector<int64_t> sgData = resLayout.getEffectiveSgDataAsInt();
189 SmallVector<int64_t> instData = resLayout.getEffectiveInstDataAsInt();
190 SmallVector<int64_t> laneData = resLayout.getEffectiveLaneDataAsInt();
191 size_t sgDataSize = sgData.size();
192 size_t instDataSize = instData.size();
193 size_t laneDataSize = laneData.size();
194 int64_t sgDataValue = -1;
195 int64_t instDataValue = -1;
196 int64_t laneDataValue = -1;
197 int64_t dim = resLayout.getRank() - 1;
198
199 if (srcElemTyBitWidth <= resElemTyBitWidth) {
200 int bitWidthRatio = resElemTyBitWidth / srcElemTyBitWidth;
201 if (sgDataSize)
202 sgDataValue = sgData.back() * bitWidthRatio;
203 if (instDataSize)
204 instDataValue = instData.back() * bitWidthRatio;
205 if (laneDataSize)
206 laneDataValue = laneData.back() * bitWidthRatio;
207 } else {
208 int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
209 if (sgDataSize) {
210 assert((sgData.back() % bitWidthRatio) == 0 &&
211 "sgData not divisible by bitWidthRatio");
212 sgDataValue = sgData.back() / bitWidthRatio;
213 }
214 if (instDataSize) {
215 assert((instData.back() % bitWidthRatio) == 0 &&
216 "instData not divisible by bitWidthRatio");
217 instDataValue = instData.back() / bitWidthRatio;
218 }
219 if (laneDataSize) {
220 assert((laneData.back() % bitWidthRatio) == 0 &&
221 "laneData not divisible by bitWidthRatio");
222 laneDataValue = laneData.back() / bitWidthRatio;
223 }
224 }
225
226 xegpu::DistributeLayoutAttr finalSrcLayout;
227 finalSrcLayout =
228 resLayout.setDimData(dim, sgDataValue, instDataValue, laneDataValue);
229
230 return finalSrcLayout;
231}
232
233/// Infers the source layout attribute for an insert strided slice operation
234/// given the result layout attribute, result shape, and source shape. Removes
235/// leading dimensions from the result layout to match the source shape size.
236xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
237 xegpu::DistributeLayoutAttr resLayout, ArrayRef<int64_t> resShape,
238 ArrayRef<int64_t> srcShape) {
239
240 int srcShapeSize = srcShape.size();
241 int resShapeSize = resShape.size();
242 int dimDiff = resShapeSize - srcShapeSize;
243
244 assert(isa<xegpu::LayoutAttr>(resLayout) &&
245 "insertStridedSlice result layout must be plain layout");
246 auto context = resLayout.getContext();
247 auto resInstData = resLayout.getEffectiveInstDataAsInt();
248 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
249 auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
250
251 if (resInstData.size() != 0) {
252 SmallVector<int> inferredInstData(srcShapeSize);
253 for (int i = 0; i < srcShapeSize; i++)
254 inferredInstData[i] = resInstData[i + dimDiff];
255 return xegpu::LayoutAttr::get(context, inferredInstData);
256 }
257
258 if (resLaneLayout.size() != 0) {
259 SmallVector<int> inferredLaneLayout(srcShapeSize);
260 SmallVector<int> inferredLaneData(srcShapeSize);
261 for (int i = 0; i < srcShapeSize; i++) {
262 inferredLaneLayout[i] = resLaneLayout[i + dimDiff];
263 inferredLaneData[i] = resLaneData[i + dimDiff];
264 }
265 return xegpu::LayoutAttr::get(context, inferredLaneLayout,
266 inferredLaneData);
267 }
268 return nullptr;
269}
270
271/// Infers the source layout attribute for a shape cast operation given the
272/// result layout attribute, result shape, and source shape.
273xegpu::DistributeLayoutAttr
274xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
275 ArrayRef<int64_t> resShape,
276 ArrayRef<int64_t> srcShape) {
277
278 // There are three use cases:
279 // 1. expand dims of low-rank dimensions (e.g., 1D to 2D): to set up the
280 // tensor before broadcast
281 // 2. split dim of a high-rank dimension (e.g., 1D to 2D): to setup tensor
282 // for multi-stage reduction
283 // 3. combines all dims to a single dim and put in the innermost dim in 2d as
284 // [1, combinedData] or [combinedData]. Say, [2, 4, 8] -> [1, 64] or [64]
285 // Use cases are only supported after workgroup distribution,
286 // like cross-sg reduction saves multidimension data to
287 // 1D slm buffer, shapecast inserted by cse/canonicalization passes.
288
289 // Use case 1: Shapes only differ by expanding unit dimensions, for broadcast
290 SmallVector<int64_t> expandedUnitDims;
291
292 if (xegpu::matchUnitDimExpansion(srcShape, resShape, expandedUnitDims)) {
293 // create a slice layout for the source by removing the expanded unit dims
294 auto sliceDimsAttr = DenseI64ArrayAttr::get(
295 resLayout.getContext(), ArrayRef<int64_t>(expandedUnitDims));
296 auto srcLayout =
297 xegpu::SliceAttr::get(resLayout.getContext(), resLayout, sliceDimsAttr);
298 return srcLayout;
299 }
300
301 // Use case 2: Dim split from source to result, for multi-stage reduction
302 SmallVector<SmallVector<int64_t>> splitDimGroups;
303 if (xegpu::matchSplitDimExpansion(srcShape, resShape, splitDimGroups)) {
304 auto srcLayout = resLayout;
305 for (const auto &dimGroup : splitDimGroups)
306 srcLayout = srcLayout.collapseDims(dimGroup);
307
308 return srcLayout;
309 }
310
311 // Use case 3: Collaspse to innermost dim, for cross-sg reduction to SLM
312 auto matchCollapseToInnermostDim = [&](ArrayRef<int64_t> src,
313 ArrayRef<int64_t> dst) -> bool {
314 // only one non-unit dim in dst which is the innermost dim
315 if ((dst.size() != 2) && (dst.size() != 1))
316 return false;
317 int64_t srcSize = std::accumulate(src.begin(), src.end(), 1LL,
318 std::multiplies<int64_t>());
319 if (dst.size() == 1)
320 return (dst[0] == srcSize);
321 return (dst[0] == 1) && (dst[1] == srcSize);
322 };
323
324 if (matchCollapseToInnermostDim(srcShape, resShape)) {
325 int srcShapeSize = srcShape.size();
326 int resShapeSize = resShape.size();
327 auto context = resLayout.getContext();
328 auto resInstData = resLayout.getEffectiveInstDataAsInt();
329 auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
330 auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
331
332 // Extract layout info from result's innermost dimension and apply to
333 // source's innermost dimension while setting all other dimensions to 1.
334 // The inferred layout is restricted by srcShape to ensure it fits within
335 // the source dimensions.
336 // Examples 1:
337 // srcShape=[8, 16, 32], resShape=[1, 4096]
338 // resInstData=[1, 16]
339 // -> inferredInstData=[1, 1, min(16, 32)]=[1, 1, 16]
340 // Examples 2:
341 // srcShape=[4, 8, 64], resShape=[2048]
342 // resLaneLayout=[16], resLaneData=[2]
343 // -> inferredLaneLayout=[1, 1, 16]
344 // -> inferredLaneData=[1, 1, min(2, 64/16)]=[1, 1, 2]
345
346 if (resInstData.size() != 0) {
347 // assert resInstData must be 1 for all but the innermost dim
348 for (int i = 0; i < resShapeSize - 1; i++) {
349 assert(resInstData[i] == 1 &&
350 "only innermost dim can have non-unit instData");
351 }
352 SmallVector<int> inferredInstData(srcShapeSize, 1);
353 inferredInstData[srcShapeSize - 1] =
354 std::min(resInstData[resShapeSize - 1], srcShape[srcShapeSize - 1]);
355 return xegpu::LayoutAttr::get(context, inferredInstData);
356 }
357
358 if (resLaneLayout.size() != 0) {
359 for (int i = 0; i < resShapeSize - 1; i++) {
360 assert(resLaneData[i] == 1 &&
361 "only innermost dim can have non-unit instData");
362 }
363 assert(srcShape.back() % resLaneLayout.back() == 0 &&
364 "source innermost dim must be >= result lane layout");
365 SmallVector<int> inferredLaneLayout(srcShapeSize, 1);
366 SmallVector<int> inferredLaneData(srcShapeSize, 1);
367 inferredLaneLayout.back() = resLaneLayout.back();
368 inferredLaneData.back() = std::min(
369 resLaneData.back(), srcShape.back() / inferredLaneLayout.back());
370 return xegpu::LayoutAttr::get(context, inferredLaneLayout,
371 inferredLaneData);
372 }
373 }
374 llvm_unreachable("running into unsupported shape cast scenarios");
375 return nullptr;
376}
377
378/// Sets up layout for reduction operations by creating a SliceAttr for the
379/// result.
380///
381/// Algorithm Overview:
382/// This function attempts to construct a source layout that, when sliced along
383/// reduction dimensions, produces a result layout compatible with the
384/// consumer layout.
385///
386/// For subgroup layouts, it first tries to align the source layout's subgroup
387/// layout and data with the consumer's layout on non-reduction dimensions.
388/// Then, it distributes remaining subgroups across reduction dimensions. This
389/// avoids subgroup data redistribution overhead between the reduced result and
390/// its consumer.
391///
392/// InstData requries {1, ..., min(maxReduceVectorSize, srcShape),subgroupSize}
393/// Lane Layout requires {1, ..., 1, subgroupSize}
394/// Lane data requires {1, ..., min(maxReduceVectorSize, srcShape), 1}
395///
396/// Examples:
397/// 1. Subgroup layout - Row reduction on 2D tensor:
398/// srcShape=[32, 64], reductionDims=[1], resShape=[32], subgroupSize=16,
399/// workgroupSize=32
400/// Consumer Layout:
401/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
402/// [1]>} Result: srcLayout with sgLayout=[4, 8], sgData=[8, 8] (matches
403/// consumer on non-reduction dim, minimizing data redistribution on
404/// reduction dim)
405/// 2. Subgroup layout - Same example above but consumer has different layout:
406/// sgLayout=[32], sgData=[1]
407/// Result: srcLayout with sgLayout=[32,1], sgData=[1, 64]
408/// (distributes all subgroups on non reduction dim)
409///
410/// 2. InstData layout - Column reduction:
411/// srcShape=[32, 64], reductionDims=[0], subgroupSize=16
412/// Result: instData=[1, 16] (maxReduceVectorSize=1, subgroupSize on
413/// innermost)
414///
415/// 3. Lane layout - Multi-dimensional reduction:
416/// srcShape=[16, 32, 64], reductionDims=[1], subgroupSize=16
417/// Result: laneLayout=[1, 1, 16], laneData=[1, 1, 1]
418/// (subgroupSize on innermost dim, max vector size on reduction dim)
419
421 xegpu::LayoutKind layoutKind, VectorType srcVecTy,
422 DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
423 const xegpu::uArch::uArch *uArch) {
424
425 auto srcShape = srcVecTy.getShape();
426 int srcRank = srcShape.size();
427 auto context = consumerLayout.getContext();
428
429 // Reduction layout requires at least 2D tensors
430 if (srcRank < 2)
431 return nullptr;
432
433 // Helper lambda to convert int64 vectors to int32 DenseArrayAttr
434 auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
435 SmallVector<int32_t> vec32(vec.begin(), vec.end());
436 return DenseI32ArrayAttr::get(context, vec32);
437 };
438
439 // Extract original plain layout for workgroup/subgroup size recovery
440 xegpu::SliceAttr consumerSliceLayout =
441 dyn_cast<xegpu::SliceAttr>(consumerLayout);
442 DistributeLayoutAttr plainLayout =
443 consumerSliceLayout ? consumerSliceLayout.flatten().getParent()
444 : consumerLayout;
445
446 const int subgroupSize = uArch->getSubgroupSize();
447 int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
448
449 xegpu::DistributeLayoutAttr srcLayout;
450
451 if (layoutKind == xegpu::LayoutKind::Subgroup) {
452 auto sgLayoutVec = plainLayout.getEffectiveSgLayoutAsInt();
453 const int workgroupSize = std::accumulate(
454 sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
455 SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank);
456 SmallVector<int64_t> consumerSgLayout =
457 consumerLayout.getEffectiveSgLayoutAsInt();
458 int remainingSgCount = workgroupSize;
459 int consumerIdx = consumerSgLayout.size() - 1;
460
461 // First pass: Match consumer's layout on non-reduction dimensions
462 for (int i = srcRank - 1; i >= 0; i--) {
463 if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
464 sgLayout[i] = consumerSgLayout[consumerIdx];
465 assert((srcShape[i] % sgLayout[i] == 0) &&
466 "source shape not divisible by consumer sg_layout");
467 sgData[i] = srcShape[i] / sgLayout[i];
468 remainingSgCount /= sgLayout[i];
469 consumerIdx--;
470 }
471 }
472
473 // Second pass: Distribute remaining subgroups across reduction dimensions
474 for (int i = srcRank - 1; i >= 0; i--) {
475 if (llvm::is_contained(reductionDims, i)) {
476 sgLayout[i] =
477 std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
478 assert((srcShape[i] % sgLayout[i] == 0) &&
479 "source shape not divisible by sg_layout");
480 sgData[i] = srcShape[i] / sgLayout[i];
481 remainingSgCount /= sgLayout[i];
482 }
483 }
484
485 assert(remainingSgCount == 1 && "not all subgroups distributed");
486 srcLayout = xegpu::LayoutAttr::get(
487 context, toInt32Attr(sgLayout), toInt32Attr(sgData),
488 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
489 /*lane_data =*/nullptr, /*order =*/nullptr);
490
491 } else if (layoutKind == xegpu::LayoutKind::InstData) {
492
493 SmallVector<int64_t> instData(srcRank, 1);
494 instData[srcRank - 2] =
495 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
496 instData[srcRank - 1] = subgroupSize;
497 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
498
499 } else if (layoutKind == xegpu::LayoutKind::Lane) {
500
501 SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
502 laneLayout[srcRank - 1] = subgroupSize;
503 laneData[srcRank - 2] =
504 std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
505 srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
506 toInt32Attr(laneData),
507 consumerLayout.getOrder());
508 }
509
510 return xegpu::SliceAttr::get(context, srcLayout,
511 DenseI64ArrayAttr::get(context, reductionDims));
512}
513
514/// Sets up the result layout for a bitcast operation.
515/// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
516/// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
517/// result layout can be correctly divided back to the source layout during
518/// inference.
519///
520/// Examples:
521/// 1. Casting f32 -> f16 (32-bit to 16-bit, bitWidthRatio = 2):
522/// Consumer layout: instData=[1, 16], subgroupSize=16
523/// Source shape: [8, 32]
524/// Result layout: instData=[1, 32] (16 * 2)
525/// The innermost dimension is multiplied by 2 to maintain consistency.
526///
527/// 2. Casting f32 -> i8 (32-bit to 8-bit, bitWidthRatio = 4):
528/// Consumer instData=[1, 16], subgroupSize=16
529/// Source shape: [4, 128]
530/// adjust the instData from [1, 16] to [1, 16 * 4 = 64]
531///
532/// 3. Casting i8 -> i32 (8-bit to 32-bit, bitWidthRatio = 1/4):
533/// Consumer layout: laneLayout=[1, 16], laneData=[1, 4]
534/// No adjustment needed - returns consumer layout directly.
535///
xegpu::DistributeLayoutAttr xegpu::setupBitCastResultLayout(
    xegpu::LayoutKind layoutKind, VectorType srcVecTy, VectorType resVecTy,
    DistributeLayoutAttr consumerLayout, const xegpu::uArch::uArch *uArch) {

  int srcElemTyBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
  int resElemTyBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();

  ArrayRef<int64_t> srcShape = srcVecTy.getShape();
  SmallVector<int64_t> sgData = consumerLayout.getEffectiveSgDataAsInt();
  SmallVector<int64_t> instData = consumerLayout.getEffectiveInstDataAsInt();
  SmallVector<int64_t> laneData = consumerLayout.getEffectiveLaneDataAsInt();
  // Only the innermost dimension is affected by the bitwidth change.
  size_t dim = srcShape.size() - 1;
  // -1 signals "leave this layout component unchanged" to setDimData below.
  int64_t sgDataValue = -1;
  int64_t instDataValue = -1;
  int64_t laneDataValue = -1;

  const int subgroupSize = uArch->getSubgroupSize();

  if (srcElemTyBitWidth > resElemTyBitWidth) {
    // When casting to a smaller bitwidth, multiply the result layout
    // accordingly to ensure it can be divided by the ratio back to the
    // source layout.
    int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
    int innermostDimLaneLayout = subgroupSize;
    if (layoutKind == xegpu::LayoutKind::Subgroup) {
      assert(sgData.size() == srcShape.size() &&
             "sgData must be available for all dimensions");
      // sgData is taken as-is; setDimData re-applies the same value.
      sgDataValue = sgData[dim];
    } else if (layoutKind == xegpu::LayoutKind::InstData) {
      assert(instData.size() == srcShape.size() &&
             "instData must be available for all dimensions");
      instDataValue = instData[dim];
      // Adjust instDataValue so it still fits within an instruction after
      // dividing by bitWidthRatio. Doubling preserves power-of-two shapes;
      // the loop stops once the value is divisible by laneLayout * ratio or
      // exceeds the innermost source dim (the assert below then fires).
      while ((instDataValue <= srcShape[dim]) &&
             (instDataValue % (innermostDimLaneLayout * bitWidthRatio) != 0))
        instDataValue *= 2;
      assert((srcShape[dim] % instDataValue) == 0 &&
             "srcShape, instData, and lanelayout for innermost must be 2^n !");
    } else if (layoutKind == xegpu::LayoutKind::Lane) {
      assert(laneData.size() == srcShape.size() &&
             "laneData must be available for all dimensions");
      laneDataValue = laneData[dim];
      // Grow laneData until it can be divided back by the bitwidth ratio.
      while ((laneDataValue <= srcShape[dim]) &&
             (laneDataValue % bitWidthRatio != 0))
        laneDataValue *= 2;
    }
    // Now set only instData and laneData, preserving sgData
    xegpu::DistributeLayoutAttr resLayout;
    resLayout = consumerLayout.setDimData(dim, sgDataValue, instDataValue,
                                          laneDataValue);
    return resLayout;
  }
  // Casting to an equal or wider bitwidth needs no adjustment (see example 3
  // in the header comment): the consumer layout is usable directly.
  return consumerLayout;
}
591
592/// Sets up the result layout for an insert strided slice operation.
593/// Creates a result layout based on the specified layout kind (InstData or
594/// Lane).
595/// Subgroup layout is currently not supported for this operation.
596/// InstData layout is first set to be {1, .., subgroupSize}.
597/// Lane layout is first set to be {1, ..., subgroupSize} with lane data {1,
598/// ..., 1}. The instData and laneData is then adjusted to contain packed data,
599/// by checking if the consumerLayout's innermost dimension.
600///
601/// Examples:
602/// 1. InstData layout without packing:
603/// resShape=[8, 32], subgroupSize=16, bitwidth=32
604/// packingFactor=1, packedDataSize=16
605/// consumerLayout: instData=[1, 16]
606/// Result: instData=[1, 16]
607///
608/// 2. InstData layout with packing:
609/// resShape=[8, 64], subgroupSize=16, bitwidth=8, packingFactor=4
610/// consumerLayout: instData=[1, 64]
611/// Result: instData=[1, 64] (adjusted for packed data)
612///
613/// 3. Lane layout without packing:
614/// resShape=[4, 64], subgroupSize=16, bitwidth=32
615/// consumerLayout: laneLayout=[1, 16], laneData=[1, 1]
616/// Result: laneLayout=[1, 16], laneData=[1, 1]
617///
618/// 4. Lane layout with packing:
619/// resShape=[4, 64], subgroupSize=16, bitwidth=16, packingFactor=2
620/// consumerLayout: laneLayout=[1, 16], laneData=[1, 2]
621/// Result: laneLayout=[1, 16], laneData=[1, 2] (adjusted for packed data)
622xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
623 xegpu::LayoutKind layoutKind, VectorType srcVectorTy,
624 VectorType resVectorTy, xegpu::DistributeLayoutAttr consumerLayout,
625 const xegpu::uArch::uArch *uArch) {
626
627 xegpu::DistributeLayoutAttr requiredResLayout;
628 auto subgroupSize = uArch->getSubgroupSize();
629 auto context = resVectorTy.getContext();
630 auto resShape = resVectorTy.getShape();
631 int resShapeSize = resShape.size();
632 auto srcShape = srcVectorTy.getShape();
633 SmallVector<int64_t> consumerInstData =
634 consumerLayout.getEffectiveInstDataAsInt();
635 SmallVector<int64_t> consumerLaneData =
636 consumerLayout.getEffectiveLaneDataAsInt();
637
638 SmallVector<int> instData(resShapeSize, 1);
639 SmallVector<int> laneLayout(resShapeSize, 1);
640 SmallVector<int> laneData(resShapeSize, 1);
641
642 const unsigned packingSize{uArch->getGeneralPackedFormatBitSize()};
643 unsigned bitwidth = resVectorTy.getElementType().getIntOrFloatBitWidth();
644 int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
645 int packedDataSize = subgroupSize * packingFactor;
646
647 if (layoutKind == xegpu::LayoutKind::Subgroup) {
648 assert(true &&
649 "subgroup layout assignment not supported for insertStridedSlice.");
650 } else if (layoutKind == xegpu::LayoutKind::InstData) {
651 assert(srcShape.back() >= subgroupSize &&
652 "source innermost dim must be >= subgroupSize");
653 instData.back() = subgroupSize;
654 if (consumerInstData.back() == packedDataSize &&
655 srcShape.back() >= packedDataSize)
656 instData.back() = packedDataSize;
657 requiredResLayout = xegpu::LayoutAttr::get(context, instData);
658 } else if (layoutKind == xegpu::LayoutKind::Lane) {
659 laneLayout.back() = subgroupSize;
660 laneData.back() = 1;
661 if (consumerLaneData.back() == packingFactor &&
662 srcShape.back() >= packedDataSize)
663 laneData.back() = packingFactor;
664 requiredResLayout = xegpu::LayoutAttr::get(context, laneLayout, laneData);
665 }
666 return requiredResLayout;
667}
668
669/// Sets up the anchor layout for load gather and load matrix operation.
670/// load matrix lowers to load gather and 1d block load. All of them share the
671/// same layout setup logic.
672/// For Subgroup layout, uses the consumer layout directly.
673/// non-chunked loads:
674/// InstData = {1, ..., min(consumer, maxLaneLoadSize * subgroupSize)}
675/// LaneLayout = {1, ..., subgroupSize}
676/// lane_data = {1, ..., min(consumer, maxLaneLoadSize)}
677/// chunked loads:
678/// InstData = {subgroupSize, min(consumer, maxLaneLoadSize)}
679/// LaneLayout = {subgroupSize, 1}
680/// lane_data={1,min(consumer, maxLaneLoadSize)}
681static xegpu::DistributeLayoutAttr setupGenericLoadAnchorLayout(
682 xegpu::LayoutKind layoutKind, mlir::MLIRContext *context,
683 xegpu::DistributeLayoutAttr consumerLayout, bool isChunkedLoad,
684 int maxChunkSize, ArrayRef<int64_t> resShape, int subgroupSize) {
685
686 if (layoutKind == xegpu::LayoutKind::Subgroup)
687 return consumerLayout;
688
689 SmallVector<int64_t> consumerInstData =
690 consumerLayout.getEffectiveInstDataAsInt();
691 SmallVector<int64_t> consumerLaneData =
692 consumerLayout.getEffectiveLaneDataAsInt();
693
694 SmallVector<int> instData(resShape.size(), 1);
695 SmallVector<int> laneLayout(resShape.size(), 1);
696 SmallVector<int> laneData(resShape.size(), 1);
697
698 if (!isChunkedLoad) {
699 if (layoutKind == xegpu::LayoutKind::InstData) {
700 instData.back() = std::min(static_cast<int>(consumerInstData.back()),
701 maxChunkSize * subgroupSize);
702 return xegpu::LayoutAttr::get(context, instData);
703 } else if (layoutKind == xegpu::LayoutKind::Lane) {
704 laneData.back() =
705 std::min(static_cast<int>(consumerLaneData.back()), maxChunkSize);
706 laneLayout.back() = std::min(static_cast<int64_t>(subgroupSize),
707 resShape.back() / laneData.back());
708 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
709 }
710 } else {
711 assert(resShape.size() == 2 && "Chunked Store must access 2D tensor tile.");
712 if (layoutKind == xegpu::LayoutKind::InstData) {
713 instData[0] = subgroupSize;
714 instData[1] =
715 std::min(static_cast<int>(consumerInstData[1]), maxChunkSize);
716 return xegpu::LayoutAttr::get(context, instData);
717 } else if (layoutKind == xegpu::LayoutKind::Lane) {
718 laneLayout[0] = subgroupSize;
719 laneData[1] =
720 std::min(static_cast<int>(consumerLaneData[1]), maxChunkSize);
721 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
722 }
723 }
724 return nullptr;
725}
726
727/// Sets up the anchor layout for a load gather operation.
728xegpu::DistributeLayoutAttr xegpu::setupLoadGatherAnchorLayout(
729 xegpu::LayoutKind layoutKind, VectorType resVecTy, int chunkSize,
730 xegpu::DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch) {
731
732 const int subgroupSize = uArch->getSubgroupSize();
733 ArrayRef<int64_t> resShape = resVecTy.getShape();
734 auto context = resVecTy.getContext();
735 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
736
737 const auto *uArchInstruction =
738 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
740 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
741
742 return setupGenericLoadAnchorLayout(layoutKind, context, consumerLayout,
743 (chunkSize > 1), maxChunkSize, resShape,
744 subgroupSize);
745}
746
747/// Sets up the anchor layout for load matrix operation.
748/// TODO: enhance load matrix to indicate lowering to chunked load or not.
749xegpu::DistributeLayoutAttr
751 VectorType resVecTy,
752 xegpu::DistributeLayoutAttr consumerLayout,
753 const xegpu::uArch::uArch *uArch) {
754
755 const int subgroupSize = uArch->getSubgroupSize();
756 ArrayRef<int64_t> resShape = resVecTy.getShape();
757 auto context = resVecTy.getContext();
758 auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
759
760 const auto *uArchInstruction =
761 dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
763 int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
764 return setupGenericLoadAnchorLayout(layoutKind, context, consumerLayout,
765 false, maxChunkSize, resShape,
766 subgroupSize);
767}
768
769/// Sets up the anchor layout for store scatter and store matrix operation.
770/// store matrix lowers to store scatter and 1d block store. All of them share
771/// the same layout setup logic. For Subgroup layout, not support yet.
772/// non-chunked stores:
773/// InstData = {1, ..., subgroupSize}
774/// LaneLayout = {1, ..., subgroupSize}
775/// lane_data = {1, ..., 1}
776/// chunked stores:
777/// InstData = {subgroupSize, min(srcVec, maxLaneStoreSize)}
778/// LaneLayout = {subgroupSize, 1}
779/// lane_data={1,min(srcVec, maxLaneStoreSize)}
780static xegpu::DistributeLayoutAttr
782 mlir::MLIRContext *context, bool isChunkedStore,
783 int maxChunkSize, ArrayRef<int64_t> srcShape,
784 int subgroupSize) {
785
786 int srcShapeSize = srcShape.size();
787 SmallVector<int> instData(srcShapeSize, 1);
788 SmallVector<int> laneLayout(srcShapeSize, 1);
789 SmallVector<int> laneData(srcShapeSize, 1);
790
791 if (layoutKind == xegpu::LayoutKind::Subgroup) {
792 assert(true &&
793 "subgroup layout assignment not supported for storeScatter.");
794 return nullptr;
795 }
796
797 if (!isChunkedStore) {
798 if (layoutKind == xegpu::LayoutKind::InstData) {
799 instData[srcShapeSize - 1] =
800 std::min(subgroupSize, static_cast<int>(srcShape.back()));
801 return xegpu::LayoutAttr::get(context, instData);
802 } else if (layoutKind == xegpu::LayoutKind::Lane) {
803 laneLayout[srcShapeSize - 1] =
804 std::min(subgroupSize, static_cast<int>(srcShape.back()));
805 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
806 }
807 } else {
808 assert(srcShapeSize == 2 && "Chunked Store must access 2D tensor tile.");
809 if (layoutKind == xegpu::LayoutKind::InstData) {
810 instData[0] = subgroupSize;
811 instData[1] = std::min(static_cast<int>(srcShape[1]), maxChunkSize);
812 return xegpu::LayoutAttr::get(context, instData);
813 } else if (layoutKind == xegpu::LayoutKind::Lane) {
814 laneLayout[0] = subgroupSize;
815 laneData[1] = std::min(static_cast<int>(srcShape[1]), maxChunkSize);
816 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
817 }
818 }
819 return nullptr;
820}
821
822/// Sets up the anchor layout for a store scatter operation.
823xegpu::DistributeLayoutAttr
825 VectorType srcVecTy, int chunkSize,
826 const uArch::uArch *uArch) {
827
828 const int subgroupSize = uArch->getSubgroupSize();
829 ArrayRef<int64_t> srcShape = srcVecTy.getShape();
830 auto context = srcVecTy.getContext();
831 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
832
833 const auto *uArchInstruction =
834 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
836 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
837 return setupGenericStoreAnchorLayout(layoutKind, context, (chunkSize > 1),
838 maxChunkSize, srcShape, subgroupSize);
839}
840
841/// Sets up the anchor layout for a store matrix operation.
842xegpu::DistributeLayoutAttr
844 VectorType srcVecTy,
845 const xegpu::uArch::uArch *uArch) {
846
847 const int subgroupSize = uArch->getSubgroupSize();
848 ArrayRef<int64_t> srcShape = srcVecTy.getShape();
849 auto context = srcVecTy.getContext();
850 auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
851
852 const auto *uArchInstruction =
853 dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
855 int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
856
857 return setupGenericStoreAnchorLayout(layoutKind, context, false, maxChunkSize,
858 srcShape, subgroupSize);
859}
860
861// This function returns the default lane layout for a given vector type.
862// - `packingSize` means multiple consecutive elements can be accessed together
863// as a single unit.
864// - `vnni` means data packing is column-wise (i.e., 2x1xf16 with vnni vs.
865// 1x2xf16 w/o vnni).
866template <typename RankedTy>
867static xegpu::LayoutAttr getDefaultLaneLayout2DBlockIo(
868 RankedTy ty, const xegpu::uArch::uArch *uArch,
869 std::optional<unsigned> packingSize = std::nullopt, bool vnni = false) {
870 // Expecting a 1D or 2D vector.
871 assert(((ty.getRank() == 1 && !vnni) || ty.getRank() == 2) &&
872 "Expected 1D non-vnni or 2D vector.");
873 // Expecting int or float element type.
874 assert(ty.getElementType().isIntOrFloat() &&
875 "Expected int or float element type.");
876
877 auto context = ty.getContext();
878 auto rank = ty.getRank();
879 SmallVector<int> laneLayout(rank, 1);
880 SmallVector<int> laneData(rank, 1);
881 if (packingSize.has_value()) {
882 unsigned bitwidth = ty.getElementType().getIntOrFloatBitWidth();
883 int &laneDataPos = vnni ? laneData[rank - 2] : laneData.back();
884 laneDataPos = bitwidth < *packingSize ? *packingSize / bitwidth : 1;
885 }
886 laneLayout.back() = uArch->getSubgroupSize();
887 return xegpu::LayoutAttr::get(context, laneLayout, laneData);
888}
889
890// This function returns all layouts for the given sgCount, whose sgData:
891// 1. Evenly divides the wgShape.
892// 2. Is a multiple of instData.
893// Example:
894// wgShape = [128, 64], instData = [8, 16], sgCount = 32
895// Returns layouts:
896// [(8,4), (16,2)], which correspond to sgData [16,16] and [8,32].
897using LayoutRepresentation = std::pair<int64_t, int64_t>;
900 int64_t sgCount) {
902 for (int sgLayout0 = 1; sgLayout0 <= sgCount; ++sgLayout0) {
903 if (sgCount % sgLayout0)
904 continue;
905 int64_t sgLayout1 = sgCount / sgLayout0;
906 int64_t sgData0 = wgShape[0] / sgLayout0;
907 int64_t sgData1 = wgShape[1] / sgLayout1;
908 if ((wgShape[0] % sgLayout0 || wgShape[1] % sgLayout1) ||
909 (sgData0 % instData[0] || sgData1 % instData[1]))
910 continue;
911 candidates.emplace_back(sgLayout0, sgLayout1);
912 }
913 // Sort primarily by how balanced they are
914 // (i.e., minimize the absolute difference between the two dimensions), and
915 // secondarily by the first dimension in ascending order.
916 llvm::sort(candidates, [](const LayoutRepresentation &lhs,
917 const LayoutRepresentation &rhs) {
918 int diffLhs = std::abs(lhs.first - lhs.second);
919 int diffRhs = std::abs(rhs.first - rhs.second);
920 if (diffLhs != diffRhs)
921 return diffLhs < diffRhs;
922 return lhs.first < rhs.first;
923 });
924 return candidates;
925}
926
927/// Sets up the anchor layouts for dpas operands (A, B, and C/D).
928/// The numSg and consumerLayout (optional) are only used by sg layout creation.
929std::optional<
930 std::tuple<xegpu::DistributeLayoutAttr, xegpu::DistributeLayoutAttr,
931 xegpu::DistributeLayoutAttr>>
932xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
933 VectorType bTy, VectorType cdTy,
934 xegpu::DistributeLayoutAttr consumerLayout,
935 const xegpu::uArch::uArch *uArch, int numSg) {
936 auto context = aTy.getContext();
937 const auto *uArchInstruction =
938 dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
940
941 auto getInstDataVectors = [&]()
942 -> std::optional<std::tuple<SmallVector<int64_t>, SmallVector<int64_t>,
944 const int subgroupSize = uArch->getSubgroupSize();
945 const unsigned dataALen = aTy.getShape().front();
946 auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
947 const int maxALen =
948 xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
949
950 const unsigned dataBLen = bTy.getShape().back();
951 auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType());
952 const int maxBLen =
953 xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
954
955 auto supportedCLen = uArchInstruction->getSupportedN(cdTy.getElementType());
956 const int maxCLen =
957 xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedCLen));
958 if (maxALen == -1 || maxBLen == -1 || maxCLen == -1)
959 return std::nullopt;
960
961 SmallVector<int64_t> instDataA(aTy.getRank(), 1);
962 instDataA[aTy.getRank() - 2] = maxALen;
963 instDataA[aTy.getRank() - 1] = subgroupSize;
964 SmallVector<int64_t> instDataB(bTy.getRank(), 1);
965 instDataB[bTy.getRank() - 2] = subgroupSize;
966 instDataB[bTy.getRank() - 1] = maxBLen;
967 SmallVector<int64_t> instDataCD(cdTy.getRank(), 1);
968 instDataCD[cdTy.getRank() - 2] = maxALen;
969 instDataCD[cdTy.getRank() - 1] = maxCLen;
970 return std::make_tuple(instDataA, instDataB, instDataCD);
971 };
972
973 if (layoutKind == xegpu::LayoutKind::Subgroup) {
974 assert(numSg > 0 &&
975 "Number of subgroups must be provided for sg layout creation.");
976 auto instDataVecs = getInstDataVectors();
977 if (!instDataVecs)
978 return std::nullopt;
979 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
980 assert(instDataA.size() == 2 && instDataB.size() == 2 &&
981 instDataCD.size() == 2 &&
982 "Sg layout creation expects valid 2D inst data");
983
984 std::optional<LayoutRepresentation> consumerSgLayout = std::nullopt;
985 if (consumerLayout && consumerLayout.isForWorkgroup()) {
986 SmallVector<int64_t> sgLayoutD =
987 consumerLayout.getEffectiveSgLayoutAsInt();
988 consumerSgLayout = std::make_pair(sgLayoutD[0], sgLayoutD[1]);
989 }
990
991 // Step 1. Get all valid layouts for A, B and C/D operands.
992 // Order them from most balanced to least balanced.
993 auto layoutsA = getValidLayouts(aTy.getShape(), instDataA, numSg);
994 auto layoutsB = getValidLayouts(bTy.getShape(), instDataB, numSg);
995 auto layoutsCD = getValidLayouts(cdTy.getShape(), instDataCD, numSg);
996 if (layoutsA.empty() || layoutsB.empty() || layoutsCD.empty())
997 return std::nullopt;
998
999 // Step 2. If the consumer layout can be reused for all operands, that
1000 // layout is chosen. Otherwise, pick the most balanced subgroup layout
1001 // that is valid for A, B and C (if present) operands
1002 llvm::DenseSet<LayoutRepresentation> setA(layoutsA.begin(), layoutsA.end());
1003 llvm::DenseSet<LayoutRepresentation> setCD(layoutsCD.begin(),
1004 layoutsCD.end());
1005 std::optional<LayoutRepresentation> bestPick;
1006 for (auto &sgLayout : layoutsB) {
1007 if (setA.contains(sgLayout) && setCD.contains(sgLayout)) {
1008 // Is in (A and B and CD) and matches consumer -> best pick
1009 if (consumerSgLayout.has_value() && sgLayout == *consumerSgLayout) {
1010 bestPick = sgLayout;
1011 break;
1012 }
1013 // Is in (A and B and CD) layoutsB is ordered from most
1014 // balanced to least. So the first one we see is the most balanced one,
1015 // remember it and later only update if there is one that matches the
1016 // consumer.
1017 if (!bestPick)
1018 bestPick = sgLayout;
1019 }
1020 }
1021 // Step 3. If there is no subgroup layout compatible with A, B and C (if
1022 // present) operands, we fail.
1023 if (!bestPick)
1024 return std::nullopt;
1025 SmallVector<int> sgLayout = {static_cast<int>(bestPick->first),
1026 static_cast<int>(bestPick->second)};
1027 SmallVector<int> sgDataA = {
1028 static_cast<int>(aTy.getShape()[0] / sgLayout[0]),
1029 static_cast<int>(aTy.getShape()[1] / sgLayout[1])};
1030 SmallVector<int> sgDataB = {
1031 static_cast<int>(bTy.getShape()[0] / sgLayout[0]),
1032 static_cast<int>(bTy.getShape()[1] / sgLayout[1])};
1033 SmallVector<int> sgDataCD = {
1034 static_cast<int>(cdTy.getShape()[0] / sgLayout[0]),
1035 static_cast<int>(cdTy.getShape()[1] / sgLayout[1])};
1036
1037 auto dpasALayout = xegpu::LayoutAttr::get(
1038 context, DenseI32ArrayAttr::get(context, sgLayout),
1039 DenseI32ArrayAttr::get(context, sgDataA),
1040 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
1041 /*lane_data =*/nullptr, /*order =*/nullptr);
1042
1043 auto dpasBLayout = xegpu::LayoutAttr::get(
1044 context, DenseI32ArrayAttr::get(context, sgLayout),
1045 DenseI32ArrayAttr::get(context, sgDataB),
1046 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
1047 /*lane_data =*/nullptr, /*order =*/nullptr);
1048
1049 auto dpasCDLayout = xegpu::LayoutAttr::get(
1050 context, DenseI32ArrayAttr::get(context, sgLayout),
1051 DenseI32ArrayAttr::get(context, sgDataCD),
1052 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
1053 /*lane_data =*/nullptr, /*order =*/nullptr);
1054 return std::make_tuple(dpasALayout, dpasBLayout, dpasCDLayout);
1055 } else if (layoutKind == xegpu::LayoutKind::InstData) {
1056 auto instDataVecs = getInstDataVectors();
1057 if (!instDataVecs)
1058 return std::nullopt;
1059 auto [instDataA, instDataB, instDataCD] = *instDataVecs;
1060 return std::make_tuple(
1061 xegpu::LayoutAttr::get(
1062 context, SmallVector<int>(instDataA.begin(), instDataA.end())),
1063 xegpu::LayoutAttr::get(
1064 context, SmallVector<int>(instDataB.begin(), instDataB.end())),
1065 xegpu::LayoutAttr::get(
1066 context, SmallVector<int>(instDataCD.begin(), instDataCD.end())));
1067 } else if (layoutKind == xegpu::LayoutKind::Lane) {
1068 auto aLayout = getDefaultLaneLayout2DBlockIo(
1069 aTy, uArch, uArchInstruction->getPackedFormatBitSizeA());
1070 auto bLayout = getDefaultLaneLayout2DBlockIo(
1071 bTy, uArch, uArchInstruction->getPackedFormatBitSizeB(), true);
1072 auto cdLayout = getDefaultLaneLayout2DBlockIo(
1073 cdTy, uArch, uArchInstruction->getPackedFormatBitSizeB());
1074 return std::make_tuple(aLayout, bLayout, cdLayout);
1075 }
1076 return std::nullopt;
1077}
1078
1079xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
1080 Operation *op = operand.getOwner();
1081 unsigned idx = operand.getOperandNumber();
1082 xegpu::DistributeLayoutAttr resLayout;
1083 if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
1084 resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
1085
1086 // For vector::BroadcastOp, infer the source layout from the result layout.
1087 if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
1088 if (!resLayout)
1089 return xegpu::DistributeLayoutAttr();
1090 auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
1091 if (!srcTy)
1092 return xegpu::DistributeLayoutAttr();
1094 resLayout, broadcast.getResultVectorType().getShape(),
1095 srcTy.getShape());
1096 }
1097
1098 // For vector::MultiDimReductionOp, infer source layout from result layout
1099 // using reduction dims. Acc operand is expected to have the same layout as
1100 // the result.
1101 if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
1102 if (!resLayout)
1103 return xegpu::DistributeLayoutAttr();
1104 if (idx == 0) {
1105 SmallVector<int64_t> reductionDims(reduction.getReductionDims());
1106 return xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
1107 }
1108 if (idx == 1)
1109 return resLayout;
1110 }
1111
1112 // For vector::BitCastOp, infer source layout from result layout using
1113 // element type bitwidths.
1114 if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
1115 if (!resLayout)
1116 return xegpu::DistributeLayoutAttr();
1117 int resElemBitWidth =
1118 bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
1119 int srcElemBitWidth =
1120 bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
1121 return xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
1122 srcElemBitWidth);
1123 }
1124
1125 // For vector::ShapeCastOp, infer source layout from result layout using
1126 // shapes.
1127 if (auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
1128 if (!resLayout)
1129 return xegpu::DistributeLayoutAttr();
1131 resLayout, shapeCast.getResultVectorType().getShape(),
1132 shapeCast.getSourceVectorType().getShape());
1133 }
1134
1135 // For vector::InsertStridedSliceOp, infer source layout from result layout.
1136 // Dest vector must have the same layout as the result.
1137 if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
1138 if (!resLayout)
1139 return xegpu::DistributeLayoutAttr();
1140 if (idx == 0)
1142 resLayout, insertSlice.getDestVectorType().getShape(),
1143 insertSlice.getSourceVectorType().getShape());
1144 if (idx == 1)
1145 return resLayout;
1146 }
1147 // For elementwise operations, all operands must have the same layout as the
1148 // result.
1150 if (!resLayout)
1151 return xegpu::DistributeLayoutAttr();
1152 return resLayout;
1153 }
1154 // TODO: Handle more cases as needed here.
1155 // By default, assume no layout conflict and return the current layout of the
1156 // operand.
1157 return xegpu::getDistributeLayoutAttr(operand.get());
1158}
lhs
static Value broadcast(Location loc, Value toBroadcast, unsigned numElements, const TypeConverter &typeConverter, ConversionPatternRewriter &rewriter)
Broadcasts the value to vector with numElements number of elements.
std::pair< int64_t, int64_t > LayoutRepresentation
static xegpu::DistributeLayoutAttr setupGenericStoreAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, bool isChunkedStore, int maxChunkSize, ArrayRef< int64_t > srcShape, int subgroupSize)
Sets up the anchor layout for store scatter and store matrix operation.
static SmallVector< LayoutRepresentation > getValidLayouts(ArrayRef< int64_t > wgShape, ArrayRef< int64_t > instData, int64_t sgCount)
static xegpu::LayoutAttr getDefaultLaneLayout2DBlockIo(RankedTy ty, const xegpu::uArch::uArch *uArch, std::optional< unsigned > packingSize=std::nullopt, bool vnni=false)
static xegpu::DistributeLayoutAttr setupGenericLoadAnchorLayout(xegpu::LayoutKind layoutKind, mlir::MLIRContext *context, xegpu::DistributeLayoutAttr consumerLayout, bool isChunkedLoad, int maxChunkSize, ArrayRef< int64_t > resShape, int subgroupSize)
Sets up the anchor layout for load gather and load matrix operation.
IRValueT get() const
Return the current value being used by this operand.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents an operand of an operation.
Definition Value.h:257
unsigned getOperandNumber()
Return which operand this is in the OpOperand list of the Operation.
Definition Value.cpp:226
This is a value defined by a result of an operation.
Definition Value.h:457
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
bool hasAttrOfType(NameT &&name)
Definition Operation.h:583
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition Operation.h:520
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:415
MutableArrayRef< OpOperand > getOpOperands()
Definition Operation.h:391
OperationName getName()
The name of an operation is the key identifier for it.
Definition Operation.h:119
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:805
result_range getOpResults()
Definition Operation.h:428
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
Definition Operation.h:608
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:412
Type getType() const
Return the type of this value.
Definition Value.h:105
static WalkResult advance()
Definition WalkResult.h:47
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int64_t > content)
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
DistributeLayoutAttr inferShapeCastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a shape cast operation given the result layout attribute,...
SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, DistributeLayoutAttr consumerLayout, SmallVector< int64_t > reductionDims, const uArch::uArch *uArch)
Sets up layout for reduction operations by creating a SliceAttr for the result.
DistributeLayoutAttr inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for an insert strided slice operation given the result layout attr...
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
std::optional< std::tuple< DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr > > setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch, int numSg)
Sets up the anchor layouts for the dpas operands (A, B, and C/D).
LayoutKind
Specifies the level of a layout hierarchy for comparison or propagation.
Definition XeGPU.h:32
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
SmallVector< NamedAttribute > dropInstDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping inst-data information from any DistributeLayoutAttr f...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
DistributeLayoutAttr setupLoadMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for load matrix operation.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
DistributeLayoutAttr inferBroadcastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a broadcast operation given the result layout attribute,...
DistributeLayoutAttr setupStoreScatterAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, const uArch::uArch *uArch)
Sets up the anchor layout for a store scatter operation.
void recoverTemporaryLayoutsDeprecated(Operation *op)
[to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and OpResult of of the given opera...
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
DistributeLayoutAttr setupBitCastResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Setup the result layout attribute for a bitcast operation based on element type bitwidths.
void removeLayoutAttr(const T &operandOrResult)
Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
SmallVector< NamedAttribute > dropSgLayoutAndDataOnAttrs(ArrayRef< NamedAttribute > attrs)
Updates the NamedAttribute sequence by dropping sg-layout and sg-data information from any Distribute...
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
DistributeLayoutAttr inferBitCastSourceLayout(DistributeLayoutAttr resLayout, int resElemTyBitWidth, int srcElemTyBitWidth)
Infers the source layout attribute for a bitcast operation given the result layout attribute,...
DistributeLayoutAttr setupInsertStridedSliceResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the result layout for an insert strided slice operation.
xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand)
Gets the expected layout for a given consumer operand.
void removeLayoutAttrs(Operation *op)
Removes the DistributeLayoutAttr for each OpOperand and OpResult of the given operation if they exist...
DistributeLayoutAttr inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout, SmallVector< int64_t > reduceDims)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
DistributeLayoutAttr setupLoadGatherAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for a load gather operation.
DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, const uArch::uArch *uArch)
Sets up the anchor layout for a store matrix operation.
Include the generated interface declarations.
virtual unsigned getGeneralPackedFormatBitSize() const =0
virtual int getSubgroupSize() const =0
uArch(StringRef name, StringRef description, llvm::ArrayRef< const Instruction * > instructionRegistry)
Definition uArchBase.h:151
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:163