MLIR 23.0.0git
XeGPUUtils.cpp
Go to the documentation of this file.
1//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
2//
3// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements utility methods for working with the XeGPU dialect.
10//
11//===----------------------------------------------------------------------===//
12
22#include "mlir/IR/Builders.h"
23#include "mlir/IR/BuiltinOps.h"
24#include "mlir/IR/Operation.h"
25#include "mlir/IR/ValueRange.h"
28#include "llvm/Support/Casting.h"
29#include "llvm/Support/FormatVariadic.h"
30#include <cstdint>
31#include <numeric>
32
33using namespace mlir;
34
35/// convert ArrayRef<ValueRange> into SmallVector<Value>
38 for (const auto &vals : values)
39 llvm::append_range(result, vals);
40 return result;
41}
42
43FailureOr<VectorType>
44mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
45 auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
46 // It only works for subgroup level layout, which only has lane_layout
47 // and lane_data, and is to distribute a SIMD code into SIMT code.
48 if (!layout || !layout.isForSubgroup())
49 return failure();
50
51 SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
52 SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
53 auto tdescShape = tdescTy.getShape();
54 auto elementType = tdescTy.getElementType();
55
56 // compute sgSize by multiply elements of laneLayout
57 // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
58 // e.g. for 1D layout, sgSize = laneLayout[0]
59 int64_t sgSize = llvm::product_of(laneLayout);
60
61 // Check if the tensor descriptor shape is distributable.
62 int64_t tensorSize = 1;
63 for (auto [tdescDim, laneDim, laneDataDim] :
64 llvm::zip_equal(tdescShape, laneLayout, laneData)) {
65 assert((tdescDim % (laneDim * laneDataDim) == 0) &&
66 "tensor descriptor shape is not distributable");
67 tensorSize *= tdescDim;
68 }
69 // tensorSize must be adjusted for array_length.
70 tensorSize *= tdescTy.getArrayLength();
71
72 return VectorType::get({tensorSize / sgSize}, elementType);
73}
74
75FailureOr<VectorType>
76mlir::xegpu::getDistributedVectorType(VectorType originalType,
77 xegpu::LayoutAttr layout) {
78 int64_t rank = originalType.getRank();
79 if (rank < 1)
80 return failure();
81 ArrayRef<int64_t> shape = originalType.getShape();
82 // For rank > 2, leading dimensions are treated as batch/array dimensions.
83 // Drop them and use the product as arrayLength.
84 int arrayLength = 1;
85 while (shape.size() > 2) {
86 arrayLength *= shape[0];
87 shape = shape.drop_front();
88 }
89 // Drop matching leading dims from layout if the layout rank exceeds the
90 // remaining shape rank.
91 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
92 auto laneData = layout.getEffectiveLaneDataAsInt();
93 while (!laneLayout.empty() && laneLayout.size() > shape.size()) {
94 laneLayout.erase(laneLayout.begin());
95 laneData.erase(laneData.begin());
96 }
97 auto trimmedLayout = xegpu::LayoutAttr::get(
98 layout.getContext(),
99 SmallVector<int32_t>(laneLayout.begin(), laneLayout.end()),
100 SmallVector<int32_t>(laneData.begin(), laneData.end()));
101 auto helperTdescTy = xegpu::TensorDescType::get(
102 shape, originalType.getElementType(), arrayLength,
103 /*boundary_check=*/true,
104 /*memory_space=*/xegpu::MemorySpace::Global, trimmedLayout);
105 return xegpu::getDistributedVectorType(helperTdescTy);
106}
107
108FailureOr<VectorType>
109xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
110 VectorType originalType) {
111 if (!layout)
112 return failure();
113 assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
114 "Expecting a valid layout.");
115
116 int64_t vectorRank = originalType.getRank();
117 int64_t layoutRank = layout.getRank();
118 assert(vectorRank >= layoutRank && "Vector rank must be >= layout rank.");
119
120 // When the vector has more dimensions than the layout, only the trailing
121 // dimensions are distributed. Leading dimensions are preserved as-is.
122 int64_t offset = vectorRank - layoutRank;
123 ArrayRef<int64_t> fullShape = originalType.getShape();
124 SmallVector<int64_t> trailingShape(fullShape.begin() + offset,
125 fullShape.end());
126 auto distributedShapeOrFailure =
127 layout.computeDistributedShape(trailingShape);
128 if (failed(distributedShapeOrFailure))
129 return failure();
130
131 SmallVector<int64_t> resultShape(fullShape.begin(),
132 fullShape.begin() + offset);
133 resultShape.append(distributedShapeOrFailure->begin(),
134 distributedShapeOrFailure->end());
135 return VectorType::get(resultShape, originalType.getElementType());
136}
137
138std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
139 const StringRef prefix("layout_operand_");
140 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
141 return llvm::formatv("{0}{1}", prefix, idx).str();
142}
143
145 const StringRef prefix = "layout_result_";
146 return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
147}
148
149xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
150 if (!value)
151 return nullptr;
152
153 if (auto result = dyn_cast<OpResult>(value)) {
154 Operation *defOp = result.getDefiningOp();
155 assert(defOp && "result must have a defining op");
156
157 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
158 auto layout = anchorOp.getAnchorLayout();
159 return layout;
160 }
161
162 std::string layoutName = getTemporaryLayoutName(result);
163 if (defOp->hasAttr(layoutName)) {
164 auto layout =
165 defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
166 return layout;
167 }
168 }
169
170 if (auto arg = dyn_cast<BlockArgument>(value)) {
171 auto *parentOp = arg.getOwner()->getParentOp();
172 if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
173 OpOperand *tiedInit = loop.getTiedLoopInit(arg);
174 if (tiedInit)
175 return getTemporaryLayout(*tiedInit);
176 }
177 }
178
179 if (auto tdescTy =
180 dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
181 return tdescTy.getLayoutAttr();
182
183 return nullptr;
184}
185xegpu::DistributeLayoutAttr
187 Operation *op = opr.getOwner();
188 unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
189
190 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
191 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
192 if (idx == 0) {
193 return dpasOp.getLayoutAAttr();
194 } else if (idx == 1) {
195 return dpasOp.getLayoutBAttr();
196 } else if (idx == 2) {
197 return dpasOp.getLayoutCdAttr();
198 }
199 }
200 if (auto dpasMxOp = dyn_cast<xegpu::DpasMxOp>(op)) {
201 // DpasMxOp has operands: a, b, optional acc, optional scale_a, optional
202 // scale_b
203 unsigned currentIdx = 0;
204
205 if (idx == currentIdx++)
206 return dpasMxOp.getLayoutAAttr();
207
208 if (idx == currentIdx++)
209 return dpasMxOp.getLayoutBAttr();
210
211 if (dpasMxOp.getAcc())
212 if (idx == currentIdx++)
213 return dpasMxOp.getLayoutCdAttr();
214
215 if (dpasMxOp.getScaleA())
216 if (idx == currentIdx++)
217 return dpasMxOp.getLayoutAScaleAttr();
218
219 if (dpasMxOp.getScaleB())
220 if (idx == currentIdx++)
221 return dpasMxOp.getLayoutBScaleAttr();
222
223 return nullptr;
224 }
225 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
226 return convertOp.getInputLayoutAttr();
227 }
228 auto layout = anchorOp.getAnchorLayout();
229
230 if (idx == 0)
231 return layout;
232
233 // For StoreNdOp and StoreMatrixOp,
234 // the layout is valid for the first two operands: value and memref/tdesc.
235 if (isa<xegpu::StoreNdOp, xegpu::StoreMatrixOp>(op) && (idx < 2))
236 return layout;
237
238 if (isa<xegpu::StoreScatterOp>(op)) {
239 xegpu::StoreScatterOp store(op);
240 int chunkSize = store.getChunkSize().value_or(1);
241 if (layout && idx >= 2 && chunkSize > 1)
242 return layout.dropDims(llvm::to_vector(
243 llvm::seq<int64_t>(layout.getRank() - 1, layout.getRank())));
244 return layout;
245 }
246 if (isa<xegpu::LoadGatherOp>(op)) {
247 xegpu::LoadGatherOp load(op);
248 int chunkSize = load.getChunkSize().value_or(1);
249 if (layout && idx >= 1 && chunkSize > 1)
250 return layout.dropDims(llvm::to_vector(
251 llvm::seq<int64_t>(layout.getRank() - 1, layout.getRank())));
252 return layout;
253 }
254 }
255
256 std::string layoutName = xegpu::getTemporaryLayoutName(opr);
257 if (op->hasAttr(layoutName)) {
258 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
259 return layout;
260 }
261
262 return nullptr;
263}
264
265// Returns the permanent layout attribute for the given result if it's
266// available on the defining op. Otherwise returns the provided layout.
267xegpu::DistributeLayoutAttr
268maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
269 const OpResult &result, mlir::Operation *owner,
270 const std::string &name) {
271 xegpu::DistributeLayoutAttr candidate = layout;
272
273 if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
274 if (auto perm = loadOp.getLayoutAttr())
275 candidate = perm;
276 }
277
278 return candidate;
279}
280
281// Returns the permanent layout attribute for the given operand if it's
282// available on the defining op. Otherwise returns the provided layout.
283xegpu::DistributeLayoutAttr
284maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
285 const OpOperand &operand, mlir::Operation *owner,
286 const std::string &name) {
287 xegpu::DistributeLayoutAttr candidate = layout;
288 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
289
290 if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
291 if (idx == 0) {
292 if (auto perm = storeOp.getLayoutAttr())
293 candidate = perm;
294 }
295 }
296
297 return candidate;
298}
299
300// TODO-LayoutRefactor: Remove this function after replacing use
301// with setTemporaryLayout or setAnchorLayout
303 const mlir::OpResult &result,
304 const mlir::xegpu::DistributeLayoutAttr layout) {
305 Operation *owner = result.getOwner();
306
307 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
308 if (anchorOp.getAnchorLayout() == layout)
309 return;
310 anchorOp.setAnchorLayout(layout);
311 return;
312 }
313
314 std::string name = xegpu::getTemporaryLayoutName(result);
315 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
316 return;
317 }
318 if (layout) {
319 owner->setAttr(name, layout);
320 }
321}
322
323// TODO-LayoutRefactor: Remove this function after replacing use
324// with setTemporaryLayout or setAnchorLayout
326 const DistributeLayoutAttr layout) {
327 Operation *owner = operand.getOwner();
328 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
329
330 if (!layout) {
331 return;
332 }
333 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
334 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
335 if (idx == 0) {
336 return dpasOp.setLayoutAAttr(layout);
337 } else if (idx == 1) {
338 return dpasOp.setLayoutBAttr(layout);
339 } else if (idx == 2) {
340 return dpasOp.setLayoutCdAttr(layout);
341 }
342 }
343 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
344 return convertOp.setInputLayoutAttr(layout);
345 }
346
347 // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
348 // the layout is valid for the first two operands: value and memref/tdesc.
349 // For other operations, the layout applies to the first operand only.
350 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
351 owner)) {
352 if (idx < 2) {
353 anchorOp.setAnchorLayout(layout);
354 }
355 } else {
356 if (idx == 0) {
357 anchorOp.setAnchorLayout(layout);
358 }
359 }
360 }
361
362 std::string name = xegpu::getTemporaryLayoutName(operand);
363 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
364 return;
365 }
366 if (layout) {
367 owner->setAttr(name, layout);
368 }
369}
370
371template <typename T, typename>
372xegpu::DistributeLayoutAttr
373xegpu::getTemporaryLayout(const T &operandOrResult) {
374 Operation *op = operandOrResult.getOwner();
375
376 std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
377 if (op->hasAttr(layoutName)) {
378 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
379 return layout;
380 }
381
382 return nullptr;
383}
384
385template xegpu::DistributeLayoutAttr
387template xegpu::DistributeLayoutAttr
389
390template <typename T, typename>
391void xegpu::setTemporaryLayout(const T &operandOrResult,
392 const xegpu::DistributeLayoutAttr layout) {
393 Operation *owner = operandOrResult.getOwner();
394 std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
395 if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
396 return;
397 }
398 if (layout) {
399 owner->setAttr(name, layout);
400 }
401}
402
404 const mlir::OpResult &result,
405 const mlir::xegpu::DistributeLayoutAttr layout);
406
408 const mlir::OpOperand &operand,
409 const mlir::xegpu::DistributeLayoutAttr layout);
410
414 auto vecTy = dyn_cast<VectorType>(value.getType());
415 if (!vecTy)
416 return {value};
417
418 ArrayRef<int64_t> srcShape = vecTy.getShape();
419 if (!computeShapeRatio(srcShape, shape))
420 return {value};
421
422 int64_t srcShapeRank = srcShape.size();
423 int64_t targetShapeRank = shape.size();
424
425 SmallVector<int64_t> adjustedTargetShape(srcShape.size());
426 int64_t rankDiff = srcShapeRank - targetShapeRank;
427 std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
428 1);
429 llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);
430
432 for (SmallVector<int64_t> offsets :
433 StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
434 SmallVector<int64_t> staticStrides(offsets.size(), 1);
435 Value slice = vector::ExtractStridedSliceOp::create(
436 builder, loc, value, offsets, adjustedTargetShape, staticStrides);
437
438 // Reshape to remove leading unit dims if needed
439 if (srcShapeRank > targetShapeRank) {
440 auto targetTy = VectorType::get(shape, vecTy.getElementType());
441 slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
442 }
443 result.push_back(slice);
444 }
445
446 return result;
447}
448
450 ValueRange values,
452 VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
453 assert(llvm::all_of(values.getTypes(),
454 [&](Type type) { return type == inputTy; }) &&
455 "values must be of the same VectorType");
456
457 Type elemTy = inputTy.getElementType();
458 ArrayRef<int64_t> tileShape = inputTy.getShape();
459
460 VectorType resultTy = VectorType::get(shape, elemTy);
461 auto zeroAttr = builder.getZeroAttr(elemTy);
462 Value result = arith::ConstantOp::create(
463 builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));
464
465 for (auto [src, offsets] :
466 llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
467 SmallVector<int64_t> staticStrides(tileShape.size(), 1);
468 result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
469 offsets, staticStrides);
470 }
471 return result;
472}
473
474std::optional<std::string> xegpu::getChipStr(Operation *op) {
475 auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();
476
477 if (!gpuModuleOp)
478 return std::nullopt;
479
480 auto targetAttrs = gpuModuleOp.getTargets();
481 if (targetAttrs) {
482 for (auto &attr : *targetAttrs) {
483 auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
484 if (xevmAttr)
485 return xevmAttr.getChip().str();
486 }
487 }
488
489 return std::nullopt;
490}
491
492/// Generates element-wise addition ops of two arrays with same length.
494 Location loc,
497 assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
499 for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
500 auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
501 auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
502 results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
503 }
504 return results;
505}
506
507/// Generates element-wise addition ops of two arrays with automatic alignment.
508/// When the input arrays have different sizes, the shorter array is
509/// right-aligned with the longer array, and the unmatched leading elements from
510/// the longer array are preserved unchanged. This is commonly used for offset
511/// computation where higher-dimensional offsets need to be added to
512/// lower-dimensional adjustments.
513///
514/// Example:
515/// lhs = [l1, l2, l3], rhs = [r1, r2]
516/// Result: [l1, l2+r1, l3+r2]
521 // ensure a is longer than b
522 ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
523 ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
524 SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
525 a = a.slice(a.size() - b.size());
526 results.append(addElementwise(builder, loc, a, b));
527 return results;
528}
529
530template <typename T>
532 ArrayRef<T> candidateMultiples) {
533 static_assert(std::is_integral<T>::value, "T must be an integer type");
534 int largest = -1;
535 SmallVector<T> multiples = {1};
536 if (!candidateMultiples.empty())
537 multiples =
538 SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
539 for (T candidate : candidates) {
540 for (T multiple : multiples) {
541 int value = static_cast<int>(candidate * multiple);
542 if (value != 0 && dim % value == 0 && value > largest)
543 largest = value;
544 }
545 }
546 return largest;
547}
548
550 vector::CombiningKind kind, uint32_t size) {
551 // First reduce on a single thread to get per lane reduction value.
552 Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
553 // Parallel reduction using butterfly shuffles.
554 for (uint64_t i = 1; i < size; i <<= 1) {
555 Value shuffled =
556 gpu::ShuffleOp::create(builder, loc, laneVal, i, /** width = **/ size,
557 /** mode = **/ gpu::ShuffleMode::XOR)
558 .getShuffleResult();
559 laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
560 }
561 return laneVal;
562}
563
566 vector::CombiningKind kind,
567 int64_t reductionDim, Location loc,
568 PatternRewriter &rewriter) {
569 VectorType sourceType = src.getType();
570 int64_t sourceRank = sourceType.getRank();
571 // Expecting at least a 2D source vector. Leading dimensions (all except the
572 // last two) must be unit.
573 assert(sourceRank >= 2 && "expected at least a 2D source vector");
574 for (int64_t i = 0; i < sourceRank - 2; ++i)
575 assert(sourceType.getShape()[i] == 1 &&
576 "expected leading dimensions to be unit");
577 int64_t rowIdx = sourceRank - 2;
578 int64_t columnIdx = sourceRank - 1;
579 int64_t sourceH = sourceType.getShape()[rowIdx];
580 int64_t sourceW = sourceType.getShape()[columnIdx];
581 int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
582 // Create a constant vector to hold the result of the reduction.
583 TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
584 Value reductionResult = arith::ConstantOp::create(
585 rewriter, loc, acc.getType(),
586 DenseElementsAttr::get(acc.getType(), zeroAttr));
587 auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
588 auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
589 // Reduction result should have the same layout as the accumulator.
590 xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
591 // For each slice of the source, extract the slice vector, do a reduction
592 // and, insert the reduced value back to the result vector.
593 int64_t accRank = acc.getType().getRank();
594 for (int i = 0; i < nSlices; ++i) {
595 // Build nD offsets, sizes, and strides. Leading unit dims get
596 // offset=0, size=1. The last two dims are set based on reductionDim.
597 SmallVector<int64_t> sliceOffsets(sourceRank, 0);
598 SmallVector<int64_t> sliceSizes(sourceRank, 1);
599 SmallVector<int64_t> strides(sourceRank, 1);
600 if (reductionDim == columnIdx) {
601 sliceOffsets[rowIdx] = i;
602 sliceSizes[columnIdx] = sourceW;
603 } else {
604 sliceOffsets[columnIdx] = i;
605 sliceSizes[rowIdx] = sourceH;
606 }
607
608 vector::ExtractStridedSliceOp extractOp =
609 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
610 sliceSizes, strides);
611 // Extract strided slice has the same layout as src.
612 xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
613
614 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
615
616 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
617 rewriter, loc,
618 VectorType::get({nSliceElements}, sourceType.getElementType()),
619 extractOp.getResult());
620
621 // Shape cast output has the same layout as the accumulator. Shape cast
622 // source has the same layout as the original reduction source.
623 xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
624 xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
625 // Extract and reduction results in scalars, so no result layout is needed.
626 // Build multi-dim index into acc (sourceRank-1 dims, i.e. source shape with
627 // the reduction dim removed). Leading unit dims get index 0.
628 SmallVector<int64_t> accIdx(accRank, 0);
629 accIdx[accRank - 1] = i;
630 Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
631 Value reduction = vector::ReductionOp::create(
632 rewriter, loc, kind, slice.getResult(), accExtract);
633 reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
634 reductionResult, accIdx);
635 // Insert op should have the same layout as the accumulator.
636 xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
637 }
638 return reductionResult;
639}
640
643 vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
644 Location loc, PatternRewriter &rewriter) {
645 VectorType sourceType = src.getType();
646 int64_t sourceRank = sourceType.getRank();
647 // Expecting at least a 2D source vector. Leading dimensions (all except the
648 // last two) must be unit.
649 assert(sourceRank >= 2 && "expected at least a 2D source vector");
650 for (int64_t i = 0; i < sourceRank - 2; ++i)
651 assert(sourceType.getShape()[i] == 1 &&
652 "expected leading dimensions to be unit");
653 int64_t rowIdx = sourceRank - 2;
654 int64_t columnIdx = sourceRank - 1;
655 int64_t sourceH = sourceType.getShape()[rowIdx];
656 int64_t sourceW = sourceType.getShape()[columnIdx];
657
658 // Create a constant vector to hold the result of the reduction.
659 TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
660 Value reductionResult = arith::ConstantOp::create(
661 rewriter, loc, acc.getType(),
662 DenseElementsAttr::get(acc.getType(), zeroAttr));
663
664 // nSlices is the number of reduction operations needed to reduce the entire
665 // source vector. For example, if reductionDim is the row dim, we are
666 // reducing across rows, and each slice is a column. So the number of slices
667 // is the number of columns, which is sourceW.
668 int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
669
670 // For each slice of the source, extract the slice vector, do a reduction
671 // and, insert the reduced value back to the result vector.
672 int64_t accRank = acc.getType().getRank();
673 for (int i = 0; i < nSlices; ++i) {
674 // Build nD offsets, sizes, and strides. Leading unit dims get
675 // offset=0, size=1. The last two dims are set based on reductionDim.
676 SmallVector<int64_t> sliceOffsets(sourceRank, 0);
677 SmallVector<int64_t> sliceSizes(sourceRank, 1);
678 SmallVector<int64_t> strides(sourceRank, 1);
679 if (reductionDim == columnIdx) {
680 sliceOffsets[rowIdx] = i;
681 sliceSizes[columnIdx] = sourceW;
682 } else {
683 sliceOffsets[columnIdx] = i;
684 sliceSizes[rowIdx] = sourceH;
685 }
686
687 vector::ExtractStridedSliceOp extractOp =
688 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
689 sliceSizes, strides);
690 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
691 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
692 rewriter, loc,
693 VectorType::get({nSliceElements}, sourceType.getElementType()),
694 extractOp.getResult());
695
696 SmallVector<int64_t> accIdx(accRank, 0);
697 accIdx[accRank - 1] = i;
698 Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
699 Value fullReduce =
700 xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
701 fullReduce =
702 vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
703 reductionResult = vector::InsertOp::create(rewriter, loc, fullReduce,
704 reductionResult, accIdx);
705 }
706 return reductionResult;
707}
708
710 Type type,
711 vector::CombiningKind kind) {
712 auto vecTy = dyn_cast<VectorType>(type);
713 Type elemTy = vecTy ? vecTy.getElementType() : type;
714
715 // Helper to create either a splat vector or scalar constant from an attr.
716 auto makeConst = [&](Attribute scalarAttr) -> Value {
717 if (vecTy)
718 return arith::ConstantOp::create(
719 builder, loc, vecTy, DenseElementsAttr::get(vecTy, scalarAttr));
720 return arith::ConstantOp::create(builder, loc, cast<TypedAttr>(scalarAttr));
721 };
722
723 switch (kind) {
724 case vector::CombiningKind::ADD:
725 case vector::CombiningKind::XOR:
726 case vector::CombiningKind::OR:
727 case vector::CombiningKind::MAXUI:
728 return makeConst(builder.getZeroAttr(elemTy));
729
730 case vector::CombiningKind::MUL:
731 case vector::CombiningKind::AND:
732 return makeConst(builder.getOneAttr(elemTy));
733
734 case vector::CombiningKind::MINSI:
735 if (auto intTy = dyn_cast<IntegerType>(elemTy))
736 return makeConst(builder.getIntegerAttr(
737 elemTy, APInt::getSignedMaxValue(intTy.getWidth())));
738 return nullptr;
739
740 case vector::CombiningKind::MINUI:
741 if (auto intTy = dyn_cast<IntegerType>(elemTy))
742 return makeConst(
743 builder.getIntegerAttr(elemTy, APInt::getMaxValue(intTy.getWidth())));
744 return nullptr;
745
746 case vector::CombiningKind::MAXSI:
747 if (auto intTy = dyn_cast<IntegerType>(elemTy))
748 return makeConst(builder.getIntegerAttr(
749 elemTy, APInt::getSignedMinValue(intTy.getWidth())));
750 return nullptr;
751
752 case vector::CombiningKind::MINNUMF:
753 case vector::CombiningKind::MINIMUMF:
754 if (auto floatTy = dyn_cast<FloatType>(elemTy))
755 return makeConst(builder.getFloatAttr(
756 elemTy, APFloat::getInf(floatTy.getFloatSemantics())));
757 return nullptr;
758
759 case vector::CombiningKind::MAXNUMF:
760 case vector::CombiningKind::MAXIMUMF:
761 if (auto floatTy = dyn_cast<FloatType>(elemTy))
762 return makeConst(builder.getFloatAttr(
763 elemTy, APFloat::getInf(floatTy.getFloatSemantics(), true)));
764 return nullptr;
765 }
766 return nullptr;
767}
768
769/// Explicit instantiations
770template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
771 ArrayRef<int> candidateMultiples);
772template int
774 ArrayRef<unsigned> candidateMultiples);
775
776bool xegpu::requirePacked(const xegpu::DistributeLayoutAttr layout) {
777 if (!layout)
778 return false;
779 auto laneData = layout.getEffectiveLaneDataAsInt();
780 if (laneData.size() != 2)
781 return false;
782 return laneData[0] != 1;
783}
784
785bool xegpu::requireTranspose(const xegpu::DistributeLayoutAttr layout,
786 const xegpu::uArch::uArch *uArch) {
787 // Return false for unsupported targets.
788 // TODO: Add more support or move to target info.
789 if (uArch->getName().equals_insensitive("pvc") &&
790 uArch->getName().equals_insensitive("bmg") &&
791 uArch->getName().equals_insensitive("cri"))
792 return false;
793 if (!layout)
794 return false;
795 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
796 if (laneLayout.size() != 2)
797 return false;
798 return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
799}
800
801// Check if dst shape is an expansion of src shape by inserting unit dimensions.
802// Returns true if all dimensions in src match corresponding dimensions in dst
803// (after skipping unit dimensions), and populates expandedUnitDims with the
804// indices of the unit dimensions in dst that were added (not present in src).
805// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
807 SmallVector<int64_t> &expandedUnitDims) {
808 // All unit dimensions in dst that don't appear in src are the expanded
809 // unit dimensions
810 size_t srcIdx = 0;
811 for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
812 if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
813 srcIdx++;
814 else if (dst[dstIdx] == 1)
815 expandedUnitDims.push_back(dstIdx);
816 else
817 return false;
818 return srcIdx == src.size();
819}
820
821// Checks if dst shape is an expansion of src shape where each dimension in src
822// is split into one or more consecutive dimensions in dst whose product equals
823// the original dimension. Populates splitDimGroups with groups of dst indices
824// that correspond to each src dimension. Example: src=[6,4], dst=[2,3,2,2] ->
825// true
828 SmallVector<SmallVector<int64_t>> &splitDimGroups) {
829 // each dim in src can be mapped to one or more dims in dst whose product
830 // equals to the src dim
831 size_t srcIdx = 0;
832 int64_t accumulatedSize = 1;
833 SmallVector<int64_t> currentDstDims;
834
835 splitDimGroups.clear();
836 for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
837 if (srcIdx >= src.size())
838 return false;
839 accumulatedSize *= dst[dstIdx];
840 currentDstDims.push_back(dstIdx);
841
842 if (accumulatedSize == src[srcIdx]) {
843 // Also collect trailing unit dims in destination, if any.
844 // Leading unit dims were implicitly collected.
845 if (srcIdx == src.size() - 1) {
846 while (++dstIdx < dst.size() && dst[dstIdx] == 1)
847 currentDstDims.push_back(dstIdx);
848 }
849 // Record the mapping: srcIdx -> currentDstDims
850 splitDimGroups.push_back(currentDstDims);
851 // move to next src dim
852 srcIdx++;
853 accumulatedSize = 1;
854 currentDstDims.clear();
855 } else if (accumulatedSize > src[srcIdx]) {
856 return false;
857 }
858 }
859 return srcIdx == src.size();
860}
861
862//===----------------------------------------------------------------------===//
863// Context-aware type conversion utilities
864//===----------------------------------------------------------------------===//
865
866 // Pre-computes distributed VectorType mappings for every value carried through
867 // an SCF loop (scf.while, scf.for): block args (iter_args /
868 // before-/after-args), loop results, and the terminator operands feeding them.
869 // These positions share one logical value and must convert identically, so each
870 // is derived from a single source -- the layout of the feeding value (loop
871 // init, or `scf.condition` operand) -- via `getDistributeLayoutAttr(Value)`,
872 // and keyed by `Value`. Keying by Value is required because the SCF converters
873 // detach/replace the loop body mid-conversion (scf.while detaches before/after
874 // blocks -> a detached-arg layout query trips an ilist assertion; scf.for
875 // rebuilds the op, which loses the temporary `layout_operand_N` attrs -> the
876 // query returns null). Recording results and terminator operands lets a 1:N
877 // pass resolve them from the map after stripping the loop op's transient attrs
878 // (see XeGPUBlocking).
//
// Map contents, as built below:
//   - keys are block arguments, loop results and `scf.yield` operands only;
//     the feeding values (loop inits, `scf.condition` operands) are used as
//     layout sources and are not inserted as keys themselves;
//   - each mapped value is a list of `count` copies of the distributed
//     VectorType, where (subShape, count) comes from `getSubShapeAndCount`;
//   - a loop-carried position is left out of the map when its feeding value is
//     not a VectorType, has no distribute layout, or `count` is <= 0.
// Returns the populated `loopArgTypes` map.
881 SubShapeAndCountFn getSubShapeAndCount) {
883 // Derive the distributed types from the feeding value's layout (the single
884 // authoritative source) and record them for every value that shares this
885 // loop-carried position.
886 auto recordTypes = [&](Value layoutSrc, ArrayRef<Value> dests) {
887 auto vecTy = dyn_cast<VectorType>(layoutSrc.getType());
888 if (!vecTy)
889 return;
890 auto layout = xegpu::getDistributeLayoutAttr(layoutSrc);
891 if (!layout)
892 return;
893 auto [subShape, count] = getSubShapeAndCount(vecTy, layout);
// A non-positive count is treated as "nothing to record" for this position.
894 if (count <= 0)
895 return;
896 auto newTy = VectorType::get(subShape, vecTy.getElementType());
// Plain assignment: if the same Value is reached through more than one
// loop-carried position, the entry written last replaces the earlier one.
897 for (Value dest : dests)
898 loopArgTypes[dest] = SmallVector<Type>(count, newTy);
899 };
// Visit every op nested under `topLevelOp`; only scf.while and scf.for
// contribute entries, all other ops fall through without effect.
900 topLevelOp->walk([&](Operation *op) {
901 if (auto whileOp = dyn_cast<scf::WhileOp>(op)) {
902 // "before" args (and the after-region yield operands that feed them)
903 // correspond to the while `inits` operands.
904 auto yieldOp =
905 cast<scf::YieldOp>(whileOp.getAfterBody()->getTerminator());
906 for (auto [init, beforeArg, yieldVal] :
907 llvm::zip(whileOp.getInits(), whileOp.getBeforeArguments(),
908 yieldOp.getOperands()))
909 recordTypes(init, {beforeArg, yieldVal});
910 // "after" args and the while results correspond to the operands of the
911 // embedded `scf.condition` op (not the `inits`).
912 scf::ConditionOp condOp = whileOp.getConditionOp();
913 for (auto [condArg, afterArg, res] :
914 llvm::zip(condOp.getArgs(), whileOp.getAfterArguments(),
915 whileOp.getResults()))
916 recordTypes(condArg, {afterArg, res});
917 return;
918 }
919 if (auto forOp = dyn_cast<scf::ForOp>(op)) {
920 // Each loop-carried position pairs an init operand with its iter_arg,
921 // its loop result, and the yield operand that feeds the next iteration.
922 auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
923 for (auto [init, arg, res, yieldVal] :
924 llvm::zip(forOp.getInitArgs(), forOp.getRegionIterArgs(),
925 forOp.getResults(), yieldOp.getOperands()))
926 recordTypes(init, {arg, res, yieldVal});
927 return;
928 }
929 });
930 return loopArgTypes;
931}
932
934 TypeConverter &converter, SubShapeAndCountFn getSubShapeAndCount,
935 DenseMap<Value, SmallVector<Type>> loopArgTypes) {
936 // Context-aware VectorType conversion (1:1 shape-changing or 1:N). For
937 // values found in the pre-computed `loopArgTypes` map (SCF loop-carried
938 // values of scf.while / scf.for), the mapped types are used as-is. For all
939 // other Values, the layout is retrieved directly via getDistributeLayoutAttr.
//
// The callback registered below yields:
//   - std::nullopt when `v` is not a VectorType, when it has no distribute
//     layout, or when getSubShapeAndCount reports count <= 0;
//   - success() with `result` filled otherwise (either the mapped types or
//     `count` copies of the sub-shaped VectorType).
// The map is moved into a shared_ptr that the callback captures by value, so
// the callback refers to one shared instance of the map rather than copying it.
940 auto loopArgTypeMap = std::make_shared<DenseMap<Value, SmallVector<Type>>>(
941 std::move(loopArgTypes));
942 converter.addConversion(
943 [loopArgTypeMap, getSubShapeAndCount](
944 Value v,
945 SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
946 if (!isa<VectorType>(v.getType()))
947 return std::nullopt;
948
949 // Check the pre-computed map first; entries are keyed by Value
950 // identity. NOTE(review): a map built by precomputeLoopBlockArgTypes
951 // holds block args, loop results and yield operands -- loop init / scf.condition operands are not keys and take the layout path below; confirm no caller relies on them being in the map.
952 auto it = loopArgTypeMap->find(v);
953 if (it != loopArgTypeMap->end()) {
954 result.append(it->second.begin(), it->second.end());
955 return success();
956 }
957
958 // For all other Values, retrieve the layout directly.
959 auto layout = xegpu::getDistributeLayoutAttr(v);
960 if (!layout)
961 return std::nullopt;
962
963 auto vecType = cast<VectorType>(v.getType());
964 auto [subShape, count] = getSubShapeAndCount(vecType, layout);
// Same guard as in the map builder: a non-positive count means this
// conversion does not apply.
965 if (count <= 0)
966 return std::nullopt;
967
// Emit `count` identical result types of shape `subShape`, keeping the
// original element type.
968 auto newTy = VectorType::get(subShape, vecType.getElementType());
969 result.append(count, newTy);
970 return success();
971 });
972}
973
975 Operation *root,
976 const llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts) {
977 // Structural type conversion can generate some redundant
978 // UnrealizedConversionCastOps to materialize the original type from the
979 // type converted (sub-tile) type. These are redundant at this point and
980 // can be eliminated by either folding the cancelling cast chain or, when
981 // the original and final shapes differ but their element counts match,
982 // inserting a vector.shape_cast instead.
983 //
984 // Example (shape differs but element count matches -> shape_cast):
985 // %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32>
986 // to vector<16x16xf32>
987 // %2 = UnrealizedConversionCastOp %1 : vector<16x16xf32>
988 // to vector<16xf32>
989 // becomes:
990 // %2 = vector.shape_cast %0 : vector<16x1xf32> to vector<16xf32>
991 //
992 // For unpaired casts that emulate a pack (1:N) or unpack (N:1) between a
993 // single large VectorType and N identically-typed smaller VectorTypes,
994 // lower to vector.extract_strided_slice / vector.insert_strided_slice.
//
// Casts listed in `existingCasts` are never rewritten or erased, and a cast
// whose producer is in `existingCasts` is not folded through that producer.
// The work is done in two phases: the first walk only redirects uses of the
// handled casts; the second phase erases casts that ended up without uses.

// True iff `values` is non-empty and every value has the same VectorType.
995 auto hasIdenticalVectorTypes = [](ValueRange values) {
996 auto types = values.getTypes();
997 return !types.empty() && llvm::all_of(types, [&](Type type) {
998 return isa<VectorType>(type) && type == types.front();
999 });
1000 };
1001 OpBuilder builder(root);
1002 root->walk([&](UnrealizedConversionCastOp op) {
1003 if (existingCasts.contains(op))
1004 return;
1005 // Handle N:1 cast (N >= 1) where all inputs come from a single 1:N cast.
1006 if (op.getNumResults() == 1 && op.getNumOperands() >= 1) {
1007 auto defOp =
1008 op.getInputs()[0].getDefiningOp<UnrealizedConversionCastOp>();
// NOTE(review): the check below requires every input to be produced by
// `defOp` and the counts to match, but does not compare operand order
// against defOp's result order -- presumably guaranteed by how the
// converter emits these pairs; confirm.
1009 if (defOp && !existingCasts.contains(defOp) &&
1010 defOp.getNumOperands() == 1 &&
1011 defOp.getNumResults() == op.getNumOperands() &&
1012 llvm::all_of(op.getInputs(),
1013 [&](Value v) { return v.getDefiningOp() == defOp; })) {
1014 Value orig = defOp.getInputs()[0];
1015 auto origTy = dyn_cast<VectorType>(orig.getType());
1016 auto resTy = dyn_cast<VectorType>(op.getResult(0).getType());
// Same element count but different vector type: bridge with a
// vector.shape_cast placed right before the cast being replaced.
1017 if (origTy && resTy &&
1018 origTy.getNumElements() == resTy.getNumElements() &&
1019 origTy != resTy) {
1020 builder.setInsertionPoint(op);
1021 auto shapeCast =
1022 vector::ShapeCastOp::create(builder, op.getLoc(), resTy, orig);
1023 op.replaceAllUsesWith(ValueRange{shapeCast.getResult()});
1024 } else {
// NOTE(review): this branch forwards `orig` unchanged. It is reached
// when the types are equal, but also when either type is not a
// VectorType or the element counts differ; no type-equality check is
// made here -- presumably such pairs never occur; confirm.
1025 op.replaceAllUsesWith(ValueRange{orig});
1026 }
1027 return;
1028 }
1029 // Unpaired N:1 cast emulating unpack: stitch inputs into the output
1030 // shape via vector.insert_strided_slice.
1031 auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
// Only handled for more than one input, a VectorType result, and inputs
// that all share one VectorType; anything else is left untouched.
1032 if (op.getNumOperands() > 1 && outputTy &&
1033 hasIdenticalVectorTypes(op.getInputs())) {
1034 builder.setInsertionPoint(op);
1036 builder, op.getLoc(), op.getInputs(), outputTy.getShape());
1037 op->replaceAllUsesWith(ValueRange(result));
1038 }
1039 return;
1040 }
1041 // Handle 1:N cast where the single input comes from an N:1 cast.
1042 if (op.getNumOperands() == 1 && op.getNumResults() > 1) {
1043 auto defOp =
1044 op.getInputs()[0].getDefiningOp<UnrealizedConversionCastOp>();
// Fold only when the producer's input types match this cast's result
// types element-for-element; the producer's inputs then replace the
// results directly.
1045 if (defOp && !existingCasts.contains(defOp) &&
1046 defOp.getNumResults() == 1 &&
1047 defOp.getNumOperands() == op.getNumResults() &&
1048 llvm::equal(ValueRange(defOp.getInputs()).getTypes(),
1049 op->getResultTypes())) {
1050 op.replaceAllUsesWith(defOp.getInputs());
1051 return;
1052 }
1053 // Unpaired 1:N cast emulating pack: split the input into the output
1054 // tile shape via vector.extract_strided_slice.
1055 auto tileTy = dyn_cast<VectorType>(op.getResult(0).getType());
1056 if (tileTy && hasIdenticalVectorTypes(op.getResults())) {
1057 builder.setInsertionPoint(op);
1059 builder, op.getLoc(), op.getInputs()[0], tileTy.getShape());
1060 op->replaceAllUsesWith(results);
1061 }
1062 return;
1063 }
1064 });
1065
1066 // Erase dead casts iteratively: repeat until a full walk erases nothing, so
// that casts which only become unused once their users are erased are
// removed as well. Casts in `existingCasts` are always kept.
1067 bool changed = true;
1068 while (changed) {
1069 changed = false;
1070 root->walk([&](UnrealizedConversionCastOp op) {
1071 if (existingCasts.contains(op))
1072 return;
1073 if (op.use_empty()) {
1074 op.erase();
1075 changed = true;
1076 }
1077 });
1078 }
1079}
1080
1081 // Checks if dst shape is a collapse of src shape where each dim in dst is
1082 // produced by one or more consecutive dims in src whose product equals the dst
1083 // dim. Populates collapseDims with one group per dst dim listing the src
1084 // indices collapsed into it. Unit dims in dst that have no backing src dim
1085 // (leading, in-between, or trailing) get empty groups; src unit dims that
1086 // fall past the last consumed dst dim are absorbed into the most-recent
1087 // non-empty group.
// On a `false` return collapseDims holds dst.size() empty groups (it is
// cleared and resized before any check runs).
1088 // Examples:
1089 // src=[8,16,32], dst=[1,4096] -> true, collapseDims=[[],[0,1,2]]
1090 // src=[8,16,32], dst=[4096,1] -> true, collapseDims=[[0,1,2],[]]
1091 // src=[2,3,4], dst=[6,4] -> true, collapseDims=[[0,1],[2]]
1092 // src=[64], dst=[64] -> true, collapseDims=[[0]]
// src=[1,2,3], dst=[6] -> true, collapseDims=[[0,1,2]]
// src=[2,3,4], dst=[4,6] -> false (2*3 overshoots 4)
1094 SmallVector<SmallVector<int64_t>> &collapseDims) {
1095 collapseDims.clear();
1096 collapseDims.resize(dst.size());
1097
1098 // Cheap precondition: src and dst must describe the same number of
1099 // elements. Bails out early on mismatched shapes without walking the dims.
1100 int64_t srcProd = std::accumulate(src.begin(), src.end(), int64_t{1},
1101 std::multiplies<int64_t>());
1102 int64_t dstProd = std::accumulate(dst.begin(), dst.end(), int64_t{1},
1103 std::multiplies<int64_t>());
1104 if (srcProd != dstProd)
1105 return false;
1106
1107 // Step 1: validate the partition on the unit-dim-stripped (compact) shapes.
1108 // Unit dims play no role in the matching decision — they only need to be
1109 // placed somewhere in the final groups (handled in step 2).
1110 SmallVector<int64_t> srcCompact, dstCompact;
1111 for (int64_t s : src)
1112 if (s != 1)
1113 srcCompact.push_back(s);
1114 for (int64_t d : dst)
1115 if (d != 1)
1116 dstCompact.push_back(d);
1117
// Greedy scan: each non-unit dst dim consumes consecutive non-unit src dims
// until their running product reaches it. Overshooting a dst dim, or leaving
// src dims unconsumed at the end, means dst is not a collapse of src.
1118 size_t s = 0;
1119 for (int64_t need : dstCompact) {
1120 int64_t acc = 1;
1121 while (s < srcCompact.size() && acc < need)
1122 acc *= srcCompact[s++];
1123 if (acc != need)
1124 return false;
1125 }
1126 if (s != srcCompact.size())
1127 return false;
1128
1129 // Step 2: assign each original src index to the correct original dst group.
1130 // Walk dst in original order, advancing past unit dst dims (they keep their
1131 // pre-initialized empty group). Walk src in original order; every src dim
1132 // (unit or not) seen while a dst group is open is appended to that group, so
1133 // leading unit src dims land in the first non-unit dst group. Once dst is
1134 // exhausted, remaining src dims attach to the most-recent non-empty group;
1135 // they are dropped only when dst has no non-unit dim at all.
1136 size_t dstIdx = 0;
1137 while (dstIdx < dst.size() && dst[dstIdx] == 1)
1138 dstIdx++;
1139
// lastNonEmpty: index of the dst group that most recently received a src
// index, or -1 if none has yet. acc: running product for the open group.
1140 int64_t lastNonEmpty = -1;
1141 int64_t acc = 1;
1142 for (size_t srcIdx = 0; srcIdx < src.size(); ++srcIdx) {
1143 if (dstIdx >= dst.size()) {
1144 // dst exhausted; remaining src dims are unit (validated above) and
1145 // attach to the last non-empty group, if any.
1146 if (lastNonEmpty >= 0)
1147 collapseDims[lastNonEmpty].push_back(srcIdx);
1148 continue;
1149 }
1150 acc *= src[srcIdx];
1151 collapseDims[dstIdx].push_back(srcIdx);
1152 lastNonEmpty = dstIdx;
// Group complete: reset the product and move to the next non-unit dst dim.
1153 if (acc == dst[dstIdx]) {
1154 acc = 1;
1155 ++dstIdx;
1156 while (dstIdx < dst.size() && dst[dstIdx] == 1)
1157 ++dstIdx;
1158 }
1159 }
1160 return true;
1161}
return success()
lhs
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
auto load
xegpu::DistributeLayoutAttr maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, const OpResult &result, mlir::Operation *owner, const std::string &name)
Attributes are known-constant values of operations.
Definition Attributes.h:25
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition Builders.cpp:233
FloatAttr getFloatAttr(Type type, double value)
Definition Builders.cpp:259
TypedAttr getZeroAttr(Type type)
Definition Builders.cpp:329
TypedAttr getOneAttr(Type type)
Definition Builders.cpp:347
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
This class helps build Operations.
Definition Builders.h:209
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition Builders.h:400
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition Builders.h:528
This class represents an operand of an operation.
Definition Value.h:254
This is a value defined by a result of an operation.
Definition Value.h:454
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
AttrClass getAttrOfType(StringAttr name)
Definition Operation.h:575
bool hasAttrOfType(NameT &&name)
Definition Operation.h:600
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition Operation.h:585
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:255
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
Definition Operation.h:607
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:822
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
type_range getTypes() const
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
bool matchDimCollapse(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &collapseDims)
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of shape from a set of values using vector.insert_strided_slice.
bool requirePacked(const DistributeLayoutAttr layout)
Helper function to check if the layout is packed.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given the src and acc arguments from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
bool requireTranspose(const DistributeLayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
DenseMap< Value, SmallVector< Type > > precomputeLoopBlockArgTypes(Operation *topLevelOp, SubShapeAndCountFn getSubShapeAndCount)
Pre-computes distributed VectorType mappings for every value carried through an SCF loop under topLev...
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
void addVectorTypeConversion(TypeConverter &converter, SubShapeAndCountFn getSubShapeAndCount, DenseMap< Value, SmallVector< Type > > loopArgTypes)
Adds a context-aware VectorType conversion to converter (1:1 shape-changing or 1:N,...
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
std::function< std::pair< SmallVector< int64_t >, int >( VectorType, DistributeLayoutAttr)> SubShapeAndCountFn
Callback type for computing sub-shape and count for 1:N (or 1:1 shape-changing) VectorType conversion...
Definition XeGPUUtils.h:235
void cleanupUnrealizedConversionCasts(Operation *root, const llvm::SmallSetVector< UnrealizedConversionCastOp, 8 > &existingCasts)
Cleans up UnrealizedConversionCastOps inserted during SCF structural type conversion and/or XeGPU unr...
SmallVector< Value > flattenValues(ArrayRef< ValueRange > values)
Flatten a set of ValueRange into a single SmallVector<Value>
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
SmallVector< OpFoldResult > addElementwise(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with same length.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition Utils.cpp:307
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition Utils.cpp:114
llvm::DenseMap< KeyT, ValueT, KeyInfoT, BucketT > DenseMap
Definition LLVM.h:120
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
StringRef getName() const
Definition uArchBase.h:164