//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the XeGPU dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>

using namespace mlir;
/// Convert an ArrayRef<ValueRange> into a SmallVector<Value>.
SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
  SmallVector<Value> result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
  // This only works for subgroup-level layouts, which carry only lane_layout
  // and lane_data and are used to distribute SIMD code into SIMT code.
  if (!layout || !layout.isForSubgroup())
    return failure();

  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();

  // Compute sgSize by multiplying the elements of laneLayout,
  // e.g., for a 2D layout, sgSize = laneLayout[0] * laneLayout[1];
  // for a 1D layout, sgSize = laneLayout[0].
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: regular loads/stores.
  auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    // Verify that the first dimension of the tensor descriptor shape is
    // distributable.
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    return VectorType::get({chunkSize}, elementType);
  }
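
  // Illustrative example (values assumed): a scattered tdesc of shape 16x8
  // with chunk_size = 8 and lane_layout = [16, 1] passes the check above
  // (16 == 16) and distributes to vector<8xelemTy> per lane.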

  // Case 2: block loads/stores.
  // Check if the tensor descriptor shape is distributable.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  // tensorSize must be adjusted for array_length.
  tensorSize *= tdescTy.getArrayLength();

  return VectorType::get({tensorSize / sgSize}, elementType);
}
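
// Illustrative example (values assumed) for the block case above:
// tdescShape = [8, 16], lane_layout = [1, 16], lane_data = [1, 1], and
// array_length = 2 give tensorSize = 8 * 16 * 2 = 256 and sgSize = 16, so
// each lane receives vector<16xelemTy>.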

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                      xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  // Distributed vector type is only supported for 1D, 2D and 3D vectors.
  if (rank < 1 || rank > 3)
    return failure();
  ArrayRef<int64_t> shape = originalType.getShape();
  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
  // of the 3D vector.
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true,
      /*memory_space=*/xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}

FailureOr<VectorType>
xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                       VectorType originalType) {
  if (!layout)
    return failure();
  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
         "Expecting a valid layout.");
  SmallVector<int64_t> effectiveLaneLayout =
      layout.getEffectiveLaneLayoutAsInt();
  assert(static_cast<size_t>(originalType.getRank()) >=
             effectiveLaneLayout.size() &&
         "Rank of the original vector type should be greater or equal to the "
         "size of the lane layout to distribute the vector type.");
  SmallVector<int64_t> distributedShape(originalType.getShape());
  // Only distribute the last `laneLayout.size()` dimensions. The remaining
  // dimensions are not distributed.
  unsigned distributionStart =
      originalType.getRank() - effectiveLaneLayout.size();
  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
    if (i < distributionStart)
      continue;
    // Check if the dimension can be distributed evenly.
    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
      return failure();
    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
  }
  return VectorType::get(distributedShape, originalType.getElementType());
}
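
// Illustrative example (values assumed): originalType = vector<4x32xf32> with
// an effective lane layout of [1, 16] divides the trailing dimensions
// element-wise, yielding vector<4x2xf32> per lane.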

std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

std::string xegpu::getTemporaryLayoutName(const OpResult &result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}
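
// For example, operand #1 maps to the attribute name "layout_operand_1" and
// result #0 maps to "layout_result_0".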

xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  if (!value)
    return nullptr;

  if (auto tdescTy =
          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast<OpResult>(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");

    if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
      auto layout = anchorOp.getAnchorLayout();
      return layout;
    }

    std::string layoutName = getTemporaryLayoutName(result);
    if (defOp->hasAttr(layoutName)) {
      auto layout =
          defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
      return layout;
    }
  }

  if (auto arg = dyn_cast<BlockArgument>(value)) {
    auto *parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }

  return nullptr;
}

xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();
  unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();

  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
      if (idx == 0) {
        return dpasOp.getLayoutAAttr();
      } else if (idx == 1) {
        return dpasOp.getLayoutBAttr();
      } else if (idx == 2) {
        return dpasOp.getLayoutCdAttr();
      }
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
      return convertOp.getInputLayoutAttr();
    }
    auto layout = anchorOp.getAnchorLayout();

    if (idx == 0)
      return layout;

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
            op) &&
        (idx < 2))
      return layout;
  }

  std::string layoutName = xegpu::getTemporaryLayoutName(opr);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
    return layout;
  }

  return nullptr;
}

// Returns the permanent layout attribute for the given result if it's
// available on the defining op. Otherwise returns the provided layout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpResult &result, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;

  if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
    if (auto perm = loadOp.getLayoutAttr())
      candidate = perm;
  }

  return candidate;
}

// Returns the permanent layout attribute for the given operand if it's
// available on the owning op. Otherwise returns the provided layout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpOperand &operand, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();

  if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
    if (idx == 0) {
      if (auto perm = storeOp.getLayoutAttr())
        candidate = perm;
    }
  }

  return candidate;
}

// TODO-LayoutRefactor: Remove this function after replacing its uses
// with setTemporaryLayout or setAnchorLayout.
void xegpu::setDistributeLayoutAttr(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout) {
  Operation *owner = result.getOwner();

  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (anchorOp.getAnchorLayout() == layout)
      return;
    anchorOp.setAnchorLayout(layout);
    return;
  }

  std::string name = xegpu::getTemporaryLayoutName(result);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

// TODO-LayoutRefactor: Remove this function after replacing its uses
// with setTemporaryLayout or setAnchorLayout.
void xegpu::setDistributeLayoutAttr(const mlir::OpOperand &operand,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operand.getOwner();
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();

  if (!layout) {
    return;
  }
  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
      if (idx == 0) {
        return dpasOp.setLayoutAAttr(layout);
      } else if (idx == 1) {
        return dpasOp.setLayoutBAttr(layout);
      } else if (idx == 2) {
        return dpasOp.setLayoutCdAttr(layout);
      }
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
      return convertOp.setInputLayoutAttr(layout);
    }

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
            owner)) {
      if (idx < 2) {
        anchorOp.setAnchorLayout(layout);
      }
    } else {
      if (idx == 0) {
        anchorOp.setAnchorLayout(layout);
      }
    }
  }

  std::string name = xegpu::getTemporaryLayoutName(operand);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

template <typename T, typename>
xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const T &operandOrResult) {
  Operation *op = operandOrResult.getOwner();

  std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
    return layout;
  }

  return nullptr;
}

template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<mlir::OpResult>(const mlir::OpResult &);
template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<mlir::OpOperand>(const mlir::OpOperand &);

template <typename T, typename>
void xegpu::setTemporaryLayout(const T &operandOrResult,
                               const xegpu::DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
  if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

template void xegpu::setTemporaryLayout<mlir::OpResult>(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout);

template void xegpu::setTemporaryLayout<mlir::OpOperand>(
    const mlir::OpOperand &operand,
    const mlir::xegpu::DistributeLayoutAttr layout);

SmallVector<Value>
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                        Value value, ArrayRef<int64_t> shape) {
  auto vecTy = dyn_cast<VectorType>(value.getType());
  if (!vecTy)
    return {value};

  ArrayRef<int64_t> srcShape = vecTy.getShape();
  if (!computeShapeRatio(srcShape, shape))
    return {value};

  int64_t srcShapeRank = srcShape.size();
  int64_t targetShapeRank = shape.size();

  SmallVector<int64_t> adjustedTargetShape(srcShape.size());
  int64_t rankDiff = srcShapeRank - targetShapeRank;
  std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
            1);
  llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);

  SmallVector<Value> result;
  for (SmallVector<int64_t> offsets :
       StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    Value slice = vector::ExtractStridedSliceOp::create(
        builder, loc, value, offsets, adjustedTargetShape, staticStrides);

    // Reshape to remove leading unit dims if needed.
    if (srcShapeRank > targetShapeRank) {
      auto targetTy = VectorType::get(shape, vecTy.getElementType());
      slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
    }
    result.push_back(slice);
  }

  return result;
}
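
// Illustrative example (values assumed): extracting shape [16] from a
// vector<2x32xf32> value uses adjustedTargetShape = [1, 16] and produces four
// vector<1x16xf32> slices, each shape-cast to vector<16xf32>.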

Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef<int64_t> shape) {
  VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");

  Type elemTy = inputTy.getElementType();
  ArrayRef<int64_t> tileShape = inputTy.getShape();

  VectorType resultTy = VectorType::get(shape, elemTy);
  auto zeroAttr = builder.getZeroAttr(elemTy);
  Value result = arith::ConstantOp::create(
      builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

  for (auto [src, offsets] :
       llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
    SmallVector<int64_t> staticStrides(tileShape.size(), 1);
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}
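
// Illustrative example (values assumed): four vector<1x16xf32> values
// assembled with shape = [2, 32] are inserted at offsets (0,0), (0,16),
// (1,0), and (1,16) of a zero-initialized vector<2x32xf32>.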

void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                            Location loc) -> Value {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResult(0);
  };

  { // Convert VectorType to RankedTensorType for SCF structural ops.
    TypeConverter converter;
    converter.addConversion([](Type type) -> Type { return type; });
    converter.addConversion([](VectorType type) -> Type {
      return RankedTensorType::get(type.getShape(), type.getElementType());
    });
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization(materializeCast);

    mlir::ConversionTarget target(*context);
    target.addLegalOp<UnrealizedConversionCastOp>();

    mlir::RewritePatternSet patterns(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }

  { // Propagate the layout attribute to RankedTensorType by checking
    // UnrealizedConversionCastOps that cast from VectorType to
    // RankedTensorType.
    op->walk([](UnrealizedConversionCastOp castOp) {
      if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
        return WalkResult::skip();

      Value input = castOp.getInputs()[0];
      Value result = castOp.getResults()[0];
      auto inputTy = dyn_cast<VectorType>(input.getType());
      auto resultTy = dyn_cast<RankedTensorType>(result.getType());

      // Only look at ops casting from VectorType to RankedTensorType.
      if (!inputTy || !resultTy)
        return WalkResult::skip();

      xegpu::DistributeLayoutAttr layout =
          xegpu::getDistributeLayoutAttr(input);
      if (!layout)
        return WalkResult::skip();

      RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
      result.setType(newTy);

      // Update the arguments if the user is a LoopLike op.
      for (OpOperand &use : result.getUses()) {
        if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
          BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
          arg.setType(newTy);
        }
        // scf.while has two regions; the BlockArguments of the "after" region
        // are not exposed by LoopLikeOpInterface.
        if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
          unsigned idx = use.getOperandNumber();
          BlockArgument arg = whileOp.getAfterArguments()[idx];
          arg.setType(newTy);
        }
      }
      return WalkResult::advance();
    });

    // Use yieldOp as an anchor to update the result types of its parent op.
    op->walk([](scf::YieldOp yieldOp) {
      Operation *parentOp = yieldOp->getParentOp();
      for (OpResult r : parentOp->getOpResults()) {
        unsigned idx = r.getResultNumber();
        Type resultTy = r.getType();
        Type yieldTy = yieldOp.getResults()[idx].getType();
        if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
          r.setType(yieldTy);
      }
    });
  }

  { // Perform the conversion from RankedTensorType to VectorType based on the
    // DistributeLayoutAttr.

    // Handle the UnrealizedConversionCastOp introduced by the first step.
    // For VectorType -> RankedTensorType, it simply forwards the inputs.
    // For RankedTensorType -> VectorType, it updates the inputs with the
    // ones from the adaptor.
    class UnrealizedConversionCastOpPattern
        : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
      using OpConversionPattern<
          mlir::UnrealizedConversionCastOp>::OpConversionPattern;

      mlir::LogicalResult
      matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                      OneToNOpAdaptor adaptor,
                      ConversionPatternRewriter &rewriter) const override {
        auto inputs = op.getOperands();
        auto outputs = op.getOutputs();

        if (inputs.size() != 1 || outputs.size() != 1)
          return failure();

        auto inputTy = inputs[0].getType();
        auto outputTy = outputs[0].getType();

        if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
          rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
          return success();
        }

        if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
          auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                          outputTy, values);
          rewriter.replaceOp(op, newOp);
          return success();
        }
        return failure();
      }
    };

    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                           ValueRange inputs, Location loc) {
      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
          .getResults();
    });

    mlir::ConversionTarget target(*context);
    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
        [](UnrealizedConversionCastOp op) {
          auto isTensorTy = [](Type type) {
            return isa<RankedTensorType>(type);
          };
          return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                 llvm::none_of(op->getResultTypes(), isTensorTy);
        });
    mlir::RewritePatternSet patterns(context);
    patterns.insert<UnrealizedConversionCastOpPattern>(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }
}

std::optional<std::string> xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();

  if (!gpuModuleOp)
    return std::nullopt;

  auto targetAttrs = gpuModuleOp.getTargets();
  if (targetAttrs) {
    for (auto &attr : *targetAttrs) {
      auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
      if (xevmAttr)
        return xevmAttr.getChip().str();
    }
  }

  return std::nullopt;
}

/// Generates element-wise addition ops of two arrays with the same length.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                Location loc,
                                                ArrayRef<OpFoldResult> lhs,
                                                ArrayRef<OpFoldResult> rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  SmallVector<OpFoldResult> results;
  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
  }
  return results;
}

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements
/// from the longer array are preserved unchanged. This is commonly used for
/// offset computation where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
///   lhs = [l1, l2, l3], rhs = [r1, r2]
///   Result: [l1, l2+r1, l3+r2]
SmallVector<OpFoldResult>
xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                           ArrayRef<OpFoldResult> lhs,
                           ArrayRef<OpFoldResult> rhs) {
  // Ensure that a is the longer of the two arrays.
  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
  a = a.slice(a.size() - b.size());
  results.append(addElementwise(builder, loc, a, b));
  return results;
}

template <typename T>
int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates,
                             ArrayRef<T> candidateMultiples) {
  static_assert(std::is_integral<T>::value, "T must be an integer type");
  int largest = -1;
  SmallVector<T> multiples = {1};
  if (!candidateMultiples.empty())
    multiples =
        SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
  for (T candidate : candidates) {
    for (T multiple : multiples) {
      int value = static_cast<int>(candidate * multiple);
      if (value != 0 && dim % value == 0 && value > largest)
        largest = value;
    }
  }
  return largest;
}
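
// Illustrative example (values assumed): dim = 24, candidates = [2, 3, 5],
// candidateMultiples = [1, 2] tests the products {2, 4, 3, 6, 5, 10}; the
// divisors of 24 among them are {2, 4, 3, 6}, so the function returns 6.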

Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
                               vector::CombiningKind kind, uint32_t size) {
  // First reduce on a single thread to get the per-lane reduction value.
  Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
  // Parallel reduction using butterfly shuffles.
  for (uint64_t i = 1; i < size; i <<= 1) {
    Value shuffled =
        gpu::ShuffleOp::create(builder, loc, laneVal, i, /*width=*/size,
                               /*mode=*/gpu::ShuffleMode::XOR)
            .getShuffleResult();
    laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
  }
  return laneVal;
}
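
// Illustrative example (values assumed): with size = 8, the loop performs XOR
// shuffles with masks 1, 2, and 4, combining values across all eight lanes in
// log2(8) = 3 steps so that every lane ends up with the full reduction.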

Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  // Expecting a 2D source vector.
  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
  VectorType sourceType = src.getType();
  int64_t sourceH = sourceType.getShape()[0];
  int64_t sourceW = sourceType.getShape()[1];
  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
  // Reduction result should have the same layout as the accumulator.
  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  // For each slice of the source, extract the slice vector, do a reduction,
  // and insert the reduced value back into the result vector.
  for (int i = 0; i < nSlices; ++i) {
    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
    if (reductionDim == 1) {
      sliceOffsets = {i, 0};
      sliceSizes = {1, sourceW};
    } else {
      sliceOffsets = {0, i};
      sliceSizes = {sourceH, 1};
    }

    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, {1, 1});
    // The extracted strided slice has the same layout as src.
    xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);

    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    // The shape cast output has the same layout as the accumulator; the shape
    // cast source has the same layout as the original reduction source.
    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
    // The extract and reduction ops produce scalars, so no result layout is
    // needed.
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult =
        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
    // The insert op should have the same layout as the accumulator.
    xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  }
  return reductionResult;
}
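
// Illustrative example (values assumed): for src = vector<4x8xf32>,
// acc = vector<4xf32>, and reductionDim = 1, the loop emits four
// vector.reduction ops, one per 1x8 row slice, and inserts each scalar
// result into the corresponding element of the 4-element accumulator.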

/// Explicit instantiations
template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
                                           ArrayRef<int> candidateMultiples);
template int
xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
                                   ArrayRef<unsigned> candidateMultiples);

bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
  if (!layout)
    return false;
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return false;
  return laneData[0] != 1;
}
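
// Illustrative example (values assumed): a layout with lane_data = [2, 1]
// requires the packed data format, while lane_data = [1, 2] does not.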

bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
                             const xegpu::uArch::uArch *uArch) {
  // Return false for unsupported targets. Note: the original condition
  // required the name to equal both "pvc" and "bmg", which is always false;
  // the guard below checks that the target is neither.
  // TODO: Add more support or move to target info.
  if (!uArch->getName().equals_insensitive("pvc") &&
      !uArch->getName().equals_insensitive("bmg"))
    return false;
  if (!layout)
    return false;
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return false;
  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}

// Check if dst shape is an expansion of src shape by inserting unit
// dimensions. Returns true if all dimensions in src match corresponding
// dimensions in dst (after skipping unit dimensions), and populates
// expandedUnitDims with the indices of the unit dimensions in dst that were
// added (not present in src).
// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
bool xegpu::matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
                                  SmallVector<int64_t> &expandedUnitDims) {
  // All unit dimensions in dst that don't appear in src are the expanded
  // unit dimensions.
  size_t srcIdx = 0;
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
    if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
      srcIdx++;
    else if (dst[dstIdx] == 1)
      expandedUnitDims.push_back(dstIdx);
    else
      return false;
  return srcIdx == src.size();
}

// Checks if dst shape is an expansion of src shape where each dimension in src
// is split into one or more consecutive dimensions in dst whose product equals
// the original dimension. Populates splitDimGroups with groups of dst indices
// that correspond to each src dimension.
// Example: src=[6,4], dst=[2,3,2,2] -> true
bool xegpu::matchSplitDimExpansion(
    ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
    SmallVector<SmallVector<int64_t>> &splitDimGroups) {
  // Each dim in src can be mapped to one or more dims in dst whose product
  // equals the src dim.
  size_t srcIdx = 0;
  int64_t accumulatedSize = 1;
  SmallVector<int64_t> currentDstDims;

  splitDimGroups.clear();
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
    if (srcIdx >= src.size())
      return false;
    accumulatedSize *= dst[dstIdx];
    currentDstDims.push_back(dstIdx);

    if (accumulatedSize == src[srcIdx]) {
      // Record the mapping: srcIdx -> currentDstDims.
      splitDimGroups.push_back(currentDstDims);
      // Move to the next src dim.
      srcIdx++;
      accumulatedSize = 1;
      currentDstDims.clear();
    } else if (accumulatedSize > src[srcIdx]) {
      return false;
    }
  }
  return srcIdx == src.size();
}