MLIR 23.0.0git
XeGPUUtils.cpp
Go to the documentation of this file.
1//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
2//
3// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements utility methods for working with the XeGPU dialect.
10//
11//===----------------------------------------------------------------------===//
12
20#include "mlir/IR/Builders.h"
21#include "mlir/IR/Operation.h"
22#include "mlir/IR/ValueRange.h"
25#include "llvm/Support/Casting.h"
26#include "llvm/Support/FormatVariadic.h"
27#include <cstdint>
28#include <numeric>
29
30using namespace mlir;
31
32/// convert ArrayRef<ValueRange> into SmallVector<Value>
35 for (const auto &vals : values)
36 llvm::append_range(result, vals);
37 return result;
38}
39
40FailureOr<VectorType>
41mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
42 auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
43 // It only works for subgroup level layout, which only has lane_layout
44 // and lane_data, and is to distribute a SIMD code into SIMT code.
45 if (!layout || !layout.isForSubgroup())
46 return failure();
47
48 SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
49 SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
50 auto tdescShape = tdescTy.getShape();
51 auto elementType = tdescTy.getElementType();
52
53 // compute sgSize by multiply elements of laneLayout
54 // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
55 // e.g. for 1D layout, sgSize = laneLayout[0]
56 int64_t sgSize = llvm::product_of(laneLayout);
57
58 // Case 1: regular loads/stores
59 auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
60 if (scatterAttr) {
61 auto chunkSize = scatterAttr.getChunkSize().getInt();
62 // Verify if the first dimension of the tensor descriptor shape is
63 // distributable.
64 assert(tdescShape[0] == laneLayout[0] &&
65 "tensor descriptor shape is not distributable");
66 return VectorType::get({chunkSize}, elementType);
67 }
68
69 // Case 2: block loads/stores
70 // Check if the tensor descriptor shape is distributable.
71 int64_t tensorSize = 1;
72 for (auto [tdescDim, laneDim, laneDataDim] :
73 llvm::zip_equal(tdescShape, laneLayout, laneData)) {
74 assert((tdescDim % (laneDim * laneDataDim) == 0) &&
75 "tensor descriptor shape is not distributable");
76 tensorSize *= tdescDim;
77 }
78 // tensorSize must be adjusted for array_length.
79 tensorSize *= tdescTy.getArrayLength();
80
81 return VectorType::get({tensorSize / sgSize}, elementType);
82}
83
84FailureOr<VectorType>
85mlir::xegpu::getDistributedVectorType(VectorType originalType,
86 xegpu::LayoutAttr layout) {
87 int64_t rank = originalType.getRank();
88 // Distributed vector type is only supported for 1D, 2D and 3D vectors.
89 if (rank < 1 || rank > 3)
90 return failure();
91 ArrayRef<int64_t> shape = originalType.getShape();
92 // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
93 // of the 3D vector.
94 int arrayLength = 1;
95 if (rank == 3) {
96 arrayLength = shape[0];
97 shape = shape.drop_front();
98 }
99 auto helperTdescTy = xegpu::TensorDescType::get(
100 shape, originalType.getElementType(), arrayLength,
101 /*boundary_check=*/true,
102 /*memory_space=*/xegpu::MemorySpace::Global, layout);
103 return xegpu::getDistributedVectorType(helperTdescTy);
104}
105
106FailureOr<VectorType>
107xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
108 VectorType originalType) {
109 if (!layout)
110 return failure();
111 assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
112 "Expecting a valid layout.");
113 SmallVector<int64_t> effectiveLaneLayout =
114 layout.getEffectiveLaneLayoutAsInt();
115 assert(static_cast<size_t>(originalType.getRank()) >=
116 effectiveLaneLayout.size() &&
117 "Rank of the original vector type should be greater or equal to the "
118 "size of the lane layout to distribute the vector type.");
119 SmallVector<int64_t> distributedShape(originalType.getShape());
120 // Only distribute the last `laneLayout.size()` dimensions. The remaining
121 // dimensions are not distributed.
122 unsigned distributionStart =
123 originalType.getRank() - effectiveLaneLayout.size();
124 for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
125 if (i < distributionStart)
126 continue;
127 // Check if the dimension can be distributed evenly.
128 if (dim % effectiveLaneLayout[i - distributionStart] != 0)
129 return failure();
130 distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
131 }
132 return VectorType::get(distributedShape, originalType.getElementType());
133}
134
135std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
136 const StringRef prefix("layout_operand_");
137 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
138 return llvm::formatv("{0}{1}", prefix, idx).str();
139}
140
142 const StringRef prefix = "layout_result_";
143 return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
144}
145
146xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
147 if (!value)
148 return nullptr;
149
150 if (auto tdescTy =
151 dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
152 return tdescTy.getLayoutAttr();
153
154 if (auto result = dyn_cast<OpResult>(value)) {
155 Operation *defOp = result.getDefiningOp();
156 assert(defOp && "result must have a defining op");
157
158 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
159 auto layout = anchorOp.getAnchorLayout();
160 return layout;
161 }
162
163 std::string layoutName = getTemporaryLayoutName(result);
164 if (defOp->hasAttr(layoutName)) {
165 auto layout =
166 defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
167 return layout;
168 }
169 }
170
171 if (auto arg = dyn_cast<BlockArgument>(value)) {
172 auto *parentOp = arg.getOwner()->getParentOp();
173 if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
174 OpOperand *tiedInit = loop.getTiedLoopInit(arg);
175 if (tiedInit)
176 return getDistributeLayoutAttr(tiedInit->get());
177 }
178 }
179
180 return nullptr;
181}
182xegpu::DistributeLayoutAttr
184 Operation *op = opr.getOwner();
185 unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
186
187 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
188 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
189 if (idx == 0) {
190 return dpasOp.getLayoutAAttr();
191 } else if (idx == 1) {
192 return dpasOp.getLayoutBAttr();
193 } else if (idx == 2) {
194 return dpasOp.getLayoutCdAttr();
195 }
196 }
197 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
198 return convertOp.getInputLayoutAttr();
199 }
200 auto layout = anchorOp.getAnchorLayout();
201
202 if (idx == 0)
203 return layout;
204
205 // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
206 // the layout is valid for the first two operands: value and memref/tdesc.
207 // For other operations, the layout applies to the first operand only.
208 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
209 op) &&
210 (idx < 2))
211 return layout;
212 }
213
214 std::string layoutName = xegpu::getTemporaryLayoutName(opr);
215 if (op->hasAttr(layoutName)) {
216 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
217 return layout;
218 }
219
220 return nullptr;
221}
222
223// Returns the permanent layout attribute for the given result if it's
224// available on the defining op. Otherwise returns the provided layout.
225xegpu::DistributeLayoutAttr
226maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
227 const OpResult &result, mlir::Operation *owner,
228 const std::string &name) {
229 xegpu::DistributeLayoutAttr candidate = layout;
230
231 if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
232 if (auto perm = loadOp.getLayoutAttr())
233 candidate = perm;
234 }
235
236 return candidate;
237}
238
239// Returns the permanent layout attribute for the given operand if it's
240// available on the defining op. Otherwise returns the provided layout.
241xegpu::DistributeLayoutAttr
242maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
243 const OpOperand &operand, mlir::Operation *owner,
244 const std::string &name) {
245 xegpu::DistributeLayoutAttr candidate = layout;
246 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
247
248 if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
249 if (idx == 0) {
250 if (auto perm = storeOp.getLayoutAttr())
251 candidate = perm;
252 }
253 }
254
255 return candidate;
256}
257
258// TODO-LayoutRefactor: Remove this function after replacing use
259// with setTemporaryLayout or setAnchorLayout
261 const mlir::OpResult &result,
262 const mlir::xegpu::DistributeLayoutAttr layout) {
263 Operation *owner = result.getOwner();
264
265 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
266 if (anchorOp.getAnchorLayout() == layout)
267 return;
268 anchorOp.setAnchorLayout(layout);
269 return;
270 }
271
272 std::string name = xegpu::getTemporaryLayoutName(result);
273 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
274 return;
275 }
276 if (layout) {
277 owner->setAttr(name, layout);
278 }
279}
280
281// TODO-LayoutRefactor: Remove this function after replacing use
282// with setTemporaryLayout or setAnchorLayout
284 const DistributeLayoutAttr layout) {
285 Operation *owner = operand.getOwner();
286 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
287
288 if (!layout) {
289 return;
290 }
291 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
292 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
293 if (idx == 0) {
294 return dpasOp.setLayoutAAttr(layout);
295 } else if (idx == 1) {
296 return dpasOp.setLayoutBAttr(layout);
297 } else if (idx == 2) {
298 return dpasOp.setLayoutCdAttr(layout);
299 }
300 }
301 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
302 return convertOp.setInputLayoutAttr(layout);
303 }
304
305 // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
306 // the layout is valid for the first two operands: value and memref/tdesc.
307 // For other operations, the layout applies to the first operand only.
308 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
309 owner)) {
310 if (idx < 2) {
311 anchorOp.setAnchorLayout(layout);
312 }
313 } else {
314 if (idx == 0) {
315 anchorOp.setAnchorLayout(layout);
316 }
317 }
318 }
319
320 std::string name = xegpu::getTemporaryLayoutName(operand);
321 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
322 return;
323 }
324 if (layout) {
325 owner->setAttr(name, layout);
326 }
327}
328
329template <typename T, typename>
330xegpu::DistributeLayoutAttr
331xegpu::getTemporaryLayout(const T &operandOrResult) {
332 Operation *op = operandOrResult.getOwner();
333
334 std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
335 if (op->hasAttr(layoutName)) {
336 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
337 return layout;
338 }
339
340 return nullptr;
341}
342
343template xegpu::DistributeLayoutAttr
345template xegpu::DistributeLayoutAttr
347
348template <typename T, typename>
349void xegpu::setTemporaryLayout(const T &operandOrResult,
350 const xegpu::DistributeLayoutAttr layout) {
351 Operation *owner = operandOrResult.getOwner();
352 std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
353 if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
354 return;
355 }
356 if (layout) {
357 owner->setAttr(name, layout);
358 }
359}
360
362 const mlir::OpResult &result,
363 const mlir::xegpu::DistributeLayoutAttr layout);
364
366 const mlir::OpOperand &operand,
367 const mlir::xegpu::DistributeLayoutAttr layout);
368
372 auto vecTy = dyn_cast<VectorType>(value.getType());
373 if (!vecTy)
374 return {value};
375
376 ArrayRef<int64_t> srcShape = vecTy.getShape();
377 if (!computeShapeRatio(srcShape, shape))
378 return {value};
379
380 int64_t srcShapeRank = srcShape.size();
381 int64_t targetShapeRank = shape.size();
382
383 SmallVector<int64_t> adjustedTargetShape(srcShape.size());
384 int64_t rankDiff = srcShapeRank - targetShapeRank;
385 std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
386 1);
387 llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);
388
390 for (SmallVector<int64_t> offsets :
391 StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
392 SmallVector<int64_t> staticStrides(offsets.size(), 1);
393 Value slice = vector::ExtractStridedSliceOp::create(
394 builder, loc, value, offsets, adjustedTargetShape, staticStrides);
395
396 // Reshape to remove leading unit dims if needed
397 if (srcShapeRank > targetShapeRank) {
398 auto targetTy = VectorType::get(shape, vecTy.getElementType());
399 slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
400 }
401 result.push_back(slice);
402 }
403
404 return result;
405}
406
408 ValueRange values,
410 VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
411 assert(llvm::all_of(values.getTypes(),
412 [&](Type type) { return type == inputTy; }) &&
413 "values must be of the same VectorType");
414
415 Type elemTy = inputTy.getElementType();
416 ArrayRef<int64_t> tileShape = inputTy.getShape();
417
418 VectorType resultTy = VectorType::get(shape, elemTy);
419 auto zeroAttr = builder.getZeroAttr(elemTy);
420 Value result = arith::ConstantOp::create(
421 builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));
422
423 for (auto [src, offsets] :
424 llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
425 SmallVector<int64_t> staticStrides(tileShape.size(), 1);
426 result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
427 offsets, staticStrides);
428 }
429 return result;
430}
431
433 Operation *op, TypeConverter converter) {
434 MLIRContext *context = op->getContext();
435
436 auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
437 Location loc) -> Value {
438 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
439 .getResult(0);
440 };
441
442 { // convert VectorType to RankedTensorType for SCF Structural ops
443 TypeConverter converter;
444 converter.addConversion([](Type type) -> Type { return type; });
445 converter.addConversion([](VectorType type) -> Type {
446 return RankedTensorType::get(type.getShape(), type.getElementType());
447 });
448 converter.addSourceMaterialization(materializeCast);
449 converter.addTargetMaterialization(materializeCast);
450
451 mlir::ConversionTarget target(*context);
452 target.addLegalOp<UnrealizedConversionCastOp>();
453
454 mlir::RewritePatternSet patterns(context);
456 target);
457 (void)mlir::applyPartialConversion(op, target, std::move(patterns));
458 }
459
460 { // propagate the layout attribute to RankedTensorType by checking
461 // BuiltInUnrealizedCastOps
462 // for VectorType to RankedTensorType cast.
463 op->walk([](UnrealizedConversionCastOp castOp) {
464 if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
465 return WalkResult::skip();
466
467 Value input = castOp.getInputs()[0];
468 Value result = castOp.getResults()[0];
469 auto inputTy = dyn_cast<VectorType>(input.getType());
470 auto resultTy = dyn_cast<RankedTensorType>(result.getType());
471
472 // Only look at ops casting from VectorType to RankedTensorType
473 if (!inputTy || !resultTy)
474 return WalkResult::skip();
475
476 xegpu::DistributeLayoutAttr layout =
478 if (!layout)
479 return WalkResult::skip();
480
481 RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
482 result.setType(newTy);
483
484 // update the arguments if user is a LoopLike op.
485 for (OpOperand &use : result.getUses()) {
486 if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
487 BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
488 arg.setType(newTy);
489 }
490 // whileOp has two regions, the BlockArgument of the after region
491 // is not exposed by LoopLikeOpInterface
492 if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
493 unsigned idx = use.getOperandNumber();
494 BlockArgument arg = whileOp.getAfterArguments()[idx];
495 arg.setType(newTy);
496 }
497 }
498 return WalkResult::advance();
499 });
500
501 // using yieldOp as anchor to update the result type of its ParentOp
502 op->walk([](scf::YieldOp yieldOp) {
503 Operation *parentOp = yieldOp->getParentOp();
504 for (OpResult r : parentOp->getOpResults()) {
505 unsigned idx = r.getResultNumber();
506 Type resultTy = r.getType();
507 Type yieldTy = yieldOp.getResults()[idx].getType();
508 if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
509 r.setType(yieldTy);
510 }
511 });
512 }
513
514 { // perform the conversion from RankedTensorType to VectorType based on the
515 // DistributeLayoutAttr
516
517 // Handle the UnrealizedConversionCastOp introduced by the first step.
518 // For vector->RankedTensorType, it will simply forward the inputs.
519 // For RankedTensorType->vector, it will update the inputs with the
520 // one from the adaptor.
521 class UnrealizedConversionCastOpPattern
522 : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
523 using OpConversionPattern<
524 mlir::UnrealizedConversionCastOp>::OpConversionPattern;
525
526 mlir::LogicalResult
527 matchAndRewrite(mlir::UnrealizedConversionCastOp op,
528 OneToNOpAdaptor adaptor,
529 ConversionPatternRewriter &rewriter) const override {
530 auto inputs = op.getOperands();
531 auto outputs = op.getOutputs();
532
533 if (inputs.size() != 1 || outputs.size() != 1)
534 return failure();
535
536 auto inputTy = inputs[0].getType();
537 auto outputTy = outputs[0].getType();
538
539 if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
540 rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
541 return success();
542 }
543
544 if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
545 SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
546 auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
547 outputTy, values);
548 rewriter.replaceOp(op, newOp);
549 return success();
550 }
551 return failure();
552 }
553 };
554
555 converter.addSourceMaterialization(materializeCast);
556 converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
557 ValueRange inputs, Location loc) {
558 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
559 .getResults();
560 });
561
562 mlir::ConversionTarget target(*context);
563 target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
564 [](UnrealizedConversionCastOp op) {
565 auto isTensorTy = [](Type type) {
566 return isa<RankedTensorType>(type);
567 };
568 return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
569 llvm::none_of(op->getResultTypes(), isTensorTy);
570 });
571 mlir::RewritePatternSet patterns(context);
572 patterns.insert<UnrealizedConversionCastOpPattern>(context);
574 target);
575 (void)mlir::applyPartialConversion(op, target, std::move(patterns));
576 }
577}
578
579std::optional<std::string> xegpu::getChipStr(Operation *op) {
580 auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();
581
582 if (!gpuModuleOp)
583 return std::nullopt;
584
585 auto targetAttrs = gpuModuleOp.getTargets();
586 if (targetAttrs) {
587 for (auto &attr : *targetAttrs) {
588 auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
589 if (xevmAttr)
590 return xevmAttr.getChip().str();
591 }
592 }
593
594 return std::nullopt;
595}
596
597/// Generates element-wise addition ops of two arrays with same length.
599 Location loc,
602 assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
604 for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
605 auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
606 auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
607 results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
608 }
609 return results;
610}
611
612/// Generates element-wise addition ops of two arrays with automatic alignment.
613/// When the input arrays have different sizes, the shorter array is
614/// right-aligned with the longer array, and the unmatched leading elements from
615/// the longer array are preserved unchanged. This is commonly used for offset
616/// computation where higher-dimensional offsets need to be added to
617/// lower-dimensional adjustments.
618///
619/// Example:
620/// lhs = [l1, l2, l3], rhs = [r1, r2]
621/// Result: [l1, l2+r1, l3+r2]
626 // ensure a is longer than b
627 ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
628 ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
629 SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
630 a = a.slice(a.size() - b.size());
631 results.append(addElementwise(builder, loc, a, b));
632 return results;
633}
634
635template <typename T>
637 ArrayRef<T> candidateMultiples) {
638 static_assert(std::is_integral<T>::value, "T must be an integer type");
639 int largest = -1;
640 SmallVector<T> multiples = {1};
641 if (!candidateMultiples.empty())
642 multiples =
643 SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
644 for (T candidate : candidates) {
645 for (T multiple : multiples) {
646 int value = static_cast<int>(candidate * multiple);
647 if (value != 0 && dim % value == 0 && value > largest)
648 largest = value;
649 }
650 }
651 return largest;
652}
653
655 vector::CombiningKind kind, uint32_t size) {
656 // First reduce on a single thread to get per lane reduction value.
657 Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
658 // Parallel reduction using butterfly shuffles.
659 for (uint64_t i = 1; i < size; i <<= 1) {
660 Value shuffled =
661 gpu::ShuffleOp::create(builder, loc, laneVal, i, /** width = **/ size,
662 /** mode = **/ gpu::ShuffleMode::XOR)
663 .getShuffleResult();
664 laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
665 }
666 return laneVal;
667}
668
671 vector::CombiningKind kind,
672 int64_t reductionDim, Location loc,
673 PatternRewriter &rewriter) {
674 VectorType sourceType = src.getType();
675 int64_t sourceRank = sourceType.getRank();
676 // Expecting at least a 2D source vector. Leading dimensions (all except the
677 // last two) must be unit.
678 assert(sourceRank >= 2 && "expected at least a 2D source vector");
679 for (int64_t i = 0; i < sourceRank - 2; ++i)
680 assert(sourceType.getShape()[i] == 1 &&
681 "expected leading dimensions to be unit");
682 int64_t rowIdx = sourceRank - 2;
683 int64_t columnIdx = sourceRank - 1;
684 int64_t sourceH = sourceType.getShape()[rowIdx];
685 int64_t sourceW = sourceType.getShape()[columnIdx];
686 int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
687 // Create a constant vector to hold the result of the reduction.
688 TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
689 Value reductionResult = arith::ConstantOp::create(
690 rewriter, loc, acc.getType(),
691 DenseElementsAttr::get(acc.getType(), zeroAttr));
692 auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
693 auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
694 // Reduction result should have the same layout as the accumulator.
695 xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
696 // For each slice of the source, extract the slice vector, do a reduction
697 // and, insert the reduced value back to the result vector.
698 int64_t accRank = acc.getType().getRank();
699 for (int i = 0; i < nSlices; ++i) {
700 // Build nD offsets, sizes, and strides. Leading unit dims get
701 // offset=0, size=1. The last two dims are set based on reductionDim.
702 SmallVector<int64_t> sliceOffsets(sourceRank, 0);
703 SmallVector<int64_t> sliceSizes(sourceRank, 1);
704 SmallVector<int64_t> strides(sourceRank, 1);
705 if (reductionDim == columnIdx) {
706 sliceOffsets[rowIdx] = i;
707 sliceSizes[columnIdx] = sourceW;
708 } else {
709 sliceOffsets[columnIdx] = i;
710 sliceSizes[rowIdx] = sourceH;
711 }
712
713 vector::ExtractStridedSliceOp extractOp =
714 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
715 sliceSizes, strides);
716 // Extract strided slice has the same layout as src.
717 xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
718
719 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
720
721 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
722 rewriter, loc,
723 VectorType::get({nSliceElements}, sourceType.getElementType()),
724 extractOp.getResult());
725
726 // Shape cast output has the same layout as the accumulator. Shape cast
727 // source has the same layout as the original reduction source.
728 xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
729 xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
730 // Extract and reduction results in scalars, so no result layout is needed.
731 // Build multi-dim index into acc (sourceRank-1 dims, i.e. source shape with
732 // the reduction dim removed). Leading unit dims get index 0.
733 SmallVector<int64_t> accIdx(accRank, 0);
734 accIdx[accRank - 1] = i;
735 Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
736 Value reduction = vector::ReductionOp::create(
737 rewriter, loc, kind, slice.getResult(), accExtract);
738 reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
739 reductionResult, accIdx);
740 // Insert op should have the same layout as the accumulator.
741 xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
742 }
743 return reductionResult;
744}
745
748 vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
749 Location loc, PatternRewriter &rewriter) {
750 // Expecting a 2D source vector.
751 assert(src.getType().getRank() == 2 && "expected a 2D source vector");
752 VectorType sourceType = src.getType();
753 int64_t sourceH = sourceType.getShape()[0];
754 int64_t sourceW = sourceType.getShape()[1];
755
756 // Create a constant vector to hold the result of the reduction.
757 TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
758 Value reductionResult = arith::ConstantOp::create(
759 rewriter, loc, acc.getType(),
760 DenseElementsAttr::get(acc.getType(), zeroAttr));
761
762 // nSlices is the number of reduction operations needed to reduce the entire
763 // source vector. For example, if reductionDim is 0, we are reducing across
764 // rows, and each slice is a column of the source vector. So the number of
765 // slices is the number of columns, which is sourceW.
766 int nSlices = (reductionDim == 0) ? sourceW : sourceH;
767
768 // For each slice of the source, extract the slice vector, do a reduction
769 // and, insert the reduced value back to the result vector.
770 for (int i = 0; i < nSlices; ++i) {
771 SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
772 if (reductionDim == 1) {
773 sliceOffsets = {i, 0};
774 sliceSizes = {1, sourceW};
775 } else {
776 sliceOffsets = {0, i};
777 sliceSizes = {sourceH, 1};
778 }
779
780 vector::ExtractStridedSliceOp extractOp =
781 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
782 sliceSizes, {1, 1});
783 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
784 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
785 rewriter, loc,
786 VectorType::get({nSliceElements}, sourceType.getElementType()),
787 extractOp.getResult());
788
789 Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
790 Value fullReduce =
791 xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
792 fullReduce =
793 vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
794 reductionResult =
795 vector::InsertOp::create(rewriter, loc, fullReduce, reductionResult, i);
796 }
797 return reductionResult;
798}
799
800/// Explicit instantiations
801template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
802 ArrayRef<int> candidateMultiples);
803template int
805 ArrayRef<unsigned> candidateMultiples);
806
807bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
808 if (!layout)
809 return false;
810 auto laneData = layout.getEffectiveLaneDataAsInt();
811 if (laneData.size() != 2)
812 return false;
813 return laneData[0] != 1;
814}
815
816bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
817 const xegpu::uArch::uArch *uArch) {
818 // Return false for unsupported targets.
819 // TODO: Add more support or move to target info.
820 if (uArch->getName().equals_insensitive("pvc") &&
821 uArch->getName().equals_insensitive("bmg"))
822 return false;
823 if (!layout)
824 return false;
825 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
826 if (laneLayout.size() != 2)
827 return false;
828 return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
829}
830
831// Check if dst shape is an expansion of src shape by inserting unit dimensions.
832// Returns true if all dimensions in src match corresponding dimensions in dst
833// (after skipping unit dimensions), and populates expandedUnitDims with the
834// indices of the unit dimensions in dst that were added (not present in src).
835// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
837 SmallVector<int64_t> &expandedUnitDims) {
838 // All unit dimensions in dst that don't appear in src are the expanded
839 // unit dimensions
840 size_t srcIdx = 0;
841 for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
842 if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
843 srcIdx++;
844 else if (dst[dstIdx] == 1)
845 expandedUnitDims.push_back(dstIdx);
846 else
847 return false;
848 return srcIdx == src.size();
849}
850
851// Checks if dst shape is an expansion of src shape where each dimension in src
852// is split into one or more consecutive dimensions in dst whose product equals
853// the original dimension. Populates splitDimGroups with groups of dst indices
854// that correspond to each src dimension. Example: src=[6,4], dst=[2,3,2,2] ->
855// true
858 SmallVector<SmallVector<int64_t>> &splitDimGroups) {
859 // each dim in src can be mapped to one or more dims in dst whose product
860 // equals to the src dim
861 size_t srcIdx = 0;
862 int64_t accumulatedSize = 1;
863 SmallVector<int64_t> currentDstDims;
864
865 splitDimGroups.clear();
866 for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
867 if (srcIdx >= src.size())
868 return false;
869 accumulatedSize *= dst[dstIdx];
870 currentDstDims.push_back(dstIdx);
871
872 if (accumulatedSize == src[srcIdx]) {
873 // Record the mapping: srcIdx -> currentDstDims
874 splitDimGroups.push_back(currentDstDims);
875 // move to next src dim
876 srcIdx++;
877 accumulatedSize = 1;
878 currentDstDims.clear();
879 } else if (accumulatedSize > src[srcIdx]) {
880 return false;
881 }
882 }
883 return srcIdx == src.size();
884}
return success()
lhs
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
xegpu::DistributeLayoutAttr maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, const OpResult &result, mlir::Operation *owner, const std::string &name)
This class represents an argument of a Block.
Definition Value.h:306
TypedAttr getZeroAttr(Type type)
Definition Builders.cpp:328
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
IRValueT get() const
Return the current value being used by this operand.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class helps build Operations.
Definition Builders.h:209
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition Builders.h:528
This class represents an operand of an operation.
Definition Value.h:254
This is a value defined by a result of an operation.
Definition Value.h:454
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
AttrClass getAttrOfType(StringAttr name)
Definition Operation.h:579
bool hasAttrOfType(NameT &&name)
Definition Operation.h:604
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition Operation.h:589
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
Definition Operation.h:255
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:259
void setAttr(StringAttr name, Attribute value)
If the an attribute exists with the specified name, change it to the new value.
Definition Operation.h:611
operand_type_range getOperandTypes()
Definition Operation.h:426
result_type_range getResultTypes()
Definition Operation.h:457
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:826
result_range getOpResults()
Definition Operation.h:449
MLIRContext * getContext()
Return the context this operation is associated with.
Definition Operation.h:237
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
This class provides an abstraction over the various different ranges of value types.
Definition TypeRange.h:37
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:387
type_range getTypes() const
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
void setType(Type newType)
Mutate the type of this Value to be of the specified type.
Definition Value.h:116
Type getType() const
Return the type of this value.
Definition Value.h:105
static WalkResult skip()
Definition WalkResult.h:48
static WalkResult advance()
Definition WalkResult.h:47
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of shape from a set of values using vector.insert_stride_slice.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given src and acc arguments from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter)
Do type conversion for SCF structural ops, e.g., scf.for using SCF structural type conversion patterns...
bool requirePacked(const LayoutAttr layout)
Helper function to check if the layout is packed.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_stride_slice.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
SmallVector< Value > flattenValues(ArrayRef< ValueRange > values)
Flatten a set of ValueRange into a single SmallVector<Value>
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
SmallVector< OpFoldResult > addElementwise(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with same length.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition Utils.cpp:305
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition Utils.cpp:112
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
StringRef getName() const
Definition uArchBase.h:158