MLIR 23.0.0git
XeGPUUtils.cpp
Go to the documentation of this file.
1//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
2//
3// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements utility methods for working with the XeGPU dialect.
10//
11//===----------------------------------------------------------------------===//
12
20#include "mlir/IR/Builders.h"
21#include "mlir/IR/Operation.h"
22#include "mlir/IR/ValueRange.h"
25#include "llvm/Support/Casting.h"
26#include "llvm/Support/FormatVariadic.h"
27#include <cstdint>
28#include <numeric>
29
30using namespace mlir;
31
32/// convert ArrayRef<ValueRange> into SmallVector<Value>
35 for (const auto &vals : values)
36 llvm::append_range(result, vals);
37 return result;
38}
39
40FailureOr<VectorType>
41mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
42 auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
43 // It only works for subgroup level layout, which only has lane_layout
44 // and lane_data, and is to distribute a SIMD code into SIMT code.
45 if (!layout || !layout.isForSubgroup())
46 return failure();
47
48 SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
49 SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
50 auto tdescShape = tdescTy.getShape();
51 auto elementType = tdescTy.getElementType();
52
53 // compute sgSize by multiply elements of laneLayout
54 // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
55 // e.g. for 1D layout, sgSize = laneLayout[0]
56 int64_t sgSize = llvm::product_of(laneLayout);
57
58 // Case 1: regular loads/stores
59 auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
60 if (scatterAttr) {
61 auto chunkSize = scatterAttr.getChunkSize().getInt();
62 // Verify if the first dimension of the tensor descriptor shape is
63 // distributable.
64 assert(tdescShape[0] == laneLayout[0] &&
65 "tensor descriptor shape is not distributable");
66 return VectorType::get({chunkSize}, elementType);
67 }
68
69 // Case 2: block loads/stores
70 // Check if the tensor descriptor shape is distributable.
71 int64_t tensorSize = 1;
72 for (auto [tdescDim, laneDim, laneDataDim] :
73 llvm::zip_equal(tdescShape, laneLayout, laneData)) {
74 assert((tdescDim % (laneDim * laneDataDim) == 0) &&
75 "tensor descriptor shape is not distributable");
76 tensorSize *= tdescDim;
77 }
78 // tensorSize must be adjusted for array_length.
79 tensorSize *= tdescTy.getArrayLength();
80
81 return VectorType::get({tensorSize / sgSize}, elementType);
82}
83
84FailureOr<VectorType>
85mlir::xegpu::getDistributedVectorType(VectorType originalType,
86 xegpu::LayoutAttr layout) {
87 int64_t rank = originalType.getRank();
88 // Distributed vector type is only supported for 1D, 2D and 3D vectors.
89 if (rank < 1 || rank > 3)
90 return failure();
91 ArrayRef<int64_t> shape = originalType.getShape();
92 // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
93 // of the 3D vector.
94 int arrayLength = 1;
95 if (rank == 3) {
96 arrayLength = shape[0];
97 shape = shape.drop_front();
98 }
99 auto helperTdescTy = xegpu::TensorDescType::get(
100 shape, originalType.getElementType(), arrayLength,
101 /*boundary_check=*/true,
102 /*memory_space=*/xegpu::MemorySpace::Global, layout);
103 return xegpu::getDistributedVectorType(helperTdescTy);
104}
105
106FailureOr<VectorType>
107xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
108 VectorType originalType) {
109 if (!layout)
110 return failure();
111 assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
112 "Expecting a valid layout.");
113
114 int64_t vectorRank = originalType.getRank();
115 int64_t layoutRank = layout.getRank();
116 assert(vectorRank >= layoutRank && "Vector rank must be >= layout rank.");
117
118 // When the vector has more dimensions than the layout, only the trailing
119 // dimensions are distributed. Leading dimensions are preserved as-is.
120 int64_t offset = vectorRank - layoutRank;
121 ArrayRef<int64_t> fullShape = originalType.getShape();
122 SmallVector<int64_t> trailingShape(fullShape.begin() + offset,
123 fullShape.end());
124 auto distributedShapeOrFailure =
125 layout.computeDistributedShape(trailingShape);
126 if (failed(distributedShapeOrFailure))
127 return failure();
128
129 SmallVector<int64_t> resultShape(fullShape.begin(),
130 fullShape.begin() + offset);
131 resultShape.append(distributedShapeOrFailure->begin(),
132 distributedShapeOrFailure->end());
133 return VectorType::get(resultShape, originalType.getElementType());
134}
135
136std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
137 const StringRef prefix("layout_operand_");
138 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
139 return llvm::formatv("{0}{1}", prefix, idx).str();
140}
141
143 const StringRef prefix = "layout_result_";
144 return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
145}
146
147xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
148 if (!value)
149 return nullptr;
150
151 if (auto tdescTy =
152 dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
153 return tdescTy.getLayoutAttr();
154
155 if (auto result = dyn_cast<OpResult>(value)) {
156 Operation *defOp = result.getDefiningOp();
157 assert(defOp && "result must have a defining op");
158
159 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
160 auto layout = anchorOp.getAnchorLayout();
161 return layout;
162 }
163
164 std::string layoutName = getTemporaryLayoutName(result);
165 if (defOp->hasAttr(layoutName)) {
166 auto layout =
167 defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
168 return layout;
169 }
170 }
171
172 if (auto arg = dyn_cast<BlockArgument>(value)) {
173 auto *parentOp = arg.getOwner()->getParentOp();
174 if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
175 OpOperand *tiedInit = loop.getTiedLoopInit(arg);
176 if (tiedInit)
177 return getDistributeLayoutAttr(tiedInit->get());
178 }
179 }
180
181 return nullptr;
182}
183xegpu::DistributeLayoutAttr
185 Operation *op = opr.getOwner();
186 unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
187
188 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
189 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
190 if (idx == 0) {
191 return dpasOp.getLayoutAAttr();
192 } else if (idx == 1) {
193 return dpasOp.getLayoutBAttr();
194 } else if (idx == 2) {
195 return dpasOp.getLayoutCdAttr();
196 }
197 }
198 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
199 return convertOp.getInputLayoutAttr();
200 }
201 auto layout = anchorOp.getAnchorLayout();
202
203 if (idx == 0)
204 return layout;
205
206 // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
207 // the layout is valid for the first two operands: value and memref/tdesc.
208 // For other operations, the layout applies to the first operand only.
209 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
210 op) &&
211 (idx < 2))
212 return layout;
213 }
214
215 std::string layoutName = xegpu::getTemporaryLayoutName(opr);
216 if (op->hasAttr(layoutName)) {
217 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
218 return layout;
219 }
220
221 return nullptr;
222}
223
224// Returns the permanent layout attribute for the given result if it's
225// available on the defining op. Otherwise returns the provided layout.
226xegpu::DistributeLayoutAttr
227maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
228 const OpResult &result, mlir::Operation *owner,
229 const std::string &name) {
230 xegpu::DistributeLayoutAttr candidate = layout;
231
232 if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
233 if (auto perm = loadOp.getLayoutAttr())
234 candidate = perm;
235 }
236
237 return candidate;
238}
239
240// Returns the permanent layout attribute for the given operand if it's
241// available on the defining op. Otherwise returns the provided layout.
242xegpu::DistributeLayoutAttr
243maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
244 const OpOperand &operand, mlir::Operation *owner,
245 const std::string &name) {
246 xegpu::DistributeLayoutAttr candidate = layout;
247 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
248
249 if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
250 if (idx == 0) {
251 if (auto perm = storeOp.getLayoutAttr())
252 candidate = perm;
253 }
254 }
255
256 return candidate;
257}
258
259// TODO-LayoutRefactor: Remove this function after replacing use
260// with setTemporaryLayout or setAnchorLayout
262 const mlir::OpResult &result,
263 const mlir::xegpu::DistributeLayoutAttr layout) {
264 Operation *owner = result.getOwner();
265
266 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
267 if (anchorOp.getAnchorLayout() == layout)
268 return;
269 anchorOp.setAnchorLayout(layout);
270 return;
271 }
272
273 std::string name = xegpu::getTemporaryLayoutName(result);
274 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
275 return;
276 }
277 if (layout) {
278 owner->setAttr(name, layout);
279 }
280}
281
282// TODO-LayoutRefactor: Remove this function after replacing use
283// with setTemporaryLayout or setAnchorLayout
285 const DistributeLayoutAttr layout) {
286 Operation *owner = operand.getOwner();
287 unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
288
289 if (!layout) {
290 return;
291 }
292 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
293 if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
294 if (idx == 0) {
295 return dpasOp.setLayoutAAttr(layout);
296 } else if (idx == 1) {
297 return dpasOp.setLayoutBAttr(layout);
298 } else if (idx == 2) {
299 return dpasOp.setLayoutCdAttr(layout);
300 }
301 }
302 if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
303 return convertOp.setInputLayoutAttr(layout);
304 }
305
306 // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
307 // the layout is valid for the first two operands: value and memref/tdesc.
308 // For other operations, the layout applies to the first operand only.
309 if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
310 owner)) {
311 if (idx < 2) {
312 anchorOp.setAnchorLayout(layout);
313 }
314 } else {
315 if (idx == 0) {
316 anchorOp.setAnchorLayout(layout);
317 }
318 }
319 }
320
321 std::string name = xegpu::getTemporaryLayoutName(operand);
322 if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
323 return;
324 }
325 if (layout) {
326 owner->setAttr(name, layout);
327 }
328}
329
330template <typename T, typename>
331xegpu::DistributeLayoutAttr
332xegpu::getTemporaryLayout(const T &operandOrResult) {
333 Operation *op = operandOrResult.getOwner();
334
335 std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
336 if (op->hasAttr(layoutName)) {
337 auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
338 return layout;
339 }
340
341 return nullptr;
342}
343
344template xegpu::DistributeLayoutAttr
346template xegpu::DistributeLayoutAttr
348
349template <typename T, typename>
350void xegpu::setTemporaryLayout(const T &operandOrResult,
351 const xegpu::DistributeLayoutAttr layout) {
352 Operation *owner = operandOrResult.getOwner();
353 std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
354 if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
355 return;
356 }
357 if (layout) {
358 owner->setAttr(name, layout);
359 }
360}
361
363 const mlir::OpResult &result,
364 const mlir::xegpu::DistributeLayoutAttr layout);
365
367 const mlir::OpOperand &operand,
368 const mlir::xegpu::DistributeLayoutAttr layout);
369
373 auto vecTy = dyn_cast<VectorType>(value.getType());
374 if (!vecTy)
375 return {value};
376
377 ArrayRef<int64_t> srcShape = vecTy.getShape();
378 if (!computeShapeRatio(srcShape, shape))
379 return {value};
380
381 int64_t srcShapeRank = srcShape.size();
382 int64_t targetShapeRank = shape.size();
383
384 SmallVector<int64_t> adjustedTargetShape(srcShape.size());
385 int64_t rankDiff = srcShapeRank - targetShapeRank;
386 std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
387 1);
388 llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);
389
391 for (SmallVector<int64_t> offsets :
392 StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
393 SmallVector<int64_t> staticStrides(offsets.size(), 1);
394 Value slice = vector::ExtractStridedSliceOp::create(
395 builder, loc, value, offsets, adjustedTargetShape, staticStrides);
396
397 // Reshape to remove leading unit dims if needed
398 if (srcShapeRank > targetShapeRank) {
399 auto targetTy = VectorType::get(shape, vecTy.getElementType());
400 slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
401 }
402 result.push_back(slice);
403 }
404
405 return result;
406}
407
409 ValueRange values,
411 VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
412 assert(llvm::all_of(values.getTypes(),
413 [&](Type type) { return type == inputTy; }) &&
414 "values must be of the same VectorType");
415
416 Type elemTy = inputTy.getElementType();
417 ArrayRef<int64_t> tileShape = inputTy.getShape();
418
419 VectorType resultTy = VectorType::get(shape, elemTy);
420 auto zeroAttr = builder.getZeroAttr(elemTy);
421 Value result = arith::ConstantOp::create(
422 builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));
423
424 for (auto [src, offsets] :
425 llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
426 SmallVector<int64_t> staticStrides(tileShape.size(), 1);
427 result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
428 offsets, staticStrides);
429 }
430 return result;
431}
432
434 Operation *op, TypeConverter converter) {
435 MLIRContext *context = op->getContext();
436
437 auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
438 Location loc) -> Value {
439 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
440 .getResult(0);
441 };
442
443 { // convert VectorType to RankedTensorType for SCF Structural ops
444 TypeConverter converter;
445 converter.addConversion([](Type type) -> Type { return type; });
446 converter.addConversion([](VectorType type) -> Type {
447 return RankedTensorType::get(type.getShape(), type.getElementType());
448 });
449 converter.addSourceMaterialization(materializeCast);
450 converter.addTargetMaterialization(materializeCast);
451
452 mlir::ConversionTarget target(*context);
453 target.addLegalOp<UnrealizedConversionCastOp>();
454
455 mlir::RewritePatternSet patterns(context);
457 target);
458 (void)mlir::applyPartialConversion(op, target, std::move(patterns));
459 }
460
461 { // propagate the layout attribute to RankedTensorType by checking
462 // BuiltInUnrealizedCastOps
463 // for VectorType to RankedTensorType cast.
464 op->walk([](UnrealizedConversionCastOp castOp) {
465 if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
466 return WalkResult::skip();
467
468 Value input = castOp.getInputs()[0];
469 Value result = castOp.getResults()[0];
470 auto inputTy = dyn_cast<VectorType>(input.getType());
471 auto resultTy = dyn_cast<RankedTensorType>(result.getType());
472
473 // Only look at ops casting from VectorType to RankedTensorType
474 if (!inputTy || !resultTy)
475 return WalkResult::skip();
476
477 xegpu::DistributeLayoutAttr layout =
479 if (!layout)
480 return WalkResult::skip();
481
482 RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
483 result.setType(newTy);
484
485 // update the arguments if user is a LoopLike op.
486 for (OpOperand &use : result.getUses()) {
487 if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
488 BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
489 arg.setType(newTy);
490 }
491 // whileOp has two regions, the BlockArgument of the after region
492 // is not exposed by LoopLikeOpInterface
493 if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
494 unsigned idx = use.getOperandNumber();
495 BlockArgument arg = whileOp.getAfterArguments()[idx];
496 arg.setType(newTy);
497 }
498 }
499 return WalkResult::advance();
500 });
501
502 // using yieldOp as anchor to update the result type of its ParentOp
503 op->walk([](scf::YieldOp yieldOp) {
504 Operation *parentOp = yieldOp->getParentOp();
505 for (OpResult r : parentOp->getOpResults()) {
506 unsigned idx = r.getResultNumber();
507 Type resultTy = r.getType();
508 Type yieldTy = yieldOp.getResults()[idx].getType();
509 if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
510 r.setType(yieldTy);
511 }
512 });
513 }
514
515 { // perform the conversion from RankedTensorType to VectorType based on the
516 // DistributeLayoutAttr
517
518 // Handle the UnrealizedConversionCastOp introduced by the first step.
519 // For vector->RankedTensorType, it will simply forward the inputs.
520 // For RankedTensorType->vector, it will update the inputs with the
521 // one from the adaptor.
522 class UnrealizedConversionCastOpPattern
523 : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
524 using OpConversionPattern<
525 mlir::UnrealizedConversionCastOp>::OpConversionPattern;
526
527 mlir::LogicalResult
528 matchAndRewrite(mlir::UnrealizedConversionCastOp op,
529 OneToNOpAdaptor adaptor,
530 ConversionPatternRewriter &rewriter) const override {
531 auto inputs = op.getOperands();
532 auto outputs = op.getOutputs();
533
534 if (inputs.size() != 1 || outputs.size() != 1)
535 return failure();
536
537 auto inputTy = inputs[0].getType();
538 auto outputTy = outputs[0].getType();
539
540 if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
541 rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
542 return success();
543 }
544
545 if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
546 SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
547 auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
548 outputTy, values);
549 rewriter.replaceOp(op, newOp);
550 return success();
551 }
552 return failure();
553 }
554 };
555
556 converter.addSourceMaterialization(materializeCast);
557 converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
558 ValueRange inputs, Location loc) {
559 return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
560 .getResults();
561 });
562
563 mlir::ConversionTarget target(*context);
564 target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
565 [](UnrealizedConversionCastOp op) {
566 auto isTensorTy = [](Type type) {
567 return isa<RankedTensorType>(type);
568 };
569 return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
570 llvm::none_of(op->getResultTypes(), isTensorTy);
571 });
572 mlir::RewritePatternSet patterns(context);
573 patterns.insert<UnrealizedConversionCastOpPattern>(context);
575 target);
576 (void)mlir::applyPartialConversion(op, target, std::move(patterns));
577 }
578}
579
580std::optional<std::string> xegpu::getChipStr(Operation *op) {
581 auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();
582
583 if (!gpuModuleOp)
584 return std::nullopt;
585
586 auto targetAttrs = gpuModuleOp.getTargets();
587 if (targetAttrs) {
588 for (auto &attr : *targetAttrs) {
589 auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
590 if (xevmAttr)
591 return xevmAttr.getChip().str();
592 }
593 }
594
595 return std::nullopt;
596}
597
598/// Generates element-wise addition ops of two arrays with same length.
600 Location loc,
603 assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
605 for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
606 auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
607 auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
608 results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
609 }
610 return results;
611}
612
613/// Generates element-wise addition ops of two arrays with automatic alignment.
614/// When the input arrays have different sizes, the shorter array is
615/// right-aligned with the longer array, and the unmatched leading elements from
616/// the longer array are preserved unchanged. This is commonly used for offset
617/// computation where higher-dimensional offsets need to be added to
618/// lower-dimensional adjustments.
619///
620/// Example:
621/// lhs = [l1, l2, l3], rhs = [r1, r2]
622/// Result: [11, l2+r1, l3+r2]
627 // ensure a is longer than b
628 ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
629 ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
630 SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
631 a = a.slice(a.size() - b.size());
632 results.append(addElementwise(builder, loc, a, b));
633 return results;
634}
635
636template <typename T>
638 ArrayRef<T> candidateMultiples) {
639 static_assert(std::is_integral<T>::value, "T must be an integer type");
640 int largest = -1;
641 SmallVector<T> multiples = {1};
642 if (!candidateMultiples.empty())
643 multiples =
644 SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
645 for (T candidate : candidates) {
646 for (T multiple : multiples) {
647 int value = static_cast<int>(candidate * multiple);
648 if (value != 0 && dim % value == 0 && value > largest)
649 largest = value;
650 }
651 }
652 return largest;
653}
654
656 vector::CombiningKind kind, uint32_t size) {
657 // First reduce on a single thread to get per lane reduction value.
658 Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
659 // Parallel reduction using butterfly shuffles.
660 for (uint64_t i = 1; i < size; i <<= 1) {
661 Value shuffled =
662 gpu::ShuffleOp::create(builder, loc, laneVal, i, /** width = **/ size,
663 /** mode = **/ gpu::ShuffleMode::XOR)
664 .getShuffleResult();
665 laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
666 }
667 return laneVal;
668}
669
// NOTE(review): the function name and leading parameter lines were lost in
// extraction; from the body this takes a source vector `src`, accumulator
// `acc`, a combining kind, the reduction dim, a location and a rewriter, and
// lowers a 2D(-or-leading-unit) reduction into per-slice vector.reduction
// ops while propagating temporary layout attributes — confirm the exact
// signature against the original file.
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  VectorType sourceType = src.getType();
  int64_t sourceRank = sourceType.getRank();
  // Expecting at least a 2D source vector. Leading dimensions (all except the
  // last two) must be unit.
  assert(sourceRank >= 2 && "expected at least a 2D source vector");
  for (int64_t i = 0; i < sourceRank - 2; ++i)
    assert(sourceType.getShape()[i] == 1 &&
           "expected leading dimensions to be unit");
  int64_t rowIdx = sourceRank - 2;
  int64_t columnIdx = sourceRank - 1;
  int64_t sourceH = sourceType.getShape()[rowIdx];
  int64_t sourceW = sourceType.getShape()[columnIdx];
  // Reducing along rows leaves one result per column, and vice versa.
  int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
  // Reduction result should have the same layout as the accumulator.
  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  int64_t accRank = acc.getType().getRank();
  for (int i = 0; i < nSlices; ++i) {
    // Build nD offsets, sizes, and strides. Leading unit dims get
    // offset=0, size=1. The last two dims are set based on reductionDim.
    SmallVector<int64_t> sliceOffsets(sourceRank, 0);
    SmallVector<int64_t> sliceSizes(sourceRank, 1);
    SmallVector<int64_t> strides(sourceRank, 1);
    if (reductionDim == columnIdx) {
      sliceOffsets[rowIdx] = i;
      sliceSizes[columnIdx] = sourceW;
    } else {
      sliceOffsets[columnIdx] = i;
      sliceSizes[rowIdx] = sourceH;
    }

    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, strides);
    // Extract strided slice has the same layout as src.
    xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);

    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

    // Flatten the slice to 1D so a plain vector.reduction can consume it.
    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    // Shape cast output has the same layout as the accumulator. Shape cast
    // source has the same layout as the original reduction source.
    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
    // Extract and reduction results in scalars, so no result layout is needed.
    // Build multi-dim index into acc (sourceRank-1 dims, i.e. source shape with
    // the reduction dim removed). Leading unit dims get index 0.
    SmallVector<int64_t> accIdx(accRank, 0);
    accIdx[accRank - 1] = i;
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
                                               reductionResult, accIdx);
    // Insert op should have the same layout as the accumulator.
    xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  }
  return reductionResult;
}
746
// NOTE(review): the function name and leading parameter lines were lost in
// extraction; from the body this mirrors the layout-propagating variant
// above but additionally performs a cross-lane reduction over
// `reductionSize` lanes via xegpu::subgroupReduction — confirm the exact
// signature against the original file.
    vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
    Location loc, PatternRewriter &rewriter) {
  VectorType sourceType = src.getType();
  int64_t sourceRank = sourceType.getRank();
  // Expecting at least a 2D source vector. Leading dimensions (all except the
  // last two) must be unit.
  assert(sourceRank >= 2 && "expected at least a 2D source vector");
  for (int64_t i = 0; i < sourceRank - 2; ++i)
    assert(sourceType.getShape()[i] == 1 &&
           "expected leading dimensions to be unit");
  int64_t rowIdx = sourceRank - 2;
  int64_t columnIdx = sourceRank - 1;
  int64_t sourceH = sourceType.getShape()[rowIdx];
  int64_t sourceW = sourceType.getShape()[columnIdx];

  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));

  // nSlices is the number of reduction operations needed to reduce the entire
  // source vector. For example, if reductionDim is the row dim, we are
  // reducing across rows, and each slice is a column. So the number of slices
  // is the number of columns, which is sourceW.
  int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;

  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  int64_t accRank = acc.getType().getRank();
  for (int i = 0; i < nSlices; ++i) {
    // Build nD offsets, sizes, and strides. Leading unit dims get
    // offset=0, size=1. The last two dims are set based on reductionDim.
    SmallVector<int64_t> sliceOffsets(sourceRank, 0);
    SmallVector<int64_t> sliceSizes(sourceRank, 1);
    SmallVector<int64_t> strides(sourceRank, 1);
    if (reductionDim == columnIdx) {
      sliceOffsets[rowIdx] = i;
      sliceSizes[columnIdx] = sourceW;
    } else {
      sliceOffsets[columnIdx] = i;
      sliceSizes[rowIdx] = sourceH;
    }

    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, strides);
    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
    // Flatten the slice to 1D for the per-lane reduction.
    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    SmallVector<int64_t> accIdx(accRank, 0);
    accIdx[accRank - 1] = i;
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
    // Cross-lane butterfly reduction, then fold in the accumulator value.
    Value fullReduce =
        xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
    fullReduce =
        vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
    reductionResult = vector::InsertOp::create(rewriter, loc, fullReduce,
                                               reductionResult, accIdx);
  }
  return reductionResult;
}
814
// NOTE(review): the function name and builder/loc parameter lines were lost
// in extraction; from the body this builds the identity (neutral) element of
// a combining kind as a constant of `type` — a splat when `type` is a
// vector — returning null for kinds without an identity for the element
// type. Confirm the exact signature against the original file.
                                  Type type,
                                  vector::CombiningKind kind) {
  auto vecTy = dyn_cast<VectorType>(type);
  Type elemTy = vecTy ? vecTy.getElementType() : type;

  // Helper to create either a splat vector or scalar constant from an attr.
  auto makeConst = [&](Attribute scalarAttr) -> Value {
    if (vecTy)
      return arith::ConstantOp::create(
          builder, loc, vecTy, DenseElementsAttr::get(vecTy, scalarAttr));
    return arith::ConstantOp::create(builder, loc, cast<TypedAttr>(scalarAttr));
  };

  switch (kind) {
  // 0 is the identity for add, xor, or, and unsigned max.
  case vector::CombiningKind::ADD:
  case vector::CombiningKind::XOR:
  case vector::CombiningKind::OR:
  case vector::CombiningKind::MAXUI:
    return makeConst(builder.getZeroAttr(elemTy));

  // 1 (all-ones for AND on i1; getOneAttr for the element type) is the
  // identity for mul and bitwise-and.
  case vector::CombiningKind::MUL:
  case vector::CombiningKind::AND:
    return makeConst(builder.getOneAttr(elemTy));

  // Signed min: identity is the largest signed value.
  case vector::CombiningKind::MINSI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(builder.getIntegerAttr(
          elemTy, APInt::getSignedMaxValue(intTy.getWidth())));
    return nullptr;

  // Unsigned min: identity is the all-ones (max unsigned) value.
  case vector::CombiningKind::MINUI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(
          builder.getIntegerAttr(elemTy, APInt::getMaxValue(intTy.getWidth())));
    return nullptr;

  // Signed max: identity is the smallest signed value.
  case vector::CombiningKind::MAXSI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(builder.getIntegerAttr(
          elemTy, APInt::getSignedMinValue(intTy.getWidth())));
    return nullptr;

  // Float min: identity is +inf.
  case vector::CombiningKind::MINNUMF:
  case vector::CombiningKind::MINIMUMF:
    if (auto floatTy = dyn_cast<FloatType>(elemTy))
      return makeConst(builder.getFloatAttr(
          elemTy, APFloat::getInf(floatTy.getFloatSemantics())));
    return nullptr;

  // Float max: identity is -inf (getInf with Negative=true).
  case vector::CombiningKind::MAXNUMF:
  case vector::CombiningKind::MAXIMUMF:
    if (auto floatTy = dyn_cast<FloatType>(elemTy))
      return makeConst(builder.getFloatAttr(
          elemTy, APFloat::getInf(floatTy.getFloatSemantics(), true)));
    return nullptr;
  }
  return nullptr;
}
874
875/// Explicit instantiations
876template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
877 ArrayRef<int> candidateMultiples);
878template int
880 ArrayRef<unsigned> candidateMultiples);
881
882bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
883 if (!layout)
884 return false;
885 auto laneData = layout.getEffectiveLaneDataAsInt();
886 if (laneData.size() != 2)
887 return false;
888 return laneData[0] != 1;
889}
890
891bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
892 const xegpu::uArch::uArch *uArch) {
893 // Return false for unsupported targets.
894 // TODO: Add more support or move to target info.
895 if (uArch->getName().equals_insensitive("pvc") &&
896 uArch->getName().equals_insensitive("bmg"))
897 return false;
898 if (!layout)
899 return false;
900 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
901 if (laneLayout.size() != 2)
902 return false;
903 return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
904}
905
// Check if dst shape is an expansion of src shape by inserting unit dimensions.
// Returns true if all dimensions in src match corresponding dimensions in dst
// (after skipping unit dimensions), and populates expandedUnitDims with the
// indices of the unit dimensions in dst that were added (not present in src).
// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
// NOTE(review): the signature line was lost in extraction; presumably
// `static bool ...(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,` — confirm
// against the original file.
                            SmallVector<int64_t> &expandedUnitDims) {
  // All unit dimensions in dst that don't appear in src are the expanded
  // unit dimensions
  //
  // Single pass over dst: consume matching src dims in order; any other dim
  // must be a unit dim (recorded as expanded) or the shapes don't correspond.
  size_t srcIdx = 0;
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
    if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
      srcIdx++;
    else if (dst[dstIdx] == 1)
      expandedUnitDims.push_back(dstIdx);
    else
      return false;
  // Succeed only if every src dim was matched.
  return srcIdx == src.size();
}
925
// Checks if dst shape is an expansion of src shape where each dimension in src
// is split into one or more consecutive dimensions in dst whose product equals
// the original dimension. Populates splitDimGroups with groups of dst indices
// that correspond to each src dimension. Example: src=[6,4], dst=[2,3,2,2] ->
// true
// NOTE(review): the signature lines were lost in extraction; presumably
// `static bool ...(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,` — confirm
// against the original file.
    SmallVector<SmallVector<int64_t>> &splitDimGroups) {
  // each dim in src can be mapped to one or more dims in dst whose product
  // equals to the src dim
  size_t srcIdx = 0;
  int64_t accumulatedSize = 1;
  SmallVector<int64_t> currentDstDims;

  splitDimGroups.clear();
  // Greedily accumulate consecutive dst dims until they compose the current
  // src dim exactly; overshooting means the shapes are incompatible.
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
    if (srcIdx >= src.size())
      return false;
    accumulatedSize *= dst[dstIdx];
    currentDstDims.push_back(dstIdx);

    if (accumulatedSize == src[srcIdx]) {
      // Record the mapping: srcIdx -> currentDstDims
      splitDimGroups.push_back(currentDstDims);
      // move to next src dim
      srcIdx++;
      accumulatedSize = 1;
      currentDstDims.clear();
    } else if (accumulatedSize > src[srcIdx]) {
      return false;
    }
  }
  // Succeed only if every src dim was fully composed.
  return srcIdx == src.size();
}
return success()
lhs
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
xegpu::DistributeLayoutAttr maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, const OpResult &result, mlir::Operation *owner, const std::string &name)
Attributes are known-constant values of operations.
Definition Attributes.h:25
This class represents an argument of a Block.
Definition Value.h:306
IntegerAttr getIntegerAttr(Type type, int64_t value)
Definition Builders.cpp:232
FloatAttr getFloatAttr(Type type, double value)
Definition Builders.cpp:258
TypedAttr getZeroAttr(Type type)
Definition Builders.cpp:328
TypedAttr getOneAttr(Type type)
Definition Builders.cpp:346
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
IRValueT get() const
Return the current value being used by this operand.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class helps build Operations.
Definition Builders.h:209
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition Builders.h:528
This class represents an operand of an operation.
Definition Value.h:254
This is a value defined by a result of an operation.
Definition Value.h:454
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
AttrClass getAttrOfType(StringAttr name)
Definition Operation.h:576
bool hasAttrOfType(NameT &&name)
Definition Operation.h:601
bool hasAttr(StringAttr name)
Return true if the operation has an attribute with the provided name, false otherwise.
Definition Operation.h:586
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
Definition Operation.h:252
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:256
void setAttr(StringAttr name, Attribute value)
If an attribute exists with the specified name, change it to the new value.
Definition Operation.h:608
operand_type_range getOperandTypes()
Definition Operation.h:423
result_type_range getResultTypes()
Definition Operation.h:454
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:823
result_range getOpResults()
Definition Operation.h:446
MLIRContext * getContext()
Return the context this operation is associated with.
Definition Operation.h:234
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
A range-style iterator that allows for iterating over the offsets of all potential tiles of size tile...
This class provides an abstraction over the various different ranges of value types.
Definition TypeRange.h:40
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
type_range getTypes() const
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
void setType(Type newType)
Mutate the type of this Value to be of the specified type.
Definition Value.h:116
Type getType() const
Return the type of this value.
Definition Value.h:105
static WalkResult skip()
Definition WalkResult.h:48
static WalkResult advance()
Definition WalkResult.h:47
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of shape from a set of values using vector.insert_strided_slice.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given a src and an acc argument from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter)
Do type conversion for SCF structural ops, e.g., scf.for using SCF structure type conversion patterns...
bool requirePacked(const LayoutAttr layout)
Helper function to check if the layout is packed.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
SmallVector< Value > flattenValues(ArrayRef< ValueRange > values)
Flatten a set of ValueRange into a single SmallVector<Value>
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
SmallVector< OpFoldResult > addElementwise(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with same length.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
Definition Utils.cpp:307
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition Utils.cpp:114
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
virtual int getSubgroupSize() const =0
StringRef getName() const
Definition uArchBase.h:158