MLIR 23.0.0git
XeGPUSgToWiDistributeExperimental.cpp
Go to the documentation of this file.
//===- XeGPUSgToWiDistributeExperimental.cpp - XeGPU SG to WI Pass --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
19#include "mlir/IR/Builders.h"
21#include "mlir/IR/BuiltinOps.h"
23#include "mlir/IR/MLIRContext.h"
24#include "mlir/IR/Operation.h"
25#include "mlir/IR/Value.h"
26#include "mlir/IR/ValueRange.h"
28#include "llvm/ADT/SetVector.h"
29#include "llvm/Support/LogicalResult.h"
30#include "llvm/Support/raw_ostream.h"
31#include <optional>
32
33namespace mlir {
34namespace xegpu {
35#define GEN_PASS_DEF_XEGPUSGTOWIDISTRIBUTEEXPERIMENTAL
36#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
37} // namespace xegpu
38} // namespace mlir
39
40using namespace mlir;
41
42#define DEBUG_TYPE "xegpu-sg-to-wi-distribute-experimental"
43#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
44
45namespace {
46
47/// Casts the given vector value `v` to the expected vector type `expectedTy`.
48static Value castValueTo(ConversionPatternRewriter &rewriter,
49 TypedValue<VectorType> v, VectorType expectedTy) {
50 // If the type matches, simply return the value itself.
51 if (v.getType() == expectedTy)
52 return v;
53 // If only shape differs, use shape cast.
54 if (isa<VectorType>(v.getType()) &&
55 v.getType().getNumElements() == expectedTy.getNumElements())
56 return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
57
58 // Else create an unrealized cast.
59 auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
60 expectedTy, ValueRange{v});
61 return newOp.getResult(0);
62}
63
64/// Checks if all XeGPU anchor ops and vector results have valid layouts.
65static LogicalResult verifyLayouts(Operation *root) {
66 auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
67 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
68 auto layout = anchorOp.getAnchorLayout();
69 if (!layout) {
70 nestedOp->emitError("expected anchor layout attribute on operation");
71 return WalkResult::interrupt();
72 }
73 return WalkResult::advance();
74 }
75 // For each vector result, check if the op contains a result layout
76 // attribute.
77 for (OpResult result : nestedOp->getResults()) {
78 if (isa<VectorType>(result.getType())) {
80 if (!layout) {
81 nestedOp->emitError(
82 "expected result layout attribute on vector result");
83 return WalkResult::interrupt();
84 }
85 }
86 }
87 return WalkResult::advance();
88 });
89 return walkResult.wasInterrupted() ? failure() : success();
90}
91
92/// A vector::MultiDimReductionOp at subgroup level in expected form if, it has
93/// exactly 1 reduction dimension, it had valid result layout attribute, and
94/// result type can be distributed to lanes using the layout.
95static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
96 auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
97 // If no layout, not valid.
98 if (!resLayout || !resLayout.isForSubgroup())
99 return false;
100 VectorType resTy = dyn_cast<VectorType>(op.getType());
101 if (!resTy)
102 return false;
103 // Compute the distributed result vector type based on the layout.
104 FailureOr<VectorType> resDistTypeOrFailure =
105 getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
106 if (failed(resDistTypeOrFailure))
107 return false;
108 return op.getReductionDims().size() == 1;
109}
110
111/// A vector::MultiDimReductionOp is doing lane-local reduction if each workitem
112/// is doing its own local reduction. In this case the result layout ensures
113/// that result vector is distributed to lanes, i.e. the result vector type is
114/// different from the distributed result vector type.
115static bool isReductionLaneLocal(vector::MultiDimReductionOp op) {
116 // Must be valid MultiDimReductionOp.
117 assert(isValidSubgroupMultiReductionOp(op) && "Expecting a valid subgroup "
118 "MultiDimReductionOp");
119 auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
120 VectorType resTy = dyn_cast<VectorType>(op.getType());
121 auto resDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
122 return resTy != resDistTypeOrFailure.value();
123}
124
125/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
126/// op. This simply drops the layout attribute from the tensor descriptor type.
127struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
128 using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
129
130 LogicalResult
131 matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
132 ConversionPatternRewriter &rewriter) const override {
133 xegpu::TensorDescType resultType = op.getType();
134 // If no layout, nothing to do.
135 if (!resultType.getLayout())
136 return failure();
137
138 auto newOp = xegpu::CreateNdDescOp::create(
139 rewriter, op.getLoc(), resultType.dropLayouts(), op.getOperands(),
140 op->getAttrs());
141 rewriter.replaceOp(op, newOp.getResult());
142 return success();
143 }
144};
145
146/// Distributes a subgroup-level LoadNd op to workitem-level LoadNd op. Output
147/// of workitem-level LoadNd op is 1D. ShapeCast is added to restore the
148/// original rank.
149struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
150 using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
151
152 LogicalResult
153 matchAndRewrite(xegpu::LoadNdOp op, OpAdaptor adaptor,
154 ConversionPatternRewriter &rewriter) const override {
155 xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
156 // If no layout, nothing to do.
157 if (!layout)
158 return failure();
159 // Check if the layout attached to the tensor descriptor is same as the
160 // anchor layout. Otherwise, this is a conflict.
161 if (op.getTensorDescType().getLayout() != layout)
162 return rewriter.notifyMatchFailure(
163 op, "conflicting layout attributes on tensor descriptor and anchor");
164 auto uArch = getUArch(xegpu::getChipStr(op).value_or(""));
165 if (!uArch)
166 return rewriter.notifyMatchFailure(
167 op, "xegpu::LoadNdOp require target attribute attached to "
168 "determine transpose "
169 "requirement");
170 auto supportedWiResultTyOrFailure =
171 xegpu::getDistributedVectorType(op.getTensorDescType());
172 auto expectedWiResultTyOrFailure =
173 xegpu::getDistVecTypeBasedOnLaneLayout(layout, op.getType());
174 if (failed(supportedWiResultTyOrFailure))
175 return rewriter.notifyMatchFailure(
176 op, "unable to compute the workitem vector type for LoadNdOp");
177 if (failed(expectedWiResultTyOrFailure))
178 return rewriter.notifyMatchFailure(
179 op,
180 "unable to compute expected workitem vector type from lane layout");
181 auto newOp = xegpu::LoadNdOp::create(
182 rewriter, op.getLoc(), supportedWiResultTyOrFailure.value(),
183 adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
184 op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
185 op.getL3HintAttr(), /**layout**/ nullptr);
186 // Set the packed attribute if the layout requires it.
187 newOp.setPacked(xegpu::requirePacked(cast<xegpu::LayoutAttr>(layout)));
188 // Set the transpose attribute if the layout requires it.
189 if (xegpu::requireTranspose(cast<xegpu::LayoutAttr>(layout), uArch))
190 newOp.setTranspose(DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
191 rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
192 expectedWiResultTyOrFailure.value()));
193 return success();
194 }
195};
196
197/// Distributes a subgroup-level StoreNd op to workitem-level StoreNd op. Stored
198/// value in workitem-level StoreNd op is 1D. ShapeCast is added to cast the
199/// incoming value to 1D.
200struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
201 using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
202
203 LogicalResult
204 matchAndRewrite(xegpu::StoreNdOp op, OpAdaptor adaptor,
205 ConversionPatternRewriter &rewriter) const override {
206 xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
207 // If no layout, nothing to do.
208 if (!layout)
209 return failure();
210 // Check if the layout attached to the tensor descriptor and value layout is
211 // same as the anchor layout. Otherwise, this is a conflict.
212 if (op.getTensorDescType().getLayout() != layout)
213 return rewriter.notifyMatchFailure(
214 op, "conflicting layout attributes on tensor descriptor and anchor");
215 auto valueLayout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
216 if (valueLayout != layout)
217 return rewriter.notifyMatchFailure(
218 op, "conflicting layout attributes on value and anchor");
219 auto supportedWiValueTyOrFailure =
220 xegpu::getDistributedVectorType(op.getTensorDescType());
221 if (failed(supportedWiValueTyOrFailure))
222 return rewriter.notifyMatchFailure(
223 op,
224 "unable to compute wi vector type for StoreNdOp value from tensor "
225 "descriptor");
226
227 xegpu::StoreNdOp::create(
228 rewriter, op.getLoc(),
229 castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
230 supportedWiValueTyOrFailure.value()),
231 adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
232 op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
233 rewriter.eraseOp(op);
234 return success();
235 }
236};
237
238/// Distributes a subgroup-level Dpas op to workitem-level Dpas op. All inpputs
239/// and output of workitem-level Dpas op are 1D. Necessary casts are added to
240/// convert the inputs and output to/from 1D.
241struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
242 using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
243
244 LogicalResult
245 matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
246 ConversionPatternRewriter &rewriter) const override {
247 // llvm::errs() << "DpasOpPattern matchAndRewrite called\n";
248 // Check if the op has A, B and CD layouts attached.
249 auto layoutA = cast<xegpu::LayoutAttr>(op.getLayoutAAttr());
250 auto layoutB = cast<xegpu::LayoutAttr>(op.getLayoutBAttr());
251 auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
252 if (!layoutA || !layoutB || !layoutCd)
253 return failure();
254 // llvm::errs() << "tryning to calculate wi types for dpas op\n";
255 auto wiResultTyOrFailure =
256 xegpu::getDistributedVectorType(op.getType(), layoutCd);
257 auto wiATypeOrFailure =
258 xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
259 auto wiBTypeOrFailure =
260 xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
261 auto expectedWiResultTyOrFailure =
262 xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
263 if (failed(wiResultTyOrFailure) || failed(wiATypeOrFailure) ||
264 failed(wiBTypeOrFailure))
265 return rewriter.notifyMatchFailure(
266 op, "failed to calculate supported workitem vector types for DpasOp "
267 "from layouts");
268 if (failed(expectedWiResultTyOrFailure))
269 return rewriter.notifyMatchFailure(
270 op, "unable to compute expected workitem vector type for DpasOp from "
271 "lane layout");
272 auto newOp = xegpu::DpasOp::create(
273 rewriter, op->getLoc(), wiResultTyOrFailure.value(),
274 castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
275 wiATypeOrFailure.value()),
276 castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
277 wiBTypeOrFailure.value()),
278 castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
279 wiResultTyOrFailure.value()),
280 /** layoutA**/ nullptr,
281 /** layoutB**/ nullptr, /** layoutCd**/ nullptr);
282 // Explicitly set the new types to enable correct type materializations.
283 rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
284 expectedWiResultTyOrFailure.value()));
285 return success();
287};
289/// Distributes elementwise ops to workitem-level elementwise ops. This
290/// currently handles elementwise ops with single result only.
291struct SgToWiElementWise : public ConversionPattern {
292 SgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
293 : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
295 LogicalResult
296 matchAndRewrite(Operation *op, ArrayRef<Value> operands,
297 ConversionPatternRewriter &rewriter) const override {
298 // Only match ops with elementwise trait and single result.
300 return failure();
302 auto resultType = dyn_cast<VectorType>(op->getResult(0).getType());
303 if (!resultType)
304 return rewriter.notifyMatchFailure(
305 op, "operation result is not a vector type");
307 xegpu::DistributeLayoutAttr layout =
308 xegpu::getTemporaryLayout(llvm::cast<OpResult>(op->getResult(0)));
309 if (!layout || !layout.isForSubgroup())
310 return rewriter.notifyMatchFailure(
311 op, "operation result does not have subgroup distribute layout");
313 auto wiShapeOrFailure =
314 xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
315
316 if (failed(wiShapeOrFailure))
317 return rewriter.notifyMatchFailure(
318 op, "unable to compute workitem vector type from the layout");
319
320 VectorType newResultType = wiShapeOrFailure.value();
321 OperationState state(op->getLoc(), op->getName());
322 state.addOperands(operands);
323 state.addTypes(newResultType);
324 // Copy all attributes except for DistributeLayoutAttr.
325 for (auto attr : op->getAttrs()) {
326 if (!isa<xegpu::DistributeLayoutAttr>(attr.getValue()))
327 state.addAttribute(attr.getName(), attr.getValue());
328 }
329 Operation *newOp = rewriter.create(state);
330
331 rewriter.replaceOp(op, newOp->getResult(0));
332 return success();
333 }
334};
335
336/// Distributes a subgroup-level arith ConstantOp to workitem-level arith
337/// ConstantOp.
338struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
339 using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
340
341 LogicalResult
342 matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
343 ConversionPatternRewriter &rewriter) const override {
344 auto resultType = dyn_cast<VectorType>(op.getType());
345 if (!resultType)
346 return failure();
347
348 // Only handle dense vector constants
349 auto dense = dyn_cast<SplatElementsAttr>(op.getValue());
350 if (!dense)
351 return rewriter.notifyMatchFailure(
352 op, "only dense splat vector constants are supported");
353
354 xegpu::DistributeLayoutAttr layout =
355 xegpu::getTemporaryLayout(llvm::cast<OpResult>(op.getResult()));
356 if (!layout || !layout.isForSubgroup())
357 return rewriter.notifyMatchFailure(
358 op, "operation result does not have subgroup distribute layout");
359
360 auto wiShapeOrFailure =
361 xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
362
363 if (failed(wiShapeOrFailure))
364 return rewriter.notifyMatchFailure(
365 op, "unable to compute workitem vector type from the layout");
366
367 VectorType newResultType = wiShapeOrFailure.value();
368 auto sclarValue = dense.getSplatValue<Attribute>();
369 auto newDenseAttr = DenseElementsAttr::get(newResultType, sclarValue);
370
371 auto newOp = arith::ConstantOp::create(rewriter, op.getLoc(), newResultType,
372 newDenseAttr);
373 rewriter.replaceOp(op, newOp.getResult());
374 return success();
375 }
376};
377
378/// Distributes a subgroup-level PrefetchNd op to workitem-level PrefetchNd op.
379struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
380 using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
381
382 LogicalResult
383 matchAndRewrite(xegpu::PrefetchNdOp op, OpAdaptor adaptor,
384 ConversionPatternRewriter &rewriter) const override {
385 xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
386 // If no layout, nothing to do.
387 if (!layout)
388 return failure();
389
390 xegpu::PrefetchNdOp::create(rewriter, op.getLoc(), adaptor.getTensorDesc(),
391 op.getMixedOffsets(), op.getL1HintAttr(),
392 op.getL2HintAttr(), op.getL3HintAttr(),
393 /**layout**/ nullptr);
394 rewriter.eraseOp(op);
395 return success();
396 }
397};
398
399/// Distributes a subgroup-level LoadGather (xegpu.load) op to workitem-level.
400///
401/// Example 1 (1D, no chunk size):
402/// layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
403/// %mask = producer_op : vector<16xi1>
404/// %offset = producer_op : vector<16xindex>
405/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
406/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
407/// Distributed to:
408/// %mask = producer_op : vector<1xi1>
409/// %offset = producer_op : vector<1xindex>
410/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
411/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
412///
413/// Example 2 (2D with chunk size, same mask & offset):
414/// layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
415/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
416/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
417/// Distributed to:
418/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
419/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
420///
421/// Example 3 (3D with leading unit dims):
422/// layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
423/// %mask = producer_op : vector<1x1x16xi1>
424/// %offset = producer_op : vector<1x1x16xindex>
425/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
426/// vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
427/// Distributed to:
428/// %mask = producer_op : vector<1x1x1xi1>
429/// %offset = producer_op : vector<1x1x1xindex>
430/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
431/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
432struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
433 using OpConversionPattern<xegpu::LoadGatherOp>::OpConversionPattern;
434
435 LogicalResult
436 matchAndRewrite(xegpu::LoadGatherOp op, OpAdaptor adaptor,
437 ConversionPatternRewriter &rewriter) const override {
438 xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
439 if (!layout)
440 return failure();
441
442 VectorType origResultTy = op.getValueType();
443 if (!origResultTy)
444 return failure();
445
446 // Check that leading dimensions are unit.
447 int chunkSize = op.getChunkSize().value_or(1);
448 int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
449 ArrayRef<int64_t> shape = origResultTy.getShape();
450 if (llvm::any_of(
451 shape.take_front(origResultTy.getRank() - effectiveVecRank),
452 [](int64_t d) { return d != 1; }))
453 return rewriter.notifyMatchFailure(
454 op, "Only unit dimensions allowed for the leading "
455 "dimensions of the load vector!");
456
457 auto distResultTyOrFailure =
458 xegpu::getDistVecTypeBasedOnLaneLayout(layout, origResultTy);
459 if (failed(distResultTyOrFailure))
460 return rewriter.notifyMatchFailure(
461 op,
462 "unable to compute expected workitem vector type from lane layout");
463
464 VectorType distResultTy = distResultTyOrFailure.value();
465 VectorType distResultTy1D = VectorType::get({distResultTy.getNumElements()},
466 distResultTy.getElementType());
467
468 // Flatten offsets and mask to 1D to match the 1D result type.
469 Value distOffsets = adaptor.getOffsets();
470 auto distOffsetsTy = cast<VectorType>(distOffsets.getType());
471 VectorType offsetsTy1D = VectorType::get({distOffsetsTy.getNumElements()},
472 distOffsetsTy.getElementType());
473 distOffsets = castValueTo(
474 rewriter, cast<TypedValue<VectorType>>(distOffsets), offsetsTy1D);
475
476 Value distMask = adaptor.getMask();
477 auto distMaskTy = cast<VectorType>(distMask.getType());
478 VectorType maskTy1D = VectorType::get({distMaskTy.getNumElements()},
479 distMaskTy.getElementType());
480 distMask =
481 castValueTo(rewriter, cast<TypedValue<VectorType>>(distMask), maskTy1D);
482
483 Value distSource = adaptor.getSource();
484 auto newOp = xegpu::LoadGatherOp::create(
485 rewriter, op.getLoc(), distResultTy1D, distSource, distOffsets,
486 distMask, op.getChunkSizeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
487 op.getL3HintAttr(), /*layout=*/nullptr);
488
489 Value result = newOp->getResult(0);
490 if (distResultTy1D != distResultTy)
491 result = castValueTo(rewriter, cast<TypedValue<VectorType>>(result),
492 distResultTy);
493 rewriter.replaceOp(op, result);
494 return success();
495 }
496};
497
498/// This pattern distributes a subgroup-level vector.reduction op to
499/// workitem-level. This require shuffling the data across the workitems (using
500/// gpu::ShuffleOp) and reducing in stages until all workitems have the final
501/// result.
502struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
503 using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
504
505 LogicalResult
506 matchAndRewrite(vector::ReductionOp op, OpAdaptor adaptor,
507 ConversionPatternRewriter &rewriter) const override {
508 auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
509
510 // If no layout, nothing to do.
511 if (!layout || !layout.isForSubgroup())
512 return failure();
513
514 VectorType srcVecType = op.getSourceVectorType();
515 // Only rank 1 vectors supported.
516 if (srcVecType.getRank() != 1)
517 return rewriter.notifyMatchFailure(
518 op, "Only rank 1 reductions can be distributed.");
519 // Lane layout must have the same rank as the vector.
520 if (layout.getRank() != srcVecType.getRank())
521 return rewriter.notifyMatchFailure(
522 op, "Layout rank does not match vector rank.");
523
524 // Get the subgroup size from the layout.
525 int64_t sgSize = layout.getEffectiveLaneLayoutAsInt()[0];
526 const uArch *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
527 if (!uArch)
528 return rewriter.notifyMatchFailure(
529 op, "xegpu::ReductionOp require target attribute attached to "
530 "determine subgroup size");
531
532 // Only subgroup-sized vectors supported.
533 if (sgSize != uArch->getSubgroupSize() ||
534 srcVecType.getShape()[0] % sgSize != 0)
535 return rewriter.notifyMatchFailure(op,
536 "Invalid layout or reduction vector "
537 "dimension must match subgroup size.");
538
539 if (!op.getType().isIntOrFloat())
540 return rewriter.notifyMatchFailure(
541 op, "Reduction distribution currently only supports floats and "
542 "integer types.");
543
544 // Get the distributed vector (per work-item portion).
545 Value laneValVec = adaptor.getVector();
546
547 // Distribute and reduce across work-items in the subgroup.
548 Value fullReduce = xegpu::subgroupReduction(
549 op.getLoc(), rewriter, laneValVec, op.getKind(), sgSize);
550
551 // If there's an accumulator, combine it with the reduced value.
552 if (adaptor.getAcc())
553 fullReduce = vector::makeArithReduction(
554 rewriter, op.getLoc(), op.getKind(), fullReduce, adaptor.getAcc());
555
556 rewriter.replaceOp(op, fullReduce);
557 return success();
558 }
559};
560
561/// This pattern distributes a subgroup-level vector.multi_reduction op to
562/// workitem-level only if the reduction is lane-local. This means that
563/// reduction dimension is not distributed to lanes and each lane does its own
564/// local reduction.
565struct SgToWiMultiDimReduction
566 : public OpConversionPattern<vector::MultiDimReductionOp> {
567 using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
568
569 LogicalResult
570 matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
571 ConversionPatternRewriter &rewriter) const override {
572 Value result;
573 ArrayRef<int64_t> reductionDims = op.getReductionDims();
574 assert(reductionDims.size() == 1 &&
575 "Expecting single reduction dimension for subgroup multi "
576 "reduction op");
577 if (isReductionLaneLocal(op)) {
578 auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
579 VectorType resVecTy = dyn_cast<VectorType>(op.getType());
580 auto resDistVecTyOrFailure =
581 getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
582 // For lane local reduction, simply create a new MultiDimReductionOp using
583 // adaptor operands and the new result type.
584 result = vector::MultiDimReductionOp::create(
585 rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
586 adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
587 } else {
588 auto reductionDim = reductionDims[0];
589 VectorType sourceType = op.getSourceVectorType();
590 int64_t reductionDimSize = sourceType.getShape()[reductionDim];
592 cast<TypedValue<VectorType>>(adaptor.getSource()),
593 cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
594 reductionDim, reductionDimSize, op.getLoc(), rewriter);
595 }
596 rewriter.replaceOp(op, result);
597 return success();
598 }
599};
600
601/// Helper to compute distributed coordinates for matrix ops.
602/// When not using subgroup_block_io, each workitem computes its own
603/// coordinates based on the layout and lane ID.
604static SmallVector<Value> computeDistributedCoordsForMatrixOp(
605 ConversionPatternRewriter &rewriter, Location loc,
606 xegpu::DistributeLayoutAttr layout, ArrayRef<int64_t> payloadShape,
607 ValueRange origOffsets) {
608 Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
609 /*upperBound=*/mlir::IntegerAttr());
610 auto maybeCoords =
611 layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
612 if (failed(maybeCoords))
613 return {};
614 assert(maybeCoords.value().size() == 1 &&
615 "Expected one set of distributed offsets");
617 rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
618 getAsOpFoldResult(origOffsets));
619 return llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
620}
621
622/// This pattern distributes a subgroup-level LoadMatrix op to workitem-level.
623struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
624 using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;
625
626 LogicalResult
627 matchAndRewrite(xegpu::LoadMatrixOp op, OpAdaptor adaptor,
628 ConversionPatternRewriter &rewriter) const override {
629 auto layout = op.getLayoutAttr();
630 // If no layout, nothing to do.
631 if (!layout)
632 return failure();
633
634 VectorType sgPayloadTy = dyn_cast<VectorType>(op.getResult().getType());
635 if (!sgPayloadTy)
636 return rewriter.notifyMatchFailure(
637 op, "the matrix op payload must be a vector type");
638
639 auto loc = op.getLoc();
640 auto offsets = op.getMixedOffsets();
641 if (offsets.empty())
642 return rewriter.notifyMatchFailure(op, "the load op must have offsets");
643
644 FailureOr<VectorType> distPayloadTyOrFailure =
645 getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
646 if (failed(distPayloadTyOrFailure))
647 return rewriter.notifyMatchFailure(
648 op, "Failed to distribute matrix op payload based on layout.");
649
650 SmallVector<Value> offsetsAsValues =
651 vector::getAsValues(rewriter, loc, offsets);
652
653 SmallVector<Value> newCoords = offsetsAsValues;
654 if (!op.getSubgroupBlockIoAttr()) {
655 newCoords = computeDistributedCoordsForMatrixOp(
656 rewriter, loc, layout, sgPayloadTy.getShape(), offsetsAsValues);
657 if (newCoords.empty())
658 return rewriter.notifyMatchFailure(
659 op, "Failed to compute distributed coordinates.");
660 }
661
662 SmallVector<int64_t> newConstOffsets(op.getConstOffsets().size(),
663 ShapedType::kDynamic);
664 DenseI64ArrayAttr newConstOffsetsAttr =
665 rewriter.getDenseI64ArrayAttr(newConstOffsets);
666
667 auto newOp = xegpu::LoadMatrixOp::create(
668 rewriter, loc, *distPayloadTyOrFailure, adaptor.getMemDesc(),
669 ValueRange(newCoords), newConstOffsetsAttr, op.getSubgroupBlockIoAttr(),
670 xegpu::DistributeLayoutAttr{});
671 rewriter.replaceOp(op, newOp.getResult());
672 return success();
673 }
674};
675
676/// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
677struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
678 using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
679
680 LogicalResult
681 matchAndRewrite(xegpu::StoreMatrixOp op, OpAdaptor adaptor,
682 ConversionPatternRewriter &rewriter) const override {
683 auto layout = op.getLayoutAttr();
684 // If no layout, nothing to do.
685 if (!layout)
686 return failure();
687
688 VectorType sgPayloadTy = dyn_cast<VectorType>(op.getData().getType());
689 if (!sgPayloadTy)
690 return rewriter.notifyMatchFailure(
691 op, "the matrix op payload must be a vector type");
692
693 auto loc = op.getLoc();
694 auto offsets = op.getMixedOffsets();
695 if (offsets.empty())
696 return rewriter.notifyMatchFailure(op, "the store op must have offsets");
697
698 FailureOr<VectorType> distPayloadTyOrFailure =
699 getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
700 if (failed(distPayloadTyOrFailure))
701 return rewriter.notifyMatchFailure(
702 op, "Failed to distribute matrix op payload based on layout.");
703
704 SmallVector<Value> offsetsAsValues =
705 vector::getAsValues(rewriter, loc, offsets);
706
707 SmallVector<Value> newCoords = offsetsAsValues;
708 if (!op.getSubgroupBlockIoAttr()) {
709 newCoords = computeDistributedCoordsForMatrixOp(
710 rewriter, loc, layout, sgPayloadTy.getShape(), offsetsAsValues);
711 if (newCoords.empty())
712 return rewriter.notifyMatchFailure(
713 op, "Failed to compute distributed coordinates.");
714 }
715
716 SmallVector<int64_t> newConstOffsets(op.getConstOffsets().size(),
717 ShapedType::kDynamic);
718 DenseI64ArrayAttr newConstOffsetsAttr =
719 rewriter.getDenseI64ArrayAttr(newConstOffsets);
720
721 xegpu::StoreMatrixOp::create(
722 rewriter, loc, TypeRange{},
723 castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getData()),
724 distPayloadTyOrFailure.value()),
725 adaptor.getMemDesc(), ValueRange(newCoords), newConstOffsetsAttr,
726 op.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
727 rewriter.eraseOp(op);
728 return success();
729 }
730};
731
732/// Distributes a subgroup-level StoreScatter (xegpu.store) op to
733/// workitem-level.
734///
735/// Example 1 (1D, no chunk size):
736/// layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
737/// %mask = producer_op : vector<16xi1>
738/// %offset = producer_op : vector<16xindex>
739/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
740/// memref<256xf16>, vector<16xindex>, vector<16xi1>
741/// Distributed to:
742/// %mask = producer_op : vector<1xi1>
743/// %offset = producer_op : vector<1xindex>
744/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
745/// memref<256xf16>, vector<1xindex>, vector<1xi1>
746///
747/// Example 2 (2D with chunk size, same mask & offset):
748/// layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
749/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
750/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
751/// Distributed to:
752/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
753/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
754///
755/// Example 3 (3D with leading unit dims):
756/// layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
757/// %mask = producer_op : vector<1x1x16xi1>
758/// %offset = producer_op : vector<1x1x16xindex>
759/// xegpu.store %payload, %src[%offset], %mask : vector<1x1x16xf16>,
760/// memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
761/// Distributed to:
762/// %mask = producer_op : vector<1x1x1xi1>
763/// %offset = producer_op : vector<1x1x1xindex>
764/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
765/// memref<256xf16>, vector<1xindex>, vector<1xi1>
766struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
767 using OpConversionPattern<xegpu::StoreScatterOp>::OpConversionPattern;
768
769 LogicalResult
770 matchAndRewrite(xegpu::StoreScatterOp op, OpAdaptor adaptor,
771 ConversionPatternRewriter &rewriter) const override {
772 xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
773 if (!layout)
774 return failure();
775
776 VectorType origValueTy = op.getValueType();
777 if (!origValueTy)
778 return failure();
779
780 // Check that all leading dimensions are unit dimensions.
781 int chunkSize = op.getChunkSize().value_or(1);
782 int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
783 ArrayRef<int64_t> shape = origValueTy.getShape();
784 if (llvm::any_of(shape.take_front(origValueTy.getRank() - effectiveVecRank),
785 [](int64_t d) { return d != 1; }))
786 return rewriter.notifyMatchFailure(
787 op, "Only unit dimensions allowed for the leading "
788 "dimensions of the store vector!");
789
790 auto distValueTyOrFailure =
791 xegpu::getDistVecTypeBasedOnLaneLayout(layout, origValueTy);
792 if (failed(distValueTyOrFailure))
793 return rewriter.notifyMatchFailure(
794 op,
795 "unable to compute expected workitem vector type from lane layout");
796
797 VectorType distValueTy = distValueTyOrFailure.value();
798 VectorType distValueTy1D = VectorType::get({distValueTy.getNumElements()},
799 distValueTy.getElementType());
800
801 Value distValue = adaptor.getValue();
802 if (distValue.getType() != distValueTy1D)
803 distValue = castValueTo(rewriter, cast<TypedValue<VectorType>>(distValue),
804 distValueTy1D);
805
806 // Flatten offsets and mask to 1D to match the 1D value type.
807 Value distOffsets = adaptor.getOffsets();
808 auto distOffsetsTy = cast<VectorType>(distOffsets.getType());
809 VectorType offsetsTy1D = VectorType::get({distOffsetsTy.getNumElements()},
810 distOffsetsTy.getElementType());
811 distOffsets = castValueTo(
812 rewriter, cast<TypedValue<VectorType>>(distOffsets), offsetsTy1D);
813
814 Value distMask = adaptor.getMask();
815 auto distMaskTy = cast<VectorType>(distMask.getType());
816 VectorType maskTy1D = VectorType::get({distMaskTy.getNumElements()},
817 distMaskTy.getElementType());
818 distMask =
819 castValueTo(rewriter, cast<TypedValue<VectorType>>(distMask), maskTy1D);
820
821 Value distDest = adaptor.getDest();
822 xegpu::StoreScatterOp::create(rewriter, op.getLoc(), distValue, distDest,
823 distOffsets, distMask, op.getChunkSizeAttr(),
824 op.getL1HintAttr(), op.getL2HintAttr(),
825 op.getL3HintAttr(), /*layout=*/nullptr);
826 rewriter.eraseOp(op);
827 return success();
828 }
829};
830
831struct XeGPUSgToWiDistributeExperimentalPass
833 XeGPUSgToWiDistributeExperimentalPass> {
834 void runOnOperation() override;
835};
836
837} // namespace
838
839void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
840
841 // Verify if all XeGPU anchor ops and vector ops have result layouts.
842 // TODO: This can be removed once the full layout refactoring is done.
843 Operation *root = getOperation();
844 if (failed(verifyLayouts(root))) {
845 LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
846 "verification failed\n");
847 signalPassFailure();
848 return;
849 }
850 // Collect existing UnrealizedConversionCastOps. These must be preserved.
851 llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
852 root->walk(
853 [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });
854 // Perform a structural type conversion to convert structural ops to have WI
855 // types. This will insert UnrealizedConversionCastOps to make the IR
856 // valid.
857 auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
858 mlir::ValueRange inputs,
859 mlir::Location loc) -> mlir::Value {
860 UnrealizedConversionCastOp castOp =
861 UnrealizedConversionCastOp::create(builder, loc, type, inputs);
862 return castOp.getResult(0);
863 };
864 {
865 ConversionTarget target(getContext());
866 TypeConverter typeConverter;
867 RewritePatternSet patterns(&getContext());
868 typeConverter.addSourceMaterialization(materializeCast);
869 typeConverter.addTargetMaterialization(materializeCast);
872 patterns, target);
874 typeConverter, patterns, target);
875 target.addLegalOp<UnrealizedConversionCastOp>();
876 (void)applyPartialConversion(root, target, std::move(patterns));
877 }
878 // Structural type conversion can generate some redundant
879 // UnrealizedConversionCastOps to materialize the SG type from type converted
880 // WI type. These are redundant at this point and can be eliminated by
881 // inserting shape casts instead.
882 // Example:
883 // %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32> to vector<16x16xf32>
884 // %2 = UnrealizedConversionCastOp %1 : vector<16x16xf32> to vector<16xf32>
885 // This can be replaced with:
886 // %2 = vector.shape_cast %0 : vector<16x1xf32> to vector<16xf32>
887 OpBuilder builder(root);
888 root->walk([&](UnrealizedConversionCastOp op) {
889 // If this op existed before, nothing to do.
890 if (existingCasts.contains(op))
891 return;
892 // number of inputs and outputs must be 1.
893 if (op.getNumOperands() != 1 || op.getNumResults() != 1)
894 return;
895 // Both input and output types must be vector types.
896 auto singleInput = op.getInputs()[0];
897 auto inputTy = dyn_cast<VectorType>(singleInput.getType());
898 auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
899 if (!inputTy || !outputTy)
900 return;
901
902 // Check if the defining op of the input is also an
903 // UnrealizedConversionCastOp and it has a single user (which is this
904 // op).
905 auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
906 if (!definingOp || !definingOp->hasOneUse())
907 return;
908 auto inputOfDefiningOp = definingOp.getInputs()[0];
909 // If the input of the defining op and output type are both vector types
910 // have same number of elements, insert a shape cast.
911 auto inputOfDefiningOpTy =
912 dyn_cast<VectorType>(inputOfDefiningOp.getType());
913 if (inputOfDefiningOpTy &&
914 inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
915 builder.setInsertionPoint(op);
916 auto shapeCast = vector::ShapeCastOp::create(builder, op.getLoc(),
917 outputTy, inputOfDefiningOp);
918 op.replaceAllUsesWith(ValueRange{shapeCast.getResult()});
919 return;
920 }
921 });
922 // At this point, we will have some dead UnrealizedConversionCastOps. Just
923 // erase them.
924 bool changed = true;
925 while (changed) {
926 changed = false;
927 root->walk([&](UnrealizedConversionCastOp op) {
928 // Skip existing casts.
929 if (existingCasts.contains(op))
930 return;
931 if (op.use_empty()) {
932 op.erase();
933 changed = true;
934 }
935 });
936 }
937}
938
940 TypeConverter &typeConverter) {
941 // Any type other than TensorDescType and VectorType are legal as is.
942 typeConverter.addConversion([](Type type) -> std::optional<Type> {
943 if (!isa<TensorDescType, VectorType>(type))
944 return type;
945 return std::nullopt;
946 });
947 // For TensorDescType, drop the layout attribute if any.
948 typeConverter.addConversion([](TensorDescType type) -> Type {
949 if (type.getLayoutAttr()) {
950 return type.dropLayouts();
951 }
952 return type;
953 });
954 // For VectorType, check if there is a distribute layout attribute on the
955 // value. If so, convert to the distributed vector type based on the layout.
956 typeConverter.addConversion([](Value v) -> std::optional<Type> {
957 auto type = v.getType();
958 // If value is not vector type, nothing to do.
959 if (!isa<VectorType>(type))
960 return std::nullopt;
961 auto layout = xegpu::getDistributeLayoutAttr(v);
962 if (!layout || !layout.isForSubgroup())
963 return type;
964 // Vector type is distributed based on lane layout.
965 auto newTyOrFailure =
966 getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
967 if (failed(newTyOrFailure))
968 return type;
969 return *newTyOrFailure;
970 });
971}
972
974 TypeConverter &typeConverter, RewritePatternSet &patterns,
977 // CreateNdDescOp is legal only if its result type has no layout attribute.
978 target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
979 [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
980 // Any anchor XeGPU op is legal only if it has no anchor layout.
981 target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
982 auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
983 if (!anchorOp)
984 return true;
985 return !anchorOp.getAnchorLayout();
986 });
987 // Arith constants are legal only if they have no temporary layout attribute.
988 target.addDynamicallyLegalOp<arith::ConstantOp>(
989 [=](arith::ConstantOp op) -> bool {
990 // If the result type is not a vector, it's legal.
991 if (!isa<VectorType>(op.getResult().getType()))
992 return true;
993 return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
994 });
995 // In math and arith dialects, only handle elementwise ops with a single
996 // result and with a result layout attribute.
997 target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
998 [=](Operation *op) -> std::optional<bool> {
999 // Only handle elementwise mappable ops
1001 return true;
1002 // Only handle ops with single vector result
1003 if (op->getNumResults() != 1)
1004 return true;
1005
1006 VectorType resultType =
1007 dyn_cast<VectorType>(op->getResult(0).getType());
1008 if (!resultType)
1009 return true;
1010
1011 // Check if all operands are vectors of the same shape
1012 for (Value operand : op->getOperands()) {
1013 VectorType operandType = dyn_cast<VectorType>(operand.getType());
1014 if (!operandType || operandType.getShape() != resultType.getShape()) {
1015 return true;
1016 }
1017 }
1018 return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
1019 });
1020 // vector::ReductionOp is legal only if its source has no distribute layout
1021 // attribute.
1022 target.addDynamicallyLegalOp<vector::ReductionOp>(
1023 [=](vector::ReductionOp op) -> bool {
1024 auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
1025 return !layout;
1026 });
1027 // vector::MultiDimReductionOp op legality.
1028 target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
1029 [=](vector::MultiDimReductionOp op) -> bool {
1030 return !isValidSubgroupMultiReductionOp(op);
1031 });
1032 target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
1033 patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
1034 SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
1035 SgToWiLoadGather, SgToWiStoreScatter, SgToWiVectorReduction,
1036 SgToWiMultiDimReduction, SgToWiLoadMatrix, SgToWiStoreMatrix>(
1037 typeConverter, patterns.getContext());
1038}
return success()
#define DBGS()
Definition Hoisting.cpp:32
b getContext())
Attributes are known-constant values of operations.
Definition Attributes.h:25
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition Builders.h:400
This is a value defined by a result of an operation.
Definition Value.h:457
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition Operation.h:541
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:436
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:244
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
OperationName getName()
The name of an operation is the key identifier for it.
Definition Operation.h:119
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:826
result_range getResults()
Definition Operation.h:444
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:433
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult advance()
Definition WalkResult.h:47
static WalkResult interrupt()
Definition WalkResult.h:46
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int64_t > content)
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
Value makeArithReduction(OpBuilder &b, Location loc, CombiningKind kind, Value v1, Value acc, arith::FastMathFlagsAttr fastmath=nullptr, Value mask=nullptr)
Returns the result value of reducing two scalar/vector values with the corresponding arith operation.
SmallVector< Value > getAsValues(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > foldResults)
Convert foldResults into Values.
const uArch * getUArch(llvm::StringRef archName)
bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter)
Define only the type conversions needed for XeGPU subgroup to workitem distribution.
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
bool requirePacked(const LayoutAttr layout)
Helper function to check if the layout is packed.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
void populateXeGPUSgToWiDistributeTypeConversionAndLegality(TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target)
Defines type conversions and legality for XeGPU subgroup to workitem distribution and appends the req...
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
detail::DenseArrayAttrImpl< int64_t > DenseI64ArrayAttr
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:497
OpFoldResult getAsOpFoldResult(Value val)
Given a value, try to extract a constant Attribute.
This represents an operation in an abstracted form, suitable for use with the builder APIs.
virtual int getSubgroupSize() const =0