1//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
19#include "mlir/IR/AffineMap.h"
20#include "mlir/IR/Attributes.h"
21#include "mlir/IR/Builders.h"
23#include "mlir/IR/BuiltinOps.h"
25#include "mlir/IR/Operation.h"
27#include "mlir/IR/TypeRange.h"
28#include "mlir/IR/Value.h"
29#include "mlir/IR/Visitors.h"
31#include "mlir/Support/LLVM.h"
35#include "llvm/ADT/ArrayRef.h"
36#include "llvm/ADT/STLExtras.h"
37#include "llvm/ADT/SmallVector.h"
38
39namespace mlir {
40namespace xegpu {
41#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
42#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
43} // namespace xegpu
44} // namespace mlir
45
46#define DEBUG_TYPE "xegpu-subgroup-distribute"
47#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
48
49using namespace mlir;
50
51static const char *const resolveSIMTTypeMismatch =
52 "resolve_simt_type_mismatch"; // Attribute name for identifying
53 // UnrealizedConversionCastOp added to resolve
54 // SIMT type mismatches.
55
56namespace {
57
58//===----------------------------------------------------------------------===//
59// SIMT Distribution Patterns
60//===----------------------------------------------------------------------===//
61
62/// In certain cases, we may need to favor XeGPU-specific distribution patterns
63/// over generic vector distribution patterns. In such cases, we can assign
64/// priorities to patterns.
65static constexpr unsigned regularPatternBenefit = 1;
66static constexpr unsigned highPatternBenefit = 2;
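// For illustration only (hypothetical sketch, not the registration code of
// this pass): an XeGPU-specific pattern would be registered with the higher
// benefit so it is tried before the generic vector distribution patterns,
// which use the regular benefit, e.g.
//   patterns.add<SomeXeGPUDistributionPattern>(
//       ctx, /*benefit=*/highPatternBenefit);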
67
68/// Helper function to get distributed vector type for a source vector type
69/// according to the lane_layout. We simply divide each dimension of the
70/// source vector shape by the corresponding lane_layout dimension. If
71/// array_length > 1, it is appended to the front of the distributed shape.
72/// NOTE: This is the vector type that will be returned by the
73/// gpu.warp_execute_on_lane0 op.
74///
75/// Examples:
76/// | original vector shape | lane_layout | distributed vector shape |
77/// |-----------------------|-------------|--------------------------|
78/// | 32x16 | [1, 16] | 32x1 |
79/// | 32x16 | [2, 8] | 16x2 |
80/// | 2x32x16 | [1, 16] | 2x32x1 |
81static FailureOr<VectorType>
82getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
83 VectorType originalType) {
84 if (!layout)
85 return failure();
86 assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
87 "Expecting a valid layout.");
88 SmallVector<int64_t> effectiveLaneLayout =
89 layout.getEffectiveLaneLayoutAsInt();
90 assert(static_cast<size_t>(originalType.getRank()) >=
91 effectiveLaneLayout.size() &&
92 "Rank of the original vector type should be greater or equal to the "
93 "size of the lane layout to distribute the vector type.");
94 SmallVector<int64_t> distributedShape(originalType.getShape());
95 // Only distribute the last `laneLayout.size()` dimensions. The remaining
96 // dimensions are not distributed.
97 unsigned distributionStart =
98 originalType.getRank() - effectiveLaneLayout.size();
99 for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
100 if (i < distributionStart)
101 continue;
102
103 // Check if the dimension can be distributed evenly.
104 if (dim % effectiveLaneLayout[i - distributionStart] != 0)
105 return failure();
106 distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
107 }
108 return VectorType::get(distributedShape, originalType.getElementType());
109}
110
111/// Helper function to resolve types if the distributed type out of
112/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
113/// Example 1:
114/// distributed type: vector<8x1xf32>
115/// expected type: vector<8xf32>
116/// resolved using,
117/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
118/// Example 2:
119/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
120/// expected type: xegpu.tensor_desc<8x16xf32>
121/// resolved using,
122/// %0 = unrealized_conversion_cast %1 :
123/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
124/// xegpu.tensor_desc<8x16xf32>
125template <typename T>
126static Value resolveDistributedTy(Value orig, T expected,
127 PatternRewriter &rewriter) {
128 // If orig and expected types are the same, return orig.
129 if (orig.getType() == expected)
130 return orig;
131 // If orig is a vector type, create a shape cast op to reconcile the types.
132 if (isa<VectorType>(orig.getType())) {
133 auto castOp =
134 vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
135 return castOp.getResult();
136 }
137 // If orig is a tensor descriptor type, create an unrealized conversion cast
138 // op to reconcile the types.
139 if (isa<xegpu::TensorDescType>(orig.getType())) {
140 auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
141 expected, orig);
142 castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
143 return castOp.getResult(0);
144 }
145 llvm_unreachable("Unsupported type for reconciliation");
146 return orig;
147}
148
149/// Helper function to check if the layout is packed. Layout is packed if it is
150/// 2D and lane_data[0] != 1 (data packed from col dimension).
151/// TODO: Move to target info.
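/// For example (illustrative values): lane_data = [2, 1] on a 2D layout is
/// packed (each lane owns 2 consecutive elements along dim 0), whereas
/// lane_data = [1, 1] or a 1D lane_data is not.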
152static bool requirePacked(const xegpu::LayoutAttr layout) {
153 if (!layout)
154 return false;
155 auto laneData = layout.getEffectiveLaneDataAsInt();
156 if (laneData.size() != 2)
157 return false;
158 return laneData[0] != 1;
159}
160
161/// Helper function to check if the layout requires a transpose effect.
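/// For example (illustrative values): on a target with subgroup size 16, a
/// lane_layout of [16, 1] requires a transposed load, whereas [1, 16] does
/// not.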
162static bool requireTranspose(const xegpu::LayoutAttr layout,
163 const xegpu::uArch::uArch *uArch) {
164 // Return false for unsupported targets.
165 // TODO: Add more support or move to target info.
166 if (!uArch->getName().equals_insensitive("pvc") &&
167 !uArch->getName().equals_insensitive("bmg"))
168 return false;
169 if (!layout)
170 return false;
171 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
172 if (laneLayout.size() != 2)
173 return false;
174 return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
175}
176
177/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
178/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
179/// contained within a WarpExecuteOnLane0Op.
180/// Example:
181///
182/// ```
183/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
184/// ...
185/// ...
186/// gpu.return %result: vector<8x16xf32>
187/// }
188/// ```
189/// To
190/// ```
191/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
192/// %laneid = gpu.lane_id : index
193/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
194/// ...
195/// ...
196/// gpu.yield %result: vector<8x16xf32>
197/// }
198/// gpu.return %0
199/// }
/// ```
200struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
201 using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
202 LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
203 PatternRewriter &rewriter) const override {
204 auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
205 if (!uArch)
206 return rewriter.notifyMatchFailure(
207 gpuFuncOp, "Subgroup distribution requires target attribute attached "
208 "to set the warp size");
209 // If the function only contains a single void return, skip.
210 if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
211 return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
212 }))
213 return failure();
214 // If the function already moved inside a warp_execute_on_lane0, skip.
215 if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
216 return isa<gpu::WarpExecuteOnLane0Op>(op);
217 }))
218 return failure();
219 // Create a new function with the same signature and same attributes.
220 SmallVector<Type> workgroupAttributionsTypes =
221 llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
222 [](BlockArgument arg) { return arg.getType(); });
223 SmallVector<Type> privateAttributionsTypes =
224 llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
225 [](BlockArgument arg) { return arg.getType(); });
226 auto newGpuFunc = gpu::GPUFuncOp::create(
227 rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
228 gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
229 privateAttributionsTypes);
230 newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
231 // Create a WarpExecuteOnLane0Op with same arguments and results as the
232 // original gpuFuncOp.
233 rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
234 auto laneId = gpu::LaneIdOp::create(
235 rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
236 /** upperBound = **/ mlir::IntegerAttr());
237 ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
238 auto warpOp = gpu::WarpExecuteOnLane0Op::create(
239 rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
240 uArch->getSubgroupSize(), newGpuFunc.getArguments(),
241 newGpuFunc.getArgumentTypes());
242 Block &warpBodyBlock = warpOp.getBodyRegion().front();
243 // Replace the ReturnOp of the original gpu function with a YieldOp.
244 auto origReturnOp =
245 cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
246 rewriter.setInsertionPointAfter(origReturnOp);
247 gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
248 origReturnOp.getOperands());
249 rewriter.eraseOp(origReturnOp);
250 // Move the original function body to the WarpExecuteOnLane0Op body.
251 rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
252 warpOp.getBodyRegion().begin());
253 rewriter.eraseBlock(&warpBodyBlock);
254 // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
255 rewriter.setInsertionPointAfter(warpOp);
256 gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
257 rewriter.replaceOp(gpuFuncOp, newGpuFunc);
258 return success();
259 }
260};
261
262/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
263/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
264/// still contain the original op that will not be used by the yield op (and
265/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
266/// arguments. Tensor descriptor shape is not distributed because it is a
267/// uniform value across all work items within the subgroup. However, the
268/// layout information is dropped in the new tensor descriptor type.
269///
270/// Example:
271///
272/// ```
273/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
274/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
275/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
276/// ...
277/// %td = xegpu.create_nd_tdesc %arg0
278/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
279/// vector.yield %td
280/// }
281/// ```
282/// To
283/// ```
284/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
285/// ...
286/// %dead = xegpu.create_nd_tdesc %arg0
287/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
288/// vector.yield %arg0, %dead
289/// }
290/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
291/// -> !xegpu.tensor_desc<4x8xf32>
292///
293/// ```
294struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
295 using gpu::WarpDistributionPattern::WarpDistributionPattern;
296 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
297 PatternRewriter &rewriter) const override {
298 OpOperand *operand =
299 getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
300 if (!operand)
301 return rewriter.notifyMatchFailure(
302 warpOp, "warp result is not a xegpu::CreateNdDesc op");
303 auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
304 unsigned operandIdx = operand->getOperandNumber();
305
306 xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
307 if (!layout)
308 return rewriter.notifyMatchFailure(
309 descOp, "the tensor descriptor lacks layout attribute");
310 // CreateNdOp must not have offsets.
311 if (descOp.getMixedOffsets().size())
312 return rewriter.notifyMatchFailure(
313 descOp, "xegpu::CreateNdDescOp must not have offsets");
314
315 SmallVector<size_t> newRetIndices;
316 rewriter.setInsertionPoint(warpOp);
317 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
318 rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
319 /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
320
321 SmallVector<Value> newDescOperands = llvm::map_to_vector(
322 newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
323 rewriter.setInsertionPointAfter(newWarpOp);
324 xegpu::TensorDescType distributedTensorDescTy =
325 descOp.getType().dropLayouts(); // Distributed tensor descriptor type
326 // does not contain layout info.
327 Value newDescOp = xegpu::CreateNdDescOp::create(
328 rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
329 descOp->getAttrs());
330
331 Value distributedVal = newWarpOp.getResult(operandIdx);
332 // Resolve the distributed type to the expected type.
333 newDescOp =
334 resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
335 rewriter.replaceAllUsesWith(distributedVal, newDescOp);
336 return success();
337 }
338};
339
340/// Distribute a store_nd op at the end of enclosing
341/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
342/// through the warp op interface, they are propagated as returned values.
343/// Source vector is distributed based on lane layout. Appropriate cast ops are
344/// inserted if the distributed types do not match expected xegpu SIMT types.
345///
346/// Example:
347///
348/// ```
349/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
350/// gpu.warp_execute_on_lane_0(%laneid) -> () {
351/// ...
352/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
353/// !xegpu.tensor_desc<4x8xf32, #layout0>
354/// }
355/// ```
356/// To
357/// ```
358/// %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
359/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
360/// ...
361/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
362/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
363/// }
364/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
365/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
366/// #layout0>
367/// -> !xegpu.tensor_desc<4x8xf32>
368/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
369/// !xegpu.tensor_desc<4x8xf32>
370///
371/// ```
372struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
373 using gpu::WarpDistributionPattern::WarpDistributionPattern;
374 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
375 PatternRewriter &rewriter) const override {
376 gpu::YieldOp yield = warpOp.getTerminator();
377 Operation *lastNode = yield->getPrevNode();
378 auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
379 if (!storeOp)
380 return failure();
381
382 SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
383 // Expecting offsets to be present.
384 if (offsets.empty())
385 return rewriter.notifyMatchFailure(storeOp,
386 "the store op must have offsets");
387 SmallVector<Value> offsetsAsValues =
388 vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
389 SmallVector<Type> offsetTypes = llvm::to_vector(
390 llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
391 xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
392 xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
393 if (!layout)
394 return rewriter.notifyMatchFailure(
395 storeOp, "the source tensor descriptor lacks layout attribute");
396
397 FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
398 getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
399 if (failed(distributedTypeByWarpOpOrFailure))
400 return rewriter.notifyMatchFailure(storeOp,
401 "Failed to distribute the type");
402 VectorType distributedTypeByWarpOp =
403 distributedTypeByWarpOpOrFailure.value();
404
405 SmallVector<size_t> newRetIndices;
406 SmallVector<Value> newYieldedValues = {storeOp.getValue(),
407 storeOp.getTensorDesc()};
408 SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
409 newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
410 newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
411 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
412 rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
413 // Create a new store op outside the warp op with the distributed vector
414 // type. Tensor descriptor is not distributed.
415 rewriter.setInsertionPointAfter(newWarpOp);
416 SmallVector<Value> newStoreOperands;
417
418 // For the value operand, there can be a mismatch between the vector type
419 // distributed by the warp op and (xegpu-specific) distributed type
420 // supported by the store op. Type mismatch must be resolved using
421 // appropriate cast op.
422 FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
423 xegpu::getDistributedVectorType(storeOp.getTensorDescType());
424 if (failed(storeNdDistributedValueTyOrFailure))
425 return rewriter.notifyMatchFailure(
426 storeOp, "Failed to get distributed vector type for the store op");
427 newStoreOperands.push_back(resolveDistributedTy(
428 newWarpOp.getResult(newRetIndices[0]),
429 storeNdDistributedValueTyOrFailure.value(), rewriter));
430 // For the tensor descriptor operand, the layout attribute is dropped after
431 // distribution. Types need to be resolved in this case as well.
432 xegpu::TensorDescType distributedTensorDescTy =
433 storeOp.getTensorDescType().dropLayouts();
434 newStoreOperands.push_back(
435 resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
436 distributedTensorDescTy, rewriter));
437 // Collect offsets.
438 for (size_t i = 2; i < newRetIndices.size(); ++i)
439 newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
440
441 auto newStoreOp =
442 xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
443 newStoreOperands, storeOp->getAttrs());
444 xegpu::removeLayoutAttrs(newStoreOp);
445 rewriter.eraseOp(storeOp);
446 return success();
447 }
448};
449
450/// Distribute a load_nd op feeding into vector.yield op for the enclosing
451/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
452/// The warp op will still contain the original op that will not be used by
453/// the yield op (and should be cleaned up later). The yield op will
454/// bypass the load's arguments. Only the loaded vector is distributed
455/// according to the lane layout; the tensor descriptor type is not
456/// distributed. Appropriate cast ops are inserted if the distributed types do
457/// not match the expected xegpu SIMT types.
458///
459/// Example:
460///
461/// ```
462/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
463/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
464/// (vector<4x1xf32>) {
465/// ...
466/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
467/// ->
468/// vector<4x8xf32>
469/// gpu.yield %ld
470/// }
471/// ```
472/// To
473/// ```
474/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
475/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
476/// ...
477/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> ->
478/// vector<4x8xf32>
/// gpu.yield %dead, %arg0
479/// }
480/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
481/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
482/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
483/// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
484///
485/// ```
486struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
487 using gpu::WarpDistributionPattern::WarpDistributionPattern;
488 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
489 PatternRewriter &rewriter) const override {
490 OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
491 if (!isa<xegpu::LoadNdOp>(op))
492 return false;
493 // Make sure the same load op is the last operation in the warp op body.
494 // This ensures that the load op is not sunk earlier, which could violate
495 // barrier synchronization.
496 gpu::YieldOp yield = warpOp.getTerminator();
497 return yield->getPrevNode() == op;
498 });
499
500 if (!operand)
501 return rewriter.notifyMatchFailure(
502 warpOp, "warp result is not a xegpu::LoadNd op");
503
504 auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
505 auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
506 if (!uArch)
507 return rewriter.notifyMatchFailure(
508 loadOp, "xegpu::LoadNdOp requires a target attribute attached "
509 "to determine the transpose requirement");
511 // Chip information is required to decide if the layout requires transpose
512 // effect.
513 // Expecting offsets to be present.
514 SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
515 if (offsets.empty())
516 return rewriter.notifyMatchFailure(loadOp,
517 "the load op must have offsets");
518 SmallVector<Value> offsetsAsValues =
519 vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
520 SmallVector<Type> offsetTypes = llvm::to_vector(
521 llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
522
523 xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
524 xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
525 if (!layout)
526 return rewriter.notifyMatchFailure(
527 loadOp, "the source tensor descriptor lacks layout attribute");
528
529 unsigned operandIdx = operand->getOperandNumber();
530 VectorType distributedTypeByWarpOp =
531 cast<VectorType>(warpOp.getResult(operandIdx).getType());
532
533 SmallVector<size_t> newRetIndices;
534 SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
535 SmallVector<Type> newYieldedTypes = {tensorDescTy};
536 newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
537 newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
538 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
539 rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
540
541 // Create a new load op outside the warp op with the distributed vector
542 // type.
543 rewriter.setInsertionPointAfter(newWarpOp);
544 FailureOr<VectorType> loadNdDistValueTyOrFailure =
545 xegpu::getDistributedVectorType(loadOp.getTensorDescType());
546 if (failed(loadNdDistValueTyOrFailure))
547 return rewriter.notifyMatchFailure(
548 loadOp, "Failed to get distributed vector type for the load op");
549 xegpu::TensorDescType distributedTensorDescTy =
550 loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
551 // descriptor type does not
552 // contain layout info.
553 SmallVector<Value> newLoadOperands{
554 resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
555 distributedTensorDescTy, rewriter)};
556 // Collect offsets.
557 for (size_t i = 1; i < newRetIndices.size(); ++i)
558 newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
559 auto newLoadOp = xegpu::LoadNdOp::create(
560 rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
561 newLoadOperands, loadOp->getAttrs());
562 xegpu::removeLayoutAttrs(newLoadOp);
563 // Set the packed attribute if the layout requires it.
564 newLoadOp.setPacked(requirePacked(layout));
565 // Set the transpose attribute if the layout requires it.
566 if (requireTranspose(layout, uArch))
567 newLoadOp.setTranspose(
568 DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
569 Value distributedVal = newWarpOp.getResult(operandIdx);
570 // There can be a conflict between the vector type distributed by the
571 // warp op and (xegpu-specific) distributed type supported by the load
572 // op. Resolve these mismatches by inserting a cast.
573 Value tyResolvedVal = resolveDistributedTy(
574 newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
575 rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
576 return success();
577 }
578};
579
580/// Distribute a dpas op feeding into vector.yield op for the enclosing
581/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
582/// The warp op will still contain the original op that will not be used by
583/// the yield op (and should be cleaned up later). The yield op will
584/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
585/// distributed types do not match expected xegpu SIMT types.
586/// Example:
587/// ```
588/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
589/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
590/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
591/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
592/// (vector<8x1xf32>) {
593/// ...
594/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
595/// vector<8x16xf32>
596/// gpu.yield %dpas
597/// }
598/// ```
599/// To
600/// ```
601/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
602/// vector<8x1xf16>, vector<16x1xf16>) {
603/// ...
604/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
605/// -> vector<8x16xf32>
606/// gpu.yield %dead, %arg0, %arg1
607/// }
608/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
609/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
610/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
611/// vector<8xf32>
612/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
613/// ```
614struct DpasDistribution final : public gpu::WarpDistributionPattern {
615 using gpu::WarpDistributionPattern::WarpDistributionPattern;
616 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
617 PatternRewriter &rewriter) const override {
618 OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
619 if (!operand)
620 return rewriter.notifyMatchFailure(warpOp,
621 "warp result is not a xegpu::Dpas op");
622
623 auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
624 unsigned operandIdx = operand->getOperandNumber();
625 std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
626 std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
627 std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
628
629 xegpu::LayoutAttr layoutA =
630 dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
631 xegpu::LayoutAttr layoutB =
632 dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
633 xegpu::LayoutAttr layoutOut =
634 dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
635 if (!layoutA || !layoutB || !layoutOut)
636 return rewriter.notifyMatchFailure(
637 dpasOp,
638 "the xegpu::Dpas op lacks layout attribute for A, B or output");
639
640 FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
641 getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
642 FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
643 getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
644 FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
645 getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
646 if (failed(distLhsTypeByWarpOpOrFailure) ||
647 failed(distRhsTypeByWarpOpOrFailure) ||
648 failed(distResultTypeByWarpOpOrFailure))
649 return rewriter.notifyMatchFailure(
650 dpasOp,
651 "Failed to distribute the A, B or output types in xegpu::Dpas op");
652
653 llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
654 dpasOp.getRhs()};
655 llvm::SmallVector<Type, 3> newYieldTypes{
656 distLhsTypeByWarpOpOrFailure.value(),
657 distRhsTypeByWarpOpOrFailure.value()};
658 // Dpas acc operand is optional.
659 if (dpasOp.getAcc()) {
660 newYieldValues.push_back(dpasOp.getAcc());
661 newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
662 }
663 // Create a new warp op without the dpas.
664 SmallVector<size_t> newRetIndices;
665 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
666 rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
667
668 FailureOr<VectorType> expectedDistLhsTyOrFailure =
669 xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
670 FailureOr<VectorType> expectedDistRhsTyOrFailure =
671 xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
672 FailureOr<VectorType> expectedDistResultTyOrFailure =
673 xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
674 if (failed(expectedDistLhsTyOrFailure) ||
675 failed(expectedDistRhsTyOrFailure) ||
676 failed(expectedDistResultTyOrFailure))
677 return rewriter.notifyMatchFailure(
678 dpasOp,
679 "Failed to get distributed vector type for the dpas operands.");
680 // Create a new dpas op outside the warp op.
681 rewriter.setInsertionPointAfter(newWarpOp);
682 SmallVector<Value> newDpasOperands;
683 SmallVector<VectorType> newDpasOperandExpectedTypes;
684
685 // Resolve the distributed types with the original types.
686 newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
687 newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
688 VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
689 if (dpasOp.getAcc())
690 newDpasOperandExpectedTypes.push_back(distributedResultTy);
691
692 for (unsigned i = 0; i < newRetIndices.size(); i++) {
693 newDpasOperands.push_back(
694 resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
695 newDpasOperandExpectedTypes[i], rewriter));
696 }
697 auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
698 distributedResultTy, newDpasOperands,
699 dpasOp->getAttrs());
700 xegpu::removeLayoutAttrs(newDpasOp);
701 Value distributedVal = newWarpOp.getResult(operandIdx);
702 // Resolve the output type.
703 Value typeResolved =
704 resolveDistributedTy(newDpasOp.getResult(),
705 distResultTypeByWarpOpOrFailure.value(), rewriter);
706 rewriter.replaceAllUsesWith(distributedVal, typeResolved);
707 return success();
708 }
709};
710
711/// Distribute a prefetch_nd op at the end of enclosing
712/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
713/// through the warp op interface they would be propagated as returned values.
714/// Tensor descriptor shape is not distributed because it is a uniform value
715/// across all work items within the subgroup. Appropriate cast ops are inserted
716/// if the distributed types do not match expected xegpu SIMT types.
717///
718/// Example:
719///
720/// ```
721/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
722/// gpu.warp_execute_on_lane_0(%laneid) -> () {
723/// ...
724/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
725/// }
726/// ```
727/// To
728/// ```
729/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
730/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
731/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
732/// index
733/// }
734/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
735/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
736/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
737///
738/// ```
739struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
740 using gpu::WarpDistributionPattern::WarpDistributionPattern;
741 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
742 PatternRewriter &rewriter) const override {
743 gpu::YieldOp yield = warpOp.getTerminator();
744 Operation *lastNode = yield->getPrevNode();
745 auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
746 if (!prefetchOp)
747 return failure();
748
749 SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
750 // PrefetchNdOp must have offsets.
751 if (offsets.empty())
752 return rewriter.notifyMatchFailure(prefetchOp,
753 "the prefetch op must have offsets");
754 SmallVector<Value> offsetsAsValues =
755 vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
756 SmallVector<Type> offsetTypes = llvm::to_vector(
757 llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
758
759 xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
760 if (!layout)
761 return rewriter.notifyMatchFailure(
762 prefetchOp, "the source tensor descriptor lacks layout attribute");
763
764 SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
765 SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
766 newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
767 newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
768 SmallVector<size_t> newRetIndices;
769 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
770 rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
771 // Create a new prefetch op outside the warp op with updated tensor
772 // descriptor type. The source tensor descriptor requires type resolution.
773 xegpu::TensorDescType newTensorDescTy =
774 prefetchOp.getTensorDescType().dropLayouts();
775 rewriter.setInsertionPointAfter(newWarpOp);
776 SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
777 newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
778 // Collect offsets.
779 for (size_t i = 1; i < newRetIndices.size(); ++i)
780 newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
781 xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
782 newPrefetchOperands, prefetchOp->getAttrs());
783 xegpu::removeLayoutAttrs(prefetchOp);
784 rewriter.eraseOp(prefetchOp);
785 return success();
786 }
787};
788
789/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
790/// region. This will simply move the barrier op outside of the warp op.
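/// Example (illustrative):
/// ```
/// gpu.warp_execute_on_lane_0(%laneid)[16] {
///   ...
///   gpu.barrier
/// }
/// ```
/// To
/// ```
/// gpu.warp_execute_on_lane_0(%laneid)[16] {
///   ...
/// }
/// gpu.barrier
/// ```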
791struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
792 using gpu::WarpDistributionPattern::WarpDistributionPattern;
793 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
794 PatternRewriter &rewriter) const override {
795 gpu::YieldOp yield = warpOp.getTerminator();
796 Operation *lastNode = yield->getPrevNode();
797 // The last node must be a gpu::BarrierOp.
798 auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
799 if (!barrierOp)
800 return failure();
801 // Move the barrier op outside of the warp op.
802 rewriter.setInsertionPointAfter(warpOp);
803 gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
804 barrierOp->getResultTypes(),
805 barrierOp->getOperands(), barrierOp->getAttrs());
806 rewriter.eraseOp(barrierOp);
807 return success();
808 }
809};
810
811/// Distribute a scattered store op. The offsets argument is required.
812/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
813/// The layouts are fixed and implicit: one offset/mask per lane.
814/// The pass changes the offset/mask vector shapes to a
815/// single-element vector; **it is assumed that their producers will also be
816/// distributed**. The payload vector also has a fixed distribution:
817/// no chunk size -> vector of one element.
818/// chunk size -> vector of the innermost dimension of the SG-payload.
819/// Example 1 (no chunk size):
820/// %mask = producer_op : vector<16xi1>
821/// %offset = producer_op : vector<16xindex>
822/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
823/// memref<256xf16>, vector<16xindex>, vector<16xi1>
824/// To
825/// %mask = producer_op : vector<1xi1>
826/// %offset = producer_op : vector<1xindex>
827/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
828/// memref<256xf16>, vector<1xindex>, vector<1xi1>
829/// Example 2 (chunk size, same mask and offsets):
830/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
831/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
832/// To
833/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
834/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
835struct StoreDistribution final : public gpu::WarpDistributionPattern {
836 using gpu::WarpDistributionPattern::WarpDistributionPattern;
837 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
838 PatternRewriter &rewriter) const override {
839 Operation *lastNode = warpOp.getTerminator()->getPrevNode();
840 auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
841 if (!storeScatterOp)
842 return failure();
843 auto offsets = storeScatterOp.getOffsets();
844 if (!offsets || !isa<VectorType>(offsets.getType()))
845 return rewriter.notifyMatchFailure(
846 storeScatterOp, "Store op must have a vector of offsets argument");
847 VectorType offsetsTy = cast<VectorType>(offsets.getType());
848 VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
849 if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
850 return rewriter.notifyMatchFailure(storeScatterOp,
851 "Expected 1D offsets and mask vector");
852 VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
853 if (storeVecTy.getRank() > 2)
854 return rewriter.notifyMatchFailure(
855 storeScatterOp, "Expected at most 2D result at SG level");
856
857 std::string layoutPayloadName =
858 xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
859 std::string layoutOffsetsName =
860 xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
861 std::string layoutMaskName =
862 xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
863
864 xegpu::LayoutAttr layoutPayload =
865 storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
866 xegpu::LayoutAttr layoutOffsets =
867 storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
868 xegpu::LayoutAttr layoutMask =
869 storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
870
871 FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
872 getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
873 FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
874 getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
875 FailureOr<VectorType> distMaskByWarpOpOrFailure =
876 getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
877 if (failed(distStoreVecByWarpOpOrFailure) ||
878 failed(distOffsetsByWarpOpOrFailure) ||
879 failed(distMaskByWarpOpOrFailure)) {
880 return rewriter.notifyMatchFailure(
881 storeScatterOp,
882 "Some vector operands have no layouts, using defaults instead.");
883 }
884 // Distributed store payload type according to the lane layout.
885 VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
886 // Expected distributed payload type is always 1D.
887 VectorType expectedPayloadTy =
888 VectorType::get({distPayloadTyByWarpOp.getNumElements()},
889 distPayloadTyByWarpOp.getElementType());
890
891 SmallVector<size_t> newRetIndices;
892 SmallVector<Value> operands = storeScatterOp->getOperands();
893 SmallVector<Type> operandTypesToYield = {
894 distPayloadTyByWarpOp, operands[1].getType(),
895 distOffsetsByWarpOpOrFailure.value(),
896 distMaskByWarpOpOrFailure.value()};
897
898 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
899 rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
900 SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
901 newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
902 // The payload operand may need type adjustment due to mismatch between warp
903 // distributed type and expected SIMT type.
904 rewriter.setInsertionPointAfter(newWarpOp);
905 newStoreScatterOpOperands[0] = resolveDistributedTy(
906 newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
907 xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
908 rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
909 storeScatterOp->getAttrs());
910 xegpu::removeLayoutAttrs(newOp);
911 rewriter.eraseOp(storeScatterOp);
912 return success();
913 }
914};
915
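/// Helper to compute the lane-level coordinates for a load_matrix/store_matrix
/// op: the layout's distributed coordinates for `laneId` (derived from the
/// subgroup-level payload shape) are combined with the original subgroup-level
/// offsets. Returns an empty vector if the layout cannot produce distributed
/// coordinates.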
916static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
917 PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
918 Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
919 SmallVector<Value> newCoords;
920 auto maybeCoords =
921 layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
922 if (failed(maybeCoords))
923 return {};
924 assert(maybeCoords.value().size() == 1 &&
925 "Expected one set of distributed offsets");
926 SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
927 rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
928 getAsOpFoldResult(origOffsets));
929 newCoords = llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
930 return newCoords;
931}
932
933/// Pattern for distributing xegpu::LoadMatrixOp.
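/// The payload (result) vector is distributed according to the op's lane
/// layout and, unless the subgroup_block_io attribute is present, the offsets
/// are rewritten to per-lane coordinates using
/// computeDistributedCoordinatesForMatrixOp.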
934struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
935 using gpu::WarpDistributionPattern::WarpDistributionPattern;
936 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
937 PatternRewriter &rewriter) const override {
938 gpu::YieldOp yield = warpOp.getTerminator();
939 Operation *lastNode = yield->getPrevNode();
940 auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
941 if (!matrixOp)
942 return failure();
943
944 OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
945 return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
946 });
947 if (!producedByLastLoad)
948 return rewriter.notifyMatchFailure(
949 warpOp, "The last op is not xegpu::LoadMatrixOp");
950 const int operandIdx = producedByLastLoad->getOperandNumber();
951
952 VectorType sgPayloadTy =
953 dyn_cast<VectorType>(matrixOp.getResult().getType());
954 VectorType warpResultTy =
955 cast<VectorType>(warpOp.getResult(operandIdx).getType());
956 if (!sgPayloadTy)
957 return rewriter.notifyMatchFailure(
958 matrixOp, "the matrix op payload must be a vector type");
959
960 auto loc = matrixOp.getLoc();
961 auto offsets = matrixOp.getMixedOffsets();
962 if (offsets.empty())
963 return rewriter.notifyMatchFailure(matrixOp,
964 "the load op must have offsets");
965 SmallVector<Value> offsetsAsValues =
966 vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
967
968 auto layout = matrixOp.getLayoutAttr();
969 if (!layout)
970 return rewriter.notifyMatchFailure(
971 matrixOp, "the matrix operation lacks layout attribute");
972
973 FailureOr<VectorType> distPayloadByWarpOpOrFailure =
974 getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
975 if (failed(distPayloadByWarpOpOrFailure))
976 return rewriter.notifyMatchFailure(
977 matrixOp, "Failed to distribute matrix op payload based on layout.");
978
979 SmallVector<Value> operands = {matrixOp.getMemDesc()};
980 const unsigned offsetsStartIdx = operands.size();
981 operands.append(offsetsAsValues);
982
983 SmallVector<Type> operandTypes = llvm::to_vector(
984 llvm::map_range(operands, [](Value v) { return v.getType(); }));
985
986 SmallVector<size_t> newRetIndices;
987 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
988 rewriter, warpOp, operands, operandTypes, newRetIndices);
989 SmallVector<Value> newOperands = llvm::map_to_vector(
990 newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
991
992 SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()};
993 std::fill(newConstOffsets.begin(), newConstOffsets.end(),
994 ShapedType::kDynamic);
995 DenseI64ArrayAttr newConstOffsetsAttr =
996 rewriter.getDenseI64ArrayAttr(newConstOffsets);
997 ValueRange currentOffsets =
998 ValueRange(newOperands).drop_front(offsetsStartIdx);
999
1000 SmallVector<Value> newCoords = currentOffsets;
1001 rewriter.setInsertionPointAfter(newWarpOp);
1002
1003 if (!matrixOp.getSubgroupBlockIoAttr()) {
1004 newCoords = computeDistributedCoordinatesForMatrixOp(
1005 rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
1006 currentOffsets);
1007 }
1008 xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
1009 rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
1010 newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
1011 matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
1012 // Resolve the output type and replace all uses.
1013 rewriter.replaceAllUsesWith(
1014 newWarpOp.getResult(operandIdx),
1015 resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
1016 return success();
1017 }
1018};
1019
1020/// Pattern for distributing xegpu::StoreMatrixOp.
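/// Distribution mirrors LoadMatrixDistribution: the stored payload is
/// distributed according to the lane layout and, unless the subgroup_block_io
/// attribute is present, the offsets are rewritten to per-lane coordinates.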
1021struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
1022 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1023 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1024 PatternRewriter &rewriter) const override {
1025 gpu::YieldOp yield = warpOp.getTerminator();
1026 Operation *lastNode = yield->getPrevNode();
1027 auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
1028 if (!matrixOp)
1029 return failure();
1030
1031 VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
1032 if (!sgPayloadTy)
1033 return rewriter.notifyMatchFailure(
1034 matrixOp, "the matrix op payload must be a vector type");
1035
1036 auto loc = matrixOp.getLoc();
1037 auto offsets = matrixOp.getMixedOffsets();
1038 if (offsets.empty())
1039 return rewriter.notifyMatchFailure(matrixOp,
1040 "the store op must have offsets");
1041 SmallVector<Value> offsetsAsValues =
1042 vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
1043
1044 auto layout = matrixOp.getLayoutAttr();
1045 if (!layout)
1046 return rewriter.notifyMatchFailure(
1047 matrixOp, "the matrix operation lacks layout attribute");
1048
1049 FailureOr<VectorType> distPayloadByWarpOpOrFailure =
1050 getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
1051 if (failed(distPayloadByWarpOpOrFailure))
1052 return rewriter.notifyMatchFailure(
1053 matrixOp, "Failed to distribute matrix op payload based on layout.");
1054
1055 SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
1056 const unsigned offsetsStartIdx = operands.size();
1057 operands.append(offsetsAsValues);
1058
1059 SmallVector<Type> operandTypes = llvm::to_vector(
1060 llvm::map_range(operands, [](Value v) { return v.getType(); }));
1061 operandTypes[0] = *distPayloadByWarpOpOrFailure;
1062
1063 SmallVector<size_t> newRetIndices;
1064 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1065 rewriter, warpOp, operands, operandTypes, newRetIndices);
1066 SmallVector<Value> newOperands = llvm::map_to_vector(
1067 newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
1068
1069 SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()};
1070 std::fill(newConstOffsets.begin(), newConstOffsets.end(),
1071 ShapedType::kDynamic);
1072 DenseI64ArrayAttr newConstOffsetsAttr =
1073 rewriter.getDenseI64ArrayAttr(newConstOffsets);
1074 ValueRange currentOffsets =
1075 ValueRange(newOperands).drop_front(offsetsStartIdx);
1076
1077 SmallVector<Value> newCoords = currentOffsets;
1078 rewriter.setInsertionPointAfter(newWarpOp);
1079
1080 if (!matrixOp.getSubgroupBlockIoAttr()) {
1081 newCoords = computeDistributedCoordinatesForMatrixOp(
1082 rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
1083 currentOffsets);
1084 }
1085
1086 xegpu::StoreMatrixOp::create(
1087 rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
1088 ValueRange(newCoords), newConstOffsetsAttr,
1089 matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
1090 rewriter.eraseOp(matrixOp);
1091 return success();
1092 }
1093};
1094
1095/// Distribute a scattered load op. The logic and requirements are the same as
1096/// for the scattered store distribution. The warpOp's payload vector is
1097/// expected to be distributed by the load's result consumer.
1098/// Example 1 (no chunk size):
1099/// %mask = producer_op : vector<16xi1>
1100/// %offset = producer_op : vector<16xindex>
1101/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
1102/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
1103/// To
1104/// %mask = producer_op : vector<1xi1>
1105/// %offset = producer_op : vector<1xindex>
1106/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
1107/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
1108/// Example 2 (chunk size, same mask and offsets):
1109/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
1110/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
1111/// To
1112/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
1113/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
1114struct LoadDistribution final : public gpu::WarpDistributionPattern {
1115 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1116 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1117 PatternRewriter &rewriter) const override {
1118 OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
1119 // Check that the yield operand was produced by the *last* scattered
1120 // load op, to avoid sinking it before barriers (maintain memory order).
1121 return isa<xegpu::LoadGatherOp>(op) &&
1122 warpOp.getTerminator()->getPrevNode() == op;
1123 });
1124 if (!producedByLastLoad)
1125 return rewriter.notifyMatchFailure(
1126 warpOp, "The last op is not xegpu::LoadGatherOp");
1127
1128 auto loadGatherOp =
1129 producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
1130 auto offsets = loadGatherOp.getOffsets();
1131 if (!offsets || !isa<VectorType>(offsets.getType()) ||
1132 !isa<VectorType>(loadGatherOp.getMask().getType()))
1133 return rewriter.notifyMatchFailure(
1134 loadGatherOp,
1135 "Load op must have a vector arguments for offsets and mask");
1136 VectorType offsetsTy = cast<VectorType>(offsets.getType());
1137 VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
1138 if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
1139 return rewriter.notifyMatchFailure(loadGatherOp,
1140 "Expected 1D offsets and mask vector");
1141 // Assume offset and mask producers will be distributed as well.
1142 std::string layoutOffsetsName =
1143 xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
1144 std::string layoutMaskName =
1145 xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
1146
1147 xegpu::LayoutAttr layoutOffsets =
1148 loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
1149 xegpu::LayoutAttr layoutMask =
1150 loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
1151
1152 FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
1153 getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
1154 FailureOr<VectorType> distMaskByWarpOpOrFailure =
1155 getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
1156 if (failed(distOffsetsByWarpOpOrFailure) ||
1157 failed(distMaskByWarpOpOrFailure)) {
1158 return rewriter.notifyMatchFailure(
1159 loadGatherOp,
1160 "Some vector operands have no layouts, using defaults instead.");
1161 }
1162
1163 SmallVector<size_t> newRetIndices;
1164 SmallVector<Value> operands = loadGatherOp->getOperands();
1165 SmallVector<Type> operandTypesToYield = {
1166 operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
1167 distMaskByWarpOpOrFailure.value()};
1168
1169 const unsigned operandIdx = producedByLastLoad->getOperandNumber();
1170 VectorType distResultTy =
1171 cast<VectorType>(warpOp.getResult(operandIdx).getType());
1172 // Distributed load op will always be 1D.
1173 VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
1174 distResultTy.getElementType());
1175
1176 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1177 rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
1178
1179 SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
1180 newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
1181
1182 rewriter.setInsertionPointAfter(newWarpOp);
1183 xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
1184 rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
1185 loadGatherOp->getAttrs());
1186 xegpu::removeLayoutAttrs(newOp);
1187 Value distributedVal = newWarpOp.getResult(operandIdx);
1188 // Resolve the output type and replace all uses.
1189 rewriter.replaceAllUsesWith(
1190 distributedVal,
1191 resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
1192 return success();
1193 }
1194};
1195
1196/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
1197/// VectorReductionOps. We also insert layouts for the newly created ops.
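/// For example (illustrative): reducing a vector<16x32xf32> along dim 1 with
/// <add> creates 16 strided slices of shape 1x32; each slice is shape_cast to
/// vector<32xf32>, reduced with vector.reduction <add> against the matching
/// accumulator element, and the resulting scalar is inserted into the
/// vector<16xf32> result.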
1198static Value lowerToVectorReductions(TypedValue<VectorType> src,
1199 TypedValue<VectorType> acc,
1200 vector::CombiningKind kind,
1201 int64_t reductionDim, Location loc,
1202 PatternRewriter &rewriter) {
1203 // Expecting a 2D source vector.
1204 assert(src.getType().getRank() == 2 && "expected a 2D source vector");
1205 VectorType sourceType = src.getType();
1206 int64_t sourceH = sourceType.getShape()[0];
1207 int64_t sourceW = sourceType.getShape()[1];
1208 int nSlices = (reductionDim == 0) ? sourceW : sourceH;
1209 // Create a constant vector to hold the result of the reduction.
1210 TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
1211 Value reductionResult = arith::ConstantOp::create(
1212 rewriter, loc, acc.getType(),
1213 DenseElementsAttr::get(acc.getType(), zeroAttr));
1214 // Reduction result should have the same layout as the accumulator.
1215 xegpu::setDistributeLayoutAttr(cast<OpResult>(reductionResult),
1216 xegpu::getDistributeLayoutAttr(acc));
1217 // For each slice of the source, extract the slice vector, do a reduction
1218 // and insert the reduced value back into the result vector.
1219 for (int i = 0; i < nSlices; ++i) {
1220 SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
1221 if (reductionDim == 1) {
1222 sliceOffsets = {i, 0};
1223 sliceSizes = {1, sourceW};
1224 } else {
1225 sliceOffsets = {0, i};
1226 sliceSizes = {sourceH, 1};
1227 }
1228 vector::ExtractStridedSliceOp extractOp =
1229 vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
1230 sliceSizes, {1, 1});
1231 int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
1232 vector::ShapeCastOp slice = vector::ShapeCastOp::create(
1233 rewriter, loc,
1234 VectorType::get({nSliceElements}, sourceType.getElementType()),
1235 extractOp.getResult());
1236 // Shape cast is currently handled in xegpu side. So layouts must be
1237 // retained during lowering. Shape cast output has the same layout as the
1238 // accumulator. Shape cast source has the same layout as the original
1239 // reduction source.
1240 // TODO: other ops generated here may also need layout attributes.
1241 xegpu::setDistributeLayoutAttr(slice->getOpOperand(0),
1242 xegpu::getDistributeLayoutAttr(src));
1243 xegpu::setDistributeLayoutAttr(slice->getOpResult(0),
1244 xegpu::getDistributeLayoutAttr(acc));
1245 // Extract and reduction produce scalars, so no result layout is needed.
1246 Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
1247 Value reduction = vector::ReductionOp::create(
1248 rewriter, loc, kind, slice.getResult(), accExtract);
1249 reductionResult =
1250 vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
1251 }
1252 return reductionResult;
1253}
1254
1255/// This pattern distributes the `vector.multi_reduction` operation across
1256/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
1257/// layouts for the source and accumulator vectors,
1258/// * If the reduction dimension is distributed across lanes, the reduction is
1259/// non-lane-local and the reduction is done using warp shuffles. Here we
1260/// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
1261/// the warp op body.
1262/// * If the reduction dimension is not distributed across lanes, the reduction
1263/// is lane-local. In this case, we yield the source and accumulator vectors
1264/// from the warp op and perform the lane-local reduction outside the warp op
1265/// using a sequence of ReductionOps.
1266/// Example 1 (Reduction is lane-local):
1267/// ```
1268/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
1269/// %0 = "some_def"() : () -> (vector<16x32xf32>)
1270/// %acc = "some_def"() : () -> (vector<32xf32>)
1271/// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
1272/// vector<32xf32>
/// gpu.yield %1 : vector<32xf32>
1273/// }
1274/// ```
1275/// is lowered to:
1276/// ```
1277/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
1278/// vector<1xf32>) {
1279/// %0 = "some_def"() : () -> (vector<16x32xf32>)
1280/// %acc = "some_def"() : () -> (vector<32xf32>)
1281/// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
1282/// }
1283/// %c = arith.constant dense<0.0> : vector<1xf32>
1284/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
1285 /// %2 = vector.extract %r#1[0] : f32 from vector<1xf32>
/// %3 = vector.reduction <add>, %1, %2 : vector<16xf32> into f32
1286 /// %4 = vector.insert %3, %c[0] : f32 into vector<1xf32>
1287/// ```
1288/// Example 2 (Reduction is non-lane-local):
1289/// ```
1290/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
1291/// %0 = "some_def"() : () -> (vector<2x32xf32>)
1292/// %acc = "some_def"() : () -> (vector<2xf32>)
1293/// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
1294/// vector<2xf32>
1295/// gpu.yield %1 : vector<2xf32>
1296/// }
1297/// ```
1298/// is lowered to:
1299/// ```
1300/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
1301/// %0 = "some_def"() : () -> (vector<2x32xf32>)
1302/// %acc = "some_def"() : () -> (vector<2xf32>)
1303/// %1 = arith.constant dense<0.0> : vector<2xf32>
1304 /// %2 = vector.extract %0[0] : vector<32xf32> from vector<2x32xf32>
1305/// %3 = ("warp.reduction %2") : f32
1306/// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
1307/// ... repeat for row 1
1308/// gpu.yield %1 : vector<2xf32>
1309 /// }
/// ```
1310struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
1311 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1312 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1313 PatternRewriter &rewriter) const override {
1314 OpOperand *yieldOperand =
1315 getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
1316 if (!yieldOperand)
1317 return failure();
1318 auto reductionOp =
1319 cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
1320 unsigned operandIdx = yieldOperand->getOperandNumber();
1321 VectorType sourceType = reductionOp.getSourceVectorType();
1322 // Only 2D vectors are supported.
1323 if (sourceType.getRank() != 2)
1324 return rewriter.notifyMatchFailure(warpOp,
1325 "Only 2D reductions are supported.");
1326 ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
1327 // Only 1 reduction dimension is supported. This also ensures that the
1328 // result is a vector type.
1329 if (reductionDims.size() != 1)
1330 return rewriter.notifyMatchFailure(
1331 warpOp, "Only 1 reduction dimension is supported.");
1332 int64_t reductionDim = reductionDims[0];
1333 VectorType distributedResultType =
1334 cast<VectorType>(warpOp.getResult(operandIdx).getType());
1335 VectorType resultType = cast<VectorType>(reductionOp.getType());
1336 xegpu::DistributeLayoutAttr sourceLayout =
1337 xegpu::getDistributeLayoutAttr(reductionOp.getSource());
1338
1339 FailureOr<VectorType> sourceDistTypeOrFailure =
1340 getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
1341 if (failed(sourceDistTypeOrFailure))
1342 return rewriter.notifyMatchFailure(
1343 warpOp, "Failed to distribute the source vector type.");
1344 VectorType sourceDistType = sourceDistTypeOrFailure.value();
1345 // Only single dimension distribution is supported.
1346 bool dim0Distributed =
1347 sourceDistType.getShape()[0] != sourceType.getShape()[0];
1348 bool dim1Distributed =
1349 sourceDistType.getShape()[1] != sourceType.getShape()[1];
1350 if (dim0Distributed && dim1Distributed)
1351 return rewriter.notifyMatchFailure(
1352 warpOp, "Expecting source to be distributed in a single dimension.");
1353 int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
1354 if (sourceDistDim == -1)
1355 return rewriter.notifyMatchFailure(
1356 warpOp, "Expecting a distributed source vector.");
1357 bool resultDistributed =
1358 distributedResultType.getNumElements() < resultType.getNumElements();
1359 // If the lane owns all the data required for the reduction (i.e. the
1360 // reduction is fully parallel across lanes), then each lane owns part of
1361 // the result (i.e. the result is distributed). If the reduction requires
1362 // cross-lane shuffling, then the result is shared among all lanes
1363 // (broadcasted). Therefore we expect the following cases:
1364 //
1365 // | Source vector | Reduction dim | Result vector |
1366 // |----------------------|----------------|----------------|
1367 // | dim-0 distributed | 0 | broadcasted |
1368 // | dim-0 distributed | 1 | distributed |
1369 // | dim-1 distributed | 0 | distributed |
1370 // | dim-1 distributed | 1 | broadcasted |
1371
1372 bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
1373 (sourceDistDim == 1 && reductionDim == 0);
1374 if (isReductionLaneLocal && !resultDistributed)
1375 return rewriter.notifyMatchFailure(
1376 warpOp, "Expecting a distributed result for lane-local reduction.");
1377
1378 if (!isReductionLaneLocal && resultDistributed)
1379 return rewriter.notifyMatchFailure(
1380 warpOp,
1381 "Expecting a broadcasted result for non-lane-local reduction.");
1382
1383 // Handle the lane-local reduction case. In this case we fully distribute the
1384 // reduction result.
1385 if (isReductionLaneLocal) {
1386 // Yield the source and acc vectors from the WarpOp.
1387 SmallVector<size_t> newRetIndices;
1388 auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1389 rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
1390 {sourceDistType, distributedResultType}, newRetIndices);
1391 rewriter.setInsertionPointAfter(newWarpOp);
1392 Value result = lowerToVectorReductions(
1393 cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
1394 cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
1395 reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
1396 // Replace the warp op result with the final result.
1397 rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
1398 return success();
1399 }
1400 // For the non-lane-local case, we simply rewrite the MultiReductionOp in
1401 // terms of multiple ReductionOps. The actual distribution is done by the
1402 // WarpOpReduction pattern.
1403 rewriter.setInsertionPointAfter(reductionOp);
1404 Value result = lowerToVectorReductions(
1405 cast<TypedValue<VectorType>>(reductionOp.getSource()),
1406 cast<TypedValue<VectorType>>(reductionOp.getAcc()),
1407 reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
1408 // Replace the warp op result with the final result.
1409 rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
1410 return success();
1411 }
1412};
1413
1414 /// Distribute a `vector.shape_cast` op feeding into the yield op of an
1415 /// enclosing `gpu.warp_execute_on_lane_0` region.
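/// A rough example (layouts and types are illustrative): assuming the source
/// vector has lane_layout [1, 16] and the result layout is a compatible slice
/// of it,
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
///   %0 = "some_def"() : () -> (vector<1x32xf32>)
///   %1 = vector.shape_cast %0 : vector<1x32xf32> to vector<32xf32>
///   gpu.yield %1 : vector<32xf32>
/// }
/// ```
/// is rewritten so that the shape_cast source is yielded from the warp op in
/// its distributed form and the shape_cast is recreated outside:
/// ```
/// %w = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x2xf32>) {
///   %0 = "some_def"() : () -> (vector<1x32xf32>)
///   gpu.yield %0 : vector<1x32xf32>
/// }
/// %1 = vector.shape_cast %w : vector<1x2xf32> to vector<2xf32>
/// ```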
1416struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
1417 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1418 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1419 PatternRewriter &rewriter) const override {
1420 OpOperand *yieldOperand =
1421 getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
1422 if (!yieldOperand)
1423 return failure();
1424 auto shapeCastOp =
1425 cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
1426 unsigned operandNumber = yieldOperand->getOperandNumber();
1427 auto resultDistTy =
1428 cast<VectorType>(warpOp.getResult(operandNumber).getType());
1429 xegpu::DistributeLayoutAttr sourceLayout =
1430 xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
1431 xegpu::DistributeLayoutAttr resultLayout =
1432 xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
1433 if (!sourceLayout || !resultLayout)
1434 return rewriter.notifyMatchFailure(
1435 warpOp,
1436 "the source or result of shape_cast op lacks distribution layout");
1437
1438 // For rank-reducing or rank-increasing shape_cast ops, the lower-rank
1439 // layout must be a slice of the higher-rank layout.
1440 int64_t sourceRank = shapeCastOp.getSourceVectorType().getRank();
1441 int64_t resultRank = shapeCastOp.getResultVectorType().getRank();
1442 if (sourceRank < resultRank && !sourceLayout.isSliceOf(resultLayout))
1443 return rewriter.notifyMatchFailure(
1444 warpOp, "shape_cast is rank reducing but source layout is not a "
1445 "slice of result layout");
1446 if (sourceRank > resultRank && !resultLayout.isSliceOf(sourceLayout))
1447 return rewriter.notifyMatchFailure(
1448 warpOp, "shape_cast is rank increasing but result layout is not a "
1449 "slice of source layout");
1450
1451 FailureOr<VectorType> sourceDistTypeOrFailure =
1452 getDistVecTypeBasedOnLaneLayout(sourceLayout,
1453 shapeCastOp.getSourceVectorType());
1454 if (failed(sourceDistTypeOrFailure))
1455 return rewriter.notifyMatchFailure(
1456 warpOp, "failed to get distributed vector type for source");
1457 VectorType sourceDistType = sourceDistTypeOrFailure.value();
1458 // Create a new warp op that yields the source of the shape_cast op.
1459 SmallVector<size_t> newRetIndices;
1460 auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1461 rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
1462 newRetIndices);
1463 rewriter.setInsertionPointAfter(newWarpOp);
1464 Value source = newWarpOp.getResult(newRetIndices[0]);
1465 // Create a new shape_cast op outside the warp op.
1466 Value newShapeCast = vector::ShapeCastOp::create(
1467 rewriter, shapeCastOp.getLoc(), resultDistTy, source);
1468 rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
1469 newShapeCast);
1470 return success();
1471 }
1472};
1473
1474 /// Sink a memref::ExtractAlignedPointerAsIndexOp feeding into the yield op of
1475 /// an enclosing `gpu.warp_execute_on_lane_0` region. This simply moves the op
1476 /// outside of the warp op.
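/// A rough example (types are illustrative):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
///   %m = "some_def"() : () -> (memref<256xf32>)
///   %p = memref.extract_aligned_pointer_as_index %m : memref<256xf32> -> index
///   gpu.yield %p : index
/// }
/// ```
/// is rewritten (roughly) so that the memref is also yielded from the warp op
/// and the pointer extraction is recreated outside of it:
/// ```
/// %w:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index, memref<256xf32>) {
///   ...
///   gpu.yield %p, %m : index, memref<256xf32>
/// }
/// %p2 = memref.extract_aligned_pointer_as_index %w#1 : memref<256xf32> -> index
/// ```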
1477 struct MemrefExtractAlignedPointerAsIndexDistribution final
1478 : public gpu::WarpDistributionPattern {
1479 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1480 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1481 PatternRewriter &rewriter) const override {
1482 OpOperand *operand = getWarpResult(
1483 warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
1484 if (!operand)
1485 return rewriter.notifyMatchFailure(
1486 warpOp,
1487 "warp result is not a memref::ExtractAlignedPointerAsIndexOp");
1488 auto extractOp =
1489 operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
1490 unsigned operandIdx = operand->getOperandNumber();
1491 SmallVector<size_t> newRetIndices;
1492 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1493 rewriter, warpOp, extractOp.getSource(),
1494 TypeRange{extractOp.getSource().getType()}, newRetIndices);
1495 rewriter.setInsertionPointAfter(newWarpOp);
1496 auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create(
1497 rewriter, newWarpOp.getLoc(), extractOp.getType(),
1498 newWarpOp.getResult(newRetIndices[0]));
1499 Value distributedVal = newWarpOp.getResult(operandIdx);
1500 rewriter.replaceAllUsesWith(distributedVal, newExtractOp.getResult());
1501 return success();
1502 }
1503};
1504
1505 /// Distribute a vector::BitCastOp feeding into the yield op of an enclosing
1506 /// `gpu.warp_execute_on_lane_0` region. The bitcast only affects the innermost
1507 /// dimension of the source/result vectors. An equivalent vector::BitCastOp is
1508 /// created outside of the warp op with the distributed source vector type
1509 /// (computed using the assigned layout).
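/// A rough example (layouts and types are illustrative): with lane_layout
/// [1, 16] on both the source and the result,
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
///   %0 = "some_def"() : () -> (vector<4x32xi8>)
///   %1 = vector.bitcast %0 : vector<4x32xi8> to vector<4x16xi16>
///   gpu.yield %1 : vector<4x16xi16>
/// }
/// ```
/// is rewritten so that the bitcast source is yielded in its distributed form
/// (vector<4x2xi8>) and the bitcast is recreated outside the warp op:
/// ```
/// %1 = vector.bitcast %w : vector<4x2xi8> to vector<4x1xi16>
/// ```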
1510struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
1511 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1512 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1513 PatternRewriter &rewriter) const override {
1514 OpOperand *operand =
1515 getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
1516 if (!operand)
1517 return rewriter.notifyMatchFailure(
1518 warpOp, "warp result is not a vector::BitCast op");
1519 auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>();
1520 unsigned operandIdx = operand->getOperandNumber();
1521 VectorType distributedSourceType =
1522 getDistVecTypeBasedOnLaneLayout(
1523 xegpu::getDistributeLayoutAttr(bitcastOp.getSource()),
1524 bitcastOp.getSourceVectorType())
1525 .value_or(VectorType());
1526 if (!distributedSourceType)
1527 return rewriter.notifyMatchFailure(
1528 bitcastOp, "Failed to distribute the source vector type in "
1529 "vector::BitCast op");
1530 VectorType distributedResultType =
1531 cast<VectorType>(warpOp.getResult(operandIdx).getType());
1532 SmallVector<size_t> newRetIndices;
1533 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1534 rewriter, warpOp, bitcastOp.getSource(),
1535 TypeRange{distributedSourceType}, newRetIndices);
1536 rewriter.setInsertionPointAfter(newWarpOp);
1537 auto newBitcastOp = vector::BitCastOp::create(
1538 rewriter, newWarpOp.getLoc(), distributedResultType,
1539 newWarpOp.getResult(newRetIndices[0]));
1540 Value distributedVal = newWarpOp.getResult(operandIdx);
1541 rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult());
1542 return success();
1543 }
1544};
1545
1546 /// Distribute a vector::TransposeOp feeding into the yield op of an enclosing
1547 /// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are
1548 /// supported. In most cases, the transpose is a no-op because it is entirely
1549 /// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns
1550 /// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local
1551 /// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent
1552 /// vector::TransposeOp outside of the warp op with the distributed source
1553 /// vector type (computed using the assigned layout).
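/// A rough example (layouts and types are illustrative): with source
/// lane_layout [1, 16] and result lane_layout [16, 1],
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16xf32>) {
///   %0 = "some_def"() : () -> (vector<16x32xf32>)
///   %1 = vector.transpose %0, [1, 0] : vector<16x32xf32> to vector<32x16xf32>
///   gpu.yield %1 : vector<32x16xf32>
/// }
/// ```
/// is rewritten so that the transpose source is yielded in its distributed
/// form (vector<16x2xf32>) and a lane-local transpose is created outside:
/// ```
/// %1 = vector.transpose %w, [1, 0] : vector<16x2xf32> to vector<2x16xf32>
/// ```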
1554struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
1555 using gpu::WarpDistributionPattern::WarpDistributionPattern;
1556 LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
1557 PatternRewriter &rewriter) const override {
1558 OpOperand *operand =
1559 getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
1560 if (!operand)
1561 return rewriter.notifyMatchFailure(
1562 warpOp, "warp result is not a vector::Transpose op");
1563 auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
1564 unsigned operandIdx = operand->getOperandNumber();
1565 xegpu::DistributeLayoutAttr sourceLayout =
1566 xegpu::getDistributeLayoutAttr(transposeOp.getVector());
1567 xegpu::DistributeLayoutAttr resultLayout =
1568 xegpu::getDistributeLayoutAttr(transposeOp.getResult());
1569 if (!sourceLayout || !resultLayout)
1570 return rewriter.notifyMatchFailure(
1571 transposeOp,
1572 "the source or result vector of the transpose op lacks layout "
1573 "attribute");
1574 int64_t sourceRank = transposeOp.getSourceVectorType().getRank();
1575 int64_t resultRank = transposeOp.getResultVectorType().getRank();
1576 // Only 2D transposes are supported for now.
1577 // TODO: Support nD transposes.
1578 if (sourceRank != 2 || resultRank != 2)
1579 return rewriter.notifyMatchFailure(
1580 transposeOp, "the source or result vector of the transpose op "
1581 "does not have 2D layout");
1582 ArrayRef<int64_t> perm = transposeOp.getPermutation();
1583 // Result layout must be a transpose of source layout.
1584 if (!resultLayout.isTransposeOf(sourceLayout, perm))
1585 return rewriter.notifyMatchFailure(
1586 transposeOp,
1587 "the source or result vector layouts must be 2D transposes of each "
1588 "other");
1589 FailureOr<VectorType> distributedSourceTypeOrFailure =
1590 getDistVecTypeBasedOnLaneLayout(sourceLayout,
1591 transposeOp.getSourceVectorType());
1592 if (failed(distributedSourceTypeOrFailure))
1593 return rewriter.notifyMatchFailure(
1594 transposeOp, "Failed to distribute the source vector type in "
1595 "vector::Transpose op");
1596 SmallVector<size_t> newRetIndices;
1597 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
1598 rewriter, warpOp, transposeOp.getVector(),
1599 TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices);
1600 rewriter.setInsertionPointAfter(newWarpOp);
1601 auto newTransposeOp = vector::TransposeOp::create(
1602 rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]),
1603 perm);
1604 Value distributedVal = newWarpOp.getResult(operandIdx);
1605 rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult());
1606 return success();
1607 }
1608};
1609
1610} // namespace
1611
1612namespace {
1613 struct XeGPUSubgroupDistributePass final
1614 : public xegpu::impl::XeGPUSubgroupDistributeBase<
1615 XeGPUSubgroupDistributePass> {
1616 void runOnOperation() override;
1617};
1618} // namespace
1619
1620 void xegpu::populateXeGPUSubgroupDistributePatterns(
1621 RewritePatternSet &patterns) {
1622 patterns.add<CreateNdDescDistribution, StoreNdDistribution,
1623 LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
1624 GpuBarrierDistribution, VectorMultiReductionDistribution,
1625 LoadDistribution, StoreDistribution, VectorTransposeDistribution,
1626 VectorBitcastDistribution, LoadMatrixDistribution,
1627 StoreMatrixDistribution,
1628 MemrefExtractAlignedPointerAsIndexDistribution>(
1629 patterns.getContext(),
1630 /*pattern benefit=*/regularPatternBenefit);
1631 patterns.add<VectorShapeCastDistribution>(
1632 patterns.getContext(),
1633 /*pattern benefit=*/highPatternBenefit);
1634}
1635
1636 void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
1637 RewritePatternSet &patterns) {
1638 patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext());
1639}
1640
1641void XeGPUSubgroupDistributePass::runOnOperation() {
1642 // Step 1: Attach layouts to op operands.
1643 // TODO: The following assumptions are made:
1644 // 1) There are no layout conflicts.
1645 // 2) Any existing layout attributes attached to the operands are ignored.
1646 Operation *op = getOperation();
1647 op->walk([&](Operation *op) {
1648 for (OpOperand &operand : op->getOpOperands()) {
1649 // Layouts are needed for vector type only.
1650 if (!isa<VectorType>(operand.get().getType()))
1651 continue;
1652 if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
1653 continue;
1654
1655 auto layout = xegpu::getDistributeLayoutAttr(operand.get());
1656 if (!layout) {
1657 op->emitError("Could not find layout attribute for operand ")
1658 << operand.getOperandNumber() << " of operation " << op->getName();
1659 signalPassFailure();
1660 return;
1661 }
1662 xegpu::setDistributeLayoutAttr(operand, layout);
1663 }
1664 });
1665 // Step 2: Move all operations of a GPU function inside
1666 // gpu.warp_execute_on_lane_0 operation.
1667 {
1668 RewritePatternSet patterns(&getContext());
1669 xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
1670
1671 if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
1672 signalPassFailure();
1673 return;
1674 }
1675 // At this point, we have moved the entire function body inside the
1676 // warpOp. Now move any scalar uniform code outside of the warpOp (like
1677 // GPU index ops, scalar constants, etc.). This will simplify the
1678 // later lowering and avoid custom patterns for these ops.
1679 getOperation()->walk([&](Operation *op) {
1680 if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
1681 vector::moveScalarUniformCode(warpOp);
1682 });
1683 }
1684 // Step 3: Apply subgroup to workitem distribution patterns.
1685 RewritePatternSet patterns(&getContext());
1686 xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
1687 // distributionFn is used by vector distribution patterns to determine the
1688 // distributed vector type for a given vector value. In the XeGPU subgroup
1689 // distribution context, we compute this based on the lane layout.
1690 auto distributionFn = [](Value val) {
1691 VectorType vecType = dyn_cast<VectorType>(val.getType());
1692 int64_t vecRank = vecType ? vecType.getRank() : 0;
1693 if (vecRank == 0)
1694 return AffineMap::get(val.getContext());
1695 // Get the layout of the vector type.
1696 xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
1697 // If no layout is specified, that means no distribution.
1698 if (!layout)
1699 return AffineMap::getMultiDimMapWithTargets(vecRank, {},
1700 val.getContext());
1701 // Expecting vector and layout rank to match.
1702 assert(layout.getRank() == vecRank &&
1703 "Expecting vector and layout rank to match");
1704 // A dimension is distributed only if the layout assigns multiple lanes
1705 // to that dimension and the shape can be evenly divided across those
1706 // lanes.
1707 SmallVector<unsigned int> distributedDims;
1708 for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
1709 if (v > 1 && vecType.getShape()[i] % v == 0)
1710 distributedDims.push_back(i);
1711 }
1712 return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
1713 val.getContext());
1714 };
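// For example, a vector<16x32xf32> value with lane_layout [1, 16] yields
// distributedDims = {1} and the map (d0, d1) -> (d1), i.e. only the second
// dimension is distributed across lanes.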
1715 // TODO: shuffleFn is not used.
1716 auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
1717 int64_t warpSz) { return Value(); };
1718
1719 auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
1720 vector::CombiningKind kind, uint32_t size) {
1721 // First reduce within each lane to get the per-lane reduction value.
1722 Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
1723 // Parallel reduction using butterfly shuffles.
1724 for (uint64_t i = 1; i < size; i <<= 1) {
1725 Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
1726 /*width=*/size,
1727 /*mode=*/gpu::ShuffleMode::XOR)
1728 .getShuffleResult();
1729 laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
1730 }
1731 return laneVal;
1732 };
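// For example, with size = 32 the loop above performs XOR shuffles at offsets
// 1, 2, 4, 8 and 16, so after five combine steps every lane holds the full
// subgroup-wide reduction value.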
1733
1734 vector::populateDistributeReduction(
1735 patterns, warpReduction,
1736 /*pattern benefit=*/regularPatternBenefit);
1737
1738 vector::populatePropagateWarpVectorDistributionPatterns(
1739 patterns, distributionFn, shuffleFn,
1740 /*pattern benefit=*/regularPatternBenefit);
1741 if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
1742 signalPassFailure();
1743 return;
1744 }
1745
1746 // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
1747 // due to tensor desc type mismatches created by using upstream distribution
1748 // patterns (scf.for). This cleanup should only be done if all the ops are
1749 // distributed successfully; if some ops are still not distributed and remain
1750 // inside any WarpExecuteOnLane0Op, we skip this simplification step to avoid
1751 // breaking the IR.
1752 bool foundWarpOp = false;
1753 getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
1754 // Look for WarpOps that are not trivially dead.
1755 if (isOpTriviallyDead(warpOp))
1756 return WalkResult::advance();
1757 foundWarpOp = true;
1758 return WalkResult::interrupt();
1759 });
1760 if (foundWarpOp)
1761 return;
1762
1763 getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
1764 // We are only interested in UnrealizedConversionCastOps that were added
1765 // to resolve SIMT type mismatches.
1766 if (!op->getAttr(resolveSIMTTypeMismatch))
1767 return WalkResult::skip();
1768
1769 Value input = op.getOperand(0);
1770 Value output = op.getResult(0);
1771
1772 // Both input and output must have tensor descriptor types.
1773 xegpu::TensorDescType inputDescType =
1774 mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
1775 xegpu::TensorDescType outputDescType =
1776 mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
1777 assert(inputDescType && outputDescType &&
1778 "Unrealized conversion cast must have tensor descriptor types");
1779
1780 // tensor_desc<shape, layout> -> tensor_desc<shape> conversions. These
1781 // occur inside the scf.for body to resolve the block argument type to the
1782 // SIMT type.
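// For example (roughly, with #layout standing for some xegpu layout
// attribute):
//   %0 = builtin.unrealized_conversion_cast %iter_arg
//        : !xegpu.tensor_desc<16x16xf32, #layout> to !xegpu.tensor_desc<16x16xf32>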
1783 if (inputDescType.getLayout()) {
1784 auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
1785 if (argument) {
1786 argument.setType(output.getType());
1787 output.replaceAllUsesWith(argument);
1788 if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
1789 argument.getOwner()->getParentOp())) {
1790 auto result = loopOp.getTiedLoopResult(argument);
1791 result.setType(output.getType());
1792 }
1793 }
1794 }
1795
1796 // tensor_desc<shape> -> tensor_desc<shape, layout> conversions. These
1797 // occur at the yield op of the scf.for body to go back from the SIMT type
1798 // to the original type.
1799 if (outputDescType.getLayout())
1800 output.replaceAllUsesWith(input);
1801
1802 if (op->use_empty())
1803 op->erase();
1804 return WalkResult::advance();
1805 });
1806}