1 //===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
18 #include "mlir/IR/AffineMap.h"
19 #include "mlir/IR/Attributes.h"
20 #include "mlir/IR/Builders.h"
22 #include "mlir/IR/BuiltinOps.h"
23 #include "mlir/IR/BuiltinTypes.h"
24 #include "mlir/IR/Operation.h"
25 #include "mlir/IR/PatternMatch.h"
26 #include "mlir/IR/TypeRange.h"
27 #include "mlir/IR/Value.h"
28 #include "mlir/IR/Visitors.h"
30 #include "mlir/Support/LLVM.h"
34 #include "llvm/ADT/ArrayRef.h"
35 #include "llvm/ADT/STLExtras.h"
36 #include "llvm/ADT/SmallVector.h"
37 
38 namespace mlir {
39 namespace xegpu {
40 #define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
41 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
42 } // namespace xegpu
43 } // namespace mlir
44 
45 #define DEBUG_TYPE "xegpu-subgroup-distribute"
46 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
47 
48 using namespace mlir;
49 
50 static const char *const resolveSIMTTypeMismatch =
51  "resolve_simt_type_mismatch"; // Attribute name for identifying
52  // UnrealizedConversionCastOp added to resolve
53  // SIMT type mismatches.
54 
55 namespace {
56 
57 //===----------------------------------------------------------------------===//
58 // SIMT Distribution Patterns
59 //===----------------------------------------------------------------------===//
60 
61 /// Helper function to get distributed vector type for a source vector type
62 /// according to the lane_layout. We simply divide each dimension of the
63 /// original vector shape by the corresponding lane_layout dimension. If
64 /// array_length > 1, it is appended to the front of the distributed shape.
65 /// NOTE: This is the vector type that will be returned by the
66 /// gpu.warp_execute_on_lane0 op.
67 ///
68 /// Examples:
69 /// | original vector shape | lane_layout | distributed vector shape |
70 /// |-----------------------|-------------|--------------------------|
71 /// | 32x16 | [1, 16] | 32x1 |
72 /// | 32x16 | [2, 8] | 16x2 |
73 /// | 2x32x16 | [1, 16] | 2x32x1 |
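/// As an illustrative sketch (shapes and layout values are assumptions, not
/// taken from a real kernel), a 32x16 vector distributed with lane_layout
/// [1, 16] would surface at the warp op boundary roughly as:
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid) -> (vector<32x1xf32>) {
///   ...
///   gpu.yield %v : vector<32x16xf32>
/// }
/// ```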
74 static FailureOr<VectorType>
75 getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
76  VectorType originalType) {
77  if (!layout)
78  return failure();
79 
80  auto laneLayout = layout.getLaneLayout().asArrayRef();
81  assert(originalType.getShape().size() >= laneLayout.size() &&
82  "Rank of the original vector type should be greater or equal to the "
83  "size of the lane layout to distribute the vector type.");
84  SmallVector<int64_t> distributedShape(originalType.getShape());
85  // Only distribute the last `laneLayout.size()` dimensions. The remaining
86  // dimensions are not distributed.
87  unsigned distributionStart = originalType.getRank() - laneLayout.size();
88  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
89  if (i < distributionStart)
90  continue;
91 
92  // Check if the dimension can be distributed evenly.
93  if (dim % laneLayout[i - distributionStart] != 0)
94  return failure();
95  distributedShape[i] = dim / laneLayout[i - distributionStart];
96  }
97  return VectorType::get(distributedShape, originalType.getElementType());
98 }
99 
100 /// Helper function to resolve types if the distributed type out of
101 /// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
102 /// Example 1:
103 /// distributed type: vector<8x1xf32>
104 /// expected type: vector<8xf32>
105 /// resolved using,
106 /// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
107 /// Example 2:
108 /// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
109 /// expected type: xegpu.tensor_desc<8x16xf32>
110 /// resolved using,
111 /// %0 = unrealized_conversion_cast %1 :
112 /// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
113 /// xegpu.tensor_desc<8x16xf32>
114 template <typename T>
115 static Value resolveDistributedTy(Value orig, T expected,
116  PatternRewriter &rewriter) {
117  // If orig and expected types are the same, return orig.
118  if (orig.getType() == expected)
119  return orig;
120  // If orig is a vector type, create a shape cast op to reconcile the types.
121  if (isa<VectorType>(orig.getType())) {
122  auto castOp =
123  vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
124  return castOp.getResult();
125  }
126  // If orig is a tensor descriptor type, create an unrealized conversion cast
127  // op to reconcile the types.
128  if (isa<xegpu::TensorDescType>(orig.getType())) {
129  auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
130  expected, orig);
131  castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
132  return castOp.getResult(0);
133  }
134  llvm_unreachable("Unsupported type for reconciliation");
135  return orig;
136 }
137 
138 /// Helper function to check if the layout is packed. A layout is packed if it
139 /// is 2D and lane_data[0] != 1 (i.e., data is packed along the column dimension).
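/// For example, a layout written in the style of the dpas example below, such
/// as the following (illustrative values), would be treated as packed because
/// lane_data[0] == 2:
/// ```
/// #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
/// ```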
140 static bool hasPackedLayout(xegpu::LayoutAttr layout) {
141  if (layout == xegpu::LayoutAttr())
142  return false;
143  DenseI32ArrayAttr laneData = layout.getLaneData();
144  if (!laneData || laneData.size() != 2)
145  return false;
146  return laneData.asArrayRef()[0] != 1;
147 }
148 
149 /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
150 /// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
151 /// contained within a WarpExecuteOnLane0Op.
152 /// Example:
153 ///
154 /// ```
155 /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
156 /// ...
157 /// ...
158 /// gpu.return %result: vector<8x16xf32>
159 /// }
160 /// ```
161 /// To
162 /// ```
163 /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
164 /// %laneid = gpu.lane_id : index
165 /// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
166 /// ...
167 /// ...
168 /// gpu.yield %result: vector<8x16xf32>
169 /// }
170 /// gpu.return %0
171 /// }
/// ```
172 struct MoveFuncBodyToWarpExecuteOnLane0
173  : public OpRewritePattern<gpu::GPUFuncOp> {
174  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
175  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
176  PatternRewriter &rewriter) const override {
177  // If the function only contains a single void return, skip.
178  if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
179  return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
180  }))
181  return failure();
182  // If the function already moved inside a warp_execute_on_lane0, skip.
183  if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
184  return isa<gpu::WarpExecuteOnLane0Op>(op);
185  }))
186  return failure();
187  // Create a new function with the same signature and same attributes.
188  SmallVector<Type> workgroupAttributionsTypes =
189  llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
190  [](BlockArgument arg) { return arg.getType(); });
191  SmallVector<Type> privateAttributionsTypes =
192  llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
193  [](BlockArgument arg) { return arg.getType(); });
194  auto newGpuFunc = gpu::GPUFuncOp::create(
195  rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
196  gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
197  privateAttributionsTypes);
198  newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
199  // Create a WarpExecuteOnLane0Op with same arguments and results as the
200  // original gpuFuncOp.
201  rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
202  auto laneId = gpu::LaneIdOp::create(
203  rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
204  /** upperBound = **/ mlir::IntegerAttr());
205  ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
206  auto warpOp = gpu::WarpExecuteOnLane0Op::create(
207  rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
208  xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
209  newGpuFunc.getArgumentTypes());
210  Block &warpBodyBlock = warpOp.getBodyRegion().front();
211  // Replace the ReturnOp of the original gpu function with a YieldOp.
212  auto origReturnOp =
213  cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
214  rewriter.setInsertionPointAfter(origReturnOp);
215  gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
216  origReturnOp.getOperands());
217  rewriter.eraseOp(origReturnOp);
218  // Move the original function body to the WarpExecuteOnLane0Op body.
219  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
220  warpOp.getBodyRegion().begin());
221  rewriter.eraseBlock(&warpBodyBlock);
222  // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
223  rewriter.setInsertionPointAfter(warpOp);
224  gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
225  rewriter.replaceOp(gpuFuncOp, newGpuFunc);
226  return success();
227  }
228 };
229 
230 /// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
231 /// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
232 /// still contain the original op that will not be used by the yield op (and
233 /// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
234 /// arguments. Tensor descriptor shape is not distributed because it is a
235 /// uniform value across all work items within the subgroup. However, the
236 /// layout information is dropped in the new tensor descriptor type.
237 ///
238 /// Example:
239 ///
240 /// ```
241 /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
242 /// %r = gpu.warp_execute_on_lane_0(%laneid) ->
243 /// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
244 /// ...
245 /// %td = xegpu.create_nd_tdesc %arg0[0, 0]
246 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
247 /// vector.yield %td
248 /// }
249 /// ```
250 /// To
251 /// ```
252 /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
253 /// ...
254 /// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
255 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
256 /// vector.yield %arg0, %dead
257 /// }
258 /// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
259 /// -> !xegpu.tensor_desc<4x8xf32>
260 ///
261 /// ```
262 struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
263  using gpu::WarpDistributionPattern::WarpDistributionPattern;
264  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
265  PatternRewriter &rewriter) const override {
266  OpOperand *operand =
267  getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
268  if (!operand)
269  return rewriter.notifyMatchFailure(
270  warpOp, "warp result is not a xegpu::CreateNdDesc op");
271  auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
272  unsigned operandIdx = operand->getOperandNumber();
273 
274  xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
275  if (!layout)
276  return rewriter.notifyMatchFailure(
277  descOp, "the tensor descriptor lacks layout attribute");
278 
279  SmallVector<size_t> newRetIndices;
280  rewriter.setInsertionPoint(warpOp);
281  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
282  rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
283  /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
284 
285  SmallVector<Value> newDescOperands = llvm::map_to_vector(
286  newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
287  rewriter.setInsertionPointAfter(newWarpOp);
288  xegpu::TensorDescType distributedTensorDescTy =
289  descOp.getType().dropLayouts(); // Distributed tensor descriptor type
290  // does not contain layout info.
291  Value newDescOp = xegpu::CreateNdDescOp::create(
292  rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
293  descOp->getAttrs());
294 
295  Value distributedVal = newWarpOp.getResult(operandIdx);
296  // Resolve the distributed type to the expected type.
297  newDescOp =
298  resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
299  rewriter.replaceAllUsesWith(distributedVal, newDescOp);
300  return success();
301  }
302 };
303 
304 /// Distribute a store_nd op at the end of enclosing
305 /// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
306 /// through the warp op interface, they would be propagated as returned values.
307 /// Source vector is distributed based on lane layout. Appropriate cast ops are
308 /// inserted if the distributed types do not match the expected xegpu SIMT types.
309 ///
310 /// Example:
311 ///
312 /// ```
313 /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
314 /// gpu.warp_execute_on_lane_0(%laneid) -> () {
315 /// ...
316 /// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
317 /// !xegpu.tensor_desc<4x8xf32, #layout0>
318 /// }
319 /// ```
320 /// To
321 /// ```
322 /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
323 /// !xegpu.tensor_desc<4x8xf32, #layout0>) {
324 /// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
325 /// #layout0>
326 /// }
327 /// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
328 /// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
329 /// #layout0>
330 /// -> !xegpu.tensor_desc<4x8xf32>
331 /// xegpu.store_nd %0, %1: vector<4xf32>,
332 /// !xegpu.tensor_desc<4x8xf32>
333 ///
334 /// ```
335 struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
336  using gpu::WarpDistributionPattern::WarpDistributionPattern;
337  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
338  PatternRewriter &rewriter) const override {
339  gpu::YieldOp yield = warpOp.getTerminator();
340  Operation *lastNode = yield->getPrevNode();
341  auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
342  if (!storeOp)
343  return failure();
344 
345  int64_t offsetSize = static_cast<int64_t>(storeOp.getOffsets().size());
346  if ((offsetSize != 0) || storeOp.getConstOffsetsAttr())
347  return failure();
348 
349  xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
350  xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
351  if (!layout)
352  return rewriter.notifyMatchFailure(
353  storeOp, "the source tensor descriptor lacks layout attribute");
354 
355  FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
356  getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
357  if (failed(distributedTypeByWarpOpOrFailure))
358  return rewriter.notifyMatchFailure(storeOp,
359  "Failed to distribute the type");
360  VectorType distributedTypeByWarpOp =
361  distributedTypeByWarpOpOrFailure.value();
362 
363  SmallVector<size_t> newRetIndices;
364  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
365  rewriter, warpOp,
366  /* new yielded values = */
367  ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
368  /* new yielded types = */
369  TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
370  newRetIndices);
371  // Create a new store op outside the warp op with the distributed vector
372  // type. Tensor descriptor is not distributed.
373  rewriter.setInsertionPointAfter(newWarpOp);
374  SmallVector<Value> newStoreOperands;
375 
376  // For the value operand, there can be a mismatch between the vector type
377  // distributed by the warp op and (xegpu-specific) distributed type
378  // supported by the store op. Type mismatches must be resolved using
379  // appropriate cast ops.
380  FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
381  xegpu::getDistributedVectorType(storeOp.getTensorDescType());
382  if (failed(storeNdDistributedValueTyOrFailure))
383  return rewriter.notifyMatchFailure(
384  storeOp, "Failed to get distributed vector type for the store op");
385  newStoreOperands.push_back(resolveDistributedTy(
386  newWarpOp.getResult(newRetIndices[0]),
387  storeNdDistributedValueTyOrFailure.value(), rewriter));
388  // For the tensor descriptor operand, the layout attribute is dropped after
389  // distribution. Types need to be resolved in this case as well.
390  xegpu::TensorDescType distributedTensorDescTy =
391  storeOp.getTensorDescType().dropLayouts();
392  newStoreOperands.push_back(
393  resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
394  distributedTensorDescTy, rewriter));
395 
396  auto newStoreOp =
397  xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
398  newStoreOperands, storeOp->getAttrs());
399  xegpu::removeLayoutAttrs(newStoreOp);
400  rewriter.eraseOp(storeOp);
401  return success();
402  }
403 };
404 
405 /// Distribute a load_nd op feeding into vector.yield op for the enclosing
406 /// `gpu.warp_execute_on_lane_0` and put it after the warp op.
407 /// The warp op will still contain the original op that will not be used by
408 /// the yield op (and should be cleaned up later). The yield op will
409 /// bypass the load's arguments. Only the loaded vector is distributed
410 /// according to the lane layout; the tensor descriptor type is not
411 /// distributed. Appropriate cast ops are inserted if the distributed types do
412 /// not match the expected xegpu SIMT types.
413 ///
414 /// Example:
415 ///
416 /// ```
417 /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
418 /// %r = gpu.warp_execute_on_lane_0(%laneid) ->
419 /// (vector<4x1xf32>) {
420 /// ...
421 /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
422 ///   -> vector<4x8xf32>
424 /// gpu.yield %ld
425 /// }
426 /// ```
427 /// To
428 /// ```
429 /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
430 /// !xegpu.tensor_desc<4x8xf32, #layout0>) {
431 /// ...
432 /// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
433 ///   -> vector<4x8xf32>
/// gpu.yield %dead, %arg0
434 /// }
435 /// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
436 /// #layout0> -> !xegpu.tensor_desc<4x8xf32>
437 /// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
438 /// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
439 ///
440 /// ```
441 struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
442  using gpu::WarpDistributionPattern::WarpDistributionPattern;
443  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
444  PatternRewriter &rewriter) const override {
445  OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
446  if (!isa<xegpu::LoadNdOp>(op))
447  return false;
448  // Make sure the same load op is the last operation in the warp op body.
449  // This ensures that the load op is not sunk earlier, violating any
450  // barrier synchronizations.
451  gpu::YieldOp yield = warpOp.getTerminator();
452  return yield->getPrevNode() == op;
453  });
454 
455  if (!operand)
456  return rewriter.notifyMatchFailure(
457  warpOp, "warp result is not a xegpu::LoadNd op");
458 
459  auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
460 
461  int64_t offsetSize = static_cast<int64_t>(loadOp.getOffsets().size());
462  if ((offsetSize != 0) || loadOp.getConstOffsetsAttr())
463  return failure();
464 
465  xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
466  xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
467  if (!layout)
468  return rewriter.notifyMatchFailure(
469  loadOp, "the source tensor descriptor lacks layout attribute");
470 
471  unsigned operandIdx = operand->getOperandNumber();
472  VectorType distributedTypeByWarpOp =
473  cast<VectorType>(warpOp.getResult(operandIdx).getType());
474 
475  SmallVector<size_t> newRetIndices;
476  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
477  rewriter, warpOp,
478  /* new yielded values = */ loadOp.getTensorDesc(),
479  /* new yielded types = */ tensorDescTy, newRetIndices);
480 
481  // Create a new load op outside the warp op with the distributed vector
482  // type.
483  rewriter.setInsertionPointAfter(newWarpOp);
484  FailureOr<VectorType> loadNdDistValueTyOrFailure =
485  xegpu::getDistributedVectorType(loadOp.getTensorDescType());
486  if (failed(loadNdDistValueTyOrFailure))
487  return rewriter.notifyMatchFailure(
488  loadOp, "Failed to get distributed vector type for the load op");
489  xegpu::TensorDescType distributedTensorDescTy =
490  loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
491  // descriptor type does not
492  // contain layout info.
493  auto newLoadOp = xegpu::LoadNdOp::create(
494  rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
495  resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
496  distributedTensorDescTy, rewriter),
497  loadOp->getAttrs());
498  xegpu::removeLayoutAttrs(newLoadOp);
499  // Set the packed attribute if the layout requires it.
500  newLoadOp.setPacked(hasPackedLayout(layout));
501  Value distributedVal = newWarpOp.getResult(operandIdx);
502  // There can be a conflict between the vector type distributed by the
503  // warp op and (xegpu-specific) distributed type supported by the load
504  // op. Resolve these mismatches by inserting a cast.
505  Value tyResolvedVal = resolveDistributedTy(
506  newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
507  rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
508  return success();
509  }
510 };
511 
512 /// Distribute a dpas op feeding into vector.yield op for the enclosing
513 /// `gpu.warp_execute_on_lane_0` and put it after the warp op.
514 /// The warp op will still contain the original op that will not be used by
515 /// the yield op (and should be cleaned up later). The yield op will
516 /// bypass the dpas's arguments. Appropriate cast ops are inserted if the
517 /// distributed types do not match the expected xegpu SIMT types.
518 /// Example:
519 /// ```
520 /// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
521 /// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
522 /// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
523 /// %r = gpu.warp_execute_on_lane_0(%laneid) ->
524 /// (vector<8x1xf32>) {
525 /// ...
526 /// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
527 /// vector<8x16xf32>
528 /// gpu.yield %dpas
529 /// }
530 /// ```
531 /// To
532 /// ```
533 /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
534 /// vector<8x1xf16>, vector<16x1xf16>) {
535 /// ...
536 /// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
537 /// -> vector<8x16xf32>
538 /// gpu.yield %dead, %arg0, %arg1
539 /// }
540 /// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
541 /// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
542 /// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
543 /// vector<8xf32>
544 /// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
545 /// ```
546 struct DpasDistribution final : public gpu::WarpDistributionPattern {
547  using gpu::WarpDistributionPattern::WarpDistributionPattern;
548  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
549  PatternRewriter &rewriter) const override {
550  OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
551  if (!operand)
552  return rewriter.notifyMatchFailure(warpOp,
553  "warp result is not a xegpu::Dpas op");
554 
555  auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
556  unsigned operandIdx = operand->getOperandNumber();
557  std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
558  std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
559  std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
560 
561  xegpu::LayoutAttr layoutA =
562  dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
563  xegpu::LayoutAttr layoutB =
564  dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
565  xegpu::LayoutAttr layoutOut =
566  dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
567  if (!layoutA || !layoutB || !layoutOut)
568  return rewriter.notifyMatchFailure(
569  dpasOp,
570  "the xegpu::Dpas op lacks layout attribute for A, B or output");
571 
572  FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
573  getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
574  FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
575  getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
576  FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
577  getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
578  if (failed(distLhsTypeByWarpOpOrFailure) ||
579  failed(distRhsTypeByWarpOpOrFailure) ||
580  failed(distResultTypeByWarpOpOrFailure))
581  return rewriter.notifyMatchFailure(
582  dpasOp,
583  "Failed to distribute the A, B or output types in xegpu::Dpas op");
584 
585  llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
586  dpasOp.getRhs()};
587  llvm::SmallVector<Type, 3> newYieldTypes{
588  distLhsTypeByWarpOpOrFailure.value(),
589  distRhsTypeByWarpOpOrFailure.value()};
590  // Dpas acc operand is optional.
591  if (dpasOp.getAcc()) {
592  newYieldValues.push_back(dpasOp.getAcc());
593  newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
594  }
595  // Create a new warp op without the dpas.
596  SmallVector<size_t> newRetIndices;
597  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
598  rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
599 
600  FailureOr<VectorType> expectedDistLhsTyOrFailure =
601  xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
602  FailureOr<VectorType> expectedDistRhsTyOrFailure =
603  xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
604  FailureOr<VectorType> expectedDistResultTyOrFailure =
605  xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
606  if (failed(expectedDistLhsTyOrFailure) ||
607  failed(expectedDistRhsTyOrFailure) ||
608  failed(expectedDistResultTyOrFailure))
609  return rewriter.notifyMatchFailure(
610  dpasOp,
611  "Failed to get distributed vector type for the dpas operands.");
612  // Create a new dpas op outside the warp op.
613  rewriter.setInsertionPointAfter(newWarpOp);
614  SmallVector<Value> newDpasOperands;
615  SmallVector<VectorType> newDpasOperandExpectedTypes;
616 
617  // Resolve the distributed types with the original types.
618  newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
619  newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
620  VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
621  if (dpasOp.getAcc())
622  newDpasOperandExpectedTypes.push_back(distributedResultTy);
623 
624  for (unsigned i = 0; i < newRetIndices.size(); i++) {
625  newDpasOperands.push_back(
626  resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
627  newDpasOperandExpectedTypes[i], rewriter));
628  }
629  auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
630  distributedResultTy, newDpasOperands,
631  dpasOp->getAttrs());
632  xegpu::removeLayoutAttrs(newDpasOp);
633  Value distributedVal = newWarpOp.getResult(operandIdx);
634  // Resolve the output type.
635  Value typeResolved =
636  resolveDistributedTy(newDpasOp.getResult(),
637  distResultTypeByWarpOpOrFailure.value(), rewriter);
638  rewriter.replaceAllUsesWith(distributedVal, typeResolved);
639  return success();
640  }
641 };
642 
643 /// Sink an update_nd_offset op feeding into yield op of an enclosing
644 /// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
645 /// original op that will not be used by the yield op (and should be cleaned
646 /// up later). The yield op will bypass the updateOp's arguments. The tensor
647 /// descriptor type is not distributed. Appropriate cast ops are inserted if
648 /// the distributed types do not match the expected xegpu SIMT types.
649 /// Example:
650 /// ```
651 /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
652 /// %r = gpu.warp_execute_on_lane_0(%laneid) ->
653 /// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
654 /// ...
655 /// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
656 /// !xegpu.tensor_desc<4x8xf32, #layout0>
657 /// gpu.yield %update
658 /// }
659 /// ...
660 /// ```
661 /// To
662 /// ```
663 /// %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (
664 /// !xegpu.tensor_desc<4x8xf32, #layout0>,
665 /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
666 /// ...
667 /// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
668 /// !xegpu.tensor_desc<4x8xf32, #layout0>
669 /// gpu.yield %dead, %arg0, %c32, %c16
670 /// }
671 /// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
672 /// #layout0> -> !xegpu.tensor_desc<4x8xf32>
673 /// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]:
674 /// !xegpu.tensor_desc<4x8xf32>
675 /// ...
676 /// ```
677 struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
678  using gpu::WarpDistributionPattern::WarpDistributionPattern;
679  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
680  PatternRewriter &rewriter) const override {
681  OpOperand *operand =
682  getWarpResult(warpOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
683  if (!operand)
684  return rewriter.notifyMatchFailure(
685  warpOp, "warp result is not a xegpu::UpdateNdOffset op");
686  auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
687  unsigned operandIdx = operand->getOperandNumber();
688 
689  SmallVector<size_t> newRetIndices;
690  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
691  rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
692  newRetIndices);
693  rewriter.setInsertionPointAfter(newWarpOp);
694  // new update op does not have layout attribute.
695  xegpu::TensorDescType distributedTensorDescTy =
696  updateOp.getTensorDescType().dropLayouts();
697  SmallVector<Value> newUpdateOperands =
698  llvm::map_to_vector(newRetIndices, [&](size_t i) {
699  // For the tensor descriptor operand, the layout attribute is
700  // dropped after distribution. Types need to be resolved in this
701  // case.
702  if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
703  return resolveDistributedTy(newWarpOp.getResult(i),
704  distributedTensorDescTy, rewriter);
705  }
706  return newWarpOp.getResult(i);
707  });
708  // Create a new update op outside the warp op.
709  auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
710  rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
711  newUpdateOperands, updateOp->getAttrs());
712  xegpu::removeLayoutAttrs(newUpdateOp);
713  Value distributedVal = newWarpOp.getResult(operandIdx);
714  // Resolve the distributed type with the original type.
715  Value typeResolved = resolveDistributedTy(
716  newUpdateOp.getResult(), distributedVal.getType(), rewriter);
717  rewriter.replaceAllUsesWith(distributedVal, typeResolved);
718  return success();
719  }
720 };
721 
722 /// Distribute a prefetch_nd op at the end of enclosing
723 /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
724 /// through the warp op interface, they would be propagated as returned values.
725 /// Tensor descriptor shape is not distributed because it is a uniform value
726 /// across all work items within the subgroup. Appropriate cast ops are inserted
727 /// if the distributed types do not match the expected xegpu SIMT types.
728 ///
729 /// Example:
730 ///
731 /// ```
732 /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
733 /// gpu.warp_execute_on_lane_0(%laneid) -> () {
734 /// ...
735 /// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
736 /// }
737 /// ```
738 /// To
739 /// ```
740 /// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
741 /// !xegpu.tensor_desc<4x8xf32, #layout0>) {
742 /// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
743 /// }
744 /// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
745 /// #layout0> -> !xegpu.tensor_desc<4x8xf32>
746 /// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32>
747 ///
748 /// ```
749 struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
750  using gpu::WarpDistributionPattern::WarpDistributionPattern;
751  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
752  PatternRewriter &rewriter) const override {
753  gpu::YieldOp yield = warpOp.getTerminator();
754  Operation *lastNode = yield->getPrevNode();
755  auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
756  if (!prefetchOp)
757  return failure();
758 
759  int64_t offsetSize = static_cast<int64_t>(prefetchOp.getOffsets().size());
760  if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr())
761  return failure();
762 
763  xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
764  if (!layout)
765  return rewriter.notifyMatchFailure(
766  prefetchOp, "the source tensor descriptor lacks layout attribute");
767 
768  SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
769  SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
770  SmallVector<size_t> newRetIndices;
771  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
772  rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
773  // Create a new prefetch op outside the warp op with updated tensor
774  // descriptor type. The source tensor descriptor requires type resolution.
775  xegpu::TensorDescType newTensorDescTy =
776  prefetchOp.getTensorDescType().dropLayouts();
777  rewriter.setInsertionPointAfter(newWarpOp);
778  SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
779  newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
780  xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
781  newPrefetchOperands, prefetchOp->getAttrs());
782  xegpu::removeLayoutAttrs(prefetchOp);
783  rewriter.eraseOp(prefetchOp);
784  return success();
785  }
786 };
787 
788 /// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
789 /// region. This will simply move the barrier op outside of the warp op.
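/// Example (a sketch following the conventions of the patterns above):
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
///   ...
///   gpu.barrier
/// }
/// ```
/// To
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
///   ...
/// }
/// gpu.barrier
/// ```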
790 struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
791  using gpu::WarpDistributionPattern::WarpDistributionPattern;
792  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
793  PatternRewriter &rewriter) const override {
794  gpu::YieldOp yield = warpOp.getTerminator();
795  Operation *lastNode = yield->getPrevNode();
796  // The last node must be a gpu::BarrierOp.
797  auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
798  if (!barrierOp)
799  return failure();
800  // Move the barrier op outside of the warp op.
801  rewriter.setInsertionPointAfter(warpOp);
802  gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
803  barrierOp->getResultTypes(),
804  barrierOp->getOperands(), barrierOp->getAttrs());
805  rewriter.eraseOp(barrierOp);
806  return success();
807  }
808 };
809 
810 } // namespace
811 
812 namespace {
813 struct XeGPUSubgroupDistributePass final
814  : public xegpu::impl::XeGPUSubgroupDistributeBase<
815  XeGPUSubgroupDistributePass> {
816  void runOnOperation() override;
817 };
818 } // namespace
819 
820 void xegpu::populateXeGPUSubgroupDistributePatterns(
821  RewritePatternSet &patterns) {
822  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
823  LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
824  UpdateNdOffsetDistribution, GpuBarrierDistribution>(
825  patterns.getContext());
826 }
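// A minimal usage sketch (hypothetical driver code; `ctx` and `moduleOp` are
// assumed to exist in the caller) showing how these patterns would typically
// be applied with the greedy driver used elsewhere in this file:
//
//   RewritePatternSet patterns(ctx);
//   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
//   if (failed(applyPatternsGreedily(moduleOp, std::move(patterns))))
//     /* handle failure */;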
827 
828 void XeGPUSubgroupDistributePass::runOnOperation() {
829  // Step 1: Attach layouts to op operands.
830  // TODO: The following assumptions are made:
831  // 1) It is assumed that there are no layout conflicts.
832  // 2) Any existing layout attributes attached to the operands are ignored.
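  // For example (illustrative), a vector operand consumed by an xegpu.dpas op
  // would get a layout such as #xegpu.layout<lane_layout = [1, 16],
  // lane_data = [1, 1]> attached as an operand attribute, which the
  // distribution patterns above then look up by name.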
833  Operation *op = getOperation();
834  op->walk([&](Operation *op) {
835  for (OpOperand &operand : op->getOpOperands()) {
836  // Layouts are needed for vector types only.
837  if (!isa<VectorType>(operand.get().getType()))
838  continue;
839 
840  auto layout =
841  xegpu::getDistributeLayoutAttrOfType<xegpu::LayoutAttr>(operand);
842  if (!layout) {
843  op->emitError("Could not find layout attribute for operand ")
844  << operand.getOperandNumber() << " of operation " << op->getName();
845  signalPassFailure();
846  return;
847  }
848  xegpu::setDistributeLayoutAttr(operand, layout);
849  }
850  });
851  // Step 2: Move all operations of a GPU function inside
852  // gpu.warp_execute_on_lane_0 operation.
853  {
854  RewritePatternSet patterns(&getContext());
855  patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
856 
857  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
858  signalPassFailure();
859  return;
860  }
861  // At this point, we have moved the entire function body inside the
862  // warpOp. Now move any scalar uniform code outside of the warpOp (like
863  // GPU index ops, scalar constants, etc.). This will simplify the
864  // later lowering and avoid custom patterns for these ops.
865  getOperation()->walk([&](Operation *op) {
866  if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
867  vector::moveScalarUniformCode(warpOp);
868  });
869  }
870  // Step 3: Apply subgroup to workitem distribution patterns.
871  RewritePatternSet patterns(&getContext());
872  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
873  // distributionFn is used by vector distribution patterns to determine the
874  // distributed vector type for a given vector value. In XeGPU subgroup
875  // distribution context, we compute this based on lane layout.
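  // For example (illustrative): a vector<8x16xf32> value whose layout has
  // lane_layout = [1, 16] is distributed only along its second dimension, so
  // the map produced below would be (d0, d1) -> (d1).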
876  auto distributionFn = [](Value val) {
877  VectorType vecType = dyn_cast<VectorType>(val.getType());
878  int64_t vecRank = vecType ? vecType.getRank() : 0;
879  if (vecRank == 0)
880  return AffineMap::get(val.getContext());
881  // Get the layout of the vector type.
882  // TODO: support more layout types
883  auto layout = xegpu::getDistributeLayoutAttrOfType<xegpu::LayoutAttr>(val);
884  // If no layout is specified, assume the innermost dimension is distributed
885  // for now.
886  if (!layout)
887  return AffineMap::getMultiDimMapWithTargets(
888  vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
889  SmallVector<unsigned int> distributedDims;
890  // Get the distributed dimensions based on the layout.
891  ArrayRef<int> laneLayout = layout.getLaneLayout().asArrayRef();
892  for (unsigned i = 0; i < laneLayout.size(); ++i) {
893  if (laneLayout[i] > 1)
894  distributedDims.push_back(i);
895  }
896  return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
897  val.getContext());
898  };
899  // TODO: shuffleFn is not used.
900  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
901  int64_t warpSz) { return Value(); };
902  vector::populatePropagateWarpVectorDistributionPatterns(
903  patterns, distributionFn, shuffleFn);
904  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
905  signalPassFailure();
906  return;
907  }
908 
909  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
910  // due to tensor descriptor type mismatches created by using upstream
911  // distribution patterns (scf.for).
912  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
913  // We are only interested in UnrealizedConversionCastOps that were added
914  // for resolving SIMT type mismatches.
915  if (!op->getAttr(resolveSIMTTypeMismatch))
916  return WalkResult::skip();
917 
918  Value input = op.getOperand(0);
919  Value output = op.getResult(0);
920 
921  // Both input and output must have tensor descriptor types.
922  xegpu::TensorDescType inputDescType =
923  mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
924  xegpu::TensorDescType outputDescType =
925  mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
926  assert(inputDescType && outputDescType &&
927  "Unrealized conversion cast must have tensor descriptor types");
928 
929  // tensor_desc<shape, layout> -> tensor_desc<shape> type of conversions.
930  // This occurs inside the scf.for body to resolve the block argument type
931  // to the SIMT type.
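  // For example (illustrative, following the comment notation used earlier in
  // this file):
  //   %0 = unrealized_conversion_cast %iter_arg :
  //     !xegpu.tensor_desc<8x16xf32, #layout> -> !xegpu.tensor_desc<8x16xf32>
  //     {resolve_simt_type_mismatch}
  // is folded away below by retyping the block argument itself.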
932  if (inputDescType.getLayout()) {
933  auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
934  if (argument) {
935  argument.setType(output.getType());
936  output.replaceAllUsesWith(argument);
937  if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
938  argument.getOwner()->getParentOp())) {
939  auto result = loopOp.getTiedLoopResult(argument);
940  result.setType(output.getType());
941  }
942  }
943  }
944 
945  // tensor_desc<shape> -> tensor_desc<shape, layout> type of
946  // conversions. This occurs at the yield op of the scf.for body to go back
947  // from the SIMT type to the original type.
948  if (outputDescType.getLayout())
949  output.replaceAllUsesWith(input);
950 
951  if (op->use_empty())
952  op->erase();
953  return WalkResult::advance();
954  });
955 }