MLIR 23.0.0git
XeGPUPropagateLayout.cpp
Go to the documentation of this file.
1//===- XeGPUPropagateLayout.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
21#include "mlir/IR/Attributes.h"
22#include "mlir/IR/Builders.h"
25#include "mlir/IR/Operation.h"
26#include "mlir/IR/Value.h"
27#include "mlir/IR/Visitors.h"
31#include "mlir/Support/LLVM.h"
32#include "llvm/ADT/ArrayRef.h"
33#include "llvm/ADT/STLExtras.h"
34#include "llvm/ADT/SmallSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/TypeSwitch.h"
37#include "llvm/Support/Casting.h"
38#include "llvm/Support/Debug.h"
39#include "llvm/Support/LogicalResult.h"
40#include "llvm/Support/raw_ostream.h"
41
42namespace mlir {
43namespace xegpu {
44#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
45#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
46} // namespace xegpu
47} // namespace mlir
48
49#define DEBUG_TYPE "xegpu-propagate-layout"
50#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
51
52using namespace mlir;
53using namespace mlir::dataflow;
54
55namespace {
56
57//===----------------------------------------------------------------------===//
58// LayoutInfo
59//===----------------------------------------------------------------------===//
60
61/// Helper class for tracking the analysis state of an mlir value. For layout
62/// propagation, the analysis state is simply the distribution layout of
63/// each value. The distribution layout information is encapsulated using
64/// xegpu::DistributeLayoutAttr class which can hold information about any type
65/// of distribution layout that XeGPU dialect supports. Purpose of this analysis
66/// to propagate some unique distribution layout for each value in the program
67/// starting from a set of anchor operations (like DPAS, StoreNd, etc.). Note
68/// that analysis will reach a fixed point when all values are reached some
69/// layout and, analysis does not try to modify any already assigned layouts.
70///
71/// Given this, LayoutInfo satisifies the following properties:
72/// 1) A LayoutInfo value can be in one of two states - `assigned` or `not
73/// assigned`.
74/// 2) Two LayoutInfo values are equal if they are both assigned or
75/// both not assigned. The concrete value of assigned state does not matter.
76/// 3) The meet operator works as follows:
77/// - If current state is assigned, return the current state. (already
78/// a unique layout is assigned. don't change it)
79/// - Otherwise, return the other state.
80
81struct LayoutInfo {
82private:
83 xegpu::DistributeLayoutAttr storage = nullptr;
84
85public:
86 LayoutInfo() = default;
87 LayoutInfo(const xegpu::DistributeLayoutAttr &layout) : storage(layout) {}
88
89 // Two lattice values are equal if they have `some` layout. The actual
90 // content of the layout does not matter.
91 bool operator==(const LayoutInfo &other) const {
92 return this->isAssigned() == other.isAssigned();
93 }
94
95 static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
96
97 static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
98
99 void print(raw_ostream &os) const;
100
101 bool isAssigned() const { return storage != nullptr; }
102
103 LayoutInfo transpose(ArrayRef<int64_t> permutation) const;
104
105 SmallVector<int> getLaneLayout() const;
106
107 SmallVector<int> getLaneData() const;
108
109 SmallVector<int> getInstData() const;
110
111 SmallVector<int> getSgLayout() const;
112
113 SmallVector<int> getSgData() const;
114
115 SmallVector<int> getOrder() const;
116
117 bool isSliceLayout() const {
118 if (!isAssigned())
119 return false;
120 return isa<xegpu::SliceAttr>(storage);
121 }
122
123 int64_t getRank() const {
124 if (!isAssigned())
125 return -1;
126 return storage.getRank();
127 }
128
129 Attribute get() { return storage; }
130 void set(const xegpu::DistributeLayoutAttr &layout) { storage = layout; }
131};
132
133SmallVector<int> LayoutInfo::getLaneLayout() const {
134 if (!isAssigned())
135 return {};
136 return llvm::map_to_vector(storage.getEffectiveLaneLayoutAsInt(),
137 [](int64_t val) { return static_cast<int>(val); });
138}
139
140SmallVector<int> LayoutInfo::getLaneData() const {
141 if (!isAssigned())
142 return {};
143 return llvm::map_to_vector(storage.getEffectiveLaneDataAsInt(),
144 [](int64_t val) { return static_cast<int>(val); });
145}
146
147SmallVector<int> LayoutInfo::getInstData() const {
148 if (!isAssigned())
149 return {};
150 return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
151 [](int64_t val) { return static_cast<int>(val); });
152}
153
154SmallVector<int> LayoutInfo::getSgLayout() const {
155 if (!isAssigned())
156 return {};
157 return llvm::map_to_vector(storage.getEffectiveSgLayoutAsInt(),
158 [](int64_t val) { return static_cast<int>(val); });
159}
160
161SmallVector<int> LayoutInfo::getSgData() const {
162 if (!isAssigned())
163 return {};
164 return llvm::map_to_vector(storage.getEffectiveSgDataAsInt(),
165 [](int64_t val) { return static_cast<int>(val); });
166}
167
168SmallVector<int> LayoutInfo::getOrder() const {
169 if (!isAssigned() || !storage.getOrder())
170 return {};
171 return llvm::map_to_vector(storage.getOrder().asArrayRef(),
172 [](int64_t val) { return static_cast<int>(val); });
173}
174
175void LayoutInfo::print(raw_ostream &os) const {
176 if (isAssigned()) {
177 os << storage;
178 } else {
179 os << "Not assigned.";
180 }
181}
182
183LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
184 if (!lhs.isAssigned())
185 return rhs;
186 return lhs;
187}
188
189/// Since this is a backward analysis, join method is not used.
190LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
191 llvm_unreachable("Join should not be triggered by layout propagation.");
192}
193
194/// Construct a new layout with the transposed inst_data or lane_layout,
195/// lane_data.
196LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
197 if (!isAssigned())
198 return {};
199 // Check if the permutation is valid.
200 llvm::SmallSet<int64_t, 4> seen(permutation.begin(), permutation.end());
201 bool hasDuplicates = seen.size() != permutation.size();
202 bool withinRange = llvm::all_of(permutation, [&](int64_t idx) {
203 return idx >= 0 && idx < static_cast<int64_t>(permutation.size());
204 });
205
206 if (!withinRange || hasDuplicates) {
207 assert(false && "Invalid permutation for transpose.");
208 return {};
209 }
210
211 SmallVector<int32_t> laneLayout;
212 SmallVector<int32_t> laneData;
213 SmallVector<int32_t> instData;
214 SmallVector<int32_t> sgLayout;
217
218 for (int64_t idx : permutation) {
219 if (getLaneLayout().size()) {
220 laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
221 laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
222 }
223 if (getInstData().size())
224 instData.push_back(static_cast<int32_t>(getInstData()[idx]));
225 if (getSgData().size()) {
226 sgLayout.push_back(static_cast<int32_t>(getSgLayout()[idx]));
227 sgData.push_back(static_cast<int32_t>(getSgData()[idx]));
228 }
229 if (getOrder().size()) {
230 order.push_back(static_cast<int32_t>(getOrder()[idx]));
231 }
232 }
233 auto orderAttr = order.size()
234 ? DenseI32ArrayAttr::get(storage.getContext(), order)
235 : nullptr;
236 xegpu::LayoutAttr layoutAttr;
237 if (getLaneLayout().size())
238 layoutAttr =
239 xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData);
240 if (getInstData().size())
241 layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), instData);
242 if (getSgData().size())
243 layoutAttr = xegpu::LayoutAttr::get(
244 storage.getContext(),
245 DenseI32ArrayAttr::get(storage.getContext(), sgLayout),
246 DenseI32ArrayAttr::get(storage.getContext(), sgData),
247 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
248 /*lane_data =*/nullptr, orderAttr);
249 return LayoutInfo(layoutAttr);
250}
251
252//===----------------------------------------------------------------------===//
253// LayoutInfoLattice
254//===----------------------------------------------------------------------===//
255
/// Lattice holding the LayoutInfo for each value. This is the lattice element
/// type used by the layout propagation analysis.
struct LayoutInfoLattice : public Lattice<LayoutInfo> {
  // Reuse the constructors of the base Lattice.
  using Lattice::Lattice;
};
261
262/// Helper Functions to get default layouts. A `default layout` is a layout that
263/// is assigned to a value when the layout is not fixed by some anchor operation
264/// (like DPAS).
265
266/// Helper Function to get the default layout for uniform values like constants.
267/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
268/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
269static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
270 unsigned rank,
271 const xegpu::uArch::uArch *uArch) {
272 assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
273 if (rank == 1) {
274 return LayoutInfo(
275 xegpu::LayoutAttr::get(ctx, {uArch->getSubgroupSize()}, {1}));
276 }
277 return LayoutInfo(
278 xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1}));
279}
280
281/// Helper to get the default layout for 2D block operations.
282template <typename Ty>
283static LayoutInfo getSIMTLayoutInfoBlockIO(Ty ty,
285 unsigned packingSize) {
286 // Expecting a 1D or 2D vector.
287 assert((ty.getRank() == 1 || ty.getRank() == 2) &&
288 "Expected 1D or 2D vector.");
289 // Expecting int or float element type.
290 assert(ty.getElementType().isIntOrFloat() &&
291 "Expected int or float element type.");
292 // If the rank is 1, then return default layout for 1D vector.
293 if (ty.getRank() == 1)
294 return getDefaultSIMTLayoutInfo(ty.getContext(), 1, uArch);
295 // Packing factor is determined by the element type bitwidth.
296 unsigned bitwidth = ty.getElementType().getIntOrFloatBitWidth();
297 int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
298 return LayoutInfo(xegpu::LayoutAttr::get(
299 ty.getContext(), {1, uArch->getSubgroupSize()}, {1, packingFactor}));
300}
301
302//===----------------------------------------------------------------------===//
303// LayoutInfoPropagation
304//===----------------------------------------------------------------------===//
305
/// Backward data flow analysis to propagate the layout (e.g. lane_layout and
/// lane_data) of each value in the program. Currently, the layouts for the
/// operands of DPAS, StoreNd, and StoreScatter are fixed (known before
/// propagation). The purpose of this analysis is to propagate those known
/// layouts to all their producers and (other) consumers.
// NOTE(review): several declarations below end in a trailing comma because
// their continuation lines are not visible in this listing; the code tokens
// are reproduced exactly as shown.
class LayoutInfoPropagation
    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
public:

private:
  // Which kind of layout parameters this analysis instance works with; it is
  // compared against InstData, Lane and Subgroup in hasParamsOfLayoutKind().
  xegpu::LayoutKind layoutKind;
  // Stored from the constructor argument of the same name.
  // NOTE(review): no use of this member is visible in this part of the file.
  unsigned indexBitWidth;
  // Per-operation visitors. Each one is dispatched from visitOperation().
  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,

  void visitDpasMxOp(xegpu::DpasMxOp dpasMx,

  void visitStoreNdOp(xegpu::StoreNdOp store,

  void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,

  void visitLoadNdOp(xegpu::LoadNdOp load,

  void visitLoadGatherOp(xegpu::LoadGatherOp load,

  void visitTransposeOp(vector::TransposeOp transpose,

  void visitVectorBitcastOp(vector::BitCastOp bitcast,

  void visitVectorInterleaveOp(vector::InterleaveOp interleave,

  void visitVectorDeinterleaveOp(vector::DeinterleaveOp deinterleave,

  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,

  void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,

  void visitVectorReductionOp(vector::ReductionOp reduction,

  void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
  void visitShapeCastOp(vector::ShapeCastOp shapeCast,
  void
  visitInsertStridedSliceOp(vector::InsertStridedSliceOp insertStridedSlice,

  void visitLoadMatrixOp(xegpu::LoadMatrixOp load,

  void visitStoreMatrixOp(xegpu::StoreMatrixOp store,

  void visitLoadGatherOp(xegpu::LoadMatrixOp load,

  void visitStoreScatterOp(xegpu::StoreMatrixOp store,

  void visitConvertLayoutOp(xegpu::ConvertLayoutOp convertLayout,

  // Returns true if `anchorLayout` is non-null and carries the parameters that
  // correspond to `layoutKind`.
  bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout);

public:
  // Forwards `solver` and `symbolTable` to the base analysis and records the
  // layout kind and index bit width.
  LayoutInfoPropagation(DataFlowSolver &solver,
                        SymbolTableCollection &symbolTable,
                        xegpu::LayoutKind layoutKind, unsigned indexBitWidth)
      : SparseBackwardDataFlowAnalysis(solver, symbolTable),
        layoutKind(layoutKind), indexBitWidth(indexBitWidth) {}

  // Transfer function: dispatches on the operation type to one of the
  // visit*Op helpers above.
  LogicalResult
  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
                 ArrayRef<const LayoutInfoLattice *> results) override;

  // The following hooks are intentionally empty: this analysis does nothing
  // for branch operands, call operands, non-control-flow arguments, or
  // external calls.
  void visitBranchOperand(OpOperand &operand) override {};

  void visitCallOperand(OpOperand &operand) override {};

  void
  visitNonControlFlowArguments(RegionSuccessor &successor,
                               ArrayRef<BlockArgument> arguments) override {};

  void visitExternalCall(CallOpInterface call,
                         ArrayRef<const LayoutInfoLattice *> results) override {
  };

  // Meets the lattice with a default-constructed (unassigned) LayoutInfo and
  // discards the returned change result.
  void setToExitState(LayoutInfoLattice *lattice) override {
    (void)lattice->meet(LayoutInfo());
  }
};
432} // namespace
433
434LogicalResult LayoutInfoPropagation::visitOperation(
435 Operation *op, ArrayRef<LayoutInfoLattice *> operands,
436 ArrayRef<const LayoutInfoLattice *> results) {
438 .Case(
439 [&](xegpu::DpasOp dpasOp) { visitDpasOp(dpasOp, operands, results); })
440 .Case([&](xegpu::DpasMxOp dpasMxOp) {
441 visitDpasMxOp(dpasMxOp, operands, results);
442 })
443 .Case([&](xegpu::StoreNdOp storeNdOp) {
444 visitStoreNdOp(storeNdOp, operands, results);
445 })
446 .Case([&](xegpu::StoreScatterOp storeScatterOp) {
447 visitStoreScatterOp(storeScatterOp, operands, results);
448 })
449 .Case([&](xegpu::LoadNdOp loadNdOp) {
450 visitLoadNdOp(loadNdOp, operands, results);
451 })
452 .Case([&](xegpu::LoadGatherOp loadGatherOp) {
453 visitLoadGatherOp(loadGatherOp, operands, results);
454 })
455 .Case([&](xegpu::PrefetchNdOp prefetchNdOp) {
456 visitPrefetchNdOp(prefetchNdOp, operands, results);
457 })
458 .Case([&](vector::TransposeOp transposeOp) {
459 visitTransposeOp(transposeOp, operands, results);
460 })
461 .Case([&](vector::BitCastOp bitcastOp) {
462 visitVectorBitcastOp(bitcastOp, operands, results);
463 })
464 .Case([&](vector::InterleaveOp interleaveOp) {
465 visitVectorInterleaveOp(interleaveOp, operands, results);
466 })
467 .Case([&](vector::DeinterleaveOp deinterleaveOp) {
468 visitVectorDeinterleaveOp(deinterleaveOp, operands, results);
469 })
470 .Case([&](vector::MultiDimReductionOp reductionOp) {
471 visitVectorMultiReductionOp(reductionOp, operands, results);
472 })
473 .Case([&](vector::ReductionOp reductionOp) {
474 visitVectorReductionOp(reductionOp, operands, results);
475 })
476 .Case([&](vector::BroadcastOp broadcastOp) {
477 visitVectorBroadCastOp(broadcastOp, operands, results);
478 })
479 .Case([&](vector::ShapeCastOp shapeCastOp) {
480 visitShapeCastOp(shapeCastOp, operands, results);
481 })
482 .Case([&](vector::InsertStridedSliceOp insertStridedSliceOp) {
483 visitInsertStridedSliceOp(insertStridedSliceOp, operands, results);
484 })
485 .Case([&](xegpu::LoadMatrixOp loadMatrixOp) {
486 visitLoadMatrixOp(loadMatrixOp, operands, results);
487 })
488 .Case([&](xegpu::StoreMatrixOp storeMatrixOp) {
489 visitStoreMatrixOp(storeMatrixOp, operands, results);
490 })
491 .Case([&](xegpu::ConvertLayoutOp convertLayoutOp) {
492 visitConvertLayoutOp(convertLayoutOp, operands, results);
493 })
494 // All other ops.
495 .Default([&](Operation *op) {
496 for (const LayoutInfoLattice *resultInfo : results) {
497 if (!resultInfo->getValue().isAssigned())
498 continue;
499 for (auto [operandInfo, operand] :
500 llvm::zip(operands, op->getOpOperands())) {
501 // If the operand type is not a vector or tensor descriptor, skip
502 // it.
503 if (!isa<xegpu::TensorDescType, VectorType>(
504 operand.get().getType()))
505 continue;
506 // Propagate the result layout to the operand.
507 meet(operandInfo, *resultInfo);
508 }
509 }
510 });
511
512 return success();
513}
514
515bool LayoutInfoPropagation::hasParamsOfLayoutKind(
516 xegpu::DistributeLayoutAttr anchorLayout) {
517 if (anchorLayout == nullptr) {
518 return false;
519 }
520 if (layoutKind == xegpu::LayoutKind::InstData) {
521 return !(anchorLayout.getEffectiveInstDataAsInt().empty());
522 }
523 if (layoutKind == xegpu::LayoutKind::Lane) {
524 return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() ||
525 anchorLayout.getEffectiveLaneDataAsInt().empty());
526 }
527 if (layoutKind == xegpu::LayoutKind::Subgroup) {
528 return !(anchorLayout.getEffectiveSgLayoutAsInt().empty() ||
529 anchorLayout.getEffectiveSgDataAsInt().empty());
530 }
531 return false;
532}
533
534// This function returns all layouts for the given sgCount, whose sgData:
535// 1. Evenly divides the wgShape.
536// 2. Is a multiple of instData.
537// Example:
538// wgShape = [128, 64], instData = [8, 16], sgCount = 32
539// Returns layouts:
540// [(8,4), (16,2)], which correspond to sgData [16,16] and [8,32].
542 ArrayRef<int> instData,
543 int64_t sgCount) {
545 for (int sgLayout0 = 1; sgLayout0 <= sgCount; ++sgLayout0) {
546 if (sgCount % sgLayout0)
547 continue;
548 int sgLayout1 = sgCount / sgLayout0;
549 int sgData0 = wgShape[0] / sgLayout0;
550 int sgData1 = wgShape[1] / sgLayout1;
551 if ((wgShape[0] % sgLayout0 || wgShape[1] % sgLayout1) ||
552 (sgData0 % instData[0] || sgData1 % instData[1]))
553 continue;
554 candidates.emplace_back(sgLayout0, sgLayout1);
555 }
556 // Sort primarily by how balanced they are
557 // (i.e., minimize the absolute difference between the two dimensions), and
558 // secondarily by the first dimension in ascending order.
559 llvm::sort(candidates, [](const std::pair<int, int> &lhs,
560 const std::pair<int, int> &rhs) {
561 int diffLhs = std::abs(lhs.first - lhs.second);
562 int diffRhs = std::abs(rhs.first - rhs.second);
563 if (diffLhs != diffRhs)
564 return diffLhs < diffRhs;
565 return lhs.first < rhs.first;
566 });
567 return candidates;
568}
569
570FailureOr<int64_t> getNumSg(Operation *op, const int sgSize) {
571 // Oblivious to workitem layout, the total count matters.
572 auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>();
573 if (!gpuFunc)
574 return failure();
575 auto knownBlockSize = gpuFunc.getKnownBlockSize();
576 if (!knownBlockSize.has_value())
577 return failure();
578 const int flatBlockSize = llvm::product_of(knownBlockSize.value());
579 return flatBlockSize / sgSize;
580}
581
582void LayoutInfoPropagation::visitPrefetchNdOp(
583 xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
584 ArrayRef<const LayoutInfoLattice *> results) {
585
586 LayoutInfo prefetchLayout;
587 xegpu::DistributeLayoutAttr anchorLayout = prefetch.getLayoutAttr();
588 if (hasParamsOfLayoutKind(anchorLayout)) {
589 prefetchLayout = LayoutInfo(anchorLayout);
590 } else {
591 // Here we assign the default layout to the tensor descriptor operand of
592 // prefetch.
593 auto tdescTy = prefetch.getTensorDescType();
594
595 const uArch *uArch = getUArch(getChipStr(prefetch).value_or(""));
596 if (!uArch)
597 return;
598 const auto *uArchInstruction =
599 dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
600 uArch->getInstruction(
601 xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
602
603 auto blockWHC =
604 uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
605 if (!blockWHC)
606 prefetch.emitWarning("No known block params found for the element type.");
607 auto [bWidth, bHeight, bCount] = blockWHC.value();
608 SmallVector<int> instData;
609 int instWidth = xegpu::getLargestDivisor(
610 static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
611 if (instWidth == -1)
612 prefetch.emitWarning(
613 "No suitable instruction multiple found for the given shape.");
614 if (tdescTy.getRank() == 1)
615 instData = {instWidth};
616 else {
617 int instHeight = xegpu::getLargestDivisor(
618 static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
619 if (instHeight == -1)
620 prefetch.emitWarning(
621 "No suitable instruction multiple found for the given shape.");
622 instData = {instHeight, instWidth};
623 }
624
625 if (layoutKind == xegpu::LayoutKind::InstData)
626 prefetchLayout =
627 LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
628 else
629 prefetchLayout = getSIMTLayoutInfoBlockIO(
630 tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
631
632 prefetch.setLayoutAttr(
633 dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get()));
634 }
635 // Propagate the layout to the source tensor descriptor.
636 propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
637}
638
639void LayoutInfoPropagation::visitVectorMultiReductionOp(
640 vector::MultiDimReductionOp reduction,
641 ArrayRef<LayoutInfoLattice *> operands,
642 ArrayRef<const LayoutInfoLattice *> results) {
643 Type resultTy = reduction.getDestType();
644 // The layout of the result must be present.
645 LayoutInfo resLayoutInfo = results[0]->getValue();
646
647 xegpu::DistributeLayoutAttr consumerLayoutAttr;
648 if (!resultTy.isIntOrFloat()) {
649 if (!resLayoutInfo.isAssigned())
650 return;
651 consumerLayoutAttr =
652 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
653 }
654
655 VectorType sourceTy = reduction.getSourceVectorType();
656 SmallVector<int64_t> reductionDims(reduction.getReductionDims());
657
658 const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
659 if (!uArch)
660 return;
661 int numSg = 0;
662 if (layoutKind == xegpu::LayoutKind::Subgroup) {
663 auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
664 if (succeeded(numSgOrErr))
665 numSg = numSgOrErr.value();
666 }
667
668 // The result layout represents the layout requirements of the operation.
669 // it is recorded to anchor layout or temporary layout.
670 // it must be honored for current op and may conflict with the layout
671 // propagated from consumer op, the conflict is resolved in later phase by
672 // converting the required result layout to the consumer layout
673 auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
674 layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);
675
676 xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
677
678 // derive the source layout from the dominant layout and reduction dims
679 auto srcLayoutAttr = xegpu::inferMultiReductionSourceLayout(
680 requiredResLayoutAttr, reductionDims);
681
682 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
683 // Accumulator should have the same layout as the result.
684 propagateIfChanged(operands[1],
685 operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
686}
687
688void LayoutInfoPropagation::visitVectorReductionOp(
689 vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
690 ArrayRef<const LayoutInfoLattice *> results) {
691
692 VectorType sourceTy = reduction.getSourceVectorType();
693 const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
694 if (!uArch)
695 return;
696
697 auto requiredResLayoutAttr =
698 xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
699 xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
700
701 auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
702 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
703 if (reduction.getAcc())
704 propagateIfChanged(operands[1],
705 operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
706}
707
708void LayoutInfoPropagation::visitVectorBroadCastOp(
709 vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
710 ArrayRef<const LayoutInfoLattice *> results) {
711 // The layout of the result must be present.
712 LayoutInfo resLayoutInfo = results[0]->getValue();
713 if (!resLayoutInfo.isAssigned())
714 return;
715
716 // Only consider vector to vector broadcasts for now.
717 VectorType resultTy = broadcast.getResultVectorType();
718 VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
719 // skip layout propagation for non-vector source operand.
720 if (!sourceTy)
721 return;
722
723 auto srcShape = sourceTy.getShape();
724 auto resShape = resultTy.getShape();
725
726 auto resultLayoutAttr =
727 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
728
729 xegpu::DistributeLayoutAttr srcLayoutAttr =
730 xegpu::inferBroadcastSourceLayout(resultLayoutAttr, resShape, srcShape);
731
732 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
733}
734
735void LayoutInfoPropagation::visitShapeCastOp(
736 vector::ShapeCastOp shapeCast, ArrayRef<LayoutInfoLattice *> operands,
737 ArrayRef<const LayoutInfoLattice *> results) {
738 // The layout of the result must be present.
739 LayoutInfo resLayoutInfo = results[0]->getValue();
740 if (!resLayoutInfo.isAssigned())
741 return;
742 ArrayRef<int64_t> resShape = shapeCast.getResultVectorType().getShape();
743 ArrayRef<int64_t> srcShape = shapeCast.getSourceVectorType().getShape();
744 auto resultLayoutAttr =
745 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
746
747 xegpu::DistributeLayoutAttr srcLayoutAttr =
748 xegpu::inferShapeCastSourceLayout(resultLayoutAttr, resShape, srcShape);
749
750 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
751}
752
753/// Set the layouts for DPAS A, B, and C operands.
754void LayoutInfoPropagation::visitDpasOp(
755 xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
756 ArrayRef<const LayoutInfoLattice *> results) {
757 LayoutInfo dpasALayout;
758 LayoutInfo dpasBLayout;
759 LayoutInfo dpasCDLayout;
760
761 xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getLayoutCdAttr();
762 if (hasParamsOfLayoutKind(anchorLayoutCD)) {
763 xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getLayoutAAttr();
764 xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getLayoutBAttr();
765 assert(hasParamsOfLayoutKind(anchorLayoutA) &&
766 "Expected anchor layout for DPAS A operand.");
767 assert(hasParamsOfLayoutKind(anchorLayoutB) &&
768 "Expected anchor layout for DPAS B operand.");
769 dpasALayout = LayoutInfo(anchorLayoutA);
770 dpasBLayout = LayoutInfo(anchorLayoutB);
771 dpasCDLayout = LayoutInfo(anchorLayoutCD);
772 } else {
773 const uArch *uArch = getUArch(getChipStr(dpas).value_or(""));
774 if (!uArch)
775 return;
776 VectorType aTy = dpas.getLhsType();
777 VectorType bTy = dpas.getRhsType();
778 VectorType cdTy = dpas.getResultType();
779
780 xegpu::DistributeLayoutAttr consumerLayoutAttr = nullptr;
781 xegpu::DistributeLayoutAttr requiredCDLayoutAttr, requiredALayout,
782 requiredBLayout;
783
784 int numSg = 0;
785 if (layoutKind == xegpu::LayoutKind::Subgroup) {
786 LayoutInfo consumerLayout = results[0]->getValue();
787 if (!consumerLayout.isAssigned())
788 return;
789 consumerLayoutAttr =
790 dyn_cast<xegpu::DistributeLayoutAttr>(consumerLayout.get());
791 auto numSgOrErr = getNumSg(dpas, uArch->getSubgroupSize());
792 if (failed(numSgOrErr)) {
793 dpas.emitWarning(
794 "Unable to determine the number of subgroups for the operation.");
795 return;
796 }
797 numSg = numSgOrErr.value();
798 }
799 auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
800 consumerLayoutAttr, numSg, uArch);
801 if (!layouts.has_value()) {
802 dpas.emitWarning(
803 "Failed to determine required layouts for DPAS operands.");
804 return;
805 }
806
807 std::tie(requiredALayout, requiredBLayout, requiredCDLayoutAttr) = *layouts;
808
809 dpas.setLayoutAAttr(requiredALayout);
810 dpas.setLayoutBAttr(requiredBLayout);
811 dpas.setLayoutCdAttr(requiredCDLayoutAttr);
812 dpasALayout = LayoutInfo(requiredALayout);
813 dpasBLayout = LayoutInfo(requiredBLayout);
814 dpasCDLayout = LayoutInfo(requiredCDLayoutAttr);
815 }
816 propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
817 propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
818 if (operands.size() > 2)
819 propagateIfChanged(operands[2], operands[2]->meet(dpasCDLayout));
820}
821
/// Propagate layout for DpasMxOp operands using the layout attributes.
/// DpasMxOp has operands: a, b, acc (optional), scale_a (optional), scale_b
/// (optional).
///
/// If A, B and C/D anchor layouts of the current layout kind are all present
/// on the op, they (and any scale anchor layouts) are used directly.
/// Otherwise the layouts are computed via xegpu::setupDpasMxLayout, recorded
/// on the op, and then propagated to the operands.
void LayoutInfoPropagation::visitDpasMxOp(
    xegpu::DpasMxOp dpasMx, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {

  // Initialize layout variables
  LayoutInfo dpasMxALayout, dpasMxBLayout, dpasMxCDLayout;
  LayoutInfo dpasMxAScaleLayout, dpasMxBScaleLayout;

  // Get existing layout attributes from the operation
  xegpu::DistributeLayoutAttr anchorLayoutA = dpasMx.getLayoutAAttr();
  xegpu::DistributeLayoutAttr anchorLayoutB = dpasMx.getLayoutBAttr();
  xegpu::DistributeLayoutAttr anchorLayoutCD = dpasMx.getLayoutCdAttr();

  // Check if all layouts are already set
  if (anchorLayoutA && anchorLayoutB && anchorLayoutCD &&
      hasParamsOfLayoutKind(anchorLayoutA) &&
      hasParamsOfLayoutKind(anchorLayoutB) &&
      hasParamsOfLayoutKind(anchorLayoutCD)) {
    dpasMxALayout = LayoutInfo(anchorLayoutA);
    dpasMxBLayout = LayoutInfo(anchorLayoutB);
    dpasMxCDLayout = LayoutInfo(anchorLayoutCD);

    // Get scale layouts if available
    xegpu::DistributeLayoutAttr anchorLayoutAScale =
        dpasMx.getLayoutAScaleAttr();
    xegpu::DistributeLayoutAttr anchorLayoutBScale =
        dpasMx.getLayoutBScaleAttr();
    if (anchorLayoutAScale)
      dpasMxAScaleLayout = LayoutInfo(anchorLayoutAScale);
    if (anchorLayoutBScale)
      dpasMxBScaleLayout = LayoutInfo(anchorLayoutBScale);
  } else {
    // Need to compute layouts
    const uArch *uArch = getUArch(getChipStr(dpasMx).value_or(""));
    if (!uArch)
      return;

    VectorType aTy = dpasMx.getAType();
    VectorType bTy = dpasMx.getBType();
    VectorType cdTy = dpasMx.getResultType();

    // Get scale types if present; they stay null when the scale operand is
    // absent or is not a vector.
    VectorType aScaleTy;
    VectorType bScaleTy;
    Value scaleA = dpasMx.getScaleA();
    Value scaleB = dpasMx.getScaleB();
    if (scaleA)
      aScaleTy = dyn_cast<VectorType>(scaleA.getType());
    if (scaleB)
      bScaleTy = dyn_cast<VectorType>(scaleB.getType());

    xegpu::DistributeLayoutAttr consumerLayoutAttr = nullptr;
    xegpu::DistributeLayoutAttr requiredCDLayoutAttr, requiredALayout,
        requiredBLayout, requiredAScaleLayout, requiredBScaleLayout;

    // Subgroup-level layouts additionally need the consumer's layout of the
    // result and the number of subgroups; bail out if either is unavailable.
    int numSg = 0;
    if (layoutKind == xegpu::LayoutKind::Subgroup) {
      LayoutInfo consumerLayout = results[0]->getValue();
      if (!consumerLayout.isAssigned())
        return;
      consumerLayoutAttr =
          dyn_cast<xegpu::DistributeLayoutAttr>(consumerLayout.get());
      auto numSgOrErr = getNumSg(dpasMx, uArch->getSubgroupSize());
      if (failed(numSgOrErr)) {
        dpasMx.emitWarning(
            "Unable to determine the number of subgroups for the operation.");
        return;
      }
      numSg = numSgOrErr.value();
    }

    auto layouts =
        xegpu::setupDpasMxLayout(layoutKind, aTy, bTy, cdTy, aScaleTy, bScaleTy,
                                 consumerLayoutAttr, numSg, uArch);
    if (!layouts.has_value()) {
      dpasMx.emitWarning(
          "Failed to determine required layouts for DPAS_MX operands.");
      return;
    }

    std::tie(requiredALayout, requiredBLayout, requiredCDLayoutAttr,
             requiredAScaleLayout, requiredBScaleLayout) = *layouts;

    // Record the computed layouts on the op. Scale layouts are only set when
    // the helper produced one.
    dpasMx.setLayoutAAttr(requiredALayout);
    dpasMx.setLayoutBAttr(requiredBLayout);
    dpasMx.setLayoutCdAttr(requiredCDLayoutAttr);
    if (requiredAScaleLayout)
      dpasMx.setLayoutAScaleAttr(requiredAScaleLayout);
    if (requiredBScaleLayout)
      dpasMx.setLayoutBScaleAttr(requiredBScaleLayout);

    dpasMxALayout = LayoutInfo(requiredALayout);
    dpasMxBLayout = LayoutInfo(requiredBLayout);
    dpasMxCDLayout = LayoutInfo(requiredCDLayoutAttr);
    if (requiredAScaleLayout)
      dpasMxAScaleLayout = LayoutInfo(requiredAScaleLayout);
    if (requiredBScaleLayout)
      dpasMxBScaleLayout = LayoutInfo(requiredBScaleLayout);
  }

  // Propagate layouts to operands. Because acc, scale_a, scale_b are all
  // optional (AttrSizedOperandSegments), the index of each present operand in
  // `operands` depends on which optionals are actually supplied. Use the
  // op's accessors to determine the correct positional index.
  propagateIfChanged(operands[0], operands[0]->meet(dpasMxALayout));
  propagateIfChanged(operands[1], operands[1]->meet(dpasMxBLayout));
  // `idx` tracks the position of the next optional operand that is present.
  unsigned idx = 2;
  if (dpasMx.getAcc()) {
    propagateIfChanged(operands[idx], operands[idx]->meet(dpasMxCDLayout));
    ++idx;
  }
  if (dpasMx.getScaleA()) {
    // The slot is consumed even if no scale layout is available for it.
    if (dpasMxAScaleLayout.isAssigned())
      propagateIfChanged(operands[idx],
                         operands[idx]->meet(dpasMxAScaleLayout));
    ++idx;
  }
  if (dpasMx.getScaleB()) {
    if (dpasMxBScaleLayout.isAssigned())
      propagateIfChanged(operands[idx],
                         operands[idx]->meet(dpasMxBScaleLayout));
    ++idx;
  }
}
949
950/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
951void LayoutInfoPropagation::visitStoreNdOp(
952 xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
953 ArrayRef<const LayoutInfoLattice *> results) {
954 LayoutInfo storeLayout;
955 xegpu::DistributeLayoutAttr anchorLayout = store.getLayoutAttr();
956 if (hasParamsOfLayoutKind(anchorLayout)) {
957 storeLayout = LayoutInfo(anchorLayout);
958 } else {
959 const uArch *uArch = getUArch(getChipStr(store).value_or(""));
960 if (!uArch)
961 return;
962 const auto *uArchInstruction =
963 dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
964 uArch->getInstruction(
965 xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
966 VectorType dataTy = store.getValueType();
967 auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
968 store.getValueType().getElementType());
969 if (!blockWHC)
970 store.emitWarning("No known block params found for the element type.");
971 auto [bWidth, bHeight, bCount] = blockWHC.value();
972 SmallVector<int> instData;
973 int instWidth = xegpu::getLargestDivisor(
974 static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
975 if (instWidth == -1)
976 store.emitWarning(
977 "No suitable instruction multiple found for the given shape.");
978 if (dataTy.getRank() == 1)
979 instData = {instWidth};
980 else {
981 int instHeight = xegpu::getLargestDivisor(
982 static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
983 if (instHeight == -1)
984 store.emitWarning(
985 "No suitable instruction multiple found for the given shape.");
986 instData = {instHeight, instWidth};
987 }
988
989 if (layoutKind == xegpu::LayoutKind::InstData)
990 storeLayout =
991 LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
992 else if (layoutKind == xegpu::LayoutKind::Lane)
993 storeLayout =
994 getSIMTLayoutInfoBlockIO(store.getValueType(), uArch,
995 uArchInstruction->getPackedFormatBitSize());
996 else { // xegpu::LayoutKind::Subgroup
997 auto sgSize = uArch->getSubgroupSize();
998 auto numSgOrErr = getNumSg(store, sgSize);
999 if (failed(numSgOrErr)) {
1000 store.emitWarning(
1001 "Unable to determine the number of subgroups for the operation.");
1002 return;
1003 }
1004 auto sgLayouts = getValidLayouts(store.getValueType().getShape(),
1005 instData, numSgOrErr.value());
1006 if (sgLayouts.empty()) {
1007 store.emitWarning(
1008 "Unable to determine suitable subgroup layout for store value.");
1009 return;
1010 }
1011 SmallVector<int> sgLayout = {sgLayouts[0].first, sgLayouts[0].second};
1012 SmallVector<int> sgData = {
1013 static_cast<int>(dataTy.getShape()[0]) / sgLayout[0],
1014 static_cast<int>(dataTy.getShape()[1]) / sgLayout[1]};
1015 storeLayout = LayoutInfo(xegpu::LayoutAttr::get(
1016 dataTy.getContext(),
1017 DenseI32ArrayAttr::get(dataTy.getContext(), sgLayout),
1018 DenseI32ArrayAttr::get(dataTy.getContext(), sgData),
1019 /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
1020 /*lane_data =*/nullptr, /*order =*/nullptr));
1021 }
1022 store.setLayoutAttr(
1023 dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
1024 }
1025 // Propagate the layout to the value operand.
1026 // Both operands should have the same layout
1027 for (LayoutInfoLattice *operand : operands)
1028 propagateIfChanged(operand, operand->meet(storeLayout));
1029}
1030
1031/// Propagate the layout of the value to the tensor descriptor operand in
1032/// LoadNdOp.
1033void LayoutInfoPropagation::visitLoadNdOp(
1034 xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
1035 ArrayRef<const LayoutInfoLattice *> results) {
1036 LayoutInfo loadLayout;
1037 xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
1038 if (hasParamsOfLayoutKind(anchorLayout)) {
1039 loadLayout = LayoutInfo(anchorLayout);
1040 } else {
1041
1042 LayoutInfo valueLayout = results[0]->getValue();
1043 // Need the layout of the value to propagate to the tensor descriptor.
1044 if (!valueLayout.isAssigned())
1045 return;
1046 loadLayout = valueLayout;
1047 // LoadNdOp has the transpose effect. However, at the stage of this analysis
1048 // this effect is not expected and should be abstracted away. Emit a
1049 // warning.
1050 if (auto transpose = load.getTranspose()) {
1051 load.emitWarning("Transpose effect is not expected for LoadNdOp at "
1052 "LayoutInfoPropagation stage.");
1053 loadLayout = valueLayout.transpose(transpose.value());
1054 }
1055 load.setLayoutAttr(dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
1056 }
1057 // Propagate the new layout to the tensor descriptor operand.
1058 propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
1059}
1060
1061/// Propagate the layout of the value to the tensor descriptor operand in
1062/// ConvertLayoutOp.
1063void LayoutInfoPropagation::visitConvertLayoutOp(
1064 xegpu::ConvertLayoutOp convert, ArrayRef<LayoutInfoLattice *> operands,
1065 ArrayRef<const LayoutInfoLattice *> results) {
1066 xegpu::DistributeLayoutAttr anchorLayout = convert.getInputLayoutAttr();
1067 LayoutInfo convertLayout(anchorLayout);
1068 // Propagate the new layout to the tensor descriptor operand.
1069 propagateIfChanged(operands[0], operands[0]->meet(convertLayout));
1070}
1071
1072/// For vector::TransposeOp, the layout of the result is transposed and
1073/// propagated to the operand.
1074void LayoutInfoPropagation::visitTransposeOp(
1075 vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
1076 ArrayRef<const LayoutInfoLattice *> results) {
1077 // Need the layout of transpose result to propagate to the operands.
1078 LayoutInfo resultLayout = results[0]->getValue();
1079 if (!resultLayout.isAssigned())
1080 return;
1081
1082 auto consumerLayoutAttr =
1083 dyn_cast<xegpu::DistributeLayoutAttr>(resultLayout.get());
1084 auto srcLayoutAttr = xegpu::inferTransposeSourceLayout(
1085 consumerLayoutAttr, transpose.getPermutation());
1086
1087 // Propagate the new layout to the vector operand.
1088 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
1089}
1090
1091/// For vector::BitCastOp, the lane_data of the source layout is changed based
1092/// on the bit width of the source and result types.
1093void LayoutInfoPropagation::visitVectorBitcastOp(
1094 vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
1095 ArrayRef<const LayoutInfoLattice *> results) {
1096 // Need the layout of bitcast result to propagate to the operands.
1097 LayoutInfo resLayoutInfo = results[0]->getValue();
1098 if (!resLayoutInfo.isAssigned())
1099 return;
1100
1101 auto srcVecType = bitcast.getSourceVectorType();
1102 auto resVecType = bitcast.getResultVectorType();
1103
1104 auto consumerLayoutAttr =
1105 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1106 const uArch *uArch = getUArch(xegpu::getChipStr(bitcast).value_or(""));
1107 if (!uArch)
1108 return;
1109 auto requiredResLayoutAttr = setupBitCastResultLayout(
1110 layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
1111
1112 xegpu::setTemporaryLayout(bitcast->getResult(0), requiredResLayoutAttr);
1113
1114 int inElemTyBitWidth = srcVecType.getElementType().getIntOrFloatBitWidth();
1115 int outElemTyBitWidth = resVecType.getElementType().getIntOrFloatBitWidth();
1116
1117 // derive the source layout from the dominant layout and reduction dims
1118 auto srcLayoutAttr = xegpu::inferBitCastSourceLayout(
1119 requiredResLayoutAttr, outElemTyBitWidth, inElemTyBitWidth);
1120
1121 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
1122}
1123
1124/// For vector::InterleaveOp, the result has double the innermost dimension size
1125/// compared to each source operand. The layout is propagated from result to
1126/// sources, adjusting for the 2x size increase.
1127void LayoutInfoPropagation::visitVectorInterleaveOp(
1128 vector::InterleaveOp interleave, ArrayRef<LayoutInfoLattice *> operands,
1129 ArrayRef<const LayoutInfoLattice *> results) {
1130 // Need the layout of interleave result to propagate to the operands.
1131 LayoutInfo resLayoutInfo = results[0]->getValue();
1132 if (!resLayoutInfo.isAssigned())
1133 return;
1134
1135 auto srcVecType = interleave.getSourceVectorType();
1136 auto resVecType = interleave.getResultVectorType();
1137
1138 auto consumerLayoutAttr =
1139 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1140 const uArch *uArch = getUArch(xegpu::getChipStr(interleave).value_or(""));
1141 if (!uArch)
1142 return;
1143
1144 // Setup the result layout to ensure the source layout can be safely derived
1145 auto requiredResLayoutAttr = setupInterleaveResultLayout(
1146 layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
1147
1148 xegpu::setTemporaryLayout(interleave->getResult(0), requiredResLayoutAttr);
1149
1150 // Derive the source layout from the result layout (halve the innermost dim)
1151 auto srcLayoutAttr =
1152 xegpu::inferInterleaveSourceLayout(requiredResLayoutAttr);
1153
1154 // Both operands (lhs and rhs) get the same source layout
1155 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
1156 propagateIfChanged(operands[1], operands[1]->meet(LayoutInfo(srcLayoutAttr)));
1157}
1158
1159/// For vector::DeinterleaveOp, the source has double the innermost dimension
1160/// size compared to each result. The layout is propagated from results to
1161/// source, adjusting for the 2x size decrease in results.
1162void LayoutInfoPropagation::visitVectorDeinterleaveOp(
1163 vector::DeinterleaveOp deinterleave, ArrayRef<LayoutInfoLattice *> operands,
1164 ArrayRef<const LayoutInfoLattice *> results) {
1165 // Need the layout of deinterleave results to propagate to the operand.
1166 // Use the first result's layout (both results should have the same layout)
1167 LayoutInfo resLayoutInfo = results[0]->getValue();
1168 if (!resLayoutInfo.isAssigned())
1169 return;
1170
1171 auto consumerLayoutAttr =
1172 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1173
1174 // Derive the source layout from the result layout (double the innermost dim)
1175 // No setup function needed - just infer directly
1176 auto srcLayoutAttr = xegpu::inferDeinterleaveSourceLayout(consumerLayoutAttr);
1177
1178 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
1179}
1180
1181void LayoutInfoPropagation::visitInsertStridedSliceOp(
1182 vector::InsertStridedSliceOp insertStridedSlice,
1183 ArrayRef<LayoutInfoLattice *> operands,
1184 ArrayRef<const LayoutInfoLattice *> results) {
1185 // The layout of the result must be present.
1186 LayoutInfo resLayoutInfo = results[0]->getValue();
1187 if (!resLayoutInfo.isAssigned())
1188 return;
1189
1190 auto srcVecType = insertStridedSlice.getSourceVectorType();
1191 auto resVecType = insertStridedSlice.getDestVectorType();
1192
1193 auto consumerLayoutAttr =
1194 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1195 const uArch *uArch =
1196 getUArch(xegpu::getChipStr(insertStridedSlice).value_or(""));
1197 if (!uArch)
1198 return;
1199
1200 auto requiredResLayoutAttr = xegpu::setupInsertStridedSliceResultLayout(
1201 layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
1202 xegpu::setTemporaryLayout(insertStridedSlice->getResult(0),
1203 requiredResLayoutAttr);
1204
1205 auto srcLayoutAttr = xegpu::inferInsertStridedSliceSourceLayout(
1206 requiredResLayoutAttr, resVecType.getShape(), srcVecType.getShape());
1207 propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
1208 propagateIfChanged(operands[1],
1209 operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
1210}
1211
1212/// Propagate the layout of the result to the tensor descriptor, mask and offset
1213/// operands in LoadGatherOp.
1214void LayoutInfoPropagation::visitLoadGatherOp(
1215 xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
1216 ArrayRef<const LayoutInfoLattice *> results) {
1217 xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
1218 xegpu::DistributeLayoutAttr anchorLayoutAttr = load.getLayoutAttr();
1219 const uArch *uArch = getUArch(getChipStr(load).value_or(""));
1220 if (!uArch)
1221 return;
1222 VectorType resVecTy = load.getValueType();
1223 int chunkSize = load.getChunkSize().value_or(1);
1224
1225 LayoutInfo resLayoutInfo = results[0]->getValue();
1226 if (!resLayoutInfo.isAssigned())
1227 return;
1228 auto consumerLayoutAttr =
1229 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1230
1231 if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
1232 requiredAnchorLayoutAttr = anchorLayoutAttr;
1233 } else {
1234 if (!resVecTy) {
1235 load.emitWarning("Not propagating, non-vector payload supplied.");
1236 return;
1237 }
1238 requiredAnchorLayoutAttr = xegpu::setupLoadGatherAnchorLayout(
1239 layoutKind, resVecTy, chunkSize, consumerLayoutAttr, uArch);
1240 load.setLayoutAttr(requiredAnchorLayoutAttr);
1241 }
1242
1243 assert((chunkSize <= 1) || (layoutKind != xegpu::LayoutKind::Subgroup));
1244 auto maskLayoutAttr = xegpu::inferMaskOffsetLayoutForScatterIO(
1245 requiredAnchorLayoutAttr, chunkSize);
1246 LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);
1247 auto loadLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);
1248
1249 // Propagate the new layout to the tensor descriptor operand.
1250 if (isa<xegpu::TensorDescType>(load.getSourceType()))
1251 propagateIfChanged(operands[0], operands[0]->meet(loadLayoutInfo));
1252 // Propagate the new layout to the offset and mask operands.
1253 propagateIfChanged(operands[1], operands[1]->meet(maskLayoutInfo));
1254 propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
1255}
1256
1257/// Set the layout for the value, tensor descriptor, offset and mask operands in
1258/// the StoreScatterOp.
1259void LayoutInfoPropagation::visitStoreScatterOp(
1260 xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
1261 ArrayRef<const LayoutInfoLattice *> results) {
1262
1263 xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
1264 xegpu::DistributeLayoutAttr anchorLayoutAttr = storeScatter.getLayoutAttr();
1265 const uArch *uArch = getUArch(getChipStr(storeScatter).value_or(""));
1266 if (!uArch)
1267 return;
1268 VectorType srcVecTy = storeScatter.getValueType();
1269 int chunkSize = storeScatter.getChunkSize().value_or(1);
1270
1271 if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
1272 requiredAnchorLayoutAttr = anchorLayoutAttr;
1273 } else {
1274 if (!srcVecTy) {
1275 storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
1276 return;
1277 }
1278 requiredAnchorLayoutAttr = xegpu::setupStoreScatterAnchorLayout(
1279 layoutKind, srcVecTy, chunkSize, uArch);
1280 storeScatter.setLayoutAttr(requiredAnchorLayoutAttr);
1281 }
1282
1283 LayoutInfo srcLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);
1284 assert((chunkSize <= 1) || (layoutKind != xegpu::LayoutKind::Subgroup));
1285 auto maskLayoutAttr = xegpu::inferMaskOffsetLayoutForScatterIO(
1286 requiredAnchorLayoutAttr, chunkSize);
1287 LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);
1288
1289 // Propagate the payload operand layout
1290 propagateIfChanged(operands[0], operands[0]->meet(srcLayoutInfo));
1291 // Propagate the destination (if tdesc) operand layout
1292 if (isa<xegpu::TensorDescType>(storeScatter.getDestType()))
1293 propagateIfChanged(operands[1], operands[1]->meet(srcLayoutInfo));
1294 // Propagate the new layout to the offset and mask operands.
1295 propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
1296 propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo));
1297}
1298
1299void LayoutInfoPropagation::visitLoadMatrixOp(
1300 xegpu::LoadMatrixOp loadMatrixOp, ArrayRef<LayoutInfoLattice *> operands,
1301 ArrayRef<const LayoutInfoLattice *> results) {
1302
1303 LayoutInfo resLayoutInfo = results[0]->getValue();
1304 if (!resLayoutInfo.isAssigned())
1305 return;
1306
1307 auto consumerLayoutAttr =
1308 dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
1309
1310 xegpu::DistributeLayoutAttr anchorLayout = loadMatrixOp.getLayoutAttr();
1311
1312 // only need to set anchor layout, no need to porpagate to memdesc and
1313 // offset
1314 if (!hasParamsOfLayoutKind(anchorLayout)) {
1315 VectorType resVecTy =
1316 llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
1317 const uArch *uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
1318 if (!uArch)
1319 return;
1320 auto requiredAnchorLayoutAttr = xegpu::setupLoadMatrixAnchorLayout(
1321 layoutKind, resVecTy, consumerLayoutAttr, uArch);
1322 loadMatrixOp.setLayoutAttr(requiredAnchorLayoutAttr);
1323 }
1324}
1325
1326void LayoutInfoPropagation::visitStoreMatrixOp(
1327 xegpu::StoreMatrixOp storeMatrix, ArrayRef<LayoutInfoLattice *> operands,
1328 ArrayRef<const LayoutInfoLattice *> results) {
1329 xegpu::DistributeLayoutAttr anchorLayout = storeMatrix.getLayoutAttr();
1330 LayoutInfo layout;
1331 if (hasParamsOfLayoutKind(anchorLayout)) {
1332 layout = LayoutInfo(anchorLayout);
1333 } else {
1334 VectorType srcVecTy =
1335 llvm::cast<VectorType>(storeMatrix.getData().getType());
1336 const uArch *uArch = getUArch(getChipStr(storeMatrix).value_or(""));
1337 if (!uArch)
1338 return;
1339 auto requiredAnchorLayoutAttr =
1340 xegpu::setupStoreMatrixAnchorLayout(layoutKind, srcVecTy, uArch);
1341 storeMatrix.setLayoutAttr(requiredAnchorLayoutAttr);
1342 layout = LayoutInfo(requiredAnchorLayoutAttr);
1343 }
1344
1345 propagateIfChanged(operands[0], operands[0]->meet(layout));
1346}
1347
1348namespace {
1349//===----------------------------------------------------------------------===//
1350// RunLayoutInfoPropagation
1351//===----------------------------------------------------------------------===//
1352
/// Driver class for running the LayoutInfoPropagation analysis.
///
/// Constructing an instance loads the analyses into a DataFlowSolver and runs
/// the solver on the given op; the per-value results can then be queried with
/// getLayoutInfo or dumped with printAnalysisResult.
class RunLayoutInfoPropagation {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)

  /// Run the layout propagation analysis on `op`. `layoutKind` and
  /// `indexBitWidth` are forwarded to LayoutInfoPropagation.
  RunLayoutInfoPropagation(Operation *op, xegpu::LayoutKind layoutKind,
                           unsigned indexBitWidth)
      : target(op) {
    SymbolTableCollection symbolTable;
    loadBaselineAnalyses(solver);
    solver.load<LayoutInfoPropagation>(symbolTable, layoutKind, indexBitWidth);
    // The solver's result is deliberately discarded.
    (void)solver.initializeAndRun(op);
  }

  /// Return the layout recorded for `val`, or a default-constructed
  /// LayoutInfo when the solver has no state for it.
  LayoutInfo getLayoutInfo(Value val);

  /// Print the layouts of function arguments and op results to `os`.
  void printAnalysisResult(llvm::raw_ostream &os);

private:
  /// Solver holding the analysis state.
  DataFlowSolver solver;
  /// The op the analysis was run on.
  const Operation *target;
};
1375} // namespace
1376
1377LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
1378 auto *state = solver.lookupState<LayoutInfoLattice>(val);
1379 if (!state)
1380 return {};
1381 return state->getValue();
1382}
1383
1384// Print the analysis result for debugging purposes.
1385void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
1386 auto printFunctionResult = [&](FunctionOpInterface funcOp) {
1387 os << "function: " << funcOp.getName() << ":\n";
1388 // Function arguments
1389 for (BlockArgument arg : funcOp.getArguments()) {
1390 LayoutInfo layout = getLayoutInfo(arg);
1391 os << "argument: " << arg << "\n";
1392 os << "layout : ";
1393 layout.print(os);
1394 os << "\n";
1395 }
1396 // Function ops
1397 funcOp.walk([&](Operation *op) {
1398 // Skip ops that do not have results
1399 if (op->getResults().empty())
1400 return;
1401 os << "op : ";
1402 // For control-flow ops, print the op name only.
1403 if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
1404 os << op->getName();
1405 else
1406 op->print(os);
1407 os << "\n";
1408 // Print the layout for each result.
1409 for (auto [i, r] : llvm::enumerate(op->getResults())) {
1410 LayoutInfo layout = getLayoutInfo(r);
1411 os << "layout for result #" << i << ": ";
1412 layout.print(os);
1413 os << "\n";
1414 }
1415 });
1416 };
1417
1418 SmallVector<FunctionOpInterface> funcOps;
1419 if (auto modOp = dyn_cast<ModuleOp>(target)) {
1420 for (auto funcOp : modOp.getOps<FunctionOpInterface>())
1421 funcOps.push_back(funcOp);
1422
1423 // Collect all GpuFuncOps in the module.
1424 for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
1425 for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>())
1426 funcOps.push_back(gpuFuncOp);
1427 }
1428 }
1429 // Print the analysis result for each function.
1430 for (FunctionOpInterface funcOp : funcOps)
1431 printFunctionResult(funcOp);
1432}
1433
1434namespace {
1435
1436//===----------------------------------------------------------------------===//
1437// ResolveLayoutConflicts
1438//===----------------------------------------------------------------------===//
1439
1440/// Helper to get the defining CreateNdDescOp of a tensor descriptor value. This
1441/// function tries to find the defining CreateNdDescOp recursively accross
1442/// control-flow boundaries.
1443static xegpu::CreateNdDescOp getDefiningCreateNdDescOp(Value tdescValue) {
1444 // Try to get the defining CreateNdDescOp of the tensor descriptor.
1445 auto definingOp = tdescValue.getDefiningOp<xegpu::CreateNdDescOp>();
1446 if (definingOp)
1447 return definingOp;
1448 // If tdescValue is an argument, try to get the tied init value from the
1449 // parent loop-like op.
1450 if (auto arg = dyn_cast<BlockArgument>(tdescValue)) {
1451 auto *parentOp = arg.getOwner()->getParentOp();
1452 if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
1453 OpOperand *tiedInit = loop.getTiedLoopInit(arg);
1454 if (tiedInit)
1455 return getDefiningCreateNdDescOp(tiedInit->get());
1456 }
1457 }
1458 // If not found, return null.
1459 return nullptr;
1460}
1461
1462struct ResolveLayoutConflicts {
1463 ResolveLayoutConflicts(Operation *parentOp)
1464 : parentOp(parentOp), builder(parentOp->getContext()) {}
1465 LogicalResult run();
1466
1467private:
1468 Operation *parentOp;
1469 OpBuilder builder;
1470 LogicalResult resolveTensorDescConsumer(OpOperand &operand);
1471 LogicalResult resolveVectorConsumer(OpOperand &operand);
1472 LogicalResult assignResultLayout(OpResult &result);
1473};
1474
1475} // namespace
1476
1477LogicalResult ResolveLayoutConflicts::run() {
1478 // Scan all operations in the parent op and resolve layout conflicts at
1479 // tensor descriptor and vector use points.
1480 auto r = parentOp->walk([&](Operation *op) -> WalkResult {
1481 // if the operation inputs vector and output scalar, like multi-reduction we
1482 // need to check if the result has layout and add a convert_layout to serve
1483 // as anchor op for the reduction op's layout.
1484 if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
1485 for (OpResult result : op->getResults()) {
1486 if (result.getType().isIntOrFloat()) {
1487 auto res = assignResultLayout(result);
1488 if (failed(res)) {
1489 DBGS() << "Failed to resolve vector consumer for multi-reduction "
1490 << *op << "\n";
1491 return WalkResult::interrupt();
1492 }
1493 }
1494 }
1495 }
1496 for (OpOperand &operand : op->getOpOperands()) {
1497 // Handle conflicts in tensor descriptor operands.
1498 Type operandType = operand.get().getType();
1499 if (isa<xegpu::AnchorLayoutInterface>(op) &&
1500 isa<xegpu::TensorDescType>(operandType)) {
1501 auto res = resolveTensorDescConsumer(operand);
1502 if (failed(res)) {
1503 DBGS() << "Failed to resolve tensor descriptor consumer: " << *op
1504 << "\n";
1505 return WalkResult::interrupt();
1506 }
1507 }
1508 // Handle conflicts in vector operands.
1509 if (isa<VectorType>(operandType)) {
1510 auto res = resolveVectorConsumer(operand);
1511 if (failed(res)) {
1512 DBGS() << "Failed to resolve vector consumer: " << *op << "\n";
1513 return WalkResult::interrupt();
1514 }
1515 }
1516 }
1517 return WalkResult::advance();
1518 });
1519
1520 LLVM_DEBUG({
1521 DBGS() << "IR after resolving layout conflicts:\n";
1522 parentOp->dump();
1523 });
1524
1525 return r.wasInterrupted() ? failure() : success();
1526}
1527
1528LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
1529 Operation *producerOp = result.getDefiningOp();
1530 auto producerLayout = xegpu::getDistributeLayoutAttr(result);
1531 // Insert a convert_layout op to assign the layout.
1533 auto convertOp = xegpu::ConvertLayoutOp::create(
1534 builder, producerOp->getLoc(), result.getType(), result, producerLayout,
1535 producerLayout);
1536 result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
1537 return success();
1538}
1539
/// Resolve a layout conflict at a vector operand. When the layout of the
/// operand's value differs from the layout expected at this use, the operand
/// is rewired either to a clone of the producer stamped with the expected
/// layout or to a newly inserted xegpu.convert_layout op. Fails only when no
/// consumer layout can be found for the operand.
LogicalResult
ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
  Value vectorValue = operand.get();
  Operation *consumerOp = operand.getOwner();
  // Get the current layout of the vector value.
  auto producerLayout = xegpu::getDistributeLayoutAttr(vectorValue);
  if (!producerLayout) {
    // Only vectors of rank > 1 are expected to carry a layout; warn but do not
    // fail.
    if (auto vectorTy = dyn_cast<VectorType>(vectorValue.getType());
        vectorTy && vectorTy.getRank() > 1)
      consumerOp->emitWarning("Expected layout for non-1D vectors.");
    return success(); // uniform non-tensor-data vector does not require layout
  }
  // Region branch ops (e.g. scf.for) and their terminators (e.g. scf.yield)
  // forward their operands to successor region inputs / parent op results;
  // their consumer layout is resolved through that forwarding, not at this
  // use point.
  if (isa<RegionBranchOpInterface, RegionBranchTerminatorOpInterface>(
          consumerOp))
    return success();

  auto consumerLayout = xegpu::getConsumerLayoutAt(operand);
  if (!consumerLayout)
    return consumerOp->emitError(
        "No consumer layout found for vector operand.");

  // If layouts are same, no conflict exists, return success.
  if (consumerLayout.isEqualTo(producerLayout))
    return success();

  // If the producer is trivially rematerializable (e.g. `vector.step`, splat
  // `arith.constant`), clone it and stamp the consumer's expected layout on
  // the clone instead of inserting a `xegpu.convert_layout`. The convert
  // would otherwise lower to a cross-subgroup data movement through SLM at
  // WG-to-SG distribution time, which is more expensive than
  // recomputing a pure value generator.
  // NOTE(review): the final conjunct of the condition below and the opening
  // brace of this branch are missing from this view of the file -- restore
  // them from the original source before building.
  if (auto *producerOp = vectorValue.getDefiningOp();
      producerOp && producerOp->getNumResults() == 1 &&
      isa<OpResult>(vectorValue) &&
    builder.setInsertionPointAfter(producerOp);
    Operation *clone = builder.clone(*producerOp);
    OpResult cloneResult = clone->getResult(0);
    // Drop the inherited producer layout so the new layout takes effect
    xegpu::removeLayoutAttr(cloneResult);
    xegpu::setDistributeLayoutAttr(cloneResult, consumerLayout);
    // Only this use is redirected to the clone.
    operand.set(cloneResult);
    return success();
  }

  // Insert a convert_layout op to resolve the conflict.
  builder.setInsertionPointAfterValue(vectorValue);
  auto convertOp = xegpu::ConvertLayoutOp::create(
      builder, consumerOp->getLoc(), vectorValue.getType(), vectorValue,
      producerLayout, consumerLayout);

  // Update the operand to use the converted value.
  operand.set(convertOp.getResult());
  return success();
}
1599
1600LogicalResult
1601ResolveLayoutConflicts::resolveTensorDescConsumer(OpOperand &operand) {
1602 Operation *consumerOp = operand.getOwner();
1603 Value tdescValue = operand.get();
1604 auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(consumerOp);
1605 auto currTDescType = dyn_cast<xegpu::TensorDescType>(tdescValue.getType());
1606 assert(anchorOp && currTDescType &&
1607 "Expected anchor layout op and tensor descriptor consumer.");
1608 Attribute currLayout = currTDescType.getLayout();
1609 Attribute expectedLayout = anchorOp.getAnchorLayout();
1610 // A conflict exists in tensor descriptor operand if tensor descriptor's
1611 // layout is different from the anchor layout expected by the consumer.
1612 if (expectedLayout && currLayout && expectedLayout != currLayout) {
1613 // Try to get the defining CreateNdDescOp of the tensor descriptor.
1614 auto conflictingCreateNdOp = getDefiningCreateNdDescOp(tdescValue);
1615 if (!conflictingCreateNdOp) {
1616 DBGS() << "Unable to find defining CreateNdDescOp for tensor descriptor: "
1617 << tdescValue << "\n";
1618 return failure();
1619 }
1620 // Duplicate the CreateNdDescOp with the expected layout.
1621 builder.setInsertionPointAfter(conflictingCreateNdOp);
1622 auto newTensorDescType = xegpu::TensorDescType::get(
1623 conflictingCreateNdOp.getContext(), currTDescType.getShape(),
1624 currTDescType.getElementType(), currTDescType.getEncoding(),
1625 expectedLayout);
1626 xegpu::CreateNdDescOp newOp = xegpu::CreateNdDescOp::create(
1627 builder, consumerOp->getLoc(), newTensorDescType,
1628 conflictingCreateNdOp->getOperands(),
1629 conflictingCreateNdOp->getAttrs());
1630 // Replace the tensor descriptor operand in the consumer op with the new
1631 // tensor descriptor.
1632 consumerOp->replaceUsesOfWith(tdescValue, newOp.getResult());
1633 }
1634 return success();
1635}
1636
1637using GetLayoutFnTy = function_ref<xegpu::DistributeLayoutAttr(Value)>;
1638/// Update an operation with the layout of its results. If the result type is
1639/// a vector type, a temporary layout attribute is added to the operation. If
1640/// the result type is a tensor descriptor type, the type is updated with the
1641/// layout attribute. The users of the result are also updated with the layout
1642/// attribute.
1643static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
1644 GetLayoutFnTy getLayoutOfValue) {
1645 // Region ops (like scf.for) are already handled by the
1646 // updateControlFlowOps.
1647 if (mlir::isa<mlir::RegionBranchOpInterface>(op))
1648 return success();
1649
1650 // Iterate over all the results.
1651 for (OpResult result : op->getResults()) {
1652 Type resultType = result.getType();
1653 // Layouts are needed only for vector and tensor descriptor types.
1654 if (!isa<VectorType, xegpu::TensorDescType>(resultType))
1655 continue;
1656 // If the result has no layout but has users, emit a warning and continue.
1657 xegpu::DistributeLayoutAttr layout = getLayoutOfValue(result);
1658 if (!layout && result.getNumUses() > 0) {
1659 op->emitWarning("op has users but no layout assigned for its result");
1660 continue;
1661 }
1662 // If the result is a tensor descriptor type, update the tensor desc type
1663 // with layout.
1664 if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
1665 auto typeWithLayout = xegpu::TensorDescType::get(
1666 tensorDescTy.getContext(), tensorDescTy.getShape(),
1667 tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
1668 result.setType(typeWithLayout);
1669 continue;
1670 }
1671 // If the result is a vector type, add a temporary layout attribute to the
1672 // op.
1674 }
1675 return success();
1676}
1677
1678/// Region ops like scf.for need special handling because they have blocks
1679/// inside. If the blocks have tensor descriptor type as block arguments,
1680/// thier types must be updated. Also region op can have results that may not
1681/// have any users (e.g. A and B tiles). They are not assigned a layout by
1682/// layout analysis because they have no users. However inside the region op
1683/// corresponding block arguments for these results do have layouts.
1684/// Therefore, in this case we still need to update the result types with the
1685/// layout attribute. This function function updates the internal block
1686/// arguments and the result types of the region op with the assigned layouts.
1687/// clang-format off
1688/// Example: scf.for ... iter_args(...) -> (out types) {
1689/// ^bb0(block types):
1690/// ...
1691/// scf.yield ... : (yield types)
1692/// }
1693/// clang-format on
1694/// In this example, at scf.yield, control-flow can transfer to two successor
1695/// regions. One is the ^bb0 (for loop body) and the other is the scf.for op
1696/// itself (yield the results). So we update both the block arguments of the
1697/// successor region (i.e. block types) and the result types of the scf.for op
1698/// (i.e. out types). Note that yield types are updated by respective
1699/// producers inside bb0.
1700static LogicalResult
1702 mlir::RegionBranchTerminatorOpInterface terminator,
1703 GetLayoutFnTy getLayoutOfValue) {
1704 // Only process if the terminator is inside a region branch op.
1705 auto branchOp = dyn_cast<RegionBranchOpInterface>(terminator->getParentOp());
1706 if (!branchOp)
1707 return success();
1708
1710 branchOp.getSuccessorOperandInputMapping(mapping,
1711 RegionBranchPoint(terminator));
1712 for (const auto &[successorOperand, successorInputs] : mapping) {
1713 for (Value successorInput : successorInputs) {
1714 Type inputType = successorInput.getType();
1715 // We only need to operate on tensor descriptor or vector types.
1716 if (!isa<xegpu::TensorDescType, VectorType>(inputType))
1717 continue;
1718 xegpu::DistributeLayoutAttr successorOperandLayout =
1719 getLayoutOfValue(successorOperand->get());
1720
1721 // If either of the layouts is not assigned, we cannot proceed.
1722 if (!successorOperandLayout) {
1723 LLVM_DEBUG(DBGS() << "No layout assigned for forwarded operand in "
1724 "branch terminator: "
1725 << successorOperand->get() << "\n");
1726 return failure();
1727 }
1728 // Get tensor descriptor type with the layout.
1729 if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType)) {
1730 auto newTdescTy = xegpu::TensorDescType::get(
1731 tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
1732 tdescTy.getEncoding(), successorOperandLayout);
1733 successorInput.setType(newTdescTy);
1734 continue;
1735 }
1736 // If the type is a vector type and this region argument is an OpResult,
1737 // set the layout attribute on the OpResult.
1738 if (auto result = dyn_cast<OpResult>(successorInput))
1739 xegpu::setDistributeLayoutAttr(result, successorOperandLayout);
1740 }
1741 }
1742 return success();
1743}
1744
1745/// Update the function arguments and results with the layouts.
1746static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
1747 mlir::FunctionOpInterface funcOp,
1748 GetLayoutFnTy getLayoutOfValue) {
1749 // Only process functions whose type is a standard MLIR FunctionType.
1750 // Functions using a different type representation (e.g. llvm.func with
1751 // LLVMFunctionType) are not targets for XeGPU layout propagation, and
1752 // calling setType(FunctionType{}) on them would corrupt their type.
1753 if (!isa<FunctionType>(funcOp.getFunctionType()))
1754 return success();
1755 SmallVector<Type> newArgTypes;
1756 // Update the function arguments.
1757 for (BlockArgument arg : funcOp.getArguments()) {
1758 Type argType = arg.getType();
1759 newArgTypes.push_back(argType);
1760 if (!isa<VectorType, xegpu::TensorDescType>(argType))
1761 continue;
1762 xegpu::DistributeLayoutAttr layout = getLayoutOfValue(arg);
1763 if (!layout) {
1764 LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg
1765 << " but got none.\n");
1766 return failure();
1767 }
1768 if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(argType)) {
1769 auto newTdescTy = xegpu::TensorDescType::get(
1770 tensorDescTy.getContext(), tensorDescTy.getShape(),
1771 tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
1772 arg.setType(newTdescTy);
1773 newArgTypes.back() = newTdescTy;
1774 }
1775 }
1776 // Update the function type with the new argument types.
1777 // NOTE: We assume that function results are not expected to have layouts.
1778 funcOp.setType(FunctionType::get(funcOp.getContext(), newArgTypes,
1779 funcOp.getResultTypes()));
1780 return success();
1781}
1782
1783namespace {
1784struct XeGPUPropagateLayoutPass final
1785 : public xegpu::impl::XeGPUPropagateLayoutBase<XeGPUPropagateLayoutPass> {
1786 XeGPUPropagateLayoutPass() = default;
1787 XeGPUPropagateLayoutPass(const XeGPUPropagateLayoutPass &other) = default;
1788 XeGPUPropagateLayoutPass(xegpu::XeGPUPropagateLayoutOptions options)
1789 : XeGPUPropagateLayoutBase(std::move(options)) {}
1790 void runOnOperation() override;
1791};
1792
1793} // namespace
1794
1796 LayoutKind layoutKind,
1797 unsigned indexBitWidth, bool printOnly) {
1798 RunLayoutInfoPropagation analysis(target, layoutKind, indexBitWidth);
1799 // Print the analysis result and exit. (for debugging purposes)
1800 if (printOnly) {
1801 auto &os = llvm::outs();
1802 analysis.printAnalysisResult(os);
1803 return success();
1804 }
1805 // Helper to convert LayoutInfo to xegpu::LayoutAttr.
1806 auto getXeGPULayoutForValue = [&](Value val) -> xegpu::DistributeLayoutAttr {
1807 LayoutInfo layout = analysis.getLayoutInfo(val);
1808 if (auto opResult = dyn_cast<OpResult>(val)) {
1809 Operation *defOp = opResult.getDefiningOp();
1810 if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
1811 auto anchorLayout = anchorOp.getAnchorLayout();
1812 if (anchorLayout != nullptr)
1813 return anchorLayout;
1814 }
1815 xegpu::DistributeLayoutAttr requiredResLayoutAttr =
1816 xegpu::getTemporaryLayout(opResult);
1817 if (requiredResLayoutAttr != nullptr)
1818 return requiredResLayoutAttr;
1819 }
1820 if (!layout.isAssigned())
1821 return {};
1822 xegpu::DistributeLayoutAttr layoutAttr =
1823 cast<xegpu::DistributeLayoutAttr>(layout.get());
1824 if (layout.isSliceLayout())
1825 return cast<xegpu::SliceAttr>(layoutAttr);
1826
1827 return cast<xegpu::LayoutAttr>(layoutAttr);
1828 };
1829
1830 Operation *op = target;
1831 auto walkResult = op->walk([&](mlir::Block *block) -> WalkResult {
1832 for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
1833 LogicalResult r = success();
1835 .Case([&](mlir::RegionBranchTerminatorOpInterface branchTermOp) {
1836 r = updateControlFlowOps(builder, branchTermOp,
1837 getXeGPULayoutForValue);
1838 })
1839 .Case([&](mlir::FunctionOpInterface funcOp) {
1840 r = updateFunctionOpInterface(builder, funcOp,
1841 getXeGPULayoutForValue);
1842 })
1843 .Default([&](Operation *op) {
1844 r = updateOp(builder, op, getXeGPULayoutForValue);
1845 });
1846 if (failed(r)) {
1847 op.emitError("Failed to update operation with the layout.");
1848 return WalkResult::interrupt();
1849 }
1850 }
1851 return WalkResult::advance();
1852 });
1853 if (walkResult.wasInterrupted())
1854 return failure();
1855
1856 return success();
1857}
1858
1860 ResolveLayoutConflicts resolver(target);
1861 return resolver.run();
1862}
1863
1864void XeGPUPropagateLayoutPass::runOnOperation() {
1865
1866 xegpu::removeTemporaryLayoutAttrs(getOperation());
1867
1868 xegpu::LayoutKind layoutKind;
1869 if (this->layoutKind == "lane") {
1870 layoutKind = xegpu::LayoutKind::Lane;
1871 } else if (this->layoutKind == "inst") {
1872 layoutKind = xegpu::LayoutKind::InstData;
1873 } else if (this->layoutKind == "subgroup") {
1874 layoutKind = xegpu::LayoutKind::Subgroup;
1875 } else {
1876 getOperation()->emitError("Unsupported layout kind option: " +
1877 this->layoutKind);
1878 signalPassFailure();
1879 return;
1880 }
1881 OpBuilder builder(&getContext());
1882 if (failed(xegpu::propagateLayouts(builder, getOperation(), layoutKind,
1883 this->indexBitWidth, this->printOnly))) {
1884 signalPassFailure();
1885 return;
1886 }
1887 // Resolve layout conflicts if any.
1888 if (failed(xegpu::resolveLayoutConflicts(getOperation()))) {
1889 signalPassFailure();
1890 return;
1891 }
1892}
return success()
#define DBGS()
Definition Hoisting.cpp:32
std::string join(const Ts &...args)
Helper function to concatenate arguments into a std::string.
lhs
b getContext())
auto load
static llvm::ManagedStatic< PassManagerOptions > options
static void print(spirv::VerCapExtAttr triple, DialectAsmPrinter &printer)
static Value broadcast(Location loc, Value toBroadcast, unsigned numElements, const TypeConverter &typeConverter, ConversionPatternRewriter &rewriter)
Broadcasts the value to vector with numElements number of elements.
#define MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CLASS_NAME)
Definition TypeID.h:331
static SmallVector< LayoutRepresentation > getValidLayouts(ArrayRef< int64_t > wgShape, ArrayRef< int64_t > instData, int64_t sgCount)
static LogicalResult updateControlFlowOps(mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, GetLayoutFnTy getLayoutOfValue)
Region ops like scf.for need special handling because they have blocks inside.
function_ref< xegpu::DistributeLayoutAttr(Value)> GetLayoutFnTy
FailureOr< int64_t > getNumSg(Operation *op, const int sgSize)
static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, GetLayoutFnTy getLayoutOfValue)
Update an operation with the layout of its results.
static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder, mlir::FunctionOpInterface funcOp, GetLayoutFnTy getLayoutOfValue)
Update the function arguments and results with the layouts.
Attributes are known-constant values of operations.
Definition Attributes.h:25
This class represents an argument of a Block.
Definition Value.h:306
Block represents an ordered list of Operations.
Definition Block.h:33
OpListType & getOperations()
Definition Block.h:147
The general data-flow analysis solver.
LogicalResult initializeAndRun(Operation *top, llvm::function_ref< bool(DataFlowAnalysis &)> analysisFilter=nullptr)
Initialize analyses starting from the provided top-level operation and run the analysis until fixpoin...
const StateT * lookupState(AnchorT anchor) const
Lookup an analysis state for the given lattice anchor.
AnalysisT * load(Args &&...args)
Load an analysis into the solver. Return the analysis instance.
IRValueT get() const
Return the current value being used by this operand.
void set(IRValueT newValue)
Set the current value being used by this operand.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class helps build Operations.
Definition Builders.h:209
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition Builders.cpp:567
void setInsertionPointAfterValue(Value val)
Sets the insertion point to the node after the specified value.
Definition Builders.h:423
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition Builders.h:414
This class represents an operand of an operation.
Definition Value.h:254
This is a value defined by a result of an operation.
Definition Value.h:454
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
void replaceUsesOfWith(Value from, Value to)
Replace any uses of 'from' with 'to' within this operation.
InFlightDiagnostic emitWarning(const Twine &message={})
Emit a warning about this operation, reporting up to any diagnostic handlers that may be listening.
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:432
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:240
MutableArrayRef< OpOperand > getOpOperands()
Definition Operation.h:408
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition Operation.h:255
OperationName getName()
The name of an operation is the key identifier for it.
Definition Operation.h:115
void print(raw_ostream &os, const OpPrintingFlags &flags={})
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
Definition Operation.h:822
result_range getResults()
Definition Operation.h:440
unsigned getNumResults()
Return the number of results held by this operation.
Definition Operation.h:429
This class represents a point being branched from in the methods of the RegionBranchOpInterface.
This class represents a successor of a region.
This class represents a collection of SymbolTables.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition Types.cpp:118
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition Value.cpp:18
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult advance()
Definition WalkResult.h:47
static WalkResult interrupt()
Definition WalkResult.h:46
This class represents a lattice holding a specific value of type ValueT.
A sparse (backward) data-flow analysis for propagating SSA value lattices backwards across the IR by ...
SparseBackwardDataFlowAnalysis(DataFlowSolver &solver, SymbolTableCollection &symbolTable)
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Operation * getOwner() const
Return the owner of this operand.
Definition UseDefLists.h:38
void loadBaselineAnalyses(DataFlowSolver &solver)
Populates a DataFlowSolver with analyses that are required to ensure user-defined analyses are run pr...
Definition Utils.h:29
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
const uArch * getUArch(llvm::StringRef archName)
DistributeLayoutAttr inferShapeCastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a shape cast operation given the result layout attribute,...
DistributeLayoutAttr setupInterleaveResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the result layout for an interleave operation to ensure the source layout can be safely deriv...
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > permutation)
Infers the source layout attribute for a transpose operation given the result layout attribute and pe...
DistributeLayoutAttr inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for an insert strided slice operation given the result layout attr...
void removeTemporaryLayoutAttrs(Operation *op)
Removes the temporary layout attributes for each OpOperand and OpResult of the given operation.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
LayoutKind
Specifies the level of a layout hierarchy for comparison or propagation.
Definition XeGPU.h:32
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
DistributeLayoutAttr inferInterleaveSourceLayout(DistributeLayoutAttr resLayout)
Infers the source layout attribute for an interleave operation given the result layout attribute.
DistributeLayoutAttr setupLoadMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for load matrix operation.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr inferBroadcastSourceLayout(DistributeLayoutAttr resLayout, ArrayRef< int64_t > resShape, ArrayRef< int64_t > srcShape)
Infers the source layout attribute for a broadcast operation given the result layout attribute,...
std::optional< std::tuple< DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr > > setupDpasMxLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, VectorType aScaleTy, VectorType bScaleTy, DistributeLayoutAttr consumerLayout, int numSg, const uArch::uArch *uArch)
Sets up the anchor layouts for dpas_mx operands (A, B, C/D, A_scale, and B_scale).
DistributeLayoutAttr setupStoreScatterAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, const uArch::uArch *uArch)
Sets up the anchor layout for a store scatter operation.
SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, DistributeLayoutAttr consumerLayout, SmallVector< int64_t > reductionDims, int numSg, const uArch::uArch *uArch)
Sets up layout for Multi-Reduction operations by creating a SliceAttr for the result.
DistributeLayoutAttr setupBitCastResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Setup the result layout attribute for a bitcast operation based on element type bitwidths.
void removeLayoutAttr(const T &operandOrResult)
Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
DistributeLayoutAttr inferMaskOffsetLayoutForScatterIO(DistributeLayoutAttr payloadLayout, int chunkSize)
Infers the layout attribute for mask and offset operand for Chunked load and store,...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
LogicalResult resolveLayoutConflicts(Operation *target)
DistributeLayoutAttr inferBitCastSourceLayout(DistributeLayoutAttr resLayout, int resElemTyBitWidth, int srcElemTyBitWidth)
Infers the source layout attribute for a bitcast operation given the result layout attribute,...
DistributeLayoutAttr setupInsertStridedSliceResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the result layout for an insert strided slice operation.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
DistributeLayoutAttr inferDeinterleaveSourceLayout(DistributeLayoutAttr resLayout)
Infers the source layout attribute for a deinterleave operation given the result layout attribute.
DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand)
Gets the expected layout for a given consumer operand.
DistributeLayoutAttr inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout, SmallVector< int64_t > reduceDims)
Infers the source layout attribute for a reduction operation given the result layout attribute and re...
bool isTriviallyRematerializable(Operation *op)
Returns true if op is safe and cheap to clone: it has no side effects, no regions,...
DistributeLayoutAttr setupLoadGatherAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, int chunkSize, DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch)
Sets up the anchor layout for a load gather operation.
LogicalResult propagateLayouts(OpBuilder &builder, Operation *target, LayoutKind layoutKind, unsigned indexBitWidth, bool printOnly=false)
std::optional< std::tuple< DistributeLayoutAttr, DistributeLayoutAttr, DistributeLayoutAttr > > setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg, const uArch::uArch *uArch)
Sets up the anchor layouts for a dpas operands (A, B, and C/D).
SliceAttr setupReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, const uArch::uArch *uArch)
Sets up layout for Reduction operations by creating a SliceAttr for the result.
DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy, const uArch::uArch *uArch)
Sets up the anchor layout for a store matrix operation.
Include the generated interface declarations.
DenseMap< OpOperand *, SmallVector< Value > > RegionBranchSuccessorMapping
A mapping from successor operands to successor inputs.
bool operator==(StringAttr lhs, std::nullptr_t)
Define comparisons for StringAttr against nullptr and itself to avoid the StringRef overloads from be...
llvm::TypeSwitch< T, ResultT > TypeSwitch
Definition LLVM.h:139
Operation * clone(OpBuilder &b, Operation *op, TypeRange newResultTypes, ValueRange newOperands)
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
llvm::function_ref< Fn > function_ref
Definition LLVM.h:147
virtual int getSubgroupSize() const =0
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:168