MLIR 23.0.0git
XeGPUUtils.h
Go to the documentation of this file.
1//===- XeGPUUtils.h - Vector Utilities --------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
10#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
11
15#include "llvm/ADT/SetVector.h"
16#include <functional>
17
18namespace mlir {
19
20class UnrealizedConversionCastOp;
21class VectorType;
22class OpOperand;
23class OpResult;
24class OpBuilder;
25class ValueRange;
26class TypeConverter;
27class OpFoldResult;
28
29namespace xegpu {
30class DistributeLayoutAttr;
31class LayoutAttr;
32class TensorDescType;
33
34namespace uArch {
35struct uArch;
36} // namespace uArch
37} // namespace xegpu
38
39namespace xegpu {
40
41/// Flatten a set of ValueRange into a single SmallVector<Value>
42SmallVector<Value> flattenValues(ArrayRef<ValueRange> values);
43
44/// If tensor descriptor has a layout attribute it is used in SIMT mode.
45/// In this mode, the distributed vector shape is determined as follows:
46/// Definitions:
47/// lane_data_size = lane_data[0] × lane_data[1]
48/// subgroup_size = lane_layout[0] × lane_layout[1]
49/// distribution_unit_size = subgroup_size × lane_data_size
50///
51/// Case 1: Regular loads/stores.
52/// The following conditions must be met:
53/// * tensor_desc[0] == lane_layout[0]
54/// Distributed vector is a 1D vector with shape:
55/// [chunk_size]
56///
57/// Case 2: Block loads/stores
58/// Additional definitions:
59/// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
60/// n_distribution_units = tensor_size / distribution_unit_size
61/// fragment_size = n_distribution_units * lane_data_size
62/// Given above definitions, the following conditions must be met:
63/// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
64/// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
65/// Distributed vector is a 1D vector with shape:
66/// [fragment_size]
67FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
68
69/// Helper to get the distributed vector type for a given vector type according
70/// to a given LayoutAttr.
71FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
72 LayoutAttr layout);
73
74/// Helper function to get distributed vector type for a source vector type
75/// according to the lane_layout. We simply divide each dimension of tensor
76/// descriptor shape by corresponding lane_layout dimension. If
77/// array_length > 1, that is prepended to the distributed shape.
78///
79/// Examples:
80/// | original vector shape | lane_layout | distributed vector shape |
81/// |-----------------------|-------------|--------------------------|
82/// | 32x16 | [1, 16] | 32x1 |
83/// | 32x16 | [2, 8] | 16x2 |
84/// | 2x32x16 | [1, 16] | 2x32x1 |
85FailureOr<VectorType>
86getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout,
87 VectorType originalType);
88
89/// Extract a set of small vectors from a value with a given shape using
90/// vector.extract_strided_slice.
92 Location loc, Value value,
94
95/// Create a vector of shape from a set of values using
96/// vector.insert_strided_slice.
98 ValueRange values,
100
101/// Retrieves the chip string from the XeVM target attribute of the parent
102/// GPU module operation. Returns the chip identifier if found, or nullopt
103/// if no GPU module parent or XeVM target attribute exists.
104std::optional<std::string> getChipStr(Operation *op);
105
106/// Generates element-wise addition ops of two arrays with same length.
110
111/// Generates element-wise addition ops of two arrays with automatic alignment.
112/// When the input arrays have different sizes, the shorter array is
113/// right-aligned with the longer array, and the unmatched leading elements from
114/// the longer array are preserved unchanged. This is commonly used for offset
115/// computation where higher-dimensional offsets need to be added to
116/// lower-dimensional adjustments.
117///
118/// Example:
119/// lhs = [l1, l2, l3], rhs = [r1, r2]
120/// Result: [l1, l2+r1, l3+r2]
124
125/// Given an `input` value representing per-lane data, this function returns the
126/// result after performing a reduction on the input over all lanes (number of
127/// lanes given by `size`). This uses butterfly shuffles to perform the
128/// reduction in a log2(size) number of steps.
129/// NOTE: Implementation taken from TestVectorTransforms.cpp
130Value subgroupReduction(Location loc, OpBuilder &builder, Value input,
131 vector::CombiningKind kind, uint32_t size);
132
133/// Given the `src` and `acc` arguments from a vector::MultiDimReductionOp,
134/// lower to a set of vector::ReductionOp ops over 1D slices extracted from
135/// `src`. The reduction is performed along `reductionDim`. The result is a
136/// vector with the same shape as `acc`.
137/// TODO: Only 2D to 1D reduction is supported for now.
140 vector::CombiningKind kind, int64_t reductionDim,
141 Location loc, PatternRewriter &rewriter);
142
143/// Creates a constant filled with the neutral (identity) value for the
144/// given reduction kind. For example: 0 for ADD/OR/XOR, 1 for MUL/AND,
145/// max/min signed/unsigned int for MINSI/MINUI/MAXSI/MAXUI, and +/-infinity
146/// for float min/max operations. If \p type is a VectorType, returns a splat
147/// vector constant; otherwise returns a scalar constant. Returns nullptr if
148/// the element type is incompatible with the requested reduction kind.
150 vector::CombiningKind kind);
151
152/// Lowers cross-lane reductions to shuffle operations on a 2D vector.
153/// Extracts slices along the reduction dimension, performs subgroup reductions
154/// with shuffles across reductionSize work-items, and inserts the results back
155/// into an accumulator vector.
158 vector::CombiningKind kind,
159 int64_t reductionDim,
160 int64_t reductionSize, Location loc,
161 PatternRewriter &rewriter);
162
163/// Helper Function to find a proper instruction multiple for the user-supplied
164/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes.
165/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
166/// array length).
167template <typename T>
168int getLargestDivisor(T dim, ArrayRef<T> candidates,
169 ArrayRef<T> candidateMultiples = {});
170
171/// Retrieves the DistributeLayoutAttr associated with a given Value. For
172/// TensorDescType values, the DistributeLayoutAttr is extracted from the
173/// TensorDescType itself. For other values, it is obtained from the attributes
174/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is
175/// found.
176DistributeLayoutAttr getDistributeLayoutAttr(const Value value);
177
178/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It
179/// will first check the operand_layout_{id} of the owner operation. If not
180/// found, it will check the operand itself and its defining op.
181DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr);
182
183/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult.
184/// Users should use setAnchorLayout instead.
186 const DistributeLayoutAttr layout);
187
188/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpOperand.
189/// Users should use setAnchorLayout instead.
190void setDistributeLayoutAttr(const OpOperand &opr,
191 const DistributeLayoutAttr layout);
192
193/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
194std::string getTemporaryLayoutName(const OpOperand &operand);
195
196/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
197std::string getTemporaryLayoutName(const OpResult result);
198
199/// Get and set the distribute layout attribute for non-anchor operations
200/// (and offsets/masks of load/store ops before we get rid of their temp attrs)
201template <typename T,
202 typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
203 std::is_same_v<T, OpResult>>>
204DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult);
205
206template <typename T,
207 typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
208 std::is_same_v<T, OpResult>>>
209void setTemporaryLayout(const T &operandOrResult,
210 const DistributeLayoutAttr layout);
211
212/// Helper function to check if the layout is packed. Layout is packed if it is
213/// 2D and lane_data[0] != 1 (data packed from col dimension).
214/// TODO: Move to target info.
215bool requirePacked(const DistributeLayoutAttr layout);
216
217/// Helper function to check if the layout requires a transpose effect.
218bool requireTranspose(const DistributeLayoutAttr layout,
219 const uArch::uArch *uArch);
220
221// Check if dst shape is an expansion of src shape by inserting unit dimensions.
223 SmallVector<int64_t> &expandedUnitDims);
224
225// Checks if dst shape is an expansion of src shape where each dimension in src
226// is split into one or more consecutive dimensions in dst
228 SmallVector<SmallVector<int64_t>> &splitDimGroups);
229
230/// Callback type for computing sub-shape and count for 1:N (or 1:1
231/// shape-changing) VectorType conversion. Given a VectorType and its
232/// DistributeLayoutAttr, returns (subShape, count). A count <= 0 signals
233/// "no conversion needed"; count == 1 is a 1:1 shape-changing conversion;
234/// count > 1 produces `count` copies of `subShape`.
235using SubShapeAndCountFn = std::function<std::pair<SmallVector<int64_t>, int>(
236 VectorType, DistributeLayoutAttr)>;
237
238/// Pre-computes distributed VectorType mappings for every value carried
239/// through an SCF loop under `topLevelOp` (1:1 shape-changing or 1:N): the
240/// region block args (`scf.while` before/after args, `scf.for` iter_args), the
241/// loop results, and the terminator operands feeding them. Each is derived from
242/// a single source -- the layout of the feeding value (loop init or
243/// `scf.condition` operand) -- and keyed by `Value`, because the SCF converters
244/// detach/replace the loop body mid-conversion, after which a layout query on a
245/// block arg returns null. Recording results and terminator operands lets a 1:N
246/// pass resolve them from the map after stripping the loop op's transient
247/// layout attrs. `scf.if` has no loop-carried block args and needs no entry.
250 SubShapeAndCountFn getSubShapeAndCount);
251
252/// Adds a context-aware VectorType conversion to `converter` (1:1
253/// shape-changing or 1:N, depending on `getSubShapeAndCount`'s returned
254/// count). `getSubShapeAndCount` computes (subShape, count) for a VectorType
255/// and its layout; count <= 0 means no conversion needed. `loopArgTypes`
256/// (typically obtained from `precomputeLoopBlockArgTypes`) provides the
257/// pre-computed types for SCF loop block arguments (`scf.while`,
258/// `scf.for`); pass an empty map if the IR has no such loops.
260 SubShapeAndCountFn getSubShapeAndCount,
261 DenseMap<Value, SmallVector<Type>> loopArgTypes);
262
263/// Cleans up UnrealizedConversionCastOps inserted during SCF structural type
264/// conversion and/or XeGPU unrolling. Folds cancelling N:1->1:N and 1:N->N:1
265/// cast chains (inserting vector.shape_cast when shapes differ but element
266/// counts match). Unpaired pack (1:N) and unpack (N:1) casts between a single
267/// large VectorType and N identically-typed smaller VectorTypes are lowered
268/// to vector.extract_strided_slice / vector.insert_strided_slice. Dead casts
269/// are erased. Casts in `existingCasts` are preserved.
271 Operation *root,
272 const llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts);
273
274// Checks if dst shape is a collapse of src shape where each dimension in dst is
275// produced by one or more consecutive dimensions in src whose product equals
276// the dst dimension. Populates collapseDims with groups of src indices that are
277// collapsed into each dst dimension. Leading or trailing unit dst dimensions
278// (with no backing src dim) result in empty groups. Example: src=[8,16,32],
279// dst=[1,4096] -> true, collapseDims=[[],[0,1,2]].
281 SmallVector<SmallVector<int64_t>> &collapseDims);
282
283} // namespace xegpu
284
285} // namespace mlir
286
287#endif // MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
lhs
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
This class helps build Operations.
Definition Builders.h:209
This class represents a single result from folding an operation.
This class represents an operand of an operation.
Definition Value.h:254
This is a value defined by a result of an operation.
Definition Value.h:454
Operation is the basic unit of execution within MLIR.
Definition Operation.h:87
A special type of RewriterBase that coordinates the application of a rewrite pattern on the current I...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
bool matchDimCollapse(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &collapseDims)
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of shape from a set of values using vector.insert_strided_slice.
bool requirePacked(const DistributeLayoutAttr layout)
Helper function to check if the layout is packed.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
Value subgroupReduction(Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size)
Given an input value representing per-lane data, this function returns the result after performing a ...
bool matchUnitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< int64_t > &expandedUnitDims)
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
FailureOr< VectorType > getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout, VectorType originalType)
Helper function to get distributed vector type for a source vector type according to the lane_layout.
Value lowerToVectorReductions(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, Location loc, PatternRewriter &rewriter)
Given the src and acc arguments from a vector::MultiDimReductionOp, lower to a set of vector::Reduc...
bool requireTranspose(const DistributeLayoutAttr layout, const uArch::uArch *uArch)
Helper function to check if the layout requires a transpose effect.
bool matchSplitDimExpansion(ArrayRef< int64_t > src, ArrayRef< int64_t > dst, SmallVector< SmallVector< int64_t > > &splitDimGroups)
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
DenseMap< Value, SmallVector< Type > > precomputeLoopBlockArgTypes(Operation *topLevelOp, SubShapeAndCountFn getSubShapeAndCount)
Pre-computes distributed VectorType mappings for every value carried through an SCF loop under topLev...
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
void addVectorTypeConversion(TypeConverter &converter, SubShapeAndCountFn getSubShapeAndCount, DenseMap< Value, SmallVector< Type > > loopArgTypes)
Adds a context-aware VectorType conversion to converter (1:1 shape-changing or 1:N,...
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
Value lowerCrossLaneReductionToShuffles(TypedValue< VectorType > src, TypedValue< VectorType > acc, vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize, Location loc, PatternRewriter &rewriter)
Lowers cross-lane reductions to shuffle operations on a 2D vector.
std::function< std::pair< SmallVector< int64_t >, int >( VectorType, DistributeLayoutAttr)> SubShapeAndCountFn
Callback type for computing sub-shape and count for 1:N (or 1:1 shape-changing) VectorType conversion...
Definition XeGPUUtils.h:235
void cleanupUnrealizedConversionCasts(Operation *root, const llvm::SmallSetVector< UnrealizedConversionCastOp, 8 > &existingCasts)
Cleans up UnrealizedConversionCastOps inserted during SCF structural type conversion and/or XeGPU unr...
SmallVector< Value > flattenValues(ArrayRef< ValueRange > values)
Flatten a set of ValueRange into a single SmallVector<Value>
SmallVector< OpFoldResult > addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with automatic alignment.
SmallVector< OpFoldResult > addElementwise(OpBuilder &builder, Location loc, ArrayRef< OpFoldResult > lhs, ArrayRef< OpFoldResult > rhs)
Generates element-wise addition ops of two arrays with same length.
FailureOr< VectorType > getDistributedVectorType(xegpu::TensorDescType tdescTy)
If tensor descriptor has a layout attribute it is used in SIMT mode.
Include the generated interface declarations.
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
llvm::DenseMap< KeyT, ValueT, KeyInfoT, BucketT > DenseMap
Definition LLVM.h:120