21#include "llvm/ADT/STLExtras.h"
22#include "llvm/Support/DebugLog.h"
26#define GEN_PASS_DEF_XEGPUBLOCKING
27#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
31#define DEBUG_TYPE "xegpu-blocking"
43resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
47 auto hasIdenticalVectorTypes = [](
ValueRange values) {
48 auto types = values.getTypes();
49 return llvm::all_of(types, [&](
Type type) {
50 return isa<VectorType>(type) && type == types.front();
56 if (!hasIdenticalVectorTypes(inputs) || !hasIdenticalVectorTypes(outputs)) {
57 LDBG() <<
"skip unrealized conversion cast op not emulating pack/unpack.";
61 VectorType outputTy = dyn_cast<VectorType>(outputs[0].
getType());
63 if (inputs.size() > 1 && outputs.size() == 1) {
67 builder, castOp.getLoc(), inputs,
shape);
70 }
else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
74 builder, castOp.getLoc(), inputs[0], tileShape);
75 castOp->replaceAllUsesWith(results);
88class XeGPUBlockingPass final
91 void runOnOperation()
override;
98 typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
99 std::is_same_v<T, OpResult>>>
100 std::optional<SmallVector<int64_t>>
113template <
typename T,
typename>
114std::optional<SmallVector<int64_t>>
115XeGPUBlockingPass::getTileShape(
const T &operandOrResult)
const {
117 if constexpr (std::is_same_v<T, OpOperand>) {
118 value = operandOrResult.get();
120 value = (Value)operandOrResult;
123 xegpu::DistributeLayoutAttr layout =
125 if (layout && layout.isForSubgroup()) {
126 if (!layout.getEffectiveInstDataAsInt().empty()) {
127 SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
130 if (
auto type = dyn_cast<ShapedType>(value.
getType()))
131 return llvm::to_vector(type.getShape());
133 LDBG() <<
"failed to getTileShape for: " << value;
137std::optional<SmallVector<int64_t>>
138XeGPUBlockingPass::getTileShape(Operation *op)
const {
139 if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
140 xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
142 if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
143 xegpu::StoreMatrixOp>(op))
145 if (isa<xegpu::StoreNdOp>(op))
149 if (
auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
150 if (loadGatherOp.getOffsets())
156 if (
auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
158 convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt();
159 auto targetInstData =
160 convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt();
163 return inputInstData;
165 return targetInstData;
168 if (
auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
170 ? storeScatterOp->getOpOperand(0)
171 : storeScatterOp->getOpOperand(1));
173 if (isa<xegpu::DpasOp>(op)) {
174 std::optional<SmallVector<int64_t>> aTile =
176 std::optional<SmallVector<int64_t>> bTile =
179 if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
183 if ((*aTile)[1] != (*bTile)[0])
188 std::optional<SmallVector<int64_t>> cTile =
190 int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
191 if (!cTile || !llvm::equal(*cTile, expectedCTile))
195 return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
201 if (isa<vector::MultiDimReductionOp>(op))
204 if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
205 vector::ShapeCastOp, vector::ConstantMaskOp, vector::CreateMaskOp>(
212bool XeGPUBlockingPass::needsUnroll(Operation *op)
const {
214 bool hasWgLayoutOperands =
216 xegpu::DistributeLayoutAttr layout =
217 xegpu::getDistributeLayoutAttr(opr);
218 return layout && layout.isForWorkgroup();
220 bool hasWgLayoutResults =
222 xegpu::DistributeLayoutAttr layout =
223 xegpu::getDistributeLayoutAttr(result);
224 return layout && layout.isForWorkgroup();
226 if (hasWgLayoutOperands || hasWgLayoutResults) {
227 LDBG() <<
"skip unrolling for op with workgroup level layout: " << *op;
231 auto isUnrollable = [](Value value, ArrayRef<int64_t> tileShape) {
233 if (
auto tdescTy = dyn_cast<xegpu::TensorDescType>(valTy)) {
234 xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
235 return layout && !layout.getEffectiveInstDataAsInt().empty();
237 auto shapedType = dyn_cast<ShapedType>(valTy);
238 return shapedType && !llvm::equal(tileShape, shapedType.getShape());
241 bool hasUnrollableOperands =
243 std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
244 return tileShape.has_value() && isUnrollable(opr.get(), *tileShape);
246 bool hasUnrollableResults =
248 std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
249 return tileShape.has_value() && isUnrollable(result, *tileShape);
252 bool isConvertLayoutWithInstData =
false;
253 if (
auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
254 auto targettLayout = convertLayoutOp.getTargetLayout();
255 if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) {
256 isConvertLayoutWithInstData =
true;
259 return hasUnrollableOperands || hasUnrollableResults ||
260 isConvertLayoutWithInstData;
263void XeGPUBlockingPass::runOnOperation() {
265 Operation *op = getOperation();
272 auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
273 xegpu::LayoutAttr layout) {
275 SmallVector<int64_t> tileShape(shape);
276 if (layout && layout.getInstData()) {
278 tileShape = llvm::to_vector_of<int64_t>(instData.
asArrayRef());
281 return std::make_pair(tileShape, count);
285 TypeConverter converter;
286 converter.addConversion([](Type type) -> Type {
return type; });
287 converter.addConversion(
288 [&](RankedTensorType type,
289 SmallVectorImpl<Type> &
result) -> std::optional<LogicalResult> {
290 Type elemTy = type.getElementType();
291 ArrayRef<int64_t> shape = type.getShape();
294 llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding());
295 if (layout && layout.isForWorkgroup())
299 SmallVector<int64_t> subShape;
300 std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
301 auto newTy = VectorType::get(subShape, elemTy);
302 result.append(count, newTy);
305 converter.addConversion(
306 [&](xegpu::TensorDescType type,
307 SmallVectorImpl<Type> &
result) -> std::optional<LogicalResult> {
308 Type elemTy = type.getElementType();
309 ArrayRef<int64_t> shape = type.getShape();
311 xegpu::LayoutAttr layout = type.getLayoutAttr();
312 if (layout && layout.isForWorkgroup())
316 SmallVector<int64_t> subShape;
317 std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
320 layout = layout.dropInstData();
322 auto newTy = xegpu::TensorDescType::get(
323 type.getContext(), subShape, elemTy, type.getEncoding(), layout);
324 result.append(count, newTy);
332 [&](Operation *op) -> LogicalResult {
return success(needsUnroll(op)); });
336 options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape,
337 bool returnSingleType =
false) {
338 Type elemTy = type.getElementType();
341 if (
auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
343 Attribute encoding = tdescTy.getEncoding();
346 if (tdescTy.isScattered()) {
347 int64_t chunkSize = tdescTy.getChunkSizeAsInt();
350 int64_t blockedChunkSize = chunkSize;
351 auto instData = tdescTy.getLayoutAttr().getInstData();
352 if (!instData.empty())
353 blockedChunkSize = instData.asArrayRef().back();
356 auto newEncoding = xegpu::ScatterTensorDescAttr::get(
357 ctx, tdescTy.getMemorySpace(), blockedChunkSize);
358 encoding = newEncoding;
363 xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
364 tdescTy.getLayoutAttr().dropInstData());
366 newTy = VectorType::get(tileShape, elemTy);
369 if (returnSingleType)
370 return SmallVector<Type>{newTy};
371 std::optional<SmallVector<int64_t>> ratio =
373 assert(ratio &&
"The shape of the type must be a multiple of tileShape.");
377 RewritePatternSet patterns(ctx);
378 vector::UnrollVectorOptions vectorOptions;
382 vector::populateVectorUnrollPatterns(patterns, vectorOptions);
386 op->
walk([](Operation *op) {
397 if (
auto layout = op->
getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
399 if (!isa<LoopLikeOpInterface>(op))
405 if (
auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
406 resolveUnrealizedConversionCastOp(castOp);
static std::array< int64_t, 2 > getTileShape(ArrayRef< int64_t > operandShape, Type elementType, int64_t lineSizeBits)
Returns the number of 8 x [128|256|512] bit tiles that compose the given operand shape.
static llvm::ManagedStatic< PassManagerOptions > options
This class helps build Operations.
Operation is the basic unit of execution within MLIR.
OpResult getOpResult(unsigned idx)
AttrClass getAttrOfType(StringAttr name)
bool hasAttrOfType(NameT &&name)
MutableArrayRef< OpOperand > getOpOperands()
unsigned getNumOperands()
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getOpResults()
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
OpOperand & getOpOperand(unsigned idx)
unsigned getNumResults()
Return the number of results held by this operation.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
ArrayRef< T > asArrayRef() const
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of the given shape from a set of values using vector.insert_strided_slice.
void populateXeGPUUnrollPatterns(RewritePatternSet &patterns, const UnrollOptions &options)
Collect a set of patterns to unroll xegpu operations to a smaller shapes.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult; users should use setAnchorLayout...
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter)
Do type conversion for SCF structural ops, e.g., scf.for using SCF structure type conversion patterns...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explanatory.
detail::DenseArrayAttrImpl< int32_t > DenseI32ArrayAttr
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
UnrollVectorOptions & setNativeShapeFn(NativeShapeFnType fn)