21#include "llvm/ADT/STLExtras.h"
22#include "llvm/Support/DebugLog.h"
26#define GEN_PASS_DEF_XEGPUBLOCKING
27#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
31#define DEBUG_TYPE "xegpu-blocking"
43resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
47 auto hasIdenticalVectorTypes = [](
ValueRange values) {
48 auto types = values.getTypes();
49 return llvm::all_of(types, [&](
Type type) {
50 return isa<VectorType>(type) && type == types.front();
56 if (!hasIdenticalVectorTypes(inputs) || !hasIdenticalVectorTypes(outputs)) {
57 LDBG() <<
"skip unrealized conversion cast op not emulating pack/unpack.";
61 VectorType outputTy = dyn_cast<VectorType>(outputs[0].
getType());
63 if (inputs.size() > 1 && outputs.size() == 1) {
67 builder, castOp.getLoc(), inputs,
shape);
70 }
else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
74 builder, castOp.getLoc(), inputs[0], tileShape);
75 castOp->replaceAllUsesWith(results);
88class XeGPUBlockingPass final
89 :
public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
91 void runOnOperation()
override;
98 typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
99 std::is_same_v<T, OpResult>>>
100 std::optional<SmallVector<int64_t>>
113template <
typename T,
typename>
114std::optional<SmallVector<int64_t>>
115XeGPUBlockingPass::getTileShape(
const T &operandOrResult)
const {
117 if constexpr (std::is_same_v<T, OpOperand>) {
118 value = operandOrResult.get();
120 value = (Value)operandOrResult;
123 xegpu::DistributeLayoutAttr layout =
125 if (layout && layout.isForSubgroup()) {
126 if (!layout.getEffectiveInstDataAsInt().empty()) {
127 SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
130 if (
auto type = dyn_cast<ShapedType>(value.
getType()))
131 return llvm::to_vector(type.getShape());
133 LDBG() <<
"failed to getTileShape for: " << value;
137std::optional<SmallVector<int64_t>>
138XeGPUBlockingPass::getTileShape(Operation *op)
const {
139 if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::LoadMatrixOp>(
142 if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
143 xegpu::StoreMatrixOp>(op))
145 if (isa<xegpu::StoreNdOp>(op))
148 if (isa<xegpu::LoadGatherOp>(op))
151 if (
auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
153 convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt();
154 auto targetInstData =
155 convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt();
158 return inputInstData;
160 return targetInstData;
163 if (isa<xegpu::StoreScatterOp>(op))
166 if (isa<xegpu::DpasOp>(op)) {
167 std::optional<SmallVector<int64_t>> aTile =
169 std::optional<SmallVector<int64_t>> bTile =
172 if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
176 if ((*aTile)[1] != (*bTile)[0])
181 std::optional<SmallVector<int64_t>> cTile =
183 int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
184 if (!cTile || !llvm::equal(*cTile, expectedCTile))
188 return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
194 if (isa<vector::MultiDimReductionOp>(op))
197 if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
198 vector::ShapeCastOp, vector::ConstantMaskOp, vector::CreateMaskOp>(
205bool XeGPUBlockingPass::needsUnroll(Operation *op)
const {
207 bool hasWgLayoutOperands =
209 xegpu::DistributeLayoutAttr layout =
210 xegpu::getDistributeLayoutAttr(opr);
211 return layout && layout.isForWorkgroup();
213 bool hasWgLayoutResults =
215 xegpu::DistributeLayoutAttr layout =
216 xegpu::getDistributeLayoutAttr(result);
217 return layout && layout.isForWorkgroup();
219 if (hasWgLayoutOperands || hasWgLayoutResults) {
220 LDBG() <<
"skip unrolling for op with workgroup level layout: " << *op;
224 auto isUnrollable = [](Value value, ArrayRef<int64_t> tileShape) {
226 if (
auto tdescTy = dyn_cast<xegpu::TensorDescType>(valTy)) {
227 xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
228 return layout && !layout.getEffectiveInstDataAsInt().empty();
230 auto shapedType = dyn_cast<ShapedType>(valTy);
231 return shapedType && !llvm::equal(tileShape, shapedType.getShape());
234 bool hasUnrollableOperands =
236 std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
237 return tileShape.has_value() && isUnrollable(opr.get(), *tileShape);
239 bool hasUnrollableResults =
241 std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
242 return tileShape.has_value() && isUnrollable(result, *tileShape);
245 bool isConvertLayoutWithInstData =
false;
246 if (
auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
247 auto targettLayout = convertLayoutOp.getTargetLayout();
248 if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) {
249 isConvertLayoutWithInstData =
true;
252 return hasUnrollableOperands || hasUnrollableResults ||
253 isConvertLayoutWithInstData;
256void XeGPUBlockingPass::runOnOperation() {
258 Operation *op = getOperation();
265 auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
266 xegpu::DistributeLayoutAttr layout) {
268 SmallVector<int64_t> tileShape(shape);
269 if (layout && !layout.getEffectiveInstDataAsInt().empty()) {
270 tileShape = layout.getEffectiveInstDataAsInt();
273 return std::make_pair(tileShape, count);
277 TypeConverter converter;
278 converter.addConversion([](Type type) -> Type {
return type; });
279 converter.addConversion(
280 [&](RankedTensorType type,
281 SmallVectorImpl<Type> &
result) -> std::optional<LogicalResult> {
282 Type elemTy = type.getElementType();
283 ArrayRef<int64_t> shape = type.getShape();
286 llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding());
287 if (layout && layout.isForWorkgroup())
291 SmallVector<int64_t> subShape;
292 std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
293 auto newTy = VectorType::get(subShape, elemTy);
294 result.append(count, newTy);
297 converter.addConversion(
298 [&](xegpu::TensorDescType type,
299 SmallVectorImpl<Type> &
result) -> std::optional<LogicalResult> {
300 Type elemTy = type.getElementType();
301 ArrayRef<int64_t> shape = type.getShape();
303 xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
304 if (layout && layout.isForWorkgroup())
308 SmallVector<int64_t> subShape;
309 std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
312 layout = layout.dropInstData();
314 auto newTy = xegpu::TensorDescType::get(
315 type.getContext(), subShape, elemTy, type.getEncoding(), layout);
316 result.append(count, newTy);
324 [&](Operation *op) -> LogicalResult {
return success(needsUnroll(op)); });
328 options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape,
329 bool returnSingleType =
false) {
330 Type elemTy = type.getElementType();
333 if (
auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
335 Attribute encoding = tdescTy.getEncoding();
338 xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
339 tdescTy.getLayoutAttr().dropInstData());
341 newTy = VectorType::get(tileShape, elemTy);
344 if (returnSingleType)
345 return SmallVector<Type>{newTy};
346 std::optional<SmallVector<int64_t>> ratio =
348 assert(ratio &&
"The shape of the type must be a multiple of tileShape.");
352 RewritePatternSet patterns(ctx);
353 vector::UnrollVectorOptions vectorOptions;
357 vector::populateVectorUnrollPatterns(patterns, vectorOptions);
366 op->
walk([](Operation *op) {
377 if (
auto layout = op->
getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
379 if (!isa<LoopLikeOpInterface>(op))
385 if (
auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
386 resolveUnrealizedConversionCastOp(castOp);
391 RewritePatternSet emptyPatterns(ctx);
static std::array< int64_t, 2 > getTileShape(ArrayRef< int64_t > operandShape, Type elementType, int64_t lineSizeBits)
Returns the number of 8 x [128|256|512] bit tiles that compose the given operand shape.
static llvm::ManagedStatic< PassManagerOptions > options
This class helps build Operations.
Operation is the basic unit of execution within MLIR.
OpResult getOpResult(unsigned idx)
AttrClass getAttrOfType(StringAttr name)
bool hasAttrOfType(NameT &&name)
MutableArrayRef< OpOperand > getOpOperands()
unsigned getNumOperands()
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
result_range getOpResults()
Attribute removeAttr(StringAttr name)
Remove the attribute with the specified name if it exists.
OpOperand & getOpOperand(unsigned idx)
unsigned getNumResults()
Return the number of results held by this operation.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Type getType() const
Return the type of this value.
bool hasElementwiseMappableTraits(Operation *op)
Together, Elementwise, Scalarizable, Vectorizable, and Tensorizable provide an easy way for scalar op...
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, ValueRange values, ArrayRef< int64_t > shape)
Create a vector of the given shape from a set of values using vector.insert_strided_slice.
void populateXeGPUUnrollPatterns(RewritePatternSet &patterns, const UnrollOptions &options)
Collect a set of patterns to unroll xegpu operations to smaller shapes.
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult; users should use setAnchorLayout...
bool recoverTemporaryLayouts(Operation *rootOp)
Attach layout attributes to all vector-type operands of operations within the given operation's neste...
void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter)
Do type conversion for SCF structural ops, e.g., scf.for, using SCF structural type conversion patterns...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::string getTemporaryLayoutName(const OpOperand &operand)
Return the attribute name for the OpOperand to attach DistributeLayoutAttr.
SmallVector< Value > extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef< int64_t > shape)
Extract a set of small vectors from a value with a given shape using vector.extract_strided_slice.
Include the generated interface declarations.
Type getType(OpFoldResult ofr)
Returns the int type of the integer in ofr.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explanatory.
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
UnrollVectorOptions & setNativeShapeFn(NativeShapeFnType fn)