MLIR 23.0.0git
XeGPUPeepHoleOptimizer.cpp
Go to the documentation of this file.
1//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
25#include "mlir/IR/Types.h"
26#include "mlir/IR/Value.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include <optional>
32
33namespace mlir {
34namespace xegpu {
35#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
36#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
37} // namespace xegpu
38} // namespace mlir
39
40#define DEBUG_TYPE "xegpu-optimize-peephole"
41#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
42
43using namespace mlir;
44
45namespace {
46
47/// Get the 2D lane data from a tensor desc type if it exists.
48static std::optional<SmallVector<int64_t>>
49getMaybeLaneData(xegpu::TensorDescType tdescType) {
50 auto layout = tdescType.getLayoutAttr();
51 if (!layout)
52 return std::nullopt;
53 auto laneData = layout.getEffectiveLaneDataAsInt();
54 if (laneData.size() != 2)
55 return std::nullopt;
56 return laneData;
57}
58
59/// Get the 2D lane layout from a tensor desc type if it exists.
60static std::optional<SmallVector<int64_t>>
61getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
62 auto layout = tdescType.getLayoutAttr();
63 if (!layout)
64 return std::nullopt;
65 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
66 if (laneLayout.size() != 2)
67 return std::nullopt;
68 return laneLayout;
69}
70
71/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
72/// lane[1] == 1), but inner lane data is not equal to [1, 1].
73/// Example:
74/// !xegpu.tensor_desc<16x16xf16,
75/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
76/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
77/// indicating that this is a load that requires transpose effect. However,
78/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
79/// the inner dimension. We convert this to a optimized form by converting the
80/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
81/// later lowering easily use the load with transpose instruction.
82static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
83 ArrayRef<int64_t> laneData) {
84 if (laneLayout.size() != 2 || laneData.size() != 2)
85 return false;
86 if (laneLayout[0] == 1 || laneLayout[1] != 1)
87 return false;
88 if (laneData[0] != 1 || laneData[1] == 1)
89 return false;
90 return true;
91}
92
93/// A tensor desc type can be optimized if its element type is less than 32 bits
94/// and its layout can be optimized.
95static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
96 // If the dtype is greater or equal to 32 bits, layout must be valid.
97 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
98 if (elementTyBitwidth >= 32)
99 return false;
100 auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
101 auto maybeLaneData = getMaybeLaneData(tdescType);
102 if (!maybeLaneData || !maybeLaneLayout)
103 return false;
104 return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
105}
106
107/// Check if a tensor desc type can be optimized for transpose, if so return the
108/// new optimized tensor desc type with a valid transpose layout.
109static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
110 const uArch *targetuArch) {
111 if (!canBeOptimizedForTranspose(tdescType))
112 return tdescType;
113 auto laneData = getMaybeLaneData(tdescType)
114 .value(); // Lane data must exist if we reach here.
115 int64_t innerLaneData = laneData[1];
116 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
117 // Required shape is total shape of the vector result that this tensor desc
118 // must eventually load after adjusting for the new bitwidth and array
119 // length.
120 SmallVector<int64_t> requiredShape(tdescType.getShape());
121 requiredShape.back() =
122 requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
123 int newBitWidth = elementTyBitwidth * innerLaneData;
124 Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
125 // Supported shape is the max transpose shape that can be supported by
126 // hardware that is less than or equal to required shape.
127 auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
128 targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
129 auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
130 newElemTy, /** has transform */ false, /** has transpose */ true);
131 // If no HW params found, return the original type.
132 if (!maybeHWParams)
133 return tdescType;
134 auto [widths, heights, counts] = maybeHWParams.value();
135 // TODO: Currently we expect array length to be 1 for transpose case.
136 if (counts.size() != 1 || counts[0] != 1)
137 return tdescType;
138 int arrayLen = counts[0];
139 int supportedHeight =
140 xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
141 int supportedWidth =
142 xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
143 // If no supported height or width found, return the original type.
144 if (supportedHeight == -1 || supportedWidth == -1)
145 return tdescType;
146
147 SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
148 auto ctx = tdescType.getContext();
149 auto origLayout = tdescType.getLayoutAttr();
150 auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
151 SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
152 laneLayoutI64.end());
153
154 xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
155 ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
156 /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
157 /*order=*/origLayout.getOrder());
158
159 // Array length can not be larger than 1 for transpose case.
160 return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
161 tdescType.getBoundaryCheck(),
162 tdescType.getMemorySpace(), newLayout);
163}
164
165/// Helper to convert an OpFoldResult to Value.
166static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
167 OpFoldResult ofr) {
168 std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
169 if (mayBeInt)
170 return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
171 return llvm::cast<Value>(ofr);
172}
173
174/// Helper to divide a Value by a constant integer.
175static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
176 Value val, int64_t constant) {
177 // If the constant is a power of 2, use right shift for division.
178 if (llvm::isPowerOf2_64(constant)) {
179 int64_t shiftAmount = llvm::Log2_64(constant);
180 return arith::ShRUIOp::create(
181 rewriter, loc, val,
182 arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
183 .getResult())
184 .getResult();
185 }
186 auto constantOp =
187 arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
188 return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
189}
190
/// This function takes a larger register block `data` and generates multiple
/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
/// starting from `offsets`.
static Value generateLoads(ConversionPatternRewriter &rewriter,
                           // NOTE(review): the parameter lines declaring
                           // `data`, `offsets` and `newTensorDesc` appear to
                           // be elided in this view of the source -- confirm
                           // against the full file.
                           xegpu::LoadNdOp origLoadOp) {
  Location loc = data.getLoc();
  assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
  // The trailing two offsets select the 2D position within the source.
  Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
  Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
  SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
  // Compute the ratio between original shape and supported shape. We need to
  // generate loads in this ratio arrangement.
  auto shapeRatio = computeShapeRatio(data.getType().getShape(),
                                      supportedShape)
                        .value(); // `ratio` must be defined if we reach here.
  // Emit one load per (h, w) tile of the supported shape and stitch each
  // result back into `data` at its tile-local position.
  for (int64_t h = 0; h < shapeRatio[0]; ++h) {
    for (int64_t w = 0; w < shapeRatio[1]; ++w) {
      int64_t localOffsetDim0 = h * supportedShape[0];
      int64_t localOffsetDim1 = w * supportedShape[1];
      // Absolute load offset = caller-provided offset + tile-local offset.
      Value loadOffsetX = arith::AddIOp::create(
          rewriter, loc, offsetDim0,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
              .getResult());
      Value loadOffsetY = arith::AddIOp::create(
          rewriter, loc, offsetDim1,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
              .getResult());
      // Load one HW-supported tile, carrying over the original load's
      // packed/transpose/cache-hint attributes.
      auto loadOp = xegpu::LoadNdOp::create(
          rewriter, loc,
          VectorType::get(supportedShape, data.getType().getElementType()),
          newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
          origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
          origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
          origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
      // Set the layout for the loadOp.
      auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
      loadOp.setAnchorLayout(layoutAttr);
      // Insert the loaded block into the right position in data.
      auto insertOp = vector::InsertStridedSliceOp::create(
          rewriter, loc, loadOp.getResult(), data,
          ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
          ArrayRef<int64_t>{1, 1});
      // InsertOp must have the same layout as newTensorDesc.
      xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
      data = insertOp.getResult();
    }
  }
  return data;
}
243
244/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
245/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
246/// the base pointer from the original memory source and adjusting the shape and
247/// strides of the tensor desc to fit with the new optimized transpose layout.
248class XeGPUCreateNdDescOpPattern final
249 : public OpConversionPattern<xegpu::CreateNdDescOp> {
250public:
251 using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
252 LogicalResult
253 matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
254 ConversionPatternRewriter &rewriter) const override {
255 auto tdescTy = createNdOp.getType();
256 // Get the target uArch info.
257 auto chipStr = xegpu::getChipStr(createNdOp);
258 // Check if the chip is supported.
259 assert(
260 chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
261 "Expecting target chip to be pvc or bmg for transpose optimization.");
262 const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
263
264 auto convertType = tryOptimize(tdescTy, targetuArch);
265 if (convertType == tdescTy)
266 return failure();
267 auto strides = createNdOp.getMixedStrides();
268 auto maybeConstInnerStride = getConstantIntValue(strides.back());
269 // Only row-major memrefs are expected for now.
270 if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
271 return rewriter.notifyMatchFailure(
272 createNdOp, "Expecting row-major memref for transpose optimization.");
273 Value source = createNdOp.getSource();
274 auto optionalLaneData = getMaybeLaneData(tdescTy);
275 assert(optionalLaneData && "Expected 2D lane data");
276 auto laneData = optionalLaneData.value();
277 int64_t innerLaneData = laneData[1];
278 auto memrefType = dyn_cast<MemRefType>(source.getType());
279 // Inner dimension of the shape must be adjusted based on innerLaneData.
280 SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
281 modifiedShape.back() = divideByConstant(
282 rewriter, createNdOp.getLoc(),
283 convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
284 innerLaneData);
285 // Similarly, second to last stride must be adjusted.
286 assert(strides.size() >= 2 &&
287 "Expected at least 2 strides for CreateNdDescOp");
288 SmallVector<OpFoldResult> modifiedStrides(strides);
289 modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
290 rewriter, createNdOp.getLoc(),
291 convertToValue(rewriter, createNdOp.getLoc(),
292 modifiedStrides[modifiedStrides.size() - 2]),
293 innerLaneData);
294
295 // If the source is a static memref, we need to extract the pointer to
296 // base address.
297 if (memrefType && memrefType.hasStaticShape()) {
298 auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
299 rewriter, createNdOp.getLoc(), source);
300 source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
301 rewriter.getI64Type(),
302 extractOp.getResult())
303 .getResult();
304 }
305 // Create a new CreateNdDescOp with the modified shape and converted type.
306 auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
307 rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
308 modifiedStrides);
309 rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
310 return success();
311 }
312};
313
314/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
315/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the
316/// adjusted tensor desc type. This can result in multiple LoadNdOps being
317/// generated to fill in the original load shape.
318class XeGPULoadNdDescOpPattern final
319 : public OpConversionPattern<xegpu::LoadNdOp> {
320public:
321 using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
322 LogicalResult
323 matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
324 ConversionPatternRewriter &rewriter) const override {
325 auto origTensorDescType = loadNdOp.getTensorDescType();
326 auto adaptorType =
327 cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
328 if (adaptorType == origTensorDescType)
329 return failure();
330 // Offsets must be adjusted based on innerLaneData.
331 auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
332 int64_t innerLaneData = laneData[1];
333 auto offsets = loadNdOp.getMixedOffsets();
334 if (offsets.empty())
335 return rewriter.notifyMatchFailure(loadNdOp,
336 "Expecting offsets in LoadNd");
337 SmallVector<OpFoldResult> modifiedOffsets(offsets);
338 modifiedOffsets.back() = divideByConstant(
339 rewriter, loadNdOp.getLoc(),
340 convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
341 innerLaneData);
342 // Get the 2D data shape of this loadNdOp in its original type including
343 // array length.
344 SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
345 // Adjust the data shape based on innerLaneData.
346 origDataShape.back() /= innerLaneData;
347 // HW supported shape is the new tensor desc shape after conversion.
348 SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
349 VectorType origVectorType =
350 VectorType::get(origDataShape, adaptorType.getElementType());
351 Value data;
352 // Orig data shape is 3D for the array length case.
353 if (origTensorDescType.getArrayLength() > 1) {
354 SmallVector<Value> arraySlices;
355 for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
356 Value slice = arith::ConstantOp::create(
357 rewriter, loadNdOp->getLoc(), origVectorType,
358 rewriter.getZeroAttr(origVectorType));
359 // Increase the Y offset for each array slice.
360 Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
361 modifiedOffsets.back());
362 modifiedOffsets.back() =
363 arith::AddIOp::create(
364 rewriter, loadNdOp->getLoc(), offsetY,
365 arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
366 i * origDataShape[1])
367 .getResult())
368 .getResult();
369 slice = generateLoads(
370 rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
371 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
372 loadNdOp);
373 // BitCast back to original load shape without array length.
374 auto bitcastType = VectorType::get(origTensorDescType.getShape(),
375 origTensorDescType.getElementType());
376 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
377 bitcastType, slice);
378 // BitCastOp must have the same layout as the original loadNdOp.
379 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
380 origTensorDescType.getLayoutAttr());
381 arraySlices.push_back(bitCastOp.getResult());
382 }
383 rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
384 return success();
385 }
386 data = arith::ConstantOp::create(
387 rewriter, loadNdOp->getLoc(),
388 VectorType::get(origDataShape, adaptorType.getElementType()),
389 rewriter.getZeroAttr(origVectorType));
390 data = generateLoads(
391 rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
392 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
393 loadNdOp);
394 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
395 loadNdOp.getType(), data);
396 // BitCastOp must have the same layout as the original loadNdOp.
397 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
398 origTensorDescType.getLayoutAttr());
399 rewriter.replaceOp(loadNdOp, bitCastOp);
400 return success();
401 }
402};
403
404/// Vector ExtractOp must be processed if the original tensor desc type has
405/// array length greater than 1. In this case, the LoadNdOp is replaced with
406/// multiple LoadNdOps for each array slice making the extraction unnecessary.
407/// In this case, we simply remove the ExtractOp.
408class VectorExtractOpPattern final
409 : public OpConversionPattern<vector::ExtractOp> {
410public:
411 using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
412 LogicalResult
413 matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
414 ConversionPatternRewriter &rewriter) const override {
415 // Check if the source of the extraction is split to multiple values.
416 if (adaptor.getSource().size() == 1)
417 return failure();
418 auto mixedPos = extractOp.getMixedPosition();
419 if (mixedPos.size() != 1)
420 return failure();
421 auto mayBeInt = getConstantIntValue(mixedPos[0]);
422 if (!mayBeInt)
423 return failure();
424 rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
425 return success();
426 }
427};
428
429/// Performs a reduction over 2 dimensions by decomposing it into two 1D
430/// reductions ordered based on layout to minimize cross-lane communication.
431class MultiRed2dOpPattern
432 : public OpConversionPattern<vector::MultiDimReductionOp> {
433 using OpConversionPattern::OpConversionPattern;
434 LogicalResult
435 matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
436 ConversionPatternRewriter &rewriter) const override {
437 auto sourceVecType = reductionOp.getSourceVectorType();
438 if (reductionOp.getReductionDims().size() != 2)
439 return rewriter.notifyMatchFailure(reductionOp, "Expected 2D reduction");
440 auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
441 // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
442 auto dims = llvm::to_vector(reductionOp.getReductionDims());
443 auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
444 // Order does not matter
445 if (intraLaneDim == -1 || crossLaneDim == -1) {
446 intraLaneDim = dims[0];
447 crossLaneDim = dims[1];
448 }
449 auto loc = reductionOp.getLoc();
450 auto acc = reductionOp.getAcc();
451
452 // If the result is scalar after reduction, look for consumer
453 // convert_layout op and remove it. The layout propagation pass will
454 // re-install it properly after the decomposition.
455 Type resultType = reductionOp.getResult().getType();
456 if (resultType.isIntOrFloat()) {
457 for (auto &use : reductionOp.getResult().getUses()) {
458 if (auto convertLayoutOp =
459 llvm::dyn_cast<xegpu::ConvertLayoutOp>(use.getOwner())) {
460 rewriter.replaceOp(convertLayoutOp, reductionOp.getResult());
461 break;
462 }
463 }
464 }
465
466 SmallVector<int64_t> accShape(sourceVecType.getShape());
467 accShape.erase(accShape.begin() + intraLaneDim);
468 Type eTy = sourceVecType.getElementType();
470 rewriter, loc, VectorType::get(accShape, eTy), reductionOp.getKind());
471
472 Value intraLaneReduced = vector::MultiDimReductionOp::create(
473 rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
474 constNeutralVal, ArrayRef<int64_t>(intraLaneDim));
475
476 // Adjust crossLaneDim after the first reduction.
477 if (crossLaneDim > intraLaneDim)
478 crossLaneDim -= 1;
479 Value crossLaneReduced = vector::MultiDimReductionOp::create(
480 rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
481 ArrayRef<int64_t>(crossLaneDim));
482 assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
483 "Type mismatch");
484 rewriter.replaceOp(reductionOp, crossLaneReduced);
485 return success();
486 }
487
488private:
489 std::pair<int64_t, int64_t>
490 getReductionDimOrder(ArrayRef<int64_t> reductionDims,
491 xegpu::DistributeLayoutAttr layout) const {
492 assert(layout.isForSubgroup() && "Must know the lane layout");
493 assert(reductionDims.size() == 2 && "Expected 2D reduction");
494 int64_t intra, cross = -1;
495 xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
496 if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
497 layoutAttr =
498 dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
499 assert(layoutAttr);
500 SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
501
502 assert(laneLayout.size() && "Expected a non-empty layout");
503 // try to pick a dim that does not communicate
504 for (auto dim : reductionDims) {
505 if (laneLayout[dim] == 1)
506 intra = dim;
507 else
508 cross = dim;
509 }
510 return {intra, cross};
511 }
512};
513
514} // namespace
515
517 RewritePatternSet &patterns) {
518 patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
519 VectorExtractOpPattern, MultiRed2dOpPattern>(
520 patterns.getContext());
521}
522
523namespace {
524
/// Pass driver: marks optimizable XeGPU/vector ops illegal and applies the
/// conversion patterns above, then folds and strips temporary layout attrs.
struct XeGPUPeepHoleOptimizerPass final
    : public xegpu::impl::XeGPUPeepHoleOptimizerBase<
          XeGPUPeepHoleOptimizerPass> {
  void runOnOperation() override {
    MLIRContext &context = getContext();
    TypeConverter converter;
    RewritePatternSet patterns(&context);
    ConversionTarget target(context);

    // This pass is only meant for PVC and BMG targets. If unsupported target
    // is found, exit early.
    bool isTargetSupported = false;
    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
      auto chipStr = xegpu::getChipStr(funcOp);
      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
        isTargetSupported = true;
    });

    if (!isTargetSupported) {
      DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
             << "\n";
      return;
    }

    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
    // converted.
    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
        [&](xegpu::CreateNdDescOp createNdOp) {
          return !canBeOptimizedForTranspose(createNdOp.getType());
        });
    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
        [&](xegpu::LoadNdOp loadNdOp) {
          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
        });
    // Vector ExtractOps can have optimizable layouts if they extract from
    // LoadNdOps with array length greater than 1. These ExtractOps must be
    // converted.
    target.addDynamicallyLegalOp<vector::ExtractOp>(
        [&](vector::ExtractOp extractOp) {
          auto layout = xegpu::getTemporaryLayout(
              dyn_cast<OpResult>(extractOp.getResult()));
          if (!layout)
            return true;
          auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
          auto laneData = layout.getEffectiveLaneDataAsInt();
          return !canBeOptimizedForTranspose(laneLayout, laneData);
        });

    // 2D multi-reductions whose result carries a subgroup-distributed layout
    // are illegal and get decomposed by MultiRed2dOpPattern.
    target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
        [=](Operation *op) -> bool {
          auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
          if (!layout || !layout.isForSubgroup())
            return true;
          if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
            return reductionOp.getReductionDims().size() != 2;
          return true;
        });

    // Identity type conversion: only ops are rewritten, not types.
    converter.addConversion([](Type type) { return type; });

    target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                           vector::VectorDialect>();
    // NOTE(review): the call(s) populating `patterns` -- and presumably an
    // SCF structural-conversion setup taking `converter` -- appear elided in
    // this view of the source; confirm against the full file.
                                                         target);
    if (failed(applyPartialConversion(getOperation(), target,
                                      std::move(patterns)))) {
      DBGS() << "Optimize block loads pass failed.\n";
      return signalPassFailure();
    }

    // Apply folding for cleaning up IR.
    MLIRContext *ctx = &getContext();
    RewritePatternSet emptyPatterns(ctx);
    (void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));

    // Remove the temporary layout after all patterns are applied.
    getOperation()->walk([](Operation *op) {
      SmallVector<StringAttr> attrsToRemove;
      for (auto namedAttr : op->getDiscardableAttrs()) {
        if (isa<xegpu::DistributeLayoutAttr>(namedAttr.getValue()))
          attrsToRemove.push_back(namedAttr.getName());
      }
      for (auto attrName : attrsToRemove)
        op->removeDiscardableAttr(attrName);
    });
  }
};
613
614} // namespace
return success()
b getContext())
#define DBGS()
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents a single result from folding an operation.
auto getDiscardableAttrs()
Return a range of all of discardable attributes on this operation.
Definition Operation.h:512
Attribute removeDiscardableAttr(StringAttr name)
Remove the discardable attribute with the specified name if it exists.
Definition Operation.h:498
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition Types.cpp:118
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:363
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
const uArch * getUArch(llvm::StringRef archName)
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns)
Appends patterns for optimizing block load operations into patterns.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:163