MLIR 23.0.0git
XeGPUPeepHoleOptimizer.cpp
Go to the documentation of this file.
1//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
25#include "mlir/IR/Types.h"
26#include "mlir/IR/Value.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include <optional>
31
32namespace mlir {
33namespace xegpu {
34#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
35#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
36} // namespace xegpu
37} // namespace mlir
38
39#define DEBUG_TYPE "xegpu-optimize-peephole"
40#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
41
42using namespace mlir;
43
44namespace {
45
46/// Get the 2D lane data from a tensor desc type if it exists.
47static std::optional<SmallVector<int64_t>>
48getMaybeLaneData(xegpu::TensorDescType tdescType) {
49 auto layout = tdescType.getLayoutAttr();
50 if (!layout)
51 return std::nullopt;
52 auto laneData = layout.getEffectiveLaneDataAsInt();
53 if (laneData.size() != 2)
54 return std::nullopt;
55 return laneData;
56}
57
58/// Get the 2D lane layout from a tensor desc type if it exists.
59static std::optional<SmallVector<int64_t>>
60getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
61 auto layout = tdescType.getLayoutAttr();
62 if (!layout)
63 return std::nullopt;
64 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
65 if (laneLayout.size() != 2)
66 return std::nullopt;
67 return laneLayout;
68}
69
70/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
71/// lane[1] == 1), but inner lane data is not equal to [1, 1].
72/// Example:
73/// !xegpu.tensor_desc<16x16xf16,
74/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
75/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
76/// indicating that this is a load that requires transpose effect. However,
77/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
78/// the inner dimension. We convert this to a optimized form by converting the
79/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
80/// later lowering easily use the load with transpose instruction.
81static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
82 ArrayRef<int64_t> laneData) {
83 if (laneLayout.size() != 2 || laneData.size() != 2)
84 return false;
85 if (laneLayout[0] == 1 || laneLayout[1] != 1)
86 return false;
87 if (laneData[0] != 1 || laneData[1] == 1)
88 return false;
89 return true;
90}
91
92/// A tensor desc type can be optimized if its element type is less than 32 bits
93/// and its layout can be optimized.
94static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
95 // If the dtype is greater or equal to 32 bits, layout must be valid.
96 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
97 if (elementTyBitwidth >= 32)
98 return false;
99 auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
100 auto maybeLaneData = getMaybeLaneData(tdescType);
101 if (!maybeLaneData || !maybeLaneLayout)
102 return false;
103 return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
104}
105
/// Check if a tensor desc type can be optimized for transpose; if so, return a
/// new tensor desc type with a hardware-supported shape, a widened integer
/// element type (element bitwidth * inner lane data), and lane data reset to
/// [1, 1]. Otherwise the original type is returned unchanged.
static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
                                         const uArch *targetuArch) {
  if (!canBeOptimizedForTranspose(tdescType))
    return tdescType;
  auto laneData = getMaybeLaneData(tdescType)
                      .value(); // Lane data must exist if we reach here.
  int64_t innerLaneData = laneData[1];
  int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
  // Required shape is total shape of the vector result that this tensor desc
  // must eventually load after adjusting for the new bitwidth and array
  // length.
  SmallVector<int64_t> requiredShape(tdescType.getShape());
  requiredShape.back() =
      requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
  int newBitWidth = elementTyBitwidth * innerLaneData;
  Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
  // Supported shape is the max transpose shape that can be supported by
  // hardware that is less than or equal to required shape.
  auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
  // NOTE(review): the argument to the dyn_cast above was lost in this listing
  // — presumably targetuArch->getInstruction(...) for the subgroup 2D block
  // load instruction. Restore from the original source before compiling.
  auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
      newElemTy, /** has transform */ false, /** has transpose */ true);
  // If no HW params found, return the original type.
  if (!maybeHWParams)
    return tdescType;
  auto [widths, heights, counts] = maybeHWParams.value();
  // TODO: Currently we expect array length to be 1 for transpose case.
  if (counts.size() != 1 || counts[0] != 1)
    return tdescType;
  int arrayLen = counts[0];
  // Pick the largest HW-supported height/width that evenly divides the
  // required shape.
  int supportedHeight =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
  int supportedWidth =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
  // If no supported height or width found, return the original type.
  if (supportedHeight == -1 || supportedWidth == -1)
    return tdescType;
  SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
  // Keep the original lane layout and order, but reset lane data to [1, 1];
  // the widened element type already accounts for the packing.
  xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
      tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout(),
      DenseI32ArrayAttr::get(tdescType.getContext(), {1, 1}),
      tdescType.getLayoutAttr().getOrder());
  // Array length can not be larger than 1 for transpose case.
  return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                    tdescType.getBoundaryCheck(),
                                    tdescType.getMemorySpace(), newLayout);
}
156
157/// Helper to convert an OpFoldResult to Value.
158static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
160 std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
161 if (mayBeInt)
162 return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
163 return llvm::cast<Value>(ofr);
164}
166/// Helper to divide a Value by a constant integer.
167static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
168 Value val, int64_t constant) {
169 // If the constant is a power of 2, use right shift for division.
170 if (llvm::isPowerOf2_64(constant)) {
171 int64_t shiftAmount = llvm::Log2_64(constant);
172 return arith::ShRUIOp::create(
173 rewriter, loc, val,
174 arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
175 .getResult())
176 .getResult();
177 }
178 auto constantOp =
179 arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
180 return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
181}
182
183/// This function takes a larger register block `data` and generates multiple
184/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
185/// starting from `offsets`.
186static Value generateLoads(ConversionPatternRewriter &rewriter,
190 xegpu::LoadNdOp origLoadOp) {
191 Location loc = data.getLoc();
192 assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
193 Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
194 Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
195 SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
196 // Compute the ratio between original shape and supported shape. We need to
197 // generate loads in this ratio arrangement.
198 auto shapeRatio = computeShapeRatio(data.getType().getShape(),
199 supportedShape)
200 .value(); // `ratio` must be defined if we reach here.
201 for (int64_t h = 0; h < shapeRatio[0]; ++h) {
202 for (int64_t w = 0; w < shapeRatio[1]; ++w) {
203 int64_t localOffsetDim0 = h * supportedShape[0];
204 int64_t localOffsetDim1 = w * supportedShape[1];
205 Value loadOffsetX = arith::AddIOp::create(
206 rewriter, loc, offsetDim0,
207 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
208 .getResult());
209 Value loadOffsetY = arith::AddIOp::create(
210 rewriter, loc, offsetDim1,
211 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
212 .getResult());
213 auto loadOp = xegpu::LoadNdOp::create(
214 rewriter, loc,
215 VectorType::get(supportedShape, data.getType().getElementType()),
216 newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
217 origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
218 origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
219 origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
220 // Set the layout for the loadOp.
221 auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
222 loadOp.setAnchorLayout(layoutAttr);
223 // Insert the loaded block into the right position in data.
224 auto insertOp = vector::InsertStridedSliceOp::create(
225 rewriter, loc, loadOp.getResult(), data,
226 ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
227 ArrayRef<int64_t>{1, 1});
228 // InsertOp must have the same layout as newTensorDesc.
229 xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
230 data = insertOp.getResult();
231 }
232 }
233 return data;
234}
235
/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
/// the base pointer from the original memory source and adjusting the shape and
/// strides of the tensor desc to fit with the new optimized transpose layout.
class XeGPUCreateNdDescOpPattern final
    : public OpConversionPattern<xegpu::CreateNdDescOp> {
public:
  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto tdescTy = createNdOp.getType();
    // Get the target uArch info.
    auto chipStr = xegpu::getChipStr(createNdOp);
    // Check if the chip is supported. The pass pre-filters targets, so an
    // unsupported chip here is a programmer error.
    assert(
        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
        "Expecting target chip to be pvc or bmg for transpose optimization.");
    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());

    // No change in type means the descriptor cannot (or need not) be
    // optimized; leave the op alone.
    auto convertType = tryOptimize(tdescTy, targetuArch);
    if (convertType == tdescTy)
      return failure();
    auto strides = createNdOp.getMixedStrides();
    auto maybeConstInnerStride = getConstantIntValue(strides.back());
    // Only row-major memrefs are expected for now.
    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
      return rewriter.notifyMatchFailure(
          createNdOp, "Expecting row-major memref for transpose optimization.");
    Value source = createNdOp.getSource();
    auto optionalLaneData = getMaybeLaneData(tdescTy);
    assert(optionalLaneData && "Expected 2D lane data");
    auto laneData = optionalLaneData.value();
    int64_t innerLaneData = laneData[1];
    auto memrefType = dyn_cast<MemRefType>(source.getType());
    // Inner dimension of the shape must be adjusted based on innerLaneData,
    // since elements are re-interpreted at the wider bitwidth.
    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
    modifiedShape.back() = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
        innerLaneData);
    // Similarly, second to last stride must be adjusted.
    assert(strides.size() >= 2 &&
           "Expected at least 2 strides for CreateNdDescOp");
    SmallVector<OpFoldResult> modifiedStrides(strides);
    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(),
                       modifiedStrides[modifiedStrides.size() - 2]),
        innerLaneData);

    // If the source is a static memref, we need to extract the pointer to
    // base address (as an i64) so the new descriptor can reinterpret the
    // underlying memory with the adjusted shape/strides.
    if (memrefType && memrefType.hasStaticShape()) {
      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
          rewriter, createNdOp.getLoc(), source);
      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
                                          rewriter.getI64Type(),
                                          extractOp.getResult())
                   .getResult();
    }
    // Create a new CreateNdDescOp with the modified shape and converted type.
    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
        modifiedStrides);
    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
    return success();
  }
};
305
/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
/// transpose optimization. If so, rewrites the LoadNdOp to align with the
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
/// generated to fill in the original load shape.
class XeGPULoadNdDescOpPattern final
    : public OpConversionPattern<xegpu::LoadNdOp> {
public:
  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto origTensorDescType = loadNdOp.getTensorDescType();
    auto adaptorType =
        cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
    // If the producer descriptor was not rewritten, there is nothing to do.
    if (adaptorType == origTensorDescType)
      return failure();
    // Offsets must be adjusted based on innerLaneData.
    auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
    int64_t innerLaneData = laneData[1];
    auto offsets = loadNdOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadNdOp,
                                         "Expecting offsets in LoadNd");
    SmallVector<OpFoldResult> modifiedOffsets(offsets);
    modifiedOffsets.back() = divideByConstant(
        rewriter, loadNdOp.getLoc(),
        convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
        innerLaneData);
    // Get the 2D data shape of this loadNdOp in its original type including
    // array length.
    SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
    // Adjust the data shape based on innerLaneData.
    origDataShape.back() /= innerLaneData;
    // HW supported shape is the new tensor desc shape after conversion.
    // NOTE(review): hwSupportedShape is currently unused in this method.
    SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
    VectorType origVectorType =
        VectorType::get(origDataShape, adaptorType.getElementType());
    Value data;
    // Orig data shape is 3D for the array length case: emit one group of
    // loads per array slice and replace the op 1:N.
    if (origTensorDescType.getArrayLength() > 1) {
      SmallVector<Value> arraySlices;
      for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
        // Start from an all-zero vector and fill it with the smaller loads.
        Value slice = arith::ConstantOp::create(
            rewriter, loadNdOp->getLoc(), origVectorType,
            rewriter.getZeroAttr(origVectorType));
        // Increase the Y offset for each array slice.
        Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
                                       modifiedOffsets.back());
        modifiedOffsets.back() =
            arith::AddIOp::create(
                rewriter, loadNdOp->getLoc(), offsetY,
                arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
                                               i * origDataShape[1])
                    .getResult())
                .getResult();
        slice = generateLoads(
            rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
            cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
            loadNdOp);
        // BitCast back to original load shape without array length.
        auto bitcastType = VectorType::get(origTensorDescType.getShape(),
                                           origTensorDescType.getElementType());
        auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                                   bitcastType, slice);
        // BitCastOp must have the same layout as the original loadNdOp.
        xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                                  origTensorDescType.getLayoutAttr());
        arraySlices.push_back(bitCastOp.getResult());
      }
      rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
      return success();
    }
    // Non-array case: a single zero-initialized block filled by the loads.
    data = arith::ConstantOp::create(
        rewriter, loadNdOp->getLoc(),
        VectorType::get(origDataShape, adaptorType.getElementType()),
        rewriter.getZeroAttr(origVectorType));
    data = generateLoads(
        rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
        cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
        loadNdOp);
    auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                               loadNdOp.getType(), data);
    // BitCastOp must have the same layout as the original loadNdOp.
    xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                              origTensorDescType.getLayoutAttr());
    rewriter.replaceOp(loadNdOp, bitCastOp);
    return success();
  }
};
395
396/// Vector ExtractOp must be processed if the original tensor desc type has
397/// array length greater than 1. In this case, the LoadNdOp is replaced with
398/// multiple LoadNdOps for each array slice making the extraction unnecessary.
399/// In this case, we simply remove the ExtractOp.
400class VectorExtractOpPattern final
401 : public OpConversionPattern<vector::ExtractOp> {
402public:
403 using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
404 LogicalResult
405 matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
406 ConversionPatternRewriter &rewriter) const override {
407 // Check if the source of the extraction is split to multiple values.
408 if (adaptor.getSource().size() == 1)
409 return failure();
410 auto mixedPos = extractOp.getMixedPosition();
411 if (mixedPos.size() != 1)
412 return failure();
413 auto mayBeInt = getConstantIntValue(mixedPos[0]);
414 if (!mayBeInt)
415 return failure();
416 rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
417 return success();
418 }
419};
420
421/// Performs a reduction over 2 dimensions by decomposing it into two 1D
422/// reductions ordered based on layout to minimize cross-lane communication.
423class MultiRed2dOpPattern
424 : public OpConversionPattern<vector::MultiDimReductionOp> {
425 using OpConversionPattern::OpConversionPattern;
426 LogicalResult
427 matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
428 ConversionPatternRewriter &rewriter) const override {
429 auto sourceVecType = reductionOp.getSourceVectorType();
430 if (reductionOp.getReductionDims().size() != 2 ||
431 sourceVecType.getRank() != 2)
432 return rewriter.notifyMatchFailure(
433 reductionOp, "Expected 2D multi reduction of a 2D source");
434 auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
435 // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
436 auto dims = llvm::to_vector(reductionOp.getReductionDims());
437 auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
438 // Order does not matter
439 if (intraLaneDim == -1 || crossLaneDim == -1) {
440 intraLaneDim = dims[0];
441 crossLaneDim = dims[1];
442 }
443 auto loc = reductionOp.getLoc();
444 auto acc = reductionOp.getAcc();
445
446 // The first reduction's dist attribute does not have the cross lane dim.
447 auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
448 SmallVector<int64_t> dropDims{crossLaneDim};
449 auto intraLaneRedResLayout = resSliceLayoutAttr.dropSliceDims(dropDims);
450
451 SmallVector<int64_t> accShape(sourceVecType.getShape());
452 accShape.erase(accShape.begin() + intraLaneDim);
453 if (acc) {
454 acc = vector::BroadcastOp::create(
455 rewriter, loc,
456 VectorType::get(accShape, sourceVecType.getElementType()), acc);
458 llvm::dyn_cast<OpResult>(acc),
459 cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
460 }
461 Value intraLaneReduced = vector::MultiDimReductionOp::create(
462 rewriter, loc, reductionOp.getKind(), reductionOp.getSource(), acc,
463 ArrayRef<int64_t>(intraLaneDim));
465 llvm::dyn_cast<OpResult>(intraLaneReduced),
466 cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
467
468 Value crossLaneReduced = vector::ReductionOp::create(
469 rewriter, loc, reductionOp.getKind(), intraLaneReduced, nullptr);
471 llvm::dyn_cast<OpResult>(crossLaneReduced),
472 cast<xegpu::DistributeLayoutAttr>(resLayout));
473 assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
474 "Type mismatch");
475 rewriter.replaceOp(reductionOp, crossLaneReduced);
476 return success();
477 }
478
479private:
480 std::pair<int64_t, int64_t>
481 getReductionDimOrder(ArrayRef<int64_t> reductionDims,
482 xegpu::DistributeLayoutAttr layout) const {
483 assert(layout.isForSubgroup() && "Must know the lane layout");
484 assert(reductionDims.size() == 2 && "Expected 2D reduction");
485 int64_t intra, cross = -1;
486 xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
487 if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
488 layoutAttr =
489 dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
490 assert(layoutAttr);
491 SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
492
493 assert(laneLayout.size() && "Expected a non-empty layout");
494 // try to pick a dim that does not communicate
495 for (auto dim : reductionDims) {
496 if (laneLayout[dim] == 1)
497 intra = dim;
498 else
499 cross = dim;
500 }
501 return {intra, cross};
502 }
503};
504
505} // namespace
506
508 RewritePatternSet &patterns) {
509 patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
510 VectorExtractOpPattern, MultiRed2dOpPattern>(
511 patterns.getContext());
512}
513
514namespace {
515
/// Pass that applies the XeGPU peephole optimizations via a partial dialect
/// conversion: rewrites transposed sub-32-bit block loads to i32 form and
/// decomposes 2D multi-reductions. Only runs on PVC/BMG targets.
struct XeGPUPeepHoleOptimizerPass final
    // NOTE(review): the base-class specifier line (presumably the generated
    // xegpu::impl::XeGPUPeepHoleOptimizerBase<...>) was lost in this listing;
    // restore it from the original source.
    XeGPUPeepHoleOptimizerPass> {
  void runOnOperation() override {
    MLIRContext &context = getContext();
    TypeConverter converter;
    RewritePatternSet patterns(&context);
    ConversionTarget target(context);

    // This pass is only meant for PVC and BMG targets. If unsupported target
    // is found, exit early.
    bool isTargetSupported = false;
    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
      auto chipStr = xegpu::getChipStr(funcOp);
      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
        isTargetSupported = true;
    });

    if (!isTargetSupported) {
      DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
             << "\n";
      return;
    }

    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
    // converted.
    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
        [&](xegpu::CreateNdDescOp createNdOp) {
          return !canBeOptimizedForTranspose(createNdOp.getType());
        });
    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
        [&](xegpu::LoadNdOp loadNdOp) {
          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
        });
    // Vector ExtractOps can have optimizable layouts if they extract from
    // LoadNdOps with array length greater than 1. These ExtractOps must be
    // converted.
    target.addDynamicallyLegalOp<vector::ExtractOp>(
        [&](vector::ExtractOp extractOp) {
          auto layout = xegpu::getTemporaryLayout(
              dyn_cast<OpResult>(extractOp.getResult()));
          if (!layout)
            return true;
          auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
          auto laneData = layout.getEffectiveLaneDataAsInt();
          return !canBeOptimizedForTranspose(laneLayout, laneData);
        });

    // 2D multi-reductions with a subgroup-distributed result layout must be
    // decomposed by MultiRed2dOpPattern.
    target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
        [=](Operation *op) -> bool {
          auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
          if (!layout || !layout.isForSubgroup())
            return true;
          if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
            return reductionOp.getReductionDims().size() != 2;
          return true;
        });

    // Identity type conversion; only op rewrites are needed.
    converter.addConversion([](Type type) { return type; });

    target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                           vector::VectorDialect>();
    // NOTE(review): one or more lines are missing here in this listing —
    // apparently a call ending in `target);` (likely the SCF structural type
    // conversion setup) plus the call populating this pass's patterns.
    // Restore them from the original source.
        target);
    if (failed(applyPartialConversion(getOperation(), target,
                                      std::move(patterns)))) {
      DBGS() << "Optimize block loads pass failed.\n";
      return signalPassFailure();
    }
  }
};
588
589} // namespace
return success()
b getContext())
#define DBGS()
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents a single result from folding an operation.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:363
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
const uArch * getUArch(llvm::StringRef archName)
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
void setDistributeLayoutAttr(const OpResult &Result, const DistributeLayoutAttr layout)
[to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult user should use setAnchorLayout...
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns)
Appends patterns for optimizing block load operations into patterns.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:497
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
SmallVector< int64_t > dropDims(ArrayRef< int64_t > inputPerm, ArrayRef< int64_t > dropPositions)
Returns a permutation vector that drop the input dims in dropPositions from inputPerm.
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:163