//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <optional>

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-optimize-peephole"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

using namespace mlir;

namespace {

/// Get the 2D lane data from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneData(xegpu::TensorDescType tdescType) {
  auto layout = tdescType.getLayoutAttr();
  if (!layout)
    return std::nullopt;
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return std::nullopt;
  return laneData;
}

/// Get the 2D lane layout from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
  auto layout = tdescType.getLayoutAttr();
  if (!layout)
    return std::nullopt;
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return std::nullopt;
  return laneLayout;
}

/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
/// lane[1] == 1) but its inner lane data is not equal to [1, 1].
/// Example:
///   !xegpu.tensor_desc<16x16xf16,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
/// In this case, the lane layout is transposed (from the usual [1, SG_SIZE]
/// form), indicating that this is a load that requires a transpose. However,
/// the lane data is [1, 2], meaning that each lane must grab 2 f16 elements
/// from the inner dimension. We convert this to an optimized form by
/// converting the tensor_desc to i32 element type such that the lane data
/// becomes [1, 1]. This makes it easy for the later lowering to use the
/// load-with-transpose instruction.
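/// For illustration only (the exact optimized tile is computed below from the
/// target's 2D block-load parameters, so the 16x8 shape here is an assumed
/// example rather than a guaranteed result), the descriptor above would become
/// something like:
///   !xegpu.tensor_desc<16x8xi32,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
/// i.e. each pair of f16 values is viewed as a single i32 element and the
/// inner dimension is halved.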
static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
                                       ArrayRef<int64_t> laneData) {
  if (laneLayout.size() != 2 || laneData.size() != 2)
    return false;
  if (laneLayout[0] == 1 || laneLayout[1] != 1)
    return false;
  if (laneData[0] != 1 || laneData[1] == 1)
    return false;
  return true;
}

/// A tensor desc type can be optimized if its element type is less than 32
/// bits wide and its layout can be optimized.
static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
  // Element types that are 32 bits or wider already have a valid layout, so
  // there is nothing to optimize.
  int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
  if (elementTyBitwidth >= 32)
    return false;
  auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
  auto maybeLaneData = getMaybeLaneData(tdescType);
  if (!maybeLaneData || !maybeLaneLayout)
    return false;
  return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
}

/// Check if a tensor desc type can be optimized for transpose; if so, return
/// a new optimized tensor desc type with a valid transpose layout.
static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
                                         const uArch *targetuArch) {
  if (!canBeOptimizedForTranspose(tdescType))
    return tdescType;
  auto laneData = getMaybeLaneData(tdescType)
                      .value(); // Lane data must exist if we reach here.
  int64_t innerLaneData = laneData[1];
  int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
  // Required shape is the total shape of the vector result that this tensor
  // desc must eventually load, after adjusting for the new bitwidth and array
  // length.
  SmallVector<int64_t> requiredShape(tdescType.getShape());
  requiredShape.back() =
      requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
  int newBitWidth = elementTyBitwidth * innerLaneData;
  Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
  // Supported shape is the largest transpose shape supported by the hardware
  // that is less than or equal to the required shape.
  auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
      targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
  auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
      newElemTy, /** has transform */ false, /** has transpose */ true);
  // If no HW params are found, return the original type.
  if (!maybeHWParams)
    return tdescType;
  auto [widths, heights, counts] = maybeHWParams.value();
  // TODO: Currently we expect array length to be 1 for the transpose case.
  if (counts.size() != 1 || counts[0] != 1)
    return tdescType;
  int arrayLen = counts[0];
  int supportedHeight =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
  int supportedWidth =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
  // If no supported height or width is found, return the original type.
  if (supportedHeight == -1 || supportedWidth == -1)
    return tdescType;

  SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
  xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
      tdescType.getContext(),
      tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1});
  // Array length cannot be larger than 1 for the transpose case.
  return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                    tdescType.getBoundaryCheck(),
                                    tdescType.getMemorySpace(), newLayout);
}

/// Helper to convert an OpFoldResult to Value.
static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
                            OpFoldResult ofr) {
  std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
  if (mayBeInt)
    return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
  return llvm::cast<Value>(ofr);
}

/// Helper to divide a Value by a constant integer.
static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
                              Value val, int64_t constant) {
  // If the constant is a power of 2, use right shift for division.
  if (llvm::isPowerOf2_64(constant)) {
    int64_t shiftAmount = llvm::Log2_64(constant);
    return arith::ShRUIOp::create(
               rewriter, loc, val,
               arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
                   .getResult())
        .getResult();
  }
  auto constantOp =
      arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
  return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
}
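
// For example (illustrative IR; the SSA names are hypothetical), dividing an
// index value %n by the constant 8 is emitted as a shift:
//   %c3 = arith.constant 3 : index
//   %q = arith.shrui %n, %c3 : index
// while a non-power-of-two divisor such as 6 falls back to an arith.constant 6
// followed by arith.divui.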

/// This function takes a larger register block `data` and generates multiple
/// smaller loads (of the size given by `newTensorDesc`) to fill in the `data`
/// block starting from `offsets`.
static Value generateLoads(ConversionPatternRewriter &rewriter,
                           TypedValue<VectorType> data,
                           ArrayRef<OpFoldResult> offsets,
                           TypedValue<xegpu::TensorDescType> newTensorDesc,
                           xegpu::LoadNdOp origLoadOp) {
  Location loc = data.getLoc();
  assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
  Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
  Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
  SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
  // Compute the ratio between the original shape and the supported shape. We
  // need to generate loads in this ratio arrangement.
  auto shapeRatio = computeShapeRatio(data.getType().getShape(),
                                      supportedShape)
                        .value(); // `shapeRatio` must be defined if we reach
                                  // here.
  for (int64_t h = 0; h < shapeRatio[0]; ++h) {
    for (int64_t w = 0; w < shapeRatio[1]; ++w) {
      int64_t localOffsetDim0 = h * supportedShape[0];
      int64_t localOffsetDim1 = w * supportedShape[1];
      Value loadOffsetX = arith::AddIOp::create(
          rewriter, loc, offsetDim0,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
              .getResult());
      Value loadOffsetY = arith::AddIOp::create(
          rewriter, loc, offsetDim1,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
              .getResult());
      auto loadOp = xegpu::LoadNdOp::create(
          rewriter, loc,
          VectorType::get(supportedShape, data.getType().getElementType()),
          newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
          origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
          origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
          origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
      // Set the layout for the loadOp.
      auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
      loadOp.setAnchorLayout(layoutAttr);
      // Insert the loaded block into the right position in data.
      auto insertOp = vector::InsertStridedSliceOp::create(
          rewriter, loc, loadOp.getResult(), data,
          ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
          ArrayRef<int64_t>{1, 1});
      // InsertOp must have the same layout as newTensorDesc.
      xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
      data = insertOp.getResult();
    }
  }
  return data;
}
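
// Illustrative example (the shapes are assumed, not fixed by the pass; the
// actual tile size comes from the uArch block-load limits): filling a
// 32x16xi32 `data` block from a 16x8xi32 tensor desc gives a 2x2 grid of
// loads and insertions:
//   load at offsets (%off0 + 0,  %off1 + 0) -> vector<16x8xi32>, inserted at [0, 0]
//   load at offsets (%off0 + 0,  %off1 + 8) -> vector<16x8xi32>, inserted at [0, 8]
//   load at offsets (%off0 + 16, %off1 + 0) -> vector<16x8xi32>, inserted at [16, 0]
//   load at offsets (%off0 + 16, %off1 + 8) -> vector<16x8xi32>, inserted at [16, 8]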

/// Checks if a CreateNdDescOp can be optimized for transpose; if so, creates a
/// new CreateNdDescOp with the optimized tensor desc type. This involves
/// extracting the base pointer from the original memory source and adjusting
/// the shape and strides of the tensor desc to fit the new optimized transpose
/// layout.
class XeGPUCreateNdDescOpPattern final
    : public OpConversionPattern<xegpu::CreateNdDescOp> {
public:
  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto tdescTy = createNdOp.getType();
    // Get the target uArch info.
    auto chipStr = xegpu::getChipStr(createNdOp);
    // Check if the chip is supported.
    assert(
        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
        "Expecting target chip to be pvc or bmg for transpose optimization.");
    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());

    auto convertType = tryOptimize(tdescTy, targetuArch);
    if (convertType == tdescTy)
      return failure();
    auto strides = createNdOp.getMixedStrides();
    auto maybeConstInnerStride = getConstantIntValue(strides.back());
    // Only row-major memrefs are expected for now.
    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
      return rewriter.notifyMatchFailure(
          createNdOp, "Expecting row-major memref for transpose optimization.");
    Value source = createNdOp.getSource();
    auto optionalLaneData = getMaybeLaneData(tdescTy);
    assert(optionalLaneData && "Expected 2D lane data");
    auto laneData = optionalLaneData.value();
    int64_t innerLaneData = laneData[1];
    auto memrefType = dyn_cast<MemRefType>(source.getType());
    // The inner dimension of the shape must be adjusted based on innerLaneData.
    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
    modifiedShape.back() = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
        innerLaneData);
    // Similarly, the second-to-last stride must be adjusted.
    assert(strides.size() >= 2 &&
           "Expected at least 2 strides for CreateNdDescOp");
    SmallVector<OpFoldResult> modifiedStrides(strides);
    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(),
                       modifiedStrides[modifiedStrides.size() - 2]),
        innerLaneData);

    // If the source is a static memref, we need to extract the pointer to the
    // base address.
    if (memrefType && memrefType.hasStaticShape()) {
      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
          rewriter, createNdOp.getLoc(), source);
      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
                                          rewriter.getI64Type(),
                                          extractOp.getResult())
                   .getResult();
    }
    // Create a new CreateNdDescOp with the modified shape and converted type.
    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
        modifiedStrides);
    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
    return success();
  }
};
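
// Sketch of the rewrite (illustrative; the memref size, SSA names, and the
// 16x8 tile are assumed rather than taken from a specific test). A descriptor
// such as
//   %td = xegpu.create_nd_tdesc %src : memref<64x64xf16>
//           -> !xegpu.tensor_desc<16x16xf16,
//                #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
// becomes a descriptor over the same memory reinterpreted as i32: the base
// address is extracted with memref.extract_aligned_pointer_as_index and cast
// to i64, the inner size and the second-to-last stride are halved, and the
// result type is roughly
//   !xegpu.tensor_desc<16x8xi32,
//     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>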

/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for the
/// transpose optimization. If so, rewrites the LoadNdOp to align with the
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
/// generated to fill in the original load shape.
class XeGPULoadNdDescOpPattern final
    : public OpConversionPattern<xegpu::LoadNdOp> {
public:
  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto origTensorDescType = loadNdOp.getTensorDescType();
    auto adaptorType =
        cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
    if (adaptorType == origTensorDescType)
      return failure();
    // Offsets must be adjusted based on innerLaneData.
    auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
    int64_t innerLaneData = laneData[1];
    auto offsets = loadNdOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadNdOp,
                                         "Expecting offsets in LoadNd");
    SmallVector<OpFoldResult> modifiedOffsets(offsets);
    modifiedOffsets.back() = divideByConstant(
        rewriter, loadNdOp.getLoc(),
        convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
        innerLaneData);
    // Get the 2D data shape of this loadNdOp in its original type, including
    // array length.
    SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
    // Adjust the data shape based on innerLaneData.
    origDataShape.back() /= innerLaneData;
    // The HW-supported shape is the new tensor desc shape after conversion.
    SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
    VectorType origVectorType =
        VectorType::get(origDataShape, adaptorType.getElementType());
    Value data;
    // The original load result is 3D (one 2D block per array slice) when the
    // array length is greater than 1; handle each slice separately.
    if (origTensorDescType.getArrayLength() > 1) {
      SmallVector<Value> arraySlices;
      for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
        Value slice = arith::ConstantOp::create(
            rewriter, loadNdOp->getLoc(), origVectorType,
            rewriter.getZeroAttr(origVectorType));
        // Increase the Y offset for each array slice.
        Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
                                       modifiedOffsets.back());
        modifiedOffsets.back() =
            arith::AddIOp::create(
                rewriter, loadNdOp->getLoc(), offsetY,
                arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
                                               i * origDataShape[1])
                    .getResult())
                .getResult();
        slice = generateLoads(
            rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
            cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
            loadNdOp);
        // BitCast back to the original load shape without array length.
        auto bitcastType = VectorType::get(origTensorDescType.getShape(),
                                           origTensorDescType.getElementType());
        auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                                   bitcastType, slice);
        // BitCastOp must have the same layout as the original loadNdOp.
        xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                                  origTensorDescType.getLayoutAttr());
        arraySlices.push_back(bitCastOp.getResult());
      }
      rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
      return success();
    }
    data = arith::ConstantOp::create(
        rewriter, loadNdOp->getLoc(),
        VectorType::get(origDataShape, adaptorType.getElementType()),
        rewriter.getZeroAttr(origVectorType));
    data = generateLoads(
        rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
        cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
        loadNdOp);
    auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                               loadNdOp.getType(), data);
    // BitCastOp must have the same layout as the original loadNdOp.
    xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                              origTensorDescType.getLayoutAttr());
    rewriter.replaceOp(loadNdOp, bitCastOp);
    return success();
  }
};
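
// For illustration (shapes and SSA names assumed): a load
//   %v = xegpu.load_nd %td[%y, %x]
//          : !xegpu.tensor_desc<16x16xf16, ...> -> vector<16x16xf16>
// whose descriptor was converted to 16x8xi32 is rebuilt from one or more i32
// loads assembled into a vector<16x8xi32>, followed by
//   %r = vector.bitcast %acc : vector<16x8xi32> to vector<16x16xf16>
// so existing users keep seeing the original f16 result type.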

/// A vector ExtractOp must be processed if the original tensor desc type has
/// an array length greater than 1. In that case, the LoadNdOp is replaced with
/// one LoadNdOp per array slice, making the extraction unnecessary, so we
/// simply replace the ExtractOp with the corresponding slice.
class VectorExtractOpPattern final
    : public OpConversionPattern<vector::ExtractOp> {
public:
  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Check if the source of the extraction was split into multiple values.
    if (adaptor.getSource().size() == 1)
      return failure();
    auto mixedPos = extractOp.getMixedPosition();
    if (mixedPos.size() != 1)
      return failure();
    auto mayBeInt = getConstantIntValue(mixedPos[0]);
    if (!mayBeInt)
      return failure();
    rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
    return success();
  }
};

/// Performs a reduction over 2 dimensions by decomposing it into two 1D
/// reductions, ordered based on the layout to minimize cross-lane
/// communication.
class MultiRed2dOpPattern
    : public OpConversionPattern<vector::MultiDimReductionOp> {
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto sourceVecType = reductionOp.getSourceVectorType();
    if (reductionOp.getReductionDims().size() != 2 ||
        sourceVecType.getRank() != 2)
      return rewriter.notifyMatchFailure(
          reductionOp, "Expected 2D multi reduction of a 2D source");
    auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
    // Retrieve and order the dims for the 1D decomposition (prefer the
    // intra-lane dim first).
    auto dims = llvm::to_vector(reductionOp.getReductionDims());
    auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
    // If no preferred order was found, the order does not matter; use the dims
    // as given.
    if (intraLaneDim == -1 || crossLaneDim == -1) {
      intraLaneDim = dims[0];
      crossLaneDim = dims[1];
    }
    auto loc = reductionOp.getLoc();
    auto acc = reductionOp.getAcc();

    // The first (intra-lane) reduction's distribute attribute does not include
    // the cross-lane dim.
    auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
    SmallVector<int64_t> dropDims{crossLaneDim};
    auto intraLaneRedResLayout = resSliceLayoutAttr.dropSliceDims(dropDims);

    SmallVector<int64_t> accShape(sourceVecType.getShape());
    accShape.erase(accShape.begin() + intraLaneDim);
    if (acc) {
      acc = vector::BroadcastOp::create(
          rewriter, loc,
          VectorType::get(accShape, sourceVecType.getElementType()), acc);
      xegpu::setDistributeLayoutAttr(
          llvm::dyn_cast<OpResult>(acc),
          cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
    }
    Value intraLaneReduced = vector::MultiDimReductionOp::create(
        rewriter, loc, reductionOp.getKind(), reductionOp.getSource(), acc,
        ArrayRef<int64_t>(intraLaneDim));
    xegpu::setDistributeLayoutAttr(
        llvm::dyn_cast<OpResult>(intraLaneReduced),
        cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));

    Value crossLaneReduced = vector::ReductionOp::create(
        rewriter, loc, reductionOp.getKind(), intraLaneReduced, nullptr);
    xegpu::setDistributeLayoutAttr(
        llvm::dyn_cast<OpResult>(crossLaneReduced),
        cast<xegpu::DistributeLayoutAttr>(resLayout));
    assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
           "Type mismatch");
    rewriter.replaceOp(reductionOp, crossLaneReduced);
    return success();
  }

private:
  std::pair<int64_t, int64_t>
  getReductionDimOrder(ArrayRef<int64_t> reductionDims,
                       xegpu::DistributeLayoutAttr layout) const {
    assert(layout.isForSubgroup() && "Must know the lane layout");
    assert(reductionDims.size() == 2 && "Expected 2D reduction");
    int64_t intra = -1, cross = -1;
    xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
    if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
      layoutAttr =
          dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
    assert(layoutAttr);
    SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();

    assert(laneLayout.size() && "Expected a non-empty layout");
    // Try to pick a dim that requires no cross-lane communication as the
    // intra-lane dim.
    for (auto dim : reductionDims) {
      if (laneLayout[dim] == 1)
        intra = dim;
      else
        cross = dim;
    }
    return {intra, cross};
  }
};
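
// Illustrative decomposition (shapes and layout are assumed): reducing a
// vector<16x32xf32> over dims [0, 1] under a lane layout of [16, 1] (dim 0
// split across 16 lanes, dim 1 owned per lane) becomes
//   %partial = vector.multi_reduction <add>, %src, %acc [1]
//              : vector<16x32xf32> to vector<16xf32>   // intra-lane, no shuffles
//   %result = vector.reduction <add>, %partial : vector<16xf32> into f32
// so only the final 1D reduction requires lane-to-lane communication.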

} // namespace

void xegpu::populateXeGPUPeepHoleOptimizerPatterns(
    RewritePatternSet &patterns) {
  patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
               VectorExtractOpPattern, MultiRed2dOpPattern>(
      patterns.getContext());
}

namespace {

struct XeGPUPeepHoleOptimizerPass final
    : public xegpu::impl::XeGPUPeepHoleOptimizerBase<
          XeGPUPeepHoleOptimizerPass> {
  void runOnOperation() override {
    MLIRContext &context = getContext();
    TypeConverter converter;
    RewritePatternSet patterns(&context);
    ConversionTarget target(context);

    // This pass is only meant for PVC and BMG targets. If an unsupported
    // target is found, exit early.
    bool isTargetSupported = false;
    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
      auto chipStr = xegpu::getChipStr(funcOp);
      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
        isTargetSupported = true;
    });

    if (!isTargetSupported) {
      DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
             << "\n";
      return;
    }

    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
    // converted.
    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
        [&](xegpu::CreateNdDescOp createNdOp) {
          return !canBeOptimizedForTranspose(createNdOp.getType());
        });
    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
        [&](xegpu::LoadNdOp loadNdOp) {
          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
        });
    // Vector ExtractOps can have optimizable layouts if they extract from
    // LoadNdOps with array length greater than 1. These ExtractOps must be
    // converted.
    target.addDynamicallyLegalOp<vector::ExtractOp>(
        [&](vector::ExtractOp extractOp) {
          auto layout = xegpu::getTemporaryLayout(
              dyn_cast<OpResult>(extractOp.getResult()));
          if (!layout)
            return true;
          auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
          auto laneData = layout.getEffectiveLaneDataAsInt();
          return !canBeOptimizedForTranspose(laneLayout, laneData);
        });

    target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
        [=](Operation *op) -> bool {
          auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
          if (!layout || !layout.isForSubgroup())
            return true;
          if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
            return reductionOp.getReductionDims().size() != 2;
          return true;
        });

    converter.addConversion([](Type type) { return type; });

    target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                           vector::VectorDialect>();
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    xegpu::populateXeGPUPeepHoleOptimizerPatterns(patterns);
    if (failed(applyPartialConversion(getOperation(), target,
                                      std::move(patterns)))) {
      DBGS() << "Optimize block loads pass failed.\n";
      return signalPassFailure();
    }
  }
};

} // namespace