MLIR 23.0.0git
XeGPUPeepHoleOptimizer.cpp
Go to the documentation of this file.
1//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
25#include "mlir/IR/Types.h"
26#include "mlir/IR/Value.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include <optional>
32
33namespace mlir {
34namespace xegpu {
35#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
36#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
37} // namespace xegpu
38} // namespace mlir
39
40#define DEBUG_TYPE "xegpu-optimize-peephole"
41#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
42
43using namespace mlir;
44
45namespace {
46
47/// Get the 2D lane data from a tensor desc type if it exists.
48static std::optional<SmallVector<int64_t>>
49getMaybeLaneData(xegpu::TensorDescType tdescType) {
50 auto layout = tdescType.getLayoutAttr();
51 if (!layout)
52 return std::nullopt;
53 auto laneData = layout.getEffectiveLaneDataAsInt();
54 if (laneData.size() != 2)
55 return std::nullopt;
56 return laneData;
57}
58
59/// Get the 2D lane layout from a tensor desc type if it exists.
60static std::optional<SmallVector<int64_t>>
61getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
62 auto layout = tdescType.getLayoutAttr();
63 if (!layout)
64 return std::nullopt;
65 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
66 if (laneLayout.size() != 2)
67 return std::nullopt;
68 return laneLayout;
69}
70
71/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
72/// lane[1] == 1), but inner lane data is not equal to [1, 1].
73/// Example:
74/// !xegpu.tensor_desc<16x16xf16,
75/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
76/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
77/// indicating that this is a load that requires transpose effect. However,
78/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
79/// the inner dimension. We convert this to a optimized form by converting the
80/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
81/// later lowering easily use the load with transpose instruction.
82static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
83 ArrayRef<int64_t> laneData) {
84 if (laneLayout.size() != 2 || laneData.size() != 2)
85 return false;
86 if (laneLayout[0] == 1 || laneLayout[1] != 1)
87 return false;
88 if (laneData[0] != 1 || laneData[1] == 1)
89 return false;
90 return true;
91}
92
93/// A tensor desc type can be optimized if its element type is less than 32 bits
94/// and its layout can be optimized.
95static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
96 // If the dtype is greater or equal to 32 bits, layout must be valid.
97 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
98 if (elementTyBitwidth >= 32)
99 return false;
100 auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
101 auto maybeLaneData = getMaybeLaneData(tdescType);
102 if (!maybeLaneData || !maybeLaneLayout)
103 return false;
104 return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
105}
106
107/// Check if a tensor desc type can be optimized for transpose, if so return the
108/// new optimized tensor desc type with a valid transpose layout.
109static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
110 const uArch *targetuArch) {
111 if (!canBeOptimizedForTranspose(tdescType))
112 return tdescType;
113 auto laneData = getMaybeLaneData(tdescType)
114 .value(); // Lane data must exist if we reach here.
115 int64_t innerLaneData = laneData[1];
116 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
117 // Required shape is total shape of the vector result that this tensor desc
118 // must eventually load after adjusting for the new bitwidth and array
119 // length.
120 SmallVector<int64_t> requiredShape(tdescType.getShape());
121 requiredShape.back() =
122 requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
123 int newBitWidth = elementTyBitwidth * innerLaneData;
124 Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
125 // Supported shape is the max transpose shape that can be supported by
126 // hardware that is less than or equal to required shape.
127 auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
129 auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
130 newElemTy, /** has transform */ false, /** has transpose */ true);
131 // If no HW params found, return the original type.
132 if (!maybeHWParams)
133 return tdescType;
134 auto [widths, heights, counts] = maybeHWParams.value();
135 // TODO: Currently we expect array length to be 1 for transpose case.
136 if (counts.size() != 1 || counts[0] != 1)
137 return tdescType;
138 int arrayLen = counts[0];
139 int supportedHeight =
140 xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
141 int supportedWidth =
142 xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
143 // If no supported height or width found, return the original type.
144 if (supportedHeight == -1 || supportedWidth == -1)
145 return tdescType;
146
147 SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
148 xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
149 tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout(),
150 DenseI32ArrayAttr::get(tdescType.getContext(), {1, 1}),
151 tdescType.getLayoutAttr().getOrder());
152 // Array length can not be larger than 1 for transpose case.
153 return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
154 tdescType.getBoundaryCheck(),
155 tdescType.getMemorySpace(), newLayout);
156}
157
158/// Helper to convert an OpFoldResult to Value.
159static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
160 OpFoldResult ofr) {
161 std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
162 if (mayBeInt)
163 return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
164 return llvm::cast<Value>(ofr);
166
167/// Helper to divide a Value by a constant integer.
168static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
169 Value val, int64_t constant) {
170 // If the constant is a power of 2, use right shift for division.
171 if (llvm::isPowerOf2_64(constant)) {
172 int64_t shiftAmount = llvm::Log2_64(constant);
173 return arith::ShRUIOp::create(
174 rewriter, loc, val,
175 arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
176 .getResult())
177 .getResult();
178 }
179 auto constantOp =
180 arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
181 return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
182}
183
184/// This function takes a larger register block `data` and generates multiple
185/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
186/// starting from `offsets`.
187static Value generateLoads(ConversionPatternRewriter &rewriter,
191 xegpu::LoadNdOp origLoadOp) {
192 Location loc = data.getLoc();
193 assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
194 Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
195 Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
196 SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
197 // Compute the ratio between original shape and supported shape. We need to
198 // generate loads in this ratio arrangement.
199 auto shapeRatio = computeShapeRatio(data.getType().getShape(),
200 supportedShape)
201 .value(); // `ratio` must be defined if we reach here.
202 for (int64_t h = 0; h < shapeRatio[0]; ++h) {
203 for (int64_t w = 0; w < shapeRatio[1]; ++w) {
204 int64_t localOffsetDim0 = h * supportedShape[0];
205 int64_t localOffsetDim1 = w * supportedShape[1];
206 Value loadOffsetX = arith::AddIOp::create(
207 rewriter, loc, offsetDim0,
208 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
209 .getResult());
210 Value loadOffsetY = arith::AddIOp::create(
211 rewriter, loc, offsetDim1,
212 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
213 .getResult());
214 auto loadOp = xegpu::LoadNdOp::create(
215 rewriter, loc,
216 VectorType::get(supportedShape, data.getType().getElementType()),
217 newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
218 origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
219 origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
220 origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
221 // Set the layout for the loadOp.
222 auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
223 loadOp.setAnchorLayout(layoutAttr);
224 // Insert the loaded block into the right position in data.
225 auto insertOp = vector::InsertStridedSliceOp::create(
226 rewriter, loc, loadOp.getResult(), data,
227 ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
228 ArrayRef<int64_t>{1, 1});
229 // InsertOp must have the same layout as newTensorDesc.
230 xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
231 data = insertOp.getResult();
232 }
233 }
234 return data;
235}
236
/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
/// the base pointer from the original memory source and adjusting the shape and
/// strides of the tensor desc to fit with the new optimized transpose layout.
class XeGPUCreateNdDescOpPattern final
    : public OpConversionPattern<xegpu::CreateNdDescOp> {
public:
  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto tdescTy = createNdOp.getType();
    // Get the target uArch info.
    auto chipStr = xegpu::getChipStr(createNdOp);
    // Check if the chip is supported.
    // NOTE(review): this assert vanishes under NDEBUG; the pass-level target
    // check in runOnOperation presumably prevents an unsupported chip from
    // ever reaching here — confirm.
    assert(
        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
        "Expecting target chip to be pvc or bmg for transpose optimization.");
    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());

    // tryOptimize returns the original type when no transpose optimization
    // applies; nothing to rewrite in that case.
    auto convertType = tryOptimize(tdescTy, targetuArch);
    if (convertType == tdescTy)
      return failure();
    auto strides = createNdOp.getMixedStrides();
    auto maybeConstInnerStride = getConstantIntValue(strides.back());
    // Only row-major memrefs are expected for now.
    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
      return rewriter.notifyMatchFailure(
          createNdOp, "Expecting row-major memref for transpose optimization.");
    Value source = createNdOp.getSource();
    // Lane data is guaranteed 2D here because tryOptimize succeeded above.
    auto optionalLaneData = getMaybeLaneData(tdescTy);
    assert(optionalLaneData && "Expected 2D lane data");
    auto laneData = optionalLaneData.value();
    int64_t innerLaneData = laneData[1];
    auto memrefType = dyn_cast<MemRefType>(source.getType());
    // Inner dimension of the shape must be adjusted based on innerLaneData,
    // because innerLaneData narrow elements are packed into one wider element.
    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
    modifiedShape.back() = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
        innerLaneData);
    // Similarly, second to last stride must be adjusted.
    assert(strides.size() >= 2 &&
           "Expected at least 2 strides for CreateNdDescOp");
    SmallVector<OpFoldResult> modifiedStrides(strides);
    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(),
                       modifiedStrides[modifiedStrides.size() - 2]),
        innerLaneData);

    // If the source is a static memref, we need to extract the pointer to
    // base address. The pointer (an index) is cast to i64 so the new
    // descriptor is built from a raw address rather than the typed memref.
    if (memrefType && memrefType.hasStaticShape()) {
      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
          rewriter, createNdOp.getLoc(), source);
      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
                                          rewriter.getI64Type(),
                                          extractOp.getResult())
                   .getResult();
    }
    // Create a new CreateNdDescOp with the modified shape and converted type.
    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
        modifiedStrides);
    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
    return success();
  }
};
306
/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
/// transpose optimization. If so, rewrites the LoadNdOp to align with the
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
/// generated to fill in the original load shape.
class XeGPULoadNdDescOpPattern final
    : public OpConversionPattern<xegpu::LoadNdOp> {
public:
  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto origTensorDescType = loadNdOp.getTensorDescType();
    auto adaptorType =
        cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
    // Nothing to do unless the descriptor was rewritten by the
    // CreateNdDescOp pattern (i.e. its type changed during conversion).
    if (adaptorType == origTensorDescType)
      return failure();
    // Offsets must be adjusted based on innerLaneData.
    auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
    int64_t innerLaneData = laneData[1];
    auto offsets = loadNdOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadNdOp,
                                         "Expecting offsets in LoadNd");
    // Innermost offset shrinks by the packing factor, matching the narrower
    // (wider-element) descriptor produced by the CreateNdDescOp pattern.
    SmallVector<OpFoldResult> modifiedOffsets(offsets);
    modifiedOffsets.back() = divideByConstant(
        rewriter, loadNdOp.getLoc(),
        convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
        innerLaneData);
    // Get the 2D data shape of this loadNdOp in its original type including
    // array length.
    SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
    // Adjust the data shape based on innerLaneData.
    origDataShape.back() /= innerLaneData;
    // HW supported shape is the new tensor desc shape after conversion.
    SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
    VectorType origVectorType =
        VectorType::get(origDataShape, adaptorType.getElementType());
    Value data;
    // Orig data shape is 3D for the array length case.
    if (origTensorDescType.getArrayLength() > 1) {
      // One independent slice per array element; each slice is loaded
      // separately and bitcast back to the original element type.
      SmallVector<Value> arraySlices;
      for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
        // Zero-filled accumulator that generateLoads fills tile by tile.
        Value slice = arith::ConstantOp::create(
            rewriter, loadNdOp->getLoc(), origVectorType,
            rewriter.getZeroAttr(origVectorType));
        // Increase the Y offset for each array slice.
        // NOTE(review): this update is cumulative — iteration i adds
        // i * origDataShape[1] on top of the offset left by iteration i-1,
        // giving base, base+W, base+3W, ... for array lengths > 2. Confirm
        // whether base + i*W was intended (array length 2 is unaffected).
        Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
                                       modifiedOffsets.back());
        modifiedOffsets.back() =
            arith::AddIOp::create(
                rewriter, loadNdOp->getLoc(), offsetY,
                arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
                                               i * origDataShape[1])
                    .getResult())
                .getResult();
        slice = generateLoads(
            rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
            cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
            loadNdOp);
        // BitCast back to original load shape without array length.
        auto bitcastType = VectorType::get(origTensorDescType.getShape(),
                                           origTensorDescType.getElementType());
        auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                                   bitcastType, slice);
        // BitCastOp must have the same layout as the original loadNdOp.
        xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                                  origTensorDescType.getLayoutAttr());
        arraySlices.push_back(bitCastOp.getResult());
      }
      // 1:N replacement: downstream vector.extract ops pick the right slice
      // (see VectorExtractOpPattern).
      rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
      return success();
    }
    // Single-slice case: zero-filled accumulator, filled by generateLoads.
    data = arith::ConstantOp::create(
        rewriter, loadNdOp->getLoc(),
        VectorType::get(origDataShape, adaptorType.getElementType()),
        rewriter.getZeroAttr(origVectorType));
    data = generateLoads(
        rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
        cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
        loadNdOp);
    auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                               loadNdOp.getType(), data);
    // BitCastOp must have the same layout as the original loadNdOp.
    xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
                              origTensorDescType.getLayoutAttr());
    rewriter.replaceOp(loadNdOp, bitCastOp);
    return success();
  }
};
396
397/// Vector ExtractOp must be processed if the original tensor desc type has
398/// array length greater than 1. In this case, the LoadNdOp is replaced with
399/// multiple LoadNdOps for each array slice making the extraction unnecessary.
400/// In this case, we simply remove the ExtractOp.
401class VectorExtractOpPattern final
402 : public OpConversionPattern<vector::ExtractOp> {
403public:
404 using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
405 LogicalResult
406 matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
407 ConversionPatternRewriter &rewriter) const override {
408 // Check if the source of the extraction is split to multiple values.
409 if (adaptor.getSource().size() == 1)
410 return failure();
411 auto mixedPos = extractOp.getMixedPosition();
412 if (mixedPos.size() != 1)
413 return failure();
414 auto mayBeInt = getConstantIntValue(mixedPos[0]);
415 if (!mayBeInt)
416 return failure();
417 rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
418 return success();
419 }
420};
421
422/// Performs a reduction over 2 dimensions by decomposing it into two 1D
423/// reductions ordered based on layout to minimize cross-lane communication.
424class MultiRed2dOpPattern
425 : public OpConversionPattern<vector::MultiDimReductionOp> {
426 using OpConversionPattern::OpConversionPattern;
427 LogicalResult
428 matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
429 ConversionPatternRewriter &rewriter) const override {
430 auto sourceVecType = reductionOp.getSourceVectorType();
431 if (reductionOp.getReductionDims().size() != 2)
432 return rewriter.notifyMatchFailure(reductionOp, "Expected 2D reduction");
433 auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
434 // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
435 auto dims = llvm::to_vector(reductionOp.getReductionDims());
436 auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
437 // Order does not matter
438 if (intraLaneDim == -1 || crossLaneDim == -1) {
439 intraLaneDim = dims[0];
440 crossLaneDim = dims[1];
441 }
442 auto loc = reductionOp.getLoc();
443 auto acc = reductionOp.getAcc();
444
445 SmallVector<int64_t> accShape(sourceVecType.getShape());
446 accShape.erase(accShape.begin() + intraLaneDim);
447 Type eTy = sourceVecType.getElementType();
448 Value constNeutralVal = xegpu::createReductionNeutralValue(
449 rewriter, loc, VectorType::get(accShape, eTy), reductionOp.getKind());
450
451 Value intraLaneReduced = vector::MultiDimReductionOp::create(
452 rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
453 constNeutralVal, ArrayRef<int64_t>(intraLaneDim));
454
455 // Adjust crossLaneDim after the first reduction.
456 if (crossLaneDim > intraLaneDim)
457 crossLaneDim -= 1;
458 Value crossLaneReduced = vector::MultiDimReductionOp::create(
459 rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
460 ArrayRef<int64_t>(crossLaneDim));
461 assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
462 "Type mismatch");
463 rewriter.replaceOp(reductionOp, crossLaneReduced);
464 return success();
465 }
466
467private:
468 std::pair<int64_t, int64_t>
469 getReductionDimOrder(ArrayRef<int64_t> reductionDims,
470 xegpu::DistributeLayoutAttr layout) const {
471 assert(layout.isForSubgroup() && "Must know the lane layout");
472 assert(reductionDims.size() == 2 && "Expected 2D reduction");
473 int64_t intra, cross = -1;
474 xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
475 if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
476 layoutAttr =
477 dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
478 assert(layoutAttr);
479 SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
480
481 assert(laneLayout.size() && "Expected a non-empty layout");
482 // try to pick a dim that does not communicate
483 for (auto dim : reductionDims) {
484 if (laneLayout[dim] == 1)
485 intra = dim;
486 else
487 cross = dim;
488 }
489 return {intra, cross};
490 }
491};
492
493} // namespace
494
496 RewritePatternSet &patterns) {
497 patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
498 VectorExtractOpPattern, MultiRed2dOpPattern>(
499 patterns.getContext());
500}
501
502namespace {
503
504struct XeGPUPeepHoleOptimizerPass final
506 XeGPUPeepHoleOptimizerPass> {
507 void runOnOperation() override {
508 MLIRContext &context = getContext();
509 TypeConverter converter;
510 RewritePatternSet patterns(&context);
511 ConversionTarget target(context);
512
513 // This pass is only meant for PVC and BMG targets. If unsupported target
514 // is found, exit early.
515 bool isTargetSupported = false;
516 getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
517 auto chipStr = xegpu::getChipStr(funcOp);
518 if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
519 isTargetSupported = true;
520 });
521
522 if (!isTargetSupported) {
523 DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
524 << "\n";
525 return;
526 }
527
528 // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
529 // converted.
530 target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
531 [&](xegpu::CreateNdDescOp createNdOp) {
532 return !canBeOptimizedForTranspose(createNdOp.getType());
533 });
534 target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
535 [&](xegpu::LoadNdOp loadNdOp) {
536 return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
537 });
538 // Vector ExtractOps can have optimizable layouts if they extract from
539 // LoadNdOps with array length greater than 1. These ExtractOps must be
540 // converted.
541 target.addDynamicallyLegalOp<vector::ExtractOp>(
542 [&](vector::ExtractOp extractOp) {
543 auto layout = xegpu::getTemporaryLayout(
544 dyn_cast<OpResult>(extractOp.getResult()));
545 if (!layout)
546 return true;
547 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
548 auto laneData = layout.getEffectiveLaneDataAsInt();
549 return !canBeOptimizedForTranspose(laneLayout, laneData);
550 });
551
552 target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
553 [=](Operation *op) -> bool {
554 auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
555 if (!layout || !layout.isForSubgroup())
556 return true;
557 if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
558 return reductionOp.getReductionDims().size() != 2;
559 return true;
560 });
561
562 converter.addConversion([](Type type) { return type; });
563
564 target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
565 vector::VectorDialect>();
567 target);
569 if (failed(applyPartialConversion(getOperation(), target,
570 std::move(patterns)))) {
571 DBGS() << "Optimize block loads pass failed.\n";
572 return signalPassFailure();
573 }
574
575 // Apply folding for cleaning up IR.
576 MLIRContext *ctx = &getContext();
577 RewritePatternSet emptyPatterns(ctx);
578 (void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));
579 }
580};
581
582} // namespace
return success()
b getContext())
#define DBGS()
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents a single result from folding an operation.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:363
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
const uArch * getUArch(llvm::StringRef archName)
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns)
Appends patterns for optimizing block load operations into patterns.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:163