MLIR 23.0.0git
XeGPUPeepHoleOptimizer.cpp
Go to the documentation of this file.
1//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
25#include "mlir/IR/Types.h"
26#include "mlir/IR/Value.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include <optional>
32
33namespace mlir {
34namespace xegpu {
35#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
36#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
37} // namespace xegpu
38} // namespace mlir
39
40#define DEBUG_TYPE "xegpu-optimize-peephole"
41#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
42
43using namespace mlir;
44
45namespace {
46
47/// Get the 2D lane data from a tensor desc type if it exists.
48static std::optional<SmallVector<int64_t>>
49getMaybeLaneData(xegpu::TensorDescType tdescType) {
50 auto layout = tdescType.getLayoutAttr();
51 if (!layout)
52 return std::nullopt;
53 auto laneData = layout.getEffectiveLaneDataAsInt();
54 if (laneData.size() != 2)
55 return std::nullopt;
56 return laneData;
57}
58
59/// Get the 2D lane layout from a tensor desc type if it exists.
60static std::optional<SmallVector<int64_t>>
61getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
62 auto layout = tdescType.getLayoutAttr();
63 if (!layout)
64 return std::nullopt;
65 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
66 if (laneLayout.size() != 2)
67 return std::nullopt;
68 return laneLayout;
69}
70
71/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
72/// lane[1] == 1), but inner lane data is not equal to [1, 1].
73/// Example:
74/// !xegpu.tensor_desc<16x16xf16,
75/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
76/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
77/// indicating that this is a load that requires transpose effect. However,
78/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
79/// the inner dimension. We convert this to a optimized form by converting the
80/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
81/// later lowering easily use the load with transpose instruction.
82static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
83 ArrayRef<int64_t> laneData) {
84 if (laneLayout.size() != 2 || laneData.size() != 2)
85 return false;
86 if (laneLayout[0] == 1 || laneLayout[1] != 1)
87 return false;
88 if (laneData[0] != 1 || laneData[1] == 1)
89 return false;
90 return true;
91}
92
93/// A tensor desc type can be optimized if its element type is less than 32 bits
94/// and its layout can be optimized.
95static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
96 // If the dtype is greater or equal to 32 bits, layout must be valid.
97 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
98 if (elementTyBitwidth >= 32)
99 return false;
100 auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
101 auto maybeLaneData = getMaybeLaneData(tdescType);
102 if (!maybeLaneData || !maybeLaneLayout)
103 return false;
104 return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
105}
106
107/// Check if a tensor desc type can be optimized for transpose, if so return the
108/// new optimized tensor desc type with a valid transpose layout.
109static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
110 const uArch *targetuArch) {
111 if (!canBeOptimizedForTranspose(tdescType))
112 return tdescType;
113 auto laneData = getMaybeLaneData(tdescType)
114 .value(); // Lane data must exist if we reach here.
115 int64_t innerLaneData = laneData[1];
116 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
117 // Required shape is total shape of the vector result that this tensor desc
118 // must eventually load after adjusting for the new bitwidth and array
119 // length.
120 SmallVector<int64_t> requiredShape(tdescType.getShape());
121 requiredShape.back() =
122 requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
123 int newBitWidth = elementTyBitwidth * innerLaneData;
124 Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
125 // Supported shape is the max transpose shape that can be supported by
126 // hardware that is less than or equal to required shape.
127 auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
128 targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
129 auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
130 newElemTy, /** has transform */ false, /** has transpose */ true);
131 // If no HW params found, return the original type.
132 if (!maybeHWParams)
133 return tdescType;
134 auto [widths, heights, counts] = maybeHWParams.value();
135 // TODO: Currently we expect array length to be 1 for transpose case.
136 if (counts.size() != 1 || counts[0] != 1)
137 return tdescType;
138 int arrayLen = counts[0];
139 int supportedHeight =
140 xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
141 int supportedWidth =
142 xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
143 // If no supported height or width found, return the original type.
144 if (supportedHeight == -1 || supportedWidth == -1)
145 return tdescType;
146
147 SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
148 auto ctx = tdescType.getContext();
149 auto origLayout = tdescType.getLayoutAttr();
150 auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
151 SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
152 laneLayoutI64.end());
153
154 xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
155 ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
156 /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
157 /*order=*/origLayout.getOrder());
158
159 // Array length can not be larger than 1 for transpose case.
160 return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
161 tdescType.getBoundaryCheck(),
162 tdescType.getMemorySpace(), newLayout);
163}
164
165/// Helper to convert an OpFoldResult to Value.
166static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
167 OpFoldResult ofr) {
168 std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
169 if (mayBeInt)
170 return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
171 return llvm::cast<Value>(ofr);
172}
173
174/// Helper to divide a Value by a constant integer.
175static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
176 Value val, int64_t constant) {
177 // If the constant is a power of 2, use right shift for division.
178 if (llvm::isPowerOf2_64(constant)) {
179 int64_t shiftAmount = llvm::Log2_64(constant);
180 return arith::ShRUIOp::create(
181 rewriter, loc, val,
182 arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
183 .getResult())
184 .getResult();
185 }
186 auto constantOp =
187 arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
188 return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
189}
190
191/// This function takes a larger register block `data` and generates multiple
192/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
193/// starting from `offsets`.
/// The last two entries of `offsets` are used as the dim-0 and dim-1 origin.
/// One load is emitted per tile of the `data` shape divided by the
/// `newTensorDesc` shape; each loaded tile is inserted into `data` at its
/// tile position and the fully populated vector is returned.
/// NOTE(review): the parameter lines between `rewriter` and `origLoadOp`
/// (listing lines 195-197) are absent from this excerpt; the body uses them
/// as `data`, `offsets` and `newTensorDesc` — confirm against the real file.
194static Value generateLoads(ConversionPatternRewriter &rewriter,
198 xegpu::LoadNdOp origLoadOp) {
199 Location loc = data.getLoc();
200 assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
// Only the two innermost offsets are consumed here.
201 Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
202 Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
203 SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
204 // Compute the ratio between original shape and supported shape. We need to
205 // generate loads in this ratio arrangement.
206 auto shapeRatio = computeShapeRatio(data.getType().getShape(),
207 supportedShape)
208 .value(); // `ratio` must be defined if we reach here.
// Walk the tile grid: `h` steps along dim 0, `w` along dim 1.
209 for (int64_t h = 0; h < shapeRatio[0]; ++h) {
210 for (int64_t w = 0; w < shapeRatio[1]; ++w) {
// Position of this tile inside `data`, in elements.
211 int64_t localOffsetDim0 = h * supportedShape[0];
212 int64_t localOffsetDim1 = w * supportedShape[1];
// Memory offsets of the tile: caller-provided origin plus the tile position.
// Note that despite the X/Y names, `loadOffsetX` is the dim-0 offset and
// `loadOffsetY` is the dim-1 offset.
213 Value loadOffsetX = arith::AddIOp::create(
214 rewriter, loc, offsetDim0,
215 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
216 .getResult());
217 Value loadOffsetY = arith::AddIOp::create(
218 rewriter, loc, offsetDim1,
219 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
220 .getResult());
// The tile load copies the packed/transpose/cache-hint/layout attributes of
// the original load and produces a vector of the tile shape with the same
// element type as `data`.
221 auto loadOp = xegpu::LoadNdOp::create(
222 rewriter, loc,
223 VectorType::get(supportedShape, data.getType().getElementType()),
224 newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
225 origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
226 origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
227 origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
228 // Set the layout for the loadOp.
229 auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
230 loadOp.setAnchorLayout(layoutAttr);
231 // Insert the loaded block into the right position in data.
232 auto insertOp = vector::InsertStridedSliceOp::create(
233 rewriter, loc, loadOp.getResult(), data,
234 ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
235 ArrayRef<int64_t>{1, 1});
236 // InsertOp must have the same layout as newTensorDesc.
237 xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
// Thread the partially filled vector into the next iteration.
238 data = insertOp.getResult();
239 }
240 }
241 return data;
242}
243
244/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
245/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
246/// the base pointer from the original memory source and adjusting the shape and
247/// strides of the tensor desc to fit with the new optimized transpose layout.
248class XeGPUCreateNdDescOpPattern final
249 : public OpConversionPattern<xegpu::CreateNdDescOp> {
250public:
251 using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
252 LogicalResult
253 matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
254 ConversionPatternRewriter &rewriter) const override {
255 auto tdescTy = createNdOp.getType();
256 // Get the target uArch info.
257 auto chipStr = xegpu::getChipStr(createNdOp);
258 // Check if the chip is supported.
259 assert(chipStr &&
260 (chipStr.value() == "pvc" || chipStr.value() == "bmg" ||
261 chipStr.value() == "cri") &&
262 "Expecting target chip to be pvc, bmg or cri for transpose "
263 "optimization.");
264 const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
265
266 auto convertType = tryOptimize(tdescTy, targetuArch);
267 if (convertType == tdescTy)
268 return failure();
269 auto strides = createNdOp.getMixedStrides();
270 auto maybeConstInnerStride = getConstantIntValue(strides.back());
271 // Only row-major memrefs are expected for now.
272 if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
273 return rewriter.notifyMatchFailure(
274 createNdOp, "Expecting row-major memref for transpose optimization.");
275 Value source = createNdOp.getSource();
276 auto optionalLaneData = getMaybeLaneData(tdescTy);
277 assert(optionalLaneData && "Expected 2D lane data");
278 auto laneData = optionalLaneData.value();
279 int64_t innerLaneData = laneData[1];
280 auto memrefType = dyn_cast<MemRefType>(source.getType());
281 // Inner dimension of the shape must be adjusted based on innerLaneData.
282 SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
283 modifiedShape.back() = divideByConstant(
284 rewriter, createNdOp.getLoc(),
285 convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
286 innerLaneData);
287 // Similarly, second to last stride must be adjusted.
288 assert(strides.size() >= 2 &&
289 "Expected at least 2 strides for CreateNdDescOp");
290 SmallVector<OpFoldResult> modifiedStrides(strides);
291 modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
292 rewriter, createNdOp.getLoc(),
293 convertToValue(rewriter, createNdOp.getLoc(),
294 modifiedStrides[modifiedStrides.size() - 2]),
295 innerLaneData);
296
297 // If the source is a static memref, we need to extract the pointer to
298 // base address.
299 if (memrefType && memrefType.hasStaticShape()) {
300 auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
301 rewriter, createNdOp.getLoc(), source);
302 source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
303 rewriter.getI64Type(),
304 extractOp.getResult())
305 .getResult();
306 }
307 // Create a new CreateNdDescOp with the modified shape and converted type.
308 auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
309 rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
310 modifiedStrides);
311 rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
312 return success();
313 }
314};
315
316/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
317/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the
318/// adjusted tensor desc type. This can result in multiple LoadNdOps being
319/// generated to fill in the original load shape.
320class XeGPULoadNdDescOpPattern final
321 : public OpConversionPattern<xegpu::LoadNdOp> {
322public:
323 using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
324 LogicalResult
325 matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
326 ConversionPatternRewriter &rewriter) const override {
327 auto origTensorDescType = loadNdOp.getTensorDescType();
328 auto adaptorType =
329 cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
330 if (adaptorType == origTensorDescType)
331 return failure();
332 // Offsets must be adjusted based on innerLaneData.
333 auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
334 int64_t innerLaneData = laneData[1];
335 auto offsets = loadNdOp.getMixedOffsets();
336 if (offsets.empty())
337 return rewriter.notifyMatchFailure(loadNdOp,
338 "Expecting offsets in LoadNd");
339 SmallVector<OpFoldResult> modifiedOffsets(offsets);
340 modifiedOffsets.back() = divideByConstant(
341 rewriter, loadNdOp.getLoc(),
342 convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
343 innerLaneData);
344 // Get the 2D data shape of this loadNdOp in its original type including
345 // array length.
346 SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
347 // Adjust the data shape based on innerLaneData.
348 origDataShape.back() /= innerLaneData;
349 // HW supported shape is the new tensor desc shape after conversion.
350 SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
351 VectorType origVectorType =
352 VectorType::get(origDataShape, adaptorType.getElementType());
353 Value data;
354 // Orig data shape is 3D for the array length case.
355 if (origTensorDescType.getArrayLength() > 1) {
356 SmallVector<Value> arraySlices;
357 for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
358 Value slice = arith::ConstantOp::create(
359 rewriter, loadNdOp->getLoc(), origVectorType,
360 rewriter.getZeroAttr(origVectorType));
361 // Increase the Y offset for each array slice.
362 Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
363 modifiedOffsets.back());
364 modifiedOffsets.back() =
365 arith::AddIOp::create(
366 rewriter, loadNdOp->getLoc(), offsetY,
367 arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
368 i * origDataShape[1])
369 .getResult())
370 .getResult();
371 slice = generateLoads(
372 rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
373 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
374 loadNdOp);
375 // BitCast back to original load shape without array length.
376 auto bitcastType = VectorType::get(origTensorDescType.getShape(),
377 origTensorDescType.getElementType());
378 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
379 bitcastType, slice);
380 // BitCastOp must have the same layout as the original loadNdOp.
381 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
382 origTensorDescType.getLayoutAttr());
383 arraySlices.push_back(bitCastOp.getResult());
384 }
385 rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
386 return success();
387 }
388 data = arith::ConstantOp::create(
389 rewriter, loadNdOp->getLoc(),
390 VectorType::get(origDataShape, adaptorType.getElementType()),
391 rewriter.getZeroAttr(origVectorType));
392 data = generateLoads(
393 rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
394 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
395 loadNdOp);
396 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
397 loadNdOp.getType(), data);
398 // BitCastOp must have the same layout as the original loadNdOp.
399 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
400 origTensorDescType.getLayoutAttr());
401 rewriter.replaceOp(loadNdOp, bitCastOp);
402 return success();
403 }
404};
405
406/// Vector ExtractOp must be processed if the original tensor desc type has
407/// array length greater than 1. In this case, the LoadNdOp is replaced with
408/// multiple LoadNdOps for each array slice making the extraction unnecessary.
409/// In this case, we simply remove the ExtractOp.
410class VectorExtractOpPattern final
411 : public OpConversionPattern<vector::ExtractOp> {
412public:
413 using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
414 LogicalResult
415 matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
416 ConversionPatternRewriter &rewriter) const override {
417 // Check if the source of the extraction is split to multiple values.
418 if (adaptor.getSource().size() == 1)
419 return failure();
420 auto mixedPos = extractOp.getMixedPosition();
421 if (mixedPos.size() != 1)
422 return failure();
423 auto mayBeInt = getConstantIntValue(mixedPos[0]);
424 if (!mayBeInt)
425 return failure();
426 rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
427 return success();
428 }
429};
430
431/// Performs a reduction over 2 dimensions by decomposing it into two 1D
432/// reductions ordered based on layout to minimize cross-lane communication.
433class MultiRed2dOpPattern
434 : public OpConversionPattern<vector::MultiDimReductionOp> {
435 using OpConversionPattern::OpConversionPattern;
436 LogicalResult
437 matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
438 ConversionPatternRewriter &rewriter) const override {
439 auto sourceVecType = reductionOp.getSourceVectorType();
440 if (reductionOp.getReductionDims().size() != 2)
441 return rewriter.notifyMatchFailure(reductionOp, "Expected 2D reduction");
442 auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
443 // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
444 auto dims = llvm::to_vector(reductionOp.getReductionDims());
445 auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
446 // Order does not matter
447 if (intraLaneDim == -1 || crossLaneDim == -1) {
448 intraLaneDim = dims[0];
449 crossLaneDim = dims[1];
450 }
451 auto loc = reductionOp.getLoc();
452 auto acc = reductionOp.getAcc();
453
454 // The decomposition below splits the 2D reduction into an intra-lane
455 // then a cross-lane 1D reduction. The natural result layout of the
456 // decomposed sequence (a doubly-sliced layout) differs from the
457 // original 2D reduction's result layout that the rest of the IR was
458 // written/propagated against. To keep the post-peephole IR
459 // self-consistent without depending on a follow-up layout
460 // propagation pass, we always insert a bridge xegpu.convert_layout
461 // from the natural post-decomposition layout to the original
462 // reduction's result layout. Trivial bridges fold away in
463 // canonicalization.
464 xegpu::DistributeLayoutAttr postDecompLayout;
465 if (resLayout) {
466 // Derive the source vector's layout.
467 xegpu::DistributeLayoutAttr srcLayoutForCvt;
468 if (auto resSlice = dyn_cast_if_present<xegpu::SliceAttr>(resLayout))
469 srcLayoutForCvt = resSlice.getParent();
470 if (!srcLayoutForCvt)
471 srcLayoutForCvt =
472 xegpu::getDistributeLayoutAttr(reductionOp.getSource());
473 if (srcLayoutForCvt) {
474 // The natural layout of the post-decomposition reduction result
475 // is a nested SliceAttr: REDUCE_1 (reduces `intraLaneDim` from
476 // the source) yields `slice<src, [intraLaneDim]>`; REDUCE_2
477 // then reduces `adjCrossLaneDim` from that intermediate, giving
478 // `slice<slice<src, [intraLaneDim]>, [adjCrossLaneDim]>`.
479 MLIRContext *ctx = reductionOp.getContext();
480 int64_t adjCrossLaneDim =
481 crossLaneDim > intraLaneDim ? crossLaneDim - 1 : crossLaneDim;
482 auto intermediateLayout = xegpu::SliceAttr::get(
483 ctx, srcLayoutForCvt, DenseI64ArrayAttr::get(ctx, {intraLaneDim}));
484 postDecompLayout = xegpu::SliceAttr::get(
485 ctx, intermediateLayout,
486 DenseI64ArrayAttr::get(ctx, {adjCrossLaneDim}));
487 }
488 }
489
490 SmallVector<int64_t> accShape(sourceVecType.getShape());
491 accShape.erase(accShape.begin() + intraLaneDim);
492 Type eTy = sourceVecType.getElementType();
494 rewriter, loc, VectorType::get(accShape, eTy), reductionOp.getKind());
495
496 Value intraLaneReduced = vector::MultiDimReductionOp::create(
497 rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
498 constNeutralVal, ArrayRef<int64_t>(intraLaneDim));
499
500 // Adjust crossLaneDim after the first reduction.
501 if (crossLaneDim > intraLaneDim)
502 crossLaneDim -= 1;
503 Value crossLaneReduced = vector::MultiDimReductionOp::create(
504 rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
505 ArrayRef<int64_t>(crossLaneDim));
506 assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
507 "Type mismatch");
508
509 Value replacement = crossLaneReduced;
510 if (resLayout && postDecompLayout) {
511 // Bridge from the natural post-decomposition layout to the
512 // original reduction's result layout. This preserves the contract
513 // any consumer (convert_layout, anchor op, or otherwise) was
514 // written against, so the rewrite is correct independent of
515 // whether layout propagation runs afterwards.
516 auto bridgeOp = xegpu::ConvertLayoutOp::create(
517 rewriter, loc, crossLaneReduced.getType(), crossLaneReduced,
518 postDecompLayout, resLayout);
519 replacement = bridgeOp.getResult();
520 }
521
522 rewriter.replaceOp(reductionOp, replacement);
523 return success();
524 }
525
526private:
527 std::pair<int64_t, int64_t>
528 getReductionDimOrder(ArrayRef<int64_t> reductionDims,
529 xegpu::DistributeLayoutAttr layout) const {
530 assert(layout.isForSubgroup() && "Must know the lane layout");
531 assert(reductionDims.size() == 2 && "Expected 2D reduction");
532 int64_t intra, cross = -1;
533 xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
534 if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
535 layoutAttr =
536 dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
537 assert(layoutAttr);
538 SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
539
540 assert(laneLayout.size() && "Expected a non-empty layout");
541 // try to pick a dim that does not communicate
542 for (auto dim : reductionDims) {
543 if (laneLayout[dim] == 1)
544 intra = dim;
545 else
546 cross = dim;
547 }
548 return {intra, cross};
549 }
550};
551
552} // namespace
553
555 RewritePatternSet &patterns) {
// Adds the four conversion patterns defined in this file
// (XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
// VectorExtractOpPattern and MultiRed2dOpPattern) to `patterns`, each
// constructed with the context owned by `patterns`.
// NOTE(review): the listing line carrying this function's return type and
// name (listing line 554) is absent from this excerpt — confirm against the
// real file.
556 patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
557 VectorExtractOpPattern, MultiRed2dOpPattern>(
558 patterns.getContext());
559}
560
561namespace {
562
/// Pass driver. In order, `runOnOperation`:
///   1. returns early unless at least one gpu function reports a supported
///      chip ("pvc", "bmg" or "cri"),
///   2. runs a greedy rewrite with the array-length pattern set,
///   3. configures dynamic legality for the ops this file rewrites and runs a
///      partial dialect conversion,
///   4. runs a greedy rewrite with an empty pattern set to fold/clean up, and
///   5. strips the temporary layout attributes.
/// NOTE(review): listing lines 592, 641 and 643 are absent from this excerpt;
/// they presumably populate `arrayLenPatterns` and `patterns` — confirm
/// against the real file.
563struct XeGPUPeepHoleOptimizerPass final
564 : public xegpu::impl::XeGPUPeepHoleOptimizerBase<
565 XeGPUPeepHoleOptimizerPass> {
566 void runOnOperation() override {
567 MLIRContext &context = getContext();
568 TypeConverter converter;
569 RewritePatternSet patterns(&context);
570 ConversionTarget target(context);
571
572 // This pass is only meant for PVC, BMG or CRI targets. If unsupported
573 // target is found, exit early.
// A single supported function is enough to enable the pass for the whole
// operation; functions are not filtered individually here.
574 bool isTargetSupported = false;
575 getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
576 auto chipStr = xegpu::getChipStr(funcOp);
577 if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg" ||
578 chipStr.value() == "cri"))
579 isTargetSupported = true;
580 });
581
// Unsupported target: leave the IR untouched without signalling failure.
// NOTE(review): the message below names only PVC and BMG although "cri" is
// accepted by the check above.
582 if (!isTargetSupported) {
583 DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC, BMG targets."
584 << "\n";
585 return;
586 }
587
588 // Run array length optimization patterns first so that subsequent transpose
589 // peephole patterns operate on the array-length-optimized tensor descs.
590 {
591 RewritePatternSet arrayLenPatterns(&context);
593 if (failed(applyPatternsGreedily(getOperation(),
594 std::move(arrayLenPatterns)))) {
595 DBGS() << "Array length optimization patterns failed.\n";
596 return signalPassFailure();
597 }
598 }
599
600 // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
601 // converted.
// These ops are legal exactly when their tensor desc type is NOT a transpose
// optimization candidate.
602 target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
603 [&](xegpu::CreateNdDescOp createNdOp) {
604 return !canBeOptimizedForTranspose(createNdOp.getType());
605 });
606 target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
607 [&](xegpu::LoadNdOp loadNdOp) {
608 return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
609 });
610 // Vector ExtractOps can have optimizable layouts if they extract from
611 // LoadNdOps with array length greater than 1. These ExtractOps must be
612 // converted.
// An extract without a temporary layout on its result is always legal.
613 target.addDynamicallyLegalOp<vector::ExtractOp>(
614 [&](vector::ExtractOp extractOp) {
615 auto layout = xegpu::getTemporaryLayout(
616 dyn_cast<OpResult>(extractOp.getResult()));
617 if (!layout)
618 return true;
619 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
620 auto laneData = layout.getEffectiveLaneDataAsInt();
621 return !canBeOptimizedForTranspose(laneLayout, laneData);
622 });
623
// A multi-dim reduction is illegal (must be rewritten) only when its result
// has a subgroup-level layout and it reduces exactly two dimensions.
624 target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
625 [=](Operation *op) -> bool {
626 auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
627 if (!layout || !layout.isForSubgroup())
628 return true;
629 if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
630 return reductionOp.getReductionDims().size() != 2;
631 return true;
632 });
633
// Identity type conversion: every type maps to itself.
634 converter.addConversion([](Type type) { return type; });
635
636 target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
637 vector::VectorDialect>();
638 // xegpu.convert_layout is left untouched by this pass; mark it legal
639 // so in-place updates don't trigger re-legalization failures.
640 target.addLegalOp<xegpu::ConvertLayoutOp>();
642 target);
644 if (failed(applyPartialConversion(getOperation(), target,
645 std::move(patterns)))) {
646 DBGS() << "Optimize block loads pass failed.\n";
647 return signalPassFailure();
648 }
649
650 // Apply folding for cleaning up IR.
// The result of this cleanup run is deliberately ignored.
651 MLIRContext *ctx = &getContext();
652 RewritePatternSet emptyPatterns(ctx);
653 (void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));
654
655 xegpu::removeTemporaryLayoutAttrs(getOperation());
656 }
657};
658
659} // namespace
return success()
b getContext())
*if copies could not be generated due to yet unimplemented cases *copyInPlacementStart and copyOutPlacementStart in copyPlacementBlock *specify the insertion points where the incoming copies and outgoing should be the output argument nBegin is set to its * replacement(set to `begin` if no invalidation happens). Since outgoing *copies could have been inserted at `end`
#define DBGS()
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents a single result from folding an operation.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:384
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
const uArch * getUArch(llvm::StringRef archName)
void populateXeGPUArrayLengthOptimizationPatterns(RewritePatternSet &patterns)
Appends patterns for array length optimization into patterns.
void removeTemporaryLayoutAttrs(Operation *op)
Removes the temporary layout attributes for each OpOperand and OpResult of the given operation.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns)
Appends patterns for optimizing block load operations into patterns.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:168