MLIR 23.0.0git
XeGPUPeepHoleOptimizer.cpp
Go to the documentation of this file.
1//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
25#include "mlir/IR/Types.h"
26#include "mlir/IR/Value.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include <optional>
32
33namespace mlir {
34namespace xegpu {
35#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
36#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
37} // namespace xegpu
38} // namespace mlir
39
40#define DEBUG_TYPE "xegpu-optimize-peephole"
41#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
42
43using namespace mlir;
44
45namespace {
46
47/// Get the 2D lane data from a tensor desc type if it exists.
48static std::optional<SmallVector<int64_t>>
49getMaybeLaneData(xegpu::TensorDescType tdescType) {
50 auto layout = tdescType.getLayoutAttr();
51 if (!layout)
52 return std::nullopt;
53 auto laneData = layout.getEffectiveLaneDataAsInt();
54 if (laneData.size() != 2)
55 return std::nullopt;
56 return laneData;
57}
58
59/// Get the 2D lane layout from a tensor desc type if it exists.
60static std::optional<SmallVector<int64_t>>
61getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
62 auto layout = tdescType.getLayoutAttr();
63 if (!layout)
64 return std::nullopt;
65 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
66 if (laneLayout.size() != 2)
67 return std::nullopt;
68 return laneLayout;
69}
70
71/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
72/// lane[1] == 1), but inner lane data is not equal to [1, 1].
73/// Example:
74/// !xegpu.tensor_desc<16x16xf16,
75/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
76/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
77/// indicating that this is a load that requires transpose effect. However,
78/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
79/// the inner dimension. We convert this to a optimized form by converting the
80/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
81/// later lowering easily use the load with transpose instruction.
82static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
83 ArrayRef<int64_t> laneData) {
84 if (laneLayout.size() != 2 || laneData.size() != 2)
85 return false;
86 if (laneLayout[0] == 1 || laneLayout[1] != 1)
87 return false;
88 if (laneData[0] != 1 || laneData[1] == 1)
89 return false;
90 return true;
91}
92
93/// A tensor desc type can be optimized if its element type is less than 32 bits
94/// and its layout can be optimized.
95static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
96 // If the dtype is greater or equal to 32 bits, layout must be valid.
97 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
98 if (elementTyBitwidth >= 32)
99 return false;
100 auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
101 auto maybeLaneData = getMaybeLaneData(tdescType);
102 if (!maybeLaneData || !maybeLaneLayout)
103 return false;
104 return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
105}
106
107/// Check if a tensor desc type can be optimized for transpose, if so return the
108/// new optimized tensor desc type with a valid transpose layout.
/// Returns `tdescType` unchanged whenever one of the checks below fails: the
/// type is not optimizable, the target reports no block parameters, the
/// reported array-length counts are not exactly {1}, or no supported
/// height/width divides the required shape.
109static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
110 const uArch *targetuArch) {
111 if (!canBeOptimizedForTranspose(tdescType))
112 return tdescType;
113 auto laneData = getMaybeLaneData(tdescType)
114 .value(); // Lane data must exist if we reach here.
115 int64_t innerLaneData = laneData[1];
116 int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
117 // Required shape is total shape of the vector result that this tensor desc
118 // must eventually load after adjusting for the new bitwidth and array
119 // length.
120 SmallVector<int64_t> requiredShape(tdescType.getShape());
121 requiredShape.back() =
122 requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
// The new element is an integer wide enough to hold `innerLaneData` of the
// original elements.
123 int newBitWidth = elementTyBitwidth * innerLaneData;
124 Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
125 // Supported shape is the max transpose shape that can be supported by
126 // hardware that is less than or equal to required shape.
// NOTE(review): the dyn_cast result is dereferenced below without a null
// check — confirm the target always provides this instruction.
127 auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
129 auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
130 newElemTy, /** has transform */ false, /** has transpose */ true);
131 // If no HW params found, return the original type.
132 if (!maybeHWParams)
133 return tdescType;
134 auto [widths, heights, counts] = maybeHWParams.value();
135 // TODO: Currently we expect array length to be 1 for transpose case.
136 if (counts.size() != 1 || counts[0] != 1)
137 return tdescType;
138 int arrayLen = counts[0];
139 int supportedHeight =
140 xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
141 int supportedWidth =
142 xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
143 // If no supported height or width found, return the original type.
144 if (supportedHeight == -1 || supportedWidth == -1)
145 return tdescType;
146
147 SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
148 auto ctx = tdescType.getContext();
149 auto origLayout = tdescType.getLayoutAttr();
// The layout attribute is built from i32 arrays, so narrow the i64 lane
// layout first.
150 auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
151 SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
152 laneLayoutI64.end());
153
// The new layout keeps the original lane layout and order but forces lane
// data to [1, 1].
154 xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
155 ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
156 /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
157 /*order=*/origLayout.getOrder());
159 // Array length can not be larger than 1 for transpose case.
160 return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
161 tdescType.getBoundaryCheck(),
162 tdescType.getMemorySpace(), newLayout);
163}
165/// Helper to convert an OpFoldResult to Value.
166static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
167 OpFoldResult ofr) {
168 std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
169 if (mayBeInt)
170 return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
171 return llvm::cast<Value>(ofr);
172}
173
174/// Helper to divide a Value by a constant integer.
175static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
176 Value val, int64_t constant) {
177 // If the constant is a power of 2, use right shift for division.
178 if (llvm::isPowerOf2_64(constant)) {
179 int64_t shiftAmount = llvm::Log2_64(constant);
180 return arith::ShRUIOp::create(
181 rewriter, loc, val,
182 arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
183 .getResult())
184 .getResult();
185 }
186 auto constantOp =
187 arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
188 return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
189}
190
191/// This function takes a larger register block `data` and generates multiple
192/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
193/// starting from `offsets`.
/// Only the last two entries of `offsets` are used. Each generated load copies
/// its packed/transpose/cache-hint/layout attributes from `origLoadOp`. The
/// return value is `data` after every loaded tile has been inserted.
194static Value generateLoads(ConversionPatternRewriter &rewriter,
198 xegpu::LoadNdOp origLoadOp) {
199 Location loc = data.getLoc();
200 assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
201 Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
202 Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
203 SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
204 // Compute the ratio between original shape and supported shape. We need to
205 // generate loads in this ratio arrangement.
206 auto shapeRatio = computeShapeRatio(data.getType().getShape(),
207 supportedShape)
208 .value(); // `ratio` must be defined if we reach here.
// One load per (h, w) tile; the tile's origin inside `data` is
// (h * tileHeight, w * tileWidth).
209 for (int64_t h = 0; h < shapeRatio[0]; ++h) {
210 for (int64_t w = 0; w < shapeRatio[1]; ++w) {
211 int64_t localOffsetDim0 = h * supportedShape[0];
212 int64_t localOffsetDim1 = w * supportedShape[1];
// Note: "X" below is the dim-0 offset and "Y" is the dim-1 offset.
213 Value loadOffsetX = arith::AddIOp::create(
214 rewriter, loc, offsetDim0,
215 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
216 .getResult());
217 Value loadOffsetY = arith::AddIOp::create(
218 rewriter, loc, offsetDim1,
219 arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
220 .getResult());
221 auto loadOp = xegpu::LoadNdOp::create(
222 rewriter, loc,
223 VectorType::get(supportedShape, data.getType().getElementType()),
224 newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
225 origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
226 origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
227 origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
228 // Set the layout for the loadOp.
229 auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
230 loadOp.setAnchorLayout(layoutAttr);
231 // Insert the loaded block into the right position in data.
232 auto insertOp = vector::InsertStridedSliceOp::create(
233 rewriter, loc, loadOp.getResult(), data,
234 ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
235 ArrayRef<int64_t>{1, 1});
236 // InsertOp must have the same layout as newTensorDesc.
237 xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
// Thread the partially filled block into the next iteration.
238 data = insertOp.getResult();
239 }
240 }
241 return data;
242}
243
244/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
245/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
246/// the base pointer from the original memory source and adjusting the shape and
247/// strides of the tensor desc to fit with the new optimized transpose layout.
248class XeGPUCreateNdDescOpPattern final
249 : public OpConversionPattern<xegpu::CreateNdDescOp> {
250public:
251 using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
252 LogicalResult
253 matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
254 ConversionPatternRewriter &rewriter) const override {
255 auto tdescTy = createNdOp.getType();
256 // Get the target uArch info.
257 auto chipStr = xegpu::getChipStr(createNdOp);
258 // Check if the chip is supported.
259 assert(chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
260 "Expecting target chip to be pvc, bmg for transpose optimization.");
261 const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
262
263 auto convertType = tryOptimize(tdescTy, targetuArch);
264 if (convertType == tdescTy)
265 return failure();
266 auto strides = createNdOp.getMixedStrides();
267 auto maybeConstInnerStride = getConstantIntValue(strides.back());
268 // Only row-major memrefs are expected for now.
269 if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
270 return rewriter.notifyMatchFailure(
271 createNdOp, "Expecting row-major memref for transpose optimization.");
272 Value source = createNdOp.getSource();
273 auto optionalLaneData = getMaybeLaneData(tdescTy);
274 assert(optionalLaneData && "Expected 2D lane data");
275 auto laneData = optionalLaneData.value();
276 int64_t innerLaneData = laneData[1];
277 auto memrefType = dyn_cast<MemRefType>(source.getType());
278 // Inner dimension of the shape must be adjusted based on innerLaneData.
279 SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
280 modifiedShape.back() = divideByConstant(
281 rewriter, createNdOp.getLoc(),
282 convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
283 innerLaneData);
284 // Similarly, second to last stride must be adjusted.
285 assert(strides.size() >= 2 &&
286 "Expected at least 2 strides for CreateNdDescOp");
287 SmallVector<OpFoldResult> modifiedStrides(strides);
288 modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
289 rewriter, createNdOp.getLoc(),
290 convertToValue(rewriter, createNdOp.getLoc(),
291 modifiedStrides[modifiedStrides.size() - 2]),
292 innerLaneData);
293
294 // If the source is a static memref, we need to extract the pointer to
295 // base address.
296 if (memrefType && memrefType.hasStaticShape()) {
297 auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
298 rewriter, createNdOp.getLoc(), source);
299 source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
300 rewriter.getI64Type(),
301 extractOp.getResult())
302 .getResult();
303 }
304 // Create a new CreateNdDescOp with the modified shape and converted type.
305 auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
306 rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
307 modifiedStrides);
308 rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
309 return success();
310 }
311};
312
313/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
314/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the
315/// adjusted tensor desc type. This can result in multiple LoadNdOps being
316/// generated to fill in the original load shape.
317class XeGPULoadNdDescOpPattern final
318 : public OpConversionPattern<xegpu::LoadNdOp> {
319public:
320 using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
321 LogicalResult
322 matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
323 ConversionPatternRewriter &rewriter) const override {
324 auto origTensorDescType = loadNdOp.getTensorDescType();
325 auto adaptorType =
326 cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
327 if (adaptorType == origTensorDescType)
328 return failure();
329 // Offsets must be adjusted based on innerLaneData.
330 auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
331 int64_t innerLaneData = laneData[1];
332 auto offsets = loadNdOp.getMixedOffsets();
333 if (offsets.empty())
334 return rewriter.notifyMatchFailure(loadNdOp,
335 "Expecting offsets in LoadNd");
336 SmallVector<OpFoldResult> modifiedOffsets(offsets);
337 modifiedOffsets.back() = divideByConstant(
338 rewriter, loadNdOp.getLoc(),
339 convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
340 innerLaneData);
341 // Get the 2D data shape of this loadNdOp in its original type including
342 // array length.
343 SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
344 // Adjust the data shape based on innerLaneData.
345 origDataShape.back() /= innerLaneData;
346 // HW supported shape is the new tensor desc shape after conversion.
347 SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
348 VectorType origVectorType =
349 VectorType::get(origDataShape, adaptorType.getElementType());
350 Value data;
351 // Orig data shape is 3D for the array length case.
352 if (origTensorDescType.getArrayLength() > 1) {
353 SmallVector<Value> arraySlices;
354 for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
355 Value slice = arith::ConstantOp::create(
356 rewriter, loadNdOp->getLoc(), origVectorType,
357 rewriter.getZeroAttr(origVectorType));
358 // Increase the Y offset for each array slice.
359 Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
360 modifiedOffsets.back());
361 modifiedOffsets.back() =
362 arith::AddIOp::create(
363 rewriter, loadNdOp->getLoc(), offsetY,
364 arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
365 i * origDataShape[1])
366 .getResult())
367 .getResult();
368 slice = generateLoads(
369 rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
370 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
371 loadNdOp);
372 // BitCast back to original load shape without array length.
373 auto bitcastType = VectorType::get(origTensorDescType.getShape(),
374 origTensorDescType.getElementType());
375 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
376 bitcastType, slice);
377 // BitCastOp must have the same layout as the original loadNdOp.
378 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
379 origTensorDescType.getLayoutAttr());
380 arraySlices.push_back(bitCastOp.getResult());
381 }
382 rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
383 return success();
384 }
385 data = arith::ConstantOp::create(
386 rewriter, loadNdOp->getLoc(),
387 VectorType::get(origDataShape, adaptorType.getElementType()),
388 rewriter.getZeroAttr(origVectorType));
389 data = generateLoads(
390 rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
391 cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
392 loadNdOp);
393 auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
394 loadNdOp.getType(), data);
395 // BitCastOp must have the same layout as the original loadNdOp.
396 xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
397 origTensorDescType.getLayoutAttr());
398 rewriter.replaceOp(loadNdOp, bitCastOp);
399 return success();
400 }
401};
402
403/// Vector ExtractOp must be processed if the original tensor desc type has
404/// array length greater than 1. In this case, the LoadNdOp is replaced with
405/// multiple LoadNdOps for each array slice making the extraction unnecessary.
406/// In this case, we simply remove the ExtractOp.
407class VectorExtractOpPattern final
408 : public OpConversionPattern<vector::ExtractOp> {
409public:
410 using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
411 LogicalResult
412 matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
413 ConversionPatternRewriter &rewriter) const override {
414 // Check if the source of the extraction is split to multiple values.
415 if (adaptor.getSource().size() == 1)
416 return failure();
417 auto mixedPos = extractOp.getMixedPosition();
418 if (mixedPos.size() != 1)
419 return failure();
420 auto mayBeInt = getConstantIntValue(mixedPos[0]);
421 if (!mayBeInt)
422 return failure();
423 rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
424 return success();
425 }
426};
427
428/// Performs a reduction over 2 dimensions by decomposing it into two 1D
429/// reductions ordered based on layout to minimize cross-lane communication.
430class MultiRed2dOpPattern
431 : public OpConversionPattern<vector::MultiDimReductionOp> {
432 using OpConversionPattern::OpConversionPattern;
433 LogicalResult
434 matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
435 ConversionPatternRewriter &rewriter) const override {
436 auto sourceVecType = reductionOp.getSourceVectorType();
437 if (reductionOp.getReductionDims().size() != 2)
438 return rewriter.notifyMatchFailure(reductionOp, "Expected 2D reduction");
439 auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
440 // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
441 auto dims = llvm::to_vector(reductionOp.getReductionDims());
442 auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
443 // Order does not matter
444 if (intraLaneDim == -1 || crossLaneDim == -1) {
445 intraLaneDim = dims[0];
446 crossLaneDim = dims[1];
447 }
448 auto loc = reductionOp.getLoc();
449 auto acc = reductionOp.getAcc();
450
451 // The decomposition below splits the 2D reduction into an intra-lane
452 // then a cross-lane 1D reduction. The natural result layout of the
453 // decomposed sequence (a doubly-sliced layout) differs from the
454 // original 2D reduction's result layout that the rest of the IR was
455 // written/propagated against. To keep the post-peephole IR
456 // self-consistent without depending on a follow-up layout
457 // propagation pass, we always insert a bridge xegpu.convert_layout
458 // from the natural post-decomposition layout to the original
459 // reduction's result layout. Trivial bridges fold away in
460 // canonicalization.
461 xegpu::DistributeLayoutAttr postDecompLayout;
462 if (resLayout) {
463 // Derive the source vector's layout.
464 xegpu::DistributeLayoutAttr srcLayoutForCvt;
465 if (auto resSlice = dyn_cast_if_present<xegpu::SliceAttr>(resLayout))
466 srcLayoutForCvt = resSlice.getParent();
467 if (!srcLayoutForCvt)
468 srcLayoutForCvt =
469 xegpu::getDistributeLayoutAttr(reductionOp.getSource());
470 if (srcLayoutForCvt) {
471 // The natural layout of the post-decomposition reduction result
472 // is a nested SliceAttr: REDUCE_1 (reduces `intraLaneDim` from
473 // the source) yields `slice<src, [intraLaneDim]>`; REDUCE_2
474 // then reduces `adjCrossLaneDim` from that intermediate, giving
475 // `slice<slice<src, [intraLaneDim]>, [adjCrossLaneDim]>`.
476 MLIRContext *ctx = reductionOp.getContext();
477 int64_t adjCrossLaneDim =
478 crossLaneDim > intraLaneDim ? crossLaneDim - 1 : crossLaneDim;
479 auto intermediateLayout = xegpu::SliceAttr::get(
480 ctx, srcLayoutForCvt, DenseI64ArrayAttr::get(ctx, {intraLaneDim}));
481 postDecompLayout = xegpu::SliceAttr::get(
482 ctx, intermediateLayout,
483 DenseI64ArrayAttr::get(ctx, {adjCrossLaneDim}));
484 }
485 }
486
487 SmallVector<int64_t> accShape(sourceVecType.getShape());
488 accShape.erase(accShape.begin() + intraLaneDim);
489 Type eTy = sourceVecType.getElementType();
490 Value constNeutralVal = xegpu::createReductionNeutralValue(
491 rewriter, loc, VectorType::get(accShape, eTy), reductionOp.getKind());
492
493 Value intraLaneReduced = vector::MultiDimReductionOp::create(
494 rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
495 constNeutralVal, ArrayRef<int64_t>(intraLaneDim));
496
497 // Adjust crossLaneDim after the first reduction.
498 if (crossLaneDim > intraLaneDim)
499 crossLaneDim -= 1;
500 Value crossLaneReduced = vector::MultiDimReductionOp::create(
501 rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
502 ArrayRef<int64_t>(crossLaneDim));
503 assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
504 "Type mismatch");
505
506 Value replacement = crossLaneReduced;
507 if (resLayout && postDecompLayout) {
508 // Bridge from the natural post-decomposition layout to the
509 // original reduction's result layout. This preserves the contract
510 // any consumer (convert_layout, anchor op, or otherwise) was
511 // written against, so the rewrite is correct independent of
512 // whether layout propagation runs afterwards.
513 auto bridgeOp = xegpu::ConvertLayoutOp::create(
514 rewriter, loc, crossLaneReduced.getType(), crossLaneReduced,
515 postDecompLayout, resLayout);
516 replacement = bridgeOp.getResult();
517 }
518
519 rewriter.replaceOp(reductionOp, replacement);
520 return success();
521 }
522
523private:
524 std::pair<int64_t, int64_t>
525 getReductionDimOrder(ArrayRef<int64_t> reductionDims,
526 xegpu::DistributeLayoutAttr layout) const {
527 assert(layout.isForSubgroup() && "Must know the lane layout");
528 assert(reductionDims.size() == 2 && "Expected 2D reduction");
529 int64_t intra, cross = -1;
530 xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
531 if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
532 layoutAttr =
533 dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
534 assert(layoutAttr);
535 SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
536
537 assert(laneLayout.size() && "Expected a non-empty layout");
538 // try to pick a dim that does not communicate
539 for (auto dim : reductionDims) {
540 if (laneLayout[dim] == 1)
541 intra = dim;
542 else
543 cross = dim;
544 }
545 return {intra, cross};
546 }
547};
548
549} // namespace
550
552 RewritePatternSet &patterns) {
// Registers the four conversion patterns defined in this file, all
// constructed with the pattern set's own context.
553 patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
554 VectorExtractOpPattern, MultiRed2dOpPattern>(
555 patterns.getContext());
556}
557
558namespace {
559
/// Pass driver: runs a greedy pre-pass (array length optimization), then a
/// partial dialect conversion with dynamically legal XeGPU/vector ops, then a
/// greedy fold-only cleanup, and finally strips temporary layout attributes.
560struct XeGPUPeepHoleOptimizerPass final
562 XeGPUPeepHoleOptimizerPass> {
563 void runOnOperation() override {
564 MLIRContext &context = getContext();
565 TypeConverter converter;
566 RewritePatternSet patterns(&context);
567 ConversionTarget target(context);
568
569 // This pass is only meant for PVC and BMG targets. It proceeds when at least
570 // one gpu.func targets pvc or bmg; otherwise it exits early without changes.
571 bool isTargetSupported = false;
572 getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
573 auto chipStr = xegpu::getChipStr(funcOp);
574 if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
575 isTargetSupported = true;
576 });
577
578 if (!isTargetSupported) {
// NOTE(review): DBGS() is used here without an LLVM_DEBUG guard — confirm
// whether this message should be gated on the debug type.
579 DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC, BMG targets."
580 << "\n";
581 return;
582 }
583
584 // Run array length optimization patterns first so that subsequent transpose
585 // peephole patterns operate on the array-length-optimized tensor descs.
586 {
587 RewritePatternSet arrayLenPatterns(&context);
589 if (failed(applyPatternsGreedily(getOperation(),
590 std::move(arrayLenPatterns)))) {
591 DBGS() << "Array length optimization patterns failed.\n";
592 return signalPassFailure();
593 }
594 }
595
596 // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
597 // converted.
598 target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
599 [&](xegpu::CreateNdDescOp createNdOp) {
600 return !canBeOptimizedForTranspose(createNdOp.getType());
601 });
602 target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
603 [&](xegpu::LoadNdOp loadNdOp) {
604 return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
605 });
606 // Vector ExtractOps can have optimizable layouts if they extract from
607 // LoadNdOps with array length greater than 1. These ExtractOps must be
608 // converted. An ExtractOp without a temporary layout is always legal.
609 target.addDynamicallyLegalOp<vector::ExtractOp>(
610 [&](vector::ExtractOp extractOp) {
611 auto layout = xegpu::getTemporaryLayout(
612 dyn_cast<OpResult>(extractOp.getResult()));
613 if (!layout)
614 return true;
615 auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
616 auto laneData = layout.getEffectiveLaneDataAsInt();
617 return !canBeOptimizedForTranspose(laneLayout, laneData);
618 });
619
// A MultiDimReductionOp is illegal (must be rewritten) only when its result
// has a subgroup-level layout and it reduces exactly two dims.
620 target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
621 [=](Operation *op) -> bool {
622 auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
623 if (!layout || !layout.isForSubgroup())
624 return true;
625 if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
626 return reductionOp.getReductionDims().size() != 2;
627 return true;
628 });
629
// Identity type conversion: every type converts to itself.
630 converter.addConversion([](Type type) { return type; });
631
632 target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
633 vector::VectorDialect>();
634 // xegpu.convert_layout is left untouched by this pass; mark it legal
635 // so in-place updates don't trigger re-legalization failures.
636 target.addLegalOp<xegpu::ConvertLayoutOp>();
638 target);
640 if (failed(applyPartialConversion(getOperation(), target,
641 std::move(patterns)))) {
642 DBGS() << "Optimize block loads pass failed.\n";
643 return signalPassFailure();
644 }
645
646 // Apply folding for cleaning up IR. The pattern set is empty and the result
// of the greedy driver is deliberately ignored.
647 MLIRContext *ctx = &getContext();
648 RewritePatternSet emptyPatterns(ctx);
649 (void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));
650
651 xegpu::removeTemporaryLayoutAttrs(getOperation());
652 }
653};
654
655} // namespace
return success()
b getContext())
*if copies could not be generated due to yet unimplemented cases *copyInPlacementStart and copyOutPlacementStart in copyPlacementBlock *specify the insertion points where the incoming copies and outgoing should be the output argument nBegin is set to its * replacement(set to `begin` if no invalidation happens). Since outgoing *copies could have been inserted at `end`
#define DBGS()
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This class represents a single result from folding an operation.
MLIRContext * getContext() const
RewritePatternSet & add(ConstructorArg &&arg, ConstructorArgs &&...args)
Add an instance of each of the pattern types 'Ts' to the pattern list with the given arguments.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:384
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
detail::InFlightRemark failed(Location loc, RemarkOpts opts)
Report an optimization remark that failed.
Definition Remarks.h:717
void populateSCFStructuralTypeConversionsAndLegality(const TypeConverter &typeConverter, RewritePatternSet &patterns, ConversionTarget &target, PatternBenefit benefit=1)
Populates patterns for SCF structural type conversions and sets up the provided ConversionTarget with...
const uArch * getUArch(llvm::StringRef archName)
void populateXeGPUArrayLengthOptimizationPatterns(RewritePatternSet &patterns)
Appends patterns for array length optimization into patterns.
void removeTemporaryLayoutAttrs(Operation *op)
Removes the temporary layout attributes for each OpOperand and OpResult of the given operation.
void setTemporaryLayout(const T &operandOrResult, const DistributeLayoutAttr layout)
Value createReductionNeutralValue(OpBuilder &builder, Location loc, Type type, vector::CombiningKind kind)
Creates a constant filled with the neutral (identity) value for the given reduction kind.
int getLargestDivisor(T dim, ArrayRef< T > candidates, ArrayRef< T > candidateMultiples={})
Helper Function to find a proper instruction multiple for the user-supplied sg-level data shape (dive...
DistributeLayoutAttr getDistributeLayoutAttr(const Value value)
Retrieves the DistributeLayoutAttr associated with a given Value.
std::optional< std::string > getChipStr(Operation *op)
Retrieves the chip string from the XeVM target attribute of the parent GPU module operation.
DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult)
get and set distribute layout attribute for non-anchor operations (and offsets/masks of load/store op...
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns)
Appends patterns for optimizing block load operations into patterns.
Include the generated interface declarations.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
std::conditional_t< std::is_same_v< Ty, mlir::Type >, mlir::Value, detail::TypedValue< Ty > > TypedValue
If Ty is mlir::Type this will select Value instead of having a wrapper around it.
Definition Value.h:494
std::optional< SmallVector< int64_t > > computeShapeRatio(ArrayRef< int64_t > shape, ArrayRef< int64_t > subShape)
Return the multi-dimensional integral ratio of subShape to the trailing dimensions of shape.
const Instruction * getInstruction(InstructionKind instKind) const
Definition uArchBase.h:168