XeGPUOptimizeBlockLoads.cpp
//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <optional>

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-optimize-block-loads"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

using namespace mlir;

namespace {

/// Get the 2D lane data from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneData(xegpu::TensorDescType tdescType) {
  auto layout = tdescType.getLayoutAttr();
  if (!layout)
    return std::nullopt;
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return std::nullopt;
  return laneData;
}

/// Get the 2D lane layout from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
  auto layout = tdescType.getLayoutAttr();
  if (!layout)
    return std::nullopt;
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return std::nullopt;
  return laneLayout;
}

/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
/// lane[1] == 1), but its inner lane data is not equal to [1, 1].
/// Example:
///   !xegpu.tensor_desc<16x16xf16,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
/// In this case, the lane layout is transposed (from the usual [1, SG_SIZE]
/// form), indicating that this is a load that requires a transpose. However,
/// the lane data is [1, 2], meaning that each lane must grab 2 f16 elements
/// from the inner dimension. We convert this to an optimized form by
/// converting the tensor_desc to i32 type such that the lane data becomes
/// [1, 1]. This makes it easy for later lowering to use the load-with-transpose
/// instruction.
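/// For the example above, the expected optimized form (a sketch of the intent;
/// the final shape is still clipped to hardware limits in tryOptimize below)
/// would be:
///   !xegpu.tensor_desc<16x8xi32,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
/// i.e. each pair of f16 values in the inner dimension is viewed as one i32.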
static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
                                       ArrayRef<int64_t> laneData) {
  if (laneLayout.size() != 2 || laneData.size() != 2)
    return false;
  if (laneLayout[0] == 1 || laneLayout[1] != 1)
    return false;
  if (laneData[0] != 1 || laneData[1] == 1)
    return false;
  return true;
}

/// A tensor desc type can be optimized if its element type is less than 32 bits
/// and its layout can be optimized.
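/// For instance, f16 and i8 descriptors with a transposed lane layout are
/// candidates, while f32/i32 (or wider) descriptors are left unchanged since
/// their existing layout is already usable as is.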
static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
  // If the dtype is 32 bits or wider, the existing layout is already valid.
  int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
  if (elementTyBitwidth >= 32)
    return false;
  auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
  auto maybeLaneData = getMaybeLaneData(tdescType);
  if (!maybeLaneData || !maybeLaneLayout)
    return false;
  return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
}

/// Check if a tensor desc type can be optimized for transpose; if so, return
/// the new optimized tensor desc type with a valid transpose layout.
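/// For example (sizes illustrative only), a 32x16xf16 descriptor with
/// lane_data [1, 2] is viewed as a 32x8 block of i32; the returned descriptor
/// is then shaped to the largest hardware-supported transpose block that
/// evenly divides 32x8, as reported by the uArch query below.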
static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
                                         const uArch *targetuArch) {
  if (!canBeOptimizedForTranspose(tdescType))
    return tdescType;
  auto laneData = getMaybeLaneData(tdescType)
                      .value(); // Lane data must exist if we reach here.
  int64_t innerLaneData = laneData[1];
  int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
  // Required shape is the total shape of the vector result that this tensor
  // desc must eventually load, after adjusting for the new bitwidth and array
  // length.
  SmallVector<int64_t> requiredShape(tdescType.getShape());
  requiredShape.back() =
      requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
  int newBitWidth = elementTyBitwidth * innerLaneData;
  Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
  // Supported shape is the largest transpose shape the hardware can handle
  // that evenly divides the required shape.
  auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
      targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
  auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
      newElemTy, /** has transform */ false, /** has transpose */ true);
  // If no HW params are found, return the original type.
  if (!maybeHWParams)
    return tdescType;
  auto [widths, heights, counts] = maybeHWParams.value();
  // TODO: Currently we expect the array length to be 1 for the transpose case.
  if (counts.size() != 1 || counts[0] != 1)
    return tdescType;
  int arrayLen = counts[0];
  int supportedHeight =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
  int supportedWidth =
      xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
  // If no supported height or width is found, return the original type.
  if (supportedHeight == -1 || supportedWidth == -1)
    return tdescType;

  SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
  xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
      tdescType.getContext(),
      tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1});
  // Array length cannot be larger than 1 for the transpose case.
  return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                    tdescType.getBoundaryCheck(),
                                    tdescType.getMemorySpace(), newLayout);
}

/// Helper to convert an OpFoldResult to Value.
static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
                            OpFoldResult ofr) {
  std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
  if (mayBeInt)
    return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
  return llvm::cast<Value>(ofr);
}

/// Helper to divide a Value by a constant integer.
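/// For example, a division by 4 is emitted as an unsigned right shift by 2,
/// while non-power-of-two divisors fall back to an arith.divui.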
static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
                              Value val, int64_t constant) {
  // If the constant is a power of 2, use right shift for division.
  if (llvm::isPowerOf2_64(constant)) {
    int64_t shiftAmount = llvm::Log2_64(constant);
    return arith::ShRUIOp::create(
               rewriter, loc, val,
               arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
                   .getResult())
        .getResult();
  }
  auto constantOp =
      arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
  return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
}

/// This function takes a larger register block `data` and generates multiple
/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
/// starting from `offsets`.
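/// For example, if `data` is a 32x16 block and `newTensorDesc` describes a
/// 16x16 block, the shape ratio is 2x1, so two LoadNdOps are created and their
/// results are inserted into `data` at row offsets 0 and 16 using
/// vector.insert_strided_slice.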
static Value generateLoads(ConversionPatternRewriter &rewriter,
                           TypedValue<VectorType> data,
                           ArrayRef<OpFoldResult> offsets,
                           TypedValue<xegpu::TensorDescType> newTensorDesc,
                           xegpu::LoadNdOp origLoadOp) {
  Location loc = data.getLoc();
  assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
  Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
  Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
  SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
  // Compute the ratio between the original shape and the supported shape. We
  // need to generate loads in this ratio arrangement.
  auto shapeRatio =
      computeShapeRatio(data.getType().getShape(), supportedShape)
          .value(); // `shapeRatio` must be defined if we reach here.
  for (int64_t h = 0; h < shapeRatio[0]; ++h) {
    for (int64_t w = 0; w < shapeRatio[1]; ++w) {
      int64_t localOffsetDim0 = h * supportedShape[0];
      int64_t localOffsetDim1 = w * supportedShape[1];
      Value loadOffsetX = arith::AddIOp::create(
          rewriter, loc, offsetDim0,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
              .getResult());
      Value loadOffsetY = arith::AddIOp::create(
          rewriter, loc, offsetDim1,
          arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
              .getResult());
      auto loadOp = xegpu::LoadNdOp::create(
          rewriter, loc,
          VectorType::get(supportedShape, data.getType().getElementType()),
          newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
          origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
          origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
          origLoadOp.getL3HintAttr());
      // Set the layout for the loadOp.
      auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
      xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr);
      // Insert the loaded block into the right position in data.
      auto insertOp = vector::InsertStridedSliceOp::create(
          rewriter, loc, loadOp.getResult(), data,
          ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
          ArrayRef<int64_t>{1, 1});
      // The InsertOp must have the same layout as newTensorDesc.
      xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr);
      data = insertOp.getResult();
    }
  }
  return data;
}

/// Checks if a CreateNdDescOp can be optimized for transpose; if so, creates a
/// new CreateNdDescOp with the optimized tensor desc type. This involves
/// extracting the base pointer from the original memory source and adjusting
/// the shape and strides of the tensor desc to fit the new optimized transpose
/// layout.
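/// Sketch of the rewrite (values illustrative, assuming f16 elements and an
/// inner lane data of 2): a descriptor created over memref<128x64xf16> with
/// sizes [128, 64] and strides [64, 1] is recreated over the extracted base
/// pointer with element type i32, sizes [128, 32] and strides [32, 1].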
class XeGPUCreateNdDescOpPattern final
    : public OpConversionPattern<xegpu::CreateNdDescOp> {
public:
  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto tdescTy = createNdOp.getType();
    // Get the target uArch info.
    auto chipStr = xegpu::getChipStr(createNdOp);
    // Check if the chip is supported.
    assert(
        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
        "Expecting target chip to be pvc or bmg for transpose optimization.");
    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());

    auto convertType = tryOptimize(tdescTy, targetuArch);
    if (convertType == tdescTy)
      return failure();
    auto strides = createNdOp.getMixedStrides();
    auto maybeConstInnerStride = getConstantIntValue(strides.back());
    // Only row-major memrefs are expected for now.
    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
      return rewriter.notifyMatchFailure(
          createNdOp, "Expecting row-major memref for transpose optimization.");
    Value source = createNdOp.getSource();
    auto optionalLaneData = getMaybeLaneData(tdescTy);
    assert(optionalLaneData && "Expected 2D lane data");
    auto laneData = optionalLaneData.value();
    int64_t innerLaneData = laneData[1];
    auto memrefType = dyn_cast<MemRefType>(source.getType());
    // Inner dimension of the shape must be adjusted based on innerLaneData.
    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
    modifiedShape.back() = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
        innerLaneData);
    // Similarly, the second-to-last stride must be adjusted.
    assert(strides.size() >= 2 &&
           "Expected at least 2 strides for CreateNdDescOp");
    SmallVector<OpFoldResult> modifiedStrides(strides);
    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
        rewriter, createNdOp.getLoc(),
        convertToValue(rewriter, createNdOp.getLoc(),
                       modifiedStrides[modifiedStrides.size() - 2]),
        innerLaneData);

    // If the source is a static memref, we need to extract the pointer to the
    // base address.
    if (memrefType && memrefType.hasStaticShape()) {
      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
          rewriter, createNdOp.getLoc(), source);
      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
                                          rewriter.getI64Type(),
                                          extractOp.getResult())
                   .getResult();
    }
    // Create a new CreateNdDescOp with the modified shape and converted type.
    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
        modifiedStrides);
    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
    return success();
  }
};

/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for the
/// transpose optimization. If so, rewrites the LoadNdOp to align with the
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
/// generated to fill in the original load shape.
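/// Sketch of the rewrite (shapes illustrative): a load producing
/// vector<32x16xf16> whose descriptor was converted to a 16x8xi32 block
/// becomes two 16x8xi32 loads stitched together with
/// vector.insert_strided_slice, followed by a vector.bitcast of the resulting
/// vector<32x8xi32> back to vector<32x16xf16>.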
class XeGPULoadNdDescOpPattern final
    : public OpConversionPattern<xegpu::LoadNdOp> {
public:
  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto origTensorDescType = loadNdOp.getTensorDescType();
    auto adaptorType =
        cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
    if (adaptorType == origTensorDescType)
      return failure();
    // Offsets must be adjusted based on innerLaneData.
    auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
    int64_t innerLaneData = laneData[1];
    auto offsets = loadNdOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadNdOp,
                                         "Expecting offsets in LoadNd");
    SmallVector<OpFoldResult> modifiedOffsets(offsets);
    modifiedOffsets.back() = divideByConstant(
        rewriter, loadNdOp.getLoc(),
        convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
        innerLaneData);
    // Get the 2D data shape of this loadNdOp in its original type, excluding
    // the array length.
    SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
    // Adjust the data shape based on innerLaneData.
    origDataShape.back() /= innerLaneData;
    // The HW-supported shape is the new tensor desc shape after conversion.
    SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
    VectorType origVectorType =
        VectorType::get(origDataShape, adaptorType.getElementType());
    Value data;
    // The original load result is 3D when the array length is greater than 1;
    // in that case, handle each array slice separately.
    if (origTensorDescType.getArrayLength() > 1) {
      SmallVector<Value> arraySlices;
      for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
        Value slice = arith::ConstantOp::create(
            rewriter, loadNdOp->getLoc(), origVectorType,
            rewriter.getZeroAttr(origVectorType));
        // Increase the Y offset for each array slice.
        Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
                                       modifiedOffsets.back());
        modifiedOffsets.back() =
            arith::AddIOp::create(
                rewriter, loadNdOp->getLoc(), offsetY,
                arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
                                               i * origDataShape[1])
                    .getResult())
                .getResult();
        slice = generateLoads(
            rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
            cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
            loadNdOp);
        // BitCast back to the original load shape without the array length.
        auto bitcastType = VectorType::get(origTensorDescType.getShape(),
                                           origTensorDescType.getElementType());
        auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                                   bitcastType, slice);
        // The BitCastOp must have the same layout as the original loadNdOp.
        xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
                                       origTensorDescType.getLayoutAttr());
        arraySlices.push_back(bitCastOp.getResult());
      }
      rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
      return success();
    }
    data = arith::ConstantOp::create(
        rewriter, loadNdOp->getLoc(),
        VectorType::get(origDataShape, adaptorType.getElementType()),
        rewriter.getZeroAttr(origVectorType));
    data = generateLoads(
        rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
        cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
        loadNdOp);
    auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
                                               loadNdOp.getType(), data);
    // The BitCastOp must have the same layout as the original loadNdOp.
    xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
                                   origTensorDescType.getLayoutAttr());
    rewriter.replaceOp(loadNdOp, bitCastOp);
    return success();
  }
};

/// A vector ExtractOp must be processed if the original tensor desc type has
/// an array length greater than 1. In that case, the LoadNdOp is replaced with
/// one LoadNdOp per array slice, which makes the extraction unnecessary, so we
/// simply replace the ExtractOp with the corresponding slice.
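/// For example, with an array length of 2 the rewritten load produces two 2D
/// replacement values; a vector.extract at constant position 1 is then simply
/// replaced by the second of those values.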
class VectorExtractOpPattern final
    : public OpConversionPattern<vector::ExtractOp> {
public:
  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Check if the source of the extraction was split into multiple values.
    if (adaptor.getSource().size() == 1)
      return failure();
    auto mixedPos = extractOp.getMixedPosition();
    if (mixedPos.size() != 1)
      return failure();
    auto mayBeInt = getConstantIntValue(mixedPos[0]);
    if (!mayBeInt)
      return failure();
    rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
    return success();
  }
};

} // namespace

void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
    RewritePatternSet &patterns) {
  patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
               VectorExtractOpPattern>(patterns.getContext());
}

namespace {

struct XeGPUOptimizeBlockLoadsPass final
    : public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
          XeGPUOptimizeBlockLoadsPass> {
  void runOnOperation() override {
    MLIRContext &context = getContext();
    TypeConverter converter;
    RewritePatternSet patterns(&context);
    ConversionTarget target(context);

    // This pass is only meant for PVC and BMG targets. If an unsupported
    // target is found, exit early.
    bool isTargetSupported = false;
    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
      auto chipStr = xegpu::getChipStr(funcOp);
      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
        isTargetSupported = true;
    });

    if (!isTargetSupported) {
      DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
             << "\n";
      return;
    }

    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
    // converted.
    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
        [&](xegpu::CreateNdDescOp createNdOp) {
          return !canBeOptimizedForTranspose(createNdOp.getType());
        });
    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
        [&](xegpu::LoadNdOp loadNdOp) {
          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
        });
    // Vector ExtractOps can have optimizable layouts if they extract from
    // LoadNdOps with an array length greater than 1. These ExtractOps must be
    // converted.
    target.addDynamicallyLegalOp<vector::ExtractOp>(
        [&](vector::ExtractOp extractOp) {
          auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult());
          if (!layout)
            return true;
          auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
          auto laneData = layout.getEffectiveLaneDataAsInt();
          return !canBeOptimizedForTranspose(laneLayout, laneData);
        });
    converter.addConversion([](Type type) { return type; });

    target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                           vector::VectorDialect>();
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns);
    if (failed(applyPartialConversion(getOperation(), target,
                                      std::move(patterns)))) {
      DBGS() << "Optimize block loads pass failed.\n";
      return signalPassFailure();
    }
  }
};

} // namespace
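
// Note: this pass is normally driven through the XeGPU pass pipeline. The
// command-line flag is assumed (not verified here) to match DEBUG_TYPE, e.g.:
//   mlir-opt --xegpu-optimize-block-loads input.mlir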