MLIR 22.0.0git
LowerGpuOpsToROCDLOps.cpp
Go to the documentation of this file.
1//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass to generate ROCDLIR operations for higher-level
10// GPU operations.
11//
12//===----------------------------------------------------------------------===//
13
16#include "mlir/Pass/Pass.h"
18
41
44
45namespace mlir {
46#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
47#include "mlir/Conversion/Passes.h.inc"
48} // namespace mlir
49
50using namespace mlir;
51
52// Truncate or extend the result depending on the index bitwidth specified
53// by the LLVMTypeConverter options.
54static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
55 Location loc, Value value,
56 const LLVMTypeConverter &converter) {
57 int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
58 int64_t indexBitwidth = converter.getIndexTypeBitwidth();
59 auto indexBitwidthType =
60 IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
61 // TODO: use <=> in C++20.
62 if (indexBitwidth > intWidth) {
63 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
64 }
65 if (indexBitwidth < intWidth) {
66 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
67 }
68 return value;
69}
70
71/// Returns true if the given `gpu.func` can be safely called using the bare
72/// pointer calling convention.
73static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
74 bool canBeBare = true;
75 for (Type type : func.getArgumentTypes())
76 if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
77 canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
78 return canBeBare;
79}
80
81static Value getLaneId(RewriterBase &rewriter, Location loc) {
82 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
83 Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
84 Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
85 NamedAttribute noundef = rewriter.getNamedAttr(
86 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
87 NamedAttribute lowRange = rewriter.getNamedAttr(
88 LLVM::LLVMDialect::getRangeAttrName(),
89 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
90 APInt(32, 32)));
91 NamedAttribute highRange = rewriter.getNamedAttr(
92 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
94 APInt(32, 64)));
95 Value mbcntLo = ROCDL::MbcntLoOp::create(
96 rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
97 /*res_attrs=*/
98 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
99 Value laneId = ROCDL::MbcntHiOp::create(
100 rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
101 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
102 return laneId;
103}
104
/// Default LLVM data layout string for amdgcn targets. Applied by the lowering
/// pass to gpu.module ops that do not already carry an explicit
/// llvm.data_layout attribute, so pointer/index widths are well defined.
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
    "32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";
111
112namespace {
113struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
115
116 LogicalResult
117 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
118 ConversionPatternRewriter &rewriter) const override {
119 Location loc = op.getLoc();
120 MLIRContext *context = rewriter.getContext();
121 // convert to:
122 // %mlo = call noundef range(i32 0, 32)
123 // @llvm.amdgcn.mbcnt.lo(-1, 0)
124 // followed by:
125 // %lid = call noundef range(i32 0, 64)
126 // @llvm.amdgcn.mbcnt.hi(-1, %mlo)
127
128 Value laneId = getLaneId(rewriter, loc);
129 // Truncate or extend the result depending on the index bitwidth specified
130 // by the LLVMTypeConverter options.
131 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
132 if (indexBitwidth > 32) {
133 laneId = LLVM::SExtOp::create(
134 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
135 } else if (indexBitwidth < 32) {
136 laneId = LLVM::TruncOp::create(
137 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
138 }
139 rewriter.replaceOp(op, {laneId});
140 return success();
141 }
142};
143
144struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
146
147 GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
148 amdgpu::Chipset chipset)
150 chipset(chipset) {}
151
152 LogicalResult
153 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
154 ConversionPatternRewriter &rewriter) const override {
155 LLVM::ConstantRangeAttr bounds = nullptr;
156 bool isBeforeGfx10 = chipset.majorVersion < 10;
157 if (auto upperBoundAttr = op.getUpperBoundAttr()) {
158 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
159 /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
160 /*upper=*/op.getUpperBoundAttr().getInt() + 1);
161 }
162 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
163 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
164 wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
165 *getTypeConverter());
166 rewriter.replaceOp(op, {wavefrontOp});
167 return success();
168 }
169
170 const amdgpu::Chipset chipset;
171};
172
173static bool isSupportedReadLaneType(Type type) {
174 // read(first)lane also supports some vector types, but limit it for scalars
175 // for now.
176 return type.isInteger(16) || type.isInteger(32) || type.isInteger(64) ||
177 isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
178 LLVM::LLVMPointerType>(type);
179}
180
181struct GPUSubgroupBroadcastOpToROCDL
182 : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
184
185 LogicalResult
186 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
187 ConversionPatternRewriter &rewriter) const override {
188 Value src = adaptor.getSrc();
189 if (!isSupportedReadLaneType(src.getType()))
190 return rewriter.notifyMatchFailure(op, "unsupported readlane type");
191
192 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
193 rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
194 adaptor.getLane());
195 } else { // first_active_lane
196 rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
197 src);
198 }
199 return success();
200 }
201};
202
203struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
205
206 /// Lowers a shuffle to the corresponding ROCDL ops.
207 ///
208 /// Use the `width` argument to see if src lane is participating.
209 /// If not the dstLane would be itself.
210 ///
211 /// Shuffle with DS Bpermute:
212 /// let shflMode = [xor, up, down, idx]
213 /// let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
214 /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
215 /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
216 /// 3. dstLane = shflMode(curLaneId, step)
217 /// 4. isActiveSrcLane = dstLane < isActiveSrcLane
218 /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
219 /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
220 /// 7. bpermute(dwordAlignedDstLane, shfl_value).
221 ///
222 LogicalResult
223 matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
224 ConversionPatternRewriter &rewriter) const override {
225 Location loc = op->getLoc();
226 Value initShflValue = adaptor.getValue();
227
228 Value srcLaneId = getLaneId(rewriter, loc);
229
230 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
231 Value width = adaptor.getWidth();
232 Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
233 Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
234 Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
235 Value widthOrZeroIfOutside =
236 LLVM::AndOp::create(rewriter, loc, int32Type, add, negwidth);
237 Value dstLane;
238
239 switch (op.getMode()) {
240 case gpu::ShuffleMode::UP:
241 dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
242 adaptor.getOffset());
243 break;
244 case gpu::ShuffleMode::DOWN:
245 dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
246 adaptor.getOffset());
247 break;
248 case gpu::ShuffleMode::XOR:
249 dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
250 adaptor.getOffset());
251 break;
252 case gpu::ShuffleMode::IDX:
253 dstLane = adaptor.getOffset();
254 break;
255 }
256 Value isActiveSrcLane = LLVM::ICmpOp::create(
257 rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
258 Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
259 dstLane, srcLaneId);
260 Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
261 Value dwordAlignedDstLane =
262 LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);
263
264 SmallVector<Value> decomposed =
265 LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
266 SmallVector<Value> swizzled;
267 for (Value v : decomposed) {
268 Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
269 dwordAlignedDstLane, v);
270 swizzled.emplace_back(res);
271 }
272 Value shflValue =
273 LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
274 rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
275 return success();
276 }
277};
278
279/// Import the GPU Ops to ROCDL Patterns.
280#include "GPUToROCDL.cpp.inc"
281
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  using Base::Base;

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    // NOTE(review): one line is missing from this extract here (likely a
    // registration of dialects needed for LLVM conversion) — confirm against
    // the original source.
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Attach the default amdgcn data layout if the module does not already
    // carry an explicit one, so pointer/index widths are well defined below.
    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    // Fail early if the chipset pass option does not parse.
    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    /// Customize the bitwidth used for the device side index computations.
    // NOTE(review): the opening of this statement (presumably
    // `LowerToLLVMOptions options(`) is missing from this extract.
    ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      // Verify that every gpu.func in the module can actually be called with
      // bare pointers before committing to that calling convention.
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            // NOTE(review): a guard line (presumably
            // `if (canBeCalledWithBarePointers(func))`) is missing from this
            // extract; as written the interrupt below is unreachable.
            return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      // NOTE(review): the declaration and population of `patterns` (a
      // RewritePatternSet) are missing from this extract.
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
    // Map gpu.address_space enum values to AMDGPU numeric address spaces.
    // NOTE(review): the call opening this statement (presumably
    // `populateGpuMemorySpaceAttributeConversions(`) is missing here.
    converter, [](gpu::AddressSpace space) {
      switch (space) {
      case gpu::AddressSpace::Global:
        return 1;
      case gpu::AddressSpace::Workgroup:
        return 3;
      case gpu::AddressSpace::Private:
        return 5;
      }
      llvm_unreachable("unknown address space enum value");
      return 0;
    });

    RewritePatternSet llvmPatterns(ctx);
    // NOTE(review): a line is missing here (likely the construction of the
    // conversion `target` used below) — confirm against the original source.

    // Gather ConvertToLLVM patterns from loaded dialects, restricted to the
    // `allowedDialects` pass option when it is non-empty.
    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if dialect was explicitly specified but doesn't implement
        // conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    // NOTE(review): a line is missing here (likely
    // `configureGpuToROCDLConversionLegality(target);`).
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        // flat_work_group_size is a "min,max" string; pin both to the product
        // of the required block sizes.
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};
424
425} // namespace
426
428 target.addIllegalOp<func::FuncOp>();
429 target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
430 target.addLegalDialect<ROCDL::ROCDLDialect>();
431 target.addIllegalDialect<gpu::GPUDialect>();
432 target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
433 LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
434 LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
435 // These ops are legal for f32 type.
436 target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
437 return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
438 });
439 // TODO: Remove once we support replacing non-root ops.
440 target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
441}
442
// Body of populateGpuToROCDLConversionPatterns (see the header declaration);
// NOTE(review): the function signature and several `patterns.add<...>`
// opener lines were lost in extraction — confirm against the original source.
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  // Lower gpu thread/block id and dimension queries to ROCDL intrinsics via
  // the shared index-lowering pattern.
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  // NOTE(review): the `patterns.add<...OpLowering<` opener for this
  // registration is missing from this extract.
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  // NOTE(review): likewise, the opener of this registration is missing.
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  // NOTE(review): the opener (likely `patterns.add<GPUFuncOpLowering>(` and
  // `GPUFuncOpLoweringOptions{`) is missing before this argument list.
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      rocdlDialect->getKernelAttrHelper().getName(),
      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  // gpu.printf lowering differs per runtime: HIP uses hostcalls, OpenCL a
  // plain external printf in the constant address space.
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf()
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory
  // NOTE(review): a pattern registration line is missing from this extract
  // here.

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
               GPUSubgroupBroadcastOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  // NOTE(review): a trailing call (likely
  // `populateMathToROCDLConversionPatterns(...)`, per the declarations index)
  // is missing before this closing brace.
}
return success()
b getContext())
static Value getLaneId(RewriterBase &rewriter, Location loc)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
#define add(a, b)
UnitAttr getUnitAttr()
Definition Builders.cpp:98
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
Definition Builders.cpp:266
MLIRContext * getContext() const
Definition Builders.h:56
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
Definition Builders.cpp:104
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Definition Builders.cpp:94
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
Definition Pattern.h:209
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
Definition Pattern.h:215
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Definition Dialect.h:38
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
Definition Attributes.h:164
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isInteger() const
Return true if this is an integer type (with the specified width).
Definition Types.cpp:56
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult advance()
Definition WalkResult.h:47
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition WalkResult.h:51
static WalkResult interrupt()
Definition WalkResult.h:46
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Definition ArithOps.cpp:258
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
Definition Pattern.cpp:439
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Definition Pattern.cpp:400
Runtime
Potential runtimes for AMD GPU kernels.
Definition Runtimes.h:15
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition Passes.h:91
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition Chipset.h:22
unsigned majorVersion
Definition Chipset.h:23
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition Chipset.cpp:14