MLIR 22.0.0git
LowerGpuOpsToROCDLOps.cpp
Go to the documentation of this file.
1//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass to generate ROCDLIR operations for higher-level
10// GPU operations.
11//
12//===----------------------------------------------------------------------===//
13
16#include "mlir/Pass/Pass.h"
18
41
44
45namespace mlir {
46#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
47#include "mlir/Conversion/Passes.h.inc"
48} // namespace mlir
49
50using namespace mlir;
51
52// Truncate or extend the result depending on the index bitwidth specified
53// by the LLVMTypeConverter options.
54static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
55 Location loc, Value value,
56 const LLVMTypeConverter &converter) {
57 int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
58 int64_t indexBitwidth = converter.getIndexTypeBitwidth();
59 auto indexBitwidthType =
60 IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
61 // TODO: use <=> in C++20.
62 if (indexBitwidth > intWidth) {
63 return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
64 }
65 if (indexBitwidth < intWidth) {
66 return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
67 }
68 return value;
69}
70
71/// Returns true if the given `gpu.func` can be safely called using the bare
72/// pointer calling convention.
73static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
74 bool canBeBare = true;
75 for (Type type : func.getArgumentTypes())
76 if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
77 canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
78 return canBeBare;
79}
80
81static Value getLaneId(RewriterBase &rewriter, Location loc) {
82 auto int32Type = IntegerType::get(rewriter.getContext(), 32);
83 Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
84 Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
85 NamedAttribute noundef = rewriter.getNamedAttr(
86 LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
87 NamedAttribute lowRange = rewriter.getNamedAttr(
88 LLVM::LLVMDialect::getRangeAttrName(),
89 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
90 APInt(32, 32)));
91 NamedAttribute highRange = rewriter.getNamedAttr(
92 LLVM::LLVMDialect::getRangeAttrName(),
93 LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
94 APInt(32, 64)));
95 Value mbcntLo = ROCDL::MbcntLoOp::create(
96 rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
97 /*res_attrs=*/
98 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
99 Value laneId = ROCDL::MbcntHiOp::create(
100 rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
101 rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
102 return laneId;
103}
104
// Default LLVM data layout string for AMDGCN targets. Installed on GPU
// modules that do not already carry an explicit llvm.data_layout attribute
// (see runOnOperation below).
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
    "32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";
111
112namespace {
113struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
115
116 LogicalResult
117 matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
118 ConversionPatternRewriter &rewriter) const override {
119 Location loc = op.getLoc();
120 MLIRContext *context = rewriter.getContext();
121 // convert to:
122 // %mlo = call noundef range(i32 0, 32)
123 // @llvm.amdgcn.mbcnt.lo(-1, 0)
124 // followed by:
125 // %lid = call noundef range(i32 0, 64)
126 // @llvm.amdgcn.mbcnt.hi(-1, %mlo)
127
128 Value laneId = getLaneId(rewriter, loc);
129 // Truncate or extend the result depending on the index bitwidth specified
130 // by the LLVMTypeConverter options.
131 const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
132 if (indexBitwidth > 32) {
133 laneId = LLVM::SExtOp::create(
134 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
135 } else if (indexBitwidth < 32) {
136 laneId = LLVM::TruncOp::create(
137 rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
138 }
139 rewriter.replaceOp(op, {laneId});
140 return success();
141 }
142};
143
144struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
146
147 GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
148 amdgpu::Chipset chipset)
150 chipset(chipset) {}
151
152 LogicalResult
153 matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
154 ConversionPatternRewriter &rewriter) const override {
155 LLVM::ConstantRangeAttr bounds = nullptr;
156 bool isBeforeGfx10 = chipset.majorVersion < 10;
157 if (auto upperBoundAttr = op.getUpperBoundAttr()) {
158 bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
159 /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
160 /*upper=*/op.getUpperBoundAttr().getInt() + 1);
161 }
162 Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
163 rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
164 wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
165 *getTypeConverter());
166 rewriter.replaceOp(op, {wavefrontOp});
167 return success();
168 }
169
170 const amdgpu::Chipset chipset;
171};
172
173static bool isSupportedReadLaneType(Type type) {
174 // read(first)lane also supports some vector types, but limit it for scalars
175 // for now.
176 return type.isInteger(16) || type.isInteger(32) || type.isInteger(64) ||
177 isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
178 LLVM::LLVMPointerType>(type);
179}
180
181struct GPUSubgroupBroadcastOpToROCDL
182 : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
184
185 LogicalResult
186 matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
187 ConversionPatternRewriter &rewriter) const override {
188 Value src = adaptor.getSrc();
189 if (!isSupportedReadLaneType(src.getType()))
190 return rewriter.notifyMatchFailure(op, "unsupported readlane type");
191
192 if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
193 rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
194 adaptor.getLane());
195 } else { // first_active_lane
196 rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
197 src);
198 }
199 return success();
200 }
201};
202
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  // NOTE(review): the inherited-constructor `using` declaration appears to
  // have been lost in extraction here — verify against upstream.

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to see if src lane is participating.
  /// If not the dstLane would be itself.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    // Step 1: current lane id via the mbcnt intrinsics.
    Value srcLaneId = getLaneId(rewriter, loc);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    // Step 2: (laneId + width) & -width — the lowest multiple of `width`
    // strictly above this lane, i.e. the exclusive bound of its segment.
    Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
    Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
    Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        LLVM::AndOp::create(rewriter, loc, int32Type, add, negwidth);
    Value dstLane;

    // Step 3: compute the source lane according to the shuffle mode.
    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    // Steps 4-5: out-of-segment source lanes read their own value instead.
    Value isActiveSrcLane = LLVM::ICmpOp::create(
        rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
                                                 dstLane, srcLaneId);
    // Step 6: ds_bpermute addresses lanes in byte units, so shift left by 2
    // (multiply by 4) to dword-align the lane index.
    Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
    Value dwordAlignedDstLane =
        LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);

    // Step 7: ds_bpermute moves 32 bits at a time, so decompose the value
    // into i32 chunks, permute each, and reassemble.
    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
                                              dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
278
279/// Import the GPU Ops to ROCDL Patterns.
280#include "GPUToROCDL.cpp.inc"
281
// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  using Base::Base;

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    // NOTE(review): the extraction appears to have dropped the dialect
    // registration line(s) that followed here — verify against upstream.
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Install the default AMDGCN data layout unless the module already
    // carries an explicit llvm.data_layout attribute.
    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    // Fail early if the chipset option (e.g. "gfx90a") cannot be parsed.
    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    /// Customize the bitwidth used for the device side index computations.
    // NOTE(review): the declaration opening this call (presumably a
    // LowerToLLVMOptions construction) looks truncated by extraction —
    // verify against upstream.
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      // Verify every gpu.func qualifies for the bare-pointer convention;
      // interrupt the walk on the first one that does not.
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            // NOTE(review): the guarding condition (presumably
            // canBeCalledWithBarePointers(func)) was lost in extraction —
            // verify.
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      // NOTE(review): the pattern-set declaration / population lines are
      // missing here (extraction artifact) — verify against upstream.
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);

    RewritePatternSet llvmPatterns(ctx);

    // Collect conversion patterns from every loaded dialect implementing
    // ConvertToLLVMPatternInterface, optionally filtered by the
    // `allowedDialects` pass option.
    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if dialect was explicitly specified but doesn't implement
        // conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};
412
413} // namespace
414
// NOTE(review): the enclosing function signature — per the declarations at
// the bottom of this dump, `void configureGpuToROCDLConversionLegality(
// ConversionTarget &target)` — appears to have been dropped by extraction;
// verify against upstream. The body below configures which ops/dialects the
// GPU-to-ROCDL conversion treats as legal.
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  // Math-like LLVM intrinsics are illegal so they get lowered to device
  // library calls instead.
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}
430
// NOTE(review): the enclosing function signature — per the declarations at
// the bottom of this dump, `void populateGpuToROCDLConversionPatterns(
// const LLVMTypeConverter &converter, RewritePatternSet &patterns,
// gpu::amd::Runtime runtime, amdgpu::Chipset chipset)` — appears dropped by
// extraction, along with several `patterns.add<...>` openers below; verify
// against upstream.
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  // gpu.thread_id -> rocdl.workitem.id.{x,y,z}
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  // NOTE(review): the opening `patterns.add<gpu::index_lowering::OpLowering<`
  // for the next entry seems missing (extraction artifact).
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  // NOTE(review): another `patterns.add<...OpLowering<` opener seems missing
  // here too.
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  // NOTE(review): a `patterns.add<GPUFuncOpLowering>(`-style opener appears
  // to have been lost before the argument list below — verify.
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      rocdlDialect->getKernelAttrHelper().getName(),
      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  // gpu.printf lowering depends on the targeted runtime.
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf()
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
               GPUSubgroupBroadcastOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
}
return success()
b getContext())
static Value getLaneId(RewriterBase &rewriter, Location loc)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
#define add(a, b)
UnitAttr getUnitAttr()
Definition Builders.cpp:98
ArrayAttr getArrayAttr(ArrayRef< Attribute > value)
Definition Builders.cpp:266
MLIRContext * getContext() const
Definition Builders.h:56
DictionaryAttr getDictionaryAttr(ArrayRef< NamedAttribute > value)
Definition Builders.cpp:104
NamedAttribute getNamedAttr(StringRef name, Attribute val)
Definition Builders.cpp:94
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
Definition Pattern.h:207
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
Definition Pattern.h:213
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Definition Dialect.h:38
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
NamedAttribute represents a combination of a name and an Attribute value.
Definition Attributes.h:164
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isInteger() const
Return true if this is an integer type (with the specified width).
Definition Types.cpp:56
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
A utility result that is used to signal how to proceed with an ongoing walk:
Definition WalkResult.h:29
static WalkResult advance()
Definition WalkResult.h:47
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition WalkResult.h:51
static WalkResult interrupt()
Definition WalkResult.h:46
static ConstantIntOp create(OpBuilder &builder, Location location, int64_t value, unsigned width)
Definition ArithOps.cpp:258
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
Definition Pattern.cpp:432
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Definition Pattern.cpp:393
Runtime
Potential runtimes for AMD GPU kernels.
Definition Runtimes.h:15
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, std::optional< amdgpu::Chipset > chipset)
Populate the given list with patterns that convert from Math to ROCDL calls.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition Passes.h:91
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces and types,...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns, std::optional< amdgpu::Chipset > maybeChipset)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
void populateCommonAMDGPUTypeAndAttributeConversions(TypeConverter &typeConverter)
Remap common GPU memory spaces (Workgroup, Private, etc) to LLVM address spaces.
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition Chipset.h:22
unsigned majorVersion
Definition Chipset.h:23
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition Chipset.cpp:14