//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
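// For example, with a 64-bit index type a 32-bit input %v becomes
// `llvm.sext %v : i32 to i64`; when the widths already match, the value is
// returned unchanged.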
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
  // TODO: use <=> in C++20.
  if (indexBitwidth > intWidth) {
    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
  }
  return value;
}

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

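// Computes the lane id with the amdgcn mbcnt intrinsics: mbcnt.lo(~0, 0)
// counts the active lanes below the current one within the low 32 lanes of
// the wave, and mbcnt.hi(~0, ...) adds the count from the high 32 lanes, so
// the pair works for both wave32 and wave64.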
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                       const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}
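
// Data layout for amdgcn targets: 64-bit generic (p0) and global (p1)
// pointers, 32-bit LDS (p3) and scratch (p5) pointers, allocas in address
// space 5 (A5), and globals defaulting to address space 1 (G1).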
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // Convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

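// gpu.subgroup_size lowers to rocdl.wavefrontsize. On chipsets before gfx10
// the wavefront size is always 64; gfx10 and later also support wave32, which
// is why the lower bound of the range annotation depends on the chipset.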
struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
  using ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>::ConvertOpToLLVMPattern;

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
          /*upper=*/op.getUpperBoundAttr().getInt() + 1);
    }
    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
        op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};

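// For example (a sketch; the exact IR depends on the type converter):
//   %res, %valid = gpu.shuffle xor %val, %offset, %width : f32
// becomes a lane-id computation via mbcnt, the dstLane arithmetic described
// below, and a rocdl.ds_bpermute over the bitcast i32 pieces of %val.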
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to see if the source lane is participating;
  /// if it is not, the destination lane is the current lane itself.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;

    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
                                                       dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalents.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
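  // Constructor arguments are defaults only: any option already set on the
  // command line (getNumOccurrences() != 0) takes precedence.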
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    /// Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        getOperation().emitError(
            "bare pointer calling convention requires all memrefs to "
            "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });
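    // These integers match the AMDGPU address-space numbering used by LLVM:
    // 1 = global, 3 = workgroup (LDS), 5 = private (scratch).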

    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but doesn't
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
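    // For example, rocdl.reqd_work_group_size = [64, 2, 1] produces
    // rocdl.flat_work_group_size = "128,128".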
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
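
// A minimal usage sketch, assuming the pass is registered under its usual
// `convert-gpu-to-rocdl` name and anchored on gpu.module ops:
//   mlir-opt input.mlir \
//     --pass-pipeline='builtin.module(gpu.module(convert-gpu-to-rocdl{chipset=gfx942}))'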