MLIR  21.0.0git
LowerGpuOpsToROCDLOps.cpp
Go to the documentation of this file.
1 //===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass to generate ROCDLIR operations for higher-level
10 // GPU operations.
11 //
12 //===----------------------------------------------------------------------===//
13 
16 #include "mlir/Pass/Pass.h"
17 #include "mlir/Pass/PassManager.h"
18 #include "mlir/Transforms/Passes.h"
19 
41 #include "mlir/Pass/Pass.h"
44 #include "llvm/Support/FormatVariadic.h"
45 
46 #include "../GPUCommon/GPUOpsLowering.h"
47 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
48 
49 namespace mlir {
50 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
51 #include "mlir/Conversion/Passes.h.inc"
52 } // namespace mlir
53 
54 using namespace mlir;
55 
56 // Truncate or extend the result depending on the index bitwidth specified
57 // by the LLVMTypeConverter options.
59  Location loc, Value value,
60  const LLVMTypeConverter &converter) {
61  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
62  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
63  auto indexBitwidthType =
64  IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
65  // TODO: use <=> in C++20.
66  if (indexBitwidth > intWidth) {
67  return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
68  }
69  if (indexBitwidth < intWidth) {
70  return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
71  }
72  return value;
73 }
74 
75 /// Returns true if the given `gpu.func` can be safely called using the bare
76 /// pointer calling convention.
77 static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
78  bool canBeBare = true;
79  for (Type type : func.getArgumentTypes())
80  if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
81  canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
82  return canBeBare;
83 }
84 
86  const unsigned indexBitwidth) {
87  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
88  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
89  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
90  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
91  ValueRange{minus1, zero});
92  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
93  ValueRange{minus1, mbcntLo});
94  return laneId;
95 }
96 static constexpr StringLiteral amdgcnDataLayout =
97  "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
98  "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
99  "32-v32:"
100  "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
101  "64-S32-A5-G1-ni:7:8:9";
102 
103 namespace {
104 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
106 
107  LogicalResult
108  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
109  ConversionPatternRewriter &rewriter) const override {
110  auto loc = op->getLoc();
111  MLIRContext *context = rewriter.getContext();
112  // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
113  // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
114 
115  Type intTy = IntegerType::get(context, 32);
116  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
117  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
118  Value mbcntLo =
119  rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
120  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
121  loc, intTy, ValueRange{minus1, mbcntLo});
122  // Truncate or extend the result depending on the index bitwidth specified
123  // by the LLVMTypeConverter options.
124  const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
125  if (indexBitwidth > 32) {
126  laneId = rewriter.create<LLVM::SExtOp>(
127  loc, IntegerType::get(context, indexBitwidth), laneId);
128  } else if (indexBitwidth < 32) {
129  laneId = rewriter.create<LLVM::TruncOp>(
130  loc, IntegerType::get(context, indexBitwidth), laneId);
131  }
132  rewriter.replaceOp(op, {laneId});
133  return success();
134  }
135 };
136 
137 struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
139 
140  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
141  amdgpu::Chipset chipset)
143  chipset(chipset) {}
144 
145  LogicalResult
146  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
147  ConversionPatternRewriter &rewriter) const override {
148  LLVM::ConstantRangeAttr bounds = nullptr;
149  bool isBeforeGfx10 = chipset.majorVersion < 10;
150  if (auto upperBoundAttr = op.getUpperBoundAttr()) {
151  bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
152  /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
153  /*upper=*/op.getUpperBoundAttr().getInt() + 1);
154  }
155  Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
156  op.getLoc(), rewriter.getI32Type(), bounds);
157  wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
158  *getTypeConverter());
159  rewriter.replaceOp(op, {wavefrontOp});
160  return success();
161  }
162 
163  const amdgpu::Chipset chipset;
164 };
165 
166 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
168 
169  /// Lowers a shuffle to the corresponding ROCDL ops.
170  ///
171  /// Use the `width` argument to see if src lane is participating.
172  /// If not the dstLane would be itself.
173  ///
174  /// Shuffle with DS Bpermute:
175  /// let shflMode = [xor, up, down, idx]
176  /// let width = 32(usually warpsize), step = [1, 2, 4, 8, 16, ... , width].
177  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
178  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
179  /// 3. dstLane = shflMode(curLaneId, step)
180  /// 4. isActiveSrcLane = dstLane < isActiveSrcLane
181  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
182  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
183  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
184  ///
185  LogicalResult
186  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
187  ConversionPatternRewriter &rewriter) const override {
188  Location loc = op->getLoc();
189  Value initShflValue = adaptor.getValue();
190 
191  const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
192  Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
193 
194  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
195  Value width = adaptor.getWidth();
196  Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
197  Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
198  Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
199  Value widthOrZeroIfOutside =
200  rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
201  Value dstLane;
202 
203  switch (op.getMode()) {
204  case gpu::ShuffleMode::UP:
205  dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
206  adaptor.getOffset());
207  break;
208  case gpu::ShuffleMode::DOWN:
209  dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
210  adaptor.getOffset());
211  break;
212  case gpu::ShuffleMode::XOR:
213  dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
214  adaptor.getOffset());
215  break;
216  case gpu::ShuffleMode::IDX:
217  dstLane = adaptor.getOffset();
218  break;
219  }
220  Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
221  loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
222  Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
223  dstLane, srcLaneId);
224  Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
225  Value dwordAlignedDstLane =
226  rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
227 
228  SmallVector<Value> decomposed =
229  LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
230  SmallVector<Value> swizzled;
231  for (Value v : decomposed) {
232  Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
233  dwordAlignedDstLane, v);
234  swizzled.emplace_back(res);
235  }
236  Value shflValue =
237  LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
238  rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
239  return success();
240  }
241 };
242 
243 /// Import the GPU Ops to ROCDL Patterns.
244 #include "GPUToROCDL.cpp.inc"
245 
246 // A pass that replaces all occurrences of GPU device operations with their
247 // corresponding ROCDL equivalent.
248 //
249 // This pass only handles device code and is not meant to be run on GPU host
250 // code.
251 struct LowerGpuOpsToROCDLOpsPass final
252  : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
253  LowerGpuOpsToROCDLOpsPass() = default;
254  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
255  bool useBarePtrCallConv,
256  gpu::amd::Runtime runtime) {
257  if (this->chipset.getNumOccurrences() == 0)
258  this->chipset = chipset;
259  if (this->indexBitwidth.getNumOccurrences() == 0)
260  this->indexBitwidth = indexBitwidth;
261  if (this->useBarePtrCallConv.getNumOccurrences() == 0)
262  this->useBarePtrCallConv = useBarePtrCallConv;
263  if (this->runtime.getNumOccurrences() == 0)
264  this->runtime = runtime;
265  }
266 
267  void getDependentDialects(DialectRegistry &registry) const override {
268  Base::getDependentDialects(registry);
270  }
271 
272  void runOnOperation() override {
273  gpu::GPUModuleOp m = getOperation();
274  MLIRContext *ctx = m.getContext();
275 
276  auto llvmDataLayout = m->getAttrOfType<StringAttr>(
277  LLVM::LLVMDialect::getDataLayoutAttrName());
278  if (!llvmDataLayout) {
279  llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
280  m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
281  }
282  // Request C wrapper emission.
283  for (auto func : m.getOps<func::FuncOp>()) {
284  func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
285  UnitAttr::get(ctx));
286  }
287 
288  FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
289  if (failed(maybeChipset)) {
290  emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
291  return signalPassFailure();
292  }
293 
294  /// Customize the bitwidth used for the device side index computations.
296  ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
297  options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
298  if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
299  options.overrideIndexBitwidth(indexBitwidth);
300 
301  if (useBarePtrCallConv) {
302  options.useBarePtrCallConv = true;
303  WalkResult canUseBarePointers =
304  m.walk([](gpu::GPUFuncOp func) -> WalkResult {
305  if (canBeCalledWithBarePointers(func))
306  return WalkResult::advance();
307  return WalkResult::interrupt();
308  });
309  if (canUseBarePointers.wasInterrupted()) {
311  "bare pointer calling convention requires all memrefs to "
312  "have static shape and use the identity map");
313  return signalPassFailure();
314  }
315  }
316 
317  // Apply in-dialect lowering. In-dialect lowering will replace
318  // ops which need to be lowered further, which is not supported by a
319  // single conversion pass.
320  {
324  (void)applyPatternsGreedily(m, std::move(patterns));
325  }
326 
327  LLVMTypeConverter converter(ctx, options);
329  converter, [](gpu::AddressSpace space) {
330  switch (space) {
331  case gpu::AddressSpace::Global:
332  return 1;
333  case gpu::AddressSpace::Workgroup:
334  return 3;
335  case gpu::AddressSpace::Private:
336  return 5;
337  }
338  llvm_unreachable("unknown address space enum value");
339  return 0;
340  });
341 
342  RewritePatternSet llvmPatterns(ctx);
344 
345  llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
346  allowedDialects.end());
347  for (Dialect *dialect : ctx->getLoadedDialects()) {
348  bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
349  // Empty `allowedDialectsSet` means all dialects are allowed.
350  if (!allowedDialectsSet.empty() && !allowed)
351  continue;
352 
353  auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
354  if (!iface) {
355  // Error out if dialect was explicily specified but doesn't implement
356  // conversion interface.
357  if (allowed) {
358  m.emitError()
359  << "dialect does not implement ConvertToLLVMPatternInterface: "
360  << dialect->getNamespace();
361  return signalPassFailure();
362  }
363  continue;
364  }
365 
366  iface->populateConvertToLLVMConversionPatterns(target, converter,
367  llvmPatterns);
368  }
369 
370  populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
371  *maybeChipset);
372  populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
373  *maybeChipset);
375  if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
376  signalPassFailure();
377  auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
378  auto reqdWorkGroupSizeAttrHelper =
379  rocdlDialect->getReqdWorkGroupSizeAttrHelper();
380  auto flatWorkGroupSizeAttrHelper =
381  rocdlDialect->getFlatWorkGroupSizeAttrHelper();
382  // Manually rewrite known block size attributes so the LLVMIR translation
383  // infrastructure can pick them up.
384  m.walk([&](LLVM::LLVMFuncOp op) {
385  if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
386  auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
387  // Also set up the rocdl.flat_work_group_size attribute to prevent
388  // conflicting metadata.
389  uint32_t flatSize = 1;
390  for (uint32_t size : blockSizes.asArrayRef()) {
391  flatSize *= size;
392  }
393  StringAttr flatSizeAttr =
394  StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
395  flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
396  }
397  });
398  }
399 };
400 
401 } // namespace
402 
404  target.addIllegalOp<func::FuncOp>();
405  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
406  target.addLegalDialect<ROCDL::ROCDLDialect>();
407  target.addIllegalDialect<gpu::GPUDialect>();
408  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
409  LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
410  LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
411  // These ops are legal for f32 type.
412  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
413  return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
414  });
415  // TODO: Remove once we support replacing non-root ops.
416  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
417 }
418 
420  const LLVMTypeConverter &converter, RewritePatternSet &patterns,
421  mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
425  auto *rocdlDialect =
426  converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
427  populateWithGenerated(patterns);
428  patterns.add<
429  gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
430  ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
431  converter, IndexKind::Block, IntrType::Id);
433  gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
434  converter, IndexKind::Grid, IntrType::Id);
435  patterns.add<
436  gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
437  ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
438  converter, IndexKind::Block, IntrType::Dim);
440  gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
441  converter, IndexKind::Grid, IntrType::Dim);
442  patterns.add<GPUReturnOpLowering>(converter);
444  converter,
446  /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
447  /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
448  rocdlDialect->getKernelAttrHelper().getName(),
449  rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
450  if (Runtime::HIP == runtime) {
451  patterns.add<GPUPrintfOpToHIPLowering>(converter);
452  } else if (Runtime::OpenCL == runtime) {
453  // Use address space = 4 to match the OpenCL definition of printf()
454  patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
455  }
456  // TODO: Add alignment for workgroup memory
458 
459  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
460  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
461 
463 }
464 
465 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
466 mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
467  unsigned indexBitwidth,
468  bool useBarePtrCallConv,
469  gpu::amd::Runtime runtime) {
470  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
471  chipset, indexBitwidth, useBarePtrCallConv, runtime);
472 }
static MLIRContext * getContext(OpFoldResult val)
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func)
Returns true if the given gpu.func can be safely called using the bare pointer calling convention.
static constexpr StringLiteral amdgcnDataLayout
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, const unsigned indexBitwidth)
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, Location loc, Value value, const LLVMTypeConverter &converter)
static llvm::ManagedStatic< PassManagerOptions > options
IntegerType getI32Type()
Definition: Builders.cpp:65
MLIRContext * getContext() const
Definition: Builders.h:55
Attr getAttr(Args &&...args)
Get or construct an instance of the attribute Attr with provided arguments.
Definition: Builders.h:96
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
Replace the given operation with the new values.
This class describes a specific conversion target.
void addLegalOp(OperationName op)
Register the given operations as legal.
void addLegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as legal.
void addDynamicallyLegalOp(OperationName op, const DynamicLegalityCallbackFn &callback)
Register the given operation as dynamically legal and set the dynamic legalization callback to the on...
void addIllegalDialect(StringRef name, Names... names)
Register the operations of the given dialects as illegal, i.e.
void addIllegalOp(OperationName op)
Register the given operation as illegal, i.e.
Utility class for operation conversions targeting the LLVM dialect that match exactly one source oper...
Definition: Pattern.h:195
ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit=1)
Definition: Pattern.h:201
The main mechanism for performing data layout queries.
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
Dialects are groups of MLIR operations, types and attributes, as well as behavior associated with the...
Definition: Dialect.h:38
Derived class that automatically populates legalization information for different LLVM ops.
Conversion from types to the LLVM IR dialect.
Definition: TypeConverter.h:35
static bool canConvertToBarePtr(BaseMemRefType type)
Check if a memref type can be converted to a bare pointer.
MLIRContext & getContext() const
Returns the MLIR context.
unsigned getIndexTypeBitwidth() const
Gets the bitwidth of the index type when converted to LLVM.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
Options to control the LLVM lowering.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
Dialect * getLoadedDialect(StringRef name)
Get a registered IR dialect with the given namespace.
std::vector< Dialect * > getLoadedDialects()
Return information about all IR dialects loaded in the context.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:455
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:387
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:105
A utility result that is used to signal how to proceed with an ongoing walk:
Definition: Visitors.h:33
static WalkResult advance()
Definition: Visitors.h:51
bool wasInterrupted() const
Returns true if the walk was interrupted.
Definition: Visitors.h:55
static WalkResult interrupt()
Definition: Visitors.h:50
Value composeValue(OpBuilder &builder, Location loc, ValueRange src, Type dstType)
Composes a set of src values into a single value of type dstType through series of bitcasts and vecto...
Definition: Pattern.cpp:448
SmallVector< Value > decomposeValue(OpBuilder &builder, Location loc, Value src, Type dstType)
Decomposes a src value into a set of values of type dstType through series of bitcasts and vector ops...
Definition: Pattern.cpp:409
Runtime
Potential runtimes for AMD GPU kernels.
Definition: Runtimes.h:15
Include the generated interface declarations.
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, gpu::amd::Runtime runtime, amdgpu::Chipset chipset)
Collect a set of patterns to convert from the GPU dialect to ROCDL.
static constexpr unsigned kDeriveIndexBitwidthFromDataLayout
Value to pass as bitwidth for the index type when the converter is expected to derive the bitwidth fr...
LogicalResult applyPatternsGreedily(Region &region, const FrozenRewritePatternSet &patterns, GreedyRewriteConfig config=GreedyRewriteConfig(), bool *changed=nullptr)
Rewrite ops in the given region, which must be isolated from above, by repeatedly applying the highes...
void populateGpuRewritePatterns(RewritePatternSet &patterns)
Collect all patterns to rewrite ops within the GPU dialect.
Definition: Passes.h:91
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
void configureGpuToROCDLConversionLegality(ConversionTarget &target)
Configure target to convert from the GPU dialect to ROCDL.
std::unique_ptr< OperationPass< gpu::GPUModuleOp > > createLowerGpuOpsToROCDLOpsPass(const std::string &chipset="gfx900", unsigned indexBitwidth=kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv=false, gpu::amd::Runtime runtime=gpu::amd::Runtime::Unknown)
Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
const FrozenRewritePatternSet & patterns
void registerConvertToLLVMDependentDialectLoading(DialectRegistry &registry)
Register the extension that will load dependent dialects for LLVM conversion.
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, amdgpu::Chipset chipset)
Note: This function will also add conversions for the AMDGPU-specific address spaces,...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns)
Tries to promote gpu.shuffles to specialized AMDGPU intrinsics.
LogicalResult applyPartialConversion(ArrayRef< Operation * > ops, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, ConversionConfig config=ConversionConfig())
Below we define several entry points for operation conversion.
void populateMathToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns)
Populate the given list with patterns that convert from Math to ROCDL calls.
Definition: MathToROCDL.cpp:48
Lowering for gpu.dynamic.shared.memory to LLVM dialect.
The lowering of gpu.printf to a call to HIP hostcalls.
The lowering of gpu.printf to a call to an external printf() function.
Represents the amdgpu gfx chipset version, e.g., gfx90a, gfx942, gfx1103.
Definition: Chipset.h:22
static FailureOr< Chipset > parse(StringRef name)
Parses the chipset version string and returns the chipset on success, and failure otherwise.
Definition: Chipset.cpp:14