LowerGpuOpsToROCDLOps.cpp

//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
  // TODO: use <=> in C++20.
  if (indexBitwidth > intWidth) {
    return LLVM::SExtOp::create(rewriter, loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return LLVM::TruncOp::create(rewriter, loc, indexBitwidthType, value);
  }
  return value;
}
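// For instance, when the converter's index bitwidth is 64, an i32 value is
// sign-extended with llvm.sext; with a 16-bit index type it is truncated
// instead, and a matching width returns the value unchanged.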

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

static Value getLaneId(RewriterBase &rewriter, Location loc) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
  Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
  NamedAttribute noundef = rewriter.getNamedAttr(
      LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
  NamedAttribute lowRange = rewriter.getNamedAttr(
      LLVM::LLVMDialect::getRangeAttrName(),
      rewriter.getAttr<LLVM::ConstantRangeAttr>(APInt(32, 0), APInt(32, 32)));
  NamedAttribute highRange = rewriter.getNamedAttr(
      LLVM::LLVMDialect::getRangeAttrName(),
      rewriter.getAttr<LLVM::ConstantRangeAttr>(APInt(32, 0), APInt(32, 64)));
  Value mbcntLo = ROCDL::MbcntLoOp::create(
      rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
      /*res_attrs=*/
      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
  Value laneId = ROCDL::MbcntHiOp::create(
      rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
      /*res_attrs=*/
      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
  return laneId;
}
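// With an all-ones mask, mbcnt.lo returns the number of lanes below the
// current lane among lanes 0-31, and mbcnt.hi adds the count for lanes 32-63,
// so the chained calls produce the lane id within the wavefront (hence the
// [0, 32) and [0, 64) range annotations above).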

static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64"
    "-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512"
    "-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
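// Notable fields: p5:32:32 makes private (scratch) pointers 32-bit, A5 puts
// allocas in address space 5, G1 makes address space 1 the default for
// globals, and ni:7:8:9 marks the buffer-resource address spaces as
// non-integral.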

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    MLIRContext *context = rewriter.getContext();
    // Convert to:
    //   %mlo = call noundef range(i32 0, 32)
    //          @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by:
    //   %lid = call noundef range(i32 0, 64)
    //          @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Value laneId = getLaneId(rewriter, loc);
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = LLVM::SExtOp::create(
          rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = LLVM::TruncOp::create(
          rewriter, loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
          /*upper=*/op.getUpperBoundAttr().getInt() + 1);
    }
    Value wavefrontOp = ROCDL::WavefrontSizeOp::create(
        rewriter, op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};
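// The pattern above emits a rocdl wavefront-size query, optionally annotated
// with a constant range (e.g. [32, 65) on gfx10+ when an upper bound of 64 is
// known), which lets later LLVM passes fold the size to a constant.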

static bool isSupportedReadLaneType(Type type) {
  // read(first)lane also supports some vector types, but limit it to scalars
  // for now.
  return type.isInteger(16) || type.isInteger(32) || type.isInteger(64) ||
         isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
             LLVM::LLVMPointerType>(type);
}

struct GPUSubgroupBroadcastOpToROCDL
    : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Value src = adaptor.getSrc();
    if (!isSupportedReadLaneType(src.getType()))
      return rewriter.notifyMatchFailure(op, "unsupported readlane type");

    if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
                                                     adaptor.getLane());
    } else { // first_active_lane or any_lane
      // any_lane is lowered to readfirstlane too, to force the value into a
      // scalar register.
      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
                                                          src);
    }
    return success();
  }
};
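// Both readlane and readfirstlane return the selected lane's value in a
// scalar register: specific_lane maps to readlane with an explicit lane
// index, while first_active_lane and any_lane map to readfirstlane.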

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to see if the src lane is participating;
  /// if it is not, the dst lane falls back to the src lane itself.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    Value srcLaneId = getLaneId(rewriter, loc);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = LLVM::ConstantOp::create(rewriter, loc, int32Type, 0);
    Value negwidth = LLVM::SubOp::create(rewriter, loc, int32Type, zero, width);
    Value add = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        LLVM::AndOp::create(rewriter, loc, int32Type, add, negwidth);
    Value dstLane;

    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = LLVM::SubOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = LLVM::AddOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = LLVM::XOrOp::create(rewriter, loc, int32Type, srcLaneId,
                                    adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = LLVM::ICmpOp::create(
        rewriter, loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = LLVM::SelectOp::create(rewriter, loc, isActiveSrcLane,
                                                 dstLane, srcLaneId);
    Value two = LLVM::ConstantOp::create(rewriter, loc, int32Type, 2);
    Value dwordAlignedDstLane =
        LLVM::ShlOp::create(rewriter, loc, int32Type, selectDstLane, two);

    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = ROCDL::DsBpermuteOp::create(rewriter, loc, int32Type,
                                              dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
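// Worked example (a sketch, not verbatim pattern output): a xor shuffle by
// offset 1 within width 64 computes
//   dstLane = laneId ^ 1
//   ok      = dstLane < ((laneId + 64) & -64)
//   sel     = ok ? dstLane : laneId
// and then reads the value from lane sel via ds_bpermute, with the lane index
// shifted left by 2 because ds_bpermute addresses lanes in bytes of dwords.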

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      populateGpuPromoteShuffleToAMDGPUPatterns(patterns, *maybeChipset);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });
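    // These integers are the AMDGPU LLVM address spaces, so, for example, a
    // memref in #gpu.address_space<workgroup> lowers to pointers in LLVM
    // address space 3 (LDS).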

    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but doesn't
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
               GPUSubgroupBroadcastOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
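
// Typical usage from the command line (the options shown are illustrative;
// see the generated pass documentation for the full list):
//   mlir-opt --convert-gpu-to-rocdl='chipset=gfx942 runtime=HIP' kernel.mlir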