//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}
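
// For illustration: a statically shaped, identity-layout argument such as
// memref<16x16xf32> can be passed as a bare pointer, while a dynamically
// shaped memref<?xf32> (or one with a non-identity layout) cannot, since the
// callee would have no descriptor from which to recover sizes and strides.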

static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                       const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}
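
// A sketch of the idiom above: `mbcnt.lo(~0, 0)` counts the set mask bits at
// positions below the current lane within the low 32 lanes, and
// `mbcnt.hi(~0, lo)` adds the count for the high 32 lanes. With an all-ones
// mask the sum is the lane's position in the wavefront, e.g. lane 37 of a
// wave64 yields 37.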

static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // Convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to check whether the src lane is participating;
  /// if it is not, the dstLane is the current lane itself.
  ///
  /// Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // TODO: Add support for non 32-bit shuffle values.
    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
      return failure();
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    default:
      return failure();
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
    Value initShflValue = adaptor.getValue();
    if (adaptor.getValue().getType().isF32()) {
      initShflValue =
          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
    }
    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
        loc, int32Type, dwordAlignedDstLane, initShflValue);
    if (adaptor.getValue().getType().isF32()) {
      shflValue = rewriter.create<LLVM::BitcastOp>(
          loc, adaptor.getValue().getType(), shflValue);
    }
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
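
// For illustration, a 32-wide XOR shuffle such as
//   %res, %valid = gpu.shuffle xor %val, %offset, %width : f32
// becomes, roughly: compute the lane ID via mbcnt, XOR it with %offset, clamp
// inactive source lanes back to the current lane, shift the lane index left
// by 2 to get a dword-aligned byte address, and feed it to rocdl.ds_bpermute
// (bitcasting f32 values to i32 and back around the permute).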

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalents.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }
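
  // For reference, the same options are exposed as pass flags; a typical
  // invocation (flag names assumed from the tablegen pass definition) is:
  //   mlir-opt --convert-gpu-to-rocdl='chipset=gfx908 index-bitwidth=32' in.mlir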

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace ops which
    // need to be lowered further, which is not supported by a single
    // conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsAndFoldGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

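    // For illustration: with this mapping a workgroup buffer such as
    //   memref<16xf32, #gpu.address_space<workgroup>>
    // converts to an LLVM pointer in addrspace(3), matching AMDGPU LDS.
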
    RewritePatternSet llvmPatterns(ctx);

    arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
    LLVMConversionTarget target(getContext());
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};
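
// For illustration: given rocdl.reqd_work_group_size = [64, 2, 1] on a kernel,
// the walk above computes flatSize = 64 * 2 * 1 = 128 and attaches
// rocdl.flat_work_group_size = "128,128" so the two attributes agree.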

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}
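
// Note on the list above: marking LLVM math ops such as LLVM::SqrtOp illegal
// forces the math-to-ROCDL patterns to rewrite them into device-library calls
// (__ocml_*-style functions), since the plain LLVM intrinsics for these math
// ops are not generally lowerable for AMDGPU kernels.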

template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}
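
// A hypothetical instantiation of the helper above (names for illustration
// only, not a call made in this file):
//   populateOpPatterns<math::SqrtOp>(converter, patterns, "__ocml_sqrt_f32",
//                                    "__ocml_sqrt_f64");
// This scalarizes vector operands, then replaces the op with a call to the
// f32 or f64 function depending on the operand element type.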

void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      rocdlDialect->getKernelAttrHelper().getName(),
      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

  populateMathToROCDLConversionPatterns(converter, patterns);
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
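
// A minimal host-side usage sketch (assumed setup, not part of this file):
// the pass operates on gpu.module ops, so it is typically added nested:
//
//   PassManager pm(ctx);
//   pm.addNestedPass<gpu::GPUModuleOp>(
//       createLowerGpuOpsToROCDLOpsPass("gfx908"));
//   if (failed(pm.run(topLevelModule)))
//     ... // handle failure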