LowerGpuOpsToNVVMOps.cpp
//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVM IR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTONVVMOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

namespace {

/// Convert gpu dialect shfl mode enum to the equivalent nvvm one.
static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
  switch (mode) {
  case gpu::ShuffleMode::XOR:
    return NVVM::ShflKind::bfly;
  case gpu::ShuffleMode::UP:
    return NVVM::ShflKind::up;
  case gpu::ShuffleMode::DOWN:
    return NVVM::ShflKind::down;
  case gpu::ShuffleMode::IDX:
    return NVVM::ShflKind::idx;
  }
  llvm_unreachable("unknown shuffle mode");
}

static std::optional<NVVM::ReduxKind>
convertReduxKind(gpu::AllReduceOperation mode) {
  switch (mode) {
  case gpu::AllReduceOperation::ADD:
    return NVVM::ReduxKind::ADD;
  case gpu::AllReduceOperation::MUL:
    return std::nullopt;
  case gpu::AllReduceOperation::MINSI:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MINUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MINNUMF:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MAXSI:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::MAXUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MAXNUMF:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::AND:
    return NVVM::ReduxKind::AND;
  case gpu::AllReduceOperation::OR:
    return NVVM::ReduxKind::OR;
  case gpu::AllReduceOperation::XOR:
    return NVVM::ReduxKind::XOR;
  case gpu::AllReduceOperation::MINIMUMF:
  case gpu::AllReduceOperation::MAXIMUMF:
    return std::nullopt;
  }
  return std::nullopt;
}

/// Lowers gpu.subgroup_reduce into nvvm.redux. The op must be executed
/// uniformly by the entire subgroup, otherwise the behaviour is undefined.
struct GPUSubgroupReduceOpLowering
    : public ConvertOpToLLVMPattern<gpu::SubgroupReduceOp> {
  using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (op.getClusterSize())
      return rewriter.notifyMatchFailure(
          op, "lowering for clustered reduce not implemented");

    if (!op.getUniform())
      return rewriter.notifyMatchFailure(
          op, "cannot be lowered to redux as the op must be run "
              "uniformly (entire subgroup).");
    if (!op.getValue().getType().isInteger(32))
      return rewriter.notifyMatchFailure(op, "unsupported data type");

    std::optional<NVVM::ReduxKind> mode = convertReduxKind(op.getOp());
    if (!mode.has_value())
      return rewriter.notifyMatchFailure(
          op, "unsupported reduction mode for redux");

    Location loc = op->getLoc();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value offset = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);

    auto reduxOp = rewriter.create<NVVM::ReduxOp>(loc, int32Type, op.getValue(),
                                                  mode.value(), offset);

    rewriter.replaceOp(op, reduxOp->getResult(0));
    return success();
  }
};
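
// For illustration only (a hand-written example, not excerpted from this
// file's tests): with the pattern above enabled, a uniform 32-bit integer
// reduction such as
//
//   %sum = gpu.subgroup_reduce add %x uniform : (i32) -> i32
//
// is rewritten into an NVVM redux over the full warp mask, conceptually:
//
//   %mask = llvm.mlir.constant(-1 : i32) : i32
//   %sum  = nvvm.redux.sync add %x, %mask : i32
//
// The exact textual form of gpu.subgroup_reduce and nvvm.redux.sync shown
// here is approximate; see the dialect documentation for the precise syntax.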

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding NVVM op.
  ///
  /// Convert the `width` argument into an activeMask (a bitmask which
  /// specifies which threads participate in the shuffle) and a maskAndClamp
  /// (specifying the highest lane which participates in the shuffle).
  ///
  ///     %one = llvm.constant(1 : i32) : i32
  ///     %minus_one = llvm.constant(-1 : i32) : i32
  ///     %thirty_two = llvm.constant(32 : i32) : i32
  ///     %num_lanes = llvm.sub %thirty_two, %width : i32
  ///     %active_mask = llvm.lshr %minus_one, %num_lanes : i32
  ///     %mask_and_clamp = llvm.sub %width, %one : i32
  ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
  ///         %mask_and_clamp : !llvm<"{ float, i1 }">
  ///     %shfl_value = llvm.extractvalue %shfl[0] : !llvm<"{ float, i1 }">
  ///     %shfl_pred = llvm.extractvalue %shfl[1] : !llvm<"{ float, i1 }">
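  ///
  /// For illustration (an assumed input, not excerpted from a test), the
  /// sequence above is what a butterfly shuffle such as
  ///
  ///     %shfl_value, %shfl_pred = gpu.shuffle xor %value, %offset, %width : f32
  ///
  /// expands to when both results are used.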
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();

    auto valueTy = adaptor.getValue().getType();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    auto predTy = IntegerType::get(rewriter.getContext(), 1);

    Value one = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 1);
    Value minusOne = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
    Value thirtyTwo = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 32);
    Value numLeadInactiveLane = rewriter.create<LLVM::SubOp>(
        loc, int32Type, thirtyTwo, adaptor.getWidth());
    // Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
    Value activeMask = rewriter.create<LLVM::LShrOp>(loc, int32Type, minusOne,
                                                     numLeadInactiveLane);
    Value maskAndClamp;
    if (op.getMode() == gpu::ShuffleMode::UP) {
      // Clamp lane: `32 - activeWidth`.
      maskAndClamp = numLeadInactiveLane;
    } else {
      // Clamp lane: `activeWidth - 1`.
      maskAndClamp =
          rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.getWidth(), one);
    }

    bool predIsUsed = !op->getResult(1).use_empty();
    UnitAttr returnValueAndIsValidAttr = nullptr;
    Type resultTy = valueTy;
    if (predIsUsed) {
      returnValueAndIsValidAttr = rewriter.getUnitAttr();
      resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
                                                  {valueTy, predTy});
    }
    Value shfl = rewriter.create<NVVM::ShflOp>(
        loc, resultTy, activeMask, adaptor.getValue(), adaptor.getOffset(),
        maskAndClamp, convertShflKind(op.getMode()), returnValueAndIsValidAttr);
    if (predIsUsed) {
      Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
      Value isActiveSrcLane =
          rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
      rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    } else {
      rewriter.replaceOp(op, {shfl, nullptr});
    }
    return success();
  }
};

struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    LLVM::ConstantRangeAttr bounds = nullptr;
    if (std::optional<APInt> upperBound = op.getUpperBound())
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue());
    else
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/0, /*upper=*/kWarpSize);
    Value newOp =
        rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type(), bounds);
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      newOp = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    } else if (indexBitwidth < 32) {
      newOp = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    }
    rewriter.replaceOp(op, {newOp});
    return success();
  }
};
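
// For reference (an illustrative rewrite, not taken from a test): with the
// default 64-bit index type, the pattern above turns
//
//   %id = gpu.lane_id
//
// into nvvm.read.ptx.sreg.laneid annotated with a [0, 32) result range,
// followed by a sign-extension of the i32 result to i64.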

/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"

/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
/// This pass only handles device code and is not meant to be run on GPU host
/// code.
struct LowerGpuOpsToNVVMOpsPass
    : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  using Base::Base;

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(&getContext()));
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        m.getContext(),
        DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);
    options.useBarePtrCallConv = useBarePtrCallConv;

    // Apply in-dialect lowering first. These rewrites replace ops with other
    // ops that still need further lowering, which a single conversion pass
    // cannot express.
    {
      RewritePatternSet patterns(m.getContext());
      populateGpuRewritePatterns(patterns);
      if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns))))
        return signalPassFailure();
    }

    LLVMTypeConverter converter(m.getContext(), options);
    // NVVM uses alloca in the default address space to represent private
    // memory allocations, so drop private annotations. NVVM uses address
    // space 3 for shared memory. NVVM uses the default address space to
    // represent global memory.
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) -> unsigned {
          switch (space) {
          case gpu::AddressSpace::Global:
            return static_cast<unsigned>(
                NVVM::NVVMMemorySpace::kGlobalMemorySpace);
          case gpu::AddressSpace::Workgroup:
            return static_cast<unsigned>(
                NVVM::NVVMMemorySpace::kSharedMemorySpace);
          case gpu::AddressSpace::Private:
            return 0;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });
    // Lowering for MMAMatrixType.
    converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
      return convertMMAToLLVMType(type);
    });
    RewritePatternSet llvmPatterns(m.getContext());

    arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    if (this->hasRedux)
      populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
    LLVMConversionTarget target(getContext());
    configureGpuToNVVMConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
  }
};
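
// Typical usage (a sketch, not part of this file): assuming the pass is
// registered under the conventional `convert-gpu-to-nvvm` flag generated from
// Passes.td, it runs on each gpu.module, e.g.
//
//   mlir-opt kernels.mlir --convert-gpu-to-nvvm='index-bitwidth=32 has-redux=1'
//
// where the option names mirror the indexBitwidth / hasRedux pass options used
// above (the exact option spelling is assumed here).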

} // namespace

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FMAOp,
                      LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
                      LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp,
                      LLVM::SinOp, LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

template <typename OpTy>
static void populateOpPatterns(const LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func, StringRef f32ApproxFunc = "",
                               StringRef f16Func = "") {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
                                           f32ApproxFunc, f16Func);
}
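
// To illustrate the helper above (an assumed example, not from a test): after
// populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp",
// "__nv_fast_expf") is registered, a scalar `math.exp %x : f32` lowers to a
// call to the libdevice function `__nv_expf` (f64 operands go to `__nv_exp`,
// and the approximate `__nv_fast_expf` is used when the op carries the
// appropriate fast-math flag); vector operands are scalarized first by
// ScalarizeVectorOpLowering.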

void mlir::populateGpuSubgroupReduceOpLoweringPattern(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
  patterns.add<GPUSubgroupReduceOpLowering>(converter);
}

void mlir::populateGpuToNVVMConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  populateWithGenerated(patterns);
  patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                      NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
                                      NVVM::BlockDimYOp, NVVM::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
                                      NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>(
      converter, IndexKind::Other, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimOp, NVVM::ClusterDimXOp, NVVM::ClusterDimYOp,
      NVVM::ClusterDimZOp>>(converter, IndexKind::Other, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
      NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
      converter, IndexKind::Other, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
      NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
      converter, IndexKind::Other, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
      converter);

  patterns.add<GPUDynamicSharedMemoryOpLowering>(
      converter, NVVM::kSharedMemoryAlignmentBit);

  // Explicitly drop memory space when lowering private memory
  // attributions since NVVM models it as `alloca`s in the default
  // memory space and does not support `alloca`s with addrspace(5).
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/0,
          /*workgroupAddrSpace=*/
          static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getKernelFuncAttrName()),
          StringAttr::get(&converter.getContext(),
                          NVVM::NVVMDialect::getMaxntidAttrName())});

  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
                                    "__nv_fmod");
  populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                   "__nv_fabs");
  populateOpPatterns<math::AcosOp>(converter, patterns, "__nv_acosf",
                                   "__nv_acos");
  populateOpPatterns<math::AcoshOp>(converter, patterns, "__nv_acoshf",
                                    "__nv_acosh");
  populateOpPatterns<math::AsinOp>(converter, patterns, "__nv_asinf",
                                   "__nv_asin");
  populateOpPatterns<math::AsinhOp>(converter, patterns, "__nv_asinhf",
                                    "__nv_asinh");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__nv_atanf",
                                   "__nv_atan");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__nv_atan2f",
                                    "__nv_atan2");
  populateOpPatterns<math::AtanhOp>(converter, patterns, "__nv_atanhf",
                                    "__nv_atanh");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__nv_cbrtf",
                                   "__nv_cbrt");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__nv_ceilf",
                                   "__nv_ceil");
  populateOpPatterns<math::CopySignOp>(converter, patterns, "__nv_copysignf",
                                       "__nv_copysign");
  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos",
                                  "__nv_fast_cosf");
  populateOpPatterns<math::CoshOp>(converter, patterns, "__nv_coshf",
                                   "__nv_cosh");
  populateOpPatterns<math::ErfOp>(converter, patterns, "__nv_erff", "__nv_erf");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp",
                                  "__nv_fast_expf");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__nv_exp2f",
                                   "__nv_exp2");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__nv_expm1f",
                                    "__nv_expm1");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__nv_floorf",
                                    "__nv_floor");
  populateOpPatterns<math::FmaOp>(converter, patterns, "__nv_fmaf", "__nv_fma");
  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log",
                                  "__nv_fast_logf");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
                                    "__nv_log10", "__nv_fast_log10f");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__nv_log1pf",
                                    "__nv_log1p");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__nv_log2f",
                                   "__nv_log2", "__nv_fast_log2f");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf", "__nv_pow",
                                   "__nv_fast_powf");
  populateOpPatterns<math::RoundOp>(converter, patterns, "__nv_roundf",
                                    "__nv_round");
  populateOpPatterns<math::RoundEvenOp>(converter, patterns, "__nv_rintf",
                                        "__nv_rint");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__nv_rsqrtf",
                                    "__nv_rsqrt");
  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin",
                                  "__nv_fast_sinf");
  populateOpPatterns<math::SinhOp>(converter, patterns, "__nv_sinhf",
                                   "__nv_sinh");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__nv_sqrtf",
                                   "__nv_sqrt");
  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan",
                                  "__nv_fast_tanf");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__nv_tanhf",
                                   "__nv_tanh");
}