//===- GPUToLLVMConversion.cpp - MLIR GPU lowering passes ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func ops into a sequence
// of GPU runtime calls. As most GPU runtimes do not have a stable published
// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//
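
// As an illustration, a minimal sketch of one such runtime-layer wrapper,
// assuming the CUDA driver API (the actual wrappers live in the
// runtime-wrapper libraries under mlir/lib/ExecutionEngine, not in this
// file):
//
//   extern "C" void *mgpuStreamCreate() {
//     CUstream stream = nullptr;
//     cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // error check elided
//     return stream;
//   }
//
// The pass emits calls to such mgpu* symbols only and never depends on the
// CUDA or ROCm ABI directly.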

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass() = default;

  GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
      : GpuToLLVMConversionPassBase(other) {}

  // Run the dialect converter on the module.
  void runOnOperation() override;

private:
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};

struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};
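
// For example, `moduleLoadCallBuilder.create(loc, rewriter, {data})` (defined
// below) declares `llvm.func @mgpuModuleLoad(!llvm.ptr<i8>) -> !llvm.ptr<i8>`
// at the module level on first use and then emits
// `llvm.call @mgpuModuleLoad(%data)`.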

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  Value getNumElements(ConversionPatternRewriter &rewriter, Location loc,
                       MemRefType type, MemRefDescriptor desc) const {
    return type.hasStaticShape()
               ? ConvertToLLVMPattern::createIndexConstant(
                     rewriter, loc, type.getNumElements())
               // For identity maps (verified by caller), the number of
               // elements is stride[0] * size[0].
               : rewriter.create<LLVM::MulOp>(loc,
                                              desc.stride(rewriter, loc, 0),
                                              desc.size(rewriter, loc, 0));
  }

  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

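  // Each builder below corresponds to one extern "C" wrapper that the runtime
  // library is expected to provide (for the in-tree runtimes, see e.g.
  // CudaRuntimeWrappers.cpp and RocmRuntimeWrappers.cpp under
  // mlir/lib/ExecutionEngine).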
  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType, /* void* f */
          llvmIntPtrType,  /* intptr_t gridXDim */
          llvmIntPtrType,  /* intptr_t gridYDim */
          llvmIntPtrType,  /* intptr_t gridZDim */
          llvmIntPtrType,  /* intptr_t blockXDim */
          llvmIntPtrType,  /* intptr_t blockYDim */
          llvmIntPtrType,  /* intptr_t blockZDim */
          llvmInt32Type,   /* unsigned int sharedMemBytes */
          llvmPointerType, /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder memsetCallBuilder = {
      "mgpuMemset32",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */,
       llvmIntPtrType /* intptr_t count (number of 32-bit words) */,
       llvmPointerType /* void *stream */}};
};

/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

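/// A rewrite pattern that converts the !gpu.async.token operands of
/// async.yield operations into events recorded on the lowered streams; see
/// the comment on the matchAndRewrite implementation below.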
class ConvertAsyncYieldToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
public:
  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(async::YieldOp yieldOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate      -- creates a new compute stream on the GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
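///
/// For illustration, a typical async input looks roughly like:
///
///   %t0 = gpu.wait async
///   %t1 = gpu.launch_func async [%t0] @kernels::@kernel
///       blocks in (%bx, %by, %bz) threads in (%tx, %ty, %tz)
///       args(%arg0 : f32, %arg1 : memref<?xf32>)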
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
                            OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.memset operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemsetOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp> {
public:
  ConvertMemsetOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemsetOp memsetOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();

  populateArithmeticToLLVMConversionPatterns(converter, patterns);
  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateMemRefToLLVMConversionPatterns(converter, patterns);
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(loc, function, arguments);
}

// Returns whether all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

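  // The promoted operands are the memref rank and a type-erased pointer to
  // the descriptor, matching the first two parameters of the
  // mgpuMemHostRegisterMemRef wrapper; the element size is appended below.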
  auto arguments = getTypeConverter()->promoteOperands(
      loc, op->getOperands(), adaptor.getOperands(), rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return value.getType().isa<gpu::AsyncTokenType>();
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls. Each
// !gpu.async.token is lowered to a stream within its async.execute region,
// but tokens are passed between regions as events. For each !gpu.async.token
// operand, we therefore create an event and record it on the stream.
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value, 4> newOperands(adaptor.getOperands());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = adaptor.getOperands()[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(value.getType().isa<LLVM::LLVMPointerType>());
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.getCallee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
// with the stream/event operands. The operands are destroyed, i.e. the
// lowering assumes they are not used afterwards or elsewhere; otherwise we
// will get a runtime error. Eventually, we should guarantee this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : adaptor.getOperands()) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with the stream/event operands. The operands
// are destroyed, i.e. the lowering assumes they are not used afterwards or
// elsewhere; otherwise we will get a runtime error. Eventually, we should
// guarantee this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair :
       llvm::zip(waitOp.asyncDependencies(), adaptor.getOperands())) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      adaptor.getOperands().take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
  for (const auto &en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}

// Emits LLVM IR to launch a kernel function. Expects the gpu.module that
// contains the kernel to carry the compiled binary, i.e. a cubin in its
// 'nvvm.cubin' attribute or a hsaco in its 'rocdl.hsaco' attribute.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is no
  // use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName().getValue(),
      launchOp.getKernelName().getValue(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
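  // Reuse the stream of the single async dependency if present; otherwise
  // create a new stream that the synchronous path below synchronizes with and
  // destroys.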
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, adaptor, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  Value dynamicSharedMemorySize = launchOp.dynamicSharedMemorySize()
                                      ? launchOp.dynamicSharedMemorySize()
                                      : zero;
  launchKernelCallBuilder.create(
      loc, rewriter,
      {function.getResult(0), adaptor.gridSizeX(), adaptor.gridSizeY(),
       adaptor.gridSizeZ(), adaptor.blockSizeX(), adaptor.blockSizeY(),
       adaptor.blockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
       /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the synchronous
    // version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();

  MemRefDescriptor srcDesc(adaptor.src());
  Value numElements = getNumElements(rewriter, loc, memRefType, srcDesc);

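  // Compute the size in bytes using the "GEP from a null pointer" sizeof
  // idiom: the address of element `numElements` of a T* based at null equals
  // numElements * sizeof(T).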
  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(loc, elementPtrType, nullPtr,
                                              ArrayRef<Value>{numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemsetOp memsetOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memsetOp.dst().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memsetOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memsetOp)))
    return failure();

  auto loc = memsetOp.getLoc();

  Type valueType = adaptor.value().getType();
  if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 32) {
    return rewriter.notifyMatchFailure(memsetOp,
                                       "value must be a 32 bit scalar");
  }

  MemRefDescriptor dstDesc(adaptor.dst());
  Value numElements = getNumElements(rewriter, loc, memRefType, dstDesc);

  auto value =
      rewriter.create<LLVM::BitcastOp>(loc, llvmInt32Type, adaptor.value());
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, dstDesc.alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memsetCallBuilder.create(loc, rewriter, {dst, value, numElements, stream});

  rewriter.replaceOp(memsetOp, {stream});
  return success();
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
  return std::make_unique<GpuToLLVMConversionPass>();
}
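
// For reference, this pass is registered as "gpu-to-llvm"; an illustrative
// invocation is:
//   mlir-opt --gpu-to-llvm='gpu-binary-annotation=nvvm.cubin' input.mlir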

void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertMemsetOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter,
                                                           gpuBinaryAnnotation);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}