MLIR 23.0.0git
GPUOpsLowering.cpp
Go to the documentation of this file.
1//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "GPUOpsLowering.h"
10
14#include "mlir/IR/Attributes.h"
15#include "mlir/IR/Builders.h"
17#include "mlir/IR/SymbolTable.h"
18#include "llvm/ADT/SmallVectorExtras.h"
19#include "llvm/ADT/StringSet.h"
20#include "llvm/Support/DebugLog.h"
21#include "llvm/Support/FormatVariadic.h"
22
23#define DEBUG_TYPE "gpu-lowering"
24
25using namespace mlir;
26
27LLVM::LLVMFuncOp mlir::getOrDefineFunction(Operation *moduleOp, Location loc,
28 OpBuilder &b, StringRef name,
29 LLVM::LLVMFunctionType type) {
30 auto existing = dyn_cast_or_null<LLVM::LLVMFuncOp>(
31 SymbolTable::lookupSymbolIn(moduleOp, name));
32 if (existing)
33 return existing;
34
36 b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
37 return LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External);
38}
39
// Builds a name of the form `<prefix><N>` that does not resolve to any symbol
// in `moduleOp`, trying N = 0, 1, 2, ... in order and returning the first free
// candidate.
// NOTE(review): the first line of this signature is not visible in this
// listing; `moduleOp` below is a parameter declared on that missing line.
41 StringRef prefix) {
42 // Get a unique global name.
43 unsigned stringNumber = 0;
44 SmallString<16> stringConstName;
45 do {
  // Reset the buffer before rendering the next candidate name into it.
46 stringConstName.clear();
47 (prefix + Twine(stringNumber++)).toStringRef(stringConstName);
  // Keep probing while the candidate is already taken in the symbol table.
48 } while (SymbolTable::lookupSymbolIn(moduleOp, stringConstName));
49 return stringConstName;
50}
51
// Returns a constant `llvm.mlir.global` in `moduleOp` holding `str` plus a
// trailing NUL byte, reusing an existing global when one matches on type,
// constness, value, alignment and address space; otherwise a new internal
// global named `<namePrefix><N>` is created at the start of the module body.
// NOTE(review): the first line of this signature is not visible in this
// listing; `b` and `loc` used below are declared there.
53 Operation *moduleOp, Type llvmI8,
54 StringRef namePrefix,
55 StringRef str,
56 uint64_t alignment,
57 unsigned addrSpace) {
58 llvm::SmallString<20> nullTermStr(str);
59 nullTermStr.push_back('\0'); // Null terminate for C
  // The array length counts the terminator appended above.
60 auto globalType =
61 LLVM::LLVMArrayType::get(llvmI8, nullTermStr.size_in_bytes());
62 StringAttr attr = b.getStringAttr(nullTermStr);
63
64 // Try to find existing global.
  // An unset alignment on the global is treated as 0 for the comparison.
65 for (auto globalOp : moduleOp->getRegion(0).getOps<LLVM::GlobalOp>())
66 if (globalOp.getGlobalType() == globalType && globalOp.getConstant() &&
67 globalOp.getValueAttr() == attr &&
68 globalOp.getAlignment().value_or(0) == alignment &&
69 globalOp.getAddrSpace() == addrSpace)
70 return globalOp;
71
72 // Not found: create new global.
  // NOTE(review): no OpBuilder::InsertionGuard is visible in this listing
  // before the insertion point is moved; if none exists, `b` is left pointing
  // at the start of the module body for the caller — confirm.
74 b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
75 SmallString<16> name = getUniqueSymbolName(moduleOp, namePrefix);
76 return LLVM::GlobalOp::create(b, loc, globalType,
77 /*isConstant=*/true, LLVM::Linkage::Internal,
78 name, attr, alignment, addrSpace);
79}
80
// Computes the properties and discardable attributes for the `llvm.func` that
// replaces `gpuFuncOp`. Starts from lowerDiscardableAttrsForLLVMFunc, then sets
// the symbol name, function type and calling convention, and appends the
// known block/grid/cluster size attributes plus the kernel markers.
// Returns failure when lowerDiscardableAttrsForLLVMFunc fails.
// NOTE(review): the first line of this signature is not visible in this
// listing.
82 gpu::GPUFuncOp gpuFuncOp, Type llvmFuncType, OpBuilder &rewriter) const {
83 FailureOr<LoweredLLVMFuncAttrs> loweredAttrs =
84 lowerDiscardableAttrsForLLVMFunc(gpuFuncOp, llvmFuncType);
85 if (failed(loweredAttrs))
86 return failure();
87
88 MLIRContext *ctx = rewriter.getContext();
89 LLVM::LLVMFuncOp::Properties &props = loweredAttrs->properties;
90 props.sym_name = rewriter.getStringAttr(gpuFuncOp.getName());
91 props.function_type = TypeAttr::get(llvmFuncType);
  // Kernels and non-kernel functions use different calling conventions.
92 const bool isKernelFunc = gpuFuncOp.isKernel();
93 props.setCConv(LLVM::CConvAttr::get(ctx, isKernelFunc
94 ? kernelCallingConvention
95 : nonKernelCallingConvention));
96
97 NamedAttrList &discardable = loweredAttrs->discardableAttrs;
98 auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
99
  // Appends only when both the attribute name and its value are non-null, so
  // unset names or absent size attributes are skipped.
100 auto appendIfNameAndValue = [&](StringAttr name, Attribute value) {
101 if (name && value)
102 discardable.append(name, value);
103 };
104
105 DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
106 DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
107 DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
108
  // Carry the GPU-dialect size attributes over under their dialect names.
109 appendIfNameAndValue(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
110 knownBlockSize);
111 appendIfNameAndValue(gpuDialect->getKnownGridSizeAttrHelper().getName(),
112 knownGridSize);
113 appendIfNameAndValue(gpuDialect->getKnownClusterSizeAttrHelper().getName(),
114 knownClusterSize);
115
116 if (isKernelFunc) {
117 discardable.append(gpuDialect->getKernelFuncAttrName(),
118 rewriter.getUnitAttr());
119 // Add a dialect specific kernel attribute in addition to GPU kernel
120 // attribute. The former is necessary for further translation while the
121 // latter is expected by gpu.launch_func.
122 appendIfNameAndValue(kernelAttributeName, rewriter.getUnitAttr());
123 appendIfNameAndValue(kernelBlockSizeAttributeName, knownBlockSize);
124 appendIfNameAndValue(kernelClusterSizeAttributeName, knownClusterSize);
125 }
126
127 return loweredAttrs;
128}
129
// Rewrites a gpu.func into an llvm.func.
//
// Workgroup attributions become either trailing `llvm.ptr` arguments (when
// `encodeWorkgroupAttributionsAsArguments` is set) or internal globals in
// `workgroupAddrSpace`. Private attributions become allocas. Each attribution
// is wrapped in a memref descriptor and remapped through the signature
// conversion. The body is then moved into the new function, selected argument
// attributes are copied over, and the original gpu.func is erased.
// NOTE(review): a few declarations used below (`argAttrs` in the first
// branch, `funcType`, `descr`) are not visible in this listing.
130LogicalResult
131GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
132 ConversionPatternRewriter &rewriter) const {
133 Location loc = gpuFuncOp.getLoc();
134
135 SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
136 if (encodeWorkgroupAttributionsAsArguments) {
137 // Append an `llvm.ptr` argument to the function signature to encode
138 // workgroup attributions.
139
140 ArrayRef<BlockArgument> workgroupAttributions =
141 gpuFuncOp.getWorkgroupAttributionBBArgs();
142 size_t numAttributions = workgroupAttributions.size();
143
144 // Insert all arguments at the end.
145 unsigned index = gpuFuncOp.getNumArguments();
146 SmallVector<unsigned> argIndices(numAttributions, index);
147
148 // New arguments will simply be `llvm.ptr` with the correct address space
149 Type workgroupPtrType =
150 rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
151 Repeated<Type> argTypes(numAttributions, workgroupPtrType);
152
153 // Attributes: noalias, llvm.mlir.workgroup_attribution(<size>, <type>)
  // The second entry starts as a unit-attr placeholder; its value is
  // overwritten per attribution in the loop below.
154 std::array attrs{
155 rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
156 rewriter.getUnitAttr()),
157 rewriter.getNamedAttr(
158 getDialect().getWorkgroupAttributionAttrHelper().getName(),
159 rewriter.getUnitAttr()),
160 };
162 for (BlockArgument attribution : workgroupAttributions) {
163 auto attributionType = cast<MemRefType>(attribution.getType());
164 IntegerAttr numElements =
165 rewriter.getI64IntegerAttr(attributionType.getNumElements());
166 Type llvmElementType =
167 getTypeConverter()->convertType(attributionType.getElementType());
168 if (!llvmElementType)
169 return failure();
170 TypeAttr type = TypeAttr::get(llvmElementType);
171 attrs.back().setValue(
172 rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
173 argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
174 }
175
176 // Location match function location
177 SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());
178
179 // Perform signature modification
180 rewriter.modifyOpInPlace(
181 gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
182 LogicalResult inserted =
183 static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
184 argIndices, argTypes, argAttrs, argLocs);
185 (void)inserted;
186 assert(succeeded(inserted) &&
187 "expected GPU funcs to support inserting any argument");
188 });
189 } else {
  // One internal, non-constant global per workgroup attribution, named
  // `__wg_<func>_<idx>` and placed in `workgroupAddrSpace`.
190 workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
191 for (auto [idx, attribution] :
192 llvm::enumerate(gpuFuncOp.getWorkgroupAttributionBBArgs())) {
193 auto type = dyn_cast<MemRefType>(attribution.getType());
194 assert(type && type.hasStaticShape() && "unexpected type in attribution");
195
196 uint64_t numElements = type.getNumElements();
197
198 auto elementType =
199 cast<Type>(typeConverter->convertType(type.getElementType()));
200 auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
201 std::string name =
202 std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
  // Alignment stays 0 unless the attribution carries an integer
  // alignment attribute.
203 uint64_t alignment = 0;
204 if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
205 gpuFuncOp.getWorkgroupAttributionAttr(
206 idx, LLVM::LLVMDialect::getAlignAttrName())))
207 alignment = alignAttr.getInt();
208 auto globalOp = LLVM::GlobalOp::create(
209 rewriter, gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
210 LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
211 workgroupAddrSpace);
212 workgroupBuffers.push_back(globalOp);
213 }
214 }
215
216 // Remap proper input types.
217 TypeConverter::SignatureConversion signatureConversion(
218 gpuFuncOp.front().getNumArguments());
219
221 gpuFuncOp.getFunctionType(), /*isVariadic=*/false,
222 getTypeConverter()->getOptions().useBarePtrCallConv, signatureConversion);
223 if (!funcType) {
224 return rewriter.notifyMatchFailure(gpuFuncOp, [&](Diagnostic &diag) {
225 diag << "failed to convert function signature type for: "
226 << gpuFuncOp.getFunctionType();
227 });
228 }
229
  // Captured before the op is rewritten; consumed by the attribute-copy loop
  // at the end of this function.
230 ArrayAttr argAttrs = gpuFuncOp.getArgAttrsAttr();
231
232 FailureOr<LoweredLLVMFuncAttrs> loweredAttrs =
233 buildLoweredGPULLVMFuncAttrs(gpuFuncOp, funcType, rewriter);
234 if (failed(loweredAttrs))
235 return rewriter.notifyMatchFailure(gpuFuncOp,
236 "failed to lower func attributes");
237
238 auto llvmFuncOp = LLVM::LLVMFuncOp::create(rewriter, gpuFuncOp.getLoc(),
239 loweredAttrs->properties,
240 loweredAttrs->discardableAttrs);
241
242 {
243 // Insert operations that correspond to converted workgroup and private
244 // memory attributions to the body of the function. This must operate on
245 // the original function, before the body region is inlined in the new
246 // function to maintain the relation between block arguments and the
247 // parent operation that assigns their semantics.
248 OpBuilder::InsertionGuard guard(rewriter);
249
250 // Rewrite workgroup memory attributions to addresses of global buffers.
251 rewriter.setInsertionPointToStart(&gpuFuncOp.front());
252 unsigned numProperArguments = gpuFuncOp.getNumArguments();
253
254 if (encodeWorkgroupAttributionsAsArguments) {
255 // Build a MemRefDescriptor with each of the arguments added above.
256
257 unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
258 assert(numProperArguments >= numAttributions &&
259 "Expecting attributions to be encoded as arguments already");
260
261 // Arguments encoding workgroup attributions will be in positions
262 // [numProperArguments, numProperArguments+numAttributions)
  // NOTE(review): the slice below starts at
  // numProperArguments - numAttributions, which does not match the range
  // stated in the comment above — confirm which is intended.
263 ArrayRef<BlockArgument> attributionArguments =
264 gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
265 numAttributions);
266 for (auto [idx, vals] : llvm::enumerate(
267 llvm::zip_equal(gpuFuncOp.getWorkgroupAttributionBBArgs(),
268 attributionArguments))) {
269 auto [attribution, arg] = vals;
270 auto type = cast<MemRefType>(attribution.getType());
271
272 // Arguments are of llvm.ptr type and attributions are of memref type:
273 // we need to wrap them in memref descriptors.
275 rewriter, loc, *getTypeConverter(), type, arg);
276
277 // And remap the arguments
278 signatureConversion.remapInput(numProperArguments + idx, descr);
279 }
280 } else {
281 for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
282 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
283 global.getAddrSpace());
284 Value address = LLVM::AddressOfOp::create(rewriter, loc, ptrType,
285 global.getSymNameAttr());
  // GEP with indices {0, 0} over the global's array type yields the
  // address of its first element.
286 Value memory =
287 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getType(),
288 address, ArrayRef<LLVM::GEPArg>{0, 0});
289
290 // Build a memref descriptor pointing to the buffer to plug with the
291 // existing memref infrastructure. This may use more registers than
292 // otherwise necessary given that memref sizes are fixed, but we can try
293 // and canonicalize that away later.
294 Value attribution = gpuFuncOp.getWorkgroupAttributionBBArgs()[idx];
295 auto type = cast<MemRefType>(attribution.getType());
297 rewriter, loc, *getTypeConverter(), type, memory);
298 signatureConversion.remapInput(numProperArguments + idx, descr);
299 }
300 }
301
302 // Rewrite private memory attributions to alloca'ed buffers.
303 unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
304 auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
305 for (const auto [idx, attribution] :
306 llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
307 auto type = cast<MemRefType>(attribution.getType());
308 assert(type && type.hasStaticShape() && "unexpected type in attribution");
309
310 // Explicitly drop memory space when lowering private memory
311 // attributions since NVVM models it as `alloca`s in the default
312 // memory space and does not support `alloca`s with addrspace(5).
313 Type elementType = typeConverter->convertType(type.getElementType());
314 auto ptrType =
315 LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
316 Value numElements = LLVM::ConstantOp::create(
317 rewriter, gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
318 uint64_t alignment = 0;
319 if (auto alignAttr =
320 dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
321 idx, LLVM::LLVMDialect::getAlignAttrName())))
322 alignment = alignAttr.getInt();
323 Value allocated =
324 LLVM::AllocaOp::create(rewriter, gpuFuncOp.getLoc(), ptrType,
325 elementType, numElements, alignment);
327 rewriter, loc, *getTypeConverter(), type, allocated);
  // Private attributions are remapped after the workgroup ones.
328 signatureConversion.remapInput(
329 numProperArguments + numWorkgroupAttributions + idx, descr);
330 }
331 }
332
333 // Move the region to the new function, update the entry block signature.
334 rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
335 llvmFuncOp.end());
336 if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
337 &signatureConversion)))
338 return failure();
339
340 // Get memref type from function arguments and set the noalias to
341 // pointer arguments.
342 for (const auto [idx, argTy] :
343 llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
  // NOTE(review): `remapping` is dereferenced below without an explicit
  // has-value check — presumably every argument has a mapping here; confirm.
344 auto remapping = signatureConversion.getInputMapping(idx);
345 NamedAttrList argAttr =
346 argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) : NamedAttrList();
  // Moves `attrName` from the original argument onto every converted
  // argument it maps to; no-op when the attribute is absent.
347 auto copyAttribute = [&](StringRef attrName) {
348 Attribute attr = argAttr.erase(attrName);
349 if (!attr)
350 return;
351 for (size_t i = 0, e = remapping->size; i < e; ++i)
352 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
353 };
  // Same as copyAttribute, but only sets the attribute on converted
  // arguments of LLVM pointer type, and refuses (with a warning) to copy
  // `noalias` when one original argument expanded into several.
354 auto copyPointerAttribute = [&](StringRef attrName) {
355 Attribute attr = argAttr.erase(attrName);
356
357 if (!attr)
358 return;
359 if (remapping->size > 1 &&
360 attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
361 emitWarning(llvmFuncOp.getLoc(),
362 "Cannot copy noalias with non-bare pointers.\n");
363 return;
364 }
365 for (size_t i = 0, e = remapping->size; i < e; ++i) {
366 if (isa<LLVM::LLVMPointerType>(
367 llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
368 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
369 }
370 }
371 };
372
373 if (argAttr.empty())
374 continue;
375
376 copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
377 copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
378 copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
  // Pointer-only attributes are considered only if at least one of the
  // converted arguments is an LLVM pointer.
379 bool lowersToPointer = false;
380 for (size_t i = 0, e = remapping->size; i < e; ++i) {
381 lowersToPointer |= isa<LLVM::LLVMPointerType>(
382 llvmFuncOp.getArgument(remapping->inputNo + i).getType());
383 }
384
385 if (lowersToPointer) {
386 copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
387 copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
388 copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
389 copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
390 copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
391 copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
392 copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
393 copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
394 copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
395 copyPointerAttribute(
396 LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
397 copyPointerAttribute(
398 LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
399 }
400 }
401 rewriter.eraseOp(gpuFuncOp);
402 return success();
403}
404
// Lowers gpu.printf to a chain of `__ockl_printf_*` calls: one
// `__ockl_printf_begin`, one `__ockl_printf_append_string_n` for the format
// string, then one `__ockl_printf_append_args` per group of up to seven
// arguments. Each call consumes the descriptor returned by the previous one.
// Fails to match when no enclosing symbol-table op is found.
// NOTE(review): the first line of this signature and the declaration of
// `arguments` inside the loop are not visible in this listing.
406 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
407 ConversionPatternRewriter &rewriter) const {
408 Location loc = gpuPrintfOp->getLoc();
409
410 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
411 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
412 mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
413 mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
414
415 Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
416 if (!moduleOp)
417 return rewriter.notifyMatchFailure(gpuPrintfOp,
418 "Couldn't find a parent module");
419
420 auto ocklBegin =
421 getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
422 LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
  // The args-append function is only declared when there are arguments.
423 LLVM::LLVMFuncOp ocklAppendArgs;
424 if (!adaptor.getArgs().empty()) {
425 ocklAppendArgs = getOrDefineFunction(
426 moduleOp, loc, rewriter, "__ockl_printf_append_args",
427 LLVM::LLVMFunctionType::get(
428 llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
429 llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
430 }
431 auto ocklAppendStringN = getOrDefineFunction(
432 moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
433 LLVM::LLVMFunctionType::get(
434 llvmI64,
435 {llvmI64, ptrType, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));
436
437 /// Start the printf hostcall
438 Value zeroI64 = LLVM::ConstantOp::create(rewriter, loc, llvmI64, 0);
439 auto printfBeginCall =
440 LLVM::CallOp::create(rewriter, loc, ocklBegin, zeroI64);
441 Value printfDesc = printfBeginCall.getResult();
442
443 // Create the global op or find an existing one.
444 LLVM::GlobalOp global = getOrCreateStringConstant(
445 rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat());
446
447 // Get a pointer to the format string's first element and pass it to printf()
448 Value globalPtr = LLVM::AddressOfOp::create(
449 rewriter, loc,
450 LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
451 global.getSymNameAttr());
452 Value stringStart =
453 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
454 globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
  // Length is taken from the global's string value, which
  // getOrCreateStringConstant builds with a trailing NUL included.
455 Value stringLen = LLVM::ConstantOp::create(
456 rewriter, loc, llvmI64, cast<StringAttr>(global.getValueAttr()).size());
457
458 Value oneI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 1);
459 Value zeroI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 0);
460
  // The format-string append is marked as the last call only when there are
  // no arguments to follow.
461 auto appendFormatCall = LLVM::CallOp::create(
462 rewriter, loc, ocklAppendStringN,
463 ValueRange{printfDesc, stringStart, stringLen,
464 adaptor.getArgs().empty() ? oneI32 : zeroI32});
465 printfDesc = appendFormatCall.getResult();
466
467 // __ockl_printf_append_args takes 7 values per append call
468 constexpr size_t argsPerAppend = 7;
469 size_t nArgs = adaptor.getArgs().size();
470 for (size_t group = 0; group < nArgs; group += argsPerAppend) {
471 size_t bound = std::min(group + argsPerAppend, nArgs);
472 size_t numArgsThisCall = bound - group;
473
475 arguments.push_back(printfDesc);
476 arguments.push_back(
477 LLVM::ConstantOp::create(rewriter, loc, llvmI32, numArgsThisCall));
478 for (size_t i = group; i < bound; ++i) {
479 Value arg = adaptor.getArgs()[i];
  // Floats are widened to f64 when needed and then bitcast to i64.
480 if (auto floatType = dyn_cast<FloatType>(arg.getType())) {
481 if (!floatType.isF64())
482 arg = LLVM::FPExtOp::create(
483 rewriter, loc, typeConverter->convertType(rewriter.getF64Type()),
484 arg);
485 arg = LLVM::BitcastOp::create(rewriter, loc, llvmI64, arg);
486 }
  // Anything not already 64 bits wide is zero-extended to i64.
  // NOTE(review): this assumes every remaining argument is an integer
  // narrower than 64 bits — confirm against the op's verifier.
487 if (arg.getType().getIntOrFloatBitWidth() != 64)
488 arg = LLVM::ZExtOp::create(rewriter, loc, llvmI64, arg);
489
490 arguments.push_back(arg);
491 }
492 // Pad out to 7 arguments since the hostcall always needs 7
493 for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
494 arguments.push_back(zeroI64);
495 }
496
497 auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
498 arguments.push_back(isLast);
499 auto call = LLVM::CallOp::create(rewriter, loc, ocklAppendArgs, arguments);
500 printfDesc = call.getResult();
501 }
502 rewriter.eraseOp(gpuPrintfOp);
503 return success();
504}
505
// Lowers gpu.printf to a direct variadic call of `funcName`, passing a pointer
// to the format-string global followed by the converted arguments unchanged.
// Both the declaration and the call get `callingConvention`, and the format
// string lives in `addressSpace`.
// Fails to match when no enclosing symbol-table op is found.
// NOTE(review): the first line of this signature is not visible in this
// listing.
507 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
508 ConversionPatternRewriter &rewriter) const {
509 Location loc = gpuPrintfOp->getLoc();
510
511 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
512 mlir::Type ptrType =
513 LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);
514
515 Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
516 if (!moduleOp)
517 return rewriter.notifyMatchFailure(gpuPrintfOp,
518 "Couldn't find a parent module");
519
  // i32 (ptr, ...) — a variadic function taking the format pointer first.
520 auto printfType =
521 LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
522 /*isVarArg=*/true);
523 LLVM::LLVMFuncOp printfDecl =
524 getOrDefineFunction(moduleOp, loc, rewriter, funcName, printfType);
  // Applied even when the declaration already existed.
525 printfDecl.setCConv(callingConvention);
526
527 // Create the global op or find an existing one.
528 LLVM::GlobalOp global = getOrCreateStringConstant(
529 rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat(),
530 /*alignment=*/0, addressSpace);
531
532 // Get a pointer to the format string's first element
533 Value globalPtr = LLVM::AddressOfOp::create(
534 rewriter, loc,
535 LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
536 global.getSymNameAttr());
537 Value stringStart =
538 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
539 globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
540
541 // Construct arguments and function call
542 auto argsRange = adaptor.getArgs();
543 SmallVector<Value, 4> printfArgs;
544 printfArgs.reserve(argsRange.size() + 1);
545 printfArgs.push_back(stringStart);
546 printfArgs.append(argsRange.begin(), argsRange.end());
547
548 auto call = LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs);
549 call.setCConv(callingConvention);
550 rewriter.eraseOp(gpuPrintfOp);
551 return success();
552}
553
// Lowers gpu.printf to a call of `vprintf(ptr format, ptr args)`. The
// arguments are stored into a stack-allocated literal struct (floats are first
// extended to f64) and a pointer to that struct is passed as the second
// operand.
// Fails to match when no enclosing symbol-table op is found.
// NOTE(review): the first line of this signature and the declarations of
// `globalLoc` and `args` are not visible in this listing.
555 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
556 ConversionPatternRewriter &rewriter) const {
557 Location loc = gpuPrintfOp->getLoc();
558
559 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
560 mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
561
562 Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
563 if (!moduleOp)
564 return rewriter.notifyMatchFailure(gpuPrintfOp,
565 "Couldn't find a parent module");
566
567 // Create a valid global location removing any metadata attached to the
568 // location as debug info metadata inside of a function cannot be used outside
569 // of that function.
571
572 auto vprintfType =
573 LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
574 LLVM::LLVMFuncOp vprintfDecl = getOrDefineFunction(
575 moduleOp, globalLoc, rewriter, "vprintf", vprintfType);
576
577 // Create the global op or find an existing one.
578 LLVM::GlobalOp global =
579 getOrCreateStringConstant(rewriter, globalLoc, moduleOp, llvmI8,
580 "printfFormat_", adaptor.getFormat());
581
582 // Get a pointer to the format string's first element
583 Value globalPtr = LLVM::AddressOfOp::create(rewriter, loc, global);
584 Value stringStart =
585 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
586 globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
587 SmallVector<Type> types;
589 // Promote and pack the arguments into a stack allocation.
590 for (Value arg : adaptor.getArgs()) {
591 Type type = arg.getType();
592 Value promotedArg = arg;
593 assert(type.isIntOrFloat());
  // NOTE(review): every float type takes this branch, including f64 —
  // confirm that an f64 -> f64 FPExt is acceptable or folded away.
594 if (isa<FloatType>(type)) {
595 type = rewriter.getF64Type();
596 promotedArg = LLVM::FPExtOp::create(rewriter, loc, type, arg);
597 }
598 types.push_back(type);
599 args.push_back(promotedArg);
600 }
  // One struct field per (promoted) argument, allocated once on the stack.
601 Type structType =
602 LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
603 Value one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(),
604 rewriter.getIndexAttr(1));
605 Value tempAlloc =
606 LLVM::AllocaOp::create(rewriter, loc, ptrType, structType, one,
607 /*alignment=*/0);
  // Store each argument into its field: GEP {0, index} addresses field
  // `index` of the struct.
608 for (auto [index, arg] : llvm::enumerate(args)) {
609 Value ptr = LLVM::GEPOp::create(
610 rewriter, loc, ptrType, structType, tempAlloc,
611 ArrayRef<LLVM::GEPArg>{0, static_cast<int32_t>(index)});
612 LLVM::StoreOp::create(rewriter, loc, arg, ptr);
613 }
614 std::array<Value, 2> printfArgs = {stringStart, tempAlloc};
615
616 LLVM::CallOp::create(rewriter, loc, vprintfDecl, printfArgs);
617 rewriter.eraseOp(gpuPrintfOp);
618 return success();
619}
620
621/// Helper for impl::scalarizeVectorOp. Scalarizes vectors to elements.
622/// Used either directly (for ops on 1D vectors) or as the callback passed to
623/// detail::handleMultidimensionalVectors (for ops on higher-rank vectors).
///
/// For each element index of `llvm1DVectorTy`, extracts that element from
/// every vector operand (non-vector operands are passed through unchanged),
/// creates a scalar op with the same name and attributes as `op`, and inserts
/// its first result into the result vector, which starts out as poison.
/// NOTE(review): the first line of this signature is not visible in this
/// listing; `op` and `operands` are declared there.
625 Type llvm1DVectorTy,
626 ConversionPatternRewriter &rewriter,
627 const LLVMTypeConverter &converter) {
  // NOTE(review): `operandTypes` is not used in the visible body.
628 TypeRange operandTypes(operands);
629 VectorType vectorType = cast<VectorType>(llvm1DVectorTy);
630 Location loc = op->getLoc();
631 Value result = LLVM::PoisonOp::create(rewriter, loc, vectorType);
632 Type indexType = converter.convertType(rewriter.getIndexType());
633 StringAttr name = op->getName().getIdentifier();
634 Type elementType = vectorType.getElementType();
635
636 for (int64_t i = 0; i < vectorType.getNumElements(); ++i) {
637 Value index = LLVM::ConstantOp::create(rewriter, loc, indexType, i);
  // Scalars are broadcast as-is; vectors contribute their i-th element.
638 auto extractElement = [&](Value operand) -> Value {
639 if (!isa<VectorType>(operand.getType()))
640 return operand;
641 return LLVM::ExtractElementOp::create(rewriter, loc, operand, index);
642 };
643 auto scalarOperands = llvm::map_to_vector(operands, extractElement);
  // Re-create the same operation on scalars, carrying over all attributes.
644 Operation *scalarOp =
645 rewriter.create(loc, name, scalarOperands, elementType, op->getAttrs());
646 result = LLVM::InsertElementOp::create(rewriter, loc, result,
647 scalarOp->getResult(0), index);
648 }
649 return result;
650}
651
652/// Unrolls op to array/vector elements.
///
/// If any operand is a vector, `op` is replaced by the result of
/// scalarizeVectorOpHelper using the converted type of the op's first result.
/// Otherwise, if any operand is an LLVM array, scalarizeVectorOpHelper is
/// passed as a per-1D-vector callback to a helper whose call head is not
/// visible in this listing. With neither kind of operand the match fails.
653LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
654 ConversionPatternRewriter &rewriter,
655 const LLVMTypeConverter &converter) {
656 TypeRange operandTypes(operands);
657 if (llvm::any_of(operandTypes, llvm::IsaPred<VectorType>)) {
  // NOTE(review): the cast asserts that the first result type converts to
  // a VectorType — confirm callers guarantee this.
658 VectorType vectorType =
659 cast<VectorType>(converter.convertType(op->getResultTypes()[0]));
660 rewriter.replaceOp(op, scalarizeVectorOpHelper(op, operands, vectorType,
661 rewriter, converter));
662 return success();
663 }
664
665 if (llvm::any_of(operandTypes, llvm::IsaPred<LLVM::LLVMArrayType>)) {
667 op, operands, converter,
668 [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
669 return scalarizeVectorOpHelper(op, operands, llvm1DVectorTy, rewriter,
670 converter);
671 },
672 rewriter);
673 }
674
675 return rewriter.notifyMatchFailure(op, "no llvm.array or vector to unroll");
676}
677
678static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
679 return IntegerAttr::get(IntegerType::get(ctx, 64), space);
680}
681
682/// Generates a symbol with 0-sized array type for dynamic shared memory usage,
683/// or uses existing symbol.
///
/// An existing global in `moduleOp` is reused when it has array type with zero
/// elements, the same address space as `memrefType` converts to, and the same
/// alignment. Otherwise a new internal global is created at the start of the
/// module body under a name derived from "__dynamic_shmem_" that does not
/// collide with any existing global.
/// NOTE(review): the declaration of `symName` (the head of the uniquing call)
/// is not visible in this listing.
684static LLVM::GlobalOp getDynamicSharedMemorySymbol(
685 ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp,
686 gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter,
687 MemRefType memrefType, unsigned alignmentBit) {
  // NOTE(review): the bit alignment is divided by the element bit width, not
  // by 8; this yields bytes only for 8-bit elements — confirm intent.
688 uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth();
689
690 FailureOr<unsigned> addressSpace =
691 typeConverter->getMemRefAddressSpace(memrefType);
  // NOTE(review): on failure an error is emitted but execution continues, and
  // `addressSpace.value()` is still read below — confirm this cannot happen or
  // add an early exit.
692 if (failed(addressSpace)) {
693 op->emitError() << "conversion of memref memory space "
694 << memrefType.getMemorySpace()
695 << " to integer address space "
696 "failed. Consider adding memory space conversions.";
697 }
698
699 // Step 1. Collect symbol names of LLVM::GlobalOp Ops. Also if any of
700 // LLVM::GlobalOp is suitable for shared memory, return it.
701 llvm::StringSet<> existingGlobalNames;
702 for (auto globalOp : moduleOp.getBody()->getOps<LLVM::GlobalOp>()) {
703 existingGlobalNames.insert(globalOp.getSymName());
704 if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(globalOp.getType())) {
705 if (globalOp.getAddrSpace() == addressSpace.value() &&
706 arrayType.getNumElements() == 0 &&
707 globalOp.getAlignment().value_or(0) == alignmentByte) {
708 return globalOp;
709 }
710 }
711 }
712
713 // Step 2. Find a unique symbol name
714 unsigned uniquingCounter = 0;
716 "__dynamic_shmem_",
717 [&](StringRef candidate) {
718 return existingGlobalNames.contains(candidate);
719 },
720 uniquingCounter);
721
722 // Step 3. Generate a global op
  // The guard restores the rewriter's insertion point when this function
  // returns.
723 OpBuilder::InsertionGuard guard(rewriter);
724 rewriter.setInsertionPointToStart(moduleOp.getBody());
725
726 auto zeroSizedArrayType = LLVM::LLVMArrayType::get(
727 typeConverter->convertType(memrefType.getElementType()), 0);
728
729 return LLVM::GlobalOp::create(rewriter, op->getLoc(), zeroSizedArrayType,
730 /*isConstant=*/false, LLVM::Linkage::Internal,
731 symName, /*value=*/Attribute(), alignmentByte,
732 addressSpace.value());
733}
734
// Lowers gpu.dynamic_shared_memory: finds or creates the zero-sized global via
// getDynamicSharedMemorySymbol, takes its address, and replaces the op with a
// memref descriptor whose allocated and aligned pointers both point at it.
// NOTE(review): the first line of this signature is not visible in this
// listing.
736 gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
737 ConversionPatternRewriter &rewriter) const {
738 Location loc = op.getLoc();
739 MemRefType memrefType = op.getResultMemref().getType();
740 Type elementType = typeConverter->convertType(memrefType.getElementType());
741
742 // Step 1: Generate a memref<0xi8> type
  // NOTE(review): the type is built from the converted element type of the
  // result memref, not a hard-coded i8 — presumably the op's result is always
  // an i8 memref; confirm.
743 MemRefLayoutAttrInterface layout = {};
744 auto memrefType0sz =
745 MemRefType::get({0}, elementType, layout, memrefType.getMemorySpace());
746
747 // Step 2: Generate a global symbol or existing for the dynamic shared
748 // memory with memref<0xi8> type
  // NOTE(review): `moduleOp` is not checked for null before use — presumably
  // the op always lives inside a gpu.module; confirm.
749 auto moduleOp = op->getParentOfType<gpu::GPUModuleOp>();
750 LLVM::GlobalOp shmemOp = getDynamicSharedMemorySymbol(
751 rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit);
752
753 // Step 3. Get address of the global symbol
754 OpBuilder::InsertionGuard guard(rewriter);
755 rewriter.setInsertionPoint(op);
756 auto basePtr = LLVM::AddressOfOp::create(rewriter, loc, shmemOp);
757 Type baseType = basePtr->getResultTypes().front();
758
759 // Step 4. Generate GEP using offsets
760 SmallVector<LLVM::GEPArg> gepArgs = {0};
761 Value shmemPtr = LLVM::GEPOp::create(rewriter, loc, baseType, elementType,
762 basePtr, gepArgs);
763 // Step 5. Create a memref descriptor
764 SmallVector<Value> shape, strides;
765 Value sizeBytes;
766 getMemRefDescriptorSizes(loc, memrefType0sz, {}, rewriter, shape, strides,
767 sizeBytes);
768 auto memRefDescriptor = this->createMemRefDescriptor(
769 loc, memrefType0sz, shmemPtr, shmemPtr, shape, strides, rewriter);
770
771 // Step 6. Replace the op with memref descriptor
772 rewriter.replaceOp(op, {memRefDescriptor});
773 return success();
774}
775
// Lowers gpu.return to llvm.return. With zero or one operand the (converted)
// operand is returned directly; with more, the operands are packed into an
// LLVM struct first. Under the bare-pointer calling convention, convertible
// memref operands are replaced by a pointer taken from their descriptor and
// unranked memrefs make the pattern fail.
// NOTE(review): the first line of this signature is not visible in this
// listing.
777 gpu::ReturnOp op, OpAdaptor adaptor,
778 ConversionPatternRewriter &rewriter) const {
779 Location loc = op.getLoc();
780 unsigned numArguments = op.getNumOperands();
781 SmallVector<Value, 4> updatedOperands;
782
783 bool useBarePtrCallConv = getTypeConverter()->getOptions().useBarePtrCallConv;
784 if (useBarePtrCallConv) {
785 // For the bare-ptr calling convention, extract the aligned pointer to
786 // be returned from the memref descriptor.
  // NOTE(review): the code below reads allocatedPtr(), not the aligned
  // pointer mentioned in the comment above — confirm which is intended.
787 for (auto it : llvm::zip(op->getOperands(), adaptor.getOperands())) {
788 Type oldTy = std::get<0>(it).getType();
789 Value newOperand = std::get<1>(it);
790 if (isa<MemRefType>(oldTy) && getTypeConverter()->canConvertToBarePtr(
791 cast<BaseMemRefType>(oldTy))) {
792 MemRefDescriptor memrefDesc(newOperand);
793 newOperand = memrefDesc.allocatedPtr(rewriter, loc);
794 } else if (isa<UnrankedMemRefType>(oldTy)) {
795 // Unranked memref is not supported in the bare pointer calling
796 // convention.
797 return failure();
798 }
799 updatedOperands.push_back(newOperand);
800 }
801 } else {
802 updatedOperands = llvm::to_vector<4>(adaptor.getOperands());
  // The result of copyUnrankedDescriptors is deliberately discarded.
803 (void)copyUnrankedDescriptors(rewriter, loc, op.getOperands().getTypes(),
804 updatedOperands,
805 /*toDynamic=*/true);
806 }
807
808 // If ReturnOp has 0 or 1 operand, create it and return immediately.
809 if (numArguments <= 1) {
810 rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(
811 op, TypeRange(), updatedOperands, op->getAttrs());
812 return success();
813 }
814
815 // Otherwise, we need to pack the arguments into an LLVM struct type before
816 // returning.
817 auto packedType = getTypeConverter()->packFunctionResults(
818 op.getOperandTypes(), useBarePtrCallConv);
819 if (!packedType) {
820 return rewriter.notifyMatchFailure(op, "could not convert result types");
821 }
822
  // Start from a poison struct and insert each operand at its position.
823 Value packed = LLVM::PoisonOp::create(rewriter, loc, packedType);
824 for (auto [idx, operand] : llvm::enumerate(updatedOperands)) {
825 packed = LLVM::InsertValueOp::create(rewriter, loc, packed, operand, idx);
826 }
827 rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, TypeRange(), packed,
828 op->getAttrs());
829 return success();
830}
831
// Registers a type-attribute conversion on `typeConverter` that rewrites a
// gpu::AddressSpaceAttr memory space on memref types into the numeric address
// space returned by `mapping`, wrapped as a 64-bit integer attribute.
// NOTE(review): the first line of this signature is not visible in this
// listing.
833 TypeConverter &typeConverter, const MemorySpaceMapping &mapping) {
834 typeConverter.addTypeAttributeConversion(
  // `mapping` is captured by value, so the callback keeps its own copy.
835 [mapping](BaseMemRefType type, gpu::AddressSpaceAttr memorySpaceAttr) {
836 gpu::AddressSpace memorySpace = memorySpaceAttr.getValue();
837 unsigned addressSpace = mapping(memorySpace);
838 return wrapNumericMemorySpace(memorySpaceAttr.getContext(),
839 addressSpace);
840 });
841}
return success()
static SmallString< 16 > getUniqueSymbolName(Operation *moduleOp, StringRef prefix)
static LLVM::GlobalOp getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp, gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter, MemRefType memrefType, unsigned alignmentBit)
Generates a symbol with a 0-sized array type for dynamic shared memory usage, or reuses an existing symbol.
static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space)
static Value scalarizeVectorOpHelper(Operation *op, ValueRange operands, Type llvm1DVectorTy, ConversionPatternRewriter &rewriter, const LLVMTypeConverter &converter)
Helper for impl::scalarizeVectorOp.
b
Return true if permutation is a valid permutation of the outer_dims_perm (case OuterOrInnerPerm::Oute...
...if copies could not be generated due to yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart` in `copyPlacementBlock` specify the insertion points where the incoming copies and outgoing copies should be inserted (the insertion happens right before the insertion point). Since `begin` can itself be invalidated due to the memref rewriting done from this method...
static std::string diag(const llvm::Value &value)
Attributes are known-constant values of operations.
Definition Attributes.h:25
This class provides a shared interface for ranked and unranked memref types.
This class represents an argument of a Block.
Definition Value.h:306
UnitAttr getUnitAttr()
Definition Builders.cpp:102
StringAttr getStringAttr(const Twine &bytes)
Definition Builders.cpp:266
MLIRContext * getContext() const
Definition Builders.h:56
typename gpu::GPUFuncOp::Adaptor OpAdaptor
Definition Pattern.h:229
MemRefDescriptor createMemRefDescriptor(Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr, ArrayRef< Value > sizes, ArrayRef< Value > strides, ConversionPatternRewriter &rewriter) const
Creates and populates a canonical memref descriptor struct.
Definition Pattern.cpp:202
void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, ValueRange dynamicSizes, ConversionPatternRewriter &rewriter, SmallVectorImpl< Value > &sizes, SmallVectorImpl< Value > &strides, Value &size, bool sizeInBytes=true) const
Computes sizes, strides and buffer size of memRefType with identity layout.
Definition Pattern.cpp:90
const LLVMTypeConverter * getTypeConverter() const
Definition Pattern.cpp:29
LLVM::LLVMDialect & getDialect() const
Returns the LLVM dialect.
Definition Pattern.cpp:34
LogicalResult copyUnrankedDescriptors(OpBuilder &builder, Location loc, TypeRange origTypes, SmallVectorImpl< Value > &operands, bool toDynamic) const
Copies the memory descriptor for any operands that were unranked descriptors originally to heap-alloc...
Definition Pattern.cpp:290
This class contains all of the information necessary to report a diagnostic to the DiagnosticEngine.
An instance of this location represents a tuple of file, line number, and column number.
Definition Location.h:174
Conversion from types to the LLVM IR dialect.
Type packFunctionResults(TypeRange types, bool useBarePointerCallConv=false, SmallVector< SmallVector< Type > > *groupedTypes=nullptr, int64_t *numConvertedTypes=nullptr) const
Convert a non-empty list of types to be returned from a function into an LLVM-compatible type.
Type convertFunctionSignature(FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, SignatureConversion &result) const
Convert a function type.
const LowerToLLVMOptions & getOptions() const
FailureOr< unsigned > getMemRefAddressSpace(BaseMemRefType type) const
Return the LLVM address space corresponding to the memory space of the memref type `type`, or failure if...
LocationAttr findInstanceOfOrUnknown()
Return an instance of the given location type if one is nested under the current location else return...
Definition Location.h:60
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
Helper class to produce LLVM dialect operations extracting or inserting elements of a MemRef descript...
static MemRefDescriptor fromStaticShape(OpBuilder &builder, Location loc, const LLVMTypeConverter &typeConverter, MemRefType type, Value memory)
Builds IR creating a MemRef descriptor that represents type and populates it with static shape and st...
Value allocatedPtr(OpBuilder &builder, Location loc)
Builds IR extracting the allocated pointer from the descriptor.
NamedAttrList is an array of NamedAttributes that tracks whether it is sorted and does some basic work t...
Attribute erase(StringAttr name)
Erase the attribute with the given name from the list.
void append(StringRef name, Attribute attr)
Add an attribute with the specified name.
RAII guard to reset the insertion point of the builder when destroyed.
Definition Builders.h:350
This class helps build Operations.
Definition Builders.h:209
A trait used to provide symbol table functionalities to a region operation.
StringAttr getIdentifier() const
Return the name of this operation as a StringAttr.
Operation is the basic unit of execution within MLIR.
Definition Operation.h:88
Region & getRegion(unsigned index)
Returns the region held by this operation at position 'index'.
Definition Operation.h:712
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition Operation.h:538
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition Operation.h:433
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Definition Operation.h:274
Location getLoc()
The source location the operation was defined or derived from.
Definition Operation.h:241
OperationName getName()
The name of an operation is the key identifier for it.
Definition Operation.h:116
result_type_range getResultTypes()
Definition Operation.h:454
Block & front()
Definition Region.h:65
iterator_range< OpIterator > getOps()
Definition Region.h:172
static SmallString< N > generateSymbolName(StringRef name, UniqueChecker uniqueChecker, unsigned &uniquingCounter)
Generate a unique symbol name.
static Operation * lookupSymbolIn(Operation *op, StringAttr symbol)
Returns the operation registered with the given symbol name within the regions of 'symbolTableOp'.
This class provides an abstraction over the various different ranges of value types.
Definition TypeRange.h:40
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition Types.h:74
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition Types.cpp:118
unsigned getIntOrFloatBitWidth() const
Return the bit width of an integer or a float type, assert failure on other types.
Definition Types.cpp:124
This class provides an abstraction over the different types of ranges over Values.
Definition ValueRange.h:389
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Type getType() const
Return the type of this value.
Definition Value.h:105
LogicalResult handleMultidimensionalVectors(Operation *op, ValueRange operands, const LLVMTypeConverter &typeConverter, std::function< Value(Type, ValueRange)> createOperand, ConversionPatternRewriter &rewriter)
LogicalResult scalarizeVectorOp(Operation *op, ValueRange operands, ConversionPatternRewriter &rewriter, const LLVMTypeConverter &converter)
Unrolls op to array/vector elements.
Include the generated interface declarations.
InFlightDiagnostic emitWarning(Location loc)
Utility method to emit a warning message using this location.
FailureOr< LoweredLLVMFuncAttrs > lowerDiscardableAttrsForLLVMFunc(FunctionOpInterface funcOp, Type llvmFuncType)
Partition funcOp's discardables for llvm.func: sym_name, function_type, and typed properties from llv...
LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc, OpBuilder &b, StringRef name, LLVM::LLVMFunctionType type)
Note that these functions don't take a SymbolTable because GPU module lowerings can have name collisi...
std::function< unsigned(gpu::AddressSpace)> MemorySpaceMapping
A function that maps a MemorySpace enum to a target-specific integer value.
detail::DenseArrayAttrImpl< int32_t > DenseI32ArrayAttr
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
LLVM::GlobalOp getOrCreateStringConstant(OpBuilder &b, Location loc, Operation *moduleOp, Type llvmI8, StringRef namePrefix, StringRef str, uint64_t alignment=0, unsigned addrSpace=0)
Create a global that contains the given string.
LogicalResult matchAndRewrite(gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
FailureOr< LoweredLLVMFuncAttrs > buildLoweredGPULLVMFuncAttrs(gpu::GPUFuncOp gpuFuncOp, Type llvmFuncType, OpBuilder &rewriter) const
Lower discardable attrs like func lowering, then set llvm.func properties and append GPU / target-spe...
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::ReturnOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override