MLIR  19.0.0git
GPUOpsLowering.cpp
Go to the documentation of this file.
1 //===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "GPUOpsLowering.h"
10 
13 #include "mlir/IR/Attributes.h"
14 #include "mlir/IR/Builders.h"
15 #include "mlir/IR/BuiltinTypes.h"
16 #include "llvm/ADT/SmallVectorExtras.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/FormatVariadic.h"
19 
20 using namespace mlir;
21 
23 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
24  ConversionPatternRewriter &rewriter) const {
25  Location loc = gpuFuncOp.getLoc();
26 
27  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
28  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
29  for (const auto [idx, attribution] :
30  llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
31  auto type = dyn_cast<MemRefType>(attribution.getType());
32  assert(type && type.hasStaticShape() && "unexpected type in attribution");
33 
34  uint64_t numElements = type.getNumElements();
35 
36  auto elementType =
37  cast<Type>(typeConverter->convertType(type.getElementType()));
38  auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
39  std::string name =
40  std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
41  uint64_t alignment = 0;
42  if (auto alignAttr =
43  dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getWorkgroupAttributionAttr(
44  idx, LLVM::LLVMDialect::getAlignAttrName())))
45  alignment = alignAttr.getInt();
46  auto globalOp = rewriter.create<LLVM::GlobalOp>(
47  gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
48  LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
49  workgroupAddrSpace);
50  workgroupBuffers.push_back(globalOp);
51  }
52 
53  // Remap proper input types.
54  TypeConverter::SignatureConversion signatureConversion(
55  gpuFuncOp.front().getNumArguments());
56 
58  gpuFuncOp.getFunctionType(), /*isVariadic=*/false,
59  getTypeConverter()->getOptions().useBarePtrCallConv, signatureConversion);
60  if (!funcType) {
61  return rewriter.notifyMatchFailure(gpuFuncOp, [&](Diagnostic &diag) {
62  diag << "failed to convert function signature type for: "
63  << gpuFuncOp.getFunctionType();
64  });
65  }
66 
67  // Create the new function operation. Only copy those attributes that are
68  // not specific to function modeling.
70  ArrayAttr argAttrs;
71  for (const auto &attr : gpuFuncOp->getAttrs()) {
72  if (attr.getName() == SymbolTable::getSymbolAttrName() ||
73  attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
74  attr.getName() ==
75  gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
76  attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
77  attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
78  attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
79  attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
80  continue;
81  if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
82  argAttrs = gpuFuncOp.getArgAttrsAttr();
83  continue;
84  }
85  attributes.push_back(attr);
86  }
87 
88  DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
89  DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
90  // Ensure we don't lose information if the function is lowered before its
91  // surrounding context.
92  auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
93  if (knownBlockSize)
94  attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
95  knownBlockSize);
96  if (knownGridSize)
97  attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
98  knownGridSize);
99 
100  // Add a dialect specific kernel attribute in addition to GPU kernel
101  // attribute. The former is necessary for further translation while the
102  // latter is expected by gpu.launch_func.
103  if (gpuFuncOp.isKernel()) {
104  attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
105  // Set the dialect-specific block size attribute if there is one.
106  if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
107  attributes.emplace_back(kernelBlockSizeAttributeName.value(),
108  knownBlockSize);
109  }
110  }
111  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
112  gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
113  LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
114  /*comdat=*/nullptr, attributes);
115 
116  {
117  // Insert operations that correspond to converted workgroup and private
118  // memory attributions to the body of the function. This must operate on
119  // the original function, before the body region is inlined in the new
120  // function to maintain the relation between block arguments and the
121  // parent operation that assigns their semantics.
122  OpBuilder::InsertionGuard guard(rewriter);
123 
124  // Rewrite workgroup memory attributions to addresses of global buffers.
125  rewriter.setInsertionPointToStart(&gpuFuncOp.front());
126  unsigned numProperArguments = gpuFuncOp.getNumArguments();
127 
128  for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
129  auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
130  global.getAddrSpace());
131  Value address = rewriter.create<LLVM::AddressOfOp>(
132  loc, ptrType, global.getSymNameAttr());
133  Value memory =
134  rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
135  ArrayRef<LLVM::GEPArg>{0, 0});
136 
137  // Build a memref descriptor pointing to the buffer to plug with the
138  // existing memref infrastructure. This may use more registers than
139  // otherwise necessary given that memref sizes are fixed, but we can try
140  // and canonicalize that away later.
141  Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
142  auto type = cast<MemRefType>(attribution.getType());
144  rewriter, loc, *getTypeConverter(), type, memory);
145  signatureConversion.remapInput(numProperArguments + idx, descr);
146  }
147 
148  // Rewrite private memory attributions to alloca'ed buffers.
149  unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
150  auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
151  for (const auto [idx, attribution] :
152  llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
153  auto type = cast<MemRefType>(attribution.getType());
154  assert(type && type.hasStaticShape() && "unexpected type in attribution");
155 
156  // Explicitly drop memory space when lowering private memory
157  // attributions since NVVM models it as `alloca`s in the default
158  // memory space and does not support `alloca`s with addrspace(5).
159  Type elementType = typeConverter->convertType(type.getElementType());
160  auto ptrType =
161  LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
162  Value numElements = rewriter.create<LLVM::ConstantOp>(
163  gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
164  uint64_t alignment = 0;
165  if (auto alignAttr =
166  dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
167  idx, LLVM::LLVMDialect::getAlignAttrName())))
168  alignment = alignAttr.getInt();
169  Value allocated = rewriter.create<LLVM::AllocaOp>(
170  gpuFuncOp.getLoc(), ptrType, elementType, numElements, alignment);
172  rewriter, loc, *getTypeConverter(), type, allocated);
173  signatureConversion.remapInput(
174  numProperArguments + numWorkgroupAttributions + idx, descr);
175  }
176  }
177 
178  // Move the region to the new function, update the entry block signature.
179  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
180  llvmFuncOp.end());
181  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
182  &signatureConversion)))
183  return failure();
184 
185  // If bare memref pointers are being used, remap them back to memref
186  // descriptors This must be done after signature conversion to get rid of the
187  // unrealized casts.
188  if (getTypeConverter()->getOptions().useBarePtrCallConv) {
189  OpBuilder::InsertionGuard guard(rewriter);
190  rewriter.setInsertionPointToStart(&llvmFuncOp.getBody().front());
191  for (const auto [idx, argTy] :
192  llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
193  auto memrefTy = dyn_cast<MemRefType>(argTy);
194  if (!memrefTy)
195  continue;
196  assert(memrefTy.hasStaticShape() &&
197  "Bare pointer convertion used with dynamically-shaped memrefs");
198  // Use a placeholder when replacing uses of the memref argument to prevent
199  // circular replacements.
200  auto remapping = signatureConversion.getInputMapping(idx);
201  assert(remapping && remapping->size == 1 &&
202  "Type converter should produce 1-to-1 mapping for bare memrefs");
203  BlockArgument newArg =
204  llvmFuncOp.getBody().getArgument(remapping->inputNo);
205  auto placeholder = rewriter.create<LLVM::UndefOp>(
206  loc, getTypeConverter()->convertType(memrefTy));
207  rewriter.replaceUsesOfBlockArgument(newArg, placeholder);
209  rewriter, loc, *getTypeConverter(), memrefTy, newArg);
210  rewriter.replaceOp(placeholder, {desc});
211  }
212  }
213 
214  // Get memref type from function arguments and set the noalias to
215  // pointer arguments.
216  for (const auto [idx, argTy] :
217  llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
218  auto remapping = signatureConversion.getInputMapping(idx);
219  NamedAttrList argAttr =
220  argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) : NamedAttrList();
221  auto copyAttribute = [&](StringRef attrName) {
222  Attribute attr = argAttr.erase(attrName);
223  if (!attr)
224  return;
225  for (size_t i = 0, e = remapping->size; i < e; ++i)
226  llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
227  };
228  auto copyPointerAttribute = [&](StringRef attrName) {
229  Attribute attr = argAttr.erase(attrName);
230 
231  if (!attr)
232  return;
233  if (remapping->size > 1 &&
234  attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
235  emitWarning(llvmFuncOp.getLoc(),
236  "Cannot copy noalias with non-bare pointers.\n");
237  return;
238  }
239  for (size_t i = 0, e = remapping->size; i < e; ++i) {
240  if (isa<LLVM::LLVMPointerType>(
241  llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
242  llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
243  }
244  }
245  };
246 
247  if (argAttr.empty())
248  continue;
249 
250  copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
251  copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
252  copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
253  bool lowersToPointer = false;
254  for (size_t i = 0, e = remapping->size; i < e; ++i) {
255  lowersToPointer |= isa<LLVM::LLVMPointerType>(
256  llvmFuncOp.getArgument(remapping->inputNo + i).getType());
257  }
258 
259  if (lowersToPointer) {
260  copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
261  copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
262  copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
263  copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
264  copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
265  copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
266  copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
267  copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
268  copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
269  copyPointerAttribute(
270  LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
271  }
272  }
273  rewriter.eraseOp(gpuFuncOp);
274  return success();
275 }
276 
277 static SmallString<16> getUniqueFormatGlobalName(gpu::GPUModuleOp moduleOp) {
278  const char formatStringPrefix[] = "printfFormat_";
279  // Get a unique global name.
280  unsigned stringNumber = 0;
281  SmallString<16> stringConstName;
282  do {
283  stringConstName.clear();
284  (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
285  } while (moduleOp.lookupSymbol(stringConstName));
286  return stringConstName;
287 }
288 
289 template <typename T>
290 static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc,
291  ConversionPatternRewriter &rewriter,
292  StringRef name,
293  LLVM::LLVMFunctionType type) {
294  LLVM::LLVMFuncOp ret;
295  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
296  ConversionPatternRewriter::InsertionGuard guard(rewriter);
297  rewriter.setInsertionPointToStart(moduleOp.getBody());
298  ret = rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
299  LLVM::Linkage::External);
300  }
301  return ret;
302 }
303 
305  gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
306  ConversionPatternRewriter &rewriter) const {
307  Location loc = gpuPrintfOp->getLoc();
308 
309  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
310  auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
311  mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
312  mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
313  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
314  // This ensures that global constants and declarations are placed within
315  // the device code, not the host code
316  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
317 
318  auto ocklBegin =
319  getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
320  LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
321  LLVM::LLVMFuncOp ocklAppendArgs;
322  if (!adaptor.getArgs().empty()) {
323  ocklAppendArgs = getOrDefineFunction(
324  moduleOp, loc, rewriter, "__ockl_printf_append_args",
326  llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
327  llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
328  }
329  auto ocklAppendStringN = getOrDefineFunction(
330  moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
332  llvmI64,
333  {llvmI64, ptrType, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));
334 
335  /// Start the printf hostcall
336  Value zeroI64 = rewriter.create<LLVM::ConstantOp>(loc, llvmI64, 0);
337  auto printfBeginCall = rewriter.create<LLVM::CallOp>(loc, ocklBegin, zeroI64);
338  Value printfDesc = printfBeginCall.getResult();
339 
340  // Get a unique global name for the format.
341  SmallString<16> stringConstName = getUniqueFormatGlobalName(moduleOp);
342 
343  llvm::SmallString<20> formatString(adaptor.getFormat());
344  formatString.push_back('\0'); // Null terminate for C
345  size_t formatStringSize = formatString.size_in_bytes();
346 
347  auto globalType = LLVM::LLVMArrayType::get(llvmI8, formatStringSize);
348  LLVM::GlobalOp global;
349  {
351  rewriter.setInsertionPointToStart(moduleOp.getBody());
352  global = rewriter.create<LLVM::GlobalOp>(
353  loc, globalType,
354  /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
355  rewriter.getStringAttr(formatString));
356  }
357 
358  // Get a pointer to the format string's first element and pass it to printf()
359  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
360  loc,
361  LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
362  global.getSymNameAttr());
363  Value stringStart = rewriter.create<LLVM::GEPOp>(
364  loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
365  Value stringLen =
366  rewriter.create<LLVM::ConstantOp>(loc, llvmI64, formatStringSize);
367 
368  Value oneI32 = rewriter.create<LLVM::ConstantOp>(loc, llvmI32, 1);
369  Value zeroI32 = rewriter.create<LLVM::ConstantOp>(loc, llvmI32, 0);
370 
371  auto appendFormatCall = rewriter.create<LLVM::CallOp>(
372  loc, ocklAppendStringN,
373  ValueRange{printfDesc, stringStart, stringLen,
374  adaptor.getArgs().empty() ? oneI32 : zeroI32});
375  printfDesc = appendFormatCall.getResult();
376 
377  // __ockl_printf_append_args takes 7 values per append call
378  constexpr size_t argsPerAppend = 7;
379  size_t nArgs = adaptor.getArgs().size();
380  for (size_t group = 0; group < nArgs; group += argsPerAppend) {
381  size_t bound = std::min(group + argsPerAppend, nArgs);
382  size_t numArgsThisCall = bound - group;
383 
385  arguments.push_back(printfDesc);
386  arguments.push_back(
387  rewriter.create<LLVM::ConstantOp>(loc, llvmI32, numArgsThisCall));
388  for (size_t i = group; i < bound; ++i) {
389  Value arg = adaptor.getArgs()[i];
390  if (auto floatType = dyn_cast<FloatType>(arg.getType())) {
391  if (!floatType.isF64())
392  arg = rewriter.create<LLVM::FPExtOp>(
393  loc, typeConverter->convertType(rewriter.getF64Type()), arg);
394  arg = rewriter.create<LLVM::BitcastOp>(loc, llvmI64, arg);
395  }
396  if (arg.getType().getIntOrFloatBitWidth() != 64)
397  arg = rewriter.create<LLVM::ZExtOp>(loc, llvmI64, arg);
398 
399  arguments.push_back(arg);
400  }
401  // Pad out to 7 arguments since the hostcall always needs 7
402  for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
403  arguments.push_back(zeroI64);
404  }
405 
406  auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
407  arguments.push_back(isLast);
408  auto call = rewriter.create<LLVM::CallOp>(loc, ocklAppendArgs, arguments);
409  printfDesc = call.getResult();
410  }
411  rewriter.eraseOp(gpuPrintfOp);
412  return success();
413 }
414 
416  gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
417  ConversionPatternRewriter &rewriter) const {
418  Location loc = gpuPrintfOp->getLoc();
419 
420  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
421  mlir::Type ptrType =
422  LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);
423 
424  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
425  // This ensures that global constants and declarations are placed within
426  // the device code, not the host code
427  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
428 
429  auto printfType =
430  LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
431  /*isVarArg=*/true);
432  LLVM::LLVMFuncOp printfDecl =
433  getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);
434 
435  // Get a unique global name for the format.
436  SmallString<16> stringConstName = getUniqueFormatGlobalName(moduleOp);
437 
438  llvm::SmallString<20> formatString(adaptor.getFormat());
439  formatString.push_back('\0'); // Null terminate for C
440  auto globalType =
441  LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes());
442  LLVM::GlobalOp global;
443  {
445  rewriter.setInsertionPointToStart(moduleOp.getBody());
446  global = rewriter.create<LLVM::GlobalOp>(
447  loc, globalType,
448  /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
449  rewriter.getStringAttr(formatString), /*allignment=*/0, addressSpace);
450  }
451 
452  // Get a pointer to the format string's first element
453  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
454  loc,
455  LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
456  global.getSymNameAttr());
457  Value stringStart = rewriter.create<LLVM::GEPOp>(
458  loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
459 
460  // Construct arguments and function call
461  auto argsRange = adaptor.getArgs();
462  SmallVector<Value, 4> printfArgs;
463  printfArgs.reserve(argsRange.size() + 1);
464  printfArgs.push_back(stringStart);
465  printfArgs.append(argsRange.begin(), argsRange.end());
466 
467  rewriter.create<LLVM::CallOp>(loc, printfDecl, printfArgs);
468  rewriter.eraseOp(gpuPrintfOp);
469  return success();
470 }
471 
473  gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
474  ConversionPatternRewriter &rewriter) const {
475  Location loc = gpuPrintfOp->getLoc();
476 
477  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
478  mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
479 
480  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
481  // This ensures that global constants and declarations are placed within
482  // the device code, not the host code
483  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
484 
485  auto vprintfType =
486  LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
487  LLVM::LLVMFuncOp vprintfDecl =
488  getOrDefineFunction(moduleOp, loc, rewriter, "vprintf", vprintfType);
489 
490  // Get a unique global name for the format.
491  SmallString<16> stringConstName = getUniqueFormatGlobalName(moduleOp);
492 
493  llvm::SmallString<20> formatString(adaptor.getFormat());
494  formatString.push_back('\0'); // Null terminate for C
495  auto globalType =
496  LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes());
497  LLVM::GlobalOp global;
498  {
500  rewriter.setInsertionPointToStart(moduleOp.getBody());
501  global = rewriter.create<LLVM::GlobalOp>(
502  loc, globalType,
503  /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
504  rewriter.getStringAttr(formatString), /*allignment=*/0);
505  }
506 
507  // Get a pointer to the format string's first element
508  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
509  Value stringStart = rewriter.create<LLVM::GEPOp>(
510  loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
511  SmallVector<Type> types;
512  SmallVector<Value> args;
513  // Promote and pack the arguments into a stack allocation.
514  for (Value arg : adaptor.getArgs()) {
515  Type type = arg.getType();
516  Value promotedArg = arg;
517  assert(type.isIntOrFloat());
518  if (isa<FloatType>(type)) {
519  type = rewriter.getF64Type();
520  promotedArg = rewriter.create<LLVM::FPExtOp>(loc, type, arg);
521  }
522  types.push_back(type);
523  args.push_back(promotedArg);
524  }
525  Type structType =
526  LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
527  Value one = rewriter.create<LLVM::ConstantOp>(loc, rewriter.getI64Type(),
528  rewriter.getIndexAttr(1));
529  Value tempAlloc =
530  rewriter.create<LLVM::AllocaOp>(loc, ptrType, structType, one,
531  /*alignment=*/0);
532  for (auto [index, arg] : llvm::enumerate(args)) {
533  Value ptr = rewriter.create<LLVM::GEPOp>(
534  loc, ptrType, structType, tempAlloc,
535  ArrayRef<LLVM::GEPArg>{0, static_cast<int32_t>(index)});
536  rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
537  }
538  std::array<Value, 2> printfArgs = {stringStart, tempAlloc};
539 
540  rewriter.create<LLVM::CallOp>(loc, vprintfDecl, printfArgs);
541  rewriter.eraseOp(gpuPrintfOp);
542  return success();
543 }
544 
545 /// Unrolls op if it's operating on vectors.
547  ConversionPatternRewriter &rewriter,
548  const LLVMTypeConverter &converter) {
549  TypeRange operandTypes(operands);
550  if (llvm::none_of(operandTypes, llvm::IsaPred<VectorType>)) {
551  return rewriter.notifyMatchFailure(op, "expected vector operand");
552  }
553  if (op->getNumRegions() != 0 || op->getNumSuccessors() != 0)
554  return rewriter.notifyMatchFailure(op, "expected no region/successor");
555  if (op->getNumResults() != 1)
556  return rewriter.notifyMatchFailure(op, "expected single result");
557  VectorType vectorType = dyn_cast<VectorType>(op->getResult(0).getType());
558  if (!vectorType)
559  return rewriter.notifyMatchFailure(op, "expected vector result");
560 
561  Location loc = op->getLoc();
562  Value result = rewriter.create<LLVM::UndefOp>(loc, vectorType);
563  Type indexType = converter.convertType(rewriter.getIndexType());
564  StringAttr name = op->getName().getIdentifier();
565  Type elementType = vectorType.getElementType();
566 
567  for (int64_t i = 0; i < vectorType.getNumElements(); ++i) {
568  Value index = rewriter.create<LLVM::ConstantOp>(loc, indexType, i);
569  auto extractElement = [&](Value operand) -> Value {
570  if (!isa<VectorType>(operand.getType()))
571  return operand;
572  return rewriter.create<LLVM::ExtractElementOp>(loc, operand, index);
573  };
574  auto scalarOperands = llvm::map_to_vector(operands, extractElement);
575  Operation *scalarOp =
576  rewriter.create(loc, name, scalarOperands, elementType, op->getAttrs());
577  result = rewriter.create<LLVM::InsertElementOp>(
578  loc, result, scalarOp->getResult(0), index);
579  }
580 
581  rewriter.replaceOp(op, result);
582  return success();
583 }
584 
585 static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
586  return IntegerAttr::get(IntegerType::get(ctx, 64), space);
587 }
588 
589 /// Generates a symbol with 0-sized array type for dynamic shared memory usage,
590 /// or uses existing symbol.
591 LLVM::GlobalOp
593  Operation *moduleOp, gpu::DynamicSharedMemoryOp op,
594  const LLVMTypeConverter *typeConverter,
595  MemRefType memrefType, unsigned alignmentBit) {
596  uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth();
597 
598  FailureOr<unsigned> addressSpace =
599  typeConverter->getMemRefAddressSpace(memrefType);
600  if (failed(addressSpace)) {
601  op->emitError() << "conversion of memref memory space "
602  << memrefType.getMemorySpace()
603  << " to integer address space "
604  "failed. Consider adding memory space conversions.";
605  }
606 
607  // Step 1. Collect symbol names of LLVM::GlobalOp Ops. Also if any of
608  // LLVM::GlobalOp is suitable for shared memory, return it.
609  llvm::StringSet<> existingGlobalNames;
610  for (auto globalOp :
611  moduleOp->getRegion(0).front().getOps<LLVM::GlobalOp>()) {
612  existingGlobalNames.insert(globalOp.getSymName());
613  if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(globalOp.getType())) {
614  if (globalOp.getAddrSpace() == addressSpace.value() &&
615  arrayType.getNumElements() == 0 &&
616  globalOp.getAlignment().value_or(0) == alignmentByte) {
617  return globalOp;
618  }
619  }
620  }
621 
622  // Step 2. Find a unique symbol name
623  unsigned uniquingCounter = 0;
624  SmallString<128> symName = SymbolTable::generateSymbolName<128>(
625  "__dynamic_shmem_",
626  [&](StringRef candidate) {
627  return existingGlobalNames.contains(candidate);
628  },
629  uniquingCounter);
630 
631  // Step 3. Generate a global op
632  OpBuilder::InsertionGuard guard(rewriter);
633  rewriter.setInsertionPoint(&moduleOp->getRegion(0).front().front());
634 
635  auto zeroSizedArrayType = LLVM::LLVMArrayType::get(
636  typeConverter->convertType(memrefType.getElementType()), 0);
637 
638  return rewriter.create<LLVM::GlobalOp>(
639  op->getLoc(), zeroSizedArrayType, /*isConstant=*/false,
640  LLVM::Linkage::Internal, symName, /*value=*/Attribute(), alignmentByte,
641  addressSpace.value());
642 }
643 
645  gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
646  ConversionPatternRewriter &rewriter) const {
647  Location loc = op.getLoc();
648  MemRefType memrefType = op.getResultMemref().getType();
649  Type elementType = typeConverter->convertType(memrefType.getElementType());
650 
651  // Step 1: Generate a memref<0xi8> type
652  MemRefLayoutAttrInterface layout = {};
653  auto memrefType0sz =
654  MemRefType::get({0}, elementType, layout, memrefType.getMemorySpace());
655 
656  // Step 2: Generate a global symbol or existing for the dynamic shared
657  // memory with memref<0xi8> type
658  LLVM::LLVMFuncOp funcOp = op->getParentOfType<LLVM::LLVMFuncOp>();
659  LLVM::GlobalOp shmemOp = {};
660  Operation *moduleOp = funcOp->getParentWithTrait<OpTrait::SymbolTable>();
662  rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit);
663 
664  // Step 3. Get address of the global symbol
665  OpBuilder::InsertionGuard guard(rewriter);
666  rewriter.setInsertionPoint(op);
667  auto basePtr = rewriter.create<LLVM::AddressOfOp>(loc, shmemOp);
668  Type baseType = basePtr->getResultTypes().front();
669 
670  // Step 4. Generate GEP using offsets
671  SmallVector<LLVM::GEPArg> gepArgs = {0};
672  Value shmemPtr = rewriter.create<LLVM::GEPOp>(loc, baseType, elementType,
673  basePtr, gepArgs);
674  // Step 5. Create a memref descriptor
675  SmallVector<Value> shape, strides;
676  Value sizeBytes;
677  getMemRefDescriptorSizes(loc, memrefType0sz, {}, rewriter, shape, strides,
678  sizeBytes);
679  auto memRefDescriptor = this->createMemRefDescriptor(
680  loc, memrefType0sz, shmemPtr, shmemPtr, shape, strides, rewriter);
681 
682  // Step 5. Replace the op with memref descriptor
683  rewriter.replaceOp(op, {memRefDescriptor});
684  return success();
685 }
686 
688  TypeConverter &typeConverter, const MemorySpaceMapping &mapping) {
689  typeConverter.addTypeAttributeConversion(
690  [mapping](BaseMemRefType type, gpu::AddressSpaceAttr memorySpaceAttr) {
691  gpu::AddressSpace memorySpace = memorySpaceAttr.getValue();
692  unsigned addressSpace = mapping(memorySpace);
693  return wrapNumericMemorySpace(memorySpaceAttr.getContext(),
694  addressSpace);
695  });
696 }
static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc, ConversionPatternRewriter &rewriter, StringRef name, LLVM::LLVMFunctionType type)
LLVM::GlobalOp getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter, Operation *moduleOp, gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter, MemRefType memrefType, unsigned alignmentBit)
Generates a symbol with 0-sized array type for dynamic shared memory usage, or uses existing symbol.
static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space)
static SmallString< 16 > getUniqueFormatGlobalName(gpu::GPUModuleOp moduleOp)
static std::string diag(const llvm::Value &value)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Attributes are known-constant values of operations.
Definition: Attributes.h:25
This class provides a shared interface for ranked and unranked memref types.
Definition: BuiltinTypes.h:138
This class represents an argument of a Block.
Definition: Value.h:319
Operation & front()
Definition: Block.h:151
iterator_range< op_iterator< OpT > > getOps()
Return an iterator range over the operations within this block that are of 'OpT'.
Definition: Block.h:191
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:124
UnitAttr getUnitAttr()
Definition: Builders.cpp:114
IntegerType getI64Type()
Definition: Builders.cpp:85
IntegerType getI32Type()
Definition: Builders.cpp:83
IntegerType getIntegerType(unsigned width)
Definition: Builders.cpp:87
StringAttr getStringAttr(const Twine &bytes)
Definition: Builders.cpp:269
MLIRContext * getContext() const
Definition: Builders.h:55
IndexType getIndexType()
Definition: Builders.cpp:71
IntegerType getI8Type()
Definition: Builders.cpp:79
FloatType getF64Type()
Definition: Builders.cpp:65
This class implements a pattern rewriter for use with ConversionPatterns.
void replaceOp(Operation *op, ValueRange newValues) override
PatternRewriter hook for replacing an operation.
FailureOr< Block * > convertRegionTypes(Region *region, const TypeConverter &converter, TypeConverter::SignatureConversion *entryConversion=nullptr)
Apply a signature conversion to each block in the given region.
void eraseOp(Operation *op) override
PatternRewriter hook for erasing a dead operation.
void replaceUsesOfBlockArgument(BlockArgument from, Value to)
Replace all the uses of the block argument from with value to.
const TypeConverter * typeConverter
An optional type converter for use by this pattern.
MemRefDescriptor createMemRefDescriptor(Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr, ArrayRef< Value > sizes, ArrayRef< Value > strides, ConversionPatternRewriter &rewriter) const
Creates and populates a canonical memref descriptor struct.
Definition: Pattern.cpp:218
void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, ValueRange dynamicSizes, ConversionPatternRewriter &rewriter, SmallVectorImpl< Value > &sizes, SmallVectorImpl< Value > &strides, Value &size, bool sizeInBytes=true) const
Computes sizes, strides and buffer size of `memRefType` with identity layout.
Definition: Pattern.cpp:114
const LLVMTypeConverter * getTypeConverter() const
Definition: Pattern.cpp:27
This class contains all of the information necessary to report a diagnostic to the DiagnosticEngine.
Definition: Diagnostics.h:156
This class provides support for representing a failure result, or a valid value of type T.
Definition: LogicalResult.h:78
Conversion from types to the LLVM IR dialect.
Definition: TypeConverter.h:34
Type convertFunctionSignature(FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, SignatureConversion &result) const
Convert a function type.
LogicalResult convertType(Type t, SmallVectorImpl< Type > &results) const
Convert the given type.
FailureOr< unsigned > getMemRefAddressSpace(BaseMemRefType type) const
Return the LLVM address space corresponding to the memory space of the memref type `type`, or failure if the memory space cannot be converted to an integer.
static LLVMStructType getLiteral(MLIRContext *context, ArrayRef< Type > types, bool isPacked=false)
Gets or creates a literal struct with the given body in the provided context.
Definition: LLVMTypes.cpp:453
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.
Definition: Location.h:63
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
static MemRefDescriptor fromStaticShape(OpBuilder &builder, Location loc, const LLVMTypeConverter &typeConverter, MemRefType type, Value memory)
Builds IR creating a MemRef descriptor that represents `type` and populates it with static shape and stride information extracted from the type.
NamedAttrList is an array of NamedAttributes that tracks whether it is sorted and does some basic work to remain sorted.
Attribute erase(StringAttr name)
Erase the attribute with the given name from the list.
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:350
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:433
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:400
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:464
A trait used to provide symbol table functionalities to a region operation.
Definition: SymbolTable.h:435
StringAttr getIdentifier() const
Return the name of this operation as a StringAttr.
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
unsigned getNumSuccessors()
Definition: Operation.h:702
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
unsigned getNumRegions()
Returns the number of regions held by this operation.
Definition: Operation.h:669
Location getLoc()
The source location the operation was defined or derived from.
Definition: Operation.h:223
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition: Operation.h:507
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers that may be listening.
Definition: Operation.cpp:268
OpTy getParentOfType()
Return the closest surrounding parent operation that is of type 'OpTy'.
Definition: Operation.h:238
Region & getRegion(unsigned index)
Returns the region held by this operation at position 'index'.
Definition: Operation.h:682
Operation * getParentWithTrait()
Returns the closest surrounding parent operation with trait Trait.
Definition: Operation.h:248
OperationName getName()
The name of an operation is the key identifier for it.
Definition: Operation.h:119
unsigned getNumResults()
Return the number of results held by this operation.
Definition: Operation.h:399
Block & front()
Definition: Region.h:65
std::enable_if_t<!std::is_convertible< CallbackT, Twine >::value, LogicalResult > notifyMatchFailure(Location loc, CallbackT &&reasonCallback)
Used to notify the listener that the IR failed to be rewritten because of a match failure, and provide a callback to populate a diagnostic with the reason why the failure occurred.
Definition: PatternMatch.h:718
void inlineRegionBefore(Region &region, Region &parent, Region::iterator before)
Move the blocks that belong to "region" before the given position in another region "parent".
static StringRef getSymbolAttrName()
Return the name of the attribute used for symbol names.
Definition: SymbolTable.h:76
This class provides all of the information necessary to convert a type signature.
std::optional< InputMapping > getInputMapping(unsigned input) const
Get the input mapping for the given argument.
void remapInput(unsigned origInputNo, Value replacement)
Remap an input of the original signature to another replacement value.
Type conversion class.
LogicalResult convertType(Type t, SmallVectorImpl< Type > &results) const
Convert the given type.
void addTypeAttributeConversion(FnT &&callback)
Register a conversion function for attributes within types.
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.
Definition: Types.h:74
bool isIntOrFloat() const
Return true if this is an integer (of any signedness) or a float type.
Definition: Types.cpp:119
unsigned getIntOrFloatBitWidth() const
Return the bit width of an integer or a float type, assert failure on other types.
Definition: Types.cpp:125
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:129
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:285
LogicalResult scalarizeVectorOp(Operation *op, ValueRange operands, ConversionPatternRewriter &rewriter, const LLVMTypeConverter &converter)
Unrolls `op` if it's operating on vectors.
Include the generated interface declarations.
LogicalResult failure(bool isFailure=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:62
InFlightDiagnostic emitWarning(Location loc)
Utility method to emit a warning message using this location.
std::function< unsigned(gpu::AddressSpace)> MemorySpaceMapping
A function that maps a MemorySpace enum to a target-specific integer value.
Definition: GPUCommonPass.h:71
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
void populateGpuMemorySpaceAttributeConversions(TypeConverter &typeConverter, const MemorySpaceMapping &mapping)
Populates memory space attribute conversion rules for lowering gpu.address_space to integer values.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed; this helps unify some of the attribute construction methods.
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
LogicalResult matchAndRewrite(gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26