MLIR  21.0.0git
SelectObjectAttr.cpp
Go to the documentation of this file.
1 //===- ObjectHandler.cpp - Implements base ObjectManager attributes -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the `OffloadingLLVMTranslationAttrInterface` for the
10 // `SelectObject` attribute.
11 //
12 //===----------------------------------------------------------------------===//
13 
16 
20 
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/LLVMContext.h"
25 #include "llvm/IR/Module.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Transforms/Utils/ModuleUtils.h"
28 
29 using namespace mlir;
30 
31 namespace {
32 // Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
33 class SelectObjectAttrImpl
34  : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
35  SelectObjectAttrImpl> {
36  // Returns the selected object for embedding.
37  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
38 
39 public:
40  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
41  // global binary string which gets loaded/unloaded into a global module
42  // object through a global ctor/dtor.
43  LogicalResult embedBinary(Attribute attribute, Operation *operation,
44  llvm::IRBuilderBase &builder,
45  LLVM::ModuleTranslation &moduleTranslation) const;
46 
47  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
48  // in a kernel launch call.
49  LogicalResult launchKernel(Attribute attribute,
50  Operation *launchFuncOperation,
51  Operation *binaryOperation,
52  llvm::IRBuilderBase &builder,
53  LLVM::ModuleTranslation &moduleTranslation) const;
54 };
55 } // namespace
56 
57 gpu::ObjectAttr
58 SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
59  ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
60 
61  // Obtain the index of the object to select.
62  int64_t index = -1;
63  if (Attribute target =
64  cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
65  .getTarget()) {
66  // If the target attribute is a number it is the index. Otherwise compare
67  // the attribute to every target inside the object array to find the index.
68  if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
69  index = indexAttr.getInt();
70  } else {
71  for (auto [i, attr] : llvm::enumerate(objects)) {
72  auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
73  if (obj.getTarget() == target) {
74  index = i;
75  }
76  }
77  }
78  } else {
79  // If the target attribute is null then it's selecting the first object in
80  // the object array.
81  index = 0;
82  }
83 
84  if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
85  op->emitError("the requested target object couldn't be found");
86  return nullptr;
87  }
88  return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
89 }
90 
91 static Twine getModuleIdentifier(StringRef moduleName) {
92  return moduleName + "_module";
93 }
94 
95 namespace llvm {
96 static LogicalResult embedBinaryImpl(StringRef moduleName,
97  gpu::ObjectAttr object, Module &module) {
98 
99  // Embed the object as a global string.
100  // Add null for assembly output for JIT paths that expect null-terminated
101  // strings.
102  bool addNull = (object.getFormat() == gpu::CompilationTarget::Assembly);
103  StringRef serializedStr = object.getObject().getValue();
104  Constant *serializedCst =
105  ConstantDataArray::getString(module.getContext(), serializedStr, addNull);
106  GlobalVariable *serializedObj =
107  new GlobalVariable(module, serializedCst->getType(), true,
108  GlobalValue::LinkageTypes::InternalLinkage,
109  serializedCst, moduleName + "_binary");
110  serializedObj->setAlignment(MaybeAlign(8));
111  serializedObj->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
112 
113  // Default JIT optimization level.
114  auto optLevel = APInt::getZero(32);
115 
116  if (DictionaryAttr objectProps = object.getProperties()) {
117  if (auto section = dyn_cast_or_null<StringAttr>(
118  objectProps.get(gpu::elfSectionName))) {
119  serializedObj->setSection(section.getValue());
120  }
121  // Check if there's an optimization level embedded in the object.
122  if (auto optAttr = dyn_cast_or_null<IntegerAttr>(objectProps.get("O")))
123  optLevel = optAttr.getValue();
124  }
125 
126  IRBuilder<> builder(module.getContext());
127  auto i32Ty = builder.getInt32Ty();
128  auto i64Ty = builder.getInt64Ty();
129  auto ptrTy = builder.getPtrTy(0);
130  auto voidTy = builder.getVoidTy();
131 
132  // Embed the module as a global object.
133  auto *modulePtr = new GlobalVariable(
134  module, ptrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
135  /*Initializer=*/ConstantPointerNull::get(ptrTy),
136  getModuleIdentifier(moduleName));
137 
138  auto *loadFn = Function::Create(FunctionType::get(voidTy, /*IsVarArg=*/false),
139  GlobalValue::InternalLinkage,
140  moduleName + "_load", module);
141  loadFn->setSection(".text.startup");
142  auto *loadBlock = BasicBlock::Create(module.getContext(), "entry", loadFn);
143  builder.SetInsertPoint(loadBlock);
144  Value *moduleObj = [&] {
145  if (object.getFormat() == gpu::CompilationTarget::Assembly) {
146  FunctionCallee moduleLoadFn = module.getOrInsertFunction(
147  "mgpuModuleLoadJIT", FunctionType::get(ptrTy, {ptrTy, i32Ty}, false));
148  Constant *optValue = ConstantInt::get(i32Ty, optLevel);
149  return builder.CreateCall(moduleLoadFn, {serializedObj, optValue});
150  } else {
151  FunctionCallee moduleLoadFn = module.getOrInsertFunction(
152  "mgpuModuleLoad", FunctionType::get(ptrTy, {ptrTy, i64Ty}, false));
153  Constant *binarySize =
154  ConstantInt::get(i64Ty, serializedStr.size() + (addNull ? 1 : 0));
155  return builder.CreateCall(moduleLoadFn, {serializedObj, binarySize});
156  }
157  }();
158  builder.CreateStore(moduleObj, modulePtr);
159  builder.CreateRetVoid();
160  appendToGlobalCtors(module, loadFn, /*Priority=*/123);
161 
162  auto *unloadFn = Function::Create(
163  FunctionType::get(voidTy, /*IsVarArg=*/false),
164  GlobalValue::InternalLinkage, moduleName + "_unload", module);
165  unloadFn->setSection(".text.startup");
166  auto *unloadBlock =
167  BasicBlock::Create(module.getContext(), "entry", unloadFn);
168  builder.SetInsertPoint(unloadBlock);
169  FunctionCallee moduleUnloadFn = module.getOrInsertFunction(
170  "mgpuModuleUnload", FunctionType::get(voidTy, ptrTy, false));
171  builder.CreateCall(moduleUnloadFn, builder.CreateLoad(ptrTy, modulePtr));
172  builder.CreateRetVoid();
173  appendToGlobalDtors(module, unloadFn, /*Priority=*/123);
174 
175  return success();
176 }
177 } // namespace llvm
178 
179 LogicalResult SelectObjectAttrImpl::embedBinary(
180  Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
181  LLVM::ModuleTranslation &moduleTranslation) const {
182  assert(operation && "The binary operation must be non null.");
183  if (!operation)
184  return failure();
185 
186  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
187  if (!op) {
188  operation->emitError("operation must be a GPU binary");
189  return failure();
190  }
191 
192  gpu::ObjectAttr object = getSelectedObject(op);
193  if (!object)
194  return failure();
195 
196  return embedBinaryImpl(op.getName(), object,
197  *moduleTranslation.getLLVMModule());
198 }
199 
200 namespace llvm {
201 namespace {
202 class LaunchKernel {
203 public:
204  LaunchKernel(Module &module, IRBuilderBase &builder,
205  mlir::LLVM::ModuleTranslation &moduleTranslation);
206  // Get the kernel launch callee.
207  FunctionCallee getKernelLaunchFn();
208 
209  // Get the kernel launch callee.
210  FunctionCallee getClusterKernelLaunchFn();
211 
212  // Get the module function callee.
213  FunctionCallee getModuleFunctionFn();
214 
215  // Get the stream create callee.
216  FunctionCallee getStreamCreateFn();
217 
218  // Get the stream destroy callee.
219  FunctionCallee getStreamDestroyFn();
220 
221  // Get the stream sync callee.
222  FunctionCallee getStreamSyncFn();
223 
224  // Ger or create the function name global string.
225  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);
226 
227  // Create the void* kernel array for passing the arguments.
228  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
229 
230  // Create the full kernel launch.
231  llvm::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
232  mlir::gpu::ObjectAttr object);
233 
234 private:
235  Module &module;
236  IRBuilderBase &builder;
237  mlir::LLVM::ModuleTranslation &moduleTranslation;
238  Type *i32Ty{};
239  Type *i64Ty{};
240  Type *voidTy{};
241  Type *intPtrTy{};
242  PointerType *ptrTy{};
243 };
244 } // namespace
245 } // namespace llvm
246 
248  Attribute attribute, Operation *launchFuncOperation,
249  Operation *binaryOperation, llvm::IRBuilderBase &builder,
250  LLVM::ModuleTranslation &moduleTranslation) const {
251 
252  assert(launchFuncOperation && "The launch func operation must be non null.");
253  if (!launchFuncOperation)
254  return failure();
255 
256  auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
257  if (!launchFuncOp) {
258  launchFuncOperation->emitError("operation must be a GPU launch func Op.");
259  return failure();
260  }
261 
262  auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
263  if (!binOp) {
264  binaryOperation->emitError("operation must be a GPU binary.");
265  return failure();
266  }
267  gpu::ObjectAttr object = getSelectedObject(binOp);
268  if (!object)
269  return failure();
270 
271  return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
272  moduleTranslation)
273  .createKernelLaunch(launchFuncOp, object);
274 }
275 
276 llvm::LaunchKernel::LaunchKernel(
277  Module &module, IRBuilderBase &builder,
278  mlir::LLVM::ModuleTranslation &moduleTranslation)
279  : module(module), builder(builder), moduleTranslation(moduleTranslation) {
280  i32Ty = builder.getInt32Ty();
281  i64Ty = builder.getInt64Ty();
282  ptrTy = builder.getPtrTy(0);
283  voidTy = builder.getVoidTy();
284  intPtrTy = builder.getIntPtrTy(module.getDataLayout());
285 }
286 
287 llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
288  return module.getOrInsertFunction(
289  "mgpuLaunchKernel",
290  FunctionType::get(voidTy,
291  ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
292  intPtrTy, intPtrTy, intPtrTy, i32Ty,
293  ptrTy, ptrTy, ptrTy, i64Ty}),
294  false));
295 }
296 
297 llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
298  return module.getOrInsertFunction(
299  "mgpuLaunchClusterKernel",
301  voidTy,
302  ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
303  intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
304  i32Ty, ptrTy, ptrTy, ptrTy}),
305  false));
306 }
307 
308 llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
309  return module.getOrInsertFunction(
310  "mgpuModuleGetFunction",
311  FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, ptrTy}), false));
312 }
313 
314 llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
315  return module.getOrInsertFunction("mgpuStreamCreate",
316  FunctionType::get(ptrTy, false));
317 }
318 
319 llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
320  return module.getOrInsertFunction(
321  "mgpuStreamDestroy",
322  FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
323 }
324 
325 llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
326  return module.getOrInsertFunction(
327  "mgpuStreamSynchronize",
328  FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
329 }
330 
331 // Generates an LLVM IR dialect global that contains the name of the given
332 // kernel function as a C string, and returns a pointer to its beginning.
333 llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
334  StringRef kernelName) {
335  std::string globalName =
336  std::string(formatv("{0}_{1}_name", moduleName, kernelName));
337 
338  if (GlobalVariable *gv = module.getGlobalVariable(globalName, true))
339  return gv;
340 
341  return builder.CreateGlobalString(kernelName, globalName);
342 }
343 
344 // Creates a struct containing all kernel parameters on the stack and returns
345 // an array of type-erased pointers to the fields of the struct. The array can
346 // then be passed to the CUDA / ROCm (HIP) kernel launch calls.
347 // The generated code is essentially as follows:
348 //
349 // %struct = alloca(sizeof(struct { Parameters... }))
350 // %array = alloca(NumParameters * sizeof(void *))
351 // for (i : [0, NumParameters))
352 // %fieldPtr = llvm.getelementptr %struct[0, i]
353 // llvm.store parameters[i], %fieldPtr
354 // %elementPtr = llvm.getelementptr %array[i]
355 // llvm.store %fieldPtr, %elementPtr
356 // return %array
357 llvm::Value *
358 llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
359  SmallVector<Value *> args =
360  moduleTranslation.lookupValues(op.getKernelOperands());
361  SmallVector<Type *> structTypes(args.size(), nullptr);
362 
363  for (auto [i, arg] : llvm::enumerate(args))
364  structTypes[i] = arg->getType();
365 
366  Type *structTy = StructType::create(module.getContext(), structTypes);
367  Value *argStruct = builder.CreateAlloca(structTy, 0u);
368  Value *argArray = builder.CreateAlloca(
369  ptrTy, ConstantInt::get(intPtrTy, structTypes.size()));
370 
371  for (auto [i, arg] : enumerate(args)) {
372  Value *structMember = builder.CreateStructGEP(structTy, argStruct, i);
373  builder.CreateStore(arg, structMember);
374  Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i);
375  builder.CreateStore(structMember, arrayMember);
376  }
377  return argArray;
378 }
379 
380 // Emits LLVM IR to launch a kernel function:
381 // %1 = load %global_module_object
382 // %2 = call @mgpuModuleGetFunction(%1, %global_kernel_name)
383 // %3 = call @mgpuStreamCreate()
384 // %4 = <see createKernelArgArray()>
385 // call @mgpuLaunchKernel(%2, ..., %3, %4, ...)
386 // call @mgpuStreamSynchronize(%3)
387 // call @mgpuStreamDestroy(%3)
388 llvm::LogicalResult
389 llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
390  mlir::gpu::ObjectAttr object) {
391  auto llvmValue = [&](mlir::Value value) -> Value * {
392  Value *v = moduleTranslation.lookupValue(value);
393  assert(v && "Value has not been translated.");
394  return v;
395  };
396 
397  // Get grid dimensions.
398  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
399  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
400  *gz = llvmValue(grid.z);
401 
402  // Get block dimensions.
403  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
404  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
405  *bz = llvmValue(block.z);
406 
407  // Get dynamic shared memory size.
408  Value *dynamicMemorySize = nullptr;
409  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
410  dynamicMemorySize = llvmValue(dynSz);
411  else
412  dynamicMemorySize = ConstantInt::get(i32Ty, 0);
413 
414  // Create the argument array.
415  Value *argArray = createKernelArgArray(op);
416 
417  // Load the kernel function.
418  StringRef moduleName = op.getKernelModuleName().getValue();
419  Twine moduleIdentifier = getModuleIdentifier(moduleName);
420  Value *modulePtr = module.getGlobalVariable(moduleIdentifier.str(), true);
421  if (!modulePtr)
422  return op.emitError() << "Couldn't find the binary: " << moduleIdentifier;
423  Value *moduleObj = builder.CreateLoad(ptrTy, modulePtr);
424  Value *functionName = getOrCreateFunctionName(moduleName, op.getKernelName());
425  Value *moduleFunction =
426  builder.CreateCall(getModuleFunctionFn(), {moduleObj, functionName});
427 
428  // Get the stream to use for execution. If there's no async object then create
429  // a stream to make a synchronous kernel launch.
430  Value *stream = nullptr;
431  // Sync & destroy the stream, for synchronous launches.
432  auto destroyStream = make_scope_exit([&]() {
433  builder.CreateCall(getStreamSyncFn(), {stream});
434  builder.CreateCall(getStreamDestroyFn(), {stream});
435  });
436  if (mlir::Value asyncObject = op.getAsyncObject()) {
437  stream = llvmValue(asyncObject);
438  destroyStream.release();
439  } else {
440  stream = builder.CreateCall(getStreamCreateFn(), {});
441  }
442 
443  llvm::Constant *paramsCount =
444  llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());
445 
446  // Create the launch call.
447  Value *nullPtr = ConstantPointerNull::get(ptrTy);
448 
449  // Launch kernel with clusters if cluster size is specified.
450  if (op.hasClusterSize()) {
451  mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
452  Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
453  *cz = llvmValue(cluster.z);
454  builder.CreateCall(
455  getClusterKernelLaunchFn(),
456  ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
457  dynamicMemorySize, stream, argArray, nullPtr}));
458  } else {
459  builder.CreateCall(getKernelLaunchFn(),
460  ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
461  bz, dynamicMemorySize, stream,
462  argArray, nullPtr, paramsCount}));
463  }
464 
465  return success();
466 }
467 
469  DialectRegistry &registry) {
470  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
471  SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
472  });
473 }
static Value getZero(OpBuilder &b, Location loc, Type elementType)
Get zero value for an element type.
@ None
static Twine getModuleIdentifier(StringRef moduleName)
static void launchKernel(sycl::queue *queue, sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, void **params, size_t paramsCount)
Attributes are known-constant values of operations.
Definition: Attributes.h:25
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
bool addExtension(TypeID extensionID, std::unique_ptr< DialectExtensionBase > extension)
Add the given extension to the registry.
Implementation class for module translation.
llvm::Value * lookupValue(Value value) const
Finds an LLVM IR value corresponding to the given MLIR value.
SmallVector< llvm::Value * > lookupValues(ValueRange values)
Looks up remapped a list of remapped values.
llvm::Module * getLLVMModule()
Returns the LLVM module in which the IR is being constructed.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
Definition: Operation.cpp:268
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
The OpAsmOpInterface, see OpAsmInterface.td for more details.
Definition: CallGraph.h:229
static LogicalResult embedBinaryImpl(StringRef moduleName, gpu::ObjectAttr object, Module &module)
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:344
void registerOffloadingLLVMTranslationInterfaceExternalModels(mlir::DialectRegistry &registry)
Registers the offloading LLVM translation interfaces for gpu.select_object.
Include the generated interface declarations.
@ Constant
Constant integer.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Utility class for the GPU dialect to represent triples of Values accessible through ....
Definition: GPUDialect.h:39