SelectObjectAttr.cpp
//===- SelectObjectAttr.cpp - Implements the SelectObject attribute ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the `OffloadingLLVMTranslationAttrInterface` for the
// `SelectObject` attribute.
//
//===----------------------------------------------------------------------===//

#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

namespace {
// Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
class SelectObjectAttrImpl
    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
          SelectObjectAttrImpl> {
public:
  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
  // a global binary string.
  LogicalResult embedBinary(Attribute attribute, Operation *operation,
                            llvm::IRBuilderBase &builder,
                            LLVM::ModuleTranslation &moduleTranslation) const;

  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
  // in a kernel launch call.
  LogicalResult launchKernel(Attribute attribute,
                             Operation *launchFuncOperation,
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;

  // Returns the selected object for embedding.
  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
};

// Returns an identifier for the global string holding the binary.
std::string getBinaryIdentifier(StringRef binaryName) {
  return binaryName.str() + "_bin_cst";
}
} // namespace
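
// Note (editorial): for a `gpu.binary` op named `@kernels`, the helper above
// yields the global name "kernels_bin_cst". `createKernelLaunch` below looks
// the embedded binary up by recomputing the same identifier from the kernel
// module name, so the two must stay in sync.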

void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
    DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
  });
}
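
// Usage sketch (not part of this file): a host pipeline that translates to
// LLVM IR typically ensures the registration above has run before calling
// `translateModuleToLLVMIR`. A minimal setup, assuming the usual registration
// helpers from `InitAllDialects.h` and `Target/LLVMIR/Dialect/All.h`:
//
//   mlir::DialectRegistry registry;
//   mlir::registerAllDialects(registry);
//   mlir::registerAllToLLVMIRTranslations(registry);
//   mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
//   mlir::MLIRContext context(registry);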

gpu::ObjectAttr
SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
  ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();

  // Obtain the index of the object to select.
  int64_t index = -1;
  if (Attribute target =
          cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
              .getTarget()) {
    // If the target attribute is a number it is the index. Otherwise compare
    // the attribute to every target inside the object array to find the index.
    if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
      index = indexAttr.getInt();
    } else {
      for (auto [i, attr] : llvm::enumerate(objects)) {
        auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
        if (obj.getTarget() == target) {
          index = i;
        }
      }
    }
  } else {
    // If the target attribute is null then it's selecting the first object in
    // the object array.
    index = 0;
  }

  if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
    op->emitError("the requested target object couldn't be found");
    return nullptr;
  }
  return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
}
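
// Example (illustrative MLIR, assuming NVVM targets): the selection above picks
// one `#gpu.object` out of a `gpu.binary`, either by index or by matching the
// target attribute:
//
//   gpu.binary @kernels <#gpu.select_object<1>> [
//     #gpu.object<#nvvm.target<chip = "sm_70">, "BLOB">,
//     #gpu.object<#nvvm.target<chip = "sm_90">, "BLOB">
//   ]
//
// Here index 1 selects the sm_90 object; with a null handler target, the first
// object in the array is chosen.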

LogicalResult SelectObjectAttrImpl::embedBinary(
    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {
  assert(operation && "The binary operation must be non null.");
  if (!operation)
    return failure();

  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
  if (!op) {
    operation->emitError("operation must be a GPU binary");
    return failure();
  }

  gpu::ObjectAttr object = getSelectedObject(op);
  if (!object)
    return failure();

  llvm::Module *module = moduleTranslation.getLLVMModule();

  // Embed the object as a global string.
  llvm::Constant *binary = llvm::ConstantDataArray::getString(
      builder.getContext(), object.getObject().getValue(), false);
  llvm::GlobalVariable *serializedObj =
      new llvm::GlobalVariable(*module, binary->getType(), true,
                               llvm::GlobalValue::LinkageTypes::InternalLinkage,
                               binary, getBinaryIdentifier(op.getName()));

  if (object.getProperties()) {
    if (auto section = mlir::dyn_cast_or_null<mlir::StringAttr>(
            object.getProperties().get("section"))) {
      serializedObj->setSection(section.getValue());
    }
  }
  serializedObj->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
  serializedObj->setAlignment(llvm::MaybeAlign(8));
  serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
  return success();
}
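
// For reference, the embedding above produces host IR along these lines
// (a sketch; the name, size, and contents depend on the `gpu.binary` op and on
// an optional "section" property):
//
//   @kernels_bin_cst = internal constant [N x i8] c"...", align 8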

namespace llvm {
namespace {
class LaunchKernel {
public:
  LaunchKernel(Module &module, IRBuilderBase &builder,
               mlir::LLVM::ModuleTranslation &moduleTranslation);
  // Get the kernel launch callee.
  FunctionCallee getKernelLaunchFn();

  // Get the cluster kernel launch callee.
  FunctionCallee getClusterKernelLaunchFn();

  // Get the module function callee.
  FunctionCallee getModuleFunctionFn();

  // Get the module load callee.
  FunctionCallee getModuleLoadFn();

  // Get the module load JIT callee.
  FunctionCallee getModuleLoadJITFn();

  // Get the module unload callee.
  FunctionCallee getModuleUnloadFn();

  // Get the stream create callee.
  FunctionCallee getStreamCreateFn();

  // Get the stream destroy callee.
  FunctionCallee getStreamDestroyFn();

  // Get the stream sync callee.
  FunctionCallee getStreamSyncFn();

  // Get or create the function name global string.
  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);

  // Create the void* kernel array for passing the arguments.
  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);

  // Create the full kernel launch.
  llvm::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                         mlir::gpu::ObjectAttr object);

private:
  Module &module;
  IRBuilderBase &builder;
  mlir::LLVM::ModuleTranslation &moduleTranslation;
  Type *i32Ty{};
  Type *i64Ty{};
  Type *voidTy{};
  Type *intPtrTy{};
  PointerType *ptrTy{};
};
} // namespace
} // namespace llvm

LogicalResult SelectObjectAttrImpl::launchKernel(
    Attribute attribute, Operation *launchFuncOperation,
    Operation *binaryOperation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {

  assert(launchFuncOperation && "The launch func operation must be non null.");
  if (!launchFuncOperation)
    return failure();

  auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
  if (!launchFuncOp) {
    launchFuncOperation->emitError("operation must be a GPU launch func Op.");
    return failure();
  }

  auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
  if (!binOp) {
    binaryOperation->emitError("operation must be a GPU binary.");
    return failure();
  }
  gpu::ObjectAttr object = getSelectedObject(binOp);
  if (!object)
    return failure();

  return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
                            moduleTranslation)
      .createKernelLaunch(launchFuncOp, object);
}

llvm::LaunchKernel::LaunchKernel(
    Module &module, IRBuilderBase &builder,
    mlir::LLVM::ModuleTranslation &moduleTranslation)
    : module(module), builder(builder), moduleTranslation(moduleTranslation) {
  i32Ty = builder.getInt32Ty();
  i64Ty = builder.getInt64Ty();
  ptrTy = builder.getPtrTy(0);
  voidTy = builder.getVoidTy();
  intPtrTy = builder.getIntPtrTy(module.getDataLayout());
}

llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
  return module.getOrInsertFunction(
      "mgpuLaunchKernel",
      FunctionType::get(voidTy,
                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
                                          ptrTy, ptrTy, ptrTy, i64Ty}),
                        false));
}
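
// The `mgpuLaunchKernel` callee above corresponds to a host-side runtime
// wrapper. Reconstructed from the FunctionType built here (parameter names are
// illustrative), its C signature looks roughly like:
//
//   void mgpuLaunchKernel(void *function, intptr_t gridX, intptr_t gridY,
//                         intptr_t gridZ, intptr_t blockX, intptr_t blockY,
//                         intptr_t blockZ, int32_t dynamicSharedMemorySize,
//                         void *stream, void **kernelParams, void **extra,
//                         int64_t paramsCount);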

llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
  return module.getOrInsertFunction(
      "mgpuLaunchClusterKernel",
      FunctionType::get(
          voidTy,
          ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
                            intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
                            i32Ty, ptrTy, ptrTy, ptrTy}),
          false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
  return module.getOrInsertFunction(
      "mgpuModuleGetFunction",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
  return module.getOrInsertFunction(
      "mgpuModuleLoad",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i64Ty}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
  return module.getOrInsertFunction(
      "mgpuModuleLoadJIT",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i32Ty}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
  return module.getOrInsertFunction(
      "mgpuModuleUnload",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
  return module.getOrInsertFunction("mgpuStreamCreate",
                                    FunctionType::get(ptrTy, false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
  return module.getOrInsertFunction(
      "mgpuStreamDestroy",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
  return module.getOrInsertFunction(
      "mgpuStreamSynchronize",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

// Generates an LLVM IR global that contains the name of the given kernel
// function as a C string, and returns a pointer to its beginning.
llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
                                                         StringRef kernelName) {
  std::string globalName =
      std::string(formatv("{0}_{1}_kernel_name", moduleName, kernelName));

  if (GlobalVariable *gv = module.getGlobalVariable(globalName))
    return gv;

  return builder.CreateGlobalString(kernelName, globalName);
}
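
// For example, for module "kernels" and kernel "vec_add" (names here are
// hypothetical), the code above emits a constant global named
// "kernels_vec_add_kernel_name" holding the NUL-terminated string "vec_add"
// and returns a pointer to it.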

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
llvm::Value *
llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
  SmallVector<Value *> args =
      moduleTranslation.lookupValues(op.getKernelOperands());
  SmallVector<Type *> structTypes(args.size(), nullptr);

  for (auto [i, arg] : llvm::enumerate(args))
    structTypes[i] = arg->getType();

  Type *structTy = StructType::create(module.getContext(), structTypes);
  Value *argStruct = builder.CreateAlloca(structTy, 0u);
  Value *argArray = builder.CreateAlloca(
      ptrTy, ConstantInt::get(intPtrTy, structTypes.size()));

  for (auto [i, arg] : enumerate(args)) {
    Value *structMember = builder.CreateStructGEP(structTy, argStruct, i);
    builder.CreateStore(arg, structMember);
    Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i);
    builder.CreateStore(structMember, arrayMember);
  }
  return argArray;
}

// Emits LLVM IR to launch a kernel function:
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
llvm::LogicalResult
llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                       mlir::gpu::ObjectAttr object) {
  auto llvmValue = [&](mlir::Value value) -> Value * {
    Value *v = moduleTranslation.lookupValue(value);
    assert(v && "Value has not been translated.");
    return v;
  };

  // Get grid dimensions.
  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
        *gz = llvmValue(grid.z);

  // Get block dimensions.
  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
        *bz = llvmValue(block.z);

  // Get dynamic shared memory size.
  Value *dynamicMemorySize = nullptr;
  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
    dynamicMemorySize = llvmValue(dynSz);
  else
    dynamicMemorySize = ConstantInt::get(i32Ty, 0);

  // Create the argument array.
  Value *argArray = createKernelArgArray(op);

  // Default JIT optimization level.
  llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
  // Check if there's an optimization level embedded in the object.
  DictionaryAttr objectProps = object.getProperties();
  mlir::Attribute optAttr;
  if (objectProps && (optAttr = objectProps.get("O"))) {
    auto optLevel = dyn_cast<IntegerAttr>(optAttr);
    if (!optLevel)
      return op.emitError("the optimization level must be an integer");
    optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
  }

  // Load the kernel module.
  StringRef moduleName = op.getKernelModuleName().getValue();
  std::string binaryIdentifier = getBinaryIdentifier(moduleName);
  Value *binary = module.getGlobalVariable(binaryIdentifier, true);
  if (!binary)
    return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;

  auto binaryVar = dyn_cast<llvm::GlobalVariable>(binary);
  if (!binaryVar)
    return op.emitError() << "Binary is not a global variable: "
                          << binaryIdentifier;
  llvm::Constant *binaryInit = binaryVar->getInitializer();
  auto binaryDataSeq =
      dyn_cast_if_present<llvm::ConstantDataSequential>(binaryInit);
  if (!binaryDataSeq)
    return op.emitError() << "Couldn't find binary data array: "
                          << binaryIdentifier;
  llvm::Constant *binarySize =
      llvm::ConstantInt::get(i64Ty, binaryDataSeq->getNumElements() *
                                        binaryDataSeq->getElementByteSize());

  Value *moduleObject =
      object.getFormat() == gpu::CompilationTarget::Assembly
          ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
          : builder.CreateCall(getModuleLoadFn(), {binary, binarySize});

  // Load the kernel function.
  Value *moduleFunction = builder.CreateCall(
      getModuleFunctionFn(),
      {moduleObject,
       getOrCreateFunctionName(moduleName, op.getKernelName().getValue())});

  // Get the stream to use for execution. If there's no async object then create
  // a stream to make a synchronous kernel launch.
  Value *stream = nullptr;
  bool handleStream = false;
  if (mlir::Value asyncObject = op.getAsyncObject()) {
    stream = llvmValue(asyncObject);
  } else {
    handleStream = true;
    stream = builder.CreateCall(getStreamCreateFn(), {});
  }

  llvm::Constant *paramsCount =
      llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());

  // Create the launch call.
  Value *nullPtr = ConstantPointerNull::get(ptrTy);

  // Launch kernel with clusters if cluster size is specified.
  if (op.hasClusterSize()) {
    mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
    Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
          *cz = llvmValue(cluster.z);
    builder.CreateCall(
        getClusterKernelLaunchFn(),
        ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
                           dynamicMemorySize, stream, argArray, nullPtr}));
  } else {
    builder.CreateCall(getKernelLaunchFn(),
                       ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
                                          bz, dynamicMemorySize, stream,
                                          argArray, nullPtr, paramsCount}));
  }

  // Sync & destroy the stream, for synchronous launches.
  if (handleStream) {
    builder.CreateCall(getStreamSyncFn(), {stream});
    builder.CreateCall(getStreamDestroyFn(), {stream});
  }

  // Unload the kernel module.
  builder.CreateCall(getModuleUnloadFn(), {moduleObject});

  return success();
}
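
// End-to-end sketch (illustrative MLIR input; the kernel name, operands, and
// types are assumptions): a `gpu.launch_func` of this shape is what the
// translation above lowers into the module-load / get-function / launch /
// stream-sync call sequence.
//
//   gpu.launch_func @kernels::@vec_add
//       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
//       args(%arg0 : f32, %arg1 : memref<?xf32>)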