MLIR  20.0.0git
SelectObjectAttr.cpp
//===- SelectObjectAttr.cpp - Implements the SelectObject attribute -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the `OffloadingLLVMTranslationAttrInterface` for the
// `SelectObject` attribute.
//
//===----------------------------------------------------------------------===//

#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

namespace {
// Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
class SelectObjectAttrImpl
    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
          SelectObjectAttrImpl> {
public:
  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
  // a global binary string.
  LogicalResult embedBinary(Attribute attribute, Operation *operation,
                            llvm::IRBuilderBase &builder,
                            LLVM::ModuleTranslation &moduleTranslation) const;

  // Translates a `gpu.launch_func` to a sequence of LLVM instructions
  // resulting in a kernel launch call.
  LogicalResult launchKernel(Attribute attribute,
                             Operation *launchFuncOperation,
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;

  // Returns the selected object for embedding.
  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
};
// Returns an identifier for the global string holding the binary.
std::string getBinaryIdentifier(StringRef binaryName) {
  return binaryName.str() + "_bin_cst";
}
} // namespace
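
// For illustration (op and object names hypothetical): a binary op such as
//   gpu.binary @kernel_module [#gpu.object<#nvvm.target, "...">]
// is embedded below as a host global named `kernel_module_bin_cst`, and
// createKernelLaunch() later looks the global up under that same identifier.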

void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
    DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
  });
}
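
// Illustrative usage (a sketch, not part of this file): the registration
// function is typically invoked while assembling the dialect registry used
// for translation to LLVM IR, e.g.
//
//   DialectRegistry registry;
//   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
//   context.appendDialectRegistry(registry);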

gpu::ObjectAttr
SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
  ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();

  // Obtain the index of the object to select.
  int64_t index = -1;
  if (Attribute target =
          cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
              .getTarget()) {
    // If the target attribute is a number, it is the index. Otherwise compare
    // the attribute to every target inside the object array to find the index.
    if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
      index = indexAttr.getInt();
    } else {
      for (auto [i, attr] : llvm::enumerate(objects)) {
        auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
        if (obj.getTarget() == target) {
          index = i;
        }
      }
    }
  } else {
    // If the target attribute is null, select the first object in the object
    // array.
    index = 0;
  }

  if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
    op->emitError("the requested target object couldn't be found");
    return nullptr;
  }
  return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
}
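
// For illustration (attribute syntax is a sketch): given
//   gpu.binary @kernels <#gpu.select_object<1>>
//       [#gpu.object<#nvvm.target<chip = "sm_70">, "...">,
//        #gpu.object<#nvvm.target<chip = "sm_90">, "...">]
// the integer target `1` picks the second object directly, while a target
// attribute such as `#nvvm.target<chip = "sm_90">` is matched against each
// object's target; a null target selects the first object.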

LogicalResult SelectObjectAttrImpl::embedBinary(
    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {
  assert(operation && "The binary operation must be non null.");
  if (!operation)
    return failure();

  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
  if (!op) {
    operation->emitError("operation must be a GPU binary");
    return failure();
  }

  gpu::ObjectAttr object = getSelectedObject(op);
  if (!object)
    return failure();

  llvm::Module *module = moduleTranslation.getLLVMModule();

  // Embed the object as a global string.
  llvm::Constant *binary = llvm::ConstantDataArray::getString(
      builder.getContext(), object.getObject().getValue(), false);
  llvm::GlobalVariable *serializedObj =
      new llvm::GlobalVariable(*module, binary->getType(), true,
                               llvm::GlobalValue::LinkageTypes::InternalLinkage,
                               binary, getBinaryIdentifier(op.getName()));

  if (object.getProperties()) {
    if (auto section = mlir::dyn_cast_or_null<mlir::StringAttr>(
            object.getProperties().get(gpu::elfSectionName))) {
      serializedObj->setSection(section.getValue());
    }
  }
  serializedObj->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
  serializedObj->setAlignment(llvm::MaybeAlign(8));
  serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
  return success();
}
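
// For illustration (size and payload hypothetical): embedding the object of a
// binary named `@kernels` produces a host global roughly of the form
//   @kernels_bin_cst = internal constant [N x i8] c"...", align 8
// with its section taken from the object's `gpu::elfSectionName` property when
// one is present.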

namespace llvm {
namespace {
class LaunchKernel {
public:
  LaunchKernel(Module &module, IRBuilderBase &builder,
               mlir::LLVM::ModuleTranslation &moduleTranslation);
  // Get the kernel launch callee.
  FunctionCallee getKernelLaunchFn();

  // Get the cluster kernel launch callee.
  FunctionCallee getClusterKernelLaunchFn();

  // Get the module function callee.
  FunctionCallee getModuleFunctionFn();

  // Get the module load callee.
  FunctionCallee getModuleLoadFn();

  // Get the module load JIT callee.
  FunctionCallee getModuleLoadJITFn();

  // Get the module unload callee.
  FunctionCallee getModuleUnloadFn();

  // Get the stream create callee.
  FunctionCallee getStreamCreateFn();

  // Get the stream destroy callee.
  FunctionCallee getStreamDestroyFn();

  // Get the stream sync callee.
  FunctionCallee getStreamSyncFn();

  // Get or create the function name global string.
  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);

  // Create the void* kernel array for passing the arguments.
  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);

  // Create the full kernel launch.
  llvm::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                         mlir::gpu::ObjectAttr object);

private:
  Module &module;
  IRBuilderBase &builder;
  mlir::LLVM::ModuleTranslation &moduleTranslation;
  Type *i32Ty{};
  Type *i64Ty{};
  Type *voidTy{};
  Type *intPtrTy{};
  PointerType *ptrTy{};
};
} // namespace
} // namespace llvm

LogicalResult SelectObjectAttrImpl::launchKernel(
    Attribute attribute, Operation *launchFuncOperation,
    Operation *binaryOperation, llvm::IRBuilderBase &builder,
    LLVM::ModuleTranslation &moduleTranslation) const {

  assert(launchFuncOperation && "The launch func operation must be non null.");
  if (!launchFuncOperation)
    return failure();

  auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
  if (!launchFuncOp) {
    launchFuncOperation->emitError("operation must be a GPU launch func Op.");
    return failure();
  }

  auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
  if (!binOp) {
    binaryOperation->emitError("operation must be a GPU binary.");
    return failure();
  }
  gpu::ObjectAttr object = getSelectedObject(binOp);
  if (!object)
    return failure();

  return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
                            moduleTranslation)
      .createKernelLaunch(launchFuncOp, object);
}

llvm::LaunchKernel::LaunchKernel(
    Module &module, IRBuilderBase &builder,
    mlir::LLVM::ModuleTranslation &moduleTranslation)
    : module(module), builder(builder), moduleTranslation(moduleTranslation) {
  i32Ty = builder.getInt32Ty();
  i64Ty = builder.getInt64Ty();
  ptrTy = builder.getPtrTy(0);
  voidTy = builder.getVoidTy();
  intPtrTy = builder.getIntPtrTy(module.getDataLayout());
}

llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
  return module.getOrInsertFunction(
      "mgpuLaunchKernel",
      FunctionType::get(voidTy,
                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
                                          ptrTy, ptrTy, ptrTy, i64Ty}),
                        false));
}
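
// For reference, a sketch of the runtime symbol implied by the FunctionType
// above (the actual declaration lives in the runtime wrappers, not here):
//   void mgpuLaunchKernel(void *function, intptr_t gridX, intptr_t gridY,
//                         intptr_t gridZ, intptr_t blockX, intptr_t blockY,
//                         intptr_t blockZ, int32_t dynamicSharedMemory,
//                         void *stream, void **params, void **extra,
//                         int64_t paramsCount);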

llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
  return module.getOrInsertFunction(
      "mgpuLaunchClusterKernel",
      FunctionType::get(
          voidTy,
          ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
                            intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
                            i32Ty, ptrTy, ptrTy, ptrTy}),
          false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
  return module.getOrInsertFunction(
      "mgpuModuleGetFunction",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
  return module.getOrInsertFunction(
      "mgpuModuleLoad",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i64Ty}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
  return module.getOrInsertFunction(
      "mgpuModuleLoadJIT",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i32Ty}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() {
  return module.getOrInsertFunction(
      "mgpuModuleUnload",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
  return module.getOrInsertFunction("mgpuStreamCreate",
                                    FunctionType::get(ptrTy, false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
  return module.getOrInsertFunction(
      "mgpuStreamDestroy",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
  return module.getOrInsertFunction(
      "mgpuStreamSynchronize",
      FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
}

// Generates an LLVM IR global that contains the name of the given kernel
// function as a C string, and returns a pointer to its beginning.
llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
                                                         StringRef kernelName) {
  std::string globalName =
      std::string(formatv("{0}_{1}_kernel_name", moduleName, kernelName));

  if (GlobalVariable *gv = module.getGlobalVariable(globalName))
    return gv;

  return builder.CreateGlobalString(kernelName, globalName);
}
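
// For illustration (names hypothetical): for kernel module `kernels` and
// kernel `vecadd`, the global is named `kernels_vecadd_kernel_name` and holds
// the null-terminated string "vecadd"; repeated launches reuse the same
// global.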

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
llvm::Value *
llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
  SmallVector<Value *> args =
      moduleTranslation.lookupValues(op.getKernelOperands());
  SmallVector<Type *> structTypes(args.size(), nullptr);

  for (auto [i, arg] : llvm::enumerate(args))
    structTypes[i] = arg->getType();

  Type *structTy = StructType::create(module.getContext(), structTypes);
  Value *argStruct = builder.CreateAlloca(structTy, 0u);
  Value *argArray = builder.CreateAlloca(
      ptrTy, ConstantInt::get(intPtrTy, structTypes.size()));

  for (auto [i, arg] : enumerate(args)) {
    Value *structMember = builder.CreateStructGEP(structTy, argStruct, i);
    builder.CreateStore(arg, structMember);
    Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i);
    builder.CreateStore(structMember, arrayMember);
  }
  return argArray;
}

// Emits LLVM IR to launch a kernel function:
// %0 = <address of the embedded binary global, see embedBinary>
// %1 = call %moduleLoad(%0) (or %moduleLoadJIT for assembly objects)
// %2 = <see getOrCreateFunctionName>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see createKernelArgArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
llvm::LogicalResult
llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                       mlir::gpu::ObjectAttr object) {
  auto llvmValue = [&](mlir::Value value) -> Value * {
    Value *v = moduleTranslation.lookupValue(value);
    assert(v && "Value has not been translated.");
    return v;
  };

  // Get grid dimensions.
  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
        *gz = llvmValue(grid.z);

  // Get block dimensions.
  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
        *bz = llvmValue(block.z);

  // Get dynamic shared memory size.
  Value *dynamicMemorySize = nullptr;
  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
    dynamicMemorySize = llvmValue(dynSz);
  else
    dynamicMemorySize = ConstantInt::get(i32Ty, 0);

  // Create the argument array.
  Value *argArray = createKernelArgArray(op);

  // Default JIT optimization level.
  llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
  // Check if there's an optimization level embedded in the object.
  DictionaryAttr objectProps = object.getProperties();
  mlir::Attribute optAttr;
  if (objectProps && (optAttr = objectProps.get("O"))) {
    auto optLevel = dyn_cast<IntegerAttr>(optAttr);
    if (!optLevel)
      return op.emitError("the optimization level must be an integer");
    optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
  }

  // Load the kernel module.
  StringRef moduleName = op.getKernelModuleName().getValue();
  std::string binaryIdentifier = getBinaryIdentifier(moduleName);
  Value *binary = module.getGlobalVariable(binaryIdentifier, true);
  if (!binary)
    return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;

  auto binaryVar = dyn_cast<llvm::GlobalVariable>(binary);
  if (!binaryVar)
    return op.emitError() << "Binary is not a global variable: "
                          << binaryIdentifier;
  llvm::Constant *binaryInit = binaryVar->getInitializer();
  auto binaryDataSeq =
      dyn_cast_if_present<llvm::ConstantDataSequential>(binaryInit);
  if (!binaryDataSeq)
    return op.emitError() << "Couldn't find binary data array: "
                          << binaryIdentifier;
  llvm::Constant *binarySize =
      llvm::ConstantInt::get(i64Ty, binaryDataSeq->getNumElements() *
                                        binaryDataSeq->getElementByteSize());

  Value *moduleObject =
      object.getFormat() == gpu::CompilationTarget::Assembly
          ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
          : builder.CreateCall(getModuleLoadFn(), {binary, binarySize});

  // Load the kernel function.
  Value *moduleFunction = builder.CreateCall(
      getModuleFunctionFn(),
      {moduleObject,
       getOrCreateFunctionName(moduleName, op.getKernelName().getValue())});

  // Get the stream to use for execution. If there's no async object then
  // create a stream to make a synchronous kernel launch.
  Value *stream = nullptr;
  bool handleStream = false;
  if (mlir::Value asyncObject = op.getAsyncObject()) {
    stream = llvmValue(asyncObject);
  } else {
    handleStream = true;
    stream = builder.CreateCall(getStreamCreateFn(), {});
  }

  llvm::Constant *paramsCount =
      llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());

  // Create the launch call.
  Value *nullPtr = ConstantPointerNull::get(ptrTy);

  // Launch kernel with clusters if cluster size is specified.
  if (op.hasClusterSize()) {
    mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
    Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
          *cz = llvmValue(cluster.z);
    builder.CreateCall(
        getClusterKernelLaunchFn(),
        ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
                           dynamicMemorySize, stream, argArray, nullPtr}));
  } else {
    builder.CreateCall(getKernelLaunchFn(),
                       ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
                                          bz, dynamicMemorySize, stream,
                                          argArray, nullPtr, paramsCount}));
  }

  // Sync & destroy the stream, for synchronous launches.
  if (handleStream) {
    builder.CreateCall(getStreamSyncFn(), {stream});
    builder.CreateCall(getStreamDestroyFn(), {stream});
  }

  // Unload the kernel module.
  builder.CreateCall(getModuleUnloadFn(), {moduleObject});

  return success();
}
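
// End-to-end illustration (MLIR syntax is a sketch, names hypothetical): a
// launch such as
//   gpu.launch_func @kernels::@vecadd
//       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
//       dynamic_shared_memory_size %smem
//       args(%ptr : !llvm.ptr, %n : i32)
// lowers to the mgpuModuleLoad(JIT) / mgpuModuleGetFunction /
// mgpuLaunchKernel / mgpuStreamSynchronize / mgpuStreamDestroy /
// mgpuModuleUnload call sequence emitted above, reusing the caller's stream
// when an async object is supplied.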