MLIR 23.0.0git
SelectObjectAttr.cpp
Go to the documentation of this file.
1//===- ObjectHandler.cpp - Implements base ObjectManager attributes -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the `OffloadingLLVMTranslationAttrInterface` for the
10// `SelectObject` attribute.
11//
12//===----------------------------------------------------------------------===//
13
19
20#include "llvm/ADT/ScopeExit.h"
21#include "llvm/IR/Constants.h"
22#include "llvm/IR/IRBuilder.h"
23#include "llvm/IR/LLVMContext.h"
24#include "llvm/IR/Module.h"
25#include "llvm/Support/FormatVariadic.h"
26#include "llvm/Transforms/Utils/ModuleUtils.h"
27
28using namespace mlir;
29
namespace {
// Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
class SelectObjectAttrImpl
    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
          SelectObjectAttrImpl> {
  // Returns the object selected for embedding by the `select_object`
  // offloading handler of `op`, or a null attribute (after emitting an error
  // on `op`) when the requested object cannot be found.
  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;

public:
  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
  // global binary string which gets loaded/unloaded into a global module
  // object through a global ctor/dtor.
  LogicalResult embedBinary(Attribute attribute, Operation *operation,
                            llvm::IRBuilderBase &builder,
                            LLVM::ModuleTranslation &moduleTranslation) const;

  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
  // in a kernel launch call.
  LogicalResult launchKernel(Attribute attribute,
                             Operation *launchFuncOperation,
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;
};
} // namespace
55
56gpu::ObjectAttr
57SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
58 ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
59
60 // Obtain the index of the object to select.
61 int64_t index = -1;
62 if (Attribute target =
63 cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
64 .getTarget()) {
65 // If the target attribute is a number it is the index. Otherwise compare
66 // the attribute to every target inside the object array to find the index.
67 if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
68 index = indexAttr.getInt();
69 } else {
70 for (auto [i, attr] : llvm::enumerate(objects)) {
71 auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
72 if (obj.getTarget() == target) {
73 index = i;
74 }
75 }
76 }
77 } else {
78 // If the target attribute is null then it's selecting the first object in
79 // the object array.
80 index = 0;
81 }
82
83 if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
84 op->emitError("the requested target object couldn't be found");
85 return nullptr;
86 }
87 return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
88}
89
// Returns the identifier of the global holding the loaded module object for
// GPU module `moduleName`, i.e. "<moduleName>_module".
// NOTE(review): returning a Twine is discouraged by LLVM's Twine guidelines
// since it references its operands; callers must materialize it (e.g. .str())
// while `moduleName`'s underlying storage is still alive — confirm call sites.
static Twine getModuleIdentifier(StringRef moduleName) {
  return moduleName + "_module";
}
93
namespace llvm {
// Embeds `object` into `module` as an internal global string and emits a
// global constructor/destructor pair that loads/unloads it through the mgpu
// runtime wrappers, storing the loaded module handle in a dedicated global.
static LogicalResult embedBinaryImpl(StringRef moduleName,
                                     gpu::ObjectAttr object, Module &module) {

  // Embed the object as a global string.
  // Add null for assembly output for JIT paths that expect null-terminated
  // strings.
  bool addNull = (object.getFormat() == gpu::CompilationTarget::Assembly);
  StringRef serializedStr = object.getObject().getValue();
  Constant *serializedCst =
      ConstantDataArray::getString(module.getContext(), serializedStr, addNull);
  GlobalVariable *serializedObj =
      new GlobalVariable(module, serializedCst->getType(), true,
                         GlobalValue::LinkageTypes::InternalLinkage,
                         serializedCst, moduleName + "_binary");
  serializedObj->setAlignment(MaybeAlign(8));
  serializedObj->setUnnamedAddr(GlobalValue::UnnamedAddr::None);

  // Default JIT optimization level.
  auto optLevel = APInt::getZero(32);

  // Propagate optional object properties: an explicit ELF section for the
  // embedded binary, and an "O" JIT optimization level.
  if (DictionaryAttr objectProps = object.getProperties()) {
    if (auto section = dyn_cast_or_null<StringAttr>(
            objectProps.get(gpu::elfSectionName))) {
      serializedObj->setSection(section.getValue());
    }
    // Check if there's an optimization level embedded in the object.
    // NOTE(review): the attribute's APInt is used as-is; presumably producers
    // always encode "O" as a 32-bit integer — confirm, since a different
    // width would not match `i32Ty` below.
    if (auto optAttr = dyn_cast_or_null<IntegerAttr>(objectProps.get("O")))
      optLevel = optAttr.getValue();
  }

  IRBuilder<> builder(module.getContext());
  auto *i32Ty = builder.getInt32Ty();
  auto *i64Ty = builder.getInt64Ty();
  auto *ptrTy = builder.getPtrTy(0);
  auto *voidTy = builder.getVoidTy();

  // Embed the module as a global object.
  auto *modulePtr = new GlobalVariable(
      module, ptrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
      /*Initializer=*/ConstantPointerNull::get(ptrTy),
      getModuleIdentifier(moduleName));

  // Build the load function (registered as a global ctor below): it loads the
  // serialized binary via the runtime and stores the handle in `modulePtr`.
  auto *loadFn = Function::Create(FunctionType::get(voidTy, /*IsVarArg=*/false),
                                  GlobalValue::InternalLinkage,
                                  moduleName + "_load", module);
  loadFn->setSection(".text.startup");
  auto *loadBlock = BasicBlock::Create(module.getContext(), "entry", loadFn);
  builder.SetInsertPoint(loadBlock);
  Value *moduleObj = [&] {
    Constant *binarySize =
        ConstantInt::get(i64Ty, serializedStr.size() + (addNull ? 1 : 0));
    if (object.getFormat() == gpu::CompilationTarget::Assembly) {
      // Assembly objects are JIT-compiled at load time, so the optimization
      // level is forwarded to the runtime.
      FunctionCallee moduleLoadFn = module.getOrInsertFunction(
          "mgpuModuleLoadJIT", FunctionType::get(ptrTy,
                                                 {
                                                     ptrTy,
                                                     i32Ty,
                                                     i64Ty,
                                                 },
                                                 false));

      Constant *optValue = ConstantInt::get(i32Ty, optLevel);
      return builder.CreateCall(moduleLoadFn,
                                {serializedObj, optValue, binarySize});
    }
    FunctionCallee moduleLoadFn = module.getOrInsertFunction(
        "mgpuModuleLoad", FunctionType::get(ptrTy, {ptrTy, i64Ty}, false));
    return builder.CreateCall(moduleLoadFn, {serializedObj, binarySize});
  }();
  builder.CreateStore(moduleObj, modulePtr);
  builder.CreateRetVoid();
  appendToGlobalCtors(module, loadFn, /*Priority=*/123);

  // Build the matching unload function (registered as a global dtor) that
  // releases the module handle stored by the ctor.
  auto *unloadFn = Function::Create(
      FunctionType::get(voidTy, /*IsVarArg=*/false),
      GlobalValue::InternalLinkage, moduleName + "_unload", module);
  unloadFn->setSection(".text.startup");
  auto *unloadBlock =
      BasicBlock::Create(module.getContext(), "entry", unloadFn);
  builder.SetInsertPoint(unloadBlock);
  FunctionCallee moduleUnloadFn = module.getOrInsertFunction(
      "mgpuModuleUnload", FunctionType::get(voidTy, ptrTy, false));
  builder.CreateCall(moduleUnloadFn, builder.CreateLoad(ptrTy, modulePtr));
  builder.CreateRetVoid();
  appendToGlobalDtors(module, unloadFn, /*Priority=*/123);

  return success();
}
} // namespace llvm
184
185LogicalResult SelectObjectAttrImpl::embedBinary(
186 Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
187 LLVM::ModuleTranslation &moduleTranslation) const {
188 assert(operation && "The binary operation must be non null.");
189 if (!operation)
190 return failure();
191
192 auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
193 if (!op) {
194 operation->emitError("operation must be a GPU binary");
195 return failure();
196 }
197
198 gpu::ObjectAttr object = getSelectedObject(op);
199 if (!object)
200 return failure();
201
202 return embedBinaryImpl(op.getName(), object,
203 *moduleTranslation.getLLVMModule());
204}
205
namespace llvm {
namespace {
// Helper that emits the LLVM IR implementing a `gpu.launch_func` through the
// mgpu runtime wrapper functions.
class LaunchKernel {
public:
  LaunchKernel(Module &module, IRBuilderBase &builder,
               mlir::LLVM::ModuleTranslation &moduleTranslation);
  // Get the kernel launch callee.
  FunctionCallee getKernelLaunchFn();

  // Get the cluster kernel launch callee.
  FunctionCallee getClusterKernelLaunchFn();

  // Get the cooperative kernel launch callee.
  FunctionCallee getKernelLaunchCooperativeFn();

  // Get the module function callee.
  FunctionCallee getModuleFunctionFn();

  // Get the stream create callee.
  FunctionCallee getStreamCreateFn();

  // Get the stream destroy callee.
  FunctionCallee getStreamDestroyFn();

  // Get the stream sync callee.
  FunctionCallee getStreamSyncFn();

  // Get or create the function name global string.
  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);

  // Create the void* kernel array for passing the arguments.
  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);

  // Create the full kernel launch.
  llvm::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                         mlir::gpu::ObjectAttr object);

private:
  Module &module;         // Host LLVM module receiving the emitted IR.
  IRBuilderBase &builder; // Builder positioned at the launch site.
  mlir::LLVM::ModuleTranslation &moduleTranslation;
  // Frequently used LLVM types, cached by the constructor.
  Type *i32Ty{};
  Type *i64Ty{};
  Type *voidTy{};
  Type *intPtrTy{};
  PointerType *ptrTy{};
};
} // namespace
} // namespace llvm
255
256LogicalResult SelectObjectAttrImpl::launchKernel(
257 Attribute attribute, Operation *launchFuncOperation,
258 Operation *binaryOperation, llvm::IRBuilderBase &builder,
259 LLVM::ModuleTranslation &moduleTranslation) const {
260
261 assert(launchFuncOperation && "The launch func operation must be non null.");
262 if (!launchFuncOperation)
263 return failure();
264
265 auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
266 if (!launchFuncOp) {
267 launchFuncOperation->emitError("operation must be a GPU launch func Op.");
268 return failure();
269 }
270
271 auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
272 if (!binOp) {
273 binaryOperation->emitError("operation must be a GPU binary.");
274 return failure();
275 }
276 gpu::ObjectAttr object = getSelectedObject(binOp);
277 if (!object)
278 return failure();
279
280 return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
281 moduleTranslation)
282 .createKernelLaunch(launchFuncOp, object);
283}
284
285llvm::LaunchKernel::LaunchKernel(
286 Module &module, IRBuilderBase &builder,
287 mlir::LLVM::ModuleTranslation &moduleTranslation)
288 : module(module), builder(builder), moduleTranslation(moduleTranslation) {
289 i32Ty = builder.getInt32Ty();
290 i64Ty = builder.getInt64Ty();
291 ptrTy = builder.getPtrTy(0);
292 voidTy = builder.getVoidTy();
293 intPtrTy = builder.getIntPtrTy(module.getDataLayout());
294}
295
296llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
297 return module.getOrInsertFunction(
298 "mgpuLaunchKernel",
299 FunctionType::get(voidTy,
300 ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
301 intPtrTy, intPtrTy, intPtrTy, i32Ty,
302 ptrTy, ptrTy, ptrTy, i64Ty}),
303 false));
304}
305
306llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
307 return module.getOrInsertFunction(
308 "mgpuLaunchClusterKernel",
309 FunctionType::get(
310 voidTy,
311 ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
312 intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
313 i32Ty, ptrTy, ptrTy, ptrTy}),
314 false));
315}
316
317llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchCooperativeFn() {
318 return module.getOrInsertFunction(
319 "mgpuLaunchKernelCooperative",
320 FunctionType::get(
321 voidTy,
322 ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
323 intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
324 i32Ty, ptrTy, ptrTy, ptrTy}),
325 false));
326}
327
328llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
329 return module.getOrInsertFunction(
330 "mgpuModuleGetFunction",
331 FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, ptrTy}), false));
332}
333
334llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
335 return module.getOrInsertFunction("mgpuStreamCreate",
336 FunctionType::get(ptrTy, false));
337}
338
339llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
340 return module.getOrInsertFunction(
341 "mgpuStreamDestroy",
342 FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
343}
344
345llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
346 return module.getOrInsertFunction(
347 "mgpuStreamSynchronize",
348 FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
349}
350
351// Generates an LLVM IR dialect global that contains the name of the given
352// kernel function as a C string, and returns a pointer to its beginning.
353llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
354 StringRef kernelName) {
355 std::string globalName =
356 std::string(formatv("{0}_{1}_name", moduleName, kernelName));
357
358 if (GlobalVariable *gv = module.getGlobalVariable(globalName, true))
359 return gv;
360
361 return builder.CreateGlobalString(kernelName, globalName);
362}
363
364// Creates a struct containing all kernel parameters on the stack and returns
365// an array of type-erased pointers to the fields of the struct. The array can
366// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
367// The generated code is essentially as follows:
368//
369// %struct = alloca(sizeof(struct { Parameters... }))
370// %array = alloca(NumParameters * sizeof(void *))
371// for (i : [0, NumParameters))
372// %fieldPtr = llvm.getelementptr %struct[0, i]
373// llvm.store parameters[i], %fieldPtr
374// %elementPtr = llvm.getelementptr %array[i]
375// llvm.store %fieldPtr, %elementPtr
376// return %array
377llvm::Value *
378llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
379 SmallVector<Value *> args =
380 moduleTranslation.lookupValues(op.getKernelOperands());
381 SmallVector<Type *> structTypes(args.size(), nullptr);
382
383 for (auto [i, arg] : llvm::enumerate(args))
384 structTypes[i] = arg->getType();
385
386 Type *structTy = StructType::create(module.getContext(), structTypes);
387 Value *argStruct = builder.CreateAlloca(structTy, 0u);
388 Value *argArray = builder.CreateAlloca(
389 ptrTy, ConstantInt::get(intPtrTy, structTypes.size()));
390
391 for (auto [i, arg] : enumerate(args)) {
392 Value *structMember = builder.CreateStructGEP(structTy, argStruct, i);
393 builder.CreateStore(arg, structMember);
394 Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i);
395 builder.CreateStore(structMember, arrayMember);
396 }
397 return argArray;
398}
399
// Emits LLVM IR to launch a kernel function:
// %1 = load %global_module_object
// %2 = call @mgpuModuleGetFunction(%1, %global_kernel_name)
// %3 = call @mgpuStreamCreate()
// %4 = <see createKernelArgArray()>
// call @mgpuLaunchKernel(%2, ..., %3, %4, ...)
// call @mgpuStreamSynchronize(%3)
// call @mgpuStreamDestroy(%3)
llvm::LogicalResult
llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                       mlir::gpu::ObjectAttr object) {
  // Maps an MLIR value to its already-translated LLVM IR counterpart; every
  // operand of `op` must have been translated before this runs.
  auto llvmValue = [&](mlir::Value value) -> Value * {
    Value *v = moduleTranslation.lookupValue(value);
    assert(v && "Value has not been translated.");
    return v;
  };

  // Get grid dimensions.
  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
        *gz = llvmValue(grid.z);

  // Get block dimensions.
  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
        *bz = llvmValue(block.z);

  // Get dynamic shared memory size, defaulting to zero when absent.
  Value *dynamicMemorySize = nullptr;
  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
    dynamicMemorySize = llvmValue(dynSz);
  else
    dynamicMemorySize = ConstantInt::get(i32Ty, 0);

  // Create the argument array.
  Value *argArray = createKernelArgArray(op);

  // Load the kernel function from the module global embedded by embedBinary.
  StringRef moduleName = op.getKernelModuleName().getValue();
  // NOTE(review): storing a Twine in a local is discouraged by LLVM's Twine
  // guidelines; it is safe here only while `moduleName` remains alive.
  Twine moduleIdentifier = getModuleIdentifier(moduleName);
  Value *modulePtr = module.getGlobalVariable(moduleIdentifier.str(), true);
  if (!modulePtr)
    return op.emitError() << "Couldn't find the binary: " << moduleIdentifier;
  Value *moduleObj = builder.CreateLoad(ptrTy, modulePtr);
  Value *functionName = getOrCreateFunctionName(moduleName, op.getKernelName());
  Value *moduleFunction =
      builder.CreateCall(getModuleFunctionFn(), {moduleObj, functionName});

  // Get the stream to use for execution. If there's no async object then create
  // a stream to make a synchronous kernel launch.
  Value *stream = nullptr;
  // Sync & destroy the stream, for synchronous launches.
  // NOTE(review): this relies on `llvm::scope_exit` being directly
  // constructible (CTAD) rather than via `llvm::make_scope_exit` — confirm
  // the minimum LLVM version in use provides this.
  llvm::scope_exit destroyStream([&]() {
    builder.CreateCall(getStreamSyncFn(), {stream});
    builder.CreateCall(getStreamDestroyFn(), {stream});
  });
  if (mlir::Value asyncObject = op.getAsyncObject()) {
    // Asynchronous launch: reuse the caller's stream and leave its lifetime
    // to the caller (disarm the scope-exit cleanup).
    stream = llvmValue(asyncObject);
    destroyStream.release();
  } else {
    stream = builder.CreateCall(getStreamCreateFn(), {});
  }

  // Number of kernel operands; only forwarded on the plain launch path below.
  llvm::Constant *paramsCount =
      llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());

  // Create the launch call.
  Value *nullPtr = ConstantPointerNull::get(ptrTy);

  // Cooperative launches go through mgpuLaunchKernelCooperative, which also
  // handles an optional cluster. Cluster-only (non-cooperative) launches keep
  // their existing path through mgpuLaunchClusterKernel. Plain launches go
  // through mgpuLaunchKernel.
  if (op.getCooperative()) {
    // A zero-sized cluster signals "no cluster" on the cooperative path.
    Value *cx = ConstantInt::get(intPtrTy, 0);
    Value *cy = ConstantInt::get(intPtrTy, 0);
    Value *cz = ConstantInt::get(intPtrTy, 0);
    if (op.hasClusterSize()) {
      mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
      cx = llvmValue(cluster.x);
      cy = llvmValue(cluster.y);
      cz = llvmValue(cluster.z);
    }
    builder.CreateCall(
        getKernelLaunchCooperativeFn(),
        ArrayRef<Value *>({moduleFunction, gx, gy, gz, cx, cy, cz, bx, by, bz,
                           dynamicMemorySize, stream, argArray, nullPtr}));
  } else if (op.hasClusterSize()) {
    mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
    Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
          *cz = llvmValue(cluster.z);
    builder.CreateCall(
        getClusterKernelLaunchFn(),
        ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
                           dynamicMemorySize, stream, argArray, nullPtr}));
  } else {
    builder.CreateCall(getKernelLaunchFn(),
                       ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
                                          bz, dynamicMemorySize, stream,
                                          argArray, nullPtr, paramsCount}));
  }

  // For synchronous launches, `destroyStream` emits the stream sync/destroy
  // calls here as it goes out of scope.
  return success();
}
504
506 DialectRegistry &registry) {
507 registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
508 SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
509 });
510}
return success()
static Twine getModuleIdentifier(StringRef moduleName)
static void launchKernel(sycl::queue *queue, sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, void **params, size_t paramsCount)
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
bool addExtension(TypeID extensionID, std::unique_ptr< DialectExtensionBase > extension)
Add the given extension to the registry.
SmallVector< llvm::Value * > lookupValues(ValueRange values)
Looks up remapped a list of remapped values.
llvm::Value * lookupValue(Value value) const
Finds an LLVM IR value corresponding to the given MLIR value.
llvm::Module * getLLVMModule()
Returns the LLVM module in which the IR is being constructed.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
static LogicalResult embedBinaryImpl(StringRef moduleName, gpu::ObjectAttr object, Module &module)
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition Matchers.h:344
constexpr StringLiteral elfSectionName
void registerOffloadingLLVMTranslationInterfaceExternalModels(mlir::DialectRegistry &registry)
Registers the offloading LLVM translation interfaces for gpu.select_object.
Include the generated interface declarations.
@ Constant
Constant integer.
Definition AffineExpr.h:57