MLIR 22.0.0git
SelectObjectAttr.cpp
Go to the documentation of this file.
//===- SelectObjectAttr.cpp - SelectObject offloading handler attribute ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the `OffloadingLLVMTranslationAttrInterface` for the
10// `SelectObject` attribute.
11//
12//===----------------------------------------------------------------------===//
13
16
20
21#include "llvm/ADT/ScopeExit.h"
22#include "llvm/IR/Constants.h"
23#include "llvm/IR/IRBuilder.h"
24#include "llvm/IR/LLVMContext.h"
25#include "llvm/IR/Module.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Transforms/Utils/ModuleUtils.h"
28
29using namespace mlir;
30
namespace {
// Implementation of the `OffloadingLLVMTranslationAttrInterface` model for
// the `#gpu.select_object` offloading handler attribute.
class SelectObjectAttrImpl
    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
          SelectObjectAttrImpl> {
  // Returns the selected object for embedding. Emits an error on `op` and
  // returns null when no object matches the selection criteria.
  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;

public:
  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
  // global binary string which gets loaded/unloaded into a global module
  // object through a global ctor/dtor.
  LogicalResult embedBinary(Attribute attribute, Operation *operation,
                            llvm::IRBuilderBase &builder,
                            LLVM::ModuleTranslation &moduleTranslation) const;

  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
  // in a kernel launch call.
  LogicalResult launchKernel(Attribute attribute,
                             Operation *launchFuncOperation,
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;
};
} // namespace
56
57gpu::ObjectAttr
58SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
59 ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
60
61 // Obtain the index of the object to select.
62 int64_t index = -1;
63 if (Attribute target =
64 cast<gpu::SelectObjectAttr>(op.getOffloadingHandlerAttr())
65 .getTarget()) {
66 // If the target attribute is a number it is the index. Otherwise compare
67 // the attribute to every target inside the object array to find the index.
68 if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
69 index = indexAttr.getInt();
70 } else {
71 for (auto [i, attr] : llvm::enumerate(objects)) {
72 auto obj = mlir::dyn_cast<gpu::ObjectAttr>(attr);
73 if (obj.getTarget() == target) {
74 index = i;
75 }
76 }
77 }
78 } else {
79 // If the target attribute is null then it's selecting the first object in
80 // the object array.
81 index = 0;
82 }
83
84 if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
85 op->emitError("the requested target object couldn't be found");
86 return nullptr;
87 }
88 return mlir::dyn_cast<gpu::ObjectAttr>(objects[index]);
89}
90
// Returns the name of the global holding the loaded GPU module object for
// `moduleName`, i.e. `<moduleName>_module`.
// NOTE(review): LLVM's documentation advises that `Twine`s should only be used
// as transient function arguments, not stored or returned, since they may
// reference temporaries; callers here materialize the result immediately, but
// returning `std::string` would be safer — confirm before changing callers.
static Twine getModuleIdentifier(StringRef moduleName) {
  return moduleName + "_module";
}
94
namespace llvm {
// Embeds `object` into `module` and emits the load/unload machinery:
//  * `<moduleName>_binary`: internal constant holding the serialized object.
//  * `<moduleName>_module`: global pointer caching the loaded module handle.
//  * `<moduleName>_load` / `<moduleName>_unload`: global ctor/dtor functions
//    calling the `mgpuModuleLoad`/`mgpuModuleLoadJIT` and `mgpuModuleUnload`
//    runtime wrappers.
static LogicalResult embedBinaryImpl(StringRef moduleName,
                                     gpu::ObjectAttr object, Module &module) {

  // Embed the object as a global string.
  // Add null for assembly output for JIT paths that expect null-terminated
  // strings.
  bool addNull = (object.getFormat() == gpu::CompilationTarget::Assembly);
  StringRef serializedStr = object.getObject().getValue();
  Constant *serializedCst =
      ConstantDataArray::getString(module.getContext(), serializedStr, addNull);
  GlobalVariable *serializedObj =
      new GlobalVariable(module, serializedCst->getType(), true,
                         GlobalValue::LinkageTypes::InternalLinkage,
                         serializedCst, moduleName + "_binary");
  serializedObj->setAlignment(MaybeAlign(8));
  serializedObj->setUnnamedAddr(GlobalValue::UnnamedAddr::None);

  // Default JIT optimization level.
  auto optLevel = APInt::getZero(32);

  if (DictionaryAttr objectProps = object.getProperties()) {
    // Place the binary in an explicit ELF section when the object requests it.
    if (auto section = dyn_cast_or_null<StringAttr>(
            objectProps.get(gpu::elfSectionName))) {
      serializedObj->setSection(section.getValue());
    }
    // Check if there's an optimization level embedded in the object.
    if (auto optAttr = dyn_cast_or_null<IntegerAttr>(objectProps.get("O")))
      optLevel = optAttr.getValue();
  }

  IRBuilder<> builder(module.getContext());
  auto i32Ty = builder.getInt32Ty();
  auto i64Ty = builder.getInt64Ty();
  auto ptrTy = builder.getPtrTy(0);
  auto voidTy = builder.getVoidTy();

  // Embed the module as a global object.
  auto *modulePtr = new GlobalVariable(
      module, ptrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
      /*Initializer=*/ConstantPointerNull::get(ptrTy),
      getModuleIdentifier(moduleName));

  // Build the load function and register it as a global ctor.
  auto *loadFn = Function::Create(FunctionType::get(voidTy, /*IsVarArg=*/false),
                                  GlobalValue::InternalLinkage,
                                  moduleName + "_load", module);
  loadFn->setSection(".text.startup");
  auto *loadBlock = BasicBlock::Create(module.getContext(), "entry", loadFn);
  builder.SetInsertPoint(loadBlock);
  Value *moduleObj = [&] {
    if (object.getFormat() == gpu::CompilationTarget::Assembly) {
      // Assembly objects are JIT-compiled at load time with `optLevel`.
      FunctionCallee moduleLoadFn = module.getOrInsertFunction(
          "mgpuModuleLoadJIT", FunctionType::get(ptrTy, {ptrTy, i32Ty}, false));
      Constant *optValue = ConstantInt::get(i32Ty, optLevel);
      return builder.CreateCall(moduleLoadFn, {serializedObj, optValue});
    }
    // Binary objects are loaded directly, passing the buffer size (including
    // the null terminator when one was appended above).
    FunctionCallee moduleLoadFn = module.getOrInsertFunction(
        "mgpuModuleLoad", FunctionType::get(ptrTy, {ptrTy, i64Ty}, false));
    Constant *binarySize =
        ConstantInt::get(i64Ty, serializedStr.size() + (addNull ? 1 : 0));
    return builder.CreateCall(moduleLoadFn, {serializedObj, binarySize});
  }();
  builder.CreateStore(moduleObj, modulePtr);
  builder.CreateRetVoid();
  appendToGlobalCtors(module, loadFn, /*Priority=*/123);

  // Build the unload function and register it as a global dtor.
  auto *unloadFn = Function::Create(
      FunctionType::get(voidTy, /*IsVarArg=*/false),
      GlobalValue::InternalLinkage, moduleName + "_unload", module);
  unloadFn->setSection(".text.startup");
  auto *unloadBlock =
      BasicBlock::Create(module.getContext(), "entry", unloadFn);
  builder.SetInsertPoint(unloadBlock);
  FunctionCallee moduleUnloadFn = module.getOrInsertFunction(
      "mgpuModuleUnload", FunctionType::get(voidTy, ptrTy, false));
  builder.CreateCall(moduleUnloadFn, builder.CreateLoad(ptrTy, modulePtr));
  builder.CreateRetVoid();
  appendToGlobalDtors(module, unloadFn, /*Priority=*/123);

  return success();
}
} // namespace llvm
177
178LogicalResult SelectObjectAttrImpl::embedBinary(
179 Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
180 LLVM::ModuleTranslation &moduleTranslation) const {
181 assert(operation && "The binary operation must be non null.");
182 if (!operation)
183 return failure();
184
185 auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
186 if (!op) {
187 operation->emitError("operation must be a GPU binary");
188 return failure();
189 }
190
191 gpu::ObjectAttr object = getSelectedObject(op);
192 if (!object)
193 return failure();
194
195 return embedBinaryImpl(op.getName(), object,
196 *moduleTranslation.getLLVMModule());
197}
198
namespace llvm {
namespace {
// Helper that emits, into a host LLVM module, the runtime calls required to
// launch the GPU kernel referenced by a `gpu.launch_func` op.
class LaunchKernel {
public:
  LaunchKernel(Module &module, IRBuilderBase &builder,
               mlir::LLVM::ModuleTranslation &moduleTranslation);
  // Get the kernel launch callee.
  FunctionCallee getKernelLaunchFn();

  // Get the cluster kernel launch callee.
  FunctionCallee getClusterKernelLaunchFn();

  // Get the module function callee.
  FunctionCallee getModuleFunctionFn();

  // Get the stream create callee.
  FunctionCallee getStreamCreateFn();

  // Get the stream destroy callee.
  FunctionCallee getStreamDestroyFn();

  // Get the stream sync callee.
  FunctionCallee getStreamSyncFn();

  // Get or create the function name global string.
  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);

  // Create the void* kernel array for passing the arguments.
  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);

  // Create the full kernel launch.
  llvm::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
                                         mlir::gpu::ObjectAttr object);

private:
  Module &module;
  IRBuilderBase &builder;
  mlir::LLVM::ModuleTranslation &moduleTranslation;
  // Cached LLVM types, initialized by the constructor.
  Type *i32Ty{};
  Type *i64Ty{};
  Type *voidTy{};
  Type *intPtrTy{};
  PointerType *ptrTy{};
};
} // namespace
} // namespace llvm
245
246LogicalResult SelectObjectAttrImpl::launchKernel(
247 Attribute attribute, Operation *launchFuncOperation,
248 Operation *binaryOperation, llvm::IRBuilderBase &builder,
249 LLVM::ModuleTranslation &moduleTranslation) const {
250
251 assert(launchFuncOperation && "The launch func operation must be non null.");
252 if (!launchFuncOperation)
253 return failure();
254
255 auto launchFuncOp = mlir::dyn_cast<gpu::LaunchFuncOp>(launchFuncOperation);
256 if (!launchFuncOp) {
257 launchFuncOperation->emitError("operation must be a GPU launch func Op.");
258 return failure();
259 }
260
261 auto binOp = mlir::dyn_cast<gpu::BinaryOp>(binaryOperation);
262 if (!binOp) {
263 binaryOperation->emitError("operation must be a GPU binary.");
264 return failure();
265 }
266 gpu::ObjectAttr object = getSelectedObject(binOp);
267 if (!object)
268 return failure();
269
270 return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
271 moduleTranslation)
272 .createKernelLaunch(launchFuncOp, object);
273}
274
275llvm::LaunchKernel::LaunchKernel(
276 Module &module, IRBuilderBase &builder,
277 mlir::LLVM::ModuleTranslation &moduleTranslation)
278 : module(module), builder(builder), moduleTranslation(moduleTranslation) {
279 i32Ty = builder.getInt32Ty();
280 i64Ty = builder.getInt64Ty();
281 ptrTy = builder.getPtrTy(0);
282 voidTy = builder.getVoidTy();
283 intPtrTy = builder.getIntPtrTy(module.getDataLayout());
284}
285
286llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
287 return module.getOrInsertFunction(
288 "mgpuLaunchKernel",
289 FunctionType::get(voidTy,
290 ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
291 intPtrTy, intPtrTy, intPtrTy, i32Ty,
292 ptrTy, ptrTy, ptrTy, i64Ty}),
293 false));
294}
295
296llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
297 return module.getOrInsertFunction(
298 "mgpuLaunchClusterKernel",
299 FunctionType::get(
300 voidTy,
301 ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
302 intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
303 i32Ty, ptrTy, ptrTy, ptrTy}),
304 false));
305}
306
307llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
308 return module.getOrInsertFunction(
309 "mgpuModuleGetFunction",
310 FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, ptrTy}), false));
311}
312
313llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() {
314 return module.getOrInsertFunction("mgpuStreamCreate",
315 FunctionType::get(ptrTy, false));
316}
317
318llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() {
319 return module.getOrInsertFunction(
320 "mgpuStreamDestroy",
321 FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
322}
323
324llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() {
325 return module.getOrInsertFunction(
326 "mgpuStreamSynchronize",
327 FunctionType::get(voidTy, ArrayRef<Type *>({ptrTy}), false));
328}
329
330// Generates an LLVM IR dialect global that contains the name of the given
331// kernel function as a C string, and returns a pointer to its beginning.
332llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName,
333 StringRef kernelName) {
334 std::string globalName =
335 std::string(formatv("{0}_{1}_name", moduleName, kernelName));
336
337 if (GlobalVariable *gv = module.getGlobalVariable(globalName, true))
338 return gv;
339
340 return builder.CreateGlobalString(kernelName, globalName);
341}
342
343// Creates a struct containing all kernel parameters on the stack and returns
344// an array of type-erased pointers to the fields of the struct. The array can
345// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
346// The generated code is essentially as follows:
347//
348// %struct = alloca(sizeof(struct { Parameters... }))
349// %array = alloca(NumParameters * sizeof(void *))
350// for (i : [0, NumParameters))
351// %fieldPtr = llvm.getelementptr %struct[0, i]
352// llvm.store parameters[i], %fieldPtr
353// %elementPtr = llvm.getelementptr %array[i]
354// llvm.store %fieldPtr, %elementPtr
355// return %array
356llvm::Value *
357llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
358 SmallVector<Value *> args =
359 moduleTranslation.lookupValues(op.getKernelOperands());
360 SmallVector<Type *> structTypes(args.size(), nullptr);
361
362 for (auto [i, arg] : llvm::enumerate(args))
363 structTypes[i] = arg->getType();
364
365 Type *structTy = StructType::create(module.getContext(), structTypes);
366 Value *argStruct = builder.CreateAlloca(structTy, 0u);
367 Value *argArray = builder.CreateAlloca(
368 ptrTy, ConstantInt::get(intPtrTy, structTypes.size()));
369
370 for (auto [i, arg] : enumerate(args)) {
371 Value *structMember = builder.CreateStructGEP(structTy, argStruct, i);
372 builder.CreateStore(arg, structMember);
373 Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i);
374 builder.CreateStore(structMember, arrayMember);
375 }
376 return argArray;
377}
378
379// Emits LLVM IR to launch a kernel function:
380// %1 = load %global_module_object
381// %2 = call @mgpuModuleGetFunction(%1, %global_kernel_name)
382// %3 = call @mgpuStreamCreate()
383// %4 = <see createKernelArgArray()>
384// call @mgpuLaunchKernel(%2, ..., %3, %4, ...)
385// call @mgpuStreamSynchronize(%3)
386// call @mgpuStreamDestroy(%3)
387llvm::LogicalResult
388llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
389 mlir::gpu::ObjectAttr object) {
390 auto llvmValue = [&](mlir::Value value) -> Value * {
391 Value *v = moduleTranslation.lookupValue(value);
392 assert(v && "Value has not been translated.");
393 return v;
394 };
395
396 // Get grid dimensions.
397 mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
398 Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
399 *gz = llvmValue(grid.z);
400
401 // Get block dimensions.
402 mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
403 Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
404 *bz = llvmValue(block.z);
405
406 // Get dynamic shared memory size.
407 Value *dynamicMemorySize = nullptr;
408 if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
409 dynamicMemorySize = llvmValue(dynSz);
410 else
411 dynamicMemorySize = ConstantInt::get(i32Ty, 0);
412
413 // Create the argument array.
414 Value *argArray = createKernelArgArray(op);
415
416 // Load the kernel function.
417 StringRef moduleName = op.getKernelModuleName().getValue();
418 Twine moduleIdentifier = getModuleIdentifier(moduleName);
419 Value *modulePtr = module.getGlobalVariable(moduleIdentifier.str(), true);
420 if (!modulePtr)
421 return op.emitError() << "Couldn't find the binary: " << moduleIdentifier;
422 Value *moduleObj = builder.CreateLoad(ptrTy, modulePtr);
423 Value *functionName = getOrCreateFunctionName(moduleName, op.getKernelName());
424 Value *moduleFunction =
425 builder.CreateCall(getModuleFunctionFn(), {moduleObj, functionName});
426
427 // Get the stream to use for execution. If there's no async object then create
428 // a stream to make a synchronous kernel launch.
429 Value *stream = nullptr;
430 // Sync & destroy the stream, for synchronous launches.
431 auto destroyStream = make_scope_exit([&]() {
432 builder.CreateCall(getStreamSyncFn(), {stream});
433 builder.CreateCall(getStreamDestroyFn(), {stream});
434 });
435 if (mlir::Value asyncObject = op.getAsyncObject()) {
436 stream = llvmValue(asyncObject);
437 destroyStream.release();
438 } else {
439 stream = builder.CreateCall(getStreamCreateFn(), {});
440 }
441
442 llvm::Constant *paramsCount =
443 llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());
444
445 // Create the launch call.
446 Value *nullPtr = ConstantPointerNull::get(ptrTy);
447
448 // Launch kernel with clusters if cluster size is specified.
449 if (op.hasClusterSize()) {
450 mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
451 Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
452 *cz = llvmValue(cluster.z);
453 builder.CreateCall(
454 getClusterKernelLaunchFn(),
455 ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
456 dynamicMemorySize, stream, argArray, nullPtr}));
457 } else {
458 builder.CreateCall(getKernelLaunchFn(),
459 ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
460 bz, dynamicMemorySize, stream,
461 argArray, nullPtr, paramsCount}));
462 }
463
464 return success();
465}
466
468 DialectRegistry &registry) {
469 registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
470 SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
471 });
472}
return success()
static Twine getModuleIdentifier(StringRef moduleName)
static void launchKernel(sycl::queue *queue, sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ, size_t blockX, size_t blockY, size_t blockZ, size_t sharedMemBytes, void **params, size_t paramsCount)
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
bool addExtension(TypeID extensionID, std::unique_ptr< DialectExtensionBase > extension)
Add the given extension to the registry.
SmallVector< llvm::Value * > lookupValues(ValueRange values)
Looks up a list of remapped values.
llvm::Value * lookupValue(Value value) const
Finds an LLVM IR value corresponding to the given MLIR value.
llvm::Module * getLLVMModule()
Returns the LLVM module in which the IR is being constructed.
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
InFlightDiagnostic emitError(const Twine &message={})
Emit an error about fatal conditions with this operation, reporting up to any diagnostic handlers tha...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
static LogicalResult embedBinaryImpl(StringRef moduleName, gpu::ObjectAttr object, Module &module)
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition Matchers.h:344
constexpr StringLiteral elfSectionName
void registerOffloadingLLVMTranslationInterfaceExternalModels(mlir::DialectRegistry &registry)
Registers the offloading LLVM translation interfaces for gpu.select_object.
Include the generated interface declarations.
@ Constant
Constant integer.
Definition AffineExpr.h:57