MLIR  19.0.0git
SerializeToHsaco.cpp
Go to the documentation of this file.
1 //===- SerializeToHsaco.cpp - Convert GPU kernel to HSACO blob ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass that serializes a gpu module into HSAco blob and
10 // adds that blob as a string attribute of the module.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "mlir/IR/Location.h"
16 #include "mlir/IR/MLIRContext.h"
17 
18 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
20 #include "mlir/Pass/Pass.h"
24 
25 #include "llvm/IR/Constants.h"
26 #include "llvm/IR/GlobalVariable.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IRReader/IRReader.h"
29 #include "llvm/Linker/Linker.h"
30 
31 #include "llvm/MC/MCAsmBackend.h"
32 #include "llvm/MC/MCAsmInfo.h"
33 #include "llvm/MC/MCCodeEmitter.h"
34 #include "llvm/MC/MCContext.h"
35 #include "llvm/MC/MCInstrInfo.h"
36 #include "llvm/MC/MCObjectFileInfo.h"
37 #include "llvm/MC/MCObjectWriter.h"
38 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
39 #include "llvm/MC/MCRegisterInfo.h"
40 #include "llvm/MC/MCStreamer.h"
41 #include "llvm/MC/MCSubtargetInfo.h"
42 #include "llvm/MC/TargetRegistry.h"
43 
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/FileSystem.h"
46 #include "llvm/Support/FileUtilities.h"
47 #include "llvm/Support/Path.h"
48 #include "llvm/Support/Program.h"
49 #include "llvm/Support/SourceMgr.h"
50 #include "llvm/Support/TargetSelect.h"
51 #include "llvm/Support/Threading.h"
52 #include "llvm/Support/WithColor.h"
53 
54 #include "llvm/Target/TargetMachine.h"
55 #include "llvm/Target/TargetOptions.h"
56 
57 #include "llvm/Transforms/IPO/Internalize.h"
58 
59 #include <optional>
60 
61 using namespace mlir;
62 
63 namespace {
64 class SerializeToHsacoPass
65  : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
66  static llvm::once_flag initializeBackendOnce;
67 
68 public:
70 
71  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
72  int optLevel);
73  SerializeToHsacoPass(const SerializeToHsacoPass &other);
74  StringRef getArgument() const override { return "gpu-to-hsaco"; }
75  StringRef getDescription() const override {
76  return "Lower GPU kernel function to HSACO binary annotations";
77  }
78 
79 protected:
80  Option<std::string> rocmPath{*this, "rocm-path",
81  llvm::cl::desc("Path to ROCm install")};
82 
83  // Overload to allow linking in device libs
84  std::unique_ptr<llvm::Module>
85  translateToLLVMIR(llvm::LLVMContext &llvmContext) override;
86 
87 private:
88  // Loads LLVM bitcode libraries
89  std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
90  loadLibraries(SmallVectorImpl<char> &path,
91  SmallVectorImpl<StringRef> &libraries,
92  llvm::LLVMContext &context);
93 
94  // Serializes ROCDL to HSACO.
95  std::unique_ptr<std::vector<char>>
96  serializeISA(const std::string &isa) override;
97 
98  LogicalResult assembleIsa(const std::string &isa,
99  SmallVectorImpl<char> &result);
100  std::unique_ptr<std::vector<char>> createHsaco(ArrayRef<char> isaBinary);
101 
102  std::string getRocmPath();
103 };
104 } // namespace
105 
106 SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
107  : PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {}
108 
109 /// Get a user-specified path to ROCm
110 // Tries, in order, the --rocm-path option, the ROCM_PATH environment variable
111 // and a compile-time default
112 std::string SerializeToHsacoPass::getRocmPath() {
113  if (rocmPath.getNumOccurrences() > 0)
114  return rocmPath.getValue();
115 
116  return __DEFAULT_ROCM_PATH__;
117 }
118 
119 // Sets the 'option' to 'value' unless it already has a value.
120 static void maybeSetOption(Pass::Option<std::string> &option,
121  function_ref<std::string()> getValue) {
122  if (!option.hasValue())
123  option = getValue();
124 }
125 
126 llvm::once_flag SerializeToHsacoPass::initializeBackendOnce;
127 
128 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
129  StringRef features, int optLevel) {
130  // No matter how this pass is constructed, ensure that the AMDGPU backend
131  // is initialized exactly once.
132  llvm::call_once(initializeBackendOnce, []() {
133  // Initialize LLVM AMDGPU backend.
134  LLVMInitializeAMDGPUAsmParser();
135  LLVMInitializeAMDGPUAsmPrinter();
136  LLVMInitializeAMDGPUTarget();
137  LLVMInitializeAMDGPUTargetInfo();
138  LLVMInitializeAMDGPUTargetMC();
139  });
140  maybeSetOption(this->triple, [&triple] { return triple.str(); });
141  maybeSetOption(this->chip, [&arch] { return arch.str(); });
142  maybeSetOption(this->features, [&features] { return features.str(); });
143  if (this->optLevel.getNumOccurrences() == 0)
144  this->optLevel.setValue(optLevel);
145 }
146 
147 std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
148 SerializeToHsacoPass::loadLibraries(SmallVectorImpl<char> &path,
149  SmallVectorImpl<StringRef> &libraries,
150  llvm::LLVMContext &context) {
151  SmallVector<std::unique_ptr<llvm::Module>, 3> ret;
152  size_t dirLength = path.size();
153 
154  if (!llvm::sys::fs::is_directory(path)) {
155  getOperation().emitRemark() << "Bitcode path: " << path
156  << " does not exist or is not a directory\n";
157  return std::nullopt;
158  }
159 
160  for (const StringRef file : libraries) {
161  llvm::SMDiagnostic error;
162  llvm::sys::path::append(path, file);
163  llvm::StringRef pathRef(path.data(), path.size());
164  std::unique_ptr<llvm::Module> library =
165  llvm::getLazyIRFileModule(pathRef, error, context);
166  path.truncate(dirLength);
167  if (!library) {
168  getOperation().emitError() << "Failed to load library " << file
169  << " from " << path << error.getMessage();
170  return std::nullopt;
171  }
172  // Some ROCM builds don't strip this like they should
173  if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version"))
174  library->eraseNamedMetadata(openclVersion);
175  // Stop spamming us with clang version numbers
176  if (auto *ident = library->getNamedMetadata("llvm.ident"))
177  library->eraseNamedMetadata(ident);
178  ret.push_back(std::move(library));
179  }
180 
181  return std::move(ret);
182 }
183 
184 std::unique_ptr<llvm::Module>
185 SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
186  // MLIR -> LLVM translation
187  std::unique_ptr<llvm::Module> ret =
188  gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext);
189 
190  if (!ret) {
191  getOperation().emitOpError("Module lowering failed");
192  return ret;
193  }
194  // Walk the LLVM module in order to determine if we need to link in device
195  // libs
196  bool needOpenCl = false;
197  bool needOckl = false;
198  bool needOcml = false;
199  for (llvm::Function &f : ret->functions()) {
200  if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
201  StringRef funcName = f.getName();
202  if ("printf" == funcName)
203  needOpenCl = true;
204  if (funcName.starts_with("__ockl_"))
205  needOckl = true;
206  if (funcName.starts_with("__ocml_"))
207  needOcml = true;
208  }
209  }
210 
211  if (needOpenCl)
212  needOcml = needOckl = true;
213 
214  // No libraries needed (the typical case)
215  if (!(needOpenCl || needOcml || needOckl))
216  return ret;
217 
218  // Define one of the control constants the ROCm device libraries expect to be
219  // present These constants can either be defined in the module or can be
220  // imported by linking in bitcode that defines the constant. To simplify our
221  // logic, we define the constants into the module we are compiling
222  auto addControlConstant = [&module = *ret](StringRef name, uint32_t value,
223  uint32_t bitwidth) {
224  using llvm::GlobalVariable;
225  if (module.getNamedGlobal(name)) {
226  return;
227  }
228  llvm::IntegerType *type =
229  llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
230  auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false);
231  auto *constant = new GlobalVariable(
232  module, type,
233  /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage,
234  initializer, name,
235  /*before=*/nullptr,
236  /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal,
237  /*addressSpace=*/4);
238  constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
239  constant->setVisibility(
240  GlobalVariable::VisibilityTypes::ProtectedVisibility);
241  constant->setAlignment(llvm::MaybeAlign(bitwidth / 8));
242  };
243 
244  // Set up control variables in the module instead of linking in tiny bitcode
245  if (needOcml) {
246  // TODO(kdrewnia): Enable math optimizations once we have support for
247  // `-ffast-math`-like options
248  addControlConstant("__oclc_finite_only_opt", 0, 8);
249  addControlConstant("__oclc_daz_opt", 0, 8);
250  addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8);
251  addControlConstant("__oclc_unsafe_math_opt", 0, 8);
252  }
253  if (needOcml || needOckl) {
254  addControlConstant("__oclc_wavefrontsize64", 1, 8);
255  StringRef chipSet = this->chip.getValue();
256  if (chipSet.starts_with("gfx"))
257  chipSet = chipSet.substr(3);
258  uint32_t minor =
259  llvm::APInt(32, chipSet.substr(chipSet.size() - 2), 16).getZExtValue();
260  uint32_t major = llvm::APInt(32, chipSet.substr(0, chipSet.size() - 2), 10)
261  .getZExtValue();
262  uint32_t isaNumber = minor + 1000 * major;
263  addControlConstant("__oclc_ISA_version", isaNumber, 32);
264 
265  // This constant must always match the default code object ABI version
266  // of the AMDGPU backend.
267  addControlConstant("__oclc_ABI_version", 500, 32);
268  }
269 
270  // Determine libraries we need to link - order matters due to dependencies
272  if (needOpenCl)
273  libraries.push_back("opencl.bc");
274  if (needOcml)
275  libraries.push_back("ocml.bc");
276  if (needOckl)
277  libraries.push_back("ockl.bc");
278 
279  std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules;
280  std::string theRocmPath = getRocmPath();
281  llvm::SmallString<32> bitcodePath(theRocmPath);
282  llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode");
283  mbModules = loadLibraries(bitcodePath, libraries, llvmContext);
284 
285  if (!mbModules) {
286  getOperation()
287  .emitWarning("Could not load required device libraries")
288  .attachNote()
289  << "This will probably cause link-time or run-time failures";
290  return ret; // We can still abort here
291  }
292 
293  llvm::Linker linker(*ret);
294  for (std::unique_ptr<llvm::Module> &libModule : *mbModules) {
295  // This bitcode linking code is substantially similar to what is used in
296  // hip-clang It imports the library functions into the module, allowing LLVM
297  // optimization passes (which must run after linking) to optimize across the
298  // libraries and the module's code. We also only import symbols if they are
299  // referenced by the module or a previous library since there will be no
300  // other source of references to those symbols in this compilation and since
301  // we don't want to bloat the resulting code object.
302  bool err = linker.linkInModule(
303  std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded,
304  [](llvm::Module &m, const StringSet<> &gvs) {
305  llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) {
306  return !gv.hasName() || (gvs.count(gv.getName()) == 0);
307  });
308  });
309  // True is linker failure
310  if (err) {
311  getOperation().emitError(
312  "Unrecoverable failure during device library linking.");
313  // We have no guaranties about the state of `ret`, so bail
314  return nullptr;
315  }
316  }
317 
318  return ret;
319 }
320 
321 LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa,
322  SmallVectorImpl<char> &result) {
323  auto loc = getOperation().getLoc();
324 
325  llvm::raw_svector_ostream os(result);
326 
327  llvm::Triple triple(llvm::Triple::normalize(this->triple));
328  std::string error;
329  const llvm::Target *target =
330  llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
331  if (!target)
332  return emitError(loc, Twine("failed to lookup target: ") + error);
333 
334  llvm::SourceMgr srcMgr;
335  srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc());
336 
337  const llvm::MCTargetOptions mcOptions;
338  std::unique_ptr<llvm::MCRegisterInfo> mri(
339  target->createMCRegInfo(this->triple));
340  std::unique_ptr<llvm::MCAsmInfo> mai(
341  target->createMCAsmInfo(*mri, this->triple, mcOptions));
342  mai->setRelaxELFRelocations(true);
343  std::unique_ptr<llvm::MCSubtargetInfo> sti(
344  target->createMCSubtargetInfo(this->triple, this->chip, this->features));
345 
346  llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
347  &mcOptions);
348  std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
349  ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
350  ctx.setObjectFileInfo(mofi.get());
351 
352  SmallString<128> cwd;
353  if (!llvm::sys::fs::current_path(cwd))
354  ctx.setCompilationDir(cwd);
355 
356  std::unique_ptr<llvm::MCStreamer> mcStreamer;
357  std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
358 
359  llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx);
360  llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
361  mcStreamer.reset(target->createMCObjectStreamer(
362  triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
363  mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
364  *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
365  /*DWARFMustBeAtTheEnd*/ false));
366  mcStreamer->setUseAssemblerInfoForParsing(true);
367 
368  std::unique_ptr<llvm::MCAsmParser> parser(
369  createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
370  std::unique_ptr<llvm::MCTargetAsmParser> tap(
371  target->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
372 
373  if (!tap)
374  return emitError(loc, "assembler initialization error");
375 
376  parser->setTargetParser(*tap);
377  parser->Run(false);
378 
379  return success();
380 }
381 
382 std::unique_ptr<std::vector<char>>
383 SerializeToHsacoPass::createHsaco(ArrayRef<char> isaBinary) {
384  auto loc = getOperation().getLoc();
385 
386  // Save the ISA binary to a temp file.
387  int tempIsaBinaryFd = -1;
388  SmallString<128> tempIsaBinaryFilename;
389  if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd,
390  tempIsaBinaryFilename)) {
391  emitError(loc, "temporary file for ISA binary creation error");
392  return {};
393  }
394  llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
395  llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
396  tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size());
397  tempIsaBinaryOs.close();
398 
399  // Create a temp file for HSA code object.
400  SmallString<128> tempHsacoFilename;
401  if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
402  tempHsacoFilename)) {
403  emitError(loc, "temporary file for HSA code object creation error");
404  return {};
405  }
406  llvm::FileRemover cleanupHsaco(tempHsacoFilename);
407 
408  std::string theRocmPath = getRocmPath();
409  llvm::SmallString<32> lldPath(theRocmPath);
410  llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld");
411  int lldResult = llvm::sys::ExecuteAndWait(
412  lldPath,
413  {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
414  if (lldResult != 0) {
415  emitError(loc, "lld invocation error");
416  return {};
417  }
418 
419  // Load the HSA code object.
420  auto hsacoFile =
421  llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false);
422  if (!hsacoFile) {
423  emitError(loc, "read HSA code object from temp file error");
424  return {};
425  }
426 
427  StringRef buffer = (*hsacoFile)->getBuffer();
428  return std::make_unique<std::vector<char>>(buffer.begin(), buffer.end());
429 }
430 
431 std::unique_ptr<std::vector<char>>
432 SerializeToHsacoPass::serializeISA(const std::string &isa) {
433  SmallVector<char, 0> isaBinary;
434  if (failed(assembleIsa(isa, isaBinary)))
435  return {};
436  return createHsaco(isaBinary);
437 }
438 
439 // Register pass to serialize GPU kernel functions to a HSACO binary annotation.
441  PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO([] {
442  return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "", "",
443  2);
444  });
445 }
446 
447 /// Create an instance of the GPU kernel function to HSAco binary serialization
448 /// pass.
449 std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple,
450  StringRef arch,
451  StringRef features,
452  int optLevel) {
453  return std::make_unique<SerializeToHsacoPass>(triple, arch, features,
454  optLevel);
455 }
456 
457 #else // MLIR_GPU_TO_HSACO_PASS_ENABLE
459 #endif // MLIR_GPU_TO_HSACO_PASS_ENABLE
#define __DEFAULT_ROCM_PATH__
Definition: Target.cpp:53
#define MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CLASS_NAME)
Definition: TypeID.h:274
This class provides a CRTP wrapper around a base pass class to define several necessary utility metho...
Definition: Pass.h:441
Include the generated interface declarations.
void registerGpuSerializeToHsacoPass()
Register pass to serialize GPU kernel functions to a HSAco binary annotation.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
LogicalResult success(bool isSuccess=true)
Utility function to generate a LogicalResult.
Definition: LogicalResult.h:56
std::unique_ptr< Pass > createGpuSerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, int optLevel)
Create an instance of the GPU kernel function to HSAco binary serialization pass.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
bool failed(LogicalResult result)
Utility function that returns true if the provided LogicalResult corresponds to a failure value.
Definition: LogicalResult.h:72
This class represents an efficient way to signal success or failure.
Definition: LogicalResult.h:26
PassRegistration provides a global initializer that registers a Pass allocation routine for a concret...
Definition: PassRegistry.h:152
This class represents a specific pass option, with a provided data type.
Definition: Pass.h:93