//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//
#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;
namespace {

constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel;
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    unsigned Flag;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged away and really ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }
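  // For illustration, makeConstantString("axpy") emits roughly the following
  // private global (the exact symbol name depends on GetAddrOfConstantCString):
  //   @.str = private unnamed_addr constant [5 x i8] c"axpy\00"
  // and returns an i8* constant GEP to its first element, usable wherever the
  // registration ABI expects a const char *.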
  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const Decl *ND);

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         unsigned Flags) override {
    DeviceVars.push_back({&Var, VD, Flags});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
  /// Construct and return the stub name of a kernel.
  std::string getDeviceStubName(llvm::StringRef Name) const override;
};

} // end anonymous namespace
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}
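// For example, addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" for
// CUDA and "hipLaunchKernel" for HIP, while
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__cudaRegisterFatBinary" or "__hipRegisterFatBinary" respectively.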
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}
std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
  auto *ND = cast<const NamedDecl>(D);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(ND, Out);
    DeviceSideName = Out.str();
  } else
    DeviceSideName = ND->getIdentifier()->getName();
  return DeviceSideName;
}
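// For illustration: when the device side uses the Itanium C++ ABI, a kernel
// declared as `__global__ void axpy(float a, float *x)` gets the device-side
// name "_Z4axpyfPf", while an extern "C" kernel falls through to its plain
// identifier.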
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  // Ensure that either the host and device compilations use different C++
  // ABIs (say, the host compilation follows the MSVC ABI while the device
  // compilation follows the Itanium C++ ABI), or, if they follow the same
  // ABI, that the mangled kernel name matches the stub name. The latter
  // check matters because the kernel name mangled during host compilation
  // is used to resolve the device binary to execute; inconsistent naming
  // results in undefined behavior. Although we cannot compare the names
  // produced by the host and device compilations directly, mangling for
  // both targets within the host compilation catches many mismatches.
  assert((CGF.CGM.getContext().getAuxTargetInfo() &&
          (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() !=
           CGF.CGM.getContext().getTargetInfo().getCXXABI())) ||
         getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) ==
             CGF.CurFn->getName());

  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      CGF.getLangOpts().HIPUseNewLaunchAPI)
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}
// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
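//
// For illustration, the stub emitted below for `__global__ void axpy(float a)`
// is roughly equivalent to (the names are schematic, not the exact emitted
// symbols):
// \code
// void axpy_stub(float a) {
//   void *kernel_args[1] = {&a};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   cudaStream_t stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)axpy_stub, grid_dim, block_dim, kernel_args,
//                    shmem_size, stream);
// }
// \endcode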
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to
  // the argument array that we can pass to the runtime, even if it will be
  // unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
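// Pre-CUDA-9.0 launch: each kernel argument is pushed onto the runtime's
// argument stack with cudaSetupArgument(), then cudaLaunch() fires the kernel.
// For illustration, the stub emitted below for `__global__ void axpy(float a)`
// is roughly equivalent to (schematic names):
// \code
// void axpy_stub(float a) {
//   if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//     cudaLaunch((char *)axpy_stub);
// }
// \endcode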
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    CharUnits TyWidth, TyAlign;
    std::tie(TyWidth, TyAlign) =
        CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TyAlign);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TyWidth;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle, Kernel0, ...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle, KernelM, ...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));
  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     IntTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    unsigned Flags = Info.Flag;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(Var, VoidPtrTy),
        VarName,
        VarName,
        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, VarSize),
        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, 0)};
    Builder.CreateCall(RegisterVar, Args);
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}
/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle so it can
  // be cleaned up in the destructor on exit. Then associate all known kernels
  // with the GPU binary handle so the CUDA runtime can figure out what to
  // call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }
  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);
  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";
    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
                                     FatbinConstantName, 8);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is expected
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }
    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }
  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);
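  // For illustration, with CUDA the wrapper emitted above is roughly the
  // following constant global (the fatbin symbol name is schematic):
  //   @__cuda_fatbin_wrapper = constant { i32, i32, i8*, i8* }
  //       { i32 1180844977, i32 1, i8* @fatbin, i8* null },
  //       section ".nvFatBinSegment"
  // where 1180844977 is CudaFatMagic (0x466243b1).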
  // There is only one HIP fat binary per linked module; however, there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being
    // merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant =
        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }
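  // For illustration: if the module ID came out as, say, "__nv_5f3a" (the hex
  // GUID here is hypothetical), the branch above emits the alias
  // "__fatbinwrap__nv_5f3a" and calls "__cudaRegisterLinkedBinary__nv_5f3a".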
  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}
/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module; however, there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}
std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const {
  if (!CGM.getLangOpts().HIP)
    return Name;
  return (Name + ".stub").str();
}
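// For illustration: under HIP, a kernel whose mangled name is "_Z4axpyfPf"
// gets the stub name "_Z4axpyfPf.stub", keeping the host-side stub symbol
// distinct from the device-side kernel symbol; for CUDA the name is returned
// unchanged.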
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}