6 жил өмнө · 6a79c4432e
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7143,7 +7143,7 @@ def err_kern_type_not_void_return : Error<
 
															 def err_kern_is_nonstatic_method : Error<
														
 
															   "kernel function %0 must be a free function or static member function">;
														
 
															 def err_config_scalar_return : Error<
														
 
															-  "CUDA special function 'cudaConfigureCall' must have scalar return type">;
														
 
															+  "CUDA special function '%0' must have scalar return type">;
														
 
															 def err_kern_call_not_global_function : Error<
														
 
															   "kernel call to non-global function %0">;
														
 
															 def err_global_call_not_config : Error<
														
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -10348,6 +10348,11 @@ public:
 
															   /// Copies target attributes from the template TD to the function FD.
														
 
															   void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD);
														
 
															+  /// Returns the name of the launch configuration function.  This is the name
														
 
															+  /// of the function that will be called to configure kernel call, with the
														
 
															+  /// parameters specified via <<<>>>.
														
 
															+  std::string getCudaConfigureFuncName() const;
														
 
															+
														
 
															   /// \name Code completion
														
 
															   //@{
														
 
															   /// Describes the context in which code completion occurs.
														
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
 
															 #include "CodeGenFunction.h"
														
 
															 #include "CodeGenModule.h"
														
 
															 #include "clang/AST/Decl.h"
														
 
															+#include "clang/Basic/Cuda.h"
														
 
															+#include "clang/CodeGen/CodeGenABITypes.h"
														
 
															 #include "clang/CodeGen/ConstantInitBuilder.h"
														
 
															 #include "llvm/IR/BasicBlock.h"
														
 
															 #include "llvm/IR/Constants.h"
														
@@ -102,7 +104,8 @@ private:
 
															     return DummyFunc;
														
 
															   }
														
 
															-  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
														
 
															+  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
														
 
															+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
														
 
															 public:
														
 
															   CGNVCUDARuntime(CodeGenModule &CGM);
														
@@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
 
															 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
														
 
															                                      FunctionArgList &Args) {
														
 
															   EmittedKernels.push_back(CGF.CurFn);
														
 
															-  emitDeviceStubBody(CGF, Args);
														
 
															+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
														
 
															+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
														
 
															+    emitDeviceStubBodyNew(CGF, Args);
														
 
															+  else
														
 
															+    emitDeviceStubBodyLegacy(CGF, Args);
														
 
															 }
														
 
															-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
														
 
															-                                         FunctionArgList &Args) {
														
 
															+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
														
 
															+// array and kernels are launched using cudaLaunchKernel().
														
 
															+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
														
 
															+                                            FunctionArgList &Args) {
														
 
															+  // Build the shadow stack entry at the very start of the function.
														
 
															+
														
 
															+  // Calculate amount of space we will need for all arguments.  If we have no
														
 
															+  // args, allocate a single pointer so we still have a valid pointer to the
														
 
															+  // argument array that we can pass to runtime, even if it will be unused.
														
 
															+  Address KernelArgs = CGF.CreateTempAlloca(
														
 
															+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
														
 
															+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
														
 
															+  // Store pointers to the arguments in a locally allocated launch_args.
														
 
															+  for (unsigned i = 0; i < Args.size(); ++i) {
														
 
															+    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
														
 
															+    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
														
 
															+    CGF.Builder.CreateDefaultAlignedStore(
														
 
															+        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
														
 
															+  }
														
 
															+
														
 
															+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
														
 
															+
														
 
															+  // Lookup cudaLaunchKernel function.
														
 
															+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
														
 
															+  //                              void **args, size_t sharedMem,
														
 
															+  //                              cudaStream_t stream);
														
 
															+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
														
 
															+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
														
 
															+  IdentifierInfo &cudaLaunchKernelII =
														
 
															+      CGM.getContext().Idents.get("cudaLaunchKernel");
														
 
															+  FunctionDecl *cudaLaunchKernelFD = nullptr;
														
 
															+  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
														
 
															+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
														
 
															+      cudaLaunchKernelFD = FD;
														
 
															+  }
														
 
															+
														
 
															+  if (cudaLaunchKernelFD == nullptr) {
														
 
															+    CGM.Error(CGF.CurFuncDecl->getLocation(),
														
 
															+              "Can't find declaration for cudaLaunchKernel()");
														
 
															+    return;
														
 
															+  }
														
 
															+  // Create temporary dim3 grid_dim, block_dim.
														
 
															+  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
														
 
															+  QualType Dim3Ty = GridDimParam->getType();
														
 
															+  Address GridDim =
														
 
															+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
														
 
															+  Address BlockDim =
														
 
															+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
														
 
															+  Address ShmemSize =
														
 
															+      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
														
 
															+  Address Stream =
														
 
															+      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
														
 
															+  llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
														
 
															+      llvm::FunctionType::get(IntTy,
														
 
															+                              {/*gridDim=*/GridDim.getType(),
														
 
															+                               /*blockDim=*/BlockDim.getType(),
														
 
															+                               /*ShmemSize=*/ShmemSize.getType(),
														
 
															+                               /*Stream=*/Stream.getType()},
														
 
															+                              /*isVarArg=*/false),
														
 
															+      "__cudaPopCallConfiguration");
														
 
															+
														
 
															+  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
														
 
															+                              {GridDim.getPointer(), BlockDim.getPointer(),
														
 
															+                               ShmemSize.getPointer(), Stream.getPointer()});
														
 
															+
														
 
															+  // Emit the call to cudaLaunch
														
 
															+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
														
 
															+  CallArgList LaunchKernelArgs;
														
 
															+  LaunchKernelArgs.add(RValue::get(Kernel),
														
 
															+                       cudaLaunchKernelFD->getParamDecl(0)->getType());
														
 
															+  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
														
 
															+  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
														
 
															+  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
														
 
															+                       cudaLaunchKernelFD->getParamDecl(3)->getType());
														
 
															+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
														
 
															+                       cudaLaunchKernelFD->getParamDecl(4)->getType());
														
 
															+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
														
 
															+                       cudaLaunchKernelFD->getParamDecl(5)->getType());
														
 
															+
														
 
															+  QualType QT = cudaLaunchKernelFD->getType();
														
 
															+  QualType CQT = QT.getCanonicalType();
														
 
															+  llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
														
 
															+  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
														
 
															+
														
 
															+  const CGFunctionInfo &FI =
														
 
															+      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
														
 
															+  llvm::Constant *cudaLaunchKernelFn =
														
 
															+      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
														
 
															+  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
														
 
															+               LaunchKernelArgs);
														
 
															+  CGF.EmitBranch(EndBlock);
														
 
															+
														
 
															+  CGF.EmitBlock(EndBlock);
														
 
															+}
														
 
															+
														
 
															+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
														
 
															+                                               FunctionArgList &Args) {
														
 
															   // Emit a call to cudaSetupArgument for each arg in Args.
														
 
															   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
														
 
															   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
														
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
 
															 #pragma pop_macro("__USE_FAST_MATH__")
														
 
															 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
														
 
															+// CUDA runtime uses this undocumented function to access kernel launch
														
 
															+// configuration. The declaration is in crt/device_functions.h but that file
														
 
															+// includes a lot of other stuff we don't want. Instead, we'll provide our own
														
 
															+// declaration for it here.
														
 
															+#if CUDA_VERSION >= 9020
														
 
															+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
														
 
															+                                                size_t sharedMem = 0,
														
 
															+                                                void *stream = 0);
														
 
															+#endif
														
 
															+
														
 
															 #endif // __CUDA__
														
 
															 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
														
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
 
															 #include "clang/AST/ASTContext.h"
														
 
															 #include "clang/AST/Decl.h"
														
 
															 #include "clang/AST/ExprCXX.h"
														
 
															+#include "clang/Basic/Cuda.h"
														
 
															 #include "clang/Lex/Preprocessor.h"
														
 
															 #include "clang/Sema/Lookup.h"
														
 
															 #include "clang/Sema/Sema.h"
														
@@ -41,9 +42,8 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
 
															                                          SourceLocation GGGLoc) {
														
 
															   FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
														
 
															   if (!ConfigDecl)
														
 
															-    return ExprError(
														
 
															-        Diag(LLLLoc, diag::err_undeclared_var_use)
														
 
															-        << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
														
 
															+    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
														
 
															+                     << getCudaConfigureFuncName());
														
 
															   QualType ConfigQTy = ConfigDecl->getType();
														
 
															   DeclRefExpr *ConfigDR = new (Context)
														
@@ -957,3 +957,16 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
 
															   copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
														
 
															   copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
														
 
															 }
														
 
															+
														
 
															+std::string Sema::getCudaConfigureFuncName() const {
														
 
															+  if (getLangOpts().HIP)
														
 
															+    return "hipConfigureCall";
														
 
															+
														
 
															+  // New CUDA kernel launch sequence.
														
 
															+  if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
														
 
															+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
														
 
															+    return "__cudaPushCallConfiguration";
														
 
															+
														
 
															+  // Legacy CUDA kernel configuration call
														
 
															+  return "cudaConfigureCall";
														
 
															+}
														
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -9146,13 +9146,12 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
 
															   if (getLangOpts().CUDA) {
														
 
															     IdentifierInfo *II = NewFD->getIdentifier();
														
 
															-    if (II &&
														
 
															-        II->isStr(getLangOpts().HIP ? "hipConfigureCall"
														
 
															-                                    : "cudaConfigureCall") &&
														
 
															+    if (II && II->isStr(getCudaConfigureFuncName()) &&
														
 
															         !NewFD->isInvalidDecl() &&
														
 
															         NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
														
 
															       if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
														
 
															-        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
														
 
															+        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
														
 
															+            << getCudaConfigureFuncName();
														
 
															       Context.setcudaConfigureCallDecl(NewFD);
														
 
															     }
														
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -15,13 +15,20 @@ struct dim3 {
 
															 };
														
 
															 typedef struct cudaStream *cudaStream_t;
														
 
															-
														
 
															+typedef enum cudaError {} cudaError_t;
														
 
															 #ifdef __HIP__
														
 
															 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
														
 
															                      cudaStream_t stream = 0);
														
 
															 #else
														
 
															-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
														
 
															-                      cudaStream_t stream = 0);
														
 
															+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
														
 
															+                                 size_t sharedSize = 0,
														
 
															+                                 cudaStream_t stream = 0);
														
 
															+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
														
 
															+                                           size_t sharedSize = 0,
														
 
															+                                           cudaStream_t stream = 0);
														
 
															+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
														
 
															+                                        dim3 blockDim, void **args,
														
 
															+                                        size_t sharedMem, cudaStream_t stream);
														
 
															 #endif
														
 
															 extern "C" __device__ int printf(const char*, ...);
														
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,14 +1,36 @@
 
															 // RUN: echo "GPU binary would be here" > %t
														
 
															 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															-// RUN:     -fcuda-include-gpubinary %t -o - \
														
 
															-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
														
 
															+// RUN:     -target-sdk-version=8.0 -fcuda-include-gpubinary %t -o - \
														
 
															+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
														
 
															+// RUN:       --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-OLD
														
 
															 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															-// RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
														
 
															-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
														
 
															+// RUN:     -target-sdk-version=8.0  -fcuda-include-gpubinary %t \
														
 
															+// RUN:     -o - -DNOGLOBALS \
														
 
															+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
														
 
															+// RUN:     -check-prefixes=NOGLOBALS,CUDANOGLOBALS
														
 
															 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															-// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
														
 
															-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
														
 
															-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
														
 
															+// RUN:     -target-sdk-version=8.0 -fgpu-rdc -fcuda-include-gpubinary %t \
														
 
															+// RUN:     -o - \
														
 
															+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
														
 
															+// RUN:       --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-OLD
														
 
															+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															+// RUN:     -target-sdk-version=8.0 -o - \
														
 
															+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
														
 
															+
														
 
															+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s       \
														
 
															+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
														
 
															+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
														
 
															+// RUN:       --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-NEW
														
 
															+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
														
 
															+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
														
 
															+// RUN:       --check-prefixes=NOGLOBALS,CUDANOGLOBALS
														
 
															+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															+// RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
														
 
															+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
														
 
															+// RUN:       --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA_NEW
														
 
															+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
 
															+// RUN:     -target-sdk-version=9.2 -o - \
														
 
															 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
														
 
															 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
														
@@ -103,15 +125,34 @@ void use_pointers() {
 
															 // by a call to cudaLaunch.
														
 
															 // ALL: define{{.*}}kernelfunc
														
 
															-// ALL: call{{.*}}[[PREFIX]]SetupArgument
														
 
															-// ALL: call{{.*}}[[PREFIX]]SetupArgument
														
 
															-// ALL: call{{.*}}[[PREFIX]]SetupArgument
														
 
															-// ALL: call{{.*}}[[PREFIX]]Launch
														
 
															+
														
 
															+// New launch sequence stores arguments into local buffer and passes array of
														
 
															+// pointers to them directly to cudaLaunchKernel
														
 
															+// CUDA-NEW: alloca
														
 
															+// CUDA-NEW: store
														
 
															+// CUDA-NEW: store
														
 
															+// CUDA-NEW: store
														
 
															+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
														
 
															+// CUDA-NEW: call{{.*}}cudaLaunchKernel
														
 
															+
														
 
															+// Legacy style launch sequence sets up arguments by passing them to
														
 
															+// [cuda|hip]SetupArgument.
														
 
															+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// CUDA-OLD: call{{.*}}[[PREFIX]]Launch
														
 
															+
														
 
															+// HIP: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// HIP: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// HIP: call{{.*}}[[PREFIX]]SetupArgument
														
 
															+// HIP: call{{.*}}[[PREFIX]]Launch
														
 
															 __global__ void kernelfunc(int i, int j, int k) {}
														
 
															 // Test that we've built correct kernel launch sequence.
														
 
															 // ALL: define{{.*}}hostfunc
														
 
															-// ALL: call{{.*}}[[PREFIX]]ConfigureCall
														
 
															+// CUDA-OLD: call{{.*}}[[PREFIX]]ConfigureCall
														
 
															+// CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
														
 
															+// HIP: call{{.*}}[[PREFIX]]ConfigureCall
														
 
															 // ALL: call{{.*}}kernelfunc
														
 
															 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
														
 
															 #endif
														
--- a/test/CodeGenCUDA/kernel-args-alignment.cu
+++ b/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -1,8 +1,12 @@
 
															-// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
														
 
															-// RUN:  FileCheck -check-prefix HOST -check-prefix CHECK %s
														
 
															+// New CUDA kernel launch sequence does not require explicit specification of
														
 
															+// size/offset for each argument, so only the old way is tested.
														
 
															+//
														
 
															+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm \
														
 
															+// RUN:    -target-sdk-version=8.0 -o - %s \
														
 
															+// RUN:  | FileCheck -check-prefixes=HOST-OLD,CHECK %s
														
 
															 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
														
 
															-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
														
 
															+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK %s
														
 
															 #include "Inputs/cuda.h"
														
@@ -27,9 +31,9 @@ static_assert(alignof(S) == 8, "Unexpected alignment.");
 
															 //   1. offset 0, width 1
														
 
															 //   2. offset 8 (because alignof(S) == 8), width 16
														
 
															 //   3. offset 24, width 8
														
 
															-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
														
 
															-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
														
 
															-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
														
 
															+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
														
 
															+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
														
 
															+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
														
 
															 // DEVICE-LABEL: @_Z6kernelc1SPi
														
 
															 // DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
														
--- a/test/CodeGenCUDA/kernel-call.cu
+++ b/test/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,9 @@
 
															-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
														
 
															-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
														
 
															+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
														
 
															+// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
														
 
															+// RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
														
 
															+// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
														
 
															+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
														
 
															+// RUN: | FileCheck %s --check-prefixes=HIP,CHECK
														
 
															 #include "Inputs/cuda.h"
														
@@ -7,14 +11,17 @@
 
															 // CHECK-LABEL: define{{.*}}g1
														
 
															 // HIP: call{{.*}}hipSetupArgument
														
 
															 // HIP: call{{.*}}hipLaunchByPtr
														
 
															-// CUDA: call{{.*}}cudaSetupArgument
														
 
															-// CUDA: call{{.*}}cudaLaunch
														
 
															+// CUDA-OLD: call{{.*}}cudaSetupArgument
														
 
															+// CUDA-OLD: call{{.*}}cudaLaunch
														
 
															+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
														
 
															+// CUDA-NEW: call{{.*}}cudaLaunchKernel
														
 
															 __global__ void g1(int x) {}
														
 
															 // CHECK-LABEL: define{{.*}}main
														
 
															 int main(void) {
														
 
															   // HIP: call{{.*}}hipConfigureCall
														
 
															-  // CUDA: call{{.*}}cudaConfigureCall
														
 
															+  // CUDA-OLD: call{{.*}}cudaConfigureCall
														
 
															+  // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
														
 
															   // CHECK: icmp
														
 
															   // CHECK: br
														
 
															   // CHECK: call{{.*}}g1
														
--- a/test/Driver/cuda-simple.cu
+++ b/test/Driver/cuda-simple.cu
@@ -2,7 +2,7 @@
 
															 // http://llvm.org/PR22936
														
 
															 // RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
														
 
															 //
														
 
															-// Verify that we pass -x cuda-cpp-output to compiler after 
														
 
															+// Verify that we pass -x cuda-cpp-output to compiler after
														
 
															 // preprocessing a CUDA file
														
 
															 // RUN: %clang  -Werror -### -save-temps -c %s 2>&1 | FileCheck %s
														
 
															 // CHECK: "-cc1"
														
@@ -14,7 +14,9 @@
 
															 // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output".
														
 
															 // RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s
														
 
															-int cudaConfigureCall(int, int);
														
 
															+extern "C" int cudaConfigureCall(int, int);
														
 
															+extern "C" int __cudaPushCallConfiguration(int, int);
														
 
															+
														
 
															 __attribute__((global)) void kernel() {}
														
 
															 void func() {
														
--- a/test/SemaCUDA/Inputs/cuda.h
+++ b/test/SemaCUDA/Inputs/cuda.h
@@ -18,9 +18,17 @@ struct dim3 {
 
															 };
														
 
															 typedef struct cudaStream *cudaStream_t;
														
 
															-
														
 
															-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
														
 
															-                      cudaStream_t stream = 0);
														
 
															+typedef enum cudaError {} cudaError_t;
														
 
															+
														
 
															+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
														
 
															+                                 size_t sharedSize = 0,
														
 
															+                                 cudaStream_t stream = 0);
														
 
															+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
														
 
															+                                           size_t sharedSize = 0,
														
 
															+                                           cudaStream_t stream = 0);
														
 
															+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
														
 
															+                                        dim3 blockDim, void **args,
														
 
															+                                        size_t sharedMem, cudaStream_t stream);
														
 
															 // Host- and device-side placement new overloads.
														
 
															 void *operator new(__SIZE_TYPE__, void *p) { return p; }
														
--- a/test/SemaCUDA/config-type.cu
+++ b/test/SemaCUDA/config-type.cu
@@ -1,3 +1,7 @@
 
															-// RUN: %clang_cc1 -fsyntax-only -verify %s
														
 
															+// RUN: %clang_cc1 -target-sdk-version=8.0 -fsyntax-only -verify=legacy-launch %s
														
 
															+// RUN: %clang_cc1 -target-sdk-version=9.2 -fsyntax-only -verify=new-launch %s
														
 
															-void cudaConfigureCall(unsigned gridSize, unsigned blockSize); // expected-error {{must have scalar return type}}
														
 
															+// legacy-launch-error@+1 {{must have scalar return type}}
														
 
															+void cudaConfigureCall(unsigned gridSize, unsigned blockSize);
														
 
															+// new-launch-error@+1 {{must have scalar return type}}
														
 
															+void __cudaPushCallConfiguration(unsigned gridSize, unsigned blockSize);
														
--- a/unittests/ASTMatchers/ASTMatchersTest.h
+++ b/unittests/ASTMatchers/ASTMatchersTest.h
@@ -183,7 +183,9 @@ testing::AssertionResult matchesConditionallyWithCuda(
 
															       "typedef struct cudaStream *cudaStream_t;"
														
 
															       "int cudaConfigureCall(dim3 gridSize, dim3 blockSize,"
														
 
															       "                      size_t sharedSize = 0,"
														
 
															-      "                      cudaStream_t stream = 0);";
														
 
															+      "                      cudaStream_t stream = 0);"
														
 
															+      "extern \"C\" unsigned __cudaPushCallConfiguration("
														
 
															+      "    dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0);";
														
 
															   bool Found = false, DynamicFound = false;
														
 
															   MatchFinder Finder;