Browse Source

[HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

Differential Revision: https://reviews.llvm.org/D48287


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@337639 91177308-0d34-0410-b5e6-96231b3b80d8
Yaxun Liu 7 years ago
parent
commit
e72aad6948

+ 0 - 1
include/clang/Basic/LangOptions.def

@@ -209,7 +209,6 @@ LANGOPT(RenderScript      , 1, 0, "RenderScript")
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
-LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
 

+ 1 - 1
lib/CodeGen/CGCall.cpp

@@ -1800,7 +1800,7 @@ void CodeGenModule::ConstructDefaultFnAttrList(StringRef Name, bool HasOptnone,
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
 
 
     // Respect -fcuda-flush-denormals-to-zero.
     // Respect -fcuda-flush-denormals-to-zero.
-    if (getLangOpts().CUDADeviceFlushDenormalsToZero)
+    if (CodeGenOpts.FlushDenorm)
       FuncAttrs.addAttribute("nvptx-f32ftz", "true");
       FuncAttrs.addAttribute("nvptx-f32ftz", "true");
   }
   }
 }
 }

+ 1 - 1
lib/CodeGen/CodeGenModule.cpp

@@ -526,7 +526,7 @@ void CodeGenModule::Release() {
     // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
     // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
     // property.)
     // property.)
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
-                              LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
+                              CodeGenOpts.FlushDenorm ? 1 : 0);
   }
   }
 
 
   // Emit OpenCL specific module metadata: OpenCL/SPIR version.
   // Emit OpenCL specific module metadata: OpenCL/SPIR version.

+ 3 - 4
lib/Frontend/CompilerInvocation.cpp

@@ -690,7 +690,9 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
                         Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
                         Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
                         Args.hasArg(OPT_cl_fast_relaxed_math));
                         Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+                     (Args.hasArg(OPT_fcuda_is_device) &&
+                      Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
   Opts.CorrectlyRoundedDivSqrt =
       Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
       Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =
   Opts.UniformWGSize =
@@ -2191,9 +2193,6 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
     Opts.CUDAHostDeviceConstexpr = 0;
     Opts.CUDAHostDeviceConstexpr = 0;
 
 
-  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
-    Opts.CUDADeviceFlushDenormalsToZero = 1;
-
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
     Opts.CUDADeviceApproxTranscendentals = 1;
     Opts.CUDADeviceApproxTranscendentals = 1;
 
 

+ 15 - 0
test/CodeGenCUDA/flush-denormals.cu

@@ -5,6 +5,13 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 #include "Inputs/cuda.h"
 
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
@@ -12,11 +19,19 @@
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option
+// -fcuda-flush-denormals-to-zero.
+
 // CHECK-LABEL: define void @foo() #0
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 extern "C" __device__ void foo() {}
 
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}