فهرست منبع

[OpenCL] Use long instead of long long in x86 builtins

Summary: According to C99 standard long long is at least 64 bits in
size. However, OpenCL C defines long long as 128 bit signed
integer. This prevents one to use x86 builtins when compiling OpenCL C
code for x86 targets. The patch changes long long to long for OpenCL
only.

Patch by: Alexander Batashev <alexander.batashev@intel.com>

Reviewers: craig.topper, Ka-Ka, eandrews, erichkeane, Anastasia

Reviewed By: Ka-Ka, erichkeane, Anastasia

Subscribers: a.elovikov, yaxunl, Anastasia, cfe-commits, ivankara, etyurin, asavonic

Tags: #clang

Differential Revision: https://reviews.llvm.org/D62580

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@362391 91177308-0d34-0410-b5e6-96231b3b80d8
Andrew Savonichev 6 سال پیش
والد
کامیت
5663f71c62
5فایلهای تغییر یافته به همراه403 افزوده شده و 355 حذف شده
  1. 1 0
      include/clang/Basic/Builtins.def
  2. 288 288
      include/clang/Basic/BuiltinsX86.def
  3. 52 52
      include/clang/Basic/BuiltinsX86_64.def
  4. 15 4
      lib/AST/ASTContext.cpp
  5. 47 11
      test/CodeGen/builtins-x86.c

+ 1 - 0
include/clang/Basic/Builtins.def

@@ -53,6 +53,7 @@
 //  Z   -> int32_t (require a native 32-bit integer type on the target)
 //  W   -> int64_t (require a native 64-bit integer type on the target)
 //  N   -> 'int' size if target is LP64, 'L' otherwise.
+//  O   -> long for OpenCL targets, long long otherwise.
 //  S   -> signed
 //  U   -> unsigned
 //  I   -> Required to constant fold to an integer constant expression.

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 288 - 288
include/clang/Basic/BuiltinsX86.def


+ 52 - 52
include/clang/Basic/BuiltinsX86_64.def

@@ -43,65 +43,65 @@ TARGET_HEADER_BUILTIN(_InterlockedOr64,          "LLiLLiD*LLi", "nh", "intrin.h"
 TARGET_HEADER_BUILTIN(_InterlockedXor64,         "LLiLLiD*LLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "cx16")
 
-TARGET_BUILTIN(__builtin_ia32_readeflags_u64, "ULLi", "n", "")
-TARGET_BUILTIN(__builtin_ia32_writeeflags_u64, "vULLi", "n", "")
-TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "LLiV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvttss2si64, "LLiV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2si64, "LLiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttsd2si64, "LLiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "n", "sse2")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v2di, "LLiV2LLiIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v2di, "V2LLiV2LLiLLiIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_crc32di, "ULLiULLiULLi", "nc", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4di, "LLiV4LLiIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4di, "V4LLiV4LLiLLiIi", "ncV:256:", "avx")
+TARGET_BUILTIN(__builtin_ia32_readeflags_u64, "UOi", "n", "")
+TARGET_BUILTIN(__builtin_ia32_writeeflags_u64, "vUOi", "n", "")
+TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "OiV4f", "ncV:128:", "sse")
+TARGET_BUILTIN(__builtin_ia32_cvttss2si64, "OiV4f", "ncV:128:", "sse")
+TARGET_BUILTIN(__builtin_ia32_cvtsd2si64, "OiV2d", "ncV:128:", "sse2")
+TARGET_BUILTIN(__builtin_ia32_cvttsd2si64, "OiV2d", "ncV:128:", "sse2")
+TARGET_BUILTIN(__builtin_ia32_movnti64, "vOi*Oi", "n", "sse2")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v2di, "OiV2OiIi", "ncV:128:", "sse2")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v2di, "V2OiV2OiOiIi", "ncV:128:", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_crc32di, "UOiUOiUOi", "nc", "sse4.2")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4di, "OiV4OiIi", "ncV:256:", "avx")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4di, "V4OiV4OiOiIi", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_rdfsbase32, "Ui", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_rdfsbase64, "ULLi", "n", "fsgsbase")
+TARGET_BUILTIN(__builtin_ia32_rdfsbase64, "UOi", "n", "fsgsbase")
 TARGET_BUILTIN(__builtin_ia32_rdgsbase32, "Ui", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_rdgsbase64, "ULLi", "n", "fsgsbase")
+TARGET_BUILTIN(__builtin_ia32_rdgsbase64, "UOi", "n", "fsgsbase")
 TARGET_BUILTIN(__builtin_ia32_wrfsbase32, "vUi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrfsbase64, "vULLi", "n", "fsgsbase")
+TARGET_BUILTIN(__builtin_ia32_wrfsbase64, "vUOi", "n", "fsgsbase")
 TARGET_BUILTIN(__builtin_ia32_wrgsbase32, "vUi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrgsbase64, "vULLi", "n", "fsgsbase")
+TARGET_BUILTIN(__builtin_ia32_wrgsbase64, "vUOi", "n", "fsgsbase")
 TARGET_BUILTIN(__builtin_ia32_fxrstor64, "vv*", "n", "fxsr")
 TARGET_BUILTIN(__builtin_ia32_fxsave64, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_xsave64, "vv*ULLi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xrstor64, "vv*ULLi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xsaveopt64, "vv*ULLi", "n", "xsaveopt")
-TARGET_BUILTIN(__builtin_ia32_xrstors64, "vv*ULLi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_xsavec64, "vv*ULLi", "n", "xsavec")
-TARGET_BUILTIN(__builtin_ia32_xsaves64, "vv*ULLi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_incsspq, "vULLi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rdsspq, "ULLiULLi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrssq, "vULLiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrussq, "vULLiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcULLiULLiULLi*", "n", "")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcULLiULLiULLi*", "n", "")
-TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiULLi*", "n", "rdrnd")
-TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiULLi*", "n", "rdseed")
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "ULLiULLi", "nc", "lzcnt")
-TARGET_BUILTIN(__builtin_ia32_bextr_u64, "ULLiULLiULLi", "nc", "bmi")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "ULLiULLi", "nc", "")
-TARGET_BUILTIN(__builtin_ia32_bzhi_di, "ULLiULLiULLi", "nc", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pdep_di, "ULLiULLiULLi", "nc", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pext_di, "ULLiULLiULLi", "nc", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_bextri_u64, "ULLiULLiIULLi", "nc", "tbm")
-TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcULLiUiUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpval64, "vULLiUiUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "LLiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "ULLiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "LLiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2usi64, "ULLiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2si64, "LLiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi64, "ULLiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2si64, "LLiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usi64, "ULLiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dLLiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fLLiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dULLiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fULLiIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_xsave64, "vv*UOi", "n", "xsave")
+TARGET_BUILTIN(__builtin_ia32_xrstor64, "vv*UOi", "n", "xsave")
+TARGET_BUILTIN(__builtin_ia32_xsaveopt64, "vv*UOi", "n", "xsaveopt")
+TARGET_BUILTIN(__builtin_ia32_xrstors64, "vv*UOi", "n", "xsaves")
+TARGET_BUILTIN(__builtin_ia32_xsavec64, "vv*UOi", "n", "xsavec")
+TARGET_BUILTIN(__builtin_ia32_xsaves64, "vv*UOi", "n", "xsaves")
+TARGET_BUILTIN(__builtin_ia32_incsspq, "vUOi", "n", "shstk")
+TARGET_BUILTIN(__builtin_ia32_rdsspq, "UOiUOi", "n", "shstk")
+TARGET_BUILTIN(__builtin_ia32_wrssq, "vUOiv*", "n", "shstk")
+TARGET_BUILTIN(__builtin_ia32_wrussq, "vUOiv*", "n", "shstk")
+TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcUOiUOiUOi*", "n", "")
+TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcUOiUOiUOi*", "n", "")
+TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiUOi*", "n", "rdrnd")
+TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiUOi*", "n", "rdseed")
+TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "nc", "lzcnt")
+TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "nc", "bmi")
+TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "nc", "")
+TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "nc", "bmi2")
+TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "nc", "bmi2")
+TARGET_BUILTIN(__builtin_ia32_pext_di, "UOiUOiUOi", "nc", "bmi2")
+TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "nc", "tbm")
+TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcUOiUiUi", "n", "lwp")
+TARGET_BUILTIN(__builtin_ia32_lwpval64, "vUOiUiUi", "n", "lwp")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "OiV2dIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "OiV4fIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvtss2usi64, "UOiV4fIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2si64, "OiV2dIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2si64, "OiV4fIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vcvttss2usi64, "UOiV4fIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dOiIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri")
-TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vULLi", "n", "ptwrite")
+TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN

+ 15 - 4
lib/AST/ASTContext.cpp

@@ -9282,13 +9282,13 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
       Unsigned = true;
       break;
     case 'L':
-      assert(!IsSpecial && "Can't use 'L' with 'W', 'N' or 'Z' modifiers");
+      assert(!IsSpecial && "Can't use 'L' with 'W', 'N', 'Z' or 'O' modifiers");
       assert(HowLong <= 2 && "Can't have LLLL modifier");
       ++HowLong;
       break;
     case 'N':
       // 'N' behaves like 'L' for all non LP64 targets and 'int' otherwise.
-      assert(!IsSpecial && "Can't use two 'N', 'W' or 'Z' modifiers!");
+      assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
       assert(HowLong == 0 && "Can't use both 'L' and 'N' modifiers!");
       #ifndef NDEBUG
       IsSpecial = true;
@@ -9298,7 +9298,7 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
       break;
     case 'W':
       // This modifier represents int64 type.
-      assert(!IsSpecial && "Can't use two 'N', 'W' or 'Z'  modifiers!");
+      assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
       assert(HowLong == 0 && "Can't use both 'L' and 'W' modifiers!");
       #ifndef NDEBUG
       IsSpecial = true;
@@ -9316,7 +9316,7 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
       break;
     case 'Z':
       // This modifier represents int32 type.
-      assert(!IsSpecial && "Can't use two 'N', 'W' or 'Z' modifiers!");
+      assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
       assert(HowLong == 0 && "Can't use both 'L' and 'Z' modifiers!");
       #ifndef NDEBUG
       IsSpecial = true;
@@ -9335,6 +9335,17 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
         break;
       }
       break;
+    case 'O':
+      assert(!IsSpecial && "Can't use two 'N', 'W', 'Z' or 'O' modifiers!");
+      assert(HowLong == 0 && "Can't use both 'L' and 'O' modifiers!");
+      #ifndef NDEBUG
+      IsSpecial = true;
+      #endif
+      if (Context.getLangOpts().OpenCL)
+        HowLong = 1;
+      else
+        HowLong = 2;
+      break;
     }
   }
 

+ 47 - 11
test/CodeGen/builtins-x86.c

@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -emit-llvm -o %t %s
 // RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +shstk -target-feature +clzero -target-feature +wbnoinvd -target-feature +cldemote -fsyntax-only -o %t %s
+// RUN: %clang_cc1 -DUSE_64 -DOPENCL -x cl -cl-std=CL2.0 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -emit-llvm -o %t %s
 
 #ifdef USE_ALL
 #define USE_3DNOW
@@ -11,7 +12,11 @@
 typedef char V8c __attribute__((vector_size(8 * sizeof(char))));
 typedef signed short V4s __attribute__((vector_size(8)));
 typedef signed int V2i __attribute__((vector_size(8)));
+#ifndef OPENCL
 typedef signed long long V1LLi __attribute__((vector_size(8)));
+#else
+typedef signed long V1LLi __attribute__((vector_size(8)));
+#endif
 
 typedef float V2f __attribute__((vector_size(8)));
 
@@ -19,7 +24,11 @@ typedef float V2f __attribute__((vector_size(8)));
 typedef char V16c __attribute__((vector_size(16)));
 typedef signed short V8s __attribute__((vector_size(16)));
 typedef signed int V4i __attribute__((vector_size(16)));
+#ifndef OPENCL
 typedef signed long long V2LLi __attribute__((vector_size(16)));
+#else
+typedef signed long V2LLi __attribute__((vector_size(16)));
+#endif
 
 typedef float V4f __attribute__((vector_size(16)));
 typedef double V2d __attribute__((vector_size(16)));
@@ -27,7 +36,11 @@ typedef double V2d __attribute__((vector_size(16)));
 // 256-bit
 typedef char V32c __attribute__((vector_size(32)));
 typedef signed int V8i __attribute__((vector_size(32)));
+#ifndef OPENCL
 typedef signed long long V4LLi __attribute__((vector_size(32)));
+#else
+typedef signed long V4LLi __attribute__((vector_size(32)));
+#endif
 
 typedef double V4d __attribute__((vector_size(32)));
 typedef float  V8f __attribute__((vector_size(32)));
@@ -41,21 +54,30 @@ void f0() {
 #endif
   signed int          tmp_i;
   unsigned int        tmp_Ui;
+#ifndef OPENCL
   signed long long    tmp_LLi;
   unsigned long long  tmp_ULLi;
+#else
+  signed long         tmp_LLi;
+  unsigned long       tmp_ULLi;
+#endif
   float               tmp_f;
   double              tmp_d;
 
   void*          tmp_vp;
   const void*    tmp_vCp;
-  char*          tmp_cp; 
-  const char*    tmp_cCp; 
+  char*          tmp_cp;
+  const char*    tmp_cCp;
   int*           tmp_ip;
   float*         tmp_fp;
   const float*   tmp_fCp;
   double*        tmp_dp;
   const double*  tmp_dCp;
+#ifndef OPENCL
   long long*     tmp_LLip;
+#else
+  long*          tmp_LLip;
+#endif
 
 #define imm_i 32
 #define imm_i_0_2 0
@@ -102,8 +124,8 @@ void f0() {
   const V4d* tmp_V4dCp;
   const V8f* tmp_V8fCp;
 
-  tmp_V2LLi = __builtin_ia32_undef128();
-  tmp_V4LLi = __builtin_ia32_undef256();
+  tmp_V2d = __builtin_ia32_undef128();
+  tmp_V4d = __builtin_ia32_undef256();
 
   tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f);
   tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f);
@@ -203,9 +225,9 @@ void f0() {
   tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s);
   tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c);
   tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s);
-  tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s);
-  tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i);
-  tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s);
+  tmp_V16c = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s);
+  tmp_V8s = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i);
+  tmp_V16c = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s);
   tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s);
   tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f);
   tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d);
@@ -225,7 +247,7 @@ void f0() {
   tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
   tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
-  tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
+  tmp_V8s = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
   tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
   tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
@@ -271,9 +293,13 @@ void f0() {
   __builtin_ia32_clrssbsy(tmp_vp);
 
   (void) __builtin_ia32_ldmxcsr(tmp_Ui);
+#ifndef OPENCL
   (void) _mm_setcsr(tmp_Ui);
+#endif
   tmp_Ui = __builtin_ia32_stmxcsr();
+#ifndef OPENCL
   tmp_Ui = _mm_getcsr();
+#endif
   (void)__builtin_ia32_fxsave(tmp_vp);
   (void)__builtin_ia32_fxsave64(tmp_vp);
   (void)__builtin_ia32_fxrstor(tmp_vp);
@@ -321,7 +347,9 @@ void f0() {
   tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
   (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
+#ifndef OPENCL
   (void) _mm_sfence();
+#endif
 
   tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
   tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
@@ -356,13 +384,21 @@ void f0() {
   tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
   tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
   (void) __builtin_ia32_clflush(tmp_vCp);
+#ifndef OPENCL
   (void) _mm_clflush(tmp_vCp);
+#endif
   (void) __builtin_ia32_lfence();
+#ifndef OPENCL
   (void) _mm_lfence();
+#endif
   (void) __builtin_ia32_mfence();
+#ifndef OPENCL
   (void) _mm_mfence();
+#endif
   (void) __builtin_ia32_pause();
+#ifndef OPENCL
   (void) _mm_pause();
+#endif
   tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
   tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
   tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
@@ -389,12 +425,12 @@ void f0() {
   tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i);
   tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i);
   tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i);
-  tmp_V8s = __builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s);
+  tmp_V4i = __builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s);
   (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui);
   (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
   tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
-  tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i);
-  tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i);
+  tmp_V16c = __builtin_ia32_palignr128(tmp_V16c, tmp_V16c, imm_i);
+  tmp_V8c = __builtin_ia32_palignr(tmp_V8c, tmp_V8c, imm_i);
 #ifdef USE_SSE4
   tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است