// parallel_for_codegen.cpp — clang OpenMP '#pragma omp parallel for' code-generation regression test.
  1. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
  2. // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
  3. // RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
  4. // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
  5. // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -O1 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=CLEANUP
  6. // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s
  7. // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
  8. // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
  9. // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
  10. // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -O1 -fopenmp-simd -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
  11. // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
  12. // expected-no-diagnostics
  13. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix=OMP5 %s
  14. // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
  15. // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=OMP5 %s
  16. // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
  17. #ifndef HEADER
  18. #define HEADER
  19. // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
  20. // CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 514, i32 0, i32 0, i8*
// Non-constant chunk expression: 'char(a)' is evaluated (fptosi to i8), captured
// into the outlined region, then sign-extended to i64 for the static-init runtime call.
21. // CHECK-LABEL: with_var_schedule
22. void with_var_schedule() {
23. double a = 5;
24. // CHECK: [[CHUNK_SIZE:%.+]] = fptosi double %{{.+}}to i8
25. // CHECK: store i8 %{{.+}}, i8* [[CHUNK:%.+]],
26. // CHECK: [[VAL:%.+]] = load i8, i8* [[CHUNK]],
27. // CHECK: store i8 [[VAL]], i8*
28. // CHECK: [[CHUNK:%.+]] = load i64, i64* %
29. // CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, i64 [[CHUNK]])
30. // CHECK: [[CHUNK_VAL:%.+]] = load i8, i8* %
31. // CHECK: [[CHUNK_SIZE:%.+]] = sext i8 [[CHUNK_VAL]] to i64
32. // CHECK: call void @__kmpc_for_static_init_8u([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID:%[^,]+]], i32 33, i32* [[IS_LAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]], i64 1, i64 [[CHUNK_SIZE]])
33. // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
34. #pragma omp parallel for schedule(static, char(a))
35. for (unsigned long long i = 1; i < 2; ++i) {
36. }
37. }
// Default schedule: outlined body uses __kmpc_for_static_init_4 with schedule
// kind 34 (static, unchunked); 'i' is recomputed from the iteration variable as 33 + 7*IV.
38. // CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
39. void without_schedule_clause(float *a, float *b, float *c, float *d) {
40. #pragma omp parallel for
41. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
42. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
43. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
44. // CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID:%.+]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
45. // UB = min(UB, GlobalUB)
46. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
47. // CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
48. // CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
49. // CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
50. // CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
51. // CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
52. // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
53. // Loop header
54. // CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
55. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
56. // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
57. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
58. for (int i = 33; i < 32000000; i += 7) {
59. // CHECK: [[LOOP1_BODY]]
60. // Start of body: calculate i from IV:
61. // CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
62. // CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
63. // CHECK-NEXT: [[CALC_I_2:%.+]] = add nsw i32 33, [[CALC_I_1]]
64. // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
65. // ... loop body ...
66. // End of body: store into a[i]:
67. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
68. a[i] = b[i] * c[i] * d[i];
69. // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
70. // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
71. // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
72. // CHECK-NEXT: br label %{{.+}}
73. }
74. // CHECK: [[LOOP1_END]]
75. // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
76. // CHECK: ret void
77. }
// schedule(static) with a negative-step loop: same static init (kind 34),
// but 'i' maps back from the iteration variable as 32000000 - 7*IV.
78. // CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
79. void static_not_chunked(float *a, float *b, float *c, float *d) {
80. #pragma omp parallel for schedule(static)
81. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
82. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
83. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
84. // CHECK: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID:%.+]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
85. // UB = min(UB, GlobalUB)
86. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
87. // CHECK-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4571423
88. // CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
89. // CHECK: [[UBRESULT:%.+]] = phi i32 [ 4571423, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
90. // CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
91. // CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
92. // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
93. // Loop header
94. // CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
95. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
96. // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
97. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
98. for (int i = 32000000; i > 33; i += -7) {
99. // CHECK: [[LOOP1_BODY]]
100. // Start of body: calculate i from IV:
101. // CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
102. // CHECK-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 7
103. // CHECK-NEXT: [[CALC_I_2:%.+]] = sub nsw i32 32000000, [[CALC_I_1]]
104. // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
105. // ... loop body ...
106. // End of body: store into a[i]:
107. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
108. a[i] = b[i] * c[i] * d[i];
109. // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
110. // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
111. // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
112. // CHECK-NEXT: br label %{{.+}}
113. }
114. // CHECK: [[LOOP1_END]]
115. // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
116. // CHECK: ret void
117. }
// schedule(static, 5): kind 33 (static chunked), unsigned induction variable,
// so an outer chunk loop advances LB/UB by the stride between chunks.
118. // CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
119. void static_chunked(float *a, float *b, float *c, float *d) {
120. #pragma omp parallel for schedule(static, 5)
121. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
122. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
123. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
124. // CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID:%.+]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
125. // UB = min(UB, GlobalUB)
126. // CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
127. // CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
128. // CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
129. // CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
130. // CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
131. // CHECK-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
132. // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
133. // Outer loop header
134. // CHECK: [[O_IV:%.+]] = load i32, i32* [[OMP_IV]]
135. // CHECK-NEXT: [[O_UB:%.+]] = load i32, i32* [[OMP_UB]]
136. // CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
137. // CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
138. // Loop header
139. // CHECK: [[O_LOOP1_BODY]]
140. // CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
141. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
142. // CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
143. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
144. for (unsigned i = 131071; i <= 2147483647; i += 127) {
145. // CHECK: [[LOOP1_BODY]]
146. // Start of body: calculate i from IV:
147. // CHECK: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
148. // CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
149. // CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
150. // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
151. // ... loop body ...
152. // End of body: store into a[i]:
153. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
154. a[i] = b[i] * c[i] * d[i];
155. // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
156. // CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
157. // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
158. // CHECK-NEXT: br label %{{.+}}
159. }
160. // CHECK: [[LOOP1_END]]
161. // Update the counters, adding stride
162. // CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
163. // CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
164. // CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
165. // CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
166. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
167. // CHECK-NEXT: [[ST:%.+]] = load i32, i32* [[OMP_ST]]
168. // CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
169. // CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]
170. // CHECK: [[O_LOOP1_END]]
171. // CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
172. // CHECK: ret void
173. }
// schedule(dynamic): dispatch-based codegen — __kmpc_dispatch_init_8u (kind 35)
// plus a dispatch_next-driven outer loop; the inner bound uses UB+1 with icmp ult.
174. // CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
175. void dynamic1(float *a, float *b, float *c, float *d) {
176. #pragma omp parallel for schedule(dynamic)
177. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
178. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
179. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
180. // CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID:%.+]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
181. //
182. // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
183. // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
184. // CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
185. // Loop header
186. // CHECK: [[O_LOOP1_BODY]]
187. // CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
188. // CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
189. // CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
190. // CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
191. // CHECK-NEXT: [[BOUND:%.+]] = add i64 [[UB]], 1
192. // CHECK-NEXT: [[CMP:%.+]] = icmp ult i64 [[IV]], [[BOUND]]
193. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
194. for (unsigned long long i = 131071; i < 2147483647; i += 127) {
195. // CHECK: [[LOOP1_BODY]]
196. // Start of body: calculate i from IV:
197. // CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
198. // CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
199. // CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
200. // CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
201. // ... loop body ...
202. // End of body: store into a[i]:
203. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
204. a[i] = b[i] * c[i] * d[i];
205. // CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
206. // CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
207. // CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
208. // CHECK-NEXT: br label %{{.+}}
209. }
210. // CHECK: [[LOOP1_END]]
211. // CHECK: [[O_LOOP1_END]]
212. // CHECK: ret void
213. }
// schedule(guided, 7): same dispatch pattern as the dynamic case,
// but schedule kind 36 with chunk size 7 passed to dispatch_init.
214. // CHECK-LABEL: define {{.*void}} @{{.*}}guided7{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
215. void guided7(float *a, float *b, float *c, float *d) {
216. #pragma omp parallel for schedule(guided, 7)
217. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
218. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
219. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
220. // CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID:%.+]], i32 36, i64 0, i64 16908287, i64 1, i64 7)
221. //
222. // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
223. // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
224. // CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
225. // Loop header
226. // CHECK: [[O_LOOP1_BODY]]
227. // CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
228. // CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
229. // CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
230. // CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
231. // CHECK-NEXT: [[BOUND:%.+]] = add i64 [[UB]], 1
232. // CHECK-NEXT: [[CMP:%.+]] = icmp ult i64 [[IV]], [[BOUND]]
233. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
234. for (unsigned long long i = 131071; i < 2147483647; i += 127) {
235. // CHECK: [[LOOP1_BODY]]
236. // Start of body: calculate i from IV:
237. // CHECK: [[IV1_1:%.+]] = load i64, i64* [[OMP_IV]]
238. // CHECK-NEXT: [[CALC_I_1:%.+]] = mul i64 [[IV1_1]], 127
239. // CHECK-NEXT: [[CALC_I_2:%.+]] = add i64 131071, [[CALC_I_1]]
240. // CHECK-NEXT: store i64 [[CALC_I_2]], i64* [[LC_I:.+]]
241. // ... loop body ...
242. // End of body: store into a[i]:
243. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
244. a[i] = b[i] * c[i] * d[i];
245. // CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
246. // CHECK-NEXT: [[ADD1_2:%.+]] = add i64 [[IV1_2]], 1
247. // CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
248. // CHECK-NEXT: br label %{{.+}}
249. }
250. // CHECK: [[LOOP1_END]]
251. // CHECK: [[O_LOOP1_END]]
252. // CHECK: ret void
253. }
// schedule(auto) collapse(2): collapsed iteration space with a runtime-computed
// trip count ('y' is not a constant), lowered as signed 64-bit dispatch, kind 38.
254. // CHECK-LABEL: define {{.*void}} @{{.*}}test_auto{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
255. void test_auto(float *a, float *b, float *c, float *d) {
256. unsigned int x = 0;
257. unsigned int y = 0;
258. #pragma omp parallel for schedule(auto) collapse(2)
259. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
260. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
261. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
262. // CHECK: call void @__kmpc_dispatch_init_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID:%.+]], i32 38, i64 0, i64 [[LAST_ITER:%[^,]+]], i64 1, i64 1)
263. //
264. // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
265. // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
266. // CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
267. // Loop header
268. // CHECK: [[O_LOOP1_BODY]]
269. // CHECK: [[LB:%.+]] = load i64, i64* [[OMP_LB]]
270. // CHECK-NEXT: store i64 [[LB]], i64* [[OMP_IV:[^,]+]]
271. // CHECK: [[IV:%.+]] = load i64, i64* [[OMP_IV]]
272. // CHECK-NEXT: [[UB:%.+]] = load i64, i64* [[OMP_UB]]
273. // CHECK-NEXT: [[CMP:%.+]] = icmp sle i64 [[IV]], [[UB]]
274. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
275. // FIXME: When the iteration count of some nested loop is not a known constant,
276. // we should pre-calculate it, like we do for the total number of iterations!
277. for (char i = static_cast<char>(y); i <= '9'; ++i)
278. for (x = 11; x > 0; --x) {
279. // CHECK: [[LOOP1_BODY]]
280. // Start of body: indices are calculated from IV:
281. // CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
282. // CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
283. // ... loop body ...
284. // End of body: store into a[i]:
285. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
286. a[i] = b[i] * c[i] * d[i];
287. // CHECK: [[IV1_2:%.+]] = load i64, i64* [[OMP_IV]]{{.*}}
288. // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i64 [[IV1_2]], 1
289. // CHECK-NEXT: store i64 [[ADD1_2]], i64* [[OMP_IV]]
290. // CHECK-NEXT: br label %{{.+}}
291. }
292. // CHECK: [[LOOP1_END]]
293. // CHECK: [[O_LOOP1_END]]
294. // CHECK: ret void
295. }
// schedule(runtime) collapse(2): kind 37 dispatch; both loop counts are constant
// ('0'..'9' is 10, -10..9 is 20 => 200 iterations, upper bound 199).
296. // CHECK-LABEL: define {{.*void}} @{{.*}}runtime{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
297. void runtime(float *a, float *b, float *c, float *d) {
298. int x = 0;
299. #pragma omp parallel for collapse(2) schedule(runtime)
300. // CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float**, float**, float**, float**)* [[OMP_PARALLEL_FUNC:@.+]] to void (i32*, i32*, ...)*),
301. // CHECK: define internal void [[OMP_PARALLEL_FUNC]](i32* noalias [[GTID_PARAM_ADDR:%.+]], i32* noalias %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}}, float** dereferenceable(8) %{{.+}})
302. // CHECK: store i32* [[GTID_PARAM_ADDR]], i32** [[GTID_REF_ADDR:%.+]],
303. // CHECK: call void @__kmpc_dispatch_init_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID:%.+]], i32 37, i32 0, i32 199, i32 1, i32 1)
304. //
305. // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_4([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]])
306. // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
307. // CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]
308. // Loop header
309. // CHECK: [[O_LOOP1_BODY]]
310. // CHECK: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
311. // CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
312. // CHECK: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
313. // CHECK-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
314. // CHECK-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
315. // CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
316. for (unsigned char i = '0' ; i <= '9'; ++i)
317. for (x = -10; x < 10; ++x) {
318. // CHECK: [[LOOP1_BODY]]
319. // Start of body: indices are calculated from IV:
320. // CHECK: store i8 {{%[^,]+}}, i8* {{%[^,]+}}
321. // CHECK: store i32 {{%[^,]+}}, i32* {{%[^,]+}}
322. // ... loop body ...
323. // End of body: store into a[i]:
324. // CHECK: store float [[RESULT:%.+]], float* {{%.+}}
325. a[i] = b[i] * c[i] * d[i];
326. // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
327. // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
328. // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
329. // CHECK-NEXT: br label %{{.+}}
330. }
331. // CHECK: [[LOOP1_END]]
332. // CHECK: [[O_LOOP1_END]]
333. // CHECK: ret void
334. }
// Trivial helper; invoked from parallel_for below so the exception/terminate
// path has a call that can unwind.
335. // TERM_DEBUG-LABEL: foo
336. int foo() {return 0;};
// Exercises debug-location and cleanup codegen: VLA 'arr' is private, 'n'
// firstprivate. NOTE: the trailing DILocation matches are line-relative
// ([[@LINE-N]]); do not insert or remove lines inside this function.
337. // TERM_DEBUG-LABEL: parallel_for
338. // CLEANUP: parallel_for
339. void parallel_for(float *a, const int n) {
340. float arr[n];
341. #pragma omp parallel for schedule(static, 5) private(arr) default(none) firstprivate(n) shared(a)
342. // TERM_DEBUG-NOT: __kmpc_global_thread_num
343. // TERM_DEBUG: call void @__kmpc_for_static_init_4u({{.+}}), !dbg [[DBG_LOC_START:![0-9]+]]
344. // TERM_DEBUG: invoke i32 {{.*}}foo{{.*}}()
345. // TERM_DEBUG: unwind label %[[TERM_LPAD:.+]],
346. // TERM_DEBUG-NOT: __kmpc_global_thread_num
347. // TERM_DEBUG: call void @__kmpc_for_static_fini({{.+}}), !dbg [[DBG_LOC_END:![0-9]+]]
348. // TERM_DEBUG: [[TERM_LPAD]]
349. // TERM_DEBUG: call void @__clang_call_terminate
350. // TERM_DEBUG: unreachable
351. // CLEANUP-NOT: __kmpc_global_thread_num
352. // CLEANUP: call void @__kmpc_for_static_init_4u({{.+}})
353. // CLEANUP: call void @__kmpc_for_static_fini({{.+}})
354. for (unsigned i = 131071; i <= 2147483647; i += 127)
355. a[i] += foo() + arr[i] + n;
356. }
357. // Check source line corresponds to "#pragma omp parallel for schedule(static, 5)" above:
358. // TERM_DEBUG-DAG: [[DBG_LOC_START]] = !DILocation(line: [[@LINE-4]],
359. // TERM_DEBUG-DAG: [[DBG_LOC_END]] = !DILocation(line: [[@LINE-18]],
  360. #ifdef OMP5
  361. // OMP5-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
  362. // OMP5-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 514, i32 0, i32 0, i8*
// '#pragma omp for' whose condition uses '!=' (accepted under -fopenmp-version=50):
// static init kind 34, trip count 5, and an implicit barrier after the worksharing region.
363. // OMP5-LABEL: increment
364. int increment () {
365. // OMP5: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
366. #pragma omp for
367. // Determine UB = min(UB, GlobalUB)
368. // OMP5: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
369. // OMP5-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
370. // OMP5-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4
371. // OMP5-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
372. // OMP5: [[UBRESULT:%.+]] = phi i32 [ 4, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
373. // OMP5-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
374. // OMP5-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
375. // OMP5-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
376. // OMP5-NEXT: br label %[[LOOP1_HEAD:.+]]
377. // Loop header
378. // OMP5: [[LOOP1_HEAD]]
379. // OMP5: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
380. // OMP5-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
381. // OMP5-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
382. // OMP5-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
383. for (int i = 0 ; i != 5; ++i)
384. // Start of body: calculate i from IV:
385. // OMP5: [[LOOP1_BODY]]
386. // OMP5: [[IV1_1:%.+]] = load i32, i32* [[OMP_IV]]
387. // OMP5-NEXT: [[CALC_I_1:%.+]] = mul nsw i32 [[IV1_1]], 1
388. // OMP5-NEXT: [[CALC_I_2:%.+]] = add nsw i32 0, [[CALC_I_1]]
389. // OMP5-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
390. // OMP5: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
391. // OMP5-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
392. // OMP5-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
393. // OMP5-NEXT: br label %[[LOOP1_HEAD]]
394. ;
395. // OMP5: [[LOOP1_END]]
396. // OMP5: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
397. // OMP5: __kmpc_barrier
398. return 0;
399. // OMP5: ret i32 0
400. }
// Same shape as increment but counting down ('j' maps back as 5 - IV) and with
// 'nowait' — the negative match verifies no barrier follows the static fini.
401. // OMP5-LABEL: decrement_nowait
402. int decrement_nowait () {
403. // OMP5: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
404. #pragma omp for nowait
405. // Determine UB = min(UB, GlobalUB)
406. // OMP5: call void @__kmpc_for_static_init_4([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]], i32 34, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 1)
407. // OMP5-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
408. // OMP5-NEXT: [[UBCMP:%.+]] = icmp sgt i32 [[UB]], 4
409. // OMP5-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
410. // OMP5: [[UBRESULT:%.+]] = phi i32 [ 4, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
411. // OMP5-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
412. // OMP5-NEXT: [[LB:%.+]] = load i32, i32* [[OMP_LB]]
413. // OMP5-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]
414. // OMP5-NEXT: br label %[[LOOP1_HEAD:.+]]
415. // Loop header
416. // OMP5: [[LOOP1_HEAD]]
417. // OMP5: [[IV:%.+]] = load i32, i32* [[OMP_IV]]
418. // OMP5-NEXT: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
419. // OMP5-NEXT: [[CMP:%.+]] = icmp sle i32 [[IV]], [[UB]]
420. // OMP5-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
421. for (int j = 5 ; j != 0; --j)
422. // Start of body: calculate i from IV:
423. // OMP5: [[LOOP1_BODY]]
424. // OMP5: [[IV2_1:%.+]] = load i32, i32* [[OMP_IV]]
425. // OMP5-NEXT: [[CALC_II_1:%.+]] = mul nsw i32 [[IV2_1]], 1
426. // OMP5-NEXT: [[CALC_II_2:%.+]] = sub nsw i32 5, [[CALC_II_1]]
427. // OMP5-NEXT: store i32 [[CALC_II_2]], i32* [[LC_I:.+]]
428. // OMP5: [[IV2_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}
429. // OMP5-NEXT: [[ADD2_2:%.+]] = add nsw i32 [[IV2_2]], 1
430. // OMP5-NEXT: store i32 [[ADD2_2]], i32* [[OMP_IV]]
431. // OMP5-NEXT: br label %[[LOOP1_HEAD]]
432. ;
433. // OMP5: [[LOOP1_END]]
434. // OMP5: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[LOOP_LOC]], i32 [[GTID]])
435. // OMP5-NOT: __kmpc_barrier
436. return 0;
437. // OMP5: ret i32 0
438. }
  439. #endif
  440. #endif // HEADER