ConvertUTFTest.cpp 62 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711
  1. //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/ADT/ArrayRef.h"
  10. #include "gtest/gtest.h"
  11. #include <string>
  12. #include <vector>
  13. using namespace llvm;
  14. TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  15. // Src is the look of disapproval.
  16. static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  17. ArrayRef<char> Ref(Src, sizeof(Src) - 1);
  18. std::string Result;
  19. bool Success = convertUTF16ToUTF8String(Ref, Result);
  20. EXPECT_TRUE(Success);
  21. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  22. EXPECT_EQ(Expected, Result);
  23. }
  24. TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  25. // Src is the look of disapproval.
  26. static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
  27. ArrayRef<char> Ref(Src, sizeof(Src) - 1);
  28. std::string Result;
  29. bool Success = convertUTF16ToUTF8String(Ref, Result);
  30. EXPECT_TRUE(Success);
  31. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  32. EXPECT_EQ(Expected, Result);
  33. }
  34. TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
  35. // Src is the look of disapproval.
  36. static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  37. StringRef Ref(Src, sizeof(Src) - 1);
  38. SmallVector<UTF16, 5> Result;
  39. bool Success = convertUTF8ToUTF16String(Ref, Result);
  40. EXPECT_TRUE(Success);
  41. static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
  42. ASSERT_EQ(3u, Result.size());
  43. for (int I = 0, E = 3; I != E; ++I)
  44. EXPECT_EQ(Expected[I], Result[I]);
  45. }
  46. TEST(ConvertUTFTest, OddLengthInput) {
  47. std::string Result;
  48. bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
  49. EXPECT_FALSE(Success);
  50. }
  51. TEST(ConvertUTFTest, Empty) {
  52. std::string Result;
  53. bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
  54. EXPECT_TRUE(Success);
  55. EXPECT_TRUE(Result.empty());
  56. }
  57. TEST(ConvertUTFTest, HasUTF16BOM) {
  58. bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
  59. EXPECT_TRUE(HasBOM);
  60. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
  61. EXPECT_TRUE(HasBOM);
  62. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
  63. EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
  64. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
  65. EXPECT_TRUE(HasBOM);
  66. HasBOM = hasUTF16ByteOrderMark(None);
  67. EXPECT_FALSE(HasBOM);
  68. HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
  69. EXPECT_FALSE(HasBOM);
  70. }
  71. TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
  72. // Src is the look of disapproval.
  73. static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  74. ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4);
  75. std::string Result;
  76. bool Success = convertUTF16ToUTF8String(SrcRef, Result);
  77. EXPECT_TRUE(Success);
  78. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  79. EXPECT_EQ(Expected, Result);
  80. }
  81. TEST(ConvertUTFTest, ConvertUTF8toWide) {
  82. // Src is the look of disapproval.
  83. static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  84. std::wstring Result;
  85. bool Success = ConvertUTF8toWide((const char*)Src, Result);
  86. EXPECT_TRUE(Success);
  87. std::wstring Expected(L"\x0ca0_\x0ca0");
  88. EXPECT_EQ(Expected, Result);
  89. Result.clear();
  90. Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
  91. EXPECT_TRUE(Success);
  92. EXPECT_EQ(Expected, Result);
  93. }
  94. TEST(ConvertUTFTest, convertWideToUTF8) {
  95. // Src is the look of disapproval.
  96. static const wchar_t Src[] = L"\x0ca0_\x0ca0";
  97. std::string Result;
  98. bool Success = convertWideToUTF8(Src, Result);
  99. EXPECT_TRUE(Success);
  100. std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  101. EXPECT_EQ(Expected, Result);
  102. }
  103. struct ConvertUTFResultContainer {
  104. ConversionResult ErrorCode;
  105. std::vector<unsigned> UnicodeScalars;
  106. ConvertUTFResultContainer(ConversionResult ErrorCode)
  107. : ErrorCode(ErrorCode) {}
  108. ConvertUTFResultContainer
  109. withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
  110. unsigned US2 = 0x110000, unsigned US3 = 0x110000,
  111. unsigned US4 = 0x110000, unsigned US5 = 0x110000,
  112. unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
  113. ConvertUTFResultContainer Result(*this);
  114. if (US0 != 0x110000)
  115. Result.UnicodeScalars.push_back(US0);
  116. if (US1 != 0x110000)
  117. Result.UnicodeScalars.push_back(US1);
  118. if (US2 != 0x110000)
  119. Result.UnicodeScalars.push_back(US2);
  120. if (US3 != 0x110000)
  121. Result.UnicodeScalars.push_back(US3);
  122. if (US4 != 0x110000)
  123. Result.UnicodeScalars.push_back(US4);
  124. if (US5 != 0x110000)
  125. Result.UnicodeScalars.push_back(US5);
  126. if (US6 != 0x110000)
  127. Result.UnicodeScalars.push_back(US6);
  128. if (US7 != 0x110000)
  129. Result.UnicodeScalars.push_back(US7);
  130. return Result;
  131. }
  132. };
  133. std::pair<ConversionResult, std::vector<unsigned>>
  134. ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
  135. const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
  136. const UTF8 *SourceNext = SourceStart;
  137. std::vector<UTF32> Decoded(S.size(), 0);
  138. UTF32 *TargetStart = Decoded.data();
  139. auto ErrorCode =
  140. ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
  141. Decoded.data() + Decoded.size(), lenientConversion);
  142. Decoded.resize(TargetStart - Decoded.data());
  143. return std::make_pair(ErrorCode, Decoded);
  144. }
  145. std::pair<ConversionResult, std::vector<unsigned>>
  146. ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
  147. const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
  148. const UTF8 *SourceNext = SourceStart;
  149. std::vector<UTF32> Decoded(S.size(), 0);
  150. UTF32 *TargetStart = Decoded.data();
  151. auto ErrorCode = ConvertUTF8toUTF32Partial(
  152. &SourceNext, SourceStart + S.size(), &TargetStart,
  153. Decoded.data() + Decoded.size(), lenientConversion);
  154. Decoded.resize(TargetStart - Decoded.data());
  155. return std::make_pair(ErrorCode, Decoded);
  156. }
  157. ::testing::AssertionResult
  158. CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
  159. StringRef S, bool Partial = false) {
  160. ConversionResult ErrorCode;
  161. std::vector<unsigned> Decoded;
  162. if (!Partial)
  163. std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
  164. else
  165. std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
  166. if (Expected.ErrorCode != ErrorCode)
  167. return ::testing::AssertionFailure() << "Expected error code "
  168. << Expected.ErrorCode << ", actual "
  169. << ErrorCode;
  170. if (Expected.UnicodeScalars != Decoded)
  171. return ::testing::AssertionFailure()
  172. << "Expected lenient decoded result:\n"
  173. << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
  174. << "Actual result:\n" << ::testing::PrintToString(Decoded);
  175. return ::testing::AssertionSuccess();
  176. }
  177. TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
  178. //
  179. // 1-byte sequences
  180. //
  181. // U+0041 LATIN CAPITAL LETTER A
  182. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  183. ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
  184. //
  185. // 2-byte sequences
  186. //
  187. // U+0283 LATIN SMALL LETTER ESH
  188. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  189. ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
  190. "\xca\x83"));
  191. // U+03BA GREEK SMALL LETTER KAPPA
  192. // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
  193. // U+03C3 GREEK SMALL LETTER SIGMA
  194. // U+03BC GREEK SMALL LETTER MU
  195. // U+03B5 GREEK SMALL LETTER EPSILON
  196. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  197. ConvertUTFResultContainer(conversionOK)
  198. .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
  199. "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
  200. //
  201. // 3-byte sequences
  202. //
  203. // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
  204. // U+6587 CJK UNIFIED IDEOGRAPH-6587
  205. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  206. ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
  207. "\xe4\xbe\x8b\xe6\x96\x87"));
  208. // U+D55C HANGUL SYLLABLE HAN
  209. // U+AE00 HANGUL SYLLABLE GEUL
  210. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  211. ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
  212. "\xed\x95\x9c\xea\xb8\x80"));
  213. // U+1112 HANGUL CHOSEONG HIEUH
  214. // U+1161 HANGUL JUNGSEONG A
  215. // U+11AB HANGUL JONGSEONG NIEUN
  216. // U+1100 HANGUL CHOSEONG KIYEOK
  217. // U+1173 HANGUL JUNGSEONG EU
  218. // U+11AF HANGUL JONGSEONG RIEUL
  219. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  220. ConvertUTFResultContainer(conversionOK)
  221. .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
  222. "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
  223. "\xe1\x86\xaf"));
  224. //
  225. // 4-byte sequences
  226. //
  227. // U+E0100 VARIATION SELECTOR-17
  228. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  229. ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
  230. "\xf3\xa0\x84\x80"));
  231. //
  232. // First possible sequence of a certain length
  233. //
  234. // U+0000 NULL
  235. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  236. ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
  237. StringRef("\x00", 1)));
  238. // U+0080 PADDING CHARACTER
  239. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  240. ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
  241. "\xc2\x80"));
  242. // U+0800 SAMARITAN LETTER ALAF
  243. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  244. ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
  245. "\xe0\xa0\x80"));
  246. // U+10000 LINEAR B SYLLABLE B008 A
  247. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  248. ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
  249. "\xf0\x90\x80\x80"));
  250. // U+200000 (invalid)
  251. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  252. ConvertUTFResultContainer(sourceIllegal)
  253. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  254. "\xf8\x88\x80\x80\x80"));
  255. // U+4000000 (invalid)
  256. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  257. ConvertUTFResultContainer(sourceIllegal)
  258. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  259. "\xfc\x84\x80\x80\x80\x80"));
  260. //
  261. // Last possible sequence of a certain length
  262. //
  263. // U+007F DELETE
  264. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  265. ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
  266. // U+07FF (unassigned)
  267. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  268. ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
  269. "\xdf\xbf"));
  270. // U+FFFF (noncharacter)
  271. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  272. ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
  273. "\xef\xbf\xbf"));
  274. // U+1FFFFF (invalid)
  275. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  276. ConvertUTFResultContainer(sourceIllegal)
  277. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  278. "\xf7\xbf\xbf\xbf"));
  279. // U+3FFFFFF (invalid)
  280. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  281. ConvertUTFResultContainer(sourceIllegal)
  282. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  283. "\xfb\xbf\xbf\xbf\xbf"));
  284. // U+7FFFFFFF (invalid)
  285. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  286. ConvertUTFResultContainer(sourceIllegal)
  287. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  288. "\xfd\xbf\xbf\xbf\xbf\xbf"));
  289. //
  290. // Other boundary conditions
  291. //
  292. // U+D7FF (unassigned)
  293. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  294. ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
  295. "\xed\x9f\xbf"));
  296. // U+E000 (private use)
  297. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  298. ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
  299. "\xee\x80\x80"));
  300. // U+FFFD REPLACEMENT CHARACTER
  301. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  302. ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
  303. "\xef\xbf\xbd"));
  304. // U+10FFFF (noncharacter)
  305. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  306. ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
  307. "\xf4\x8f\xbf\xbf"));
  308. // U+110000 (invalid)
  309. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  310. ConvertUTFResultContainer(sourceIllegal)
  311. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  312. "\xf4\x90\x80\x80"));
  313. //
  314. // Unexpected continuation bytes
  315. //
  316. // A sequence of unexpected continuation bytes that don't follow a first
  317. // byte, every byte is a maximal subpart.
  318. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  319. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
  320. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  321. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
  322. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  323. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  324. "\x80\x80"));
  325. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  326. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  327. "\x80\xbf"));
  328. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  329. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  330. "\xbf\x80"));
  331. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  332. ConvertUTFResultContainer(sourceIllegal)
  333. .withScalars(0xfffd, 0xfffd, 0xfffd),
  334. "\x80\xbf\x80"));
  335. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  336. ConvertUTFResultContainer(sourceIllegal)
  337. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  338. "\x80\xbf\x80\xbf"));
  339. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  340. ConvertUTFResultContainer(sourceIllegal)
  341. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  342. "\x80\xbf\x82\xbf\xaa"));
  343. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  344. ConvertUTFResultContainer(sourceIllegal)
  345. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  346. "\xaa\xb0\xbb\xbf\xaa\xa0"));
  347. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  348. ConvertUTFResultContainer(sourceIllegal)
  349. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  350. "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
  351. // All continuation bytes (0x80--0xbf).
  352. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  353. ConvertUTFResultContainer(sourceIllegal)
  354. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  355. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  356. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  357. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  358. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  359. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  360. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  361. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  362. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  363. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  364. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  365. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  366. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  367. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  368. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  369. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  370. "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
  371. "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
  372. "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
  373. "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
  374. //
  375. // Lonely start bytes
  376. //
  377. // Start bytes of 2-byte sequences (0xc0--0xdf).
  378. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  379. ConvertUTFResultContainer(sourceIllegal)
  380. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  381. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  382. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  383. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  384. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  385. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  386. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  387. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  388. "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
  389. "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
  390. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  391. ConvertUTFResultContainer(sourceIllegal)
  392. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  393. 0xfffd, 0x0020, 0xfffd, 0x0020)
  394. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  395. 0xfffd, 0x0020, 0xfffd, 0x0020)
  396. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  397. 0xfffd, 0x0020, 0xfffd, 0x0020)
  398. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  399. 0xfffd, 0x0020, 0xfffd, 0x0020)
  400. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  401. 0xfffd, 0x0020, 0xfffd, 0x0020)
  402. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  403. 0xfffd, 0x0020, 0xfffd, 0x0020)
  404. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  405. 0xfffd, 0x0020, 0xfffd, 0x0020)
  406. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  407. 0xfffd, 0x0020, 0xfffd, 0x0020),
  408. "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
  409. "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
  410. "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
  411. "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
  412. // Start bytes of 3-byte sequences (0xe0--0xef).
  413. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  414. ConvertUTFResultContainer(sourceIllegal)
  415. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  416. 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  417. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  418. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  419. "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
  420. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  421. ConvertUTFResultContainer(sourceIllegal)
  422. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  423. 0xfffd, 0x0020, 0xfffd, 0x0020)
  424. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  425. 0xfffd, 0x0020, 0xfffd, 0x0020)
  426. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  427. 0xfffd, 0x0020, 0xfffd, 0x0020)
  428. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  429. 0xfffd, 0x0020, 0xfffd, 0x0020),
  430. "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
  431. "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
  432. // Start bytes of 4-byte sequences (0xf0--0xf7).
  433. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  434. ConvertUTFResultContainer(sourceIllegal)
  435. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
  436. 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  437. "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
  438. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  439. ConvertUTFResultContainer(sourceIllegal)
  440. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  441. 0xfffd, 0x0020, 0xfffd, 0x0020)
  442. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  443. 0xfffd, 0x0020, 0xfffd, 0x0020),
  444. "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
  445. // Start bytes of 5-byte sequences (0xf8--0xfb).
  446. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  447. ConvertUTFResultContainer(sourceIllegal)
  448. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  449. "\xf8\xf9\xfa\xfb"));
  450. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  451. ConvertUTFResultContainer(sourceIllegal)
  452. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  453. 0xfffd, 0x0020, 0xfffd, 0x0020),
  454. "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
  455. // Start bytes of 6-byte sequences (0xfc--0xfd).
  456. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  457. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  458. "\xfc\xfd"));
  459. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  460. ConvertUTFResultContainer(sourceIllegal)
  461. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
  462. "\xfc\x20\xfd\x20"));
  463. //
  464. // Other bytes (0xc0--0xc1, 0xfe--0xff).
  465. //
  466. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  467. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
  468. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  469. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
  470. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  471. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
  472. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  473. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
  474. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  475. ConvertUTFResultContainer(sourceIllegal)
  476. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  477. "\xc0\xc1\xfe\xff"));
  478. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  479. ConvertUTFResultContainer(sourceIllegal)
  480. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  481. "\xfe\xfe\xff\xff"));
  482. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  483. ConvertUTFResultContainer(sourceIllegal)
  484. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  485. "\xfe\x80\x80\x80\x80\x80"));
  486. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  487. ConvertUTFResultContainer(sourceIllegal)
  488. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  489. "\xff\x80\x80\x80\x80\x80"));
  490. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  491. ConvertUTFResultContainer(sourceIllegal)
  492. .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
  493. 0xfffd, 0x0020, 0xfffd, 0x0020),
  494. "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
  495. //
  496. // Sequences with one continuation byte missing
  497. //
  498. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  499. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
  500. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  501. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
  502. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  503. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  504. "\xe0\xa0"));
  505. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  506. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  507. "\xe0\xbf"));
  508. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  509. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  510. "\xe1\x80"));
  511. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  512. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  513. "\xec\xbf"));
  514. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  515. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  516. "\xed\x80"));
  517. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  518. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  519. "\xed\x9f"));
  520. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  521. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  522. "\xee\x80"));
  523. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  524. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  525. "\xef\xbf"));
  526. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  527. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  528. "\xf0\x90\x80"));
  529. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  530. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  531. "\xf0\xbf\xbf"));
  532. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  533. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  534. "\xf1\x80\x80"));
  535. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  536. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  537. "\xf3\xbf\xbf"));
  538. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  539. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  540. "\xf4\x80\x80"));
  541. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  542. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  543. "\xf4\x8f\xbf"));
  544. // Overlong sequences with one trailing byte missing.
  545. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  546. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  547. "\xc0"));
  548. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  549. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  550. "\xc1"));
  551. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  552. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  553. "\xe0\x80"));
  554. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  555. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  556. "\xe0\x9f"));
  557. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  558. ConvertUTFResultContainer(sourceIllegal)
  559. .withScalars(0xfffd, 0xfffd, 0xfffd),
  560. "\xf0\x80\x80"));
  561. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  562. ConvertUTFResultContainer(sourceIllegal)
  563. .withScalars(0xfffd, 0xfffd, 0xfffd),
  564. "\xf0\x8f\x80"));
  565. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  566. ConvertUTFResultContainer(sourceIllegal)
  567. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  568. "\xf8\x80\x80\x80"));
  569. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  570. ConvertUTFResultContainer(sourceIllegal)
  571. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  572. "\xfc\x80\x80\x80\x80"));
  573. // Sequences that represent surrogates with one trailing byte missing.
  574. // High surrogates
  575. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  576. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  577. "\xed\xa0"));
  578. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  579. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  580. "\xed\xac"));
  581. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  582. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  583. "\xed\xaf"));
  584. // Low surrogates
  585. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  586. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  587. "\xed\xb0"));
  588. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  589. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  590. "\xed\xb4"));
  591. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  592. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  593. "\xed\xbf"));
  594. // Ill-formed 4-byte sequences.
  595. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  596. // U+1100xx (invalid)
  597. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  598. ConvertUTFResultContainer(sourceIllegal)
  599. .withScalars(0xfffd, 0xfffd, 0xfffd),
  600. "\xf4\x90\x80"));
  601. // U+13FBxx (invalid)
  602. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  603. ConvertUTFResultContainer(sourceIllegal)
  604. .withScalars(0xfffd, 0xfffd, 0xfffd),
  605. "\xf4\xbf\xbf"));
  606. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  607. ConvertUTFResultContainer(sourceIllegal)
  608. .withScalars(0xfffd, 0xfffd, 0xfffd),
  609. "\xf5\x80\x80"));
  610. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  611. ConvertUTFResultContainer(sourceIllegal)
  612. .withScalars(0xfffd, 0xfffd, 0xfffd),
  613. "\xf6\x80\x80"));
  614. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  615. ConvertUTFResultContainer(sourceIllegal)
  616. .withScalars(0xfffd, 0xfffd, 0xfffd),
  617. "\xf7\x80\x80"));
  618. // U+1FFBxx (invalid)
  619. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  620. ConvertUTFResultContainer(sourceIllegal)
  621. .withScalars(0xfffd, 0xfffd, 0xfffd),
  622. "\xf7\xbf\xbf"));
  623. // Ill-formed 5-byte sequences.
  624. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  625. // U+2000xx (invalid)
  626. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  627. ConvertUTFResultContainer(sourceIllegal)
  628. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  629. "\xf8\x88\x80\x80"));
  630. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  631. ConvertUTFResultContainer(sourceIllegal)
  632. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  633. "\xf8\xbf\xbf\xbf"));
  634. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  635. ConvertUTFResultContainer(sourceIllegal)
  636. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  637. "\xf9\x80\x80\x80"));
  638. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  639. ConvertUTFResultContainer(sourceIllegal)
  640. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  641. "\xfa\x80\x80\x80"));
  642. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  643. ConvertUTFResultContainer(sourceIllegal)
  644. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  645. "\xfb\x80\x80\x80"));
  646. // U+3FFFFxx (invalid)
  647. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  648. ConvertUTFResultContainer(sourceIllegal)
  649. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  650. "\xfb\xbf\xbf\xbf"));
  651. // Ill-formed 6-byte sequences.
  652. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  653. // U+40000xx (invalid)
  654. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  655. ConvertUTFResultContainer(sourceIllegal)
  656. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  657. "\xfc\x84\x80\x80\x80"));
  658. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  659. ConvertUTFResultContainer(sourceIllegal)
  660. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  661. "\xfc\xbf\xbf\xbf\xbf"));
  662. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  663. ConvertUTFResultContainer(sourceIllegal)
  664. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  665. "\xfd\x80\x80\x80\x80"));
  666. // U+7FFFFFxx (invalid)
  667. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  668. ConvertUTFResultContainer(sourceIllegal)
  669. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  670. "\xfd\xbf\xbf\xbf\xbf"));
  671. //
  672. // Sequences with two continuation bytes missing
  673. //
  674. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  675. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  676. "\xf0\x90"));
  677. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  678. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  679. "\xf0\xbf"));
  680. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  681. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  682. "\xf1\x80"));
  683. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  684. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  685. "\xf3\xbf"));
  686. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  687. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  688. "\xf4\x80"));
  689. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  690. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
  691. "\xf4\x8f"));
  692. // Overlong sequences with two trailing bytes missing.
  693. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  694. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
  695. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  696. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  697. "\xf0\x80"));
  698. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  699. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  700. "\xf0\x8f"));
  701. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  702. ConvertUTFResultContainer(sourceIllegal)
  703. .withScalars(0xfffd, 0xfffd, 0xfffd),
  704. "\xf8\x80\x80"));
  705. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  706. ConvertUTFResultContainer(sourceIllegal)
  707. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  708. "\xfc\x80\x80\x80"));
  709. // Sequences that represent surrogates with two trailing bytes missing.
  710. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  711. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
  712. // Ill-formed 4-byte sequences.
  713. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  714. // U+110yxx (invalid)
  715. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  716. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  717. "\xf4\x90"));
  718. // U+13Fyxx (invalid)
  719. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  720. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  721. "\xf4\xbf"));
  722. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  723. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  724. "\xf5\x80"));
  725. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  726. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  727. "\xf6\x80"));
  728. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  729. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  730. "\xf7\x80"));
  731. // U+1FFyxx (invalid)
  732. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  733. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  734. "\xf7\xbf"));
  735. // Ill-formed 5-byte sequences.
  736. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  737. // U+200yxx (invalid)
  738. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  739. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  740. "\xf8\x88\x80"));
  741. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  742. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  743. "\xf8\xbf\xbf"));
  744. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  745. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  746. "\xf9\x80\x80"));
  747. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  748. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  749. "\xfa\x80\x80"));
  750. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  751. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  752. "\xfb\x80\x80"));
  753. // U+3FFFyxx (invalid)
  754. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  755. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  756. "\xfb\xbf\xbf"));
  757. // Ill-formed 6-byte sequences.
  758. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  759. // U+4000yxx (invalid)
  760. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  761. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  762. "\xfc\x84\x80\x80"));
  763. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  764. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  765. "\xfc\xbf\xbf\xbf"));
  766. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  767. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  768. "\xfd\x80\x80\x80"));
  769. // U+7FFFFyxx (invalid)
  770. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  771. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  772. "\xfd\xbf\xbf\xbf"));
  773. //
  774. // Sequences with three continuation bytes missing
  775. //
  776. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  777. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
  778. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  779. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
  780. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  781. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
  782. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  783. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
  784. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  785. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
  786. // Broken overlong sequences.
  787. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  788. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
  789. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  790. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  791. "\xf8\x80"));
  792. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  793. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  794. "\xfc\x80\x80"));
  795. // Ill-formed 4-byte sequences.
  796. // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
  797. // U+14yyxx (invalid)
  798. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  799. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
  800. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  801. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
  802. // U+1Cyyxx (invalid)
  803. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  804. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
  805. // Ill-formed 5-byte sequences.
  806. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  807. // U+20yyxx (invalid)
  808. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  809. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  810. "\xf8\x88"));
  811. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  812. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  813. "\xf8\xbf"));
  814. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  815. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  816. "\xf9\x80"));
  817. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  818. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  819. "\xfa\x80"));
  820. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  821. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  822. "\xfb\x80"));
  823. // U+3FCyyxx (invalid)
  824. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  825. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  826. "\xfb\xbf"));
  827. // Ill-formed 6-byte sequences.
  828. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  829. // U+400yyxx (invalid)
  830. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  831. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  832. "\xfc\x84\x80"));
  833. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  834. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  835. "\xfc\xbf\xbf"));
  836. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  837. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  838. "\xfd\x80\x80"));
  839. // U+7FFCyyxx (invalid)
  840. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  841. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
  842. "\xfd\xbf\xbf"));
  843. //
  844. // Sequences with four continuation bytes missing
  845. //
  846. // Ill-formed 5-byte sequences.
  847. // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  848. // U+uzyyxx (invalid)
  849. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  850. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
  851. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  852. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
  853. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  854. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
  855. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  856. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
  857. // U+3zyyxx (invalid)
  858. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  859. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
  860. // Broken overlong sequences.
  861. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  862. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
  863. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  864. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  865. "\xfc\x80"));
  866. // Ill-formed 6-byte sequences.
  867. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  868. // U+uzzyyxx (invalid)
  869. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  870. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  871. "\xfc\x84"));
  872. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  873. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  874. "\xfc\xbf"));
  875. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  876. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  877. "\xfd\x80"));
  878. // U+7Fzzyyxx (invalid)
  879. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  880. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  881. "\xfd\xbf"));
  882. //
  883. // Sequences with five continuation bytes missing
  884. //
  885. // Ill-formed 6-byte sequences.
  886. // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
  887. // U+uzzyyxx (invalid)
  888. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  889. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
  890. // U+uuzzyyxx (invalid)
  891. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  892. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
  893. //
  894. // Consecutive sequences with trailing bytes missing
  895. //
  896. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  897. ConvertUTFResultContainer(sourceIllegal)
  898. .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
  899. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
  900. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
  901. .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
  902. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
  903. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  904. "\xc0" "\xe0\x80" "\xf0\x80\x80"
  905. "\xf8\x80\x80\x80"
  906. "\xfc\x80\x80\x80\x80"
  907. "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
  908. "\xfb\xbf\xbf\xbf"
  909. "\xfd\xbf\xbf\xbf\xbf"));
  910. //
  911. // Overlong UTF-8 sequences
  912. //
  913. // U+002F SOLIDUS
  914. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  915. ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
  916. // Overlong sequences of the above.
  917. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  918. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  919. "\xc0\xaf"));
  920. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  921. ConvertUTFResultContainer(sourceIllegal)
  922. .withScalars(0xfffd, 0xfffd, 0xfffd),
  923. "\xe0\x80\xaf"));
  924. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  925. ConvertUTFResultContainer(sourceIllegal)
  926. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  927. "\xf0\x80\x80\xaf"));
  928. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  929. ConvertUTFResultContainer(sourceIllegal)
  930. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  931. "\xf8\x80\x80\x80\xaf"));
  932. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  933. ConvertUTFResultContainer(sourceIllegal)
  934. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  935. "\xfc\x80\x80\x80\x80\xaf"));
  936. // U+0000 NULL
  937. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  938. ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
  939. StringRef("\x00", 1)));
  940. // Overlong sequences of the above.
  941. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  942. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  943. "\xc0\x80"));
  944. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  945. ConvertUTFResultContainer(sourceIllegal)
  946. .withScalars(0xfffd, 0xfffd, 0xfffd),
  947. "\xe0\x80\x80"));
  948. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  949. ConvertUTFResultContainer(sourceIllegal)
  950. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  951. "\xf0\x80\x80\x80"));
  952. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  953. ConvertUTFResultContainer(sourceIllegal)
  954. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  955. "\xf8\x80\x80\x80\x80"));
  956. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  957. ConvertUTFResultContainer(sourceIllegal)
  958. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  959. "\xfc\x80\x80\x80\x80\x80"));
  960. // Other overlong sequences.
  961. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  962. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  963. "\xc0\xbf"));
  964. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  965. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  966. "\xc1\x80"));
  967. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  968. ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
  969. "\xc1\xbf"));
  970. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  971. ConvertUTFResultContainer(sourceIllegal)
  972. .withScalars(0xfffd, 0xfffd, 0xfffd),
  973. "\xe0\x9f\xbf"));
  974. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  975. ConvertUTFResultContainer(sourceIllegal)
  976. .withScalars(0xfffd, 0xfffd, 0xfffd),
  977. "\xed\xa0\x80"));
  978. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  979. ConvertUTFResultContainer(sourceIllegal)
  980. .withScalars(0xfffd, 0xfffd, 0xfffd),
  981. "\xed\xbf\xbf"));
  982. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  983. ConvertUTFResultContainer(sourceIllegal)
  984. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  985. "\xf0\x8f\x80\x80"));
  986. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  987. ConvertUTFResultContainer(sourceIllegal)
  988. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
  989. "\xf0\x8f\xbf\xbf"));
  990. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  991. ConvertUTFResultContainer(sourceIllegal)
  992. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  993. "\xf8\x87\xbf\xbf\xbf"));
  994. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  995. ConvertUTFResultContainer(sourceIllegal)
  996. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  997. "\xfc\x83\xbf\xbf\xbf\xbf"));
  998. //
  999. // Isolated surrogates
  1000. //
  1001. // Unicode 6.3.0:
  1002. //
  1003. // D71. High-surrogate code point: A Unicode code point in the range
  1004. // U+D800 to U+DBFF.
  1005. //
  1006. // D73. Low-surrogate code point: A Unicode code point in the range
  1007. // U+DC00 to U+DFFF.
  1008. // Note: U+E0100 is <DB40 DD00> in UTF-16.
  1009. // High surrogates
  1010. // U+D800
  1011. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1012. ConvertUTFResultContainer(sourceIllegal)
  1013. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1014. "\xed\xa0\x80"));
  1015. // U+DB40
  1016. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1017. ConvertUTFResultContainer(sourceIllegal)
  1018. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1019. "\xed\xac\xa0"));
  1020. // U+DBFF
  1021. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1022. ConvertUTFResultContainer(sourceIllegal)
  1023. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1024. "\xed\xaf\xbf"));
  1025. // Low surrogates
  1026. // U+DC00
  1027. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1028. ConvertUTFResultContainer(sourceIllegal)
  1029. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1030. "\xed\xb0\x80"));
  1031. // U+DD00
  1032. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1033. ConvertUTFResultContainer(sourceIllegal)
  1034. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1035. "\xed\xb4\x80"));
  1036. // U+DFFF
  1037. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1038. ConvertUTFResultContainer(sourceIllegal)
  1039. .withScalars(0xfffd, 0xfffd, 0xfffd),
  1040. "\xed\xbf\xbf"));
  1041. // Surrogate pairs
  1042. // U+D800 U+DC00
  1043. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1044. ConvertUTFResultContainer(sourceIllegal)
  1045. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1046. "\xed\xa0\x80\xed\xb0\x80"));
  1047. // U+D800 U+DD00
  1048. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1049. ConvertUTFResultContainer(sourceIllegal)
  1050. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1051. "\xed\xa0\x80\xed\xb4\x80"));
  1052. // U+D800 U+DFFF
  1053. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1054. ConvertUTFResultContainer(sourceIllegal)
  1055. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1056. "\xed\xa0\x80\xed\xbf\xbf"));
  1057. // U+DB40 U+DC00
  1058. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1059. ConvertUTFResultContainer(sourceIllegal)
  1060. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1061. "\xed\xac\xa0\xed\xb0\x80"));
  1062. // U+DB40 U+DD00
  1063. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1064. ConvertUTFResultContainer(sourceIllegal)
  1065. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1066. "\xed\xac\xa0\xed\xb4\x80"));
  1067. // U+DB40 U+DFFF
  1068. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1069. ConvertUTFResultContainer(sourceIllegal)
  1070. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1071. "\xed\xac\xa0\xed\xbf\xbf"));
  1072. // U+DBFF U+DC00
  1073. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1074. ConvertUTFResultContainer(sourceIllegal)
  1075. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1076. "\xed\xaf\xbf\xed\xb0\x80"));
  1077. // U+DBFF U+DD00
  1078. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1079. ConvertUTFResultContainer(sourceIllegal)
  1080. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1081. "\xed\xaf\xbf\xed\xb4\x80"));
  1082. // U+DBFF U+DFFF
  1083. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1084. ConvertUTFResultContainer(sourceIllegal)
  1085. .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
  1086. "\xed\xaf\xbf\xed\xbf\xbf"));
  1087. //
  1088. // Noncharacters
  1089. //
  1090. // Unicode 6.3.0:
  1091. //
  1092. // D14. Noncharacter: A code point that is permanently reserved for
  1093. // internal use and that should never be interchanged. Noncharacters
  1094. // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
  1095. // and the values U+FDD0..U+FDEF.
  1096. // U+FFFE
  1097. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1098. ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
  1099. "\xef\xbf\xbe"));
  1100. // U+FFFF
  1101. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1102. ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
  1103. "\xef\xbf\xbf"));
  1104. // U+1FFFE
  1105. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1106. ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
  1107. "\xf0\x9f\xbf\xbe"));
  1108. // U+1FFFF
  1109. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1110. ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
  1111. "\xf0\x9f\xbf\xbf"));
  1112. // U+2FFFE
  1113. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1114. ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
  1115. "\xf0\xaf\xbf\xbe"));
  1116. // U+2FFFF
  1117. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1118. ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
  1119. "\xf0\xaf\xbf\xbf"));
  1120. // U+3FFFE
  1121. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1122. ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
  1123. "\xf0\xbf\xbf\xbe"));
  1124. // U+3FFFF
  1125. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1126. ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
  1127. "\xf0\xbf\xbf\xbf"));
  1128. // U+4FFFE
  1129. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1130. ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
  1131. "\xf1\x8f\xbf\xbe"));
  1132. // U+4FFFF
  1133. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1134. ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
  1135. "\xf1\x8f\xbf\xbf"));
  1136. // U+5FFFE
  1137. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1138. ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
  1139. "\xf1\x9f\xbf\xbe"));
  1140. // U+5FFFF
  1141. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1142. ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
  1143. "\xf1\x9f\xbf\xbf"));
  1144. // U+6FFFE
  1145. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1146. ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
  1147. "\xf1\xaf\xbf\xbe"));
  1148. // U+6FFFF
  1149. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1150. ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
  1151. "\xf1\xaf\xbf\xbf"));
  1152. // U+7FFFE
  1153. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1154. ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
  1155. "\xf1\xbf\xbf\xbe"));
  1156. // U+7FFFF
  1157. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1158. ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
  1159. "\xf1\xbf\xbf\xbf"));
  1160. // U+8FFFE
  1161. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1162. ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
  1163. "\xf2\x8f\xbf\xbe"));
  1164. // U+8FFFF
  1165. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1166. ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
  1167. "\xf2\x8f\xbf\xbf"));
  1168. // U+9FFFE
  1169. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1170. ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
  1171. "\xf2\x9f\xbf\xbe"));
  1172. // U+9FFFF
  1173. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1174. ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
  1175. "\xf2\x9f\xbf\xbf"));
  1176. // U+AFFFE
  1177. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1178. ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
  1179. "\xf2\xaf\xbf\xbe"));
  1180. // U+AFFFF
  1181. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1182. ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
  1183. "\xf2\xaf\xbf\xbf"));
  1184. // U+BFFFE
  1185. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1186. ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
  1187. "\xf2\xbf\xbf\xbe"));
  1188. // U+BFFFF
  1189. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1190. ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
  1191. "\xf2\xbf\xbf\xbf"));
  1192. // U+CFFFE
  1193. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1194. ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
  1195. "\xf3\x8f\xbf\xbe"));
  1196. // U+CFFFF
  1197. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1198. ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
  1199. "\xf3\x8f\xbf\xbf"));
  1200. // U+DFFFE
  1201. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1202. ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
  1203. "\xf3\x9f\xbf\xbe"));
  1204. // U+DFFFF
  1205. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1206. ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
  1207. "\xf3\x9f\xbf\xbf"));
  1208. // U+EFFFE
  1209. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1210. ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
  1211. "\xf3\xaf\xbf\xbe"));
  1212. // U+EFFFF
  1213. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1214. ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
  1215. "\xf3\xaf\xbf\xbf"));
  1216. // U+FFFFE
  1217. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1218. ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
  1219. "\xf3\xbf\xbf\xbe"));
  1220. // U+FFFFF
  1221. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1222. ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
  1223. "\xf3\xbf\xbf\xbf"));
  1224. // U+10FFFE
  1225. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1226. ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
  1227. "\xf4\x8f\xbf\xbe"));
  1228. // U+10FFFF
  1229. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1230. ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
  1231. "\xf4\x8f\xbf\xbf"));
  1232. // U+FDD0
  1233. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1234. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
  1235. "\xef\xb7\x90"));
  1236. // U+FDD1
  1237. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1238. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
  1239. "\xef\xb7\x91"));
  1240. // U+FDD2
  1241. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1242. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
  1243. "\xef\xb7\x92"));
  1244. // U+FDD3
  1245. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1246. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
  1247. "\xef\xb7\x93"));
  1248. // U+FDD4
  1249. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1250. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
  1251. "\xef\xb7\x94"));
  1252. // U+FDD5
  1253. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1254. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
  1255. "\xef\xb7\x95"));
  1256. // U+FDD6
  1257. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1258. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
  1259. "\xef\xb7\x96"));
  1260. // U+FDD7
  1261. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1262. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
  1263. "\xef\xb7\x97"));
  1264. // U+FDD8
  1265. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1266. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
  1267. "\xef\xb7\x98"));
  1268. // U+FDD9
  1269. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1270. ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
  1271. "\xef\xb7\x99"));
  1272. // U+FDDA
  1273. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1274. ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
  1275. "\xef\xb7\x9a"));
  1276. // U+FDDB
  1277. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1278. ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
  1279. "\xef\xb7\x9b"));
  1280. // U+FDDC
  1281. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1282. ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
  1283. "\xef\xb7\x9c"));
  1284. // U+FDDD
  1285. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1286. ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
  1287. "\xef\xb7\x9d"));
  1288. // U+FDDE
  1289. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1290. ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
  1291. "\xef\xb7\x9e"));
  1292. // U+FDDF
  1293. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1294. ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
  1295. "\xef\xb7\x9f"));
  1296. // U+FDE0
  1297. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1298. ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
  1299. "\xef\xb7\xa0"));
  1300. // U+FDE1
  1301. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1302. ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
  1303. "\xef\xb7\xa1"));
  1304. // U+FDE2
  1305. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1306. ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
  1307. "\xef\xb7\xa2"));
  1308. // U+FDE3
  1309. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1310. ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
  1311. "\xef\xb7\xa3"));
  1312. // U+FDE4
  1313. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1314. ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
  1315. "\xef\xb7\xa4"));
  1316. // U+FDE5
  1317. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1318. ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
  1319. "\xef\xb7\xa5"));
  1320. // U+FDE6
  1321. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1322. ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
  1323. "\xef\xb7\xa6"));
  1324. // U+FDE7
  1325. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1326. ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
  1327. "\xef\xb7\xa7"));
  1328. // U+FDE8
  1329. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1330. ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
  1331. "\xef\xb7\xa8"));
  1332. // U+FDE9
  1333. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1334. ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
  1335. "\xef\xb7\xa9"));
  1336. // U+FDEA
  1337. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1338. ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
  1339. "\xef\xb7\xaa"));
  1340. // U+FDEB
  1341. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1342. ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
  1343. "\xef\xb7\xab"));
  1344. // U+FDEC
  1345. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1346. ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
  1347. "\xef\xb7\xac"));
  1348. // U+FDED
  1349. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1350. ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
  1351. "\xef\xb7\xad"));
  1352. // U+FDEE
  1353. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1354. ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
  1355. "\xef\xb7\xae"));
  1356. // U+FDEF
  1357. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1358. ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
  1359. "\xef\xb7\xaf"));
  1360. // U+FDF0
  1361. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1362. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
  1363. "\xef\xb7\xb0"));
  1364. // U+FDF1
  1365. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1366. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
  1367. "\xef\xb7\xb1"));
  1368. // U+FDF2
  1369. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1370. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
  1371. "\xef\xb7\xb2"));
  1372. // U+FDF3
  1373. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1374. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
  1375. "\xef\xb7\xb3"));
  1376. // U+FDF4
  1377. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1378. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
  1379. "\xef\xb7\xb4"));
  1380. // U+FDF5
  1381. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1382. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
  1383. "\xef\xb7\xb5"));
  1384. // U+FDF6
  1385. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1386. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
  1387. "\xef\xb7\xb6"));
  1388. // U+FDF7
  1389. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1390. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
  1391. "\xef\xb7\xb7"));
  1392. // U+FDF8
  1393. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1394. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
  1395. "\xef\xb7\xb8"));
  1396. // U+FDF9
  1397. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1398. ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
  1399. "\xef\xb7\xb9"));
  1400. // U+FDFA
  1401. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1402. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
  1403. "\xef\xb7\xba"));
  1404. // U+FDFB
  1405. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1406. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
  1407. "\xef\xb7\xbb"));
  1408. // U+FDFC
  1409. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1410. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
  1411. "\xef\xb7\xbc"));
  1412. // U+FDFD
  1413. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1414. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
  1415. "\xef\xb7\xbd"));
  1416. // U+FDFE
  1417. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1418. ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
  1419. "\xef\xb7\xbe"));
  1420. // U+FDFF
  1421. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1422. ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
  1423. "\xef\xb7\xbf"));
  1424. }
  1425. TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
  1426. // U+0041 LATIN CAPITAL LETTER A
  1427. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1428. ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
  1429. "\x41", true));
  1430. //
  1431. // Sequences with one continuation byte missing
  1432. //
  1433. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1434. ConvertUTFResultContainer(sourceExhausted),
  1435. "\xc2", true));
  1436. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1437. ConvertUTFResultContainer(sourceExhausted),
  1438. "\xdf", true));
  1439. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1440. ConvertUTFResultContainer(sourceExhausted),
  1441. "\xe0\xa0", true));
  1442. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1443. ConvertUTFResultContainer(sourceExhausted),
  1444. "\xe0\xbf", true));
  1445. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1446. ConvertUTFResultContainer(sourceExhausted),
  1447. "\xe1\x80", true));
  1448. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1449. ConvertUTFResultContainer(sourceExhausted),
  1450. "\xec\xbf", true));
  1451. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1452. ConvertUTFResultContainer(sourceExhausted),
  1453. "\xed\x80", true));
  1454. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1455. ConvertUTFResultContainer(sourceExhausted),
  1456. "\xed\x9f", true));
  1457. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1458. ConvertUTFResultContainer(sourceExhausted),
  1459. "\xee\x80", true));
  1460. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1461. ConvertUTFResultContainer(sourceExhausted),
  1462. "\xef\xbf", true));
  1463. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1464. ConvertUTFResultContainer(sourceExhausted),
  1465. "\xf0\x90\x80", true));
  1466. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1467. ConvertUTFResultContainer(sourceExhausted),
  1468. "\xf0\xbf\xbf", true));
  1469. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1470. ConvertUTFResultContainer(sourceExhausted),
  1471. "\xf1\x80\x80", true));
  1472. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1473. ConvertUTFResultContainer(sourceExhausted),
  1474. "\xf3\xbf\xbf", true));
  1475. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1476. ConvertUTFResultContainer(sourceExhausted),
  1477. "\xf4\x80\x80", true));
  1478. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1479. ConvertUTFResultContainer(sourceExhausted),
  1480. "\xf4\x8f\xbf", true));
  1481. EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
  1482. ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
  1483. "\x41\xc2", true));
  1484. }