configor_encoding.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // Copyright (c) 2018-2020 configor - Nomango
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4. // of this software and associated documentation files (the "Software"), to deal
  5. // in the Software without restriction, including without limitation the rights
  6. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. // copies of the Software, and to permit persons to whom the Software is
  8. // furnished to do so, subject to the following conditions:
  9. //
  10. // The above copyright notice and this permission notice shall be included in
  11. // all copies or substantial portions of the Software.
  12. //
  13. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. // THE SOFTWARE.
  20. #pragma once
  21. #include <array> // std::array
  22. #include <cstdint> // uint32_t, uint8_t
  23. #include <istream> // std::basic_istream
  24. #include <ostream> // std::basic_ostream
  25. #include <type_traits> // std::char_traits
  26. namespace configor
  27. {
  28. namespace encoding
  29. {
  // Helpers for converting between Unicode code points and UTF-16 surrogate
  // pairs. None of these functions validate their arguments.
  namespace unicode
  {
  namespace constants
  {
  // Value added to / subtracted from a code point when it is combined from /
  // split into a surrogate pair.
  constexpr uint32_t surrogate_base = 0x10000;
  // Inclusive bounds of the lead surrogate range.
  constexpr uint32_t lead_surrogate_begin = 0xD800;
  constexpr uint32_t lead_surrogate_end   = 0xDBFF;
  // Inclusive bounds of the trail surrogate range.
  constexpr uint32_t trail_surrogate_begin = 0xDC00;
  constexpr uint32_t trail_surrogate_end   = 0xDFFF;
  // Mask selecting the low `surrogate_bits` bits stored in the trail surrogate.
  constexpr uint32_t trail_surrogate_max = 0x3FF;
  // Number of payload bits carried by each half of a surrogate pair.
  constexpr uint32_t surrogate_bits = 10;
  }  // namespace constants

  // Returns true when `codepoint` lies in [lead_surrogate_begin, lead_surrogate_end].
  inline bool is_lead_surrogate(const uint32_t codepoint)
  {
      const bool below_range = codepoint < constants::lead_surrogate_begin;
      const bool above_range = codepoint > constants::lead_surrogate_end;
      return !(below_range || above_range);
  }

  // Returns true when `codepoint` lies in [trail_surrogate_begin, trail_surrogate_end].
  inline bool is_trail_surrogate(const uint32_t codepoint)
  {
      const bool below_range = codepoint < constants::trail_surrogate_begin;
      const bool above_range = codepoint > constants::trail_surrogate_end;
      return !(below_range || above_range);
  }

  // Combines a lead/trail surrogate pair into a single code point.
  // The arguments are not checked; callers are expected to pass values from the
  // respective surrogate ranges.
  inline uint32_t decode_surrogates(uint32_t lead_surrogate, uint32_t trail_surrogate)
  {
      const uint32_t high_part = (lead_surrogate - constants::lead_surrogate_begin) << constants::surrogate_bits;
      const uint32_t low_part  = trail_surrogate - constants::trail_surrogate_begin;
      return high_part + low_part + constants::surrogate_base;
  }

  // Splits `codepoint` into a lead/trail surrogate pair, written to the two
  // output references. `codepoint` is not checked; the subtraction wraps for
  // values below `surrogate_base`.
  inline void encode_surrogates(uint32_t codepoint, uint32_t& lead_surrogate, uint32_t& trail_surrogate)
  {
      const uint32_t offset    = codepoint - constants::surrogate_base;
      const uint32_t high_part = offset >> constants::surrogate_bits;
      const uint32_t low_part  = offset & constants::trail_surrogate_max;
      // Each result is truncated to 16 bits before being stored.
      lead_surrogate  = static_cast<uint16_t>(constants::lead_surrogate_begin + high_part);
      trail_surrogate = static_cast<uint16_t>(constants::trail_surrogate_begin + low_part);
  }
  }  // namespace unicode
  // Pointer to a function that writes one code point to an output stream.
  template <typename _CharTy>
  using encoder = void (*)(std::basic_ostream<_CharTy>& os, uint32_t codepoint);

  // Pointer to a function that reads one code point from an input stream into
  // `codepoint` and reports the outcome as a bool.
  template <typename _CharTy>
  using decoder = bool (*)(std::basic_istream<_CharTy>& is, uint32_t& codepoint);
  // Pass-through codec: every stream character is converted to/from a code
  // point with a plain static_cast, without any encoding logic.
  template <typename _CharTy>
  class ignore
  {
  public:
      using char_type    = _CharTy;
      using traits_type  = std::char_traits<char_type>;
      using istream_type = std::basic_istream<char_type>;
      using ostream_type = std::basic_ostream<char_type>;

      // Writes `codepoint`, converted to `char_type`, as a single character.
      static void encode(ostream_type& os, uint32_t codepoint)
      {
          const char_type unit = static_cast<char_type>(codepoint);
          os.put(unit);
      }

      // Reads a single character and stores it, converted to uint32_t, in
      // `codepoint`. Returns false once the stream reports end-of-file.
      // `codepoint` is overwritten even when false is returned.
      static bool decode(istream_type& is, uint32_t& codepoint)
      {
          const char_type unit = static_cast<char_type>(is.get());
          codepoint            = static_cast<uint32_t>(unit);
          const bool at_end    = is.eof();
          return !at_end;
      }
  };
  // UTF-8 codec working on streams of `_CharTy` code units.
  //
  //   U+0000...U+007F     0xxxxxxx
  //   U+0080...U+07FF     110xxxxx 10xxxxxx
  //   U+0800...U+FFFF     1110xxxx 10xxxxxx 10xxxxxx
  //   U+10000...U+10FFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  //
  // Surrogate code points (U+D800...U+DFFF) are neither rejected by encode()
  // nor by decode(), so the two stay symmetric.
  template <typename _CharTy>
  class utf8
  {
  public:
      using char_type    = _CharTy;
      using traits_type  = std::char_traits<char_type>;
      using istream_type = std::basic_istream<char_type>;
      using ostream_type = std::basic_ostream<char_type>;

      // Writes `codepoint` to `os` as 1-4 UTF-8 code units.
      // Sets failbit on `os` and writes nothing when `codepoint` > U+10FFFF.
      static void encode(ostream_type& os, uint32_t codepoint)
      {
          if (codepoint < 0x80)
          {
              // 0xxxxxxx
              os.put(static_cast<char_type>(codepoint));
          }
          else if (codepoint <= 0x7FF)
          {
              // 110xxxxx 10xxxxxx
              os.put(static_cast<char_type>(0xC0 | (codepoint >> 6)));
              os.put(static_cast<char_type>(0x80 | (codepoint & 0x3F)));
          }
          else if (codepoint <= 0xFFFF)
          {
              // 1110xxxx 10xxxxxx 10xxxxxx
              os.put(static_cast<char_type>(0xE0 | (codepoint >> 12)));
              os.put(static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3F)));
              os.put(static_cast<char_type>(0x80 | (codepoint & 0x3F)));
          }
          else if (codepoint <= 0x10FFFF)
          {
              // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
              os.put(static_cast<char_type>(0xF0 | (codepoint >> 18)));
              os.put(static_cast<char_type>(0x80 | ((codepoint >> 12) & 0x3F)));
              os.put(static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3F)));
              os.put(static_cast<char_type>(0x80 | (codepoint & 0x3F)));
          }
          else
          {
              os.setstate(std::ios_base::failbit);
          }
      }

      // Reads one UTF-8 encoded code point from `is` into `codepoint`.
      //
      // Returns false only when the stream is at end-of-file before any unit is
      // read; in every other case it returns true and errors are reported by
      // setting failbit on `is`. On error `codepoint` is set to U+FFFD and the
      // units consumed so far are not put back.
      //
      // Malformed input that sets failbit:
      //   - a lead unit that is a continuation byte (0x80...0xBF) or 0xF8...0xFF
      //   - a continuation unit that is not of the form 10xxxxxx
      //   - a sequence cut short by end-of-file
      //   - an overlong encoding (value representable in fewer bytes)
      //   - a value above U+10FFFF
      static bool decode(istream_type& is, uint32_t& codepoint)
      {
          // Peek first so that a clean end-of-file only sets eofbit.
          const auto peeked = is.peek();
          if (is.eof())
              return false;
          // No unit available although eofbit is clear: the stream was already
          // in a failed state, so there is nothing to decode.
          if (traits_type::eq_int_type(peeked, traits_type::eof()))
              return reject(is, codepoint);

          is.get();  // consume the lead unit
          const uint8_t lead = static_cast<uint8_t>(peeked);

          uint32_t extra_bytes = 0;  // continuation units still to read
          uint32_t min_value   = 0;  // smallest value legal for this length
          if (lead < 0x80)
          {
              codepoint = lead;
              return true;
          }
          else if ((lead & 0xE0) == 0xC0)
          {
              extra_bytes = 1;
              min_value   = 0x80;
              codepoint   = lead & 0x1Fu;
          }
          else if ((lead & 0xF0) == 0xE0)
          {
              extra_bytes = 2;
              min_value   = 0x800;
              codepoint   = lead & 0x0Fu;
          }
          else if ((lead & 0xF8) == 0xF0)
          {
              extra_bytes = 3;
              min_value   = 0x10000;
              codepoint   = lead & 0x07u;
          }
          else
          {
              // Stray continuation byte or obsolete 5/6-byte lead byte.
              return reject(is, codepoint);
          }

          for (uint32_t i = 0; i < extra_bytes; ++i)
          {
              const auto next = is.get();
              if (traits_type::eq_int_type(next, traits_type::eof()))
                  return reject(is, codepoint);  // truncated sequence

              const uint8_t unit = static_cast<uint8_t>(next);
              if ((unit & 0xC0) != 0x80)
                  return reject(is, codepoint);  // not a continuation byte

              codepoint = (codepoint << 6) | (unit & 0x3Fu);
          }

          if (codepoint < min_value || codepoint > 0x10FFFF)
              return reject(is, codepoint);  // overlong or out of range

          return true;
      }

  private:
      // Marks `is` as failed and stores U+FFFD in `codepoint`. Returns true so
      // that callers can `return reject(...)` while keeping decode()'s contract
      // that false means end of input.
      static bool reject(istream_type& is, uint32_t& codepoint)
      {
          codepoint = 0xFFFD;
          is.setstate(std::ios_base::failbit);
          return true;
      }
  };
  188. template <typename _CharTy>
  189. class utf16
  190. {
  191. public:
  192. using char_type = _CharTy;
  193. using traits_type = std::char_traits<char_type>;
  194. using istream_type = std::basic_istream<char_type>;
  195. using ostream_type = std::basic_ostream<char_type>;
  196. static_assert(sizeof(char_type) >= 2, "The size of utf16 characters must be larger than 16 bits");
  197. static void encode(ostream_type& os, uint32_t codepoint)
  198. {
  199. if (codepoint <= 0xFFFF)
  200. {
  201. os.put(traits_type::to_char_type(static_cast<typename traits_type::int_type>(codepoint)));
  202. }
  203. else if (codepoint <= 0x10FFFF)
  204. {
  205. uint32_t lead_surrogate = 0, trail_surrogate = 0;
  206. unicode::encode_surrogates(codepoint, lead_surrogate, trail_surrogate);
  207. os.put(traits_type::to_char_type(static_cast<typename traits_type::int_type>(lead_surrogate)));
  208. os.put(traits_type::to_char_type(static_cast<typename traits_type::int_type>(trail_surrogate)));
  209. }
  210. else
  211. {
  212. os.setstate(std::ios_base::failbit);
  213. }
  214. }
  215. static bool decode(istream_type& is, uint32_t& codepoint)
  216. {
  217. codepoint = static_cast<uint32_t>(static_cast<uint16_t>(is.get()));
  218. if (is.eof())
  219. return false;
  220. if (unicode::is_lead_surrogate(codepoint))
  221. {
  222. uint32_t lead_surrogate = codepoint;
  223. uint32_t trail_surrogate = static_cast<uint32_t>(static_cast<uint16_t>(is.get()));
  224. if (unicode::is_trail_surrogate(trail_surrogate))
  225. {
  226. codepoint = unicode::decode_surrogates(lead_surrogate, trail_surrogate);
  227. }
  228. else
  229. {
  230. is.setstate(std::ios_base::failbit);
  231. }
  232. }
  233. if (codepoint > 0x10FFFF)
  234. {
  235. is.setstate(std::ios_base::failbit);
  236. }
  237. return true;
  238. }
  239. };
  // UTF-32 codec working on streams whose character type is at least 32 bits
  // wide: every code point is exactly one stream character.
  template <typename _CharTy>
  class utf32
  {
  public:
      using char_type    = _CharTy;
      using traits_type  = std::char_traits<char_type>;
      using istream_type = std::basic_istream<char_type>;
      using ostream_type = std::basic_ostream<char_type>;

      static_assert(sizeof(char_type) >= 4, "The size of utf32 characters must be larger than 32 bits");

      // Writes `codepoint` as a single character. Values above U+10FFFF set
      // failbit on `os` and nothing is written.
      static void encode(ostream_type& os, uint32_t codepoint)
      {
          if (codepoint > 0x10FFFF)
          {
              os.setstate(std::ios_base::failbit);
              // Return explicitly instead of falling through to put(): the
              // error path must not depend on put() being a no-op on a failed
              // stream.
              return;
          }
          os.put(traits_type::to_char_type(static_cast<typename traits_type::int_type>(codepoint)));
      }

      // Reads one character into `codepoint`. Returns false at end-of-file,
      // true otherwise. Values above U+10FFFF set failbit on `is`.
      static bool decode(istream_type& is, uint32_t& codepoint)
      {
          codepoint = static_cast<uint32_t>(is.get());
          if (is.eof())
              return false;

          if (codepoint > 0x10FFFF)
          {
              is.setstate(std::ios_base::failbit);
          }
          return true;
      }
  };
  269. template <typename _CharTy>
  270. class auto_utf
  271. {
  272. public:
  273. using char_type = _CharTy;
  274. using traits_type = std::char_traits<char_type>;
  275. using istream_type = std::basic_istream<char_type>;
  276. using ostream_type = std::basic_ostream<char_type>;
  277. static inline void encode(ostream_type& os, uint32_t codepoint)
  278. {
  279. encode(os, codepoint, std::integral_constant<int, sizeof(char_type)>());
  280. }
  281. static inline bool decode(istream_type& is, uint32_t& codepoint)
  282. {
  283. return decode(is, codepoint, std::integral_constant<int, sizeof(char_type)>());
  284. }
  285. private:
  286. static inline void encode(ostream_type& os, uint32_t codepoint, std::integral_constant<int, 1>)
  287. {
  288. utf8<char_type>::encode(os, codepoint);
  289. }
  290. static inline void encode(ostream_type& os, uint32_t codepoint, std::integral_constant<int, 2>)
  291. {
  292. utf16<char_type>::encode(os, codepoint);
  293. }
  294. static inline void encode(ostream_type& os, uint32_t codepoint, std::integral_constant<int, 4>)
  295. {
  296. utf32<char_type>::encode(os, codepoint);
  297. }
  298. static inline bool decode(istream_type& is, uint32_t& codepoint, std::integral_constant<int, 1>)
  299. {
  300. return utf8<char_type>::decode(is, codepoint);
  301. }
  302. static inline bool decode(istream_type& is, uint32_t& codepoint, std::integral_constant<int, 2>)
  303. {
  304. return utf16<char_type>::decode(is, codepoint);
  305. }
  306. static inline bool decode(istream_type& is, uint32_t& codepoint, std::integral_constant<int, 4>)
  307. {
  308. return utf32<char_type>::decode(is, codepoint);
  309. }
  310. };
  311. //
  312. // type traits
  313. //
  314. template <typename _Encoding>
  315. struct is_unicode_encoding : std::false_type
  316. {
  317. };
  318. template <typename _CharTy>
  319. struct is_unicode_encoding<utf8<_CharTy>> : std::true_type
  320. {
  321. };
  322. template <typename _CharTy>
  323. struct is_unicode_encoding<utf16<_CharTy>> : std::true_type
  324. {
  325. };
  326. template <typename _CharTy>
  327. struct is_unicode_encoding<utf32<_CharTy>> : std::true_type
  328. {
  329. };
  330. template <typename _CharTy>
  331. struct is_unicode_encoding<auto_utf<_CharTy>> : std::true_type
  332. {
  333. };
  334. } // namespace encoding
  335. } // namespace configor