json-lexer.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*
  2. * JSON lexer
  3. *
  4. * Copyright IBM, Corp. 2009
  5. *
  6. * Authors:
  7. * Anthony Liguori <aliguori@us.ibm.com>
  8. *
  9. * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  10. * See the COPYING.LIB file in the top-level directory.
  11. *
  12. */
  13. #include "qemu/osdep.h"
  14. #include "json-parser-int.h"
  15. #define MAX_TOKEN_SIZE (64ULL << 20)
  16. /*
  17. * From RFC 8259 "The JavaScript Object Notation (JSON) Data
  18. * Interchange Format", with [comments in brackets]:
  19. *
  20. * The set of tokens includes six structural characters, strings,
  21. * numbers, and three literal names.
  22. *
  23. * These are the six structural characters:
  24. *
  25. * begin-array = ws %x5B ws ; [ left square bracket
  26. * begin-object = ws %x7B ws ; { left curly bracket
  27. * end-array = ws %x5D ws ; ] right square bracket
  28. * end-object = ws %x7D ws ; } right curly bracket
  29. * name-separator = ws %x3A ws ; : colon
  30. * value-separator = ws %x2C ws ; , comma
  31. *
  32. * Insignificant whitespace is allowed before or after any of the six
  33. * structural characters.
  34. * [This lexer accepts it before or after any token, which is actually
  35. * the same, as the grammar always has structural characters between
  36. * other tokens.]
  37. *
  38. * ws = *(
  39. * %x20 / ; Space
  40. * %x09 / ; Horizontal tab
  41. * %x0A / ; Line feed or New line
  42. * %x0D ) ; Carriage return
  43. *
  44. * [...] three literal names:
  45. * false null true
  46. * [This lexer accepts [a-z]+, and leaves rejecting unknown literal
  47. * names to the parser.]
  48. *
  49. * [Numbers:]
  50. *
  51. * number = [ minus ] int [ frac ] [ exp ]
  52. * decimal-point = %x2E ; .
  53. * digit1-9 = %x31-39 ; 1-9
  54. * e = %x65 / %x45 ; e E
  55. * exp = e [ minus / plus ] 1*DIGIT
  56. * frac = decimal-point 1*DIGIT
  57. * int = zero / ( digit1-9 *DIGIT )
  58. * minus = %x2D ; -
  59. * plus = %x2B ; +
  60. * zero = %x30 ; 0
  61. *
  62. * [Strings:]
  63. * string = quotation-mark *char quotation-mark
  64. *
  65. * char = unescaped /
  66. * escape (
  67. * %x22 / ; " quotation mark U+0022
  68. * %x5C / ; \ reverse solidus U+005C
  69. * %x2F / ; / solidus U+002F
  70. * %x62 / ; b backspace U+0008
  71. * %x66 / ; f form feed U+000C
  72. * %x6E / ; n line feed U+000A
  73. * %x72 / ; r carriage return U+000D
  74. * %x74 / ; t tab U+0009
  75. * %x75 4HEXDIG ) ; uXXXX U+XXXX
  76. * escape = %x5C ; \
  77. * quotation-mark = %x22 ; "
  78. * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
  79. * [This lexer accepts any non-control character after escape, and
  80. * leaves rejecting invalid ones to the parser.]
  81. *
  82. *
  83. * Extensions over RFC 8259:
  84. * - Extra escape sequence in strings:
  85. * 0x27 (apostrophe) is recognized after escape, too
  86. * - Single-quoted strings:
  87. * Like double-quoted strings, except they're delimited by %x27
  88. * (apostrophe) instead of %x22 (quotation mark), and can't contain
  89. * unescaped apostrophe, but can contain unescaped quotation mark.
  90. * - Interpolation, if enabled:
  91. * The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
  92. * ones to the parser.
  93. *
  94. * Note:
  95. * - Input must be encoded in modified UTF-8.
  96. * - Decoding and validating is left to the parser.
  97. */
  98. enum json_lexer_state {
  99. IN_RECOVERY = 1,
  100. IN_DQ_STRING_ESCAPE,
  101. IN_DQ_STRING,
  102. IN_SQ_STRING_ESCAPE,
  103. IN_SQ_STRING,
  104. IN_ZERO,
  105. IN_EXP_DIGITS,
  106. IN_EXP_SIGN,
  107. IN_EXP_E,
  108. IN_MANTISSA,
  109. IN_MANTISSA_DIGITS,
  110. IN_DIGITS,
  111. IN_SIGN,
  112. IN_KEYWORD,
  113. IN_INTERP,
  114. IN_START,
  115. IN_START_INTERP, /* must be IN_START + 1 */
  116. };
  117. QEMU_BUILD_BUG_ON(JSON_ERROR != 0);
  118. QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1);
  119. QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
  120. QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80);
  121. QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
  122. #define LOOKAHEAD 0x80
  123. #define TERMINAL(state) [0 ... 0xFF] = ((state) | LOOKAHEAD)
  124. static const uint8_t json_lexer[][256] = {
  125. /* Relies on default initialization to IN_ERROR! */
  126. /* error recovery */
  127. [IN_RECOVERY] = {
  128. /*
  129. * Skip characters until a structural character, an ASCII
  130. * control character other than '\t', or impossible UTF-8
  131. * bytes '\xFE', '\xFF'. Structural characters and line
  132. * endings are promising resynchronization points. Clients
  133. * may use the others to force the JSON parser into known-good
  134. * state; see docs/interop/qmp-spec.rst.
  135. */
  136. [0 ... 0x1F] = IN_START | LOOKAHEAD,
  137. [0x20 ... 0xFD] = IN_RECOVERY,
  138. [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
  139. ['\t'] = IN_RECOVERY,
  140. ['['] = IN_START | LOOKAHEAD,
  141. [']'] = IN_START | LOOKAHEAD,
  142. ['{'] = IN_START | LOOKAHEAD,
  143. ['}'] = IN_START | LOOKAHEAD,
  144. [':'] = IN_START | LOOKAHEAD,
  145. [','] = IN_START | LOOKAHEAD,
  146. },
  147. /* double quote string */
  148. [IN_DQ_STRING_ESCAPE] = {
  149. [0x20 ... 0xFD] = IN_DQ_STRING,
  150. },
  151. [IN_DQ_STRING] = {
  152. [0x20 ... 0xFD] = IN_DQ_STRING,
  153. ['\\'] = IN_DQ_STRING_ESCAPE,
  154. ['"'] = JSON_STRING,
  155. },
  156. /* single quote string */
  157. [IN_SQ_STRING_ESCAPE] = {
  158. [0x20 ... 0xFD] = IN_SQ_STRING,
  159. },
  160. [IN_SQ_STRING] = {
  161. [0x20 ... 0xFD] = IN_SQ_STRING,
  162. ['\\'] = IN_SQ_STRING_ESCAPE,
  163. ['\''] = JSON_STRING,
  164. },
  165. /* Zero */
  166. [IN_ZERO] = {
  167. TERMINAL(JSON_INTEGER),
  168. ['0' ... '9'] = JSON_ERROR,
  169. ['.'] = IN_MANTISSA,
  170. },
  171. /* Float */
  172. [IN_EXP_DIGITS] = {
  173. TERMINAL(JSON_FLOAT),
  174. ['0' ... '9'] = IN_EXP_DIGITS,
  175. },
  176. [IN_EXP_SIGN] = {
  177. ['0' ... '9'] = IN_EXP_DIGITS,
  178. },
  179. [IN_EXP_E] = {
  180. ['-'] = IN_EXP_SIGN,
  181. ['+'] = IN_EXP_SIGN,
  182. ['0' ... '9'] = IN_EXP_DIGITS,
  183. },
  184. [IN_MANTISSA_DIGITS] = {
  185. TERMINAL(JSON_FLOAT),
  186. ['0' ... '9'] = IN_MANTISSA_DIGITS,
  187. ['e'] = IN_EXP_E,
  188. ['E'] = IN_EXP_E,
  189. },
  190. [IN_MANTISSA] = {
  191. ['0' ... '9'] = IN_MANTISSA_DIGITS,
  192. },
  193. /* Number */
  194. [IN_DIGITS] = {
  195. TERMINAL(JSON_INTEGER),
  196. ['0' ... '9'] = IN_DIGITS,
  197. ['e'] = IN_EXP_E,
  198. ['E'] = IN_EXP_E,
  199. ['.'] = IN_MANTISSA,
  200. },
  201. [IN_SIGN] = {
  202. ['0'] = IN_ZERO,
  203. ['1' ... '9'] = IN_DIGITS,
  204. },
  205. /* keywords */
  206. [IN_KEYWORD] = {
  207. TERMINAL(JSON_KEYWORD),
  208. ['a' ... 'z'] = IN_KEYWORD,
  209. },
  210. /* interpolation */
  211. [IN_INTERP] = {
  212. TERMINAL(JSON_INTERP),
  213. ['A' ... 'Z'] = IN_INTERP,
  214. ['a' ... 'z'] = IN_INTERP,
  215. ['0' ... '9'] = IN_INTERP,
  216. },
  217. /*
  218. * Two start states:
  219. * - IN_START recognizes JSON tokens with our string extensions
  220. * - IN_START_INTERP additionally recognizes interpolation.
  221. */
  222. [IN_START ... IN_START_INTERP] = {
  223. ['"'] = IN_DQ_STRING,
  224. ['\''] = IN_SQ_STRING,
  225. ['0'] = IN_ZERO,
  226. ['1' ... '9'] = IN_DIGITS,
  227. ['-'] = IN_SIGN,
  228. ['{'] = JSON_LCURLY,
  229. ['}'] = JSON_RCURLY,
  230. ['['] = JSON_LSQUARE,
  231. [']'] = JSON_RSQUARE,
  232. [','] = JSON_COMMA,
  233. [':'] = JSON_COLON,
  234. ['a' ... 'z'] = IN_KEYWORD,
  235. [' '] = IN_START,
  236. ['\t'] = IN_START,
  237. ['\r'] = IN_START,
  238. ['\n'] = IN_START,
  239. },
  240. [IN_START_INTERP]['%'] = IN_INTERP,
  241. };
  242. static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush,
  243. bool *char_consumed)
  244. {
  245. uint8_t next;
  246. assert(lexer->state < ARRAY_SIZE(json_lexer));
  247. next = json_lexer[lexer->state][(uint8_t)ch];
  248. *char_consumed = !flush && !(next & LOOKAHEAD);
  249. return next & ~LOOKAHEAD;
  250. }
  251. void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
  252. {
  253. lexer->start_state = lexer->state = enable_interpolation
  254. ? IN_START_INTERP : IN_START;
  255. lexer->token = g_string_sized_new(3);
  256. lexer->x = lexer->y = 0;
  257. }
  258. static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
  259. {
  260. int new_state;
  261. bool char_consumed = false;
  262. lexer->x++;
  263. if (ch == '\n') {
  264. lexer->x = 0;
  265. lexer->y++;
  266. }
  267. while (flush ? lexer->state != lexer->start_state : !char_consumed) {
  268. new_state = next_state(lexer, ch, flush, &char_consumed);
  269. if (char_consumed) {
  270. assert(!flush);
  271. g_string_append_c(lexer->token, ch);
  272. }
  273. switch (new_state) {
  274. case JSON_LCURLY:
  275. case JSON_RCURLY:
  276. case JSON_LSQUARE:
  277. case JSON_RSQUARE:
  278. case JSON_COLON:
  279. case JSON_COMMA:
  280. case JSON_INTERP:
  281. case JSON_INTEGER:
  282. case JSON_FLOAT:
  283. case JSON_KEYWORD:
  284. case JSON_STRING:
  285. json_message_process_token(lexer, lexer->token, new_state,
  286. lexer->x, lexer->y);
  287. /* fall through */
  288. case IN_START:
  289. g_string_truncate(lexer->token, 0);
  290. new_state = lexer->start_state;
  291. break;
  292. case JSON_ERROR:
  293. json_message_process_token(lexer, lexer->token, JSON_ERROR,
  294. lexer->x, lexer->y);
  295. new_state = IN_RECOVERY;
  296. /* fall through */
  297. case IN_RECOVERY:
  298. g_string_truncate(lexer->token, 0);
  299. break;
  300. default:
  301. break;
  302. }
  303. lexer->state = new_state;
  304. }
  305. /* Do not let a single token grow to an arbitrarily large size,
  306. * this is a security consideration.
  307. */
  308. if (lexer->token->len > MAX_TOKEN_SIZE) {
  309. json_message_process_token(lexer, lexer->token, lexer->state,
  310. lexer->x, lexer->y);
  311. g_string_truncate(lexer->token, 0);
  312. lexer->state = lexer->start_state;
  313. }
  314. }
  315. void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
  316. {
  317. size_t i;
  318. for (i = 0; i < size; i++) {
  319. json_lexer_feed_char(lexer, buffer[i], false);
  320. }
  321. }
  322. void json_lexer_flush(JSONLexer *lexer)
  323. {
  324. json_lexer_feed_char(lexer, 0, true);
  325. assert(lexer->state == lexer->start_state);
  326. json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT,
  327. lexer->x, lexer->y);
  328. }
  329. void json_lexer_destroy(JSONLexer *lexer)
  330. {
  331. g_string_free(lexer->token, true);
  332. }