|
@@ -18,21 +18,83 @@
|
|
|
#define MAX_TOKEN_SIZE (64ULL << 20)
|
|
|
|
|
|
/*
|
|
|
- * Required by JSON (RFC 7159):
|
|
|
+ * From RFC 8259 "The JavaScript Object Notation (JSON) Data
|
|
|
+ * Interchange Format", with [comments in brackets]:
|
|
|
*
|
|
|
- * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\"
|
|
|
- * -?(0|[1-9][0-9]*)(.[0-9]+)?([eE][-+]?[0-9]+)?
|
|
|
- * [{}\[\],:]
|
|
|
- * [a-z]+ # covers null, true, false
|
|
|
+ * The set of tokens includes six structural characters, strings,
|
|
|
+ * numbers, and three literal names.
|
|
|
*
|
|
|
- * Extension of '' strings:
|
|
|
+ * These are the six structural characters:
|
|
|
*
|
|
|
- * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
|
|
|
+ * begin-array = ws %x5B ws ; [ left square bracket
|
|
|
+ * begin-object = ws %x7B ws ; { left curly bracket
|
|
|
+ * end-array = ws %x5D ws ; ] right square bracket
|
|
|
+ * end-object = ws %x7D ws ; } right curly bracket
|
|
|
+ * name-separator = ws %x3A ws ; : colon
|
|
|
+ * value-separator = ws %x2C ws ; , comma
|
|
|
*
|
|
|
- * Extension for vararg handling in JSON construction:
|
|
|
+ * Insignificant whitespace is allowed before or after any of the six
|
|
|
+ * structural characters.
|
|
|
+ * [This lexer accepts it before or after any token, which is actually
|
|
|
+ * the same, as the grammar always has structural characters between
|
|
|
+ * other tokens.]
|
|
|
*
|
|
|
- * %((l|ll|I64)?d|[ipsf])
|
|
|
+ * ws = *(
|
|
|
+ * %x20 / ; Space
|
|
|
+ * %x09 / ; Horizontal tab
|
|
|
+ * %x0A / ; Line feed or New line
|
|
|
+ * %x0D ) ; Carriage return
|
|
|
*
|
|
|
+ * [...] three literal names:
|
|
|
+ * false null true
|
|
|
+ * [This lexer accepts [a-z]+, and leaves rejecting unknown literal
|
|
|
+ * names to the parser.]
|
|
|
+ *
|
|
|
+ * [Numbers:]
|
|
|
+ *
|
|
|
+ * number = [ minus ] int [ frac ] [ exp ]
|
|
|
+ * decimal-point = %x2E ; .
|
|
|
+ * digit1-9 = %x31-39 ; 1-9
|
|
|
+ * e = %x65 / %x45 ; e E
|
|
|
+ * exp = e [ minus / plus ] 1*DIGIT
|
|
|
+ * frac = decimal-point 1*DIGIT
|
|
|
+ * int = zero / ( digit1-9 *DIGIT )
|
|
|
+ * minus = %x2D ; -
|
|
|
+ * plus = %x2B ; +
|
|
|
+ * zero = %x30 ; 0
|
|
|
+ *
|
|
|
+ * [Strings:]
|
|
|
+ * string = quotation-mark *char quotation-mark
|
|
|
+ *
|
|
|
+ * char = unescaped /
|
|
|
+ * escape (
|
|
|
+ * %x22 / ; " quotation mark U+0022
|
|
|
+ * %x5C / ; \ reverse solidus U+005C
|
|
|
+ * %x2F / ; / solidus U+002F
|
|
|
+ * %x62 / ; b backspace U+0008
|
|
|
+ * %x66 / ; f form feed U+000C
|
|
|
+ * %x6E / ; n line feed U+000A
|
|
|
+ * %x72 / ; r carriage return U+000D
|
|
|
+ * %x74 / ; t tab U+0009
|
|
|
+ * %x75 4HEXDIG ) ; uXXXX U+XXXX
|
|
|
+ * escape = %x5C ; \
|
|
|
+ * quotation-mark = %x22 ; "
|
|
|
+ * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
|
|
|
+ *
|
|
|
+ *
|
|
|
+ * Extensions over RFC 8259:
|
|
|
+ * - Extra escape sequence in strings:
|
|
|
+ * 0x27 (apostrophe) is recognized after escape, too
|
|
|
+ * - Single-quoted strings:
|
|
|
+ * Like double-quoted strings, except they're delimited by %x27
|
|
|
+ * (apostrophe) instead of %x22 (quotation mark), and can't contain
|
|
|
+ * unescaped apostrophe, but can contain unescaped quotation mark.
|
|
|
+ * - Interpolation:
|
|
|
+ * interpolation = %((l|ll|I64)[du]|[ipsf])
|
|
|
+ *
|
|
|
+ * Note:
|
|
|
+ * - Input must be encoded in UTF-8.
|
|
|
+ * - Decoding and validating is left to the parser.
|
|
|
*/
|
|
|
|
|
|
enum json_lexer_state {
|