json-lexer.c 12.2 KB
Newer Older
A
Anthony Liguori 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
A
Anthony Liguori 已提交
15
#include "qemu-common.h"
16
#include "qapi/qmp/json-lexer.h"
A
Anthony Liguori 已提交
17

18 19
#define MAX_TOKEN_SIZE (64ULL << 20)

A
Anthony Liguori 已提交
20
/*
21 22
 * From RFC 8259 "The JavaScript Object Notation (JSON) Data
 * Interchange Format", with [comments in brackets]:
23
 *
24 25
 * The set of tokens includes six structural characters, strings,
 * numbers, and three literal names.
26
 *
27
 * These are the six structural characters:
28
 *
29 30 31 32 33 34
 *    begin-array     = ws %x5B ws  ; [ left square bracket
 *    begin-object    = ws %x7B ws  ; { left curly bracket
 *    end-array       = ws %x5D ws  ; ] right square bracket
 *    end-object      = ws %x7D ws  ; } right curly bracket
 *    name-separator  = ws %x3A ws  ; : colon
 *    value-separator = ws %x2C ws  ; , comma
35
 *
36 37 38 39 40
 * Insignificant whitespace is allowed before or after any of the six
 * structural characters.
 * [This lexer accepts it before or after any token, which is actually
 * the same, as the grammar always has structural characters between
 * other tokens.]
41
 *
42 43 44 45 46
 *    ws = *(
 *           %x20 /              ; Space
 *           %x09 /              ; Horizontal tab
 *           %x0A /              ; Line feed or New line
 *           %x0D )              ; Carriage return
A
Anthony Liguori 已提交
47
 *
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 * [...] three literal names:
 *    false null true
 *  [This lexer accepts [a-z]+, and leaves rejecting unknown literal
 *  names to the parser.]
 *
 * [Numbers:]
 *
 *    number = [ minus ] int [ frac ] [ exp ]
 *    decimal-point = %x2E       ; .
 *    digit1-9 = %x31-39         ; 1-9
 *    e = %x65 / %x45            ; e E
 *    exp = e [ minus / plus ] 1*DIGIT
 *    frac = decimal-point 1*DIGIT
 *    int = zero / ( digit1-9 *DIGIT )
 *    minus = %x2D               ; -
 *    plus = %x2B                ; +
 *    zero = %x30                ; 0
 *
 * [Strings:]
 *    string = quotation-mark *char quotation-mark
 *
 *    char = unescaped /
 *        escape (
 *            %x22 /          ; "    quotation mark  U+0022
 *            %x5C /          ; \    reverse solidus U+005C
 *            %x2F /          ; /    solidus         U+002F
 *            %x62 /          ; b    backspace       U+0008
 *            %x66 /          ; f    form feed       U+000C
 *            %x6E /          ; n    line feed       U+000A
 *            %x72 /          ; r    carriage return U+000D
 *            %x74 /          ; t    tab             U+0009
 *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
 *    escape = %x5C              ; \
 *    quotation-mark = %x22      ; "
 *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
 *
 *
 * Extensions over RFC 8259:
 * - Extra escape sequence in strings:
 *   0x27 (apostrophe) is recognized after escape, too
 * - Single-quoted strings:
 *   Like double-quoted strings, except they're delimited by %x27
 *   (apostrophe) instead of %x22 (quotation mark), and can't contain
 *   unescaped apostrophe, but can contain unescaped quotation mark.
 * - Interpolation:
 *   interpolation = %((l|ll|I64)[du]|[ipsf])
 *
 * Note:
96
 * - Input must be encoded in modified UTF-8.
97
 * - Decoding and validating is left to the parser.
A
Anthony Liguori 已提交
98 99 100
 */

enum json_lexer_state {
101
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
A
Anthony Liguori 已提交
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,
    IN_KEYWORD,
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
R
Roy Tam 已提交
126 127 128
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
A
Anthony Liguori 已提交
129 130 131 132
    IN_WHITESPACE,
    IN_START,
};

133 134
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);

A
Anthony Liguori 已提交
135 136
#define TERMINAL(state) [0 ... 0x7F] = (state)

137 138 139 140
/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
141
    (terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal))
142

A
Anthony Liguori 已提交
143
static const uint8_t json_lexer[][256] =  {
144 145
    /* Relies on default initialization to IN_ERROR! */

A
Anthony Liguori 已提交
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
    /* double quote string */
    [IN_DQ_UCODE3] = {
        ['0' ... '9'] = IN_DQ_STRING,
        ['a' ... 'f'] = IN_DQ_STRING,
        ['A' ... 'F'] = IN_DQ_STRING,
    },
    [IN_DQ_UCODE2] = {
        ['0' ... '9'] = IN_DQ_UCODE3,
        ['a' ... 'f'] = IN_DQ_UCODE3,
        ['A' ... 'F'] = IN_DQ_UCODE3,
    },
    [IN_DQ_UCODE1] = {
        ['0' ... '9'] = IN_DQ_UCODE2,
        ['a' ... 'f'] = IN_DQ_UCODE2,
        ['A' ... 'F'] = IN_DQ_UCODE2,
    },
    [IN_DQ_UCODE0] = {
        ['0' ... '9'] = IN_DQ_UCODE1,
        ['a' ... 'f'] = IN_DQ_UCODE1,
        ['A' ... 'F'] = IN_DQ_UCODE1,
    },
    [IN_DQ_STRING_ESCAPE] = {
        ['b'] = IN_DQ_STRING,
        ['f'] =  IN_DQ_STRING,
        ['n'] =  IN_DQ_STRING,
        ['r'] =  IN_DQ_STRING,
        ['t'] =  IN_DQ_STRING,
173 174
        ['/'] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING,
A
Anthony Liguori 已提交
175 176 177 178 179
        ['\''] = IN_DQ_STRING,
        ['\"'] = IN_DQ_STRING,
        ['u'] = IN_DQ_UCODE0,
    },
    [IN_DQ_STRING] = {
180
        [0x20 ... 0xFD] = IN_DQ_STRING,
A
Anthony Liguori 已提交
181
        ['\\'] = IN_DQ_STRING_ESCAPE,
P
Paolo Bonzini 已提交
182
        ['"'] = JSON_STRING,
A
Anthony Liguori 已提交
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
    },

    /* single quote string */
    [IN_SQ_UCODE3] = {
        ['0' ... '9'] = IN_SQ_STRING,
        ['a' ... 'f'] = IN_SQ_STRING,
        ['A' ... 'F'] = IN_SQ_STRING,
    },
    [IN_SQ_UCODE2] = {
        ['0' ... '9'] = IN_SQ_UCODE3,
        ['a' ... 'f'] = IN_SQ_UCODE3,
        ['A' ... 'F'] = IN_SQ_UCODE3,
    },
    [IN_SQ_UCODE1] = {
        ['0' ... '9'] = IN_SQ_UCODE2,
        ['a' ... 'f'] = IN_SQ_UCODE2,
        ['A' ... 'F'] = IN_SQ_UCODE2,
    },
    [IN_SQ_UCODE0] = {
        ['0' ... '9'] = IN_SQ_UCODE1,
        ['a' ... 'f'] = IN_SQ_UCODE1,
        ['A' ... 'F'] = IN_SQ_UCODE1,
    },
    [IN_SQ_STRING_ESCAPE] = {
        ['b'] = IN_SQ_STRING,
        ['f'] =  IN_SQ_STRING,
        ['n'] =  IN_SQ_STRING,
        ['r'] =  IN_SQ_STRING,
        ['t'] =  IN_SQ_STRING,
212 213
        ['/'] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING,
A
Anthony Liguori 已提交
214 215 216 217 218
        ['\''] = IN_SQ_STRING,
        ['\"'] = IN_SQ_STRING,
        ['u'] = IN_SQ_UCODE0,
    },
    [IN_SQ_STRING] = {
219
        [0x20 ... 0xFD] = IN_SQ_STRING,
A
Anthony Liguori 已提交
220
        ['\\'] = IN_SQ_STRING_ESCAPE,
P
Paolo Bonzini 已提交
221
        ['\''] = JSON_STRING,
A
Anthony Liguori 已提交
222 223 224 225 226
    },

    /* Zero */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
227
        ['0' ... '9'] = IN_ERROR,
A
Anthony Liguori 已提交
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
285
    },
A
Anthony Liguori 已提交
286 287 288

    /* escape */
    [IN_ESCAPE_LL] = {
P
Paolo Bonzini 已提交
289
        ['d'] = JSON_ESCAPE,
290
        ['u'] = JSON_ESCAPE,
A
Anthony Liguori 已提交
291 292 293
    },

    [IN_ESCAPE_L] = {
P
Paolo Bonzini 已提交
294
        ['d'] = JSON_ESCAPE,
A
Anthony Liguori 已提交
295
        ['l'] = IN_ESCAPE_LL,
296
        ['u'] = JSON_ESCAPE,
A
Anthony Liguori 已提交
297 298
    },

R
Roy Tam 已提交
299
    [IN_ESCAPE_I64] = {
P
Paolo Bonzini 已提交
300
        ['d'] = JSON_ESCAPE,
301
        ['u'] = JSON_ESCAPE,
R
Roy Tam 已提交
302 303 304 305 306 307 308 309 310 311
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

A
Anthony Liguori 已提交
312
    [IN_ESCAPE] = {
P
Paolo Bonzini 已提交
313 314 315 316
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
317
        ['u'] = JSON_ESCAPE,
P
Paolo Bonzini 已提交
318
        ['f'] = JSON_ESCAPE,
A
Anthony Liguori 已提交
319
        ['l'] = IN_ESCAPE_L,
R
Roy Tam 已提交
320
        ['I'] = IN_ESCAPE_I,
A
Anthony Liguori 已提交
321 322 323 324 325 326 327 328 329
    },

    /* top level rule */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
330 331 332 333 334 335
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
A
Anthony Liguori 已提交
336 337 338 339 340 341 342 343 344 345 346 347 348
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};

void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
349
    lexer->token = g_string_sized_new(3);
350
    lexer->x = lexer->y = 0;
A
Anthony Liguori 已提交
351 352
}

353
static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
A
Anthony Liguori 已提交
354
{
355 356
    int char_consumed, new_state;

A
Anthony Liguori 已提交
357 358 359 360 361 362
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

363
    do {
364
        assert(lexer->state <= ARRAY_SIZE(json_lexer));
365 366
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
367
        if (char_consumed && !flush) {
368
            g_string_append_c(lexer->token, ch);
369
        }
A
Anthony Liguori 已提交
370

371
        switch (new_state) {
372 373 374 375 376 377
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
378 379 380 381 382 383
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
384
            /* fall through */
385
        case JSON_SKIP:
386
            g_string_truncate(lexer->token, 0);
387 388
            new_state = IN_START;
            break;
389
        case IN_ERROR:
390 391 392 393 394 395 396 397 398 399 400 401 402 403
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * tokenizer/parser by forcing a NULL object to be emitted, then
             * reset state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
404
            g_string_truncate(lexer->token, 0);
405
            new_state = IN_START;
406 407
            lexer->state = new_state;
            return 0;
408 409 410 411
        default:
            break;
        }
        lexer->state = new_state;
412
    } while (!char_consumed && !flush);
413 414 415 416

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     */
417
    if (lexer->token->len > MAX_TOKEN_SIZE) {
418
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
419
        g_string_truncate(lexer->token, 0);
420 421 422
        lexer->state = IN_START;
    }

A
Anthony Liguori 已提交
423 424 425 426 427 428 429 430 431 432
    return 0;
}

int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

433
        err = json_lexer_feed_char(lexer, buffer[i], false);
A
Anthony Liguori 已提交
434 435 436 437 438 439 440 441 442 443
        if (err < 0) {
            return err;
        }
    }

    return 0;
}

int json_lexer_flush(JSONLexer *lexer)
{
444
    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
A
Anthony Liguori 已提交
445 446 447 448
}

void json_lexer_destroy(JSONLexer *lexer)
{
449
    g_string_free(lexer->token, true);
A
Anthony Liguori 已提交
450
}