numam-spdk/lib/json/json_parse.c
Daniel Verkamp 69c7ff06dc json: allow decoding of non-standard comments
Comments are not allowed in the JSON RFC, but some JSON libraries accept
JavaScript-style comments.

Add a flag that enables non-spec-compliant comment parsing.

Change-Id: I9dfb66bb46ecff1a22d8af5a9c50620686a4707c
Signed-off-by: Daniel Verkamp <daniel.verkamp@intel.com>
2016-08-19 09:49:18 -07:00

671 lines
17 KiB
C

/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "json_internal.h"
/*
 * Convert a single ASCII hexadecimal digit to its numeric value.
 *
 * \param c Character to convert.
 *
 * \return 0-15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for any other byte.
 */
static int
hex_value(uint8_t c)
{
	switch (c) {
	case '0':
		return 0;
	case '1':
		return 1;
	case '2':
		return 2;
	case '3':
		return 3;
	case '4':
		return 4;
	case '5':
		return 5;
	case '6':
		return 6;
	case '7':
		return 7;
	case '8':
		return 8;
	case '9':
		return 9;
	case 'A':
	case 'a':
		return 0xA;
	case 'B':
	case 'b':
		return 0xB;
	case 'C':
	case 'c':
		return 0xC;
	case 'D':
	case 'd':
		return 0xD;
	case 'E':
	case 'e':
		return 0xE;
	case 'F':
	case 'f':
		return 0xF;
	default:
		/* Not a hexadecimal digit. */
		return -1;
	}
}
/*
 * Decode a \uXXXX escape sequence, or a UTF-16 surrogate pair written as two
 * consecutive \uXXXX escapes, into a single codepoint and emit it as UTF-8.
 *
 * \param strp Pointer to pointer to the backslash that starts the escape.
 *             *strp is advanced past the consumed escape(s) only on success.
 * \param buf_end Pointer one past the last readable input byte.
 * \param out Destination for the UTF-8 bytes, or NULL to only compute the
 *            encoded length.
 *
 * \return Number of UTF-8 bytes produced (or that would be produced), or
 *         SPDK_JSON_PARSE_INVALID / SPDK_JSON_PARSE_INCOMPLETE on failure.
 *         An escape that ends exactly at buf_end is reported as incomplete.
 */
static int
json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	uint8_t *cursor = *strp;
	uint32_t codepoint;
	uint32_t high_half = 0;
	int nbytes;
	int digit;
	int i;

	for (;;) {
		/* Expect the two-byte "\u" prefix. */
		assert(buf_end > cursor);

		if (*cursor++ != '\\') {
			return SPDK_JSON_PARSE_INVALID;
		}
		if (cursor == buf_end) {
			return SPDK_JSON_PARSE_INCOMPLETE;
		}

		if (*cursor++ != 'u') {
			return SPDK_JSON_PARSE_INVALID;
		}
		if (cursor == buf_end) {
			return SPDK_JSON_PARSE_INCOMPLETE;
		}

		/* Four hex digits, most significant nibble first. */
		codepoint = 0;
		for (i = 0; i < 4; i++) {
			digit = hex_value(*cursor++);
			if (digit < 0) {
				return SPDK_JSON_PARSE_INVALID;
			}
			if (cursor == buf_end) {
				return SPDK_JSON_PARSE_INCOMPLETE;
			}
			codepoint = (codepoint << 4) | (uint32_t)digit;
		}

		if (high_half != 0) {
			/* Second pass: this escape must be the low half of the pair. */
			if (!utf16_valid_surrogate_low(codepoint)) {
				return SPDK_JSON_PARSE_INVALID;
			}

			/* Combine both halves into the final codepoint. */
			codepoint = utf16_decode_surrogate_pair(high_half, codepoint);
			break;
		}

		if (!utf16_valid_surrogate_high(codepoint)) {
			if (utf16_valid_surrogate_low(codepoint)) {
				/* A low surrogate without a preceding high surrogate is invalid. */
				return SPDK_JSON_PARSE_INVALID;
			}

			/* Ordinary codepoint - nothing more to read. */
			break;
		}

		/*
		 * High half of a surrogate pair: it must be followed immediately by
		 * another \uXXXX escape, so go around again for the low half.
		 */
		high_half = codepoint;
		if (cursor == buf_end) {
			return SPDK_JSON_PARSE_INCOMPLETE;
		}
	}

	/*
	 * Emit the codepoint as UTF-8 (or just measure it when out is NULL).
	 *
	 * Writing in place cannot overrun the input: one escape is 6 bytes
	 * (12 for a surrogate pair) while a UTF-8 codepoint is at most 4 bytes.
	 */
	if (out) {
		nbytes = utf8_encode_unsafe(out, codepoint);
	} else {
		nbytes = utf8_codepoint_len(codepoint);
	}

	if (nbytes < 0) {
		return SPDK_JSON_PARSE_INVALID;
	}

	*strp = cursor; /* report how much input was consumed */
	return nbytes; /* number of bytes decoded */
}
/*
 * Decode a two-character backslash escape (\b \f \n \r \t \/ \" \\).
 *
 * \param strp Pointer to pointer to the backslash. *strp is advanced by two
 *             bytes only when the escape is recognized.
 * \param buf_end Pointer one past the last readable input byte.
 * \param out Where to store the decoded byte, or NULL to skip the store.
 *
 * \return 1 (one byte produced) on success, SPDK_JSON_PARSE_INCOMPLETE if fewer
 *         than two bytes are available, or SPDK_JSON_PARSE_INVALID if the
 *         character after the backslash is not one of the escapes above.
 */
static int
json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	const uint8_t *esc = *strp;
	uint8_t decoded;

	assert(buf_end > esc);
	if (buf_end - esc < 2) {
		return SPDK_JSON_PARSE_INCOMPLETE;
	}

	assert(esc[0] == '\\');

	switch (esc[1]) {
	case 'b':
		decoded = '\b';
		break;
	case 'f':
		decoded = '\f';
		break;
	case 'n':
		decoded = '\n';
		break;
	case 'r':
		decoded = '\r';
		break;
	case 't':
		decoded = '\t';
		break;
	case '/':
		decoded = '/';
		break;
	case '"':
		decoded = '"';
		break;
	case '\\':
		decoded = '\\';
		break;
	default:
		/* Not a two-character escape; *strp is left untouched. */
		return SPDK_JSON_PARSE_INVALID;
	}

	if (out) {
		*out = decoded;
	}

	*strp += 2; /* backslash plus escape character */
	return 1; /* a single output byte */
}
/*
 * Decode one JSON string backslash escape of either form.
 *
 * The two-character escapes are tried first; if that decoder does not produce
 * output, the \uXXXX decoder is run on the same input and its result is
 * returned.
 *
 * \param strp Pointer to pointer to the first character of the escape (the
 *             backslash). *strp is advanced to reflect the input consumed.
 * \param buf_end Pointer one past the last readable input byte.
 * \param out Destination for decoded bytes, or NULL to only measure.
 *
 * \return Number of bytes appended to out, or a negative SPDK_JSON_PARSE_*
 *         code on failure.
 */
static int
json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	int produced = json_decode_string_escape_twochar(strp, buf_end, out);

	if (produced <= 0) {
		/* Not a simple escape (or not enough input) - try \uXXXX instead. */
		produced = json_decode_string_escape_unicode(strp, buf_end, out);
	}

	return produced;
}
/*
 * Validate a JSON string and optionally decode it in place.
 *
 * When SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE is set in flags, escapes are
 * replaced by their decoded bytes and the remaining text is shifted down so
 * that the decoded string starts right after the opening quote. Without the
 * flag the buffer is not modified and only the decoded length is computed.
 *
 * \param str_start Pointer to the beginning of the string (the opening " character).
 * \param buf_end Pointer one past the last readable input byte.
 * \param str_end Set to the byte following the closing quote on success.
 * \param flags SPDK_JSON_PARSE_FLAG_* bits.
 *
 * \return Number of bytes in the decoded string (quotes excluded), or
 *         SPDK_JSON_PARSE_INVALID / SPDK_JSON_PARSE_INCOMPLETE on failure.
 */
static int
json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags)
{
	uint8_t *in = str_start;
	uint8_t *dst = str_start + 1; /* decoded bytes land right after the opening quote */
	const int in_place = (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE) != 0;
	int n;

	/* Even the empty string ("") needs two bytes. */
	if (buf_end - str_start < 2) {
		return SPDK_JSON_PARSE_INCOMPLETE;
	}

	if (*in != '"') {
		return SPDK_JSON_PARSE_INVALID;
	}
	in++;

	while (in < buf_end) {
		uint8_t ch = in[0];

		if (ch == '"') {
			/* Closing quote: report the next input byte and the decoded length. */
			*str_end = in + 1;
			return dst - str_start - 1;
		}

		if (ch == '\\') {
			n = json_decode_string_escape(&in, buf_end, in_place ? dst : NULL);
			assert(n != 0);
			if (n < 0) {
				return n;
			}
			dst += n;
			continue;
		}

		if (ch <= 0x1f) {
			/* Raw control characters are only legal when escaped. */
			return SPDK_JSON_PARSE_INVALID;
		}

		/* Plain text: must be one well-formed UTF-8 sequence. */
		n = utf8_valid(in, buf_end);
		if (n == 0) {
			return SPDK_JSON_PARSE_INCOMPLETE;
		}
		if (n < 0) {
			return SPDK_JSON_PARSE_INVALID;
		}

		/* Only move bytes once an earlier escape has opened a gap. */
		if (in_place && dst && dst != in) {
			memmove(dst, in, n);
		}
		dst += n;
		in += n;
	}

	/* Reached the end of the buffer without seeing the closing quote. */
	return SPDK_JSON_PARSE_INCOMPLETE;
}
/*
 * Check that the bytes at start form a JSON number and measure its length.
 *
 * Scanning stops at the first byte that cannot be part of a number; that byte
 * is not consumed and is left for the caller to judge.
 *
 * \param start Pointer to the first byte of the candidate number.
 * \param buf_end Pointer one past the last readable input byte.
 *
 * \return Length of the number in bytes; SPDK_JSON_PARSE_INVALID when a number
 *         character appears where the grammar does not allow it;
 *         SPDK_JSON_PARSE_INCOMPLETE when scanning stops before a complete
 *         number has been seen; -1 when start is not before buf_end.
 */
static int
json_valid_number(uint8_t *start, uint8_t *buf_end)
{
	uint8_t *cur = start;
	enum {
		NUM_STATE_START,
		NUM_STATE_INT_FIRST_DIGIT,
		NUM_STATE_INT_DIGITS,
		NUM_STATE_FRAC_OR_EXP,
		NUM_STATE_FRAC_FIRST_DIGIT,
		NUM_STATE_FRAC_DIGITS,
		NUM_STATE_EXP_SIGN,
		NUM_STATE_EXP_FIRST_DIGIT,
		NUM_STATE_EXP_DIGITS,
	} state = NUM_STATE_START;

	if (cur >= buf_end) {
		return -1;
	}

	while (cur != buf_end) {
		uint8_t ch = *cur;

		if (ch >= '0' && ch <= '9') {
			int at_int_start = (state == NUM_STATE_START ||
					    state == NUM_STATE_INT_FIRST_DIGIT);

			if (ch == '0' && at_int_start) {
				/*
				 * A leading zero must be the whole integer part
				 * (no leading zeroes allowed).
				 */
				state = NUM_STATE_FRAC_OR_EXP;
			} else if (at_int_start) {
				state = NUM_STATE_INT_DIGITS;
			} else if (state == NUM_STATE_FRAC_FIRST_DIGIT) {
				state = NUM_STATE_FRAC_DIGITS;
			} else if (state == NUM_STATE_EXP_SIGN ||
				   state == NUM_STATE_EXP_FIRST_DIGIT) {
				state = NUM_STATE_EXP_DIGITS;
			} else if (state == NUM_STATE_FRAC_OR_EXP) {
				/* A digit directly after a lone leading zero. */
				return SPDK_JSON_PARSE_INVALID;
			}
			/* INT_DIGITS, FRAC_DIGITS and EXP_DIGITS keep their state. */
		} else if (ch == '.') {
			if (state != NUM_STATE_INT_DIGITS && state != NUM_STATE_FRAC_OR_EXP) {
				return SPDK_JSON_PARSE_INVALID;
			}
			state = NUM_STATE_FRAC_FIRST_DIGIT;
		} else if (ch == 'e' || ch == 'E') {
			if (state != NUM_STATE_INT_DIGITS &&
			    state != NUM_STATE_FRAC_OR_EXP &&
			    state != NUM_STATE_FRAC_DIGITS) {
				return SPDK_JSON_PARSE_INVALID;
			}
			state = NUM_STATE_EXP_SIGN;
		} else if (ch == '-' && state == NUM_STATE_START) {
			/* Leading minus sign of the number itself. */
			state = NUM_STATE_INT_FIRST_DIGIT;
		} else if (ch == '-' || ch == '+') {
			/* Any other sign is only valid right after the exponent marker. */
			if (state != NUM_STATE_EXP_SIGN) {
				return SPDK_JSON_PARSE_INVALID;
			}
			state = NUM_STATE_EXP_FIRST_DIGIT;
		} else {
			/*
			 * Not a number character - stop without consuming it.
			 * The top-level parsing code will handle invalid trailing characters.
			 */
			break;
		}

		cur++;
	}

	/* Only states in which a complete number has been seen are acceptable. */
	if (state == NUM_STATE_INT_DIGITS ||
	    state == NUM_STATE_FRAC_OR_EXP ||
	    state == NUM_STATE_FRAC_DIGITS ||
	    state == NUM_STATE_EXP_DIGITS) {
		return cur - start;
	}

	return SPDK_JSON_PARSE_INCOMPLETE;
}
/*
 * Check that the bytes at start form a JavaScript-style comment and measure it.
 *
 * Two forms are recognized: a multi-line comment delimited by slash-star and
 * star-slash, and a single-line comment introduced by two slashes and ended by
 * a carriage return or line feed.
 *
 * \param start Pointer to the first byte of the candidate comment (the slash).
 * \param buf_end Pointer one past the last readable input byte.
 *
 * \return Length of the comment in bytes (the multi-line terminator is included,
 *         the line terminator of a single-line comment is not);
 *         SPDK_JSON_PARSE_INVALID if the input does not start a comment;
 *         SPDK_JSON_PARSE_INCOMPLETE if the buffer ends before the comment does.
 */
static int
json_valid_comment(const uint8_t *start, const uint8_t *buf_end)
{
	const uint8_t *p = start;
	bool multiline;

	assert(buf_end > p);
	if (buf_end - p < 2) {
		return SPDK_JSON_PARSE_INCOMPLETE;
	}

	if (p[0] != '/') {
		return SPDK_JSON_PARSE_INVALID;
	}

	if (p[1] == '*') {
		multiline = true;
	} else if (p[1] == '/') {
		multiline = false;
	} else {
		return SPDK_JSON_PARSE_INVALID;
	}

	p += 2;

	if (multiline) {
		/*
		 * The terminator is two bytes long, so only look while two bytes
		 * remain. Comparing against buf_end - 1 for inequality is not enough:
		 * when the buffer holds nothing but the opener, p already equals
		 * buf_end here and the scan would run past the end of the buffer.
		 */
		while (buf_end - p >= 2) {
			if (p[0] == '*' && p[1] == '/') {
				/* Include the terminating star and slash in the comment */
				return p - start + 2;
			}
			p++;
		}
	} else {
		while (p != buf_end) {
			if (*p == '\r' || *p == '\n') {
				/* Do not include the line terminator in the comment */
				return p - start;
			}
			p++;
		}
	}

	/* Ran out of input before the end of the comment. */
	return SPDK_JSON_PARSE_INCOMPLETE;
}
/*
 * Table entry describing one JSON literal keyword.
 * Field order matters: the g_json_literals table initializes entries positionally.
 */
struct json_literal {
/* Value type recorded when this literal is matched. */
enum spdk_json_val_type type;
/* Number of bytes of str that make up the literal. */
uint32_t len;
/* Literal text; the longest entry in g_json_literals ("false") is 5 bytes. */
uint8_t str[8];
};
/*
 * JSON only defines 3 possible literals; they can be uniquely identified by bits
 * 3 and 4 of the first character:
 *   'f' = 0b11[00]110  -> index 0
 *   'n' = 0b11[01]110  -> index 1
 *   't' = 0b11[10]100  -> index 2
 * These two bits are used directly as the index into g_json_literals, i.e.
 * (c >> 3) & 3. Index 3 does not correspond to any literal and is left zeroed.
 */
static const struct json_literal g_json_literals[] = {
	[0] = { .type = SPDK_JSON_VAL_FALSE, .len = 5, .str = "false" },
	[1] = { .type = SPDK_JSON_VAL_NULL, .len = 4, .str = "null" },
	[2] = { .type = SPDK_JSON_VAL_TRUE, .len = 4, .str = "true" },
	[3] = { .len = 0 },
};
/*
 * Compare the input at start against a fixed literal.
 *
 * \param start Pointer to the first input byte to compare.
 * \param end Pointer one past the last readable input byte.
 * \param literal Expected bytes.
 * \param len Number of bytes in literal.
 *
 * \return len if the input begins with the literal; SPDK_JSON_PARSE_INCOMPLETE
 *         if fewer than len bytes are available (checked before any byte is
 *         compared); SPDK_JSON_PARSE_INVALID if the bytes differ.
 */
static int
match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len)
{
	size_t avail;

	assert(end >= start);
	avail = (size_t)(end - start);

	if (avail < len) {
		return SPDK_JSON_PARSE_INCOMPLETE;
	}

	return memcmp(start, literal, len) == 0 ? (int)len : SPDK_JSON_PARSE_INVALID;
}
/*
 * Parse a JSON text into a flat sequence of values.
 *
 * \param json Buffer holding the JSON text. When
 *             SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE is set, strings are decoded
 *             in place, so the buffer contents may be modified.
 * \param size Number of bytes in json.
 * \param values Array receiving the parsed values, or NULL. Values beyond
 *               num_values are counted but not stored.
 * \param num_values Number of entries available in values.
 * \param end If not NULL, receives the position where parsing stopped: after
 *            the top-level value and any trailing whitespace on success, or
 *            the end of the consumed input when the data ran out between
 *            tokens. It is not written when an error is returned from within
 *            a token.
 * \param flags SPDK_JSON_PARSE_FLAG_* bits. SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS
 *              permits comments between tokens; comments after the top-level
 *              value has been completed are not consumed.
 *
 * \return Total number of values in the parsed text (which may exceed
 *         num_values), or a negative SPDK_JSON_PARSE_* code on failure.
 */
ssize_t
spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values,
		void **end, uint32_t flags)
{
	/* Cast first: arithmetic directly on void * is a compiler extension. */
	uint8_t *json_end = (uint8_t *)json + size;
	enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH];
	size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH];
	enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID;
	bool trailing_comma = false;
	size_t depth = 0; /* index into containers */
	size_t cur_value = 0; /* index into values */
	size_t con_start_value;
	uint8_t *data = json;
	uint8_t *new_data;
	int rc;
	const struct json_literal *lit;
	enum {
		STATE_VALUE, /* initial state */
		STATE_VALUE_SEPARATOR, /* value separator (comma) */
		STATE_NAME, /* "name": value */
		STATE_NAME_SEPARATOR, /* colon */
		STATE_END, /* parsed the complete value, so only whitespace is valid */
	} state = STATE_VALUE;

	/*
	 * Record one value covering [val_start_ptr, val_end_ptr) if there is room,
	 * and always count it. Arguments are parenthesized so that any pointer
	 * expression can be passed safely, and the body is a single statement.
	 */
#define ADD_VALUE(t, val_start_ptr, val_end_ptr) \
	do { \
		if (values && cur_value < num_values) { \
			values[cur_value].type = (t); \
			values[cur_value].start = (val_start_ptr); \
			values[cur_value].len = (val_end_ptr) - (val_start_ptr); \
		} \
		cur_value++; \
	} while (0)

	while (data < json_end) {
		uint8_t c = *data;

		switch (c) {
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			/* Whitespace is allowed between any tokens. */
			data++;
			break;

		case 't':
		case 'f':
		case 'n':
			/* true, false, or null */
			if (state != STATE_VALUE) {
				return SPDK_JSON_PARSE_INVALID;
			}
			lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */
			assert(lit->str[0] == c);
			rc = match_literal(data, json_end, lit->str, lit->len);
			if (rc < 0) {
				return rc;
			}
			ADD_VALUE(lit->type, data, data + rc);
			data += rc;
			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
			trailing_comma = false;
			break;

		case '"':
			if (state != STATE_VALUE && state != STATE_NAME) {
				return SPDK_JSON_PARSE_INVALID;
			}
			rc = json_decode_string(data, json_end, &new_data, flags);
			if (rc < 0) {
				return rc;
			}
			/*
			 * The value starts at data + 1 (just past the opening quote) and
			 * spans the rc decoded bytes reported by json_decode_string().
			 */
			ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME,
				  data + 1, data + 1 + rc);
			data = new_data;
			if (state == STATE_NAME) {
				state = STATE_NAME_SEPARATOR;
			} else {
				state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
			}
			trailing_comma = false;
			break;

		case '-':
		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			if (state != STATE_VALUE) {
				return SPDK_JSON_PARSE_INVALID;
			}
			rc = json_valid_number(data, json_end);
			if (rc < 0) {
				return rc;
			}
			ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc);
			data += rc;
			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
			trailing_comma = false;
			break;

		case '{':
		case '[':
			if (state != STATE_VALUE) {
				return SPDK_JSON_PARSE_INVALID;
			}
			if (depth == SPDK_JSON_MAX_NESTING_DEPTH) {
				return SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED;
			}
			if (c == '{') {
				con_type = SPDK_JSON_VAL_OBJECT_BEGIN;
				state = STATE_NAME;
			} else {
				con_type = SPDK_JSON_VAL_ARRAY_BEGIN;
				state = STATE_VALUE;
			}
			/* Remember where this container's begin value lives so its len can be patched. */
			con_value[depth] = cur_value;
			containers[depth++] = con_type;
			ADD_VALUE(con_type, data, data + 1);
			data++;
			trailing_comma = false;
			break;

		case '}':
		case ']':
			if (trailing_comma) {
				return SPDK_JSON_PARSE_INVALID;
			}
			if (depth == 0) {
				return SPDK_JSON_PARSE_INVALID;
			}
			con_type = containers[--depth];
			con_start_value = con_value[depth];
			if (values && con_start_value < num_values) {
				/* Number of values between the begin and end markers. */
				values[con_start_value].len = cur_value - con_start_value - 1;
			}
			if (c == '}') {
				if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) {
					return SPDK_JSON_PARSE_INVALID;
				}
				if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) {
					return SPDK_JSON_PARSE_INVALID;
				}
				ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1);
			} else {
				if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) {
					return SPDK_JSON_PARSE_INVALID;
				}
				if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) {
					return SPDK_JSON_PARSE_INVALID;
				}
				ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1);
			}
			/* Back in the enclosing container, if any. */
			con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1];
			data++;
			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
			trailing_comma = false;
			break;

		case ',':
			if (state != STATE_VALUE_SEPARATOR) {
				return SPDK_JSON_PARSE_INVALID;
			}
			data++;
			assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN ||
			       con_type == SPDK_JSON_VAL_OBJECT_BEGIN);
			state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME;
			/* A closing bracket directly after this comma is rejected. */
			trailing_comma = true;
			break;

		case ':':
			if (state != STATE_NAME_SEPARATOR) {
				return SPDK_JSON_PARSE_INVALID;
			}
			data++;
			state = STATE_VALUE;
			break;

		case '/':
			if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) {
				return SPDK_JSON_PARSE_INVALID;
			}
			rc = json_valid_comment(data, json_end);
			if (rc < 0) {
				return rc;
			}
			/* Skip over comment */
			data += rc;
			break;

		default:
			return SPDK_JSON_PARSE_INVALID;
		}

		if (state == STATE_END) {
			break;
		}
	}

	if (state == STATE_END) {
		/* Skip trailing whitespace */
		while (data < json_end) {
			uint8_t c = *data;

			if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
				data++;
			} else {
				break;
			}
		}

		/*
		 * These asserts are just for sanity checking - they are guaranteed by the allowed
		 * state transitions.
		 */
		assert(depth == 0);
		assert(trailing_comma == false);
		assert(data <= json_end);
		if (end) {
			*end = data;
		}
		return cur_value;
	}

	/* Invalid end state - ran out of data */
	if (end) {
		*end = data;
	}
	return SPDK_JSON_PARSE_INCOMPLETE;
}