Diffstat (limited to 'src/common/jsonapi.c')
-rw-r--r--  src/common/jsonapi.c  954
1 file changed, 945 insertions(+), 9 deletions(-)
diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 98d6e66a217..3d1bd37ac26 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -43,6 +43,169 @@ typedef enum /* contexts of JSON parser */
JSON_PARSE_END, /* saw the end of a document, expect nothing */
} JsonParseContext;
+/*
+ * Setup for table-driven parser.
+ * These enums need to be separate from the JsonTokenType and from each other
+ * so we can have all of them on the prediction stack, which consists of
+ * tokens, non-terminals, and semantic action markers.
+ */
+
+typedef enum
+{
+ JSON_NT_JSON = 32,
+ JSON_NT_ARRAY_ELEMENTS,
+ JSON_NT_MORE_ARRAY_ELEMENTS,
+ JSON_NT_KEY_PAIRS,
+ JSON_NT_MORE_KEY_PAIRS,
+} JsonNonTerminal;
+
+typedef enum
+{
+ JSON_SEM_OSTART = 64,
+ JSON_SEM_OEND,
+ JSON_SEM_ASTART,
+ JSON_SEM_AEND,
+ JSON_SEM_OFIELD_INIT,
+ JSON_SEM_OFIELD_START,
+ JSON_SEM_OFIELD_END,
+ JSON_SEM_AELEM_START,
+ JSON_SEM_AELEM_END,
+ JSON_SEM_SCALAR_INIT,
+ JSON_SEM_SCALAR_CALL,
+} JsonParserSem;
+
+/*
+ * struct containing the 3 stacks used in non-recursive parsing,
+ * and the token and value for scalars that need to be preserved
+ * across calls.
+ */
+typedef struct JsonParserStack
+{
+ int stack_size;
+ char *prediction;
+ int pred_index;
+ /* these two are indexed by lex_level */
+ char **fnames;
+ bool *fnull;
+ JsonTokenType scalar_tok;
+ char *scalar_val;
+} JsonParserStack;
+
+/*
+ * struct containing state used when there is a possible partial token at
+ * the end of a json chunk during incremental parsing.
+ */
+typedef struct JsonIncrementalState
+{
+ bool is_last_chunk;
+ bool partial_completed;
+ StringInfoData partial_token;
+} JsonIncrementalState;
+
+/*
+ * constants and macros used in the nonrecursive parser
+ */
+#define JSON_NUM_TERMINALS 13
+#define JSON_NUM_NONTERMINALS 5
+#define JSON_NT_OFFSET JSON_NT_JSON
+/* for indexing the table */
+#define OFS(NT) (NT) - JSON_NT_OFFSET
+/* classify items we get off the stack */
+#define IS_SEM(x) ((x) & 0x40)
+#define IS_NT(x) ((x) & 0x20)
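For illustration, a minimal standalone sketch (editorial, not part of the patch; the hard-coded values 9, 32 and 64 mirror the JSON_TOKEN_*, JSON_NT_JSON and JSON_SEM_OSTART ranges above) of how these bit tests classify a byte popped off the prediction stack:

#include <stdio.h>

#define DEMO_IS_SEM(x) ((x) & 0x40)
#define DEMO_IS_NT(x)  ((x) & 0x20)

int
main(void)
{
	char		items[] = {9, 32, 64};

	for (int i = 0; i < 3; i++)
		printf("%d -> %s\n", items[i],
			   DEMO_IS_SEM(items[i]) ? "semantic action marker" :
			   DEMO_IS_NT(items[i]) ? "non-terminal" : "terminal token");
	return 0;
}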
+
+/*
+ * These productions are stored in reverse order, right to left, so that when
+ * they are pushed onto the stack, what we expect next is at the top.
+ */
+static char JSON_PROD_EPSILON[] = {0}; /* epsilon - an empty production */
+
+/* JSON -> string */
+static char JSON_PROD_SCALAR_STRING[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_STRING, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> number */
+static char JSON_PROD_SCALAR_NUMBER[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NUMBER, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'true' */
+static char JSON_PROD_SCALAR_TRUE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_TRUE, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'false' */
+static char JSON_PROD_SCALAR_FALSE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_FALSE, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'null' */
+static char JSON_PROD_SCALAR_NULL[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NULL, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> '{' KEY_PAIRS '}' */
+static char JSON_PROD_OBJECT[] = {JSON_SEM_OEND, JSON_TOKEN_OBJECT_END, JSON_NT_KEY_PAIRS, JSON_TOKEN_OBJECT_START, JSON_SEM_OSTART, 0};
+
+/* JSON -> '[' ARRAY_ELEMENTS ']' */
+static char JSON_PROD_ARRAY[] = {JSON_SEM_AEND, JSON_TOKEN_ARRAY_END, JSON_NT_ARRAY_ELEMENTS, JSON_TOKEN_ARRAY_START, JSON_SEM_ASTART, 0};
+
+/* ARRAY_ELEMENTS -> JSON MORE_ARRAY_ELEMENTS */
+static char JSON_PROD_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, 0};
+
+/* MORE_ARRAY_ELEMENTS -> ',' JSON MORE_ARRAY_ELEMENTS */
+static char JSON_PROD_MORE_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, JSON_TOKEN_COMMA, 0};
+
+/* KEY_PAIRS -> string ':' JSON MORE_KEY_PAIRS */
+static char JSON_PROD_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, 0};
+
+/* MORE_KEY_PAIRS -> ',' string ':' JSON MORE_KEY_PAIRS */
+static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, JSON_TOKEN_COMMA, 0};
+
+/*
+ * Note: there are also epsilon productions for ARRAY_ELEMENTS,
+ * MORE_ARRAY_ELEMENTS, KEY_PAIRS and MORE_KEY_PAIRS.
+ * They are all the same, as none require any semantic actions.
+ */
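Written out in conventional grammar notation, the LL(1) grammar these productions implement (an editorial summary derived from the comments above) is:

JSON                -> string | number | 'true' | 'false' | 'null'
                     | '{' KEY_PAIRS '}' | '[' ARRAY_ELEMENTS ']'
ARRAY_ELEMENTS      -> JSON MORE_ARRAY_ELEMENTS | epsilon
MORE_ARRAY_ELEMENTS -> ',' JSON MORE_ARRAY_ELEMENTS | epsilon
KEY_PAIRS           -> string ':' JSON MORE_KEY_PAIRS | epsilon
MORE_KEY_PAIRS      -> ',' string ':' JSON MORE_KEY_PAIRS | epsilon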
+
+/*
+ * Table connecting the productions with their director sets of
+ * terminal symbols.
+ * Any combination not specified here represents an error.
+ */
+
+typedef struct
+{
+ size_t len;
+ char *prod;
+} td_entry;
+
+#define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }
+
+static td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
+{
+ /* JSON */
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY),
+ [OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT),
+ /* ARRAY_ELEMENTS */
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
+ /* MORE_ARRAY_ELEMENTS */
+ [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS),
+ [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
+ /* KEY_PAIRS */
+ [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS),
+ [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
+ /* MORE_KEY_PAIRS */
+ [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS),
+ [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
+};
+
+/* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */
+static char JSON_PROD_GOAL[] = {JSON_TOKEN_END, JSON_NT_JSON, 0};
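As a sanity check on the reversed storage, a minimal standalone sketch (editorial, not part of the patch; single characters stand in for the enum values) showing that a memcpy push followed by popping from the end replays a production left to right:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* JSON -> '{' KEY_PAIRS '}' stored reversed, like JSON_PROD_OBJECT */
	char		prod[] = {'}', 'K', '{'};	/* 'K' stands for KEY_PAIRS */
	char		stack[16];
	int			top = 0;

	memcpy(stack + top, prod, sizeof(prod));
	top += sizeof(prod);

	while (top > 0)
		putchar(stack[--top]);	/* prints "{K}": what we expect next pops first */
	putchar('\n');
	return 0;
}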
+
static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
bool *num_err, int *total_len);
@@ -60,7 +223,7 @@ JsonSemAction nullSemAction =
NULL, NULL, NULL, NULL, NULL
};
-/* Recursive Descent parser support routines */
+/* Parser support routines */
/*
* lex_peek
@@ -111,6 +274,8 @@ IsValidJsonNumber(const char *str, int len)
if (len <= 0)
return false;
+ dummy_lex.incremental = false;
+
/*
* json_lex_number expects a leading '-' to have been eaten already.
*
@@ -175,6 +340,130 @@ makeJsonLexContextCstringLen(JsonLexContext *lex, char *json,
return lex;
}
+
+/*
+ * makeJsonLexContextIncremental
+ *
+ * Similar to above but set up for use in incremental parsing. That means we
+ * need explicit stacks for predictions, field names and null indicators, but
+ * we don't need the input; that will be handed in bit by bit to the
+ * parse routine. We also need an accumulator for partial tokens in case
+ * the boundary between chunks happens to fall in the middle of a token.
+ */
+#define JS_STACK_CHUNK_SIZE 64
+#define JS_MAX_PROD_LEN 10 /* more than we need */
+#define JSON_TD_MAX_STACK 6400 /* hard coded for now - this is a REALLY high
+ * number */
+
+JsonLexContext *
+makeJsonLexContextIncremental(JsonLexContext *lex, int encoding,
+ bool need_escapes)
+{
+ if (lex == NULL)
+ {
+ lex = palloc0(sizeof(JsonLexContext));
+ lex->flags |= JSONLEX_FREE_STRUCT;
+ }
+ else
+ memset(lex, 0, sizeof(JsonLexContext));
+
+ lex->line_number = 1;
+ lex->input_encoding = encoding;
+ lex->incremental = true;
+ lex->inc_state = palloc0(sizeof(JsonIncrementalState));
+ initStringInfo(&(lex->inc_state->partial_token));
+ lex->pstack = palloc(sizeof(JsonParserStack));
+ lex->pstack->stack_size = JS_STACK_CHUNK_SIZE;
+ lex->pstack->prediction = palloc(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN);
+ lex->pstack->pred_index = 0;
+ lex->pstack->fnames = palloc(JS_STACK_CHUNK_SIZE * sizeof(char *));
+ lex->pstack->fnull = palloc(JS_STACK_CHUNK_SIZE * sizeof(bool));
+ if (need_escapes)
+ {
+ lex->strval = makeStringInfo();
+ lex->flags |= JSONLEX_FREE_STRVAL;
+ }
+ return lex;
+}
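A hedged usage sketch (editorial, not part of the patch) of the resulting API: feed chunks in order, pass is_last = true only on the final one, and expect JSON_INCOMPLETE for every call but the last. The chunk arguments and the PG_UTF8 encoding are illustrative assumptions:

#include "common/jsonapi.h"
#include "mb/pg_wchar.h"		/* for PG_UTF8 */

static JsonParseErrorType
parse_in_two_chunks(char *chunk1, int len1, char *chunk2, int len2)
{
	JsonLexContext *lex = makeJsonLexContextIncremental(NULL, PG_UTF8, true);
	JsonParseErrorType res;

	res = pg_parse_json_incremental(lex, &nullSemAction, chunk1, len1, false);
	if (res == JSON_INCOMPLETE)
		res = pg_parse_json_incremental(lex, &nullSemAction, chunk2, len2, true);
	freeJsonLexContext(lex);
	return res;
}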
+
+static inline void
+inc_lex_level(JsonLexContext *lex)
+{
+ lex->lex_level += 1;
+
+ if (lex->incremental && lex->lex_level >= lex->pstack->stack_size)
+ {
+ lex->pstack->stack_size += JS_STACK_CHUNK_SIZE;
+ lex->pstack->prediction =
+ repalloc(lex->pstack->prediction,
+ lex->pstack->stack_size * JS_MAX_PROD_LEN);
+ if (lex->pstack->fnames)
+ lex->pstack->fnames =
+ repalloc(lex->pstack->fnames,
+ lex->pstack->stack_size * sizeof(char *));
+ if (lex->pstack->fnull)
+ lex->pstack->fnull =
+ repalloc(lex->pstack->fnull, lex->pstack->stack_size * sizeof(bool));
+ }
+}
+
+static inline void
+dec_lex_level(JsonLexContext *lex)
+{
+ lex->lex_level -= 1;
+}
+
+static inline void
+push_prediction(JsonParserStack *pstack, td_entry entry)
+{
+ memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len);
+ pstack->pred_index += entry.len;
+}
+
+static inline char
+pop_prediction(JsonParserStack *pstack)
+{
+ Assert(pstack->pred_index > 0);
+ return pstack->prediction[--pstack->pred_index];
+}
+
+static inline char
+next_prediction(JsonParserStack *pstack)
+{
+ Assert(pstack->pred_index > 0);
+ return pstack->prediction[pstack->pred_index - 1];
+}
+
+static inline bool
+have_prediction(JsonParserStack *pstack)
+{
+ return pstack->pred_index > 0;
+}
+
+static inline void
+set_fname(JsonLexContext *lex, char *fname)
+{
+ lex->pstack->fnames[lex->lex_level] = fname;
+}
+
+static inline char *
+get_fname(JsonLexContext *lex)
+{
+ return lex->pstack->fnames[lex->lex_level];
+}
+
+static inline void
+set_fnull(JsonLexContext *lex, bool fnull)
+{
+ lex->pstack->fnull[lex->lex_level] = fnull;
+}
+
+static inline bool
+get_fnull(JsonLexContext *lex)
+{
+ return lex->pstack->fnull[lex->lex_level];
+}
+
/*
* Free memory in a JsonLexContext.
*
@@ -192,7 +481,18 @@ freeJsonLexContext(JsonLexContext *lex)
destroyStringInfo(lex->errormsg);
if (lex->flags & JSONLEX_FREE_STRUCT)
+ {
+ if (lex->incremental)
+ {
+ pfree(lex->inc_state->partial_token.data);
+ pfree(lex->inc_state);
+ pfree(lex->pstack->prediction);
+ pfree(lex->pstack->fnames);
+ pfree(lex->pstack->fnull);
+ pfree(lex->pstack);
+ }
pfree(lex);
+ }
}
/*
@@ -204,13 +504,44 @@ freeJsonLexContext(JsonLexContext *lex)
* makeJsonLexContext(). sem is a structure of function pointers to semantic
* action routines to be called at appropriate spots during parsing, and a
* pointer to a state object to be passed to those routines.
+ *
+ * If FORCE_JSON_PSTACK is defined then the routine will call the non-recursive
+ * JSON parser. This is a useful way to validate that it's doing the right
+ * thing, at least for non-incremental cases. If this is on, we expect to see
+ * regression diffs relating to error messages about stack depth, but no
+ * other differences.
*/
JsonParseErrorType
pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
{
+#ifdef FORCE_JSON_PSTACK
+
+ lex->incremental = true;
+ lex->inc_state = palloc0(sizeof(JsonIncrementalState));
+
+ /*
+ * We don't need partial token processing; there is only one chunk. But we
+ * still need to init the partial token string so that freeJsonLexContext
+ * works.
+ */
+ initStringInfo(&(lex->inc_state->partial_token));
+ lex->pstack = palloc(sizeof(JsonParserStack));
+ lex->pstack->stack_size = JS_STACK_CHUNK_SIZE;
+ lex->pstack->prediction = palloc(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN);
+ lex->pstack->pred_index = 0;
+ lex->pstack->fnames = palloc(JS_STACK_CHUNK_SIZE * sizeof(char *));
+ lex->pstack->fnull = palloc(JS_STACK_CHUNK_SIZE * sizeof(bool));
+
+ return pg_parse_json_incremental(lex, sem, lex->input, lex->input_length, true);
+
+#else
+
JsonTokenType tok;
JsonParseErrorType result;
+ if (lex->incremental)
+ return JSON_INVALID_LEXER_TYPE;
+
/* get the initial token */
result = json_lex(lex);
if (result != JSON_SUCCESS)
@@ -235,6 +566,7 @@ pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);
return result;
+#endif
}
/*
@@ -291,6 +623,372 @@ json_count_array_elements(JsonLexContext *lex, int *elements)
}
/*
+ * pg_parse_json_incremental
+ *
+ * Routine for incremental parsing of json. This uses the non-recursive top
+ * down method of the Dragon Book Algorithm 4.3. It's somewhat slower than
+ * the Recursive Descent pattern used above, so we only use it for incremental
+ * parsing of JSON.
+ *
+ * The lexing context needs to be set up by a call to
+ * makeJsonLexContextIncremental(). sem is a structure of function pointers
+ * to semantic action routines, which should function exactly as those used
+ * in the recursive descent parser.
+ *
+ * This routine can be called repeatedly with chunks of JSON. On the final
+ * chunk is_last must be set to true. len is the length of the json chunk,
+ * which does not need to be null terminated.
+ */
+JsonParseErrorType
+pg_parse_json_incremental(JsonLexContext *lex,
+ JsonSemAction *sem,
+ char *json,
+ int len,
+ bool is_last)
+{
+ JsonTokenType tok;
+ JsonParseErrorType result;
+ JsonParseContext ctx = JSON_PARSE_VALUE;
+ JsonParserStack *pstack = lex->pstack;
+
+ if (!lex->incremental)
+ return JSON_INVALID_LEXER_TYPE;
+
+ lex->input = lex->token_terminator = lex->line_start = json;
+ lex->input_length = len;
+ lex->inc_state->is_last_chunk = is_last;
+
+ /* get the initial token */
+ result = json_lex(lex);
+ if (result != JSON_SUCCESS)
+ return result;
+
+ tok = lex_peek(lex);
+
+ /* use prediction stack for incremental parsing */
+
+ if (!have_prediction(pstack))
+ {
+ td_entry goal = TD_ENTRY(JSON_PROD_GOAL);
+
+ push_prediction(pstack, goal);
+ }
+
+ while (have_prediction(pstack))
+ {
+ char top = pop_prediction(pstack);
+ td_entry entry;
+
+ /*
+ * these first two branches are the guts of the Table Driven method
+ */
+ if (top == tok)
+ {
+ /*
+ * tok can only be a terminal symbol, so top must be too. The
+ * token matches the top of the stack, so get the next token.
+ */
+ if (tok < JSON_TOKEN_END)
+ {
+ result = json_lex(lex);
+ if (result != JSON_SUCCESS)
+ return result;
+ tok = lex_peek(lex);
+ }
+ }
+ else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL)
+ {
+ /*
+ * the token is in the director set for a production of the
+ * non-terminal at the top of the stack, so push the reversed RHS
+ * of the production onto the stack.
+ */
+ push_prediction(pstack, entry);
+ }
+ else if (IS_SEM(top))
+ {
+ /*
+ * top is a semantic action marker, so take action accordingly.
+ * It's important to have these markers in the prediction stack
+ * before any token they might need so we don't advance the token
+ * prematurely. Note in a couple of cases we need to do something
+ * both before and after the token.
+ */
+ switch (top)
+ {
+ case JSON_SEM_OSTART:
+ {
+ json_struct_action ostart = sem->object_start;
+
+ if (lex->lex_level >= JSON_TD_MAX_STACK)
+ return JSON_NESTING_TOO_DEEP;
+
+ if (ostart != NULL)
+ {
+ result = (*ostart) (sem->semstate);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ inc_lex_level(lex);
+ }
+ break;
+ case JSON_SEM_OEND:
+ {
+ json_struct_action oend = sem->object_end;
+
+ dec_lex_level(lex);
+ if (oend != NULL)
+ {
+ result = (*oend) (sem->semstate);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_ASTART:
+ {
+ json_struct_action astart = sem->array_start;
+
+ if (lex->lex_level >= JSON_TD_MAX_STACK)
+ return JSON_NESTING_TOO_DEEP;
+
+ if (astart != NULL)
+ {
+ result = (*astart) (sem->semstate);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ inc_lex_level(lex);
+ }
+ break;
+ case JSON_SEM_AEND:
+ {
+ json_struct_action aend = sem->array_end;
+
+ dec_lex_level(lex);
+ if (aend != NULL)
+ {
+ result = (*aend) (sem->semstate);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_OFIELD_INIT:
+ {
+ /*
+ * All we do here is save out the field name. We have
+ * to wait until we get past the ':' to see if the next
+ * value is null before we can call the semantic routine.
+ */
+ char *fname = NULL;
+ json_ofield_action ostart = sem->object_field_start;
+ json_ofield_action oend = sem->object_field_end;
+
+ if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
+ {
+ fname = pstrdup(lex->strval->data);
+ }
+ set_fname(lex, fname);
+ }
+ break;
+ case JSON_SEM_OFIELD_START:
+ {
+ /*
+ * the current token should be the first token of the
+ * value
+ */
+ bool isnull = tok == JSON_TOKEN_NULL;
+ json_ofield_action ostart = sem->object_field_start;
+
+ set_fnull(lex, isnull);
+
+ if (ostart != NULL)
+ {
+ char *fname = get_fname(lex);
+
+ result = (*ostart) (sem->semstate, fname, isnull);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_OFIELD_END:
+ {
+ json_ofield_action oend = sem->object_field_end;
+
+ if (oend != NULL)
+ {
+ char *fname = get_fname(lex);
+ bool isnull = get_fnull(lex);
+
+ result = (*oend) (sem->semstate, fname, isnull);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_AELEM_START:
+ {
+ json_aelem_action astart = sem->array_element_start;
+ bool isnull = tok == JSON_TOKEN_NULL;
+
+ set_fnull(lex, isnull);
+
+ if (astart != NULL)
+ {
+ result = (*astart) (sem->semstate, isnull);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_AELEM_END:
+ {
+ json_aelem_action aend = sem->array_element_end;
+
+ if (aend != NULL)
+ {
+ bool isnull = get_fnull(lex);
+
+ result = (*aend) (sem->semstate, isnull);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ case JSON_SEM_SCALAR_INIT:
+ {
+ json_scalar_action sfunc = sem->scalar;
+
+ pstack->scalar_val = NULL;
+
+ if (sfunc != NULL)
+ {
+ /*
+ * extract the de-escaped string value, or the raw
+ * lexeme
+ */
+ /*
+ * XXX copied from RD parser but looks like a
+ * buglet
+ */
+ if (tok == JSON_TOKEN_STRING)
+ {
+ if (lex->strval != NULL)
+ pstack->scalar_val = pstrdup(lex->strval->data);
+ }
+ else
+ {
+ int tlen = (lex->token_terminator - lex->token_start);
+
+ pstack->scalar_val = palloc(tlen + 1);
+ memcpy(pstack->scalar_val, lex->token_start, tlen);
+ pstack->scalar_val[tlen] = '\0';
+ }
+ pstack->scalar_tok = tok;
+ }
+ }
+ break;
+ case JSON_SEM_SCALAR_CALL:
+ {
+ /*
+ * We'd like to be able to get rid of this business of
+ * two bits of scalar action, but we can't. It breaks
+ * certain semantic actions which expect that when
+ * called the lexer has consumed the item. See for
+ * example get_scalar() in jsonfuncs.c.
+ */
+ json_scalar_action sfunc = sem->scalar;
+
+ if (sfunc != NULL)
+ {
+ result = (*sfunc) (sem->semstate, pstack->scalar_val, pstack->scalar_tok);
+ if (result != JSON_SUCCESS)
+ return result;
+ }
+ }
+ break;
+ default:
+ /* should not happen */
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * The token matched neither the stack top (if it's a terminal)
+ * nor a production for the stack top (if it's a non-terminal).
+ *
+ * Various cases here are Asserted to be not possible, as the
+ * token would not appear at the top of the prediction stack
+ * unless the lookahead matched.
+ */
+ switch (top)
+ {
+ case JSON_TOKEN_STRING:
+ if (next_prediction(pstack) == JSON_TOKEN_COLON)
+ ctx = JSON_PARSE_STRING;
+ else
+ {
+ Assert(false);
+ ctx = JSON_PARSE_VALUE;
+ }
+ break;
+ case JSON_TOKEN_NUMBER:
+ case JSON_TOKEN_TRUE:
+ case JSON_TOKEN_FALSE:
+ case JSON_TOKEN_NULL:
+ case JSON_TOKEN_ARRAY_START:
+ case JSON_TOKEN_OBJECT_START:
+ Assert(false);
+ ctx = JSON_PARSE_VALUE;
+ break;
+ case JSON_TOKEN_ARRAY_END:
+ Assert(false);
+ ctx = JSON_PARSE_ARRAY_NEXT;
+ break;
+ case JSON_TOKEN_OBJECT_END:
+ Assert(false);
+ ctx = JSON_PARSE_OBJECT_NEXT;
+ break;
+ case JSON_TOKEN_COMMA:
+ Assert(false);
+ if (next_prediction(pstack) == JSON_TOKEN_STRING)
+ ctx = JSON_PARSE_OBJECT_NEXT;
+ else
+ ctx = JSON_PARSE_ARRAY_NEXT;
+ break;
+ case JSON_TOKEN_COLON:
+ ctx = JSON_PARSE_OBJECT_LABEL;
+ break;
+ case JSON_TOKEN_END:
+ ctx = JSON_PARSE_END;
+ break;
+ case JSON_NT_MORE_ARRAY_ELEMENTS:
+ ctx = JSON_PARSE_ARRAY_NEXT;
+ break;
+ case JSON_NT_ARRAY_ELEMENTS:
+ ctx = JSON_PARSE_ARRAY_START;
+ break;
+ case JSON_NT_MORE_KEY_PAIRS:
+ ctx = JSON_PARSE_OBJECT_NEXT;
+ break;
+ case JSON_NT_KEY_PAIRS:
+ ctx = JSON_PARSE_OBJECT_START;
+ break;
+ default:
+ ctx = JSON_PARSE_VALUE;
+ }
+ return report_parse_error(ctx, lex);
+ }
+ }
+
+ return JSON_SUCCESS;
+}
+
+/*
* Recursive Descent parse routines. There is one for each structural
* element in a json document:
* - scalar (string, number, true, false, null)
@@ -587,6 +1285,18 @@ parse_array(JsonLexContext *lex, JsonSemAction *sem)
/*
* Lex one token from the input stream.
+ *
+ * When doing incremental parsing, we can reach the end of the input string
+ * without having (or knowing we have) a complete token. If it's not the
+ * final chunk of input, the partial token is then saved to the lex
+ * structure's partial_token StringInfo. On subsequent calls input is
+ * appended to this buffer until we have something that we think is a
+ * complete token, which is then lexed using a recursive call to
+ * json_lex. Processing then
+ * continues as normal on subsequent calls.
+ *
+ * Note that when doing incremental processing, the lex.prev_token_terminator
+ * should not be relied on. It could point into a previous input chunk or
+ * worse.
*/
JsonParseErrorType
json_lex(JsonLexContext *lex)
@@ -595,8 +1305,202 @@ json_lex(JsonLexContext *lex)
char *const end = lex->input + lex->input_length;
JsonParseErrorType result;
- /* Skip leading whitespace. */
+ if (lex->incremental && lex->inc_state->partial_completed)
+ {
+ /*
+ * We just lexed a completed partial token on the last call, so reset
+ * everything
+ */
+ resetStringInfo(&(lex->inc_state->partial_token));
+ lex->token_terminator = lex->input;
+ lex->inc_state->partial_completed = false;
+ }
+
s = lex->token_terminator;
+
+ if (lex->incremental && lex->inc_state->partial_token.len)
+ {
+ /*
+ * We have a partial token. Extend it and if completed lex it by a
+ * recursive call
+ */
+ StringInfo ptok = &(lex->inc_state->partial_token);
+ int added = 0;
+ bool tok_done = false;
+ JsonLexContext dummy_lex;
+ JsonParseErrorType partial_result;
+
+ if (ptok->data[0] == '"')
+ {
+ /*
+ * It's a string. Accumulate characters until we reach an
+ * unescaped '"'.
+ */
+ int escapes = 0;
+
+ for (int i = ptok->len - 1; i > 0; i--)
+ {
+ /* count the trailing backslashes on the partial token */
+ if (ptok->data[i] == '\\')
+ escapes++;
+ else
+ break;
+ }
+
+ for (int i = 0; i < lex->input_length; i++)
+ {
+ char c = lex->input[i];
+
+ appendStringInfoCharMacro(ptok, c);
+ added++;
+ if (c == '"' && escapes % 2 == 0)
+ {
+ tok_done = true;
+ break;
+ }
+ if (c == '\\')
+ escapes++;
+ else
+ escapes = 0;
+ }
+ }
+ else
+ {
+ /* not a string */
+ char c = ptok->data[0];
+
+ if (c == '-' || (c >= '0' && c <= '9'))
+ {
+ /* for numbers look for possible numeric continuations */
+
+ bool numend = false;
+
+ for (int i = 0; i < lex->input_length && !numend; i++)
+ {
+ char cc = lex->input[i];
+
+ switch (cc)
+ {
+ case '+':
+ case '-':
+ case 'e':
+ case 'E':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ appendStringInfoCharMacro(ptok, cc);
+ added++;
+ }
+ break;
+ default:
+ numend = true;
+ }
+ }
+ }
+
+ /*
+ * Add any remaining alphanumeric chars. This takes care of the
+ * {null, false, true} literals as well as any trailing
+ * alphanumeric junk on non-string tokens.
+ */
+ for (int i = added; i < lex->input_length; i++)
+ {
+ char cc = lex->input[i];
+
+ if (JSON_ALPHANUMERIC_CHAR(cc))
+ {
+ appendStringInfoCharMacro(ptok, cc);
+ added++;
+ }
+ else
+ {
+ tok_done = true;
+ break;
+ }
+ }
+ if (added == lex->input_length &&
+ lex->inc_state->is_last_chunk)
+ {
+ tok_done = true;
+ }
+ }
+
+ if (!tok_done)
+ {
+ /* We should have consumed the whole chunk in this case. */
+ Assert(added == lex->input_length);
+
+ if (!lex->inc_state->is_last_chunk)
+ return JSON_INCOMPLETE;
+
+ /* json_errdetail() needs access to the accumulated token. */
+ lex->token_start = ptok->data;
+ lex->token_terminator = ptok->data + ptok->len;
+ return JSON_INVALID_TOKEN;
+ }
+
+ /*
+ * Everything up to lex->input[added] has been added to the partial
+ * token, so move the input past it.
+ */
+ lex->input += added;
+ lex->input_length -= added;
+
+ dummy_lex.input = dummy_lex.token_terminator =
+ dummy_lex.line_start = ptok->data;
+ dummy_lex.line_number = lex->line_number;
+ dummy_lex.input_length = ptok->len;
+ dummy_lex.input_encoding = lex->input_encoding;
+ dummy_lex.incremental = false;
+ dummy_lex.strval = lex->strval;
+
+ partial_result = json_lex(&dummy_lex);
+
+ /*
+ * We either have a complete token or an error. In either case we need
+ * to point to the partial token data for the semantic or error
+ * routines. If it's not an error we'll readjust on the next call to
+ * json_lex.
+ */
+ lex->token_type = dummy_lex.token_type;
+ lex->line_number = dummy_lex.line_number;
+
+ /*
+ * We know the prev_token_terminator must be back in some previous
+ * piece of input, so we just make it NULL.
+ */
+ lex->prev_token_terminator = NULL;
+
+ /*
+ * Normally token_start would be ptok->data, but it could be later,
+ * see json_lex_string's handling of invalid escapes.
+ */
+ lex->token_start = dummy_lex.token_start;
+ lex->token_terminator = dummy_lex.token_terminator;
+ if (partial_result == JSON_SUCCESS)
+ {
+ /* make sure we've used all the input */
+ if (lex->token_terminator - lex->token_start != ptok->len)
+ {
+ Assert(false);
+ return JSON_INVALID_TOKEN;
+ }
+
+ lex->inc_state->partial_completed = true;
+ }
+ return partial_result;
+ /* end of partial token processing */
+ }
+
+ /* Skip leading whitespace. */
while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
{
if (*s++ == '\n')
@@ -708,6 +1612,14 @@ json_lex(JsonLexContext *lex)
return JSON_INVALID_TOKEN;
}
+ if (lex->incremental && !lex->inc_state->is_last_chunk &&
+ p == lex->input + lex->input_length)
+ {
+ appendBinaryStringInfo(&(lex->inc_state->partial_token),
+ s, end - s);
+ return JSON_INCOMPLETE;
+ }
+
/*
* We've got a real alphanumeric token here. If it
* happens to be true, false, or null, all is well. If
@@ -732,7 +1644,10 @@ json_lex(JsonLexContext *lex)
} /* end of switch */
}
- return JSON_SUCCESS;
+ if (lex->incremental && lex->token_type == JSON_TOKEN_END && !lex->inc_state->is_last_chunk)
+ return JSON_INCOMPLETE;
+ else
+ return JSON_SUCCESS;
}
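To make the string-continuation rule above concrete, a standalone sketch (editorial, not part of the patch) of the cross-chunk quote test: a '"' terminates a string only when preceded by an even number of backslashes, counted across the chunk boundary:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* does this chunk complete a partial string token? */
static bool
closes_string(const char *partial, const char *chunk)
{
	int			escapes = 0;

	/* count the trailing backslashes on the partial token */
	for (int i = (int) strlen(partial) - 1; i > 0 && partial[i] == '\\'; i--)
		escapes++;

	for (const char *c = chunk; *c; c++)
	{
		if (*c == '"' && escapes % 2 == 0)
			return true;
		escapes = (*c == '\\') ? escapes + 1 : 0;
	}
	return false;
}

int
main(void)
{
	printf("%d\n", closes_string("\"ab\\", "\"tail\""));	/* 1: leading \" is escaped */
	printf("%d\n", closes_string("\"ab", "cd"));	/* 0: still incomplete */
	return 0;
}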
/*
@@ -754,8 +1669,14 @@ json_lex_string(JsonLexContext *lex)
int hi_surrogate = -1;
/* Convenience macros for error exits */
-#define FAIL_AT_CHAR_START(code) \
+#define FAIL_OR_INCOMPLETE_AT_CHAR_START(code) \
do { \
+ if (lex->incremental && !lex->inc_state->is_last_chunk) \
+ { \
+ appendBinaryStringInfo(&lex->inc_state->partial_token, \
+ lex->token_start, end - lex->token_start); \
+ return JSON_INCOMPLETE; \
+ } \
lex->token_terminator = s; \
return code; \
} while (0)
@@ -776,7 +1697,7 @@ json_lex_string(JsonLexContext *lex)
s++;
/* Premature end of the string. */
if (s >= end)
- FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+ FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
else if (*s == '"')
break;
else if (*s == '\\')
@@ -784,7 +1705,7 @@ json_lex_string(JsonLexContext *lex)
/* OK, we have an escape character. */
s++;
if (s >= end)
- FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+ FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
else if (*s == 'u')
{
int i;
@@ -794,7 +1715,7 @@ json_lex_string(JsonLexContext *lex)
{
s++;
if (s >= end)
- FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+ FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
else if (*s >= '0' && *s <= '9')
ch = (ch * 16) + (*s - '0');
else if (*s >= 'a' && *s <= 'f')
@@ -979,7 +1900,7 @@ json_lex_string(JsonLexContext *lex)
lex->token_terminator = s + 1;
return JSON_SUCCESS;
-#undef FAIL_AT_CHAR_START
+#undef FAIL_OR_INCOMPLETE_AT_CHAR_START
#undef FAIL_AT_CHAR_END
}
@@ -1088,7 +2009,14 @@ json_lex_number(JsonLexContext *lex, char *s,
if (total_len != NULL)
*total_len = len;
- if (num_err != NULL)
+ if (lex->incremental && !lex->inc_state->is_last_chunk &&
+ len >= lex->input_length)
+ {
+ appendBinaryStringInfo(&lex->inc_state->partial_token,
+ lex->token_start, s - lex->token_start);
+ return JSON_INCOMPLETE;
+ }
+ else if (num_err != NULL)
{
/* let the caller handle any error */
*num_err = error;
@@ -1174,9 +2102,17 @@ json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
switch (error)
{
+ case JSON_INCOMPLETE:
case JSON_SUCCESS:
/* fall through to the error code after switch */
break;
+ case JSON_INVALID_LEXER_TYPE:
+ if (lex->incremental)
+ return (_("Recursive descent parser cannot use incremental lexer"));
+ else
+ return (_("Incremental parser requires incremental lexer"));
+ case JSON_NESTING_TOO_DEEP:
+ return (_("JSON nested too deep, maximum permitted depth is 6400"));
case JSON_ESCAPING_INVALID:
token_error(lex, "Escape sequence \"\\%.*s\" is invalid.");
break;