author    | Andrew Dunstan <andrew@dunslane.net> | 2024-03-10 23:10:14 -0400
committer | Andrew Dunstan <andrew@dunslane.net> | 2024-04-04 06:46:40 -0400
commit    | 3311ea86edc7a689614bad754e17371865cdc11f (patch)
tree      | 7c9d55385afb9b21a8c790a64b1b9ea8eedff90e /src/common/jsonapi.c
parent    | 585df02b445f63167f145685e045e5b6074a5a30 (diff)
download  | postgresql-3311ea86edc7a689614bad754e17371865cdc11f.tar.gz
          | postgresql-3311ea86edc7a689614bad754e17371865cdc11f.zip
Introduce a non-recursive JSON parser
This parser uses an explicit prediction stack, unlike the present
recursive descent parser where the parser state is represented on the
call stack. This difference makes the new parser suitable for use in
incremental parsing of huge JSON documents that cannot be conveniently
handled piece-wise by the recursive descent parser. One potential use
for this will be in parsing large backup manifests associated with
incremental backups.
Because this parser is somewhat slower than the recursive descent
parser, it is not replacing that parser, but is an additional parser
available to callers.
For testing purposes, if the build is done with -DFORCE_JSON_PSTACK, all
JSON parsing is done with the non-recursive parser, in which case only
trivial regression differences in error messages should be observed.
Author: Andrew Dunstan
Reviewed-By: Jacob Champion
Discussion: https://postgr.es/m/7b0a51d6-0d9d-7366-3a1a-f74397a02f55@dunslane.net
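
As a rough illustration of how a caller might drive the API this commit adds (the chunking driver below is hypothetical; pg_parse_json_incremental, makeJsonLexContextIncremental, nullSemAction, and the JSON_INCOMPLETE result are from the patch), one could feed a document in fixed-size pieces, expecting JSON_INCOMPLETE for every chunk but the last:

#include "postgres_fe.h"

#include "common/jsonapi.h"
#include "mb/pg_wchar.h"

/*
 * Hypothetical driver: validate a JSON document by feeding it to the
 * incremental parser in fixed-size chunks. Every chunk except the last
 * should yield JSON_INCOMPLETE; the final chunk yields JSON_SUCCESS or
 * a real parse error.
 */
static JsonParseErrorType
parse_in_chunks(char *doc, int doclen, int chunk_size)
{
    /* passing NULL makes the context palloc'd, so freeJsonLexContext frees it all */
    JsonLexContext *lex = makeJsonLexContextIncremental(NULL, PG_UTF8, false);
    JsonParseErrorType res = JSON_INCOMPLETE;

    for (int off = 0; off < doclen && res == JSON_INCOMPLETE; off += chunk_size)
    {
        int         len = Min(chunk_size, doclen - off);
        bool        is_last = (off + len >= doclen);

        /* nullSemAction: no semantic callbacks, pure validation */
        res = pg_parse_json_incremental(lex, &nullSemAction,
                                        doc + off, len, is_last);
    }

    freeJsonLexContext(lex);
    return res;
}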
Diffstat (limited to 'src/common/jsonapi.c')
-rw-r--r-- | src/common/jsonapi.c | 954
1 file changed, 945 insertions(+), 9 deletions(-)
diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 98d6e66a217..3d1bd37ac26 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -43,6 +43,169 @@ typedef enum        /* contexts of JSON parser */
     JSON_PARSE_END,             /* saw the end of a document, expect nothing */
 } JsonParseContext;
 
+/*
+ * Setup for table-driven parser.
+ * These enums need to be separate from the JsonTokenType and from each other
+ * so we can have all of them on the prediction stack, which consists of
+ * tokens, non-terminals, and semantic action markers.
+ */
+
+typedef enum
+{
+    JSON_NT_JSON = 32,
+    JSON_NT_ARRAY_ELEMENTS,
+    JSON_NT_MORE_ARRAY_ELEMENTS,
+    JSON_NT_KEY_PAIRS,
+    JSON_NT_MORE_KEY_PAIRS,
+} JsonNonTerminal;
+
+typedef enum
+{
+    JSON_SEM_OSTART = 64,
+    JSON_SEM_OEND,
+    JSON_SEM_ASTART,
+    JSON_SEM_AEND,
+    JSON_SEM_OFIELD_INIT,
+    JSON_SEM_OFIELD_START,
+    JSON_SEM_OFIELD_END,
+    JSON_SEM_AELEM_START,
+    JSON_SEM_AELEM_END,
+    JSON_SEM_SCALAR_INIT,
+    JSON_SEM_SCALAR_CALL,
+} JsonParserSem;
+
+/*
+ * struct containing the 3 stacks used in non-recursive parsing,
+ * and the token and value for scalars that need to be preserved
+ * across calls.
+ */
+typedef struct JsonParserStack
+{
+    int         stack_size;
+    char       *prediction;
+    int         pred_index;
+    /* these two are indexed by lex_level */
+    char      **fnames;
+    bool       *fnull;
+    JsonTokenType scalar_tok;
+    char       *scalar_val;
+} JsonParserStack;
+
+/*
+ * struct containing state used when there is a possible partial token at the
+ * end of a json chunk when we are doing incremental parsing.
+ */
+typedef struct JsonIncrementalState
+{
+    bool        is_last_chunk;
+    bool        partial_completed;
+    StringInfoData partial_token;
+} JsonIncrementalState;
+
+/*
+ * constants and macros used in the nonrecursive parser
+ */
+#define JSON_NUM_TERMINALS 13
+#define JSON_NUM_NONTERMINALS 5
+#define JSON_NT_OFFSET JSON_NT_JSON
+/* for indexing the table */
+#define OFS(NT) (NT) - JSON_NT_OFFSET
+/* classify items we get off the stack */
+#define IS_SEM(x) ((x) & 0x40)
+#define IS_NT(x)  ((x) & 0x20)
+
+/*
+ * These productions are stored in reverse order right to left so that when
+ * they are pushed on the stack what we expect next is at the top of the stack.
+ */
+static char JSON_PROD_EPSILON[] = {0};  /* epsilon - an empty production */
+
+/* JSON -> string */
+static char JSON_PROD_SCALAR_STRING[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_STRING, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> number */
+static char JSON_PROD_SCALAR_NUMBER[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NUMBER, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'true' */
+static char JSON_PROD_SCALAR_TRUE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_TRUE, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'false' */
+static char JSON_PROD_SCALAR_FALSE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_FALSE, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> 'null' */
+static char JSON_PROD_SCALAR_NULL[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NULL, JSON_SEM_SCALAR_INIT, 0};
+
+/* JSON -> '{' KEY_PAIRS '}' */
+static char JSON_PROD_OBJECT[] = {JSON_SEM_OEND, JSON_TOKEN_OBJECT_END, JSON_NT_KEY_PAIRS, JSON_TOKEN_OBJECT_START, JSON_SEM_OSTART, 0};
+
+/* JSON -> '[' ARRAY_ELEMENTS ']' */
+static char JSON_PROD_ARRAY[] = {JSON_SEM_AEND, JSON_TOKEN_ARRAY_END, JSON_NT_ARRAY_ELEMENTS, JSON_TOKEN_ARRAY_START, JSON_SEM_ASTART, 0};
+
+/* ARRAY_ELEMENTS -> JSON MORE_ARRAY_ELEMENTS */
+static char JSON_PROD_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, 0};
+
+/* MORE_ARRAY_ELEMENTS -> ',' JSON MORE_ARRAY_ELEMENTS */
+static char JSON_PROD_MORE_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, JSON_TOKEN_COMMA, 0};
+
+/* KEY_PAIRS -> string ':' JSON MORE_KEY_PAIRS */
+static char JSON_PROD_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, 0};
+
+/* MORE_KEY_PAIRS -> ',' string ':' JSON MORE_KEY_PAIRS */
+static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, JSON_TOKEN_COMMA, 0};
+
+/*
+ * Note: there are also epsilon productions for ARRAY_ELEMENTS,
+ * MORE_ARRAY_ELEMENTS, KEY_PAIRS and MORE_KEY_PAIRS
+ * They are all the same as none require any semantic actions.
+ */
+
+/*
+ * Table connecting the productions with their director sets of
+ * terminal symbols.
+ * Any combination not specified here represents an error.
+ */
+
+typedef struct
+{
+    size_t      len;
+    char       *prod;
+} td_entry;
+
+#define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }
+
+static td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
+{
+    /* JSON */
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY),
+    [OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT),
+    /* ARRAY_ELEMENTS */
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
+    /* MORE_ARRAY_ELEMENTS */
+    [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS),
+    [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
+    /* KEY_PAIRS */
+    [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS),
+    [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
+    /* MORE_KEY_PAIRS */
+    [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS),
+    [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
+};
+
+/* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */
+static char JSON_PROD_GOAL[] = {JSON_TOKEN_END, JSON_NT_JSON, 0};
+
 static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
 static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
                                                  bool *num_err, int *total_len);
@@ -60,7 +223,7 @@ JsonSemAction nullSemAction =
     NULL, NULL, NULL, NULL, NULL
 };
 
-/* Recursive Descent parser support routines */
+/* Parser support routines */
 
 /*
  * lex_peek
@@ -111,6 +274,8 @@ IsValidJsonNumber(const char *str, int len)
     if (len <= 0)
         return false;
 
+    dummy_lex.incremental = false;
+
     /*
      * json_lex_number expects a leading '-' to have been eaten already.
      *
@@ -175,6 +340,130 @@ makeJsonLexContextCstringLen(JsonLexContext *lex, char *json,
     return lex;
 }
 
+
+/*
+ * makeJsonLexContextIncremental
+ *
+ * Similar to above but set up for use in incremental parsing. That means we
+ * need explicit stacks for predictions, field names and null indicators, but
+ * we don't need the input, that will be handed in bit by bit to the
+ * parse routine. We also need an accumulator for partial tokens in case
+ * the boundary between chunks happens to fall in the middle of a token.
+ */
+#define JS_STACK_CHUNK_SIZE 64
+#define JS_MAX_PROD_LEN 10      /* more than we need */
+#define JSON_TD_MAX_STACK 6400  /* hard coded for now - this is a REALLY high
+                                 * number */
+
+JsonLexContext *
+makeJsonLexContextIncremental(JsonLexContext *lex, int encoding,
+                              bool need_escapes)
+{
+    if (lex == NULL)
+    {
+        lex = palloc0(sizeof(JsonLexContext));
+        lex->flags |= JSONLEX_FREE_STRUCT;
+    }
+    else
+        memset(lex, 0, sizeof(JsonLexContext));
+
+    lex->line_number = 1;
+    lex->input_encoding = encoding;
+    lex->incremental = true;
+    lex->inc_state = palloc0(sizeof(JsonIncrementalState));
+    initStringInfo(&(lex->inc_state->partial_token));
+    lex->pstack = palloc(sizeof(JsonParserStack));
+    lex->pstack->stack_size = JS_STACK_CHUNK_SIZE;
+    lex->pstack->prediction = palloc(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN);
+    lex->pstack->pred_index = 0;
+    lex->pstack->fnames = palloc(JS_STACK_CHUNK_SIZE * sizeof(char *));
+    lex->pstack->fnull = palloc(JS_STACK_CHUNK_SIZE * sizeof(bool));
+    if (need_escapes)
+    {
+        lex->strval = makeStringInfo();
+        lex->flags |= JSONLEX_FREE_STRVAL;
+    }
+    return lex;
+}
+
+static inline void
+inc_lex_level(JsonLexContext *lex)
+{
+    lex->lex_level += 1;
+
+    if (lex->incremental && lex->lex_level >= lex->pstack->stack_size)
+    {
+        lex->pstack->stack_size += JS_STACK_CHUNK_SIZE;
+        lex->pstack->prediction =
+            repalloc(lex->pstack->prediction,
+                     lex->pstack->stack_size * JS_MAX_PROD_LEN);
+        if (lex->pstack->fnames)
+            lex->pstack->fnames =
+                repalloc(lex->pstack->fnames,
+                         lex->pstack->stack_size * sizeof(char *));
+        if (lex->pstack->fnull)
+            lex->pstack->fnull =
+                repalloc(lex->pstack->fnull, lex->pstack->stack_size * sizeof(bool));
+    }
+}
+
+static inline void
+dec_lex_level(JsonLexContext *lex)
+{
+    lex->lex_level -= 1;
+}
+
+static inline void
+push_prediction(JsonParserStack *pstack, td_entry entry)
+{
+    memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len);
+    pstack->pred_index += entry.len;
+}
+
+static inline char
+pop_prediction(JsonParserStack *pstack)
+{
+    Assert(pstack->pred_index > 0);
+    return pstack->prediction[--pstack->pred_index];
+}
+
+static inline char
+next_prediction(JsonParserStack *pstack)
+{
+    Assert(pstack->pred_index > 0);
+    return pstack->prediction[pstack->pred_index - 1];
+}
+
+static inline bool
+have_prediction(JsonParserStack *pstack)
+{
+    return pstack->pred_index > 0;
+}
+
+static inline void
+set_fname(JsonLexContext *lex, char *fname)
+{
+    lex->pstack->fnames[lex->lex_level] = fname;
+}
+
+static inline char *
+get_fname(JsonLexContext *lex)
+{
+    return lex->pstack->fnames[lex->lex_level];
+}
+
+static inline void
+set_fnull(JsonLexContext *lex, bool fnull)
+{
+    lex->pstack->fnull[lex->lex_level] = fnull;
+}
+
+static inline bool
+get_fnull(JsonLexContext *lex)
+{
+    return lex->pstack->fnull[lex->lex_level];
+}
+
 /*
  * Free memory in a JsonLexContext.
  *
@@ -192,7 +481,18 @@ freeJsonLexContext(JsonLexContext *lex)
         destroyStringInfo(lex->errormsg);
 
     if (lex->flags & JSONLEX_FREE_STRUCT)
+    {
+        if (lex->incremental)
+        {
+            pfree(lex->inc_state->partial_token.data);
+            pfree(lex->inc_state);
+            pfree(lex->pstack->prediction);
+            pfree(lex->pstack->fnames);
+            pfree(lex->pstack->fnull);
+            pfree(lex->pstack);
+        }
         pfree(lex);
+    }
 }
 
 /*
@@ -204,13 +504,44 @@ freeJsonLexContext(JsonLexContext *lex)
  * makeJsonLexContext(). sem is a structure of function pointers to semantic
  * action routines to be called at appropriate spots during parsing, and a
  * pointer to a state object to be passed to those routines.
+ *
+ * If FORCE_JSON_PSTACK is defined then the routine will call the non-recursive
+ * JSON parser. This is a useful way to validate that it's doing the right
+ * thing at least for non-incremental cases. If this is on we expect to see
+ * regression diffs relating to error messages about stack depth, but no
+ * other differences.
  */
 JsonParseErrorType
 pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
 {
+#ifdef FORCE_JSON_PSTACK
+
+    lex->incremental = true;
+    lex->inc_state = palloc0(sizeof(JsonIncrementalState));
+
+    /*
+     * We don't need partial token processing, there is only one chunk. But we
+     * still need to init the partial token string so that freeJsonLexContext
+     * works.
+     */
+    initStringInfo(&(lex->inc_state->partial_token));
+    lex->pstack = palloc(sizeof(JsonParserStack));
+    lex->pstack->stack_size = JS_STACK_CHUNK_SIZE;
+    lex->pstack->prediction = palloc(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN);
+    lex->pstack->pred_index = 0;
+    lex->pstack->fnames = palloc(JS_STACK_CHUNK_SIZE * sizeof(char *));
+    lex->pstack->fnull = palloc(JS_STACK_CHUNK_SIZE * sizeof(bool));
+
+    return pg_parse_json_incremental(lex, sem, lex->input, lex->input_length, true);
+
+#else
+
     JsonTokenType tok;
     JsonParseErrorType result;
 
+    if (lex->incremental)
+        return JSON_INVALID_LEXER_TYPE;
+
     /* get the initial token */
     result = json_lex(lex);
     if (result != JSON_SUCCESS)
@@ -235,6 +566,7 @@ pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
         result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);
 
     return result;
+#endif
 }
 
 /*
@@ -291,6 +623,372 @@ json_count_array_elements(JsonLexContext *lex, int *elements)
 }
 
 /*
+ * pg_parse_json_incremental
+ *
+ * Routine for incremental parsing of json. This uses the non-recursive top
+ * down method of the Dragon Book Algorithm 4.3. It's somewhat slower than
+ * the Recursive Descent pattern used above, so we only use it for incremental
+ * parsing of JSON.
+ *
+ * The lexing context needs to be set up by a call to
+ * makeJsonLexContextIncremental(). sem is a structure of function pointers
+ * to semantic action routines, which should function exactly as those used
+ * in the recursive descent parser.
+ *
+ * This routine can be called repeatedly with chunks of JSON. On the final
+ * chunk is_last must be set to true. len is the length of the json chunk,
+ * which does not need to be null terminated.
+ */
+JsonParseErrorType
+pg_parse_json_incremental(JsonLexContext *lex,
+                          JsonSemAction *sem,
+                          char *json,
+                          int len,
+                          bool is_last)
+{
+    JsonTokenType tok;
+    JsonParseErrorType result;
+    JsonParseContext ctx = JSON_PARSE_VALUE;
+    JsonParserStack *pstack = lex->pstack;
+
+
+    if (!lex->incremental)
+        return JSON_INVALID_LEXER_TYPE;
+
+    lex->input = lex->token_terminator = lex->line_start = json;
+    lex->input_length = len;
+    lex->inc_state->is_last_chunk = is_last;
+
+    /* get the initial token */
+    result = json_lex(lex);
+    if (result != JSON_SUCCESS)
+        return result;
+
+    tok = lex_peek(lex);
+
+    /* use prediction stack for incremental parsing */
+
+    if (!have_prediction(pstack))
+    {
+        td_entry    goal = TD_ENTRY(JSON_PROD_GOAL);
+
+        push_prediction(pstack, goal);
+    }
+
+    while (have_prediction(pstack))
+    {
+        char        top = pop_prediction(pstack);
+        td_entry    entry;
+
+        /*
+         * these first two branches are the guts of the Table Driven method
+         */
+        if (top == tok)
+        {
+            /*
+             * tok can only be a terminal symbol, so top must be too. the
+             * token matches the top of the stack, so get the next token.
+             */
+            if (tok < JSON_TOKEN_END)
+            {
+                result = json_lex(lex);
+                if (result != JSON_SUCCESS)
+                    return result;
+                tok = lex_peek(lex);
+            }
+        }
+        else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL)
+        {
+            /*
+             * the token is in the director set for a production of the
+             * non-terminal at the top of the stack, so push the reversed RHS
+             * of the production onto the stack.
+             */
+            push_prediction(pstack, entry);
+        }
+        else if (IS_SEM(top))
+        {
+            /*
+             * top is a semantic action marker, so take action accordingly.
+             * It's important to have these markers in the prediction stack
+             * before any token they might need so we don't advance the token
+             * prematurely. Note in a couple of cases we need to do something
+             * both before and after the token.
+             */
+            switch (top)
+            {
+                case JSON_SEM_OSTART:
+                    {
+                        json_struct_action ostart = sem->object_start;
+
+                        if (lex->lex_level >= JSON_TD_MAX_STACK)
+                            return JSON_NESTING_TOO_DEEP;
+
+                        if (ostart != NULL)
+                        {
+                            result = (*ostart) (sem->semstate);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                        inc_lex_level(lex);
+                    }
+                    break;
+                case JSON_SEM_OEND:
+                    {
+                        json_struct_action oend = sem->object_end;
+
+                        dec_lex_level(lex);
+                        if (oend != NULL)
+                        {
+                            result = (*oend) (sem->semstate);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_ASTART:
+                    {
+                        json_struct_action astart = sem->array_start;
+
+                        if (lex->lex_level >= JSON_TD_MAX_STACK)
+                            return JSON_NESTING_TOO_DEEP;
+
+                        if (astart != NULL)
+                        {
+                            result = (*astart) (sem->semstate);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                        inc_lex_level(lex);
+                    }
+                    break;
+                case JSON_SEM_AEND:
+                    {
+                        json_struct_action aend = sem->array_end;
+
+                        dec_lex_level(lex);
+                        if (aend != NULL)
+                        {
+                            result = (*aend) (sem->semstate);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_OFIELD_INIT:
+                    {
+                        /*
+                         * all we do here is save out the field name.
+                         * We have to wait to get past the ':' to see if the
+                         * next value is null so we can call the semantic
+                         * routine
+                         */
+                        char       *fname = NULL;
+                        json_ofield_action ostart = sem->object_field_start;
+                        json_ofield_action oend = sem->object_field_end;
+
+                        if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
+                        {
+                            fname = pstrdup(lex->strval->data);
+                        }
+                        set_fname(lex, fname);
+                    }
+                    break;
+                case JSON_SEM_OFIELD_START:
+                    {
+                        /*
+                         * the current token should be the first token of the
+                         * value
+                         */
+                        bool        isnull = tok == JSON_TOKEN_NULL;
+                        json_ofield_action ostart = sem->object_field_start;
+
+                        set_fnull(lex, isnull);
+
+                        if (ostart != NULL)
+                        {
+                            char       *fname = get_fname(lex);
+
+                            result = (*ostart) (sem->semstate, fname, isnull);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_OFIELD_END:
+                    {
+                        json_ofield_action oend = sem->object_field_end;
+
+                        if (oend != NULL)
+                        {
+                            char       *fname = get_fname(lex);
+                            bool        isnull = get_fnull(lex);
+
+                            result = (*oend) (sem->semstate, fname, isnull);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_AELEM_START:
+                    {
+                        json_aelem_action astart = sem->array_element_start;
+                        bool        isnull = tok == JSON_TOKEN_NULL;
+
+                        set_fnull(lex, isnull);
+
+                        if (astart != NULL)
+                        {
+                            result = (*astart) (sem->semstate, isnull);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_AELEM_END:
+                    {
+                        json_aelem_action aend = sem->array_element_end;
+
+                        if (aend != NULL)
+                        {
+                            bool        isnull = get_fnull(lex);
+
+                            result = (*aend) (sem->semstate, isnull);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                case JSON_SEM_SCALAR_INIT:
+                    {
+                        json_scalar_action sfunc = sem->scalar;
+
+                        pstack->scalar_val = NULL;
+
+                        if (sfunc != NULL)
+                        {
+                            /*
+                             * extract the de-escaped string value, or the raw
+                             * lexeme
+                             */
+                            /*
+                             * XXX copied from RD parser but looks like a
+                             * buglet
+                             */
+                            if (tok == JSON_TOKEN_STRING)
+                            {
+                                if (lex->strval != NULL)
+                                    pstack->scalar_val = pstrdup(lex->strval->data);
+                            }
+                            else
+                            {
+                                int         tlen = (lex->token_terminator - lex->token_start);
+
+                                pstack->scalar_val = palloc(tlen + 1);
+                                memcpy(pstack->scalar_val, lex->token_start, tlen);
+                                pstack->scalar_val[tlen] = '\0';
+                            }
+                            pstack->scalar_tok = tok;
+                        }
+                    }
+                    break;
+                case JSON_SEM_SCALAR_CALL:
+                    {
+                        /*
+                         * We'd like to be able to get rid of this business of
+                         * two bits of scalar action, but we can't. It breaks
+                         * certain semantic actions which expect that when
+                         * called the lexer has consumed the item. See for
+                         * example get_scalar() in jsonfuncs.c.
+                         */
+                        json_scalar_action sfunc = sem->scalar;
+
+                        if (sfunc != NULL)
+                        {
+                            result = (*sfunc) (sem->semstate, pstack->scalar_val, pstack->scalar_tok);
+                            if (result != JSON_SUCCESS)
+                                return result;
+                        }
+                    }
+                    break;
+                default:
+                    /* should not happen */
+                    break;
+            }
+        }
+        else
+        {
+            /*
+             * The token didn't match the stack top if it's a terminal nor a
+             * production for the stack top if it's a non-terminal.
+             *
+             * Various cases here are Asserted to be not possible, as the
+             * token would not appear at the top of the prediction stack
+             * unless the lookahead matched.
+             */
+            switch (top)
+            {
+                case JSON_TOKEN_STRING:
+                    if (next_prediction(pstack) == JSON_TOKEN_COLON)
+                        ctx = JSON_PARSE_STRING;
+                    else
+                    {
+                        Assert(false);
+                        ctx = JSON_PARSE_VALUE;
+                    }
+                    break;
+                case JSON_TOKEN_NUMBER:
+                case JSON_TOKEN_TRUE:
+                case JSON_TOKEN_FALSE:
+                case JSON_TOKEN_NULL:
+                case JSON_TOKEN_ARRAY_START:
+                case JSON_TOKEN_OBJECT_START:
+                    Assert(false);
+                    ctx = JSON_PARSE_VALUE;
+                    break;
+                case JSON_TOKEN_ARRAY_END:
+                    Assert(false);
+                    ctx = JSON_PARSE_ARRAY_NEXT;
+                    break;
+                case JSON_TOKEN_OBJECT_END:
+                    Assert(false);
+                    ctx = JSON_PARSE_OBJECT_NEXT;
+                    break;
+                case JSON_TOKEN_COMMA:
+                    Assert(false);
+                    if (next_prediction(pstack) == JSON_TOKEN_STRING)
+                        ctx = JSON_PARSE_OBJECT_NEXT;
+                    else
+                        ctx = JSON_PARSE_ARRAY_NEXT;
+                    break;
+                case JSON_TOKEN_COLON:
+                    ctx = JSON_PARSE_OBJECT_LABEL;
+                    break;
+                case JSON_TOKEN_END:
+                    ctx = JSON_PARSE_END;
+                    break;
+                case JSON_NT_MORE_ARRAY_ELEMENTS:
+                    ctx = JSON_PARSE_ARRAY_NEXT;
+                    break;
+                case JSON_NT_ARRAY_ELEMENTS:
+                    ctx = JSON_PARSE_ARRAY_START;
+                    break;
+                case JSON_NT_MORE_KEY_PAIRS:
+                    ctx = JSON_PARSE_OBJECT_NEXT;
+                    break;
+                case JSON_NT_KEY_PAIRS:
+                    ctx = JSON_PARSE_OBJECT_START;
+                    break;
+                default:
+                    ctx = JSON_PARSE_VALUE;
+            }
+            return report_parse_error(ctx, lex);
+        }
+    }
+
+    return JSON_SUCCESS;
+}
+
+/*
  * Recursive Descent parse routines. There is one for each structural
  * element in a json document:
  *  - scalar (string, number, true, false, null)
@@ -587,6 +1285,18 @@ parse_array(JsonLexContext *lex, JsonSemAction *sem)
 
 /*
  * Lex one token from the input stream.
+ *
+ * When doing incremental parsing, we can reach the end of the input string
+ * without having (or knowing we have) a complete token. If it's not the
+ * final chunk of input, the partial token is then saved to the lex
+ * structure's ptok StringInfo. On subsequent calls input is appended to this
+ * buffer until we have something that we think is a complete token,
+ * which is then lexed using a recursive call to json_lex. Processing then
+ * continues as normal on subsequent calls.
+ *
+ * Note that when doing incremental processing, the lex.prev_token_terminator
+ * should not be relied on. It could point into a previous input chunk or
+ * worse.
  */
 JsonParseErrorType
 json_lex(JsonLexContext *lex)
@@ -595,8 +1305,202 @@ json_lex(JsonLexContext *lex)
     char       *const end = lex->input + lex->input_length;
     JsonParseErrorType result;
 
-   /* Skip leading whitespace. */
+    if (lex->incremental && lex->inc_state->partial_completed)
+    {
+        /*
+         * We just lexed a completed partial token on the last call, so reset
+         * everything
+         */
+        resetStringInfo(&(lex->inc_state->partial_token));
+        lex->token_terminator = lex->input;
+        lex->inc_state->partial_completed = false;
+    }
+
     s = lex->token_terminator;
+
+    if (lex->incremental && lex->inc_state->partial_token.len)
+    {
+        /*
+         * We have a partial token. Extend it and if completed lex it by a
+         * recursive call
+         */
+        StringInfo  ptok = &(lex->inc_state->partial_token);
+        int         added = 0;
+        bool        tok_done = false;
+        JsonLexContext dummy_lex;
+        JsonParseErrorType partial_result;
+
+        if (ptok->data[0] == '"')
+        {
+            /*
+             * It's a string. Accumulate characters until we reach an
+             * unescaped '"'.
+             */
+            int         escapes = 0;
+
+            for (int i = ptok->len - 1; i > 0; i--)
+            {
+                /* count the trailing backslashes on the partial token */
+                if (ptok->data[i] == '\\')
+                    escapes++;
+                else
+                    break;
+            }
+
+            for (int i = 0; i < lex->input_length; i++)
+            {
+                char        c = lex->input[i];
+
+                appendStringInfoCharMacro(ptok, c);
+                added++;
+                if (c == '"' && escapes % 2 == 0)
+                {
+                    tok_done = true;
+                    break;
+                }
+                if (c == '\\')
+                    escapes++;
+                else
+                    escapes = 0;
+            }
+        }
+        else
+        {
+            /* not a string */
+            char        c = ptok->data[0];
+
+            if (c == '-' || (c >= '0' && c <= '9'))
+            {
+                /* for numbers look for possible numeric continuations */
+
+                bool        numend = false;
+
+                for (int i = 0; i < lex->input_length && !numend; i++)
+                {
+                    char        cc = lex->input[i];
+
+                    switch (cc)
+                    {
+                        case '+':
+                        case '-':
+                        case 'e':
+                        case 'E':
+                        case '0':
+                        case '1':
+                        case '2':
+                        case '3':
+                        case '4':
+                        case '5':
+                        case '6':
+                        case '7':
+                        case '8':
+                        case '9':
+                            {
+                                appendStringInfoCharMacro(ptok, cc);
+                                added++;
+                            }
+                            break;
+                        default:
+                            numend = true;
+                    }
+                }
+            }
+
+            /*
+             * Add any remaining alphanumeric chars. This takes care of the
+             * {null, false, true} literals as well as any trailing
+             * alphanumeric junk on non-string tokens.
+             */
+            for (int i = added; i < lex->input_length; i++)
+            {
+                char        cc = lex->input[i];
+
+                if (JSON_ALPHANUMERIC_CHAR(cc))
+                {
+                    appendStringInfoCharMacro(ptok, cc);
+                    added++;
+                }
+                else
+                {
+                    tok_done = true;
+                    break;
+                }
+            }
+            if (added == lex->input_length &&
+                lex->inc_state->is_last_chunk)
+            {
+                tok_done = true;
+            }
+        }
+
+        if (!tok_done)
+        {
+            /* We should have consumed the whole chunk in this case. */
+            Assert(added == lex->input_length);
+
+            if (!lex->inc_state->is_last_chunk)
+                return JSON_INCOMPLETE;
+
+            /* json_errdetail() needs access to the accumulated token. */
+            lex->token_start = ptok->data;
+            lex->token_terminator = ptok->data + ptok->len;
+            return JSON_INVALID_TOKEN;
+        }
+
+        /*
+         * Everything up to lex->input[added] has been added to the partial
+         * token, so move the input past it.
+         */
+        lex->input += added;
+        lex->input_length -= added;
+
+        dummy_lex.input = dummy_lex.token_terminator =
+            dummy_lex.line_start = ptok->data;
+        dummy_lex.line_number = lex->line_number;
+        dummy_lex.input_length = ptok->len;
+        dummy_lex.input_encoding = lex->input_encoding;
+        dummy_lex.incremental = false;
+        dummy_lex.strval = lex->strval;
+
+        partial_result = json_lex(&dummy_lex);
+
+        /*
+         * We either have a complete token or an error. In either case we
+         * need to point to the partial token data for the semantic or error
+         * routines. If it's not an error we'll readjust on the next call to
+         * json_lex.
+         */
+        lex->token_type = dummy_lex.token_type;
+        lex->line_number = dummy_lex.line_number;
+
+        /*
+         * We know the prev_token_terminator must be back in some previous
+         * piece of input, so we just make it NULL.
+         */
+        lex->prev_token_terminator = NULL;
+
+        /*
+         * Normally token_start would be ptok->data, but it could be later,
+         * see json_lex_string's handling of invalid escapes.
+         */
+        lex->token_start = dummy_lex.token_start;
+        lex->token_terminator = dummy_lex.token_terminator;
+        if (partial_result == JSON_SUCCESS)
+        {
+            /* make sure we've used all the input */
+            if (lex->token_terminator - lex->token_start != ptok->len)
+            {
+                Assert(false);
+                return JSON_INVALID_TOKEN;
+            }
+
+            lex->inc_state->partial_completed = true;
+        }
+        return partial_result;
+        /* end of partial token processing */
+    }
+
+    /* Skip leading whitespace. */
     while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
     {
         if (*s++ == '\n')
@@ -708,6 +1612,14 @@ json_lex(JsonLexContext *lex)
                     return JSON_INVALID_TOKEN;
                 }
 
+                if (lex->incremental && !lex->inc_state->is_last_chunk &&
+                    p == lex->input + lex->input_length)
+                {
+                    appendBinaryStringInfo(
+                                           &(lex->inc_state->partial_token), s, end - s);
+                    return JSON_INCOMPLETE;
+                }
+
                 /*
                  * We've got a real alphanumeric token here. If it
                  * happens to be true, false, or null, all is well. If
@@ -732,7 +1644,10 @@ json_lex(JsonLexContext *lex)
             }                   /* end of switch */
     }
 
-    return JSON_SUCCESS;
+    if (lex->incremental && lex->token_type == JSON_TOKEN_END && !lex->inc_state->is_last_chunk)
+        return JSON_INCOMPLETE;
+    else
+        return JSON_SUCCESS;
 }
 
 /*
@@ -754,8 +1669,14 @@ json_lex_string(JsonLexContext *lex)
     int         hi_surrogate = -1;
 
     /* Convenience macros for error exits */
-#define FAIL_AT_CHAR_START(code) \
+#define FAIL_OR_INCOMPLETE_AT_CHAR_START(code) \
     do { \
+        if (lex->incremental && !lex->inc_state->is_last_chunk) \
+        { \
+            appendBinaryStringInfo(&lex->inc_state->partial_token, \
+                                   lex->token_start, end - lex->token_start); \
+            return JSON_INCOMPLETE; \
+        } \
         lex->token_terminator = s; \
         return code; \
     } while (0)
@@ -776,7 +1697,7 @@ json_lex_string(JsonLexContext *lex)
         s++;
         /* Premature end of the string. */
         if (s >= end)
-            FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+            FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
         else if (*s == '"')
             break;
         else if (*s == '\\')
@@ -784,7 +1705,7 @@ json_lex_string(JsonLexContext *lex)
             /* OK, we have an escape character. */
             s++;
             if (s >= end)
-                FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+                FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
             else if (*s == 'u')
             {
                 int         i;
@@ -794,7 +1715,7 @@ json_lex_string(JsonLexContext *lex)
                 {
                     s++;
                     if (s >= end)
-                        FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+                        FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
                     else if (*s >= '0' && *s <= '9')
                         ch = (ch * 16) + (*s - '0');
                     else if (*s >= 'a' && *s <= 'f')
@@ -979,7 +1900,7 @@ json_lex_string(JsonLexContext *lex)
     lex->token_terminator = s + 1;
     return JSON_SUCCESS;
 
-#undef FAIL_AT_CHAR_START
+#undef FAIL_OR_INCOMPLETE_AT_CHAR_START
 #undef FAIL_AT_CHAR_END
 }
 
@@ -1088,7 +2009,14 @@ json_lex_number(JsonLexContext *lex, char *s,
     if (total_len != NULL)
         *total_len = len;
 
-    if (num_err != NULL)
+    if (lex->incremental && !lex->inc_state->is_last_chunk &&
+        len >= lex->input_length)
+    {
+        appendBinaryStringInfo(&lex->inc_state->partial_token,
+                               lex->token_start, s - lex->token_start);
+        return JSON_INCOMPLETE;
+    }
+    else if (num_err != NULL)
    {
         /* let the caller handle any error */
         *num_err = error;
@@ -1174,9 +2102,17 @@ json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
 
     switch (error)
     {
+        case JSON_INCOMPLETE:
         case JSON_SUCCESS:
            /* fall through to the error code after switch */
            break;
+        case JSON_INVALID_LEXER_TYPE:
+            if (lex->incremental)
+                return (_("Recursive descent parser cannot use incremental lexer"));
+            else
+                return (_("Incremental parser requires incremental lexer"));
+        case JSON_NESTING_TOO_DEEP:
+            return (_("JSON nested too deep, maximum permitted depth is 6400"));
         case JSON_ESCAPING_INVALID:
             token_error(lex, "Escape sequence \"\\%.*s\" is invalid.");
             break;
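
To make the table-driven method concrete, here is a hand-worked trace (illustrative only, not part of the patch) of the prediction stack for the input {"a":1}. The stack is written top-first, the semantic-action markers are omitted for brevity, and $ stands for JSON_TOKEN_END:

stack (top on left)                lookahead  action
JSON $                             {          expand via JSON_PROD_OBJECT
{ KEY_PAIRS } $                    {          match '{'
KEY_PAIRS } $                      "a"        expand via JSON_PROD_KEY_PAIRS
string : JSON MORE_KEY_PAIRS } $   "a"        match string
: JSON MORE_KEY_PAIRS } $          :          match ':'
JSON MORE_KEY_PAIRS } $            1          expand via JSON_PROD_SCALAR_NUMBER
number MORE_KEY_PAIRS } $          1          match number
MORE_KEY_PAIRS } $                 }          expand via JSON_PROD_EPSILON
} $                                }          match '}'
$                                  end        match; parse complete

Because the stack is explicit and survives between calls, pg_parse_json_incremental can return JSON_INCOMPLETE mid-document and resume exactly where it left off on the next chunk, which is what the recursive descent parser's call-stack state cannot do.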