aboutsummaryrefslogtreecommitdiff
path: root/src/test/modules/test_json_parser/test_json_parser_incremental.c
diff options
context:
space:
mode:
authorAndrew Dunstan <andrew@dunslane.net>2024-03-10 23:10:14 -0400
committerAndrew Dunstan <andrew@dunslane.net>2024-04-04 06:46:40 -0400
commit3311ea86edc7a689614bad754e17371865cdc11f (patch)
tree7c9d55385afb9b21a8c790a64b1b9ea8eedff90e /src/test/modules/test_json_parser/test_json_parser_incremental.c
parent585df02b445f63167f145685e045e5b6074a5a30 (diff)
downloadpostgresql-3311ea86edc7a689614bad754e17371865cdc11f.tar.gz
postgresql-3311ea86edc7a689614bad754e17371865cdc11f.zip
Introduce a non-recursive JSON parser
This parser uses an explicit prediction stack, unlike the present recursive descent parser where the parser state is represented on the call stack. This difference makes the new parser suitable for use in incremental parsing of huge JSON documents that cannot be conveniently handled piece-wise by the recursive descent parser. One potential use for this will be in parsing large backup manifests associated with incremental backups. Because this parser is somewhat slower than the recursive descent parser, it is not replacing that parser, but is an additional parser available to callers. For testing purposes, if the build is done with -DFORCE_JSON_PSTACK, all JSON parsing is done with the non-recursive parser, in which case only trivial regression differences in error messages should be observed. Author: Andrew Dunstan Reviewed-By: Jacob Champion Discussion: https://postgr.es/m/7b0a51d6-0d9d-7366-3a1a-f74397a02f55@dunslane.net
Diffstat (limited to 'src/test/modules/test_json_parser/test_json_parser_incremental.c')
-rw-r--r--src/test/modules/test_json_parser/test_json_parser_incremental.c320
1 files changed, 320 insertions, 0 deletions
diff --git a/src/test/modules/test_json_parser/test_json_parser_incremental.c b/src/test/modules/test_json_parser/test_json_parser_incremental.c
new file mode 100644
index 00000000000..c28db05647c
--- /dev/null
+++ b/src/test/modules/test_json_parser/test_json_parser_incremental.c
@@ -0,0 +1,320 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_json_parser_incremental.c
+ * Test program for incremental JSON parser
+ *
+ * Copyright (c) 2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/test/modules/test_json_parser/test_json_parser_incremental.c
+ *
+ * This program tests incremental parsing of JSON. The input is fed into
+ * the parser in very small chunks. In practice you would normally use
+ * much larger chunks, but doing this makes it more likely that the
+ * full range of increment handling, especially in the lexer, is exercised.
+ * If the "-c SIZE" option is provided, that chunk size is used instead.
+ *
+ * The argument specifies the file containing the JSON input.
+ *
+ *-------------------------------------------------------------------------
+ */
+
#include "postgres_fe.h"

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "common/jsonapi.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "pg_getopt.h"
+
+typedef struct DoState
+{
+ JsonLexContext *lex;
+ bool elem_is_first;
+ StringInfo buf;
+} DoState;
+
+static void usage(const char *progname);
+static void escape_json(StringInfo buf, const char *str);
+
+/* semantic action functions for parser */
+static JsonParseErrorType do_object_start(void *state);
+static JsonParseErrorType do_object_end(void *state);
+static JsonParseErrorType do_object_field_start(void *state, char *fname, bool isnull);
+static JsonParseErrorType do_object_field_end(void *state, char *fname, bool isnull);
+static JsonParseErrorType do_array_start(void *state);
+static JsonParseErrorType do_array_end(void *state);
+static JsonParseErrorType do_array_element_start(void *state, bool isnull);
+static JsonParseErrorType do_array_element_end(void *state, bool isnull);
+static JsonParseErrorType do_scalar(void *state, char *token, JsonTokenType tokentype);
+
+JsonSemAction sem = {
+ .object_start = do_object_start,
+ .object_end = do_object_end,
+ .object_field_start = do_object_field_start,
+ .object_field_end = do_object_field_end,
+ .array_start = do_array_start,
+ .array_end = do_array_end,
+ .array_element_start = do_array_element_start,
+ .array_element_end = do_array_element_end,
+ .scalar = do_scalar
+};
+
+int
+main(int argc, char **argv)
+{
+ /* max delicious line length is less than this */
+ char buff[6001];
+ FILE *json_file;
+ JsonParseErrorType result;
+ JsonLexContext lex;
+ StringInfoData json;
+ int n_read;
+ size_t chunk_size = 60;
+ struct stat statbuf;
+ off_t bytes_left;
+ JsonSemAction *testsem = &nullSemAction;
+ char *testfile;
+ int c;
+ bool need_strings = false;
+
+ while ((c = getopt(argc, argv, "c:s")) != -1)
+ {
+ switch (c)
+ {
+ case 'c': /* chunksize */
+ sscanf(optarg, "%zu", &chunk_size);
+ break;
+ case 's': /* do semantic processing */
+ testsem = &sem;
+ sem.semstate = palloc(sizeof(struct DoState));
+ ((struct DoState *) sem.semstate)->lex = &lex;
+ ((struct DoState *) sem.semstate)->buf = makeStringInfo();
+ need_strings = true;
+ break;
+ }
+ }
+
+ if (optind < argc)
+ {
+ testfile = pg_strdup(argv[optind]);
+ optind++;
+ }
+ else
+ {
+ usage(argv[0]);
+ exit(1);
+ }
+
+ makeJsonLexContextIncremental(&lex, PG_UTF8, need_strings);
+ initStringInfo(&json);
+
+ json_file = fopen(testfile, "r");
+ fstat(fileno(json_file), &statbuf);
+ bytes_left = statbuf.st_size;
+
+ for (;;)
+ {
+ n_read = fread(buff, 1, chunk_size, json_file);
+ appendBinaryStringInfo(&json, buff, n_read);
+ appendStringInfoString(&json, "1+23 trailing junk");
+ bytes_left -= n_read;
+ if (bytes_left > 0)
+ {
+ result = pg_parse_json_incremental(&lex, testsem,
+ json.data, n_read,
+ false);
+ if (result != JSON_INCOMPLETE)
+ {
+ fprintf(stderr, "%s\n", json_errdetail(result, &lex));
+ exit(1);
+ }
+ resetStringInfo(&json);
+ }
+ else
+ {
+ result = pg_parse_json_incremental(&lex, testsem,
+ json.data, n_read,
+ true);
+ if (result != JSON_SUCCESS)
+ {
+ fprintf(stderr, "%s\n", json_errdetail(result, &lex));
+ exit(1);
+ }
+ if (!need_strings)
+ printf("SUCCESS!\n");
+ break;
+ }
+ }
+ fclose(json_file);
+ exit(0);
+}
+
+/*
+ * The semantic routines here essentially just output the same json, except
+ * for white space. We could pretty print it but there's no need for our
+ * purposes. The result should be able to be fed to any JSON processor
+ * such as jq for validation.
+ */
+
+static JsonParseErrorType
+do_object_start(void *state)
+{
+ DoState *_state = (DoState *) state;
+
+ printf("{\n");
+ _state->elem_is_first = true;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_object_end(void *state)
+{
+ DoState *_state = (DoState *) state;
+
+ printf("\n}\n");
+ _state->elem_is_first = false;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_object_field_start(void *state, char *fname, bool isnull)
+{
+ DoState *_state = (DoState *) state;
+
+ if (!_state->elem_is_first)
+ printf(",\n");
+ resetStringInfo(_state->buf);
+ escape_json(_state->buf, fname);
+ printf("%s: ", _state->buf->data);
+ _state->elem_is_first = false;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_object_field_end(void *state, char *fname, bool isnull)
+{
+ /* nothing to do really */
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_array_start(void *state)
+{
+ DoState *_state = (DoState *) state;
+
+ printf("[\n");
+ _state->elem_is_first = true;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_array_end(void *state)
+{
+ DoState *_state = (DoState *) state;
+
+ printf("\n]\n");
+ _state->elem_is_first = false;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_array_element_start(void *state, bool isnull)
+{
+ DoState *_state = (DoState *) state;
+
+ if (!_state->elem_is_first)
+ printf(",\n");
+ _state->elem_is_first = false;
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_array_element_end(void *state, bool isnull)
+{
+ /* nothing to do */
+
+ return JSON_SUCCESS;
+}
+
+static JsonParseErrorType
+do_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+ DoState *_state = (DoState *) state;
+
+ if (tokentype == JSON_TOKEN_STRING)
+ {
+ resetStringInfo(_state->buf);
+ escape_json(_state->buf, token);
+ printf("%s", _state->buf->data);
+ }
+ else
+ printf("%s", token);
+
+ return JSON_SUCCESS;
+}
+
+
+/* copied from backend code */
+static void
+escape_json(StringInfo buf, const char *str)
+{
+ const char *p;
+
+ appendStringInfoCharMacro(buf, '"');
+ for (p = str; *p; p++)
+ {
+ switch (*p)
+ {
+ case '\b':
+ appendStringInfoString(buf, "\\b");
+ break;
+ case '\f':
+ appendStringInfoString(buf, "\\f");
+ break;
+ case '\n':
+ appendStringInfoString(buf, "\\n");
+ break;
+ case '\r':
+ appendStringInfoString(buf, "\\r");
+ break;
+ case '\t':
+ appendStringInfoString(buf, "\\t");
+ break;
+ case '"':
+ appendStringInfoString(buf, "\\\"");
+ break;
+ case '\\':
+ appendStringInfoString(buf, "\\\\");
+ break;
+ default:
+ if ((unsigned char) *p < ' ')
+ appendStringInfo(buf, "\\u%04x", (int) *p);
+ else
+ appendStringInfoCharMacro(buf, *p);
+ break;
+ }
+ }
+ appendStringInfoCharMacro(buf, '"');
+}
+