Add a test module for the regular expression package.

This module provides a function test_regex() that is functionally rather like regexp_matches(), but with additional debugging-oriented options and additional output. The debug options are somewhat obscure; they are chosen to match the API of the test harness that Henry Spencer wrote way-back-when for use in Tcl. With this, we can import all the test cases that Spencer wrote originally, even for regex functionality that we don't currently expose in Postgres. This seems necessary because we can no longer rely on Tcl to act as upstream and verify any fixes or improvements that we make.

In addition to Spencer's tests, I added a few for lookbehind constraints (which we added in 2015, and Tcl still hasn't absorbed) that are modeled on his tests for lookahead constraints. After looking at code coverage reports, I also threw in a couple of tests to more fully exercise our "high colormap" logic. According to my testing, this brings the check-world coverage for src/backend/regex/ from 71.1% to 86.7% of lines. (coverage.postgresql.org shows a slightly different number, which I think is because it measures a non-assert build.)

Discussion: https://postgr.es/m/2873268.1609732164@sss.pgh.pa.us
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-01-06 10:51:14 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-01-06 10:51:14 -0500
commit: ca8217c10138fa3ffe1e7d1def2484fd0eb78226 (patch)
tree: 65e42fc9ad482246780e2dc1abadb502c68cf517 /src/test/modules/test_regex/test_regex.c
parent: 4656e3d66893f286767285cf74dabb3877068e49 (diff)
download: postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.tar.gz
postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.zip
1 files changed, 759 insertions, 0 deletions
diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c
new file mode 100644
index 00000000000..ad3c6d3b1a6
--- /dev/null
+++ b/src/test/modules/test_regex/test_regex.c
@@ -0,0 +1,759 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_regex.c
+ *		Test harness for the regular expression package.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *		src/test/modules/test_regex/test_regex.c
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "regex/regex.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+
+/* all the options of interest, as decoded by parse_test_flags() */
+typedef struct test_re_flags
+{
+	int			cflags;			/* compile flags passed to pg_regcomp */
+	int			eflags;			/* execute flags passed to pg_regexec */
+	long		info;			/* re_info bits the test expects */
+	bool		glob;			/* 'g': find all matches, not just first */
+	bool		indices;		/* '0': report indices not actual strings */
+	bool		partial;		/* '!': with indices, report even no-match */
+} test_re_flags;
+
+/* cross-call state for test_regex(), built by setup_test_matches() */
+typedef struct test_regex_ctx
+{
+	test_re_flags re_flags;		/* copy of the parsed flags */
+	rm_detail_t details;		/* "details" from last execution */
+	text	   *orig_str;		/* data string in original TEXT form */
+	int			nmatches;		/* number of rows stored in match_locs */
+	int			npatterns;		/* slots per row: whole match + subpatterns */
+	/* We store start char index and end+1 char index for each slot, */
+	/* so the number of entries in match_locs is nmatches * npatterns * 2 */
+	int		   *match_locs;		/* 0-based char indexes, -1 if unset */
+	int			next_match;		/* 0-based index of next match to process */
+	/* workspace for build_test_match_result() */
+	Datum	   *elems;			/* has npatterns+1 elements */
+	bool	   *nulls;			/* has npatterns+1 elements */
+	pg_wchar   *wide_str;		/* wide-char string; NULL if single-byte */
+	char	   *conv_buf;		/* conversion buffer, or NULL */
+	int			conv_bufsiz;	/* size thereof, in bytes */
+} test_regex_ctx;
+
+/* Local functions (test_re_execute is defined before its first use) */
+static void test_re_compile(text *text_re, int cflags, Oid collation,
+							regex_t *result_re);
+static void parse_test_flags(test_re_flags *flags, text *opts);
+static test_regex_ctx *setup_test_matches(text *orig_str,
+										  regex_t *cpattern,
+										  test_re_flags *flags,
+										  Oid collation,
+										  bool use_subpatterns);
+static ArrayType *build_test_info_result(regex_t *cpattern,
+										 test_re_flags *flags);
+static ArrayType *build_test_match_result(test_regex_ctx *matchctx);
+
+
+/*
+ * test_regex(pattern text, string text, flags text) returns setof text[]
+ *
+ * Modeled on regexp.c's regexp_matches, plus debugging additions.  The first
+ * row describes the compiled regex; each later row describes one match.
+ */
+PG_FUNCTION_INFO_V1(test_regex);
+
+Datum
+test_regex(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	test_regex_ctx *matchctx;
+	ArrayType  *result_ary;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *pattern = PG_GETARG_TEXT_PP(0);
+		text	   *flags = PG_GETARG_TEXT_PP(2);
+		Oid			collation = PG_GET_COLLATION();
+		test_re_flags re_flags;
+		regex_t		cpattern;
+		MemoryContext oldcontext;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Decode the flags string into compile/execute options */
+		parse_test_flags(&re_flags, flags);
+
+		/* set up the compiled pattern (throws ERROR if it's invalid) */
+		test_re_compile(pattern, re_flags.cflags, collation, &cpattern);
+
+		/* do all matching now; copy the input string into the multi-call ctx */
+		matchctx = setup_test_matches(PG_GETARG_TEXT_P_COPY(1), &cpattern,
+									  &re_flags,
+									  collation,
+									  true);
+
+		/* Workspace for build_test_match_result (extra slot is for details) */
+		matchctx->elems = (Datum *) palloc(sizeof(Datum) *
+										   (matchctx->npatterns + 1));
+		matchctx->nulls = (bool *) palloc(sizeof(bool) *
+										  (matchctx->npatterns + 1));
+
+		MemoryContextSwitchTo(oldcontext);
+		funcctx->user_fctx = (void *) matchctx;
+
+		/*
+		 * Return the first result row, which is info equivalent to Tcl's
+		 * "regexp -about" output.  All matching is done, so free the regex.
+		 */
+		result_ary = build_test_info_result(&cpattern, &re_flags);
+
+		pg_regfree(&cpattern);
+
+		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+	}
+	else
+	{
+		/* Each subsequent row describes one match saved in match_locs */
+		funcctx = SRF_PERCALL_SETUP();
+		matchctx = (test_regex_ctx *) funcctx->user_fctx;
+
+		if (matchctx->next_match < matchctx->nmatches)
+		{
+			result_ary = build_test_match_result(matchctx);
+			matchctx->next_match++;
+			SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+		}
+	}
+
+	SRF_RETURN_DONE(funcctx);
+}
+
+
+/*
+ * test_re_compile - compile a RE, reporting failure via ereport(ERROR)
+ *
+ *	text_re --- the pattern, expressed as a TEXT object
+ *	cflags --- compile options for the pattern
+ *	collation --- collation to use for LC_CTYPE-dependent behavior
+ *	result_re --- output, compiled RE is stored here
+ *
+ * Pattern is given in the database encoding.  We internally convert to
+ * an array of pg_wchar, which is what Spencer's regex package wants.
+ *
+ * Caller must eventually pg_regfree the resulting RE to avoid memory leaks.
+ */
+static void
+test_re_compile(text *text_re, int cflags, Oid collation,
+				regex_t *result_re)
+{
+	int			text_re_len = VARSIZE_ANY_EXHDR(text_re);
+	char	   *text_re_val = VARDATA_ANY(text_re);
+	pg_wchar   *pattern;
+	int			pattern_len;
+	int			regcomp_result;
+	char		errMsg[100];
+
+	/* Convert pattern string to wide characters (temporary; freed below) */
+	pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
+	pattern_len = pg_mb2wchar_with_len(text_re_val,
+									   pattern,
+									   text_re_len);
+
+	regcomp_result = pg_regcomp(result_re,
+								pattern,
+								pattern_len,
+								cflags,
+								collation);
+
+	pfree(pattern);
+
+	if (regcomp_result != REG_OKAY)
+	{
+		/* re didn't compile (no need for pg_regfree, if so) */
+
+		/*
+		 * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
+		 * before reporting a regex error.  This is so that if the regex
+		 * library aborts and returns REG_CANCEL, we don't print an error
+		 * message that implies the regex was invalid.
+		 */
+		CHECK_FOR_INTERRUPTS();
+
+		pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg));
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+				 errmsg("invalid regular expression: %s", errMsg)));
+	}
+}
+
+/*
+ * test_re_execute - execute a RE on pg_wchar data
+ *
+ * Returns true on match, false on no match; any other result is an ERROR.
+ * Arguments are as for pg_regexec.  *details and pmatch[] are preset to -1.
+ */
+static bool
+test_re_execute(regex_t *re, pg_wchar *data, int data_len,
+				int start_search,
+				rm_detail_t *details,
+				int nmatch, regmatch_t *pmatch,
+				int eflags)
+{
+	int			regexec_result;
+	char		errMsg[100];
+
+	/* Initialize match locations in case engine doesn't */
+	details->rm_extend.rm_so = -1;
+	details->rm_extend.rm_eo = -1;
+	for (int i = 0; i < nmatch; i++)
+	{
+		pmatch[i].rm_so = -1;
+		pmatch[i].rm_eo = -1;
+	}
+
+	/* Perform RE match; the result code is classified below */
+	regexec_result = pg_regexec(re,
+								data,
+								data_len,
+								start_search,
+								details,
+								nmatch,
+								pmatch,
+								eflags);
+
+	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
+	{
+		/* engine failure; check interrupts first, as in test_re_compile */
+		CHECK_FOR_INTERRUPTS();
+		pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+				 errmsg("regular expression failed: %s", errMsg)));
+	}
+
+	return (regexec_result == REG_OKAY);
+}
+
+
+/*
+ * parse_test_flags - parse the flags argument; ERROR on an unknown option
+ *
+ *	flags --- output argument, filled with desired options
+ *	opts --- TEXT object holding one-character options, or NULL for defaults
+ */
+static void
+parse_test_flags(test_re_flags *flags, text *opts)
+{
+	/* these defaults must match Tcl's */
+	int			cflags = REG_ADVANCED;
+	int			eflags = 0;
+	long		info = 0;
+
+	flags->glob = false;
+	flags->indices = false;
+	flags->partial = false;
+
+	if (opts)
+	{
+		char	   *opt_p = VARDATA_ANY(opts);
+		int			opt_len = VARSIZE_ANY_EXHDR(opts);
+		int			i;
+
+		for (i = 0; i < opt_len; i++)
+		{
+			switch (opt_p[i])
+			{
+				case '-':
+					/* allowed, no-op */
+					break;
+				case '!':		/* with indices, report even a non-match */
+					flags->partial = true;
+					break;
+				case '*':
+					/* test requires Unicode --- ignored here */
+					break;
+				case '0':		/* report indices, not strings */
+					flags->indices = true;
+					break;
+
+					/* These flags correspond to user-exposed RE options: */
+				case 'g':		/* global match */
+					flags->glob = true;
+					break;
+				case 'i':		/* case insensitive */
+					cflags |= REG_ICASE;
+					break;
+				case 'n':		/* \n affects ^ $ . [^ */
+					cflags |= REG_NEWLINE;
+					break;
+				case 'p':		/* ~Perl, \n affects . [^ */
+					cflags |= REG_NLSTOP;
+					cflags &= ~REG_NLANCH;
+					break;
+				case 'w':		/* weird, \n affects ^ $ only */
+					cflags &= ~REG_NLSTOP;
+					cflags |= REG_NLANCH;
+					break;
+				case 'x':		/* expanded syntax */
+					cflags |= REG_EXPANDED;
+					break;
+
+					/* These flags correspond to Tcl's -xflags options: */
+				case 'a':
+					cflags |= REG_ADVF;
+					break;
+				case 'b':
+					cflags &= ~REG_ADVANCED;
+					break;
+				case 'c':
+
+					/*
+					 * Tcl calls this TCL_REG_CANMATCH, but it's really
+					 * REG_EXPECT.  In this implementation we must also set
+					 * the partial and indices flags, so that
+					 * setup_test_matches and build_test_match_result will
+					 * emit the desired data.  (They'll emit more fields than
+					 * Tcl would, but that's fine.)
+					 */
+					cflags |= REG_EXPECT;
+					flags->partial = true;
+					flags->indices = true;
+					break;
+				case 'e':
+					cflags &= ~REG_ADVANCED;
+					cflags |= REG_EXTENDED;
+					break;
+				case 'q':
+					cflags &= ~REG_ADVANCED;
+					cflags |= REG_QUOTE;
+					break;
+				case 'o':		/* o for opaque */
+					cflags |= REG_NOSUB;
+					break;
+				case 's':		/* s for start */
+					cflags |= REG_BOSONLY;
+					break;
+				case '+':
+					cflags |= REG_FAKE;
+					break;
+				case ',':
+					cflags |= REG_PROGRESS;
+					break;
+				case '.':
+					cflags |= REG_DUMP;
+					break;
+				case ':':
+					eflags |= REG_MTRACE;
+					break;
+				case ';':
+					eflags |= REG_FTRACE;
+					break;
+				case '^':
+					eflags |= REG_NOTBOL;
+					break;
+				case '$':
+					eflags |= REG_NOTEOL;
+					break;
+				case 't':		/* like 'c', but leaves partial/indices alone */
+					cflags |= REG_EXPECT;
+					break;
+				case '%':
+					eflags |= REG_SMALL;
+					break;
+
+					/* These flags define expected info bits: */
+				case 'A':
+					info |= REG_UBSALNUM;
+					break;
+				case 'B':
+					info |= REG_UBRACES;
+					break;
+				case 'E':
+					info |= REG_UBBS;
+					break;
+				case 'H':
+					info |= REG_ULOOKAROUND;
+					break;
+				case 'I':
+					info |= REG_UIMPOSSIBLE;
+					break;
+				case 'L':
+					info |= REG_ULOCALE;
+					break;
+				case 'M':
+					info |= REG_UUNPORT;
+					break;
+				case 'N':
+					info |= REG_UEMPTYMATCH;
+					break;
+				case 'P':
+					info |= REG_UNONPOSIX;
+					break;
+				case 'Q':
+					info |= REG_UBOUNDS;
+					break;
+				case 'R':
+					info |= REG_UBACKREF;
+					break;
+				case 'S':
+					info |= REG_UUNSPEC;
+					break;
+				case 'T':
+					info |= REG_USHORTEST;
+					break;
+				case 'U':
+					info |= REG_UPBOTCH;
+					break;
+
+				default:
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							 errmsg("invalid regular expression test option: \"%.*s\"",
+									pg_mblen(opt_p + i), opt_p + i)));
+					break;
+			}
+		}
+	}
+	flags->cflags = cflags;
+	flags->eflags = eflags;
+	flags->info = info;
+}
+
+/*
+ * setup_test_matches --- do the initial matching
+ *
+ * To simplify memory management, we do all the matching in one swoop.
+ * The returned test_regex_ctx (palloc'd in the current memory context) holds
+ * the locations of all matches; orig_str is referenced, not copied, and the
+ * elems/nulls workspace is left for the caller to allocate.  "collation" is
+ * currently unused.  Throws ERROR if matching fails or finds too many matches.
+ */
+static test_regex_ctx *
+setup_test_matches(text *orig_str,
+				   regex_t *cpattern, test_re_flags *re_flags,
+				   Oid collation,
+				   bool use_subpatterns)
+{
+	test_regex_ctx *matchctx = palloc0(sizeof(test_regex_ctx));
+	int			eml = pg_database_encoding_max_length();
+	int			orig_len;
+	pg_wchar   *wide_str;
+	int			wide_len;
+	regmatch_t *pmatch;
+	int			pmatch_len;
+	int			array_len;
+	int			array_idx;
+	int			start_search;
+	int			maxlen = 0;		/* largest fetch length in characters */
+
+	/* save flags */
+	matchctx->re_flags = *re_flags;
+
+	/* save original string --- we'll extract result substrings from it */
+	matchctx->orig_str = orig_str;
+
+	/* convert string to pg_wchar form for matching */
+	orig_len = VARSIZE_ANY_EXHDR(orig_str);
+	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
+	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
+
+	/* do we want to remember subpatterns? */
+	if (use_subpatterns && cpattern->re_nsub > 0)
+	{
+		matchctx->npatterns = cpattern->re_nsub + 1;
+		pmatch_len = cpattern->re_nsub + 1;
+	}
+	else
+	{
+		matchctx->npatterns = 1;
+		pmatch_len = 1;
+	}
+
+	/* temporary output space for RE package */
+	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
+
+	/*
+	 * the real output space (grown dynamically if needed).  It must start
+	 * out big enough for one row, since the no-match "partial" stanza below
+	 * stores a row without any enlargement check.  Use sizes 2^n-1, not 2^n,
+	 * so that we hit the limit at 2^28-1 rather than at 2^27.
+	 */
+	array_len = re_flags->glob ? 255 : 31;
+	while (array_len < matchctx->npatterns * 2 + 1)
+		array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
+	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
+	array_idx = 0;
+
+	/* search for the pattern, perhaps repeatedly */
+	start_search = 0;
+	while (test_re_execute(cpattern, wide_str, wide_len,
+						   start_search,
+						   &matchctx->details,
+						   pmatch_len, pmatch,
+						   re_flags->eflags))
+	{
+		/* enlarge output space if needed */
+		while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
+		{
+			array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
+			if (array_len > MaxAllocSize / sizeof(int))
+				ereport(ERROR,
+						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+						 errmsg("too many regular expression matches")));
+			matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
+													sizeof(int) * array_len);
+		}
+
+		/* save this match's locations */
+		for (int i = 0; i < matchctx->npatterns; i++)
+		{
+			int			so = pmatch[i].rm_so;
+			int			eo = pmatch[i].rm_eo;
+
+			matchctx->match_locs[array_idx++] = so;
+			matchctx->match_locs[array_idx++] = eo;
+			if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+				maxlen = (eo - so);
+		}
+		matchctx->nmatches++;
+
+		/* if not glob, stop after one match */
+		if (!re_flags->glob)
+			break;
+
+		/*
+		 * Advance search position.  Normally we start the next search at the
+		 * end of the previous match; but if the match was of zero length, we
+		 * have to advance by one character, or we'd just find the same match
+		 * again.
+		 */
+		start_search = pmatch[0].rm_eo;
+		if (pmatch[0].rm_so == pmatch[0].rm_eo)
+			start_search++;
+		if (start_search > wide_len)
+			break;
+	}
+
+	/*
+	 * If we had no match, but "partial" and "indices" are set, emit the
+	 * details.  The initial array sizing guarantees room for this one row.
+	 */
+	if (matchctx->nmatches == 0 && re_flags->partial && re_flags->indices)
+	{
+		matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_so;
+		matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_eo;
+		/* we don't have pmatch data, so emit -1 */
+		for (int i = 1; i < matchctx->npatterns; i++)
+		{
+			matchctx->match_locs[array_idx++] = -1;
+			matchctx->match_locs[array_idx++] = -1;
+		}
+		matchctx->nmatches++;
+	}
+
+	if (eml > 1)
+	{
+		int64		maxsiz = eml * (int64) maxlen;
+		int			conv_bufsiz;
+
+		/*
+		 * Make the conversion buffer large enough for any substring of
+		 * interest.
+		 *
+		 * Worst case: assume we need the maximum size (maxlen*eml), but take
+		 * advantage of the fact that the original string length in bytes is
+		 * an upper bound on the byte length of any fetched substring (and we
+		 * know that len+1 is safe to allocate because the varlena header is
+		 * longer than 1 byte).
+		 */
+		if (maxsiz > orig_len)
+			conv_bufsiz = orig_len + 1;
+		else
+			conv_bufsiz = maxsiz + 1;	/* safe since maxsiz < 2^30 */
+
+		matchctx->conv_buf = palloc(conv_bufsiz);
+		matchctx->conv_bufsiz = conv_bufsiz;
+		matchctx->wide_str = wide_str;
+	}
+	else
+	{
+		/* No need to keep the wide string if we're in a single-byte charset. */
+		pfree(wide_str);
+		matchctx->wide_str = NULL;
+		matchctx->conv_buf = NULL;
+		matchctx->conv_bufsiz = 0;
+	}
+
+	/* Clean up temp storage */
+	pfree(pmatch);
+
+	return matchctx;
+}
+
+/*
+ * build_test_info_result - build output array describing compiled regexp
+ *
+ * Borrows from Tcl's TclRegAbout().  Result is re_nsub, then info bit names.
+ */
+static ArrayType *
+build_test_info_result(regex_t *cpattern, test_re_flags *flags)
+{
+	/* Translation data for flag bits in regex_t.re_info */
+	struct infoname
+	{
+		int			bit;
+		const char *text;
+	};
+	static const struct infoname infonames[] = {
+		{REG_UBACKREF, "REG_UBACKREF"},
+		{REG_ULOOKAROUND, "REG_ULOOKAROUND"},
+		{REG_UBOUNDS, "REG_UBOUNDS"},
+		{REG_UBRACES, "REG_UBRACES"},
+		{REG_UBSALNUM, "REG_UBSALNUM"},
+		{REG_UPBOTCH, "REG_UPBOTCH"},
+		{REG_UBBS, "REG_UBBS"},
+		{REG_UNONPOSIX, "REG_UNONPOSIX"},
+		{REG_UUNSPEC, "REG_UUNSPEC"},
+		{REG_UUNPORT, "REG_UUNPORT"},
+		{REG_ULOCALE, "REG_ULOCALE"},
+		{REG_UEMPTYMATCH, "REG_UEMPTYMATCH"},
+		{REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"},
+		{REG_USHORTEST, "REG_USHORTEST"},
+		{0, NULL}
+	};
+	const struct infoname *inf;
+	Datum		elems[lengthof(infonames) + 1];	/* re_nsub + one per bit */
+	int			nresults = 0;
+	char		buf[80];
+	int			dims[1];
+	int			lbs[1];
+
+	/* Set up results: first, the number of subexpressions */
+	snprintf(buf, sizeof(buf), "%d", (int) cpattern->re_nsub);
+	elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
+
+	/* Report each info bit that is set or expected, flagging mismatches */
+	for (inf = infonames; inf->bit != 0; inf++)
+	{
+		if (cpattern->re_info & inf->bit)
+		{
+			if (flags->info & inf->bit)
+				elems[nresults++] = PointerGetDatum(cstring_to_text(inf->text));
+			else
+			{
+				snprintf(buf, sizeof(buf), "unexpected %s!", inf->text);
+				elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
+			}
+		}
+		else
+		{
+			if (flags->info & inf->bit)
+			{
+				snprintf(buf, sizeof(buf), "missing %s!", inf->text);
+				elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
+			}
+		}
+	}
+
+	/* And form a one-dimensional text array with lower bound 1 */
+	dims[0] = nresults;
+	lbs[0] = 1;
+	/* XXX: this hardcodes assumptions about the text type */
+	return construct_md_array(elems, NULL, 1, dims, lbs,
+							  TEXTOID, -1, false, TYPALIGN_INT);
+}
+
+/*
+ * build_test_match_result - build output array for current match
+ *
+ * Emits one element per pattern slot: "so eo" text if the indices flag is
+ * set, else the matched substring (or NULL).  Uses matchctx->elems/nulls.
+ */
+static ArrayType *
+build_test_match_result(test_regex_ctx *matchctx)
+{
+	char	   *buf = matchctx->conv_buf;
+	Datum	   *elems = matchctx->elems;
+	bool	   *nulls = matchctx->nulls;
+	bool		indices = matchctx->re_flags.indices;
+	char		bufstr[80];
+	int			dims[1];
+	int			lbs[1];
+	int			loc;
+	int			i;
+
+	/* Walk this match's (so, eo) pairs, which start at match_locs[loc] */
+	loc = matchctx->next_match * matchctx->npatterns * 2;
+	for (i = 0; i < matchctx->npatterns; i++)
+	{
+		int			so = matchctx->match_locs[loc++];
+		int			eo = matchctx->match_locs[loc++];
+
+		if (indices)
+		{
+			/* Report inclusive end (eo - 1) for consistency with Tcl */
+			snprintf(bufstr, sizeof(bufstr), "%d %d",
+					 so, so < 0 ? eo : eo - 1);
+			elems[i] = PointerGetDatum(cstring_to_text(bufstr));
+			nulls[i] = false;
+		}
+		else if (so < 0 || eo < 0)
+		{
+			elems[i] = (Datum) 0;
+			nulls[i] = true;	/* no location recorded => SQL NULL */
+		}
+		else if (buf)			/* have conversion buffer: use wide_str */
+		{
+			int			len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+												   buf,
+												   eo - so);
+
+			Assert(len < matchctx->conv_bufsiz);
+			elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+			nulls[i] = false;
+		}
+		else
+		{
+			elems[i] = DirectFunctionCall3(text_substr,
+										   PointerGetDatum(matchctx->orig_str),
+										   Int32GetDatum(so + 1),
+										   Int32GetDatum(eo - so));
+			nulls[i] = false;
+		}
+	}
+
+	/* In EXPECT indices mode, also report the "details" in the extra slot */
+	if (indices && (matchctx->re_flags.cflags & REG_EXPECT))
+	{
+		int			so = matchctx->details.rm_extend.rm_so;
+		int			eo = matchctx->details.rm_extend.rm_eo;
+
+		snprintf(bufstr, sizeof(bufstr), "%d %d",
+				 so, so < 0 ? eo : eo - 1);
+		elems[i] = PointerGetDatum(cstring_to_text(bufstr));
+		nulls[i] = false;
+		i++;
+	}
+
+	/* And form a one-dimensional text array with lower bound 1 */
+	dims[0] = i;
+	lbs[0] = 1;
+	/* XXX: this hardcodes assumptions about the text type */
+	return construct_md_array(elems, nulls, 1, dims, lbs,
+							  TEXTOID, -1, false, TYPALIGN_INT);
+}
author	Tom Lane <tgl@sss.pgh.pa.us>	2021-01-06 10:51:14 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2021-01-06 10:51:14 -0500
commit	ca8217c10138fa3ffe1e7d1def2484fd0eb78226 (patch)
tree	65e42fc9ad482246780e2dc1abadb502c68cf517 /src/test/modules/test_regex/test_regex.c
parent	4656e3d66893f286767285cf74dabb3877068e49 (diff)
download	postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.tar.gz postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.zip