1 files changed, 331 insertions, 290 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 7f140ddfdcb..05c00deaf98 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.71 2007/03/28 22:59:37 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.72 2007/08/11 03:56:24 tgl Exp $
  *
  *		Alistair Crooks added the code for the regex caching
  *		agc - cached the regular expressions used - there's a good chance
@@ -29,19 +29,42 @@
  */
 #include "postgres.h"
 
-#include "access/heapam.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "regex/regex.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
-#include "utils/lsyscache.h"
+
+#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
+	(PG_NARGS() > (_n) ? PG_GETARG_TEXT_P(_n) : NULL)
 
 
 /* GUC-settable flavor parameter */
 static int	regex_flavor = REG_ADVANCED;
 
 
+/* all the options of interest for regex functions; see parse_re_flags() */
+typedef struct pg_re_flags
+{
+	int			cflags;			/* REG_xxx compile flags for Spencer's regex code */
+	bool		glob;			/* process every match, not just the first one */
+} pg_re_flags;
+
+/* cross-call state for regexp_matches(), also the regexp_split functions */
+typedef struct regexp_matches_ctx
+{
+	text	   *orig_str;		/* data string in original TEXT form */
+	int			nmatches;		/* number of matches recorded in match_locs */
+	int			npatterns;		/* index pairs per match: # of subpatterns, or 1 */
+	/* For each recorded match we store npatterns (start, end+1) index pairs, */
+	/* so the number of entries in match_locs is nmatches * npatterns * 2 */
+	int		   *match_locs;		/* 0-based character indexes */
+	int			next_match;		/* 0-based index of next match to process */
+	/* workspace for build_regexp_matches_result(), set by regexp_matches() */
+	Datum	   *elems;			/* has npatterns elements */
+	bool	   *nulls;			/* has npatterns elements */
+} regexp_matches_ctx;
+
 /*
  * We cache precompiled regular expressions using a "self organizing list"
  * structure, in which recently-used items tend to be near the front.
@@ -79,48 +102,18 @@ typedef struct cached_re_str
 	regex_t		cre_re;			/* the compiled regular expression */
 } cached_re_str;
 
-typedef struct re_comp_flags
-{
-	int			  cflags;
-	bool		  glob;
-} re_comp_flags;
-
-typedef struct regexp_matches_ctx
-{
-	text		 *orig_str;
-	size_t		  orig_len;
-	pg_wchar	 *wide_str;
-	size_t		  wide_len;
-	regex_t		 *cpattern;
-	regmatch_t	 *pmatch;
-	size_t		  offset;
-
-	re_comp_flags flags;
-} regexp_matches_ctx;
-
-typedef struct regexp_split_ctx
-{
-	text		 *orig_str;
-	size_t		  orig_len;
-	pg_wchar	 *wide_str;
-	size_t		  wide_len;
-	regex_t		 *cpattern;
-	regmatch_t	  match;
-	size_t		  offset;
-	re_comp_flags flags;
-} regexp_split_ctx;
-
-
 static int	num_res = 0;		/* # of cached re's */
 static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
 
-static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
-												text *flags);
-static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx);
 
-static regexp_split_ctx *setup_regexp_split(text *str, text *pattern,
-											text *flags);
-static Datum get_next_split(regexp_split_ctx *splitctx);
+/* Local functions */
+static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
+												text *flags,
+												bool force_glob,
+												bool use_subpatterns,
+												bool ignore_degenerate);
+static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 
 
 /*
@@ -139,7 +132,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 {
 	int			text_re_len = VARSIZE(text_re);
 	pg_wchar   *pattern;
-	size_t		pattern_len;
+	int			pattern_len;
 	int			i;
 	int			regcomp_result;
 	cached_re_str re_temp;
@@ -235,7 +228,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 }
 
 /*
- * RE_wchar_execute - execute a RE
+ * RE_wchar_execute - execute a RE on pg_wchar data
  *
  * Returns TRUE on match, FALSE on no match
  *
@@ -250,7 +243,7 @@ RE_compile_and_cache(text *text_re, int cflags)
  */
 static bool
 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
-				 size_t start_search, int nmatch, regmatch_t *pmatch)
+				 int start_search, int nmatch, regmatch_t *pmatch)
 {
 	int			regexec_result;
 	char		errMsg[100];
@@ -295,7 +288,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 		   int nmatch, regmatch_t *pmatch)
 {
 	pg_wchar   *data;
-	size_t		data_len;
+	int			data_len;
 	bool		match;
 
 	/* Convert data string to wide characters */
@@ -304,6 +297,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 
 	/* Perform RE match and return result */
 	match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
+
 	pfree(data);
 	return match;
 }
@@ -334,17 +328,28 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 	return RE_execute(re, dat, dat_len, nmatch, pmatch);
 }
 
+
+/*
+ * parse_re_flags - parse the options argument of regexp_matches and friends
+ *
+ *	flags --- output argument, filled with desired options
+ *	opts --- *untoasted* TEXT object, or NULL for defaults
+ *
+ * This accepts all the options allowed by any of the callers; callers that
+ * don't want some have to reject them after the fact.
+ */
 static void
-parse_re_comp_flags(re_comp_flags *flags, text *opts)
+parse_re_flags(pg_re_flags *flags, text *opts)
 {
-	MemSet(flags, 0, sizeof(re_comp_flags));
+	/* regex_flavor is always folded into the compile flags */
 	flags->cflags = regex_flavor;
+	flags->glob = false;
 
 	if (opts)
 	{
-		char  *opt_p = VARDATA(opts);
-		size_t opt_len = VARSIZE(opts) - VARHDRSZ;
-		int i;
+		char   *opt_p = VARDATA(opts);
+		int		opt_len = VARSIZE(opts) - VARHDRSZ;
+		int		i;
 
 		for (i = 0; i < opt_len; i++)
 		{
@@ -353,28 +358,49 @@ parse_re_comp_flags(re_comp_flags *flags, text *opts)
 				case 'g':
 					flags->glob = true;
 					break;
-				case 'i':
+				case 'b':	/* BREs (but why???) */
+					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
+					break;
+				case 'c':	/* case sensitive */
+					flags->cflags &= ~REG_ICASE;
+					break;
+				case 'e':	/* plain EREs */
+					flags->cflags |= REG_EXTENDED;
+					flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+					break;
+				case 'i':	/* case insensitive */
 					flags->cflags |= REG_ICASE;
 					break;
-				case 'm':
-				case 'n':
+				case 'm':	/* Perloid synonym for n */
+				case 'n':	/* \n affects ^ $ . [^ */
 					flags->cflags |= REG_NEWLINE;
 					break;
-				case 'p':
+				case 'p':	/* ~Perl, \n affects . [^ */
 					flags->cflags |= REG_NLSTOP;
 					flags->cflags &= ~REG_NLANCH;
 					break;
-				case 'w':
+				case 'q':	/* literal string */
+					flags->cflags |= REG_QUOTE;
+					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
+					break;
+				case 's':	/* single line, \n ordinary */
+					flags->cflags &= ~REG_NEWLINE;
+					break;
+				case 't':	/* tight syntax */
+					flags->cflags &= ~REG_EXPANDED;
+					break;
+				case 'w':	/* weird, \n affects ^ $ only */
 					flags->cflags &= ~REG_NLSTOP;
 					flags->cflags |= REG_NLANCH;
 					break;
-				case 'x':
+				case 'x':	/* expanded syntax */
 					flags->cflags |= REG_EXPANDED;
 					break;
 				default:
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-							 errmsg("invalid regexp option: %c", opt_p[i])));
+							 errmsg("invalid regexp option: \"%c\"",
+									opt_p[i])));
 					break;
 			}
 		}
@@ -410,6 +436,16 @@ assign_regex_flavor(const char *value, bool doit, GucSource source)
 
 
 /*
+ * regex_flavor_is_basic - report whether the regex_flavor GUC is REG_BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+	return (regex_flavor == REG_BASIC);
+}
+
+
+/*
  *	interface routines called by the function manager
  */
 
@@ -605,16 +641,17 @@ textregexreplace(PG_FUNCTION_ARGS)
 	text	   *r = PG_GETARG_TEXT_P(2);
 	text	   *opt = PG_GETARG_TEXT_P(3);
 	regex_t    *re;
-	re_comp_flags flags;
+	pg_re_flags flags;
 
-	parse_re_comp_flags(&flags, opt);
+	parse_re_flags(&flags, opt);
 
 	re = RE_compile_and_cache(p, flags.cflags);
 
 	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 }
 
-/* similar_escape()
+/*
+ * similar_escape()
  * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
  * our regexp engine.
  */
@@ -735,185 +772,255 @@ similar_escape(PG_FUNCTION_ARGS)
 	PG_RETURN_TEXT_P(result);
 }
 
-#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
-	(PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL)
-
+/*
+ * regexp_matches()
+ *		Return a table of matches of a pattern within a string.
+ */
 Datum
 regexp_matches(PG_FUNCTION_ARGS)
 {
 	FuncCallContext		*funcctx;
-	MemoryContext		 oldcontext;
 	regexp_matches_ctx	*matchctx;
 
 	if (SRF_IS_FIRSTCALL())
 	{
 		text *pattern = PG_GETARG_TEXT_P(1);
 		text *flags   = PG_GETARG_TEXT_P_IF_EXISTS(2);
+		MemoryContext		 oldcontext;
 
 		funcctx = SRF_FIRSTCALL_INIT();
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
 		/* be sure to copy the input string into the multi-call ctx */
 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags);
+										flags, false, true, false);
+
+		/* Pre-create workspace that build_regexp_matches_result needs */
+		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
 
 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) matchctx;
-
-		/*
-		 * Avoid run-away function by making sure we never iterate
-		 * more than the length of the text + 1 (the number of matches
-		 * an empty pattern will make is length + 1)
-		 */
-		if (matchctx->flags.glob)
-			funcctx->max_calls = matchctx->wide_len + 1;
-		else
-			funcctx->max_calls = 0;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
 	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
 
-	if (funcctx->call_cntr > funcctx->max_calls)
-	{
-		/*
-		 * If max_calls == 0, then we are doing a non-global match, we
-		 * should stop now, no problem.  Otherwise, if we exceed
-		 * max_calls something really wonky is going on, since it is
-		 * returning more matches than there are characters in the
-		 * string, which should not happen
-		 */
-		if (funcctx->max_calls != 0)
-			elog(ERROR, "set returning match function terminated after iterating %d times",
-				 funcctx->call_cntr);
-
-		SRF_RETURN_DONE(funcctx);
-	}
-
-	if (matchctx->offset < matchctx->wide_len)
+	if (matchctx->next_match < matchctx->nmatches)
 	{
 		ArrayType *result_ary;
 
-		if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo)
-			matchctx->offset++;
-
-		result_ary = perform_regexp_matches(matchctx);
-		if (result_ary != NULL)
-		{
-			matchctx->offset = matchctx->pmatch[0].rm_eo;
-			SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
-		}
-		/* else fall through and return done */
+		result_ary = build_regexp_matches_result(matchctx);
+		matchctx->next_match++;
+		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
 	}
 
 	SRF_RETURN_DONE(funcctx);
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum
 regexp_matches_no_flags(PG_FUNCTION_ARGS)
 {
 	return regexp_matches(fcinfo);
 }
 
+/*
+ * setup_regexp_matches --- do the initial matching for regexp_matches()
+ *		or regexp_split()
+ *
+ * To avoid having to re-find the compiled pattern on each call, we do
+ * all the matching in one swoop.  The returned regexp_matches_ctx contains
+ * the locations of all the substrings matching the pattern.
+ *
+ * The three bool parameters are passed in only two combinations (one for
+ * regexp_matches, one for the split functions), but separate flags seem
+ * clearer than keying everything off a single "is_split" flag.
+ */
 static regexp_matches_ctx *
-setup_regexp_matches(text *orig_str, text *pattern, text *flags)
+setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+					 bool force_glob, bool use_subpatterns,
+					 bool ignore_degenerate)
 {
-	regexp_matches_ctx	*matchctx = palloc(sizeof(regexp_matches_ctx));
-
+	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+	int			orig_len;
+	pg_wchar   *wide_str;
+	int			wide_len;
+	pg_re_flags	re_flags;
+	regex_t	   *cpattern;
+	regmatch_t *pmatch;
+	int			pmatch_len;
+	int			array_len;
+	int			array_idx;
+	int			prev_match_end;
+	int			start_search;
+
+	/* save original string --- we'll extract result substrings from it */
 	matchctx->orig_str = orig_str;
-	matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ;
-
-	parse_re_comp_flags(&matchctx->flags, flags);
 
-	matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags);
-	matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1));
-	matchctx->offset = 0;
+	/* convert string to pg_wchar form for matching */
+	orig_len = VARSIZE(orig_str) - VARHDRSZ;
+	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
+	wide_len = pg_mb2wchar_with_len(VARDATA(orig_str), wide_str, orig_len);
 
-	matchctx->wide_str = palloc(sizeof(pg_wchar) * (matchctx->orig_len + 1));
-	matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str),
-											  matchctx->wide_str, matchctx->orig_len);
-
-	matchctx->pmatch[0].rm_so = -1;
-	/* both < 0 but not equal */
-	matchctx->pmatch[0].rm_eo = -2;
+	/* determine options */
+	parse_re_flags(&re_flags, flags);
+	if (force_glob)
+	{
+		/* user mustn't specify 'g' for regexp_split */
+		if (re_flags.glob)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("regexp_split does not support the global option")));
+		/* but we find all the matches anyway */
+		re_flags.glob = true;
+	}
 
-	return matchctx;
-}
+	/* set up the compiled pattern */
+	cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
 
-static ArrayType *
-perform_regexp_matches(regexp_matches_ctx *matchctx)
-{
-	Datum 		*elems;
-	bool 		*nulls;
-	Datum 		 fullmatch;		/* used to avoid a palloc if no matches */
-	int 		 ndims = 1;
-	int 		 dims[1];
-	int          lbs[1] = {1};
-
-	if (RE_wchar_execute(matchctx->cpattern,
-						 matchctx->wide_str,
-						 matchctx->wide_len,
-						 matchctx->offset,
-						 matchctx->cpattern->re_nsub + 1,
-						 matchctx->pmatch) == false)
-		return NULL;
-
-	if (matchctx->cpattern->re_nsub > 0)
+	/* do we want to remember subpatterns? */
+	if (use_subpatterns && cpattern->re_nsub > 0)
 	{
-		int i;
+		matchctx->npatterns = cpattern->re_nsub;
+		pmatch_len = cpattern->re_nsub + 1;
+	}
+	else
+	{
+		use_subpatterns = false;
+		matchctx->npatterns = 1;
+		pmatch_len = 1;
+	}
 
-		elems = palloc(sizeof(Datum) * matchctx->cpattern->re_nsub);
-		nulls = palloc(sizeof(bool) * matchctx->cpattern->re_nsub);
-		dims[0] = matchctx->cpattern->re_nsub;
+	/* temporary output space for RE package */
+	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
 
-		for (i = 0; i < matchctx->cpattern->re_nsub; i++)
+	/* the real output space (grown dynamically if needed) */
+	array_len = re_flags.glob ? 256 : 32;
+	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
+	array_idx = 0;
+
+	/* search for the pattern, perhaps repeatedly */
+	prev_match_end = 0;
+	start_search = 0;
+	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
+							pmatch_len, pmatch))
+	{
+		/*
+		 * If requested, ignore degenerate matches, which are zero-length
+		 * matches occurring at the start or end of a string or just after
+		 * a previous match.
+		 */
+		if (!ignore_degenerate ||
+			(pmatch[0].rm_so < wide_len &&
+			 pmatch[0].rm_eo > prev_match_end))
 		{
-			int so = matchctx->pmatch[i + 1].rm_so;
-			int	eo = matchctx->pmatch[i + 1].rm_eo;
+			/* enlarge output space if needed */
+			while (array_idx + matchctx->npatterns * 2 > array_len)
+			{
+				array_len *= 2;
+				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
+														sizeof(int) * array_len);
+			}
 
-			if (so < 0 || eo < 0)
+			/* save this match's locations */
+			if (use_subpatterns)
 			{
-				elems[i] = 0;
-				nulls[i] = true;
+				int i;
+
+				for (i = 1; i <= matchctx->npatterns; i++)
+				{
+					matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
+					matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+				}
 			}
 			else
 			{
-				elems[i] = DirectFunctionCall3(text_substr,
-											   PointerGetDatum(matchctx->orig_str),
-											   Int32GetDatum(so + 1),
-											   Int32GetDatum(eo - so));
-				nulls[i] = false;
+				matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
+				matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
 			}
+			matchctx->nmatches++;
 		}
+		prev_match_end = pmatch[0].rm_eo;
+
+		/* if not glob, stop after one match */
+		if (!re_flags.glob)
+			break;
+
+		/*
+		 * Advance search position.  Normally we start just after the end
+		 * of the previous match, but always advance at least one character
+		 * (the special case can occur if the pattern matches zero characters
+		 * just after the prior match or at the end of the string).
+		 */
+		if (start_search < pmatch[0].rm_eo)
+			start_search = pmatch[0].rm_eo;
+		else
+			start_search++;
+		if (start_search > wide_len)
+			break;
 	}
-	else
-	{
-		int so = matchctx->pmatch[0].rm_so;
-		int	eo = matchctx->pmatch[0].rm_eo;
 
-		if (so < 0 || eo < 0)
-			elog(ERROR, "regexp code said it had a match, but did not return it");
+	/* Clean up temp storage */
+	pfree(wide_str);
+	pfree(pmatch);
 
-		fullmatch = DirectFunctionCall3(text_substr,
-										PointerGetDatum(matchctx->orig_str),
-										Int32GetDatum(so + 1),
-										Int32GetDatum(eo - so));
+	return matchctx;
+}
+
+/*
+ * build_regexp_matches_result - build output array for current match
+ */
+static ArrayType *
+build_regexp_matches_result(regexp_matches_ctx *matchctx)
+{
+	Datum	   *elems = matchctx->elems;
+	bool	   *nulls = matchctx->nulls;
+	int 		dims[1];
+	int         lbs[1];
+	int			loc;
+	int			i;
 
-		elems = &fullmatch;
-		nulls = NULL;
-		dims[0] = 1;
+	/* Extract matching substrings from the original string */
+	loc = matchctx->next_match * matchctx->npatterns * 2;
+	for (i = 0; i < matchctx->npatterns; i++)
+	{
+		int	so = matchctx->match_locs[loc++];
+		int	eo = matchctx->match_locs[loc++];
+
+		if (so < 0 || eo < 0)
+		{
+			elems[i] = (Datum) 0;
+			nulls[i] = true;
+		}
+		else
+		{
+			elems[i] = DirectFunctionCall3(text_substr,
+										   PointerGetDatum(matchctx->orig_str),
+										   Int32GetDatum(so + 1),
+										   Int32GetDatum(eo - so));
+			nulls[i] = false;
+		}
 	}
 
+	/* And form an array */
+	dims[0] = matchctx->npatterns;
+	lbs[0] = 1;
 	/* XXX: this hardcodes assumptions about the text type */
-	return construct_md_array(elems, nulls, ndims, dims, lbs,
+	return construct_md_array(elems, nulls, 1, dims, lbs,
 							  TEXTOID, -1, false, 'i');
 }
 
+/*
+ * regexp_split_to_table()
+ *		Split the string at matches of the pattern, returning the
+ *		split-out substrings as a table.
+ */
 Datum
 regexp_split_to_table(PG_FUNCTION_ARGS)
 {
 	FuncCallContext  *funcctx;
-	regexp_split_ctx *splitctx;
+	regexp_matches_ctx *splitctx;
 
 	if (SRF_IS_FIRSTCALL())
 	{
@@ -924,168 +1031,102 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 		funcctx = SRF_FIRSTCALL_INIT();
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
-		splitctx = setup_regexp_split(PG_GETARG_TEXT_P_COPY(0), pattern, flags);
+		/* be sure to copy the input string into the multi-call ctx */
+		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+										flags, true, false, true);
 
 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) splitctx;
-
-		/*
-		 * Avoid run-away function by making sure we never iterate
-		 * more than the length of the text
-		 */
-		funcctx->max_calls = splitctx->wide_len;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
-	splitctx = (regexp_split_ctx *) funcctx->user_fctx;
+	splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
 
-	if (funcctx->call_cntr > funcctx->max_calls)
+	if (splitctx->next_match <= splitctx->nmatches)
 	{
-		/*
-		 * If we exceed wide_len something really wonky is going on,
-		 * since it is returning more matches than there are
-		 * characters in the string, which should not happen
-		 */
-		elog(ERROR, "set returning split function terminated after iterating %d times",
-			 funcctx->call_cntr);
+		Datum result = build_regexp_split_result(splitctx);
+
+		splitctx->next_match++;
+		SRF_RETURN_NEXT(funcctx, result);
 	}
 
-	if (splitctx->offset < splitctx->wide_len)
-		SRF_RETURN_NEXT(funcctx, get_next_split(splitctx));
-	else
-		SRF_RETURN_DONE(funcctx);
+	SRF_RETURN_DONE(funcctx);
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
 {
 	return regexp_split_to_table(fcinfo);
 }
 
+/*
+ * regexp_split_to_array()
+ *		Split the string at matches of the pattern, returning the
+ *		split-out substrings as an array.
+ */
 Datum regexp_split_to_array(PG_FUNCTION_ARGS)
 {
 	ArrayBuildState 	*astate = NULL;
-	regexp_split_ctx 	*splitctx;
-	int 				 nitems;
+	regexp_matches_ctx 	*splitctx;
 
-	splitctx = setup_regexp_split(PG_GETARG_TEXT_P(0),
-								  PG_GETARG_TEXT_P(1),
-								  PG_GETARG_TEXT_P_IF_EXISTS(2));
+	splitctx = setup_regexp_matches(PG_GETARG_TEXT_P(0),
+									PG_GETARG_TEXT_P(1),
+									PG_GETARG_TEXT_P_IF_EXISTS(2),
+									true, false, true);
 
-	for (nitems = 0; splitctx->offset < splitctx->wide_len; nitems++)
+	while (splitctx->next_match <= splitctx->nmatches)
 	{
-		if (nitems > splitctx->wide_len)
-			elog(ERROR, "split function terminated after iterating %d times",
-				 nitems);
-
 		astate = accumArrayResult(astate,
-								  get_next_split(splitctx),
+								  build_regexp_split_result(splitctx),
 								  false,
 								  TEXTOID,
 								  CurrentMemoryContext);
+		splitctx->next_match++;
 	}
 
 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
 }
 
+/* This is separate to keep the opr_sanity regression test from complaining */
 Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 {
 	return regexp_split_to_array(fcinfo);
 }
 
-static regexp_split_ctx *
-setup_regexp_split(text *str, text *pattern, text *flags)
-{
-	regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx));
-
-	splitctx->orig_str = str;
-	splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ;
-
-	parse_re_comp_flags(&splitctx->flags, flags);
-	if (splitctx->flags.glob)
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("regexp_split does not support the global option")));
-
-	splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags);
-
-	splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1));
-	splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str),
-											  splitctx->wide_str,
-											  splitctx->orig_len);
-
-	splitctx->offset = 0;
-
-	splitctx->match.rm_so = -1;
-	/* both < 0 but not equal */
-	splitctx->match.rm_eo = -2;
-
-	return splitctx;
-}
-
+/*
+ * build_regexp_split_result - build output string for current match
+ *
+ * Returns the text from the previous match's end (or string start) up to
+ * this match's start, or the rest of the string if next_match == nmatches.
+ */
 static Datum
-get_next_split(regexp_split_ctx *splitctx)
+build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
-	regmatch_t *pmatch = &(splitctx->match);
-
-	for (;;)
-	{
-		Datum result;
-		int	  startpos = splitctx->offset + 1;
-
-		/*
-		 * If the last match was zero-length, we need to push the
-		 * offset forward to avoid matching the same place forever
-		 */
-		if (pmatch->rm_so == pmatch->rm_eo)
-			splitctx->offset++;
-
-		if (RE_wchar_execute(splitctx->cpattern,
-							 splitctx->wide_str,
-							 splitctx->wide_len,
-							 splitctx->offset,
-							 1,
-							 pmatch))
-		{
-			int length = splitctx->match.rm_so - startpos + 1;
-
-			/*
-			 * If we are trying to match at the beginning of the string and
-			 * we got a zero-length match, or if we just matched where we
-			 * left off last time, go around the loop again and increment
-			 * the offset.  If we have incremented the offset already and
-			 * it matched at the new offset, that's ok
-			 */
-			if (length == 0)
-				continue;
+	int		startpos;
+	int		endpos;
 
-			result = DirectFunctionCall3(text_substr,
-										 PointerGetDatum(splitctx->orig_str),
-										 Int32GetDatum(startpos),
-										 Int32GetDatum(length));
-
-			/* set the offset to the end of this match for next time */
-			splitctx->offset = pmatch->rm_eo;
-
-			return result;
-		}
+	if (splitctx->next_match > 0)
+		startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
+	else
+		startpos = 0;
+	if (startpos < 0)
+		elog(ERROR, "invalid match ending position");
 
+	if (splitctx->next_match < splitctx->nmatches)
+	{
+		endpos = splitctx->match_locs[splitctx->next_match * 2];
+		if (endpos < startpos)
+			elog(ERROR, "invalid match starting position");
+		return DirectFunctionCall3(text_substr,
+								   PointerGetDatum(splitctx->orig_str),
+								   Int32GetDatum(startpos + 1),
+								   Int32GetDatum(endpos - startpos));
+	}
+	else
+	{
 		/* no more matches, return rest of string */
-		result = DirectFunctionCall2(text_substr_no_len,
-									 PointerGetDatum(splitctx->orig_str),
-									 Int32GetDatum(startpos));
-
-		/* so we know we're done next time through */
-		splitctx->offset = splitctx->wide_len;
-
-		return result;
+		return DirectFunctionCall2(text_substr_no_len,
+								   PointerGetDatum(splitctx->orig_str),
+								   Int32GetDatum(startpos + 1));
 	}
 }
-
-/*
- * report whether regex_flavor is currently BASIC
- */
-bool
-regex_flavor_is_basic(void)
-{
-	return (regex_flavor == REG_BASIC);
-}