Re-implement extraction of fixed prefixes from regular expressions.

To generate btree-indexable conditions from regex WHERE conditions (such as WHERE indexed_col ~ '^foo'), we need to be able to identify any fixed prefix that a regex might have; that is, find any string that must be a prefix of all strings satisfying the regex. We used to do that with entirely ad-hoc code that looked at the source text of the regex. It didn't know very much about regex syntax, which mostly meant that it would fail to identify some optimizable cases; but Viktor Rosenfeld reported that it would produce actively wrong answers for quantified parenthesized subexpressions, such as '^(foo)?bar'. Rather than trying to extend the ad-hoc code to cover this, let's get rid of it altogether in favor of identifying prefixes by examining the compiled form of a regex. To do this, I've added a new entry point "pg_regprefix" to the regex library; hopefully it is defined in a sufficiently general fashion that it can remain in the library when/if that code gets split out as a standalone project. Since this bug has been there for a very long time, this fix needs to get back-patched. However it depends on some other recent commits (particularly the addition of wchar-to-database-encoding conversion), so I'll commit this separately and then go to work on back-porting the necessary fixes.
author: Tom Lane <tgl@sss.pgh.pa.us> 2012-07-10 14:54:37 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2012-07-10 14:54:37 -0400
commit: 628cbb50ba80c83917b07a7609ddec12cda172d0 (patch)
tree: 7008492921c90e6de7c431633e33624a597a8416 /src/backend/regex/regprefix.c
parent: 00dac6000d422033c3e8d191f01ee0e6525794c2 (diff)
download: postgresql-628cbb50ba80c83917b07a7609ddec12cda172d0.tar.gz
postgresql-628cbb50ba80c83917b07a7609ddec12cda172d0.zip
1 files changed, 259 insertions, 0 deletions
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644
index 00000000000..6f91288e9df
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,259 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ *	  Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+					  chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ *	REG_NOMATCH: there is no common prefix of strings matching the regex
+ *	REG_PREFIX: there is a common prefix of strings matching the regex
+ *	REG_EXACT: all strings satisfying the regex must match the same string
+ *	or a REG_XXX error code
+ *
+ * In the non-failure cases, *string is set to a malloc'd string containing
+ * the common prefix or exact value, of length *slength (measured in chrs
+ * not bytes!).
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly.  Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex.  But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+			 chr **string,
+			 size_t *slength)
+{
+	struct guts *g;
+	struct cnfa *cnfa;
+	int			st;
+
+	/* sanity checks */
+	if (string == NULL || slength == NULL)
+		return REG_INVARG;
+	*string = NULL;				/* initialize for failure cases */
+	*slength = 0;
+	if (re == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
+
+	/* setup */
+	g = (struct guts *) re->re_guts;
+	if (g->info & REG_UIMPOSSIBLE)
+		return REG_NOMATCH;
+
+	/*
+	 * This implementation considers only the search NFA for the topmost regex
+	 * tree node.  Therefore, constraints such as backrefs are not fully
+	 * applied, which is allowed per the function's API spec.
+	 */
+	assert(g->tree != NULL);
+	cnfa = &g->tree->cnfa;
+
+	/*
+	 * Since a correct NFA should never contain any exit-free loops, it should
+	 * not be possible for our traversal to return to a previously visited
+	 * NFA state.  Hence we need at most nstates chrs in the output string.
+	 */
+	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+	if (*string == NULL)
+		return REG_ESPACE;
+
+	/* do it */
+	st = findprefix(cnfa, &g->cmap, *string, slength);
+
+	assert(*slength <= cnfa->nstates);
+
+	/* clean up */
+	if (st != REG_PREFIX && st != REG_EXACT)
+	{
+		FREE(*string);
+		*string = NULL;
+		*slength = 0;
+	}
+
+	return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int						/* regprefix return code */
+findprefix(struct cnfa * cnfa,
+		   struct colormap * cm,
+		   chr *string,
+		   size_t *slength)
+{
+	int			st;
+	int			nextst;
+	color		thiscolor;
+	chr			c;
+	struct carc *ca;
+
+	/*
+	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+	 * anchored left.  If we have both BOS and BOL, they must go to the
+	 * same next state.
+	 */
+	st = cnfa->pre;
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+				return REG_NOMATCH;
+		}
+		else
+			return REG_NOMATCH;
+	}
+	if (nextst == -1)
+		return REG_NOMATCH;
+
+	/*
+	 * Scan through successive states, stopping as soon as we find one with
+	 * more than one acceptable transition character (either multiple colors
+	 * on out-arcs, or a color with more than one member chr).
+	 *
+	 * We could find a state with multiple out-arcs that are all labeled with
+	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+	 * In that case we add the chr "c" to the output string but then exit the
+	 * loop with nextst == -1.  This leaves a little bit on the table: if the
+	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
+	 * worth the trouble.
+	 */
+	do
+	{
+		st = nextst;
+		nextst = -1;
+		thiscolor = COLORLESS;
+		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+		{
+			/* We ignore lookahead constraints */
+			if (ca->co >= cnfa->ncolors)
+				continue;
+			/* We can also ignore BOS/BOL arcs */
+			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+				continue;
+			/* ... but EOS/EOL arcs terminate the search */
+			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+			{
+				thiscolor = COLORLESS;
+				break;
+			}
+			if (thiscolor == COLORLESS)
+			{
+				/* First plain outarc */
+				thiscolor = ca->co;
+				nextst = ca->to;
+			}
+			else if (thiscolor == ca->co)
+			{
+				/* Another plain outarc for same color */
+				nextst = -1;
+			}
+			else
+			{
+				/* More than one plain outarc color terminates the search */
+				thiscolor = COLORLESS;
+				break;
+			}
+		}
+		/* Done if we didn't find exactly one color on plain outarcs */
+		if (thiscolor == COLORLESS)
+			break;
+		/* The color must be a singleton */
+		if (cm->cd[thiscolor].nchrs != 1)
+			break;
+
+		/*
+		 * Identify the color's sole member chr and add it to the prefix
+		 * string.  In general the colormap data structure doesn't provide a
+		 * way to find color member chrs, except by trying GETCOLOR() on each
+		 * possible chr value, which won't do at all.  However, for the cases
+		 * we care about it should be sufficient to test the "firstchr" value,
+		 * that is the first chr ever added to the color.  There are cases
+		 * where this might no longer be a member of the color (so we do need
+		 * to test), but none of them are likely to arise for a character that
+		 * is a member of a common prefix.  If we do hit such a corner case,
+		 * we just fall out without adding anything to the prefix string.
+		 */
+		c = cm->cd[thiscolor].firstchr;
+		if (GETCOLOR(cm, c) != thiscolor)
+			break;
+
+		string[(*slength)++] = c;
+
+		/* Advance to next state, but only if we have a unique next state */
+	} while (nextst != -1);
+
+	/*
+	 * If we ended at a state that only has EOS/EOL outarcs leading to the
+	 * "post" state, then we have an exact-match string.  Note this is true
+	 * even if the string is of zero length.
+	 */
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+			{
+				nextst = -1;
+				break;
+			}
+		}
+		else
+		{
+			nextst = -1;
+			break;
+		}
+	}
+	if (nextst == cnfa->post)
+		return REG_EXACT;
+
+	/*
+	 * Otherwise, if we were unable to identify any prefix characters, say
+	 * NOMATCH --- the pattern is anchored left, but doesn't specify any
+	 * particular first character.
+	 */
+	if (*slength > 0)
+		return REG_PREFIX;
+
+	return REG_NOMATCH;
+}
author	Tom Lane <tgl@sss.pgh.pa.us>	2012-07-10 14:54:37 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2012-07-10 14:54:37 -0400
commit	628cbb50ba80c83917b07a7609ddec12cda172d0 (patch)
tree	7008492921c90e6de7c431633e33624a597a8416 /src/backend/regex/regprefix.c
parent	00dac6000d422033c3e8d191f01ee0e6525794c2 (diff)
download	postgresql-628cbb50ba80c83917b07a7609ddec12cda172d0.tar.gz postgresql-628cbb50ba80c83917b07a7609ddec12cda172d0.zip