aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2012-02-24 01:40:18 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2012-02-24 01:41:03 -0500
commit173e29aa5deefd9e71c183583ba37805c8102a72 (patch)
treef7e997faabaa7aa3e19e00ee561096404817d092 /src/include/regex/regguts.h
parent0c9e5d5e0d407013bf66af01942a7b2dd3342546 (diff)
downloadpostgresql-173e29aa5deefd9e71c183583ba37805c8102a72.tar.gz
postgresql-173e29aa5deefd9e71c183583ba37805c8102a72.zip
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r--src/include/regex/regguts.h24
1 files changed, 21 insertions, 3 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index fb6789b560f..d420ea8316e 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -372,10 +372,28 @@ struct cnfa
/*
* subexpression tree
+ *
+ * "op" is one of:
+ * '=' plain regex without interesting substructure (implemented as DFA)
+ * 'b' back-reference (has no substructure either)
+ * '(' capture node: captures the match of its single child
+ * '.' concatenation: matches a match for left, then a match for right
+ * '|' alternation: matches a match for left or a match for right
+ * '*' iteration: matches some number of matches of its single child
+ *
+ * Note: the right child of an alternation must be another alternation or
+ * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
+ * might expect. This could stand to be changed. Actually I'd rather see
+ * a single alternation node with N children, but that will take revising
+ * the representation of struct subre.
+ *
+ * Note: when a backref is directly quantified, we stick the min/max counts
+ * into the backref rather than plastering an iteration node on top. This is
+ * for efficiency: there is no need to search for possible division points.
*/
struct subre
{
- char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */
+ char op; /* see type codes above */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
@@ -393,8 +411,8 @@ struct subre
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */
- short min; /* min repetitions, for backref only */
- short max; /* max repetitions, for backref only */
+ short min; /* min repetitions for iteration or backref */
+ short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */