aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2015-10-30 19:14:19 -0400
committer Tom Lane <tgl@sss.pgh.pa.us> 2015-10-30 19:14:19 -0400
commit 12c9a04008870c283931d6b3b648ee21bbc2cfda (patch)
tree 2afd1e048b3681e5a93b7d8b3c37968e71b2532d /src/include/regex/regguts.h
parent c5057b2b34813ca114bc808cb56b7a7fcde64393 (diff)
downloadpostgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.tar.gz
postgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.zip
Implement lookbehind constraints in our regular-expression engine.
A lookbehind constraint is like a lookahead constraint in that it consumes no text; but it checks for existence (or nonexistence) of a match *ending* at the current point in the string, rather than one *starting* at the current point. This is a long-requested feature since it exists in many other regex libraries, but Henry Spencer had never got around to implementing it in the code we use.

Just making it work is actually pretty trivial; but naive copying of the logic for lookahead constraints leads to code that often spends O(N^2) time to scan an N-character string, because we have to run the match engine from string start to the current probe point each time the constraint is checked. In typical use-cases a lookbehind constraint will be written at the start of the regex and hence will need to be checked at every character --- so O(N^2) work overall.

To fix that, I introduced a third copy of the core DFA matching loop, paralleling the existing longest() and shortest() loops. This version, matchuntil(), can suspend and resume matching given a couple of pointers' worth of storage space. So we need only run it across the string once, stopping at each interesting probe point and then resuming to advance to the next one.

I also put in an optimization that simplifies one-character lookahead and lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND constraints, which already existed in the engine. This avoids the overhead of the LACON machinery entirely for these rather common cases.

The net result is that lookbehind constraints run a factor of three or so slower than Perl's for multi-character constraints, but faster than Perl's for one-character constraints ... and they work fine for variable-length constraints, which Perl gives up on entirely. So that's not bad from a competitive perspective, and there's room for further optimization if anyone cares.
(In reality, raw scan rate across a large input string is probably not that big a deal for Postgres usage anyway; so I'm happy if it's linear.)
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r-- src/include/regex/regguts.h 20
1 files changed, 14 insertions, 6 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 19fe991c74f..2ceffa6563b 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -89,13 +89,19 @@
*/
#define NOTREACHED 0
-#define xxx 1
#define DUPMAX _POSIX2_RE_DUP_MAX
#define DUPINF (DUPMAX+1)
#define REMAGIC 0xfed7 /* magic number for main struct */
+/* Type codes for lookaround constraints */
+#define LATYPE_AHEAD_POS 03 /* positive lookahead */
+#define LATYPE_AHEAD_NEG 02 /* negative lookahead */
+#define LATYPE_BEHIND_POS 01 /* positive lookbehind */
+#define LATYPE_BEHIND_NEG 00 /* negative lookbehind */
+#define LATYPE_IS_POS(la) ((la) & 01)
+#define LATYPE_IS_AHEAD(la) ((la) & 02)
/*
@@ -351,7 +357,7 @@ struct nfa
*
* The non-dummy carc structs are of two types: plain arcs and LACON arcs.
* Plain arcs just store the transition color number as "co". LACON arcs
- * store the lookahead constraint number plus cnfa.ncolors as "co". LACON
+ * store the lookaround constraint number plus cnfa.ncolors as "co". LACON
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
*/
struct carc
@@ -365,7 +371,7 @@ struct cnfa
int nstates; /* number of states */
int ncolors; /* number of colors (max color in use + 1) */
int flags;
-#define HASLACONS 01 /* uses lookahead constraints */
+#define HASLACONS 01 /* uses lookaround constraints */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
@@ -433,7 +439,8 @@ struct subre
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short id; /* ID of subre (1..ntree-1) */
- int subno; /* subexpression number (for 'b' and '(') */
+ int subno; /* subexpression number for 'b' and '(', or
+ * LATYPE code for lookaround constraint */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
@@ -479,6 +486,7 @@ struct guts
int ntree; /* number of subre's, plus one */
struct colormap cmap;
int FUNCPTR(compare, (const chr *, const chr *, size_t));
- struct subre *lacons; /* lookahead-constraint vector */
- int nlacons; /* size of lacons */
+ struct subre *lacons; /* lookaround-constraint vector */
+ int nlacons; /* size of lacons[]; note that only slots
+ * numbered 1 .. nlacons-1 are used */
};