aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regcomp.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2015-10-30 19:14:19 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2015-10-30 19:14:19 -0400
commit12c9a04008870c283931d6b3b648ee21bbc2cfda (patch)
tree2afd1e048b3681e5a93b7d8b3c37968e71b2532d /src/backend/regex/regcomp.c
parentc5057b2b34813ca114bc808cb56b7a7fcde64393 (diff)
downloadpostgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.tar.gz
postgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.zip
Implement lookbehind constraints in our regular-expression engine.
A lookbehind constraint is like a lookahead constraint in that it consumes no text; but it checks for existence (or nonexistence) of a match *ending* at the current point in the string, rather than one *starting* at the current point. This is a long-requested feature since it exists in many other regex libraries, but Henry Spencer had never got around to implementing it in the code we use. Just making it work is actually pretty trivial; but naive copying of the logic for lookahead constraints leads to code that often spends O(N^2) time to scan an N-character string, because we have to run the match engine from string start to the current probe point each time the constraint is checked. In typical use-cases a lookbehind constraint will be written at the start of the regex and hence will need to be checked at every character --- so O(N^2) work overall. To fix that, I introduced a third copy of the core DFA matching loop, paralleling the existing longest() and shortest() loops. This version, matchuntil(), can suspend and resume matching given a couple of pointers' worth of storage space. So we need only run it across the string once, stopping at each interesting probe point and then resuming to advance to the next one. I also put in an optimization that simplifies one-character lookahead and lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND constraints, which already existed in the engine. This avoids the overhead of the LACON machinery entirely for these rather common cases. The net result is that lookbehind constraints run a factor of three or so slower than Perl's for multi-character constraints, but faster than Perl's for one-character constraints ... and they work fine for variable-length constraints, which Perl gives up on entirely. So that's not bad from a competitive perspective, and there's room for further optimization if anyone cares. (In reality, raw scan rate across a large input string is probably not that big a deal for Postgres usage anyway; so I'm happy if it's linear.)
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r--src/backend/regex/regcomp.c152
1 files changed, 126 insertions, 26 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index b733bc7824e..aa759c26486 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -57,6 +57,8 @@ static const chr *scanplain(struct vars *);
static void onechr(struct vars *, chr, struct state *, struct state *);
static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
static void wordchrs(struct vars *);
+static void processlacon(struct vars *, struct state *, struct state *, int,
+ struct state *, struct state *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
@@ -65,7 +67,7 @@ static int numst(struct subre *, int);
static void markst(struct subre *);
static void cleanst(struct vars *);
static long nfatree(struct vars *, struct subre *, FILE *);
-static long nfanode(struct vars *, struct subre *, FILE *);
+static long nfanode(struct vars *, struct subre *, int, FILE *);
static int newlacon(struct vars *, struct state *, struct state *, int);
static void freelacons(struct subre *, int);
static void rfree(regex_t *);
@@ -146,6 +148,7 @@ static void deltraverse(struct nfa *, struct state *, struct state *);
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
static void duptraverse(struct nfa *, struct state *, struct state *);
static void cleartraverse(struct nfa *, struct state *);
+static struct state *single_color_transition(struct state *, struct state *);
static void specialcolors(struct nfa *);
static long optimize(struct nfa *, FILE *);
static void pullback(struct nfa *, FILE *);
@@ -245,8 +248,9 @@ struct vars
int ntree; /* number of tree nodes, plus one */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
- struct subre *lacons; /* lookahead-constraint vector */
- int nlacons; /* size of lacons */
+ struct subre *lacons; /* lookaround-constraint vector */
+ int nlacons; /* size of lacons[]; note that only slots
+ * numbered 1 .. nlacons-1 are used */
size_t spaceused; /* approx. space used for compilation */
};
@@ -277,7 +281,7 @@ struct vars
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
#define RANGE 'R' /* - within [] which might be range delim. */
-#define LACON 'L' /* lookahead constraint subRE */
+#define LACON 'L' /* lookaround constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
#define BEHIND 'r' /* color-lookbehind arc */
#define WBDRY 'w' /* word boundary constraint */
@@ -432,11 +436,15 @@ pg_regcomp(regex_t *re,
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++)
{
+ struct subre *lasub = &v->lacons[i];
+
#ifdef REG_DEBUG
if (debug != NULL)
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
#endif
- nfanode(v, &v->lacons[i], debug);
+
+ /* Prepend .* to pattern if it's a lookbehind LACON */
+ nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
}
CNOERR();
if (v->tree->flags & SHORTER)
@@ -640,7 +648,7 @@ makesearch(struct vars * v,
static struct subre *
parse(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * init, /* initial state */
struct state * final) /* final state */
{
@@ -719,7 +727,7 @@ parse(struct vars * v,
static struct subre *
parsebranch(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * left, /* leftmost state */
struct state * right, /* rightmost state */
int partial) /* is this only part of a branch? */
@@ -768,7 +776,7 @@ parsebranch(struct vars * v,
static void
parseqatom(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * lp, /* left state to hang it on */
struct state * rp, /* right state to hang it on */
struct subre * top) /* subtree top */
@@ -782,7 +790,7 @@ parseqatom(struct vars * v,
struct subre *atom; /* atom's subtree */
struct subre *t;
int cap; /* capturing parens? */
- int pos; /* positive lookahead? */
+ int latype; /* lookaround constraint type */
int subno; /* capturing-parens or backref number */
int atomtype;
int qprefer; /* quantifier short/long preference */
@@ -866,19 +874,18 @@ parseqatom(struct vars * v,
nonword(v, AHEAD, s, rp);
return;
break;
- case LACON: /* lookahead constraint */
- pos = v->nextvalue;
+ case LACON: /* lookaround constraint */
+ latype = v->nextvalue;
NEXT();
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERR();
t = parse(v, ')', LACON, s, s2);
freesubre(v, t); /* internal structure irrelevant */
- assert(SEE(')') || ISERR());
- NEXT();
- n = newlacon(v, s, s2, pos);
NOERR();
- ARCV(LACON, n);
+ assert(SEE(')'));
+ NEXT();
+ processlacon(v, s, s2, latype, lp, rp);
return;
break;
/* then errors, to get them out of the way */
@@ -1634,6 +1641,75 @@ wordchrs(struct vars * v)
}
/*
+ * processlacon - generate the NFA representation of a LACON
+ *
+ * In the general case this is just newlacon() + newarc(), but some cases
+ * can be optimized.
+ */
+static void
+processlacon(struct vars * v,
+ struct state * begin, /* start of parsed LACON sub-re */
+ struct state * end, /* end of parsed LACON sub-re */
+ int latype,
+ struct state * lp, /* left state to hang it on */
+ struct state * rp) /* right state to hang it on */
+{
+ struct state *s1;
+ int n;
+
+ /*
+ * Check for lookaround RE consisting of a single plain color arc (or set
+ * of arcs); this would typically be a simple chr or a bracket expression.
+ */
+ s1 = single_color_transition(begin, end);
+ switch (latype)
+ {
+ case LATYPE_AHEAD_POS:
+ /* If lookahead RE is just colorset C, convert to AHEAD(C) */
+ if (s1 != NULL)
+ {
+ cloneouts(v->nfa, s1, lp, rp, AHEAD);
+ return;
+ }
+ break;
+ case LATYPE_AHEAD_NEG:
+ /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
+ if (s1 != NULL)
+ {
+ colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
+ newarc(v->nfa, '$', 1, lp, rp);
+ newarc(v->nfa, '$', 0, lp, rp);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_POS:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
+ if (s1 != NULL)
+ {
+ cloneouts(v->nfa, s1, lp, rp, BEHIND);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_NEG:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
+ if (s1 != NULL)
+ {
+ colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
+ newarc(v->nfa, '^', 1, lp, rp);
+ newarc(v->nfa, '^', 0, lp, rp);
+ return;
+ }
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+
+ /* General case: we need a LACON subre and arc */
+ n = newlacon(v, begin, end, latype);
+ newarc(v->nfa, LACON, n, lp, rp);
+}
+
+/*
* subre - allocate a subre
*/
static struct subre *
@@ -1826,15 +1902,18 @@ nfatree(struct vars * v,
if (t->right != NULL)
(DISCARD) nfatree(v, t->right, f);
- return nfanode(v, t, f);
+ return nfanode(v, t, 0, f);
}
/*
- * nfanode - do one NFA for nfatree
+ * nfanode - do one NFA for nfatree or lacons
+ *
+ * If converttosearch is true, apply makesearch() to the NFA.
*/
static long /* optimize results */
nfanode(struct vars * v,
struct subre * t,
+ int converttosearch,
FILE *f) /* for debug output */
{
struct nfa *nfa;
@@ -1855,10 +1934,11 @@ nfanode(struct vars * v,
NOERRZ();
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
if (!ISERR())
- {
specialcolors(nfa);
+ if (!ISERR())
ret = optimize(nfa, f);
- }
+ if (converttosearch && !ISERR())
+ makesearch(v, nfa);
if (!ISERR())
compact(nfa, &t->cnfa);
@@ -1867,13 +1947,13 @@ nfanode(struct vars * v,
}
/*
- * newlacon - allocate a lookahead-constraint subRE
+ * newlacon - allocate a lookaround-constraint subRE
*/
static int /* lacon number */
newlacon(struct vars * v,
struct state * begin,
struct state * end,
- int pos)
+ int latype)
{
int n;
struct subre *newlacons;
@@ -1900,13 +1980,13 @@ newlacon(struct vars * v,
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
- sub->subno = pos;
+ sub->subno = latype;
ZAPCNFA(sub->cnfa);
return n;
}
/*
- * freelacons - free lookahead-constraint subRE vector
+ * freelacons - free lookaround-constraint subRE vector
*/
static void
freelacons(struct subre * subs,
@@ -2020,9 +2100,29 @@ dump(regex_t *re,
}
for (i = 1; i < g->nlacons; i++)
{
- fprintf(f, "\nla%d (%s):\n", i,
- (g->lacons[i].subno) ? "positive" : "negative");
- dumpcnfa(&g->lacons[i].cnfa, f);
+ struct subre *lasub = &g->lacons[i];
+ const char *latype;
+
+ switch (lasub->subno)
+ {
+ case LATYPE_AHEAD_POS:
+ latype = "positive lookahead";
+ break;
+ case LATYPE_AHEAD_NEG:
+ latype = "negative lookahead";
+ break;
+ case LATYPE_BEHIND_POS:
+ latype = "positive lookbehind";
+ break;
+ case LATYPE_BEHIND_NEG:
+ latype = "negative lookbehind";
+ break;
+ default:
+ latype = "???";
+ break;
+ }
+ fprintf(f, "\nla%d (%s):\n", i, latype);
+ dumpcnfa(&lasub->cnfa, f);
}
fprintf(f, "\n");
dumpst(g->tree, f, 0);