aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regexec.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2015-10-30 19:14:19 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2015-10-30 19:14:19 -0400
commit12c9a04008870c283931d6b3b648ee21bbc2cfda (patch)
tree2afd1e048b3681e5a93b7d8b3c37968e71b2532d /src/backend/regex/regexec.c
parentc5057b2b34813ca114bc808cb56b7a7fcde64393 (diff)
downloadpostgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.tar.gz
postgresql-12c9a04008870c283931d6b3b648ee21bbc2cfda.zip
Implement lookbehind constraints in our regular-expression engine.
A lookbehind constraint is like a lookahead constraint in that it consumes no text; but it checks for existence (or nonexistence) of a match *ending* at the current point in the string, rather than one *starting* at the current point. This is a long-requested feature since it exists in many other regex libraries, but Henry Spencer had never got around to implementing it in the code we use. Just making it work is actually pretty trivial; but naive copying of the logic for lookahead constraints leads to code that often spends O(N^2) time to scan an N-character string, because we have to run the match engine from string start to the current probe point each time the constraint is checked. In typical use-cases a lookbehind constraint will be written at the start of the regex and hence will need to be checked at every character --- so O(N^2) work overall. To fix that, I introduced a third copy of the core DFA matching loop, paralleling the existing longest() and shortest() loops. This version, matchuntil(), can suspend and resume matching given a couple of pointers' worth of storage space. So we need only run it across the string once, stopping at each interesting probe point and then resuming to advance to the next one. I also put in an optimization that simplifies one-character lookahead and lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND constraints, which already existed in the engine. This avoids the overhead of the LACON machinery entirely for these rather common cases. The net result is that lookbehind constraints run a factor of three or so slower than Perl's for multi-character constraints, but faster than Perl's for one-character constraints ... and they work fine for variable-length constraints, which Perl gives up on entirely. So that's not bad from a competitive perspective, and there's room for further optimization if anyone cares. (In reality, raw scan rate across a large input string is probably not that big a deal for Postgres usage anyway; so I'm happy if it's linear.)
Diffstat (limited to 'src/backend/regex/regexec.c')
-rw-r--r--src/backend/regex/regexec.c104
1 files changed, 91 insertions, 13 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 8a21f2cb787..82659a0f2f4 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -112,7 +112,10 @@ struct vars
chr *search_start; /* search start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
- struct dfa **subdfas; /* per-subre DFAs */
+ struct dfa **subdfas; /* per-tree-subre DFAs */
+ struct dfa **ladfas; /* per-lacon-subre DFAs */
+ struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
+ chr **lblastcp; /* per-lacon-subre lookbehind restart data */
struct smalldfa dfa1;
struct smalldfa dfa2;
};
@@ -132,6 +135,7 @@ struct vars
*/
/* === regexec.c === */
static struct dfa *getsubdfa(struct vars *, struct subre *);
+static struct dfa *getladfa(struct vars *, int);
static int find(struct vars *, struct cnfa *, struct colormap *);
static int cfind(struct vars *, struct cnfa *, struct colormap *);
static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
@@ -149,6 +153,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
static chr *lastcold(struct vars *, struct dfa *);
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
static void freedfa(struct dfa *);
@@ -226,21 +231,54 @@ pg_regexec(regex_t *re,
v->search_start = (chr *) string + search_start;
v->stop = (chr *) string + len;
v->err = 0;
+ v->subdfas = NULL;
+ v->ladfas = NULL;
+ v->lblastcss = NULL;
+ v->lblastcp = NULL;
+ /* below this point, "goto cleanup" will behave sanely */
+
assert(v->g->ntree >= 0);
n = (size_t) v->g->ntree;
if (n <= LOCALDFAS)
v->subdfas = subdfas;
else
- v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
- if (v->subdfas == NULL)
{
- if (v->pmatch != pmatch && v->pmatch != mat)
- FREE(v->pmatch);
- return REG_ESPACE;
+ v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->subdfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
}
for (i = 0; i < n; i++)
v->subdfas[i] = NULL;
+ assert(v->g->nlacons >= 0);
+ n = (size_t) v->g->nlacons;
+ if (n > 0)
+ {
+ v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->ladfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ v->ladfas[i] = NULL;
+ v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+ v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+ if (v->lblastcss == NULL || v->lblastcp == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ {
+ v->lblastcss[i] = NULL;
+ v->lblastcp[i] = NULL;
+ }
+ }
+
/* do it */
assert(v->g->tree != NULL);
if (backref)
@@ -257,22 +295,40 @@ pg_regexec(regex_t *re,
}
/* clean up */
+cleanup:
if (v->pmatch != pmatch && v->pmatch != mat)
FREE(v->pmatch);
- n = (size_t) v->g->ntree;
- for (i = 0; i < n; i++)
+ if (v->subdfas != NULL)
+ {
+ n = (size_t) v->g->ntree;
+ for (i = 0; i < n; i++)
+ {
+ if (v->subdfas[i] != NULL)
+ freedfa(v->subdfas[i]);
+ }
+ if (v->subdfas != subdfas)
+ FREE(v->subdfas);
+ }
+ if (v->ladfas != NULL)
{
- if (v->subdfas[i] != NULL)
- freedfa(v->subdfas[i]);
+ n = (size_t) v->g->nlacons;
+ for (i = 0; i < n; i++)
+ {
+ if (v->ladfas[i] != NULL)
+ freedfa(v->ladfas[i]);
+ }
+ FREE(v->ladfas);
}
- if (v->subdfas != subdfas)
- FREE(v->subdfas);
+ if (v->lblastcss != NULL)
+ FREE(v->lblastcss);
+ if (v->lblastcp != NULL)
+ FREE(v->lblastcp);
return st;
}
/*
- * getsubdfa - create or re-fetch the DFA for a subre node
+ * getsubdfa - create or re-fetch the DFA for a tree subre node
*
* We only need to create the DFA once per overall regex execution.
* The DFA will be freed by the cleanup step in pg_regexec().
@@ -291,6 +347,28 @@ getsubdfa(struct vars * v,
}
/*
+ * getladfa - create or re-fetch the DFA for a LACON subre node
+ *
+ * Same as above, but for LACONs.
+ */
+static struct dfa *
+getladfa(struct vars * v,
+ int n)
+{
+ assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+
+ if (v->ladfas[n] == NULL)
+ {
+ struct subre *sub = &v->g->lacons[n];
+
+ v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ return NULL;
+ }
+ return v->ladfas[n];
+}
+
+/*
* find - find a match for the main NFA (no-complications case)
*/
static int