diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-03-02 11:55:12 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-03-02 11:55:12 -0500 |
commit | 0c3405cf11a12da1a4278c6833f4d979fe06c866 (patch) | |
tree | 1a7d72b194224ceb9933533131cebc4ada5a502f /src/backend/regex | |
parent | 4aea704a5bfd4b5894a268499369ccab89940c9c (diff) | |
download | postgresql-0c3405cf11a12da1a4278c6833f4d979fe06c866.tar.gz postgresql-0c3405cf11a12da1a4278c6833f4d979fe06c866.zip |
Improve performance of regular expression back-references.
In some cases, at the time that we're doing an NFA-based precheck
of whether a backref subexpression can match at a particular place
in the string, we already know which substring the referenced
subexpression matched. If so, we might as well forget about the NFA
and just compare the substring; this is faster and it gives an exact
rather than approximate answer.
In general, this optimization can help while we are prechecking within
the second child expression of a concat node, while the capture was
within the first child expression; then the substring was saved during
cdissect() of the first child and will be available to NFA checks done
while cdissect() recurses into the second child. It can help quite a
lot if the tree looks like
concat
/ \
capture concat
/ \
expensive stuff backref
as we will be able to avoid recursively dissecting the "expensive
stuff" before discovering that the backref isn't satisfied with a
particular midpoint that the lower concat node is testing. This
doesn't help if the concat tree is left-deep, as the capture node
won't get set soon enough (and it's hard to fix that without changing
the engine's match behavior). Fortunately, right-deep concat trees
are the common case.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/rege_dfa.c | 117 | ||||
-rw-r--r-- | src/backend/regex/regexec.c | 24 |
2 files changed, 138 insertions, 3 deletions
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 1db52d1005c..18f87c69d47 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -58,6 +58,19 @@ longest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* if this is a backref to a known string, just match against that */ + if (d->backno >= 0) + { + assert((size_t) d->backno < v->nmatch); + if (v->pmatch[d->backno].rm_so >= 0) + { + cp = dfa_backref(v, d, start, start, stop, false); + if (cp == v->stop && stop == v->stop && hitstopp != NULL) + *hitstopp = 1; + return cp; + } + } + /* fast path for matchall NFAs */ if (d->cnfa->flags & MATCHALL) { @@ -210,6 +223,20 @@ shortest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* if this is a backref to a known string, just match against that */ + if (d->backno >= 0) + { + assert((size_t) d->backno < v->nmatch); + if (v->pmatch[d->backno].rm_so >= 0) + { + cp = dfa_backref(v, d, start, min, max, true); + if (cp != NULL && coldp != NULL) + *coldp = start; + /* there is no case where we should set *hitstopp */ + return cp; + } + } + /* fast path for matchall NFAs */ if (d->cnfa->flags & MATCHALL) { @@ -468,6 +495,94 @@ matchuntil(struct vars *v, } /* + * dfa_backref - find best match length for a known backref string + * + * When the backref's referent is already available, we can deliver an exact + * answer with considerably less work than running the backref node's NFA. + * + * Return match endpoint for longest or shortest valid repeated match, + * or NULL if there is no valid match. + * + * Should be in sync with cbrdissect(), although that has the different task + * of checking a match to a predetermined section of the string. + */ +static chr * +dfa_backref(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *min, /* match must end at or after here */ + chr *max, /* match must end at or before here */ + bool shortest) +{ + int n = d->backno; + int backmin = d->backmin; + int backmax = d->backmax; + size_t numreps; + size_t minreps; + size_t maxreps; + size_t brlen; + chr *brstring; + chr *p; + + /* get the backreferenced string (caller should have checked this) */ + if (v->pmatch[n].rm_so == -1) + return NULL; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + + /* special-case zero-length backreference to avoid divide by zero */ + if (brlen == 0) + { + /* + * matches only a zero-length string, but any number of repetitions + * can be considered to be present + */ + if (min == start && backmin <= backmax) + return start; + return NULL; + } + + /* + * convert min and max into numbers of possible repetitions of the backref + * string, rounding appropriately + */ + if (min <= start) + minreps = 0; + else + minreps = (min - start - 1) / brlen + 1; + maxreps = (max - start) / brlen; + + /* apply bounds, then see if there is any allowed match length */ + if (minreps < backmin) + minreps = backmin; + if (backmax != DUPINF && maxreps > backmax) + maxreps = backmax; + if (maxreps < minreps) + return NULL; + + /* quick exit if zero-repetitions match is valid and preferred */ + if (shortest && minreps == 0) + return start; + + /* okay, compare the actual string contents */ + p = start; + numreps = 0; + while (numreps < maxreps) + { + if ((*v->g->compare) (brstring, p, brlen) != 0) + break; + p += brlen; + numreps++; + if (shortest && numreps >= minreps) + break; + } + + if (numreps >= minreps) + return p; + return NULL; +} + +/* * lastcold - determine last point at which no progress had been made */ static chr * /* endpoint, or NULL */ @@ -563,6 +678,8 @@ newdfa(struct vars *v, d->lastpost = NULL; d->lastnopr = NULL; d->search = d->ssets; + d->backno = -1; /* may be set by caller */ + d->backmin = d->backmax = 0; /* initialization of sset fields is done as needed */ diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index cf959899489..8d7777f8c62 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -77,6 +77,9 @@ struct dfa chr *lastpost; /* location of last cache-flushed success */ chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ struct sset *search; /* replacement-search-pointer memory */ + int backno; /* if DFA for a backref, subno it refers to */ + short backmin; /* min repetitions for backref */ + short backmax; /* max repetitions for backref */ bool ismalloced; /* should this struct dfa be freed? */ bool arraysmalloced; /* should its subsidiary arrays be freed? */ }; @@ -154,6 +157,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *); static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **); +static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool); static chr *lastcold(struct vars *, struct dfa *); static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); static void freedfa(struct dfa *); @@ -342,13 +346,23 @@ static struct dfa * getsubdfa(struct vars *v, struct subre *t) { - if (v->subdfas[t->id] == NULL) + struct dfa *d = v->subdfas[t->id]; + + if (d == NULL) { - v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); + d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); if (ISERR()) return NULL; + /* set up additional info if this is a backref node */ + if (t->op == 'b') + { + d->backno = t->backno; + d->backmin = t->min; + d->backmax = t->max; + } + v->subdfas[t->id] = d; } - return v->subdfas[t->id]; + return d; } /* @@ -369,6 +383,7 @@ getladfa(struct vars *v, v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC); if (ISERR()) return NULL; + /* a LACON can't contain a backref, so nothing else to do */ } return v->ladfas[n]; } @@ -927,6 +942,9 @@ crevcondissect(struct vars *v, /* * cbrdissect - dissect match for backref node + * + * The backref match might already have been verified by dfa_backref(), + * but we don't know that for sure so must check it here. */ static int /* regexec return code */ cbrdissect(struct vars *v, |