aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regc_color.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
commit08c0d6ad65f7c161add82ae906efb90dbd7f653d (patch)
treecc6376d6fd084c6584b18d818c0816e3b7e190c9 /src/backend/regex/regc_color.c
parent17661188336c8cbb1783808912096932c57893a3 (diff)
downloadpostgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.tar.gz
postgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.zip
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regc_color.c')
-rw-r--r--src/backend/regex/regc_color.c22
1 files changed, 21 insertions, 1 deletions
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index f5a4151757d..0864011cce1 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -977,6 +977,7 @@ colorchain(struct colormap *cm,
{
struct colordesc *cd = &cm->cd[a->co];
+ assert(a->co >= 0);
if (cd->arcs != NULL)
cd->arcs->colorchainRev = a;
a->colorchain = cd->arcs;
@@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm,
struct colordesc *cd = &cm->cd[a->co];
struct arc *aa = a->colorchainRev;
+ assert(a->co >= 0);
if (aa == NULL)
{
assert(cd->arcs == a);
@@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm,
/*
* rainbow - add arcs of all full colors (but one) between specified states
+ *
+ * If there isn't an exception color, we now generate just a single arc
+ * labeled RAINBOW, saving lots of arc-munging later on.
*/
static void
rainbow(struct nfa *nfa,
@@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa,
struct colordesc *end = CDEND(cm);
color co;
+ if (but == COLORLESS)
+ {
+ newarc(nfa, type, RAINBOW, from, to);
+ return;
+ }
+
+ /* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
!(cd->flags & PSEUDO))
@@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa,
/*
* colorcomplement - add arcs of complementary colors
*
+ * We add arcs of all colors that are not pseudocolors and do not match
+ * any of the "of" state's PLAIN outarcs.
+ *
* The calling sequence ought to be reconciled with cloneouts().
*/
static void
colorcomplement(struct nfa *nfa,
struct colormap *cm,
int type,
- struct state *of, /* complements of this guy's PLAIN outarcs */
+ struct state *of,
struct state *from,
struct state *to)
{
@@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa,
color co;
assert(of != from);
+
+ /* A RAINBOW arc matches all colors, making the complement empty */
+ if (findarc(of, PLAIN, RAINBOW) != NULL)
+ return;
+
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
if (findarc(of, PLAIN, co) == NULL)