diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-20 18:11:56 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-20 18:11:56 -0500 |
commit | 08c0d6ad65f7c161add82ae906efb90dbd7f653d (patch) | |
tree | cc6376d6fd084c6584b18d818c0816e3b7e190c9 /src/include/regex/regguts.h | |
parent | 17661188336c8cbb1783808912096932c57893a3 (diff) | |
download | postgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.tar.gz postgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.zip |
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything
metacharacter, produce a sheaf of parallel NFA arcs covering all
possible colors (that is, character equivalence classes). We can make
a noticeable improvement in the space and time needed to process large
regexes by replacing such cases with a single arc bearing the special
color code "RAINBOW". This requires only minor additional complication
in places such as pull() and push().
Callers of pg_reg_getoutarcs() must now be prepared for the possibility
of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm,
that's a net benefit since it cuts the number of arcs to be dealt with,
and the handling isn't any different than for other colors that contain
too many characters to be dealt with individually.
This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r-- | src/include/regex/regguts.h | 11 |
1 files changed, 10 insertions, 1 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 0a616562d03..6d391083194 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -130,11 +130,16 @@ /* * As soon as possible, we map chrs into equivalence classes -- "colors" -- * which are of much more manageable number. + * + * To further reduce the number of arcs in NFAs and DFAs, we also have a + * special RAINBOW "color" that can be assigned to an arc. This is not a + * real color, in that it has no entry in color maps. */ typedef short color; /* colors of characters */ #define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */ #define COLORLESS (-1) /* impossible color */ +#define RAINBOW (-2) /* represents all colors except pseudocolors */ #define WHITE 0 /* default color, parent of all others */ /* Note: various places in the code know that WHITE is zero */ @@ -276,7 +281,7 @@ struct state; struct arc { int type; /* 0 if free, else an NFA arc type code */ - color co; + color co; /* color the arc matches (possibly RAINBOW) */ struct state *from; /* where it's from (and contained within) */ struct state *to; /* where it's to */ struct arc *outchain; /* link in *from's outs chain or free chain */ @@ -284,6 +289,7 @@ struct arc #define freechain outchain /* we do not maintain "freechainRev" */ struct arc *inchain; /* link in *to's ins chain */ struct arc *inchainRev; /* back-link in *to's ins chain */ + /* these fields are not used when co == RAINBOW: */ struct arc *colorchain; /* link in color's arc chain */ struct arc *colorchainRev; /* back-link in color's arc chain */ }; @@ -344,6 +350,9 @@ struct nfa * Plain arcs just store the transition color number as "co". LACON arcs * store the lookaround constraint number plus cnfa.ncolors as "co". LACON * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. + * + * Note that in a plain arc, "co" can be RAINBOW; since that's negative, + * it doesn't break the rule about how to recognize LACON arcs. */ struct carc { |