aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
commit08c0d6ad65f7c161add82ae906efb90dbd7f653d (patch)
treecc6376d6fd084c6584b18d818c0816e3b7e190c9 /src/include/regex/regguts.h
parent17661188336c8cbb1783808912096932c57893a3 (diff)
downloadpostgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.tar.gz
postgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.zip
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r--src/include/regex/regguts.h11
1 files changed, 10 insertions, 1 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 0a616562d03..6d391083194 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -130,11 +130,16 @@
/*
* As soon as possible, we map chrs into equivalence classes -- "colors" --
* which are of much more manageable number.
+ *
+ * To further reduce the number of arcs in NFAs and DFAs, we also have a
+ * special RAINBOW "color" that can be assigned to an arc. This is not a
+ * real color, in that it has no entry in color maps.
*/
typedef short color; /* colors of characters */
#define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */
#define COLORLESS (-1) /* impossible color */
+#define RAINBOW (-2) /* represents all colors except pseudocolors */
#define WHITE 0 /* default color, parent of all others */
/* Note: various places in the code know that WHITE is zero */
@@ -276,7 +281,7 @@ struct state;
struct arc
{
int type; /* 0 if free, else an NFA arc type code */
- color co;
+ color co; /* color the arc matches (possibly RAINBOW) */
struct state *from; /* where it's from (and contained within) */
struct state *to; /* where it's to */
struct arc *outchain; /* link in *from's outs chain or free chain */
@@ -284,6 +289,7 @@ struct arc
#define freechain outchain /* we do not maintain "freechainRev" */
struct arc *inchain; /* link in *to's ins chain */
struct arc *inchainRev; /* back-link in *to's ins chain */
+ /* these fields are not used when co == RAINBOW: */
struct arc *colorchain; /* link in color's arc chain */
struct arc *colorchainRev; /* back-link in color's arc chain */
};
@@ -344,6 +350,9 @@ struct nfa
* Plain arcs just store the transition color number as "co". LACON arcs
* store the lookaround constraint number plus cnfa.ncolors as "co". LACON
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
+ *
+ * Note that in a plain arc, "co" can be RAINBOW; since that's negative,
+ * it doesn't break the rule about how to recognize LACON arcs.
*/
struct carc
{