diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-25 13:00:40 -0500 |
commit | 2a0af7fe460eb46f9af996075972bf7c2e3f211d (patch) | |
tree | dc99ebbf913c05e67796401ebbd1cabe4fad349b /src/backend/regex/regcomp.c | |
parent | 6b40d9bdbdc9f873868b0ddecacd9a307fc8ee26 (diff) | |
download | postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.tar.gz postgresql-2a0af7fe460eb46f9af996075972bf7c2e3f211d.zip |
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within
bracket expressions. There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.
Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets. POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.
One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".
Some implementation cleanup along the way:
* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.
* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.
* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does? Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.
Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r-- | src/backend/regex/regcomp.c | 285 |
1 files changed, 240 insertions, 45 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 0cd4b4c4c29..7b77a29136c 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); static void nonword(struct vars *, int, struct state *, struct state *); static void word(struct vars *, int, struct state *, struct state *); +static void charclass(struct vars *, enum char_classes, + struct state *, struct state *); +static void charclasscomplement(struct vars *, enum char_classes, + struct state *, struct state *); static int scannum(struct vars *); static void repeat(struct vars *, struct state *, struct state *, int, int); static void bracket(struct vars *, struct state *, struct state *); static void cbracket(struct vars *, struct state *, struct state *); -static void brackpart(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *, bool *); static const chr *scanplain(struct vars *); static void onechr(struct vars *, chr, struct state *, struct state *); +static void optimizebracket(struct vars *, struct state *, struct state *); static void wordchrs(struct vars *); static void processlacon(struct vars *, struct state *, struct state *, int, struct state *, struct state *); @@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t); /* === regc_lex.c === */ static void lexstart(struct vars *); static void prefixes(struct vars *); -static void lexnest(struct vars *, const chr *, const chr *); -static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); static chr lexdigits(struct vars *, int, int, int); @@ -206,6 +209,7 @@ static void freecvec(struct cvec *); static int pg_wc_isdigit(pg_wchar c); static int pg_wc_isalpha(pg_wchar c); static int pg_wc_isalnum(pg_wchar c); +static int pg_wc_isword(pg_wchar c); static int pg_wc_isupper(pg_wchar c); static int pg_wc_islower(pg_wchar c); static int pg_wc_isgraph(pg_wchar c); @@ -220,7 +224,8 @@ static chr element(struct vars *, const chr *, const chr *); static struct cvec *range(struct vars *, chr, chr, int); static int before(chr, chr); static struct cvec *eclass(struct vars *, chr, int); -static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static enum char_classes lookupcclass(struct vars *, const chr *, const chr *); +static struct cvec *cclasscvec(struct vars *, enum char_classes, int); static int cclass_column_index(struct colormap *, chr); static struct cvec *allcases(struct vars *, chr); static int cmp(const chr *, const chr *, size_t); @@ -233,14 +238,12 @@ struct vars regex_t *re; const chr *now; /* scan pointer into string */ const chr *stop; /* end of string */ - const chr *savenow; /* saved now and stop for "subroutine call" */ - const chr *savestop; int err; /* error code (0 if none) */ int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ chr nextvalue; /* value (if any) of next token */ - int lexcon; /* lexical context type (see lex.c) */ + int lexcon; /* lexical context type (see regc_lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ size_t nsubs; /* length of vector */ @@ -287,6 +290,8 @@ struct vars #define ECLASS 'E' /* start of [= */ #define CCLASS 'C' /* start of [: */ #define END 'X' /* end of [. [= [: */ +#define CCLASSS 's' /* char class shorthand escape */ +#define CCLASSC 'c' /* complement char class shorthand escape */ #define RANGE 'R' /* - within [] which might be range delim. */ #define LACON 'L' /* lookaround constraint subRE */ #define AHEAD 'a' /* color-lookahead arc */ @@ -356,7 +361,6 @@ pg_regcomp(regex_t *re, v->re = re; v->now = string; v->stop = v->now + len; - v->savenow = v->savestop = NULL; v->err = 0; v->cflags = flags; v->nsubexp = 0; @@ -835,23 +839,25 @@ parseqatom(struct vars *v, return; break; case '<': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); + NEXT(); return; break; case '>': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case WBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); @@ -860,10 +866,11 @@ parseqatom(struct vars *v, NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case NWBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); @@ -872,6 +879,7 @@ parseqatom(struct vars *v, NOERR(); nonword(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case LACON: /* lookaround constraint */ @@ -925,6 +933,16 @@ parseqatom(struct vars *v, assert(SEE(']') || ISERR()); NEXT(); break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NEXT(); + break; + case CCLASSC: + charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp); + /* charclasscomplement() did okcolors() internally */ + NEXT(); + break; case '.': rainbow(v->nfa, v->cm, PLAIN, (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, @@ -1339,6 +1357,75 @@ word(struct vars *v, } /* + * charclass - generate arcs for a character class + * + * This is used for both atoms (\w and sibling escapes) and for elements + * of bracket expressions. The caller is responsible for calling okcolors() + * at the end of processing the atom or bracket. + */ +static void +charclass(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct cvec *cv; + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, lp, rp); +} + +/* + * charclasscomplement - generate arcs for a complemented character class + * + * This is used for both atoms (\W and sibling escapes) and for elements + * of bracket expressions. In bracket expressions, it is the caller's + * responsibility that there not be any open subcolors when this is called. + */ +static void +charclasscomplement(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct state *cstate; + struct cvec *cv; + + /* make dummy state to hang temporary arcs on */ + cstate = newstate(v->nfa); + NOERR(); + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build arcs for char class; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + + /* in NLSTOP mode, ensure newline is not part of the result set */ + if (v->cflags & REG_NLSTOP) + newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate); + NOERR(); + + /* clean up any subcolors in the arc set */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* now build output arcs for the complement of the char class */ + colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp); + NOERR(); + + /* clean up dummy state */ + dropstate(v->nfa, cstate); +} + +/* * scannum - scan a number */ static int /* value, <= DUPMAX */ @@ -1456,6 +1543,7 @@ repeat(struct vars *v, /* * bracket - handle non-complemented bracket expression + * * Also called from cbracket for complemented bracket expressions. */ static void @@ -1463,16 +1551,52 @@ bracket(struct vars *v, struct state *lp, struct state *rp) { + /* + * We can't process complemented char classes (e.g. \W) immediately while + * scanning the bracket expression, else color bookkeeping gets confused. + * Instead, remember whether we saw any in have_cclassc[], and process + * them at the end. + */ + bool have_cclassc[NUM_CCLASSES]; + bool any_cclassc; + int i; + + memset(have_cclassc, false, sizeof(have_cclassc)); + assert(SEE('[')); NEXT(); while (!SEE(']') && !SEE(EOS)) - brackpart(v, lp, rp); + brackpart(v, lp, rp, have_cclassc); assert(SEE(']') || ISERR()); + + /* close up open subcolors from the positive bracket elements */ okcolors(v->nfa, v->cm); + NOERR(); + + /* now handle any complemented elements */ + any_cclassc = false; + for (i = 0; i < NUM_CCLASSES; i++) + { + if (have_cclassc[i]) + { + charclasscomplement(v, (enum char_classes) i, lp, rp); + NOERR(); + any_cclassc = true; + } + } + + /* + * If we had any complemented elements, see if we can optimize the bracket + * into a rainbow. Since a complemented element is the only way a WHITE + * arc could get into the result, there's no point in checking otherwise. + */ + if (any_cclassc) + optimizebracket(v, lp, rp); } /* * cbracket - handle complemented bracket expression + * * We do it by calling bracket() with dummy endpoints, and then complementing * the result. The alternative would be to invoke rainbow(), and then delete * arcs as the b.e. is seen... but that gets messy, and is really quite @@ -1496,7 +1620,9 @@ cbracket(struct vars *v, /* * Easy part of complementing, and all there is to do since the MCCE code - * was removed. + * was removed. Note that the result of colorcomplement() cannot be a + * rainbow, since we don't allow empty brackets; so there's no point in + * calling optimizebracket() again. */ colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); NOERR(); @@ -1511,14 +1637,15 @@ cbracket(struct vars *v, static void brackpart(struct vars *v, struct state *lp, - struct state *rp) + struct state *rp, + bool *have_cclassc) { chr startc; chr endc; struct cvec *cv; + enum char_classes cls; const chr *startp; const chr *endp; - chr c[1]; /* parse something, get rid of special cases, take shortcuts */ switch (v->nexttype) @@ -1528,15 +1655,14 @@ brackpart(struct vars *v, return; break; case PLAIN: - c[0] = v->nextvalue; + startc = v->nextvalue; NEXT(); /* shortcut for ordinary chr (not range) */ if (!SEE(RANGE)) { - onechr(v, c[0], lp, rp); + onechr(v, startc, lp, rp); return; } - startc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1564,9 +1690,20 @@ brackpart(struct vars *v, endp = scanplain(v); INSIST(startp < endp, REG_ECTYPE); NOERR(); - cv = cclass(v, startp, endp, (v->cflags & REG_ICASE)); + cls = lookupcclass(v, startp, endp); NOERR(); - subcolorcvec(v, cv, lp, rp); + charclass(v, cls, lp, rp); + return; + break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + NEXT(); + return; + break; + case CCLASSC: + /* we cannot call charclasscomplement() immediately */ + have_cclassc[v->nextvalue] = true; + NEXT(); return; break; default: @@ -1582,9 +1719,8 @@ brackpart(struct vars *v, { case PLAIN: case RANGE: - c[0] = v->nextvalue; + endc = v->nextvalue; NEXT(); - endc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1618,7 +1754,7 @@ brackpart(struct vars *v, /* * scanplain - scan PLAIN contents of [. etc. * - * Certain bits of trickery in lex.c know that this code does not try + * Certain bits of trickery in regc_lex.c know that this code does not try * to look past the final bracket of the [. etc. */ static const chr * /* just after end of sequence */ @@ -1665,38 +1801,97 @@ onechr(struct vars *v, } /* + * optimizebracket - see if bracket expression can be converted to RAINBOW + * + * Cases such as "[\s\S]" can produce a set of arcs of all colors, which we + * can replace by a single RAINBOW arc for efficiency. (This might seem + * like a silly way to write ".", but it's seemingly a common locution in + * some other flavors of regex, so take the trouble to support it well.) + */ +static void +optimizebracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(v->cm); + struct arc *a; + bool israinbow; + + /* + * Scan lp's out-arcs and transiently mark the mentioned colors. We + * expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp. + * (Note: there shouldn't be any pseudocolors yet, but check anyway.) + */ + for (a = lp->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + assert(a->co >= 0); /* i.e. not RAINBOW */ + assert(a->to == rp); + cd = &v->cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)); + cd->flags |= COLMARK; + } + + /* Scan colors, clear transient marks, check for unmarked live colors */ + israinbow = true; + for (cd = v->cm->cd; cd < end; cd++) + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + israinbow = false; + } + + /* Can't do anything if not all colors have arcs */ + if (!israinbow) + return; + + /* OK, drop existing arcs and replace with a rainbow */ + while ((a = lp->outs) != NULL) + freearc(v->nfa, a); + newarc(v->nfa, PLAIN, RAINBOW, lp, rp); +} + +/* * wordchrs - set up word-chr list for word-boundary stuff, if needed * - * The list is kept as a bunch of arcs between two dummy states; it's - * disposed of by the unreachable-states sweep in NFA optimization. - * Does NEXT(). Must not be called from any unusual lexical context. - * This should be reconciled with the \w etc. handling in lex.c, and - * should be cleaned up to reduce dependencies on input scanning. + * The list is kept as a bunch of circular arcs on an otherwise-unused state. + * + * Note that this must not be called while we have any open subcolors, + * else construction of the list would confuse color bookkeeping. + * Hence, we can't currently apply a similar optimization in + * charclass[complement](), as those need to be usable within bracket + * expressions. */ static void wordchrs(struct vars *v) { - struct state *left; - struct state *right; + struct state *cstate; + struct cvec *cv; if (v->wordchrs != NULL) - { - NEXT(); /* for consistency */ - return; - } + return; /* done already */ - left = newstate(v->nfa); - right = newstate(v->nfa); + /* make dummy state to hang the cache arcs on */ + cstate = newstate(v->nfa); NOERR(); - /* fine point: implemented with [::], and lexer will set REG_ULOCALE */ - lexword(v); - NEXT(); - assert(v->savenow != NULL && SEE('[')); - bracket(v, left, right); - assert((v->savenow != NULL && SEE(']')) || ISERR()); - NEXT(); + + /* obtain possibly-cached cvec for \w characters */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE)); NOERR(); - v->wordchrs = left; + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + NOERR(); + + /* close new open subcolors to ensure the cache entry is self-contained */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* success! save the cache pointer */ + v->wordchrs = cstate; } /* |